├── doc ├── frame.png └── intro.png ├── saved_models └── yolov3_weights.sh ├── model ├── __pycache__ │ ├── clip.cpython-39.pyc │ ├── loss.cpython-36.pyc │ ├── loss.cpython-38.pyc │ ├── loss.cpython-39.pyc │ ├── convlstm.cpython-36.pyc │ ├── convlstm.cpython-38.pyc │ ├── convlstm.cpython-39.pyc │ ├── darknet.cpython-36.pyc │ ├── darknet.cpython-38.pyc │ ├── darknet.cpython-39.pyc │ ├── modulation.cpython-36.pyc │ ├── modulation.cpython-38.pyc │ ├── modulation.cpython-39.pyc │ ├── grounding_model.cpython-36.pyc │ ├── grounding_model.cpython-38.pyc │ ├── grounding_model.cpython-39.pyc │ └── grounding_modelbest.cpython-38.pyc ├── convlstm.py ├── loss.py ├── yolov3.cfg ├── grounding_model.py ├── darknet.py └── modulation.py ├── utils ├── __pycache__ │ ├── utils.cpython-36.pyc │ ├── utils.cpython-38.pyc │ ├── utils.cpython-39.pyc │ ├── __init__.cpython-36.pyc │ ├── __init__.cpython-38.pyc │ ├── __init__.cpython-39.pyc │ ├── checkpoint.cpython-36.pyc │ ├── checkpoint.cpython-38.pyc │ ├── checkpoint.cpython-39.pyc │ ├── transforms.cpython-36.pyc │ ├── transforms.cpython-38.pyc │ ├── transforms.cpython-39.pyc │ ├── word_utils.cpython-36.pyc │ ├── word_utils.cpython-38.pyc │ ├── word_utils.cpython-39.pyc │ ├── parsing_metrics.cpython-36.pyc │ ├── parsing_metrics.cpython-38.pyc │ └── parsing_metrics.cpython-39.pyc ├── __init__.py ├── losses.py ├── misc_utils.py ├── checkpoint.py ├── word_utils.py ├── utils.py ├── parsing_metrics.py ├── transforms.py ├── transformsv2.py └── temp.py ├── dataset ├── __pycache__ │ ├── data_loader.cpython-36.pyc │ ├── data_loader.cpython-38.pyc │ └── data_loader.cpython-39.pyc ├── data_loaderv2.py └── data_loader.py ├── ln_data └── README.md ├── README.md └── evaluation_results.py /doc/frame.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/doc/frame.png -------------------------------------------------------------------------------- /doc/intro.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/doc/intro.png -------------------------------------------------------------------------------- /saved_models/yolov3_weights.sh: -------------------------------------------------------------------------------- 1 | #wget -P saved_models https://pjreddie.com/media/files/yolov3.weights -------------------------------------------------------------------------------- /model/__pycache__/clip.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/model/__pycache__/clip.cpython-39.pyc -------------------------------------------------------------------------------- /model/__pycache__/loss.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/model/__pycache__/loss.cpython-36.pyc -------------------------------------------------------------------------------- /model/__pycache__/loss.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/model/__pycache__/loss.cpython-38.pyc -------------------------------------------------------------------------------- /model/__pycache__/loss.cpython-39.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/model/__pycache__/loss.cpython-39.pyc -------------------------------------------------------------------------------- /utils/__pycache__/utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/utils/__pycache__/utils.cpython-36.pyc -------------------------------------------------------------------------------- /utils/__pycache__/utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/utils/__pycache__/utils.cpython-38.pyc -------------------------------------------------------------------------------- /utils/__pycache__/utils.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/utils/__pycache__/utils.cpython-39.pyc -------------------------------------------------------------------------------- /model/__pycache__/convlstm.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/model/__pycache__/convlstm.cpython-36.pyc -------------------------------------------------------------------------------- /model/__pycache__/convlstm.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/model/__pycache__/convlstm.cpython-38.pyc -------------------------------------------------------------------------------- /model/__pycache__/convlstm.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/model/__pycache__/convlstm.cpython-39.pyc -------------------------------------------------------------------------------- /model/__pycache__/darknet.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/model/__pycache__/darknet.cpython-36.pyc -------------------------------------------------------------------------------- /model/__pycache__/darknet.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/model/__pycache__/darknet.cpython-38.pyc -------------------------------------------------------------------------------- /model/__pycache__/darknet.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/model/__pycache__/darknet.cpython-39.pyc -------------------------------------------------------------------------------- /utils/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/utils/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /utils/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/utils/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /utils/__pycache__/__init__.cpython-39.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/utils/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /model/__pycache__/modulation.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/model/__pycache__/modulation.cpython-36.pyc -------------------------------------------------------------------------------- /model/__pycache__/modulation.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/model/__pycache__/modulation.cpython-38.pyc -------------------------------------------------------------------------------- /model/__pycache__/modulation.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/model/__pycache__/modulation.cpython-39.pyc -------------------------------------------------------------------------------- /utils/__pycache__/checkpoint.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/utils/__pycache__/checkpoint.cpython-36.pyc -------------------------------------------------------------------------------- /utils/__pycache__/checkpoint.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/utils/__pycache__/checkpoint.cpython-38.pyc -------------------------------------------------------------------------------- /utils/__pycache__/checkpoint.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/utils/__pycache__/checkpoint.cpython-39.pyc -------------------------------------------------------------------------------- /utils/__pycache__/transforms.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/utils/__pycache__/transforms.cpython-36.pyc -------------------------------------------------------------------------------- /utils/__pycache__/transforms.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/utils/__pycache__/transforms.cpython-38.pyc -------------------------------------------------------------------------------- /utils/__pycache__/transforms.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/utils/__pycache__/transforms.cpython-39.pyc -------------------------------------------------------------------------------- /utils/__pycache__/word_utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/utils/__pycache__/word_utils.cpython-36.pyc -------------------------------------------------------------------------------- /utils/__pycache__/word_utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/utils/__pycache__/word_utils.cpython-38.pyc 
-------------------------------------------------------------------------------- /utils/__pycache__/word_utils.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/utils/__pycache__/word_utils.cpython-39.pyc -------------------------------------------------------------------------------- /dataset/__pycache__/data_loader.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/dataset/__pycache__/data_loader.cpython-36.pyc -------------------------------------------------------------------------------- /dataset/__pycache__/data_loader.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/dataset/__pycache__/data_loader.cpython-38.pyc -------------------------------------------------------------------------------- /dataset/__pycache__/data_loader.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/dataset/__pycache__/data_loader.cpython-39.pyc -------------------------------------------------------------------------------- /model/__pycache__/grounding_model.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/model/__pycache__/grounding_model.cpython-36.pyc -------------------------------------------------------------------------------- /model/__pycache__/grounding_model.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/model/__pycache__/grounding_model.cpython-38.pyc -------------------------------------------------------------------------------- /model/__pycache__/grounding_model.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/model/__pycache__/grounding_model.cpython-39.pyc -------------------------------------------------------------------------------- /utils/__pycache__/parsing_metrics.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/utils/__pycache__/parsing_metrics.cpython-36.pyc -------------------------------------------------------------------------------- /utils/__pycache__/parsing_metrics.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/utils/__pycache__/parsing_metrics.cpython-38.pyc -------------------------------------------------------------------------------- /utils/__pycache__/parsing_metrics.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/utils/__pycache__/parsing_metrics.cpython-39.pyc -------------------------------------------------------------------------------- /ln_data/README.md: -------------------------------------------------------------------------------- 1 | # Dataset 2 | Download the YouRefIt dataset from [Dataset Request Page](https://yixchen.github.io/YouRefIt/request.html) and put here. 
-------------------------------------------------------------------------------- /model/__pycache__/grounding_modelbest.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/model/__pycache__/grounding_modelbest.cpython-38.pyc -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # ----------------------------------------------------------------------------- 3 | # Copyright (c) Edgar Andrés Margffoy-Tuay, Emilio Botero and Juan Camilo Pérez 4 | # 5 | # Licensed under the terms of the MIT License 6 | # (see LICENSE for details) 7 | # ----------------------------------------------------------------------------- 8 | 9 | """Misc data and other helping utillites.""" 10 | 11 | from .word_utils import Corpus 12 | from .transforms import ResizeImage, ResizeAnnotation 13 | 14 | Corpus 15 | ResizeImage 16 | ResizeAnnotation 17 | 18 | 19 | class AverageMeter(object): 20 | """Computes and stores the average and current value""" 21 | 22 | def __init__(self): 23 | self.reset() 24 | 25 | def reset(self): 26 | self.val = 0 27 | self.avg = 0 28 | self.sum = 0 29 | self.count = 0 30 | 31 | def update(self, val, n=1): 32 | self.val = val 33 | self.sum += val * n 34 | self.count += n 35 | self.avg = self.sum / self.count 36 | -------------------------------------------------------------------------------- /utils/losses.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Custom loss function definitions. 5 | """ 6 | 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | 10 | 11 | class IoULoss(nn.Module): 12 | """ 13 | Creates a criterion that computes the Intersection over Union (IoU) 14 | between a segmentation mask and its ground truth. 15 | 16 | Rahman, M.A. and Wang, Y: 17 | Optimizing Intersection-Over-Union in Deep Neural Networks for 18 | Image Segmentation. International Symposium on Visual Computing (2016) 19 | http://www.cs.umanitoba.ca/~ywang/papers/isvc16.pdf 20 | """ 21 | 22 | def __init__(self, size_average=True): 23 | super().__init__() 24 | self.size_average = size_average 25 | 26 | def forward(self, input, target): 27 | input = F.sigmoid(input) 28 | intersection = (input * target).sum() 29 | union = ((input + target) - (input * target)).sum() 30 | iou = intersection / union 31 | iou_dual = input.size(0) - iou 32 | if self.size_average: 33 | iou_dual = iou_dual / input.size(0) 34 | return iou_dual 35 | -------------------------------------------------------------------------------- /utils/misc_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Misc download and visualization helper functions and class wrappers. 
5 | """ 6 | 7 | import sys 8 | import time 9 | import torch 10 | from visdom import Visdom 11 | 12 | 13 | def reporthook(count, block_size, total_size): 14 | global start_time 15 | if count == 0: 16 | start_time = time.time() 17 | return 18 | duration = time.time() - start_time 19 | progress_size = int(count * block_size) 20 | speed = int(progress_size / (1024 * duration)) 21 | percent = min(int(count * block_size * 100 / total_size), 100) 22 | sys.stdout.write("\r...%d%%, %d MB, %d KB/s, %d seconds passed" % 23 | (percent, progress_size / (1024 * 1024), speed, duration)) 24 | sys.stdout.flush() 25 | 26 | 27 | class VisdomWrapper(Visdom): 28 | def __init__(self, *args, env=None, **kwargs): 29 | Visdom.__init__(self, *args, **kwargs) 30 | self.env = env 31 | self.plots = {} 32 | 33 | def init_line_plot(self, name, 34 | X=torch.zeros((1,)).cpu(), 35 | Y=torch.zeros((1,)).cpu(), **opts): 36 | self.plots[name] = self.line(X=X, Y=Y, env=self.env, opts=opts) 37 | 38 | def plot_line(self, name, **kwargs): 39 | self.line(win=self.plots[name], env=self.env, **kwargs) 40 | -------------------------------------------------------------------------------- /utils/checkpoint.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import numpy as np 4 | import torch 5 | import torch.nn.functional as F 6 | from collections import OrderedDict 7 | 8 | def save_checkpoint(state, is_best, args, filename='default'): 9 | if filename=='default': 10 | filename = 'filmconv_nofpn32_%s_batch%d'%(args.dataset,args.batch_size) 11 | 12 | checkpoint_name = './saved_models/%s_checkpoint.pth.tar'%(filename) 13 | best_name = './saved_models/%s_model_best.pth.tar'%(filename) 14 | torch.save(state, checkpoint_name) 15 | if is_best: 16 | shutil.copyfile(checkpoint_name, best_name) 17 | 18 | def load_pretrain(model, args, logging): 19 | if os.path.isfile(args.pretrain): 20 | checkpoint = torch.load(args.pretrain) 21 | #print(checkpoint.items()) 22 | pretrained_dict = checkpoint['state_dict'] 23 | #print(pretrained_dict) 24 | 25 | # new_state_dict = OrderedDict() 26 | # for k, v in pretrained_dict.items(): # k为module.xxx.weight, v为权重 27 | # name = k[7:] # 截取`module.`后面的xxx.weight 28 | # new_state_dict[name] = v 29 | 30 | model_dict = model.state_dict() 31 | pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict} 32 | #pretrained_dict = {k: v for k, v in new_state_dict.items() if k in model_dict} 33 | 34 | 35 | assert (len([k for k, v in pretrained_dict.items()])!=0) 36 | model_dict.update(pretrained_dict) 37 | 38 | model.load_state_dict(model_dict) 39 | #model.load_state_dict(new_state_dict) 40 | print("=> loaded pretrain model at {}" 41 | .format(args.pretrain)) 42 | logging.info("=> loaded pretrain model at {}" 43 | .format(args.pretrain)) 44 | del checkpoint # dereference seems crucial 45 | torch.cuda.empty_cache() 46 | else: 47 | print(("=> no pretrained file found at '{}'".format(args.pretrain))) 48 | logging.info("=> no pretrained file found at '{}'".format(args.pretrain)) 49 | return model 50 | 51 | def load_resume(model, args, logging): 52 | if os.path.isfile(args.resume): 53 | print(("=> loading checkpoint '{}'".format(args.resume))) 54 | logging.info("=> loading checkpoint '{}'".format(args.resume)) 55 | checkpoint = torch.load(args.resume) 56 | args.start_epoch = checkpoint['epoch'] 57 | best_loss = checkpoint['best_loss'] 58 | model.load_state_dict(checkpoint['state_dict']) 59 | print(("=> loaded checkpoint (epoch {}) Loss{}" 60 
| .format(checkpoint['epoch'], best_loss))) 61 | logging.info("=> loaded checkpoint (epoch {}) Loss{}" 62 | .format(checkpoint['epoch'], best_loss)) 63 | del checkpoint # dereference seems crucial 64 | torch.cuda.empty_cache() 65 | else: 66 | print(("=> no checkpoint found at '{}'".format(args.resume))) 67 | logging.info(("=> no checkpoint found at '{}'".format(args.resume))) 68 | return model -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Spatial and Visual Perspective-Taking via View Rotation and Relation Reasoning for Embodied Reference Understanding 2 | 3 | by [Cheng Shi](https://github.com/ChengShiest/) and [Sibei Yang](https://sibeiyang.github.io/) 4 | 5 | European Conference on Computer Vision (ECCV), 2022 6 | 7 | ## Introduction 8 | 9 | Embodied Reference Understanding studies the reference understanding in an embodied fashion, where a receiver requires to locate a target object referred to by both language and gesture of the sender in a shared physical environment. Its main challenge lies in how to make the receiver with the egocentric view access spatial and visual information relative to the sender to judge how objects are oriented around and seen from the sender, i.e., spatial and visual perspective-taking. In this paper, we propose a REasoning from your Perspective (REP) method to tackle the challenge by modeling relations between the receiver and the sender as well as the sender and the objects via the proposed novel view rotation and relation reasoning. Specifically, view rotation first rotates the receiver to the position of the sender by constructing an embodied 3D coordinate system with the position of the sender as the origin. Then, it changes the orientation of the receiver to the orientation of the sender by encoding the body orientation and gesture of the sender. Relation reasoning models both the nonverbal and verbal relations between the sender and the objects by multi-modal cooperative reasoning in gesture, language, visual content, and spatial position. 10 | 11 | 12 |
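In essence, the view-rotation step is a change of reference frame. The sketch below is only an illustration of that geometric idea and not the code under `model/`; `object_pos`, `sender_pos` and `sender_dir` are hypothetical inputs standing in for the quantities described above.

```
import numpy as np

def view_rotate(object_pos, sender_pos, sender_dir):
    """Re-express points in an embodied frame centred on the sender.

    object_pos: (N, 3) object centres in the receiver's (camera) frame.
    sender_pos: (3,) sender position in the same frame.
    sender_dir: (3,) unit vector of the sender's body/pointing orientation.
    """
    # 1) Translate so the sender becomes the origin of the embodied frame.
    rel = object_pos - sender_pos[None, :]
    # 2) Rotate about the vertical (z) axis so the sender faces the +y axis.
    yaw = np.arctan2(sender_dir[0], sender_dir[1])
    c, s = np.cos(yaw), np.sin(yaw)
    R = np.array([[c, -s, 0.0],
                  [s,  c, 0.0],
                  [0.0, 0.0, 1.0]])
    return rel @ R.T

# With the sender at (1, 1, 0) facing +y, an object at (2, 1, 0) maps to
# (1, 0, 0), i.e. one unit to the sender's right in the rotated frame.
print(view_rotate(np.array([[2.0, 1.0, 0.0]]),
                  np.array([1.0, 1.0, 0.0]),
                  np.array([0.0, 1.0, 0.0])))
```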

13 | ![intro](doc/intro.png) 14 | 15 | 16 | ## Framework 17 | 18 | ![framework](doc/frame.png) 19 | 20 |

21 | 22 | ## Dataset 23 | Download the YouRefIt dataset from [Dataset Request Page](https://yixchen.github.io/YouRefIt/request.html) and put under ```./ln_data``` 24 | 25 | ## Model weights 26 | * [Yolov3](https://pjreddie.com/media/files/yolov3.weights): download the pretrained model and place the file in ``./saved_models`` by 27 | ``` 28 | sh saved_models/yolov3_weights.sh 29 | ``` 30 | 31 | Make sure to put the files in the following structure: 32 | 33 | ``` 34 | |-- ROOT 35 | | |-- ln_data 36 | | |-- yourefit 37 | | |-- images 38 | | |-- paf 39 | | |-- saliency 40 | ``` 41 | 42 | ## Training and Evaluation 43 | The training and evaluation script is the same as [YouRefIt](https://github.com/yixchen/YouRefIt_ERU) 44 | 45 | ## Checklist 46 | 47 | + [x] code 48 | + [ ] pre-process data 49 | 50 | ### Citation 51 | 52 | @inproceedings{shi2022spatial, 53 | title={Spatial and Visual Perspective-Taking via View Rotation and Relation Reasoning for Embodied Reference Understanding}, 54 | author={Shi, Cheng and Yang, Sibei}, 55 | booktitle={Computer Vision--ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23--27, 2022, Proceedings, Part XXXVI}, 56 | pages={201--218}, 57 | year={2022}, 58 | organization={Springer} 59 | } 60 | 61 | ### Acknowledgement 62 | Our code is built on [ReSC](https://github.com/zyang-ur/ReSC) and [YouRefIt](https://github.com/yixchen/YouRefIt_ERU), we thank the authors for their hard work. 63 | 64 | 65 | -------------------------------------------------------------------------------- /utils/word_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Language-related data loading helper functions and class wrappers. 5 | """ 6 | 7 | import re 8 | import torch 9 | import codecs 10 | 11 | UNK_TOKEN = '' 12 | PAD_TOKEN = '' 13 | END_TOKEN = '' 14 | SENTENCE_SPLIT_REGEX = re.compile(r'(\W+)') 15 | 16 | 17 | class Dictionary(object): 18 | def __init__(self): 19 | self.word2idx = {} 20 | self.idx2word = [] 21 | 22 | def add_word(self, word): 23 | if word not in self.word2idx: 24 | self.idx2word.append(word) 25 | self.word2idx[word] = len(self.idx2word) - 1 26 | return self.word2idx[word] 27 | 28 | def __len__(self): 29 | return len(self.idx2word) 30 | 31 | def __getitem__(self, a): 32 | if isinstance(a, int): 33 | return self.idx2word[a] 34 | elif isinstance(a, list): 35 | return [self.idx2word[x] for x in a] 36 | elif isinstance(a, str): 37 | return self.word2idx[a] 38 | else: 39 | raise TypeError("Query word/index argument must be int or str") 40 | 41 | def __contains__(self, word): 42 | return word in self.word2idx 43 | 44 | 45 | class Corpus(object): 46 | def __init__(self): 47 | self.dictionary = Dictionary() 48 | 49 | def set_max_len(self, value): 50 | self.max_len = value 51 | 52 | def load_file(self, filename): 53 | with codecs.open(filename, 'r', 'utf-8') as f: 54 | for line in f: 55 | line = line.strip() 56 | self.add_to_corpus(line) 57 | self.dictionary.add_word(UNK_TOKEN) 58 | self.dictionary.add_word(PAD_TOKEN) 59 | 60 | def add_to_corpus(self, line): 61 | """Tokenizes a text line.""" 62 | # Add words to the dictionary 63 | words = line.split() 64 | # tokens = len(words) 65 | for word in words: 66 | word = word.lower() 67 | self.dictionary.add_word(word) 68 | 69 | def tokenize(self, line, max_len=20): 70 | # Tokenize line contents 71 | words = SENTENCE_SPLIT_REGEX.split(line.strip()) 72 | # words = [w.lower() for w in words if len(w) > 0] 73 | words = [w.lower() for w in words 
if (len(w) > 0 and w!=' ')] ## do not include space as a token 74 | 75 | if words[-1] == '.': 76 | words = words[:-1] 77 | 78 | if max_len > 0: 79 | if len(words) > max_len: 80 | words = words[:max_len] 81 | elif len(words) < max_len: 82 | # words = [PAD_TOKEN] * (max_len - len(words)) + words 83 | words = words + [END_TOKEN] + [PAD_TOKEN] * (max_len - len(words) - 1) 84 | 85 | tokens = len(words) ## for end token 86 | ids = torch.LongTensor(tokens) 87 | token = 0 88 | for word in words: 89 | if word not in self.dictionary: 90 | word = UNK_TOKEN 91 | # print(word, type(word), word.encode('ascii','ignore').decode('ascii'), type(word.encode('ascii','ignore').decode('ascii'))) 92 | if type(word)!=type('a'): 93 | print(word, type(word), word.encode('ascii','ignore').decode('ascii'), type(word.encode('ascii','ignore').decode('ascii'))) 94 | word = word.encode('ascii','ignore').decode('ascii') 95 | ids[token] = self.dictionary[word] 96 | token += 1 97 | # ids[token] = self.dictionary[END_TOKEN] 98 | return ids 99 | 100 | def __len__(self): 101 | return len(self.dictionary) 102 | -------------------------------------------------------------------------------- /evaluation_results.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import cv2 4 | import pickle5 as pickle 5 | import torch 6 | import json 7 | def bbox_iou(box1, box2, x1y1x2y2=True): 8 | """ 9 | Returns the IoU of two bounding boxes 10 | """ 11 | box1 = torch.tensor(box1) 12 | box2 = torch.tensor(box2) 13 | if x1y1x2y2: 14 | # Get the coordinates of bounding boxes 15 | b1_x1, b1_y1, b1_x2, b1_y2 = box1[ 0], box1[ 1], box1[ 2], box1[ 3] 16 | b2_x1, b2_y1, b2_x2, b2_y2 = box2[ 0], box2[ 1], box2[ 2], box2[ 3] 17 | else: 18 | # Transform from center and width to exact coordinates 19 | b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2 20 | b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2 21 | b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2 22 | b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2 23 | 24 | # get the coordinates of the intersection rectangle 25 | inter_rect_x1 = torch.max(b1_x1, b2_x1) 26 | inter_rect_y1 = torch.max(b1_y1, b2_y1) 27 | inter_rect_x2 = torch.min(b1_x2, b2_x2) 28 | inter_rect_y2 = torch.min(b1_y2, b2_y2) 29 | # Intersection area 30 | inter_area = torch.clamp(inter_rect_x2 - inter_rect_x1, 0) * torch.clamp(inter_rect_y2 - inter_rect_y1, 0) 31 | # Union Area 32 | b1_area = (b1_x2 - b1_x1) * (b1_y2 - b1_y1) 33 | b2_area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1) 34 | 35 | # print(box1, box1.shape) 36 | # print(box2, box2.shape) 37 | return inter_area / (b1_area + b2_area - inter_area + 1e-16) 38 | 39 | # Given 2 bounding boxes, return their IoU 40 | def bb_IoU(bb1,bb2): 41 | 42 | Area1 = abs(bb1[2] - bb1[0]) * abs(bb1[3]-bb1[1]) 43 | Area2 = abs(bb2[2] - bb2[0]) * abs(bb2[3]-bb2[1]) 44 | 45 | xA = max(bb1[0],bb2[0]) 46 | yA = max(bb1[1],bb2[1]) 47 | xB = min(bb1[2],bb2[2]) 48 | yB = min(bb1[3],bb2[3]) 49 | 50 | intersection = max(0, xB - xA) * max(0, yB - yA) 51 | IoU = intersection / (Area1 + Area2 - intersection + 1e-16) 52 | 53 | return(IoU) 54 | 55 | def Area(bb1, image): 56 | area1 = abs(bb1[2] - bb1[0]) * abs(bb1[3]-bb1[1]) 57 | return area1/image 58 | 59 | def evaluation(image_path, gt_path, predict_path): 60 | yolopred = dict() 61 | 62 | with open("ln_data/yourefit/test_id.txt", "r") as f: 63 | test_id_list = f.readlines() 64 | test_id_list = 
[x.strip('\n') for x in test_id_list] 65 | print(test_id_list) 66 | 67 | with open("ln_data/yourefit/train_id.txt", "r") as f: 68 | train_id_list = f.readlines() 69 | train_id_list = [x.strip('\n') for x in train_id_list] 70 | 71 | 72 | 73 | TP= dict() 74 | TP['all'] = np.zeros((3,)) 75 | TP['s'] = np.zeros((3,)) 76 | TP['m'] = np.zeros((3,)) 77 | TP['l'] = np.zeros((3,)) 78 | 79 | FP= dict() 80 | FP['all'] = np.zeros((3,)) 81 | FP['s'] = np.zeros((3,)) 82 | FP['m'] = np.zeros((3,)) 83 | FP['l'] = np.zeros((3,)) 84 | gt_boxes = [] 85 | for ind, pattern in enumerate(test_id_list): 86 | img = cv2.imread(os.path.join(image_path, pattern+'.jpg')) 87 | H,W,_ = img.shape 88 | pickle_name = os.path.join(gt_path, pattern+'.p') 89 | gt = pickle.load(open( pickle_name, "rb" )) 90 | ground_truth_box = gt['bbox'] 91 | gt_boxes.append(ground_truth_box) 92 | # read prediction file (Need to change based on input) 93 | pred_pickle = os.path.join(predict_path, pattern+'.jpg.p') 94 | pred = pickle.load(open(pred_pickle, "rb" )) 95 | predicted_box = pred[0] 96 | # 97 | yolopred[test_id_list[ind]] = predicted_box 98 | for ind, IoU in enumerate([0.25, 0.5, 0.75] ): 99 | if bbox_iou(predicted_box,ground_truth_box) >= IoU: 100 | TP['all'][ind] +=1 101 | if 100*Area(ground_truth_box, H*W) < 0.48: 102 | TP['s'][ind] += 1 103 | else: 104 | if 100*Area(ground_truth_box, H*W) < 1.75: 105 | TP['m'][ind] += 1 106 | else: 107 | TP['l'][ind] += 1 108 | else: 109 | FP['all'][ind] +=1 110 | if 100*Area(ground_truth_box, H*W) < 0.48: 111 | FP['s'][ind] += 1 112 | else: 113 | if 100*Area(ground_truth_box, H*W) < 1.75: 114 | FP['m'][ind] += 1 115 | else: 116 | FP['l'][ind] += 1 117 | 118 | for ind, IoU in enumerate([0.25, 0.5, 0.75]): 119 | print('Accuracy =',TP['all'][ind]/(TP['all'][ind]+FP['all'][ind])) 120 | print('Small Accuracy =',TP['s'][ind]/(TP['s'][ind]+FP['s'][ind]), 'in', TP['s'][ind]+FP['s'][ind], 'samples') 121 | print('Medium Accuracy =',TP['m'][ind]/(TP['m'][ind]+FP['m'][ind]), 'in', TP['m'][ind]+FP['m'][ind], 'samples') 122 | print('Large Accuracy =',TP['l'][ind]/(TP['l'][ind]+FP['l'][ind]), 'in', TP['l'][ind]+FP['l'][ind], 'samples') 123 | 124 | if __name__ == "__main__": 125 | 126 | image_path= 'ln_data/yourefit/images' 127 | gt_path= 'ln_data/yourefit/pickle' 128 | predict_path = 'test/test_final' 129 | evaluation(image_path, gt_path, predict_path) 130 | -------------------------------------------------------------------------------- /utils/utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import cv2 4 | import numpy as np 5 | import torch 6 | import torch.nn.functional as F 7 | 8 | class AverageMeter(object): 9 | """Computes and stores the average and current value""" 10 | def __init__(self): 11 | self.reset() 12 | 13 | def reset(self): 14 | self.val = 0 15 | self.avg = 0 16 | self.sum = 0 17 | self.count = 0 18 | 19 | def update(self, val, n=1): 20 | self.val = val 21 | self.sum += val * n 22 | self.count += n 23 | self.avg = self.sum / self.count 24 | 25 | def xyxy2xywh(x): # Convert bounding box format from [x1, y1, x2, y2] to [x, y, w, h] 26 | y = torch.zeros(x.shape) if x.dtype is torch.float32 else np.zeros(x.shape) 27 | y[:, 0] = (x[:, 0] + x[:, 2]) / 2 28 | y[:, 1] = (x[:, 1] + x[:, 3]) / 2 29 | y[:, 2] = x[:, 2] - x[:, 0] 30 | y[:, 3] = x[:, 3] - x[:, 1] 31 | return y 32 | 33 | 34 | def xywh2xyxy(x): # Convert bounding box format from [x, y, w, h] to [x1, y1, x2, y2] 35 | y = torch.zeros(x.shape) if x.dtype is torch.float32 else 
np.zeros(x.shape) 36 | y[:, 0] = (x[:, 0] - x[:, 2] / 2) 37 | y[:, 1] = (x[:, 1] - x[:, 3] / 2) 38 | y[:, 2] = (x[:, 0] + x[:, 2] / 2) 39 | y[:, 3] = (x[:, 1] + x[:, 3] / 2) 40 | return y 41 | 42 | def bbox_iou_numpy(box1, box2): 43 | """Computes IoU between bounding boxes. 44 | Parameters 45 | ---------- 46 | box1 : ndarray 47 | (N, 4) shaped array with bboxes 48 | box2 : ndarray 49 | (M, 4) shaped array with bboxes 50 | Returns 51 | ------- 52 | : ndarray 53 | (N, M) shaped array with IoUs 54 | """ 55 | area = (box2[:, 2] - box2[:, 0]) * (box2[:, 3] - box2[:, 1]) 56 | 57 | iw = np.minimum(np.expand_dims(box1[:, 2], axis=1), box2[:, 2]) - np.maximum( 58 | np.expand_dims(box1[:, 0], 1), box2[:, 0] 59 | ) 60 | ih = np.minimum(np.expand_dims(box1[:, 3], axis=1), box2[:, 3]) - np.maximum( 61 | np.expand_dims(box1[:, 1], 1), box2[:, 1] 62 | ) 63 | 64 | iw = np.maximum(iw, 0) 65 | ih = np.maximum(ih, 0) 66 | 67 | ua = np.expand_dims((box1[:, 2] - box1[:, 0]) * (box1[:, 3] - box1[:, 1]), axis=1) + area - iw * ih 68 | 69 | ua = np.maximum(ua, np.finfo(float).eps) 70 | 71 | intersection = iw * ih 72 | 73 | return intersection / ua 74 | 75 | 76 | def bbox_iou(box1, box2, x1y1x2y2=True): 77 | """ 78 | Returns the IoU of two bounding boxes 79 | """ 80 | if x1y1x2y2: 81 | # Get the coordinates of bounding boxes 82 | b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3] 83 | b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3] 84 | else: 85 | # Transform from center and width to exact coordinates 86 | b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2 87 | b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2 88 | b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2 89 | b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2 90 | 91 | # get the coordinates of the intersection rectangle 92 | inter_rect_x1 = torch.max(b1_x1, b2_x1) 93 | inter_rect_y1 = torch.max(b1_y1, b2_y1) 94 | inter_rect_x2 = torch.min(b1_x2, b2_x2) 95 | inter_rect_y2 = torch.min(b1_y2, b2_y2) 96 | # Intersection area 97 | inter_area = torch.clamp(inter_rect_x2 - inter_rect_x1, 0) * torch.clamp(inter_rect_y2 - inter_rect_y1, 0) 98 | # Union Area 99 | b1_area = (b1_x2 - b1_x1) * (b1_y2 - b1_y1) 100 | b2_area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1) 101 | 102 | # print(box1, box1.shape) 103 | # print(box2, box2.shape) 104 | return inter_area / (b1_area + b2_area - inter_area + 1e-16) 105 | 106 | def multiclass_metrics(pred, gt): 107 | """ 108 | check precision and recall for predictions. 109 | Output: overall = {precision, recall, f1} 110 | """ 111 | eps=1e-6 112 | overall = {'precision': -1, 'recall': -1, 'f1': -1} 113 | NP, NR, NC = 0, 0, 0 # num of pred, num of recall, num of correct 114 | for ii in range(pred.shape[0]): 115 | pred_ind = np.array(pred[ii]>0.5, dtype=int) 116 | gt_ind = np.array(gt[ii]>0.5, dtype=int) 117 | inter = pred_ind * gt_ind 118 | # add to overall 119 | NC += np.sum(inter) 120 | NP += np.sum(pred_ind) 121 | NR += np.sum(gt_ind) 122 | if NP > 0: 123 | overall['precision'] = float(NC)/NP 124 | if NR > 0: 125 | overall['recall'] = float(NC)/NR 126 | if NP > 0 and NR > 0: 127 | overall['f1'] = 2*overall['precision']*overall['recall']/(overall['precision']+overall['recall']+eps) 128 | return overall 129 | 130 | def compute_ap(recall, precision): 131 | """ Compute the average precision, given the recall and precision curves. 132 | Code originally from https://github.com/rbgirshick/py-faster-rcnn. 
133 | # Arguments 134 | recall: The recall curve (list). 135 | precision: The precision curve (list). 136 | # Returns 137 | The average precision as computed in py-faster-rcnn. 138 | """ 139 | # correct AP calculation 140 | # first append sentinel values at the end 141 | mrec = np.concatenate(([0.0], recall, [1.0])) 142 | mpre = np.concatenate(([0.0], precision, [0.0])) 143 | 144 | # compute the precision envelope 145 | for i in range(mpre.size - 1, 0, -1): 146 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 147 | 148 | # to calculate area under PR curve, look for points 149 | # where X axis (recall) changes value 150 | i = np.where(mrec[1:] != mrec[:-1])[0] 151 | 152 | # and sum (\Delta recall) * prec 153 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 154 | return ap 155 | -------------------------------------------------------------------------------- /utils/parsing_metrics.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import os 4 | 5 | # from plot_util import plot_confusion_matrix 6 | # from makemask import * 7 | 8 | def _fast_hist(label_true, label_pred, n_class): 9 | mask = (label_true >= 0) & (label_true < n_class) 10 | hist = np.bincount( 11 | n_class * label_true[mask].astype(int) + 12 | label_pred[mask], minlength=n_class ** 2).reshape(n_class, n_class) 13 | return hist 14 | 15 | def label_accuracy_score(label_trues, label_preds, n_class, bg_thre=200): 16 | """Returns accuracy score evaluation result. 17 | - overall accuracy 18 | - mean accuracy 19 | - mean IU 20 | - fwavacc 21 | """ 22 | hist = np.zeros((n_class, n_class)) 23 | for lt, lp in zip(label_trues, label_preds): 24 | # hist += _fast_hist(lt.flatten(), lp.flatten(), n_class) 25 | hist += _fast_hist(lt[lt 0] * iu[freq > 0]).sum() 33 | return acc, acc_cls, mean_iu, fwavacc 34 | 35 | def label_confusion_matrix(label_trues, label_preds, n_class, bg_thre=200): 36 | # eps=1e-20 37 | hist=np.zeros((n_class,n_class),dtype=float) 38 | """ (8,256,256), (256,256) """ 39 | for lt,lp in zip(label_trues, label_preds): 40 | # hist += _fast_hist(lt.flatten(), lp.flatten(), n_class) 41 | hist += _fast_hist(lt[lt 0] * iu[freq > 0]).sum() 72 | return acc, acc_cls, mean_iu, fwavacc, iu 73 | 74 | # if __name__ == '__main__': 75 | # """ Evaluating from saved png segmentation maps 76 | # 0.862723060822 0.608076070823 0.503493670787 0.76556929118 77 | # """ 78 | # import csv 79 | # from PIL import Image 80 | # import matplotlib as mpl 81 | # mpl.use('Agg') 82 | # from matplotlib import pyplot as plt 83 | # eps=1e-20 84 | 85 | # class AverageMeter(object): 86 | # """Computes and stores the average and current value""" 87 | # def __init__(self): 88 | # self.reset() 89 | 90 | # def reset(self): 91 | # self.val = 0 92 | # self.avg = 0 93 | # self.sum = 0 94 | # self.count = 0 95 | 96 | # def update(self, val, n=1): 97 | # self.val = val 98 | # self.sum += val * n 99 | # self.count += n 100 | # self.avg = self.sum / self.count 101 | # def load_csv(csv_file): 102 | # img_list, kpt_list, conf_list=[],[],[] 103 | # with open(csv_file, 'rb') as f: 104 | # reader = csv.reader(f) 105 | # for row in reader: 106 | # img_list.append(row[0]) 107 | # kpt_list.append([row[i] for i in range(1,len(row)) if i%3!=0]) 108 | # conf_list.append([row[i] for i in range(1,len(row)) if i%3==0]) 109 | # # print len(img_list),len(kpt_list[0]),len(conf_list[0]) 110 | # return img_list,kpt_list,conf_list 111 | 112 | # n_class = 7 113 | # superpixel_smooth = False 114 | # # valfile = 
'../../ln_data/LIP/TrainVal_pose_annotations/lip_val_set.csv' 115 | # # pred_folder = '../../../git_code/LIP_JPPNet/output/parsing/val/' 116 | # # pred_folder = '../visulizations/refinenet_baseline/test_out/' 117 | # pred_folder = '../visulizations/refinenet_splittask/test_out/' 118 | # gt_folder = '../../ln_data/pascal_data/SegmentationPart/' 119 | # img_path = '../../ln_data/pascal_data/JPEGImages/' 120 | 121 | # file = '../../ln_data/pascal_data/val_id.txt' 122 | # missjoints = '../../ln_data/pascal_data/no_joint_list.txt' 123 | # img_list = [x.strip().split(' ')[0] for x in open(file)] 124 | # miss_list = [x.strip().split(' ')[0] for x in open(missjoints)] 125 | 126 | # conf_matrices = AverageMeter() 127 | # for index in range(len(img_list)): 128 | # img_name = img_list[index] 129 | # if img_name in miss_list: 130 | # continue 131 | # if not os.path.isfile(pred_folder + img_name + '.png'): 132 | # continue 133 | # pred_file = pred_folder + img_name + '.png' 134 | # pred = Image.open(pred_file) 135 | # gt_file = gt_folder + img_name + '.png' 136 | # gt = Image.open(gt_file) 137 | # pred, gt = np.array(pred, dtype=np.int32), np.array(gt, dtype=np.int32) 138 | # if superpixel_smooth: 139 | # img_file = img_path+img_name+'.jpg' 140 | # img = Image.open(img_file) 141 | # pred = superpixel_expand(np.array(img),pred) 142 | # confusion, _ = label_confusion_matrix(gt, pred, n_class) 143 | # conf_matrices.update(confusion,1) 144 | # acc, acc_cls, mean_iu, fwavacc, iu = hist_based_accu_cal(conf_matrices.avg) 145 | # print(acc, acc_cls, mean_iu, fwavacc) 146 | # print(iu) 147 | 148 | # ## SAVE CONFUSION MATRIX 149 | # figure=plt.figure() 150 | # class_name=['bg', 'head', 'torso', 'upper arm', 'lower arm', 'upper leg', 'lower leg'] 151 | # conf_matrices = conf_matrices.avg 152 | # for i in range(n_class): 153 | # conf_matrices[i,:]=(conf_matrices[i,:]+eps)/sum(conf_matrices[i,:]+eps) 154 | # plot_confusion_matrix(conf_matrices, classes=class_name, 155 | # rotation=0, include_text=True, 156 | # title='Confusion matrix, without normalization') 157 | # plt.show() 158 | # plt.savefig('../saved_models/Baseline_refinenet_test.jpg') 159 | # plt.close('all') 160 | -------------------------------------------------------------------------------- /model/convlstm.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from torch.autograd import Variable 3 | import torch 4 | 5 | """ 6 | https://github.com/ndrplz/ConvLSTM_pytorch 7 | """ 8 | 9 | class ConvLSTMCell(nn.Module): 10 | 11 | def __init__(self, input_size, input_dim, hidden_dim, kernel_size, bias): 12 | """ 13 | Initialize ConvLSTM cell. 14 | 15 | Parameters 16 | ---------- 17 | input_size: (int, int) 18 | Height and width of input tensor as (height, width). 19 | input_dim: int 20 | Number of channels of input tensor. 21 | hidden_dim: int 22 | Number of channels of hidden state. 23 | kernel_size: (int, int) 24 | Size of the convolutional kernel. 25 | bias: bool 26 | Whether or not to add the bias. 
27 | """ 28 | 29 | super(ConvLSTMCell, self).__init__() 30 | 31 | self.height, self.width = input_size 32 | self.input_dim = input_dim 33 | self.hidden_dim = hidden_dim 34 | 35 | self.kernel_size = kernel_size 36 | self.padding = kernel_size[0] // 2, kernel_size[1] // 2 37 | self.bias = bias 38 | 39 | self.conv = nn.Conv2d(in_channels=self.input_dim + self.hidden_dim, 40 | out_channels=4 * self.hidden_dim, 41 | kernel_size=self.kernel_size, 42 | padding=self.padding, 43 | bias=self.bias) 44 | 45 | def forward(self, input_tensor, cur_state): 46 | 47 | h_cur, c_cur = cur_state 48 | 49 | combined = torch.cat([input_tensor, h_cur], dim=1) # concatenate along channel axis 50 | 51 | combined_conv = self.conv(combined) 52 | cc_i, cc_f, cc_o, cc_g = torch.split(combined_conv, self.hidden_dim, dim=1) 53 | i = torch.sigmoid(cc_i) 54 | f = torch.sigmoid(cc_f) 55 | o = torch.sigmoid(cc_o) 56 | g = torch.tanh(cc_g) 57 | 58 | c_next = f * c_cur + i * g 59 | h_next = o * torch.tanh(c_next) 60 | 61 | return h_next, c_next 62 | 63 | def init_hidden(self, batch_size): 64 | return (Variable(torch.zeros(batch_size, self.hidden_dim, self.height, self.width)).cuda(), 65 | Variable(torch.zeros(batch_size, self.hidden_dim, self.height, self.width)).cuda()) 66 | 67 | 68 | class ConvLSTM(nn.Module): 69 | 70 | def __init__(self, input_size, input_dim, hidden_dim, kernel_size, num_layers, 71 | batch_first=False, bias=True, return_all_layers=False): 72 | super(ConvLSTM, self).__init__() 73 | 74 | self._check_kernel_size_consistency(kernel_size) 75 | 76 | # Make sure that both `kernel_size` and `hidden_dim` are lists having len == num_layers 77 | kernel_size = self._extend_for_multilayer(kernel_size, num_layers) 78 | hidden_dim = self._extend_for_multilayer(hidden_dim, num_layers) 79 | if not len(kernel_size) == len(hidden_dim) == num_layers: 80 | raise ValueError('Inconsistent list length.') 81 | 82 | self.height, self.width = input_size 83 | 84 | self.input_dim = input_dim 85 | self.hidden_dim = hidden_dim 86 | self.kernel_size = kernel_size 87 | self.num_layers = num_layers 88 | self.batch_first = batch_first 89 | self.bias = bias 90 | self.return_all_layers = return_all_layers 91 | 92 | cell_list = [] 93 | for i in range(0, self.num_layers): 94 | cur_input_dim = self.input_dim if i == 0 else self.hidden_dim[i-1] 95 | 96 | cell_list.append(ConvLSTMCell(input_size=(self.height, self.width), 97 | input_dim=cur_input_dim, 98 | hidden_dim=self.hidden_dim[i], 99 | kernel_size=self.kernel_size[i], 100 | bias=self.bias)) 101 | 102 | self.cell_list = nn.ModuleList(cell_list) 103 | 104 | def forward(self, input_tensor, hidden_state=None): 105 | """ 106 | 107 | Parameters 108 | ---------- 109 | input_tensor: todo 110 | 5-D Tensor either of shape (t, b, c, h, w) or (b, t, c, h, w) 111 | hidden_state: todo 112 | None. 
todo implement stateful 113 | 114 | Returns 115 | ------- 116 | last_state_list, layer_output 117 | """ 118 | if not self.batch_first: 119 | # (t, b, c, h, w) -> (b, t, c, h, w) 120 | input_tensor = input_tensor.permute(1, 0, 2, 3, 4) 121 | 122 | # Implement stateful ConvLSTM 123 | if hidden_state is not None: 124 | raise NotImplementedError() 125 | else: 126 | hidden_state = self._init_hidden(batch_size=input_tensor.size(0)) 127 | 128 | layer_output_list = [] 129 | last_state_list = [] 130 | 131 | seq_len = input_tensor.size(1) 132 | cur_layer_input = input_tensor 133 | 134 | for layer_idx in range(self.num_layers): 135 | 136 | h, c = hidden_state[layer_idx] 137 | output_inner = [] 138 | for t in range(seq_len): 139 | 140 | h, c = self.cell_list[layer_idx](input_tensor=cur_layer_input[:, t, :, :, :], 141 | cur_state=[h, c]) 142 | output_inner.append(h) 143 | 144 | layer_output = torch.stack(output_inner, dim=1) 145 | cur_layer_input = layer_output 146 | 147 | layer_output_list.append(layer_output) 148 | last_state_list.append([h, c]) 149 | 150 | if not self.return_all_layers: 151 | layer_output_list = layer_output_list[-1:] 152 | last_state_list = last_state_list[-1:] 153 | 154 | return layer_output_list, last_state_list 155 | 156 | def _init_hidden(self, batch_size): 157 | init_states = [] 158 | for i in range(self.num_layers): 159 | init_states.append(self.cell_list[i].init_hidden(batch_size)) 160 | return init_states 161 | 162 | @staticmethod 163 | def _check_kernel_size_consistency(kernel_size): 164 | if not (isinstance(kernel_size, tuple) or 165 | (isinstance(kernel_size, list) and all([isinstance(elem, tuple) for elem in kernel_size]))): 166 | raise ValueError('`kernel_size` must be tuple or list of tuples') 167 | 168 | @staticmethod 169 | def _extend_for_multilayer(param, num_layers): 170 | if not isinstance(param, list): 171 | param = [param] * num_layers 172 | return param -------------------------------------------------------------------------------- /model/loss.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | from model.modulation import mask_softmax 6 | from utils.utils import bbox_iou 7 | import math 8 | from torchvision.ops.boxes import box_area 9 | def lr_poly(base_lr, iter, max_iter, power): 10 | return base_lr * ((1 - float(iter) / max_iter) ** (power)) 11 | 12 | def lr_cos(base_lr, iter, max_iter, warm_up=0.05): 13 | warm_up_epoch = int(max_iter*warm_up) 14 | if iter<=warm_up_epoch: 15 | lr = base_lr*(0.8*iter/warm_up_epoch+0.2) 16 | else: 17 | lr = 0.5*base_lr*(1+math.cos(math.pi*(iter-warm_up_epoch)/(max_iter-warm_up_epoch))) 18 | return lr 19 | 20 | def adjust_learning_rate(args, optimizer, i_iter): 21 | # print(optimizer.param_groups[0]['lr'], optimizer.param_groups[1]['lr']) 22 | if args.power==-1: 23 | lr = lr_cos(args.lr, i_iter, args.nb_epoch) 24 | elif args.power==-2: 25 | lr = args.lr*((0.5)**(i_iter//10)) 26 | elif args.power==-3: 27 | lr = args.lr*((0.5)**(i_iter//30)) 28 | elif args.power!=0.: 29 | lr = lr_poly(args.lr, i_iter, args.nb_epoch, args.power) 30 | else: 31 | # lr = args.lr*((0.1)**(i_iter//(args.nb_epoch//4))) 32 | lr = args.lr*((0.5)**(i_iter//(args.nb_epoch//10))) 33 | print(lr) 34 | optimizer.param_groups[0]['lr'] = lr 35 | if len(optimizer.param_groups) > 1: 36 | optimizer.param_groups[1]['lr'] = lr / 10 37 | if len(optimizer.param_groups) > 2: 38 | optimizer.param_groups[2]['lr'] = lr / 
10 39 | 40 | def yolo_loss(input, target, gi, gj, best_n_list, w_coord=5., w_neg=1./5, size_average=True): 41 | mseloss = torch.nn.MSELoss(size_average=True) 42 | celoss = torch.nn.CrossEntropyLoss(size_average=True) 43 | batch = input.size(0) 44 | 45 | pred_bbox = Variable(torch.zeros(batch,4).cuda()) 46 | gt_bbox = Variable(torch.zeros(batch,4).cuda()) 47 | for ii in range(batch): 48 | pred_bbox[ii, 0:2] = F.sigmoid(input[ii,best_n_list[ii],0:2,gj[ii],gi[ii]]) 49 | pred_bbox[ii, 2:4] = input[ii,best_n_list[ii],2:4,gj[ii],gi[ii]] 50 | gt_bbox[ii, :] = target[ii,best_n_list[ii],:4,gj[ii],gi[ii]] 51 | loss_x = mseloss(pred_bbox[:,0], gt_bbox[:,0]) 52 | loss_y = mseloss(pred_bbox[:,1], gt_bbox[:,1]) 53 | loss_w = mseloss(pred_bbox[:,2], gt_bbox[:,2]) 54 | loss_h = mseloss(pred_bbox[:,3], gt_bbox[:,3]) 55 | 56 | pred_conf_list, gt_conf_list = [], [] 57 | pred_conf_list.append(input[:,:,4,:,:].contiguous().view(batch,-1)) 58 | gt_conf_list.append(target[:,:,4,:,:].contiguous().view(batch,-1)) 59 | pred_conf = torch.cat(pred_conf_list, dim=1) 60 | gt_conf = torch.cat(gt_conf_list, dim=1) 61 | loss_conf = celoss(pred_conf, gt_conf.max(1)[1]) 62 | 63 | 64 | return (loss_x+loss_y+loss_w+loss_h)*w_coord + loss_conf 65 | 66 | def generalized_box_iou(boxes1, boxes2): 67 | """ 68 | Generalized IoU from https://giou.stanford.edu/ 69 | 70 | The boxes should be in [x0, y0, x1, y1] format 71 | 72 | Returns a [N, M] pairwise matrix, where N = len(boxes1) 73 | and M = len(boxes2) 74 | """ 75 | # degenerate boxes gives inf / nan results 76 | # so do an early check 77 | if (boxes1[:, 2:] >= boxes1[:, :2]).all() and (boxes2[:, 2:] >= boxes2[:, :2]).all(): 78 | 79 | iou, union = box_iou(boxes1, boxes2) 80 | 81 | lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) 82 | rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) 83 | 84 | wh = (rb - lt).clamp(min=0) # [N,M,2] 85 | area = wh[:, :, 0] * wh[:, :, 1] 86 | 87 | return iou - (area - union) / (area) 88 | else: 89 | return torch.tensor([0.]) 90 | def box_iou(boxes1, boxes2): 91 | area1 = box_area(boxes1) 92 | area2 = box_area(boxes2) 93 | 94 | lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] 95 | rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] 96 | 97 | wh = (rb - lt).clamp(min=0) # [N,M,2] 98 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] 99 | 100 | union = area1[:, None] + area2 - inter 101 | 102 | iou = inter / union 103 | return iou, union 104 | def diverse_loss(score_list, word_mask, m=-1, coverage_reg=True): 105 | score_matrix = torch.stack([mask_softmax(score,word_mask) for score in score_list], dim=1) ## (B,Nfilm,N,H,W) 106 | cov_matrix = torch.bmm(score_matrix,score_matrix.permute(0,2,1)) ## (BHW,Nfilm,Nfilm) 107 | id_matrix = Variable(torch.eye(cov_matrix.shape[1]).unsqueeze(0).repeat(cov_matrix.shape[0],1,1).cuda()) 108 | if m==-1.: 109 | div_reg = torch.sum(((cov_matrix*(1-id_matrix))**2).view(-1))/cov_matrix.shape[0] 110 | else: 111 | div_reg = torch.sum(((cov_matrix-m*id_matrix)**2).view(-1))/cov_matrix.shape[0] 112 | if coverage_reg: 113 | word_mask_cp = word_mask.clone() 114 | for ii in range(word_mask_cp.shape[0]): 115 | word_mask_cp[ii,0]=0 116 | word_mask_cp[ii,word_mask_cp[ii,:].sum()]=0 ## set one to 0 already 117 | cover_matrix = 1.-torch.clamp(torch.sum(score_matrix, dim=1, keepdim=False),min=0.,max=1.) 
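# Coverage term: cover_matrix is close to 1 for words that no modulation stage
# attends to, so masking it with word_mask_cp below penalises only the real
# (non-padding) query words that every stage ignored.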
118 | cover_reg = torch.sum((cover_matrix*word_mask_cp.float()).view(-1))/cov_matrix.shape[0] 119 | div_reg += cover_reg 120 | return div_reg 121 | 122 | def build_target(raw_coord, pred, anchors_full, args): 123 | coord = Variable(torch.zeros(raw_coord.size(0), raw_coord.size(1)).cuda()) 124 | batch, grid = raw_coord.size(0), args.size//args.gsize 125 | coord[:,0] = (raw_coord[:,0] + raw_coord[:,2])/(2*args.size) 126 | coord[:,1] = (raw_coord[:,1] + raw_coord[:,3])/(2*args.size) 127 | coord[:,2] = (raw_coord[:,2] - raw_coord[:,0])/(args.size) 128 | coord[:,3] = (raw_coord[:,3] - raw_coord[:,1])/(args.size) 129 | coord = coord * grid 130 | bbox=torch.zeros(coord.size(0),9,5,grid, grid) 131 | best_n_list, best_gi, best_gj = [],[],[] 132 | 133 | for ii in range(batch): 134 | batch, grid = raw_coord.size(0), args.size//args.gsize 135 | gi = coord[ii,0].long() 136 | gj = coord[ii,1].long() 137 | tx = coord[ii,0] - gi.float() 138 | ty = coord[ii,1] - gj.float() 139 | 140 | gw = coord[ii,2] 141 | gh = coord[ii,3] 142 | 143 | anchor_idxs = range(9) 144 | anchors = [anchors_full[i] for i in anchor_idxs] 145 | scaled_anchors = [ (x[0] / (args.anchor_imsize/grid), \ 146 | x[1] / (args.anchor_imsize/grid)) for x in anchors] 147 | 148 | ## Get shape of gt box 149 | gt_box = torch.FloatTensor(np.array([0, 0, gw.cpu(), gh.cpu()],dtype=np.float32)).unsqueeze(0) 150 | ## Get shape of anchor box 151 | anchor_shapes = torch.FloatTensor(np.concatenate((np.zeros((len(scaled_anchors), 2)), np.array(scaled_anchors)), 1)) 152 | ## Calculate iou between gt and anchor shapes 153 | # anch_ious = list(bbox_iou(gt_box, anchor_shapes)) 154 | anch_ious = list(bbox_iou(gt_box, anchor_shapes,x1y1x2y2=False)) 155 | ## Find the best matching anchor box 156 | best_n = np.argmax(np.array(anch_ious)) 157 | 158 | tw = torch.log(gw / scaled_anchors[best_n][0] + 1e-16) 159 | th = torch.log(gh / scaled_anchors[best_n][1] + 1e-16) 160 | 161 | bbox[ii, best_n, :, gj, gi] = torch.stack([tx, ty, tw, th, torch.ones(1).cuda().squeeze()]) 162 | best_n_list.append(int(best_n)) 163 | best_gi.append(gi) 164 | best_gj.append(gj) 165 | 166 | bbox = Variable(bbox.cuda()) 167 | return bbox, best_gi, best_gj, best_n_list 168 | -------------------------------------------------------------------------------- /utils/transforms.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Generic Image Transform utillities. 5 | """ 6 | import torch 7 | import cv2 8 | import random, math 9 | import numpy as np 10 | from collections import Iterable 11 | 12 | import torch.nn.functional as F 13 | from torch.autograd import Variable 14 | 15 | 16 | class ResizePad: 17 | """ 18 | Resize and pad an image to given size. 
19 | """ 20 | 21 | def __init__(self, size): 22 | if not isinstance(size, (int, Iterable)): 23 | raise TypeError('Got inappropriate size arg: {}'.format(size)) 24 | 25 | self.h, self.w = size 26 | 27 | def __call__(self, img): 28 | h, w = img.shape[:2] 29 | scale = min(self.h / h, self.w / w) 30 | resized_h = int(np.round(h * scale)) 31 | resized_w = int(np.round(w * scale)) 32 | pad_h = int(np.floor(self.h - resized_h) / 2) 33 | pad_w = int(np.floor(self.w - resized_w) / 2) 34 | 35 | resized_img = cv2.resize(img, (resized_w, resized_h)) 36 | 37 | # if img.ndim > 2: 38 | if img.ndim > 2: 39 | new_img = np.zeros( 40 | (self.h, self.w, img.shape[-1]), dtype=resized_img.dtype) 41 | else: 42 | resized_img = np.expand_dims(resized_img, -1) 43 | new_img = np.zeros((self.h, self.w, 1), dtype=resized_img.dtype) 44 | new_img[pad_h: pad_h + resized_h, 45 | pad_w: pad_w + resized_w, ...] = resized_img 46 | return new_img 47 | 48 | 49 | class CropResize: 50 | """Remove padding and resize image to its original size.""" 51 | 52 | def __call__(self, img, size): 53 | if not isinstance(size, (int, Iterable)): 54 | raise TypeError('Got inappropriate size arg: {}'.format(size)) 55 | im_h, im_w = img.data.shape[:2] 56 | input_h, input_w = size 57 | scale = max(input_h / im_h, input_w / im_w) 58 | # scale = torch.Tensor([[input_h / im_h, input_w / im_w]]).max() 59 | resized_h = int(np.round(im_h * scale)) 60 | # resized_h = torch.round(im_h * scale) 61 | resized_w = int(np.round(im_w * scale)) 62 | # resized_w = torch.round(im_w * scale) 63 | crop_h = int(np.floor(resized_h - input_h) / 2) 64 | # crop_h = torch.floor(resized_h - input_h) // 2 65 | crop_w = int(np.floor(resized_w - input_w) / 2) 66 | # crop_w = torch.floor(resized_w - input_w) // 2 67 | # resized_img = cv2.resize(img, (resized_w, resized_h)) 68 | resized_img = F.upsample( 69 | img.unsqueeze(0).unsqueeze(0), size=(resized_h, resized_w), 70 | mode='bilinear') 71 | 72 | resized_img = resized_img.squeeze().unsqueeze(0) 73 | 74 | return resized_img[0, crop_h: crop_h + input_h, 75 | crop_w: crop_w + input_w] 76 | 77 | 78 | class ResizeImage: 79 | """Resize the largest of the sides of the image to a given size""" 80 | def __init__(self, size): 81 | if not isinstance(size, (int, Iterable)): 82 | raise TypeError('Got inappropriate size arg: {}'.format(size)) 83 | 84 | self.size = size 85 | 86 | def __call__(self, img): 87 | im_h, im_w = img.shape[-2:] 88 | scale = min(self.size / im_h, self.size / im_w) 89 | resized_h = int(np.round(im_h * scale)) 90 | resized_w = int(np.round(im_w * scale)) 91 | out = F.upsample( 92 | Variable(img).unsqueeze(0), size=(resized_h, resized_w), 93 | mode='bilinear').squeeze().data 94 | return out 95 | 96 | 97 | class ResizeAnnotation: 98 | """Resize the largest of the sides of the annotation to a given size""" 99 | def __init__(self, size): 100 | if not isinstance(size, (int, Iterable)): 101 | raise TypeError('Got inappropriate size arg: {}'.format(size)) 102 | 103 | self.size = size 104 | 105 | def __call__(self, img): 106 | im_h, im_w = img.shape[-2:] 107 | scale = min(self.size / im_h, self.size / im_w) 108 | resized_h = int(np.round(im_h * scale)) 109 | resized_w = int(np.round(im_w * scale)) 110 | out = F.upsample( 111 | Variable(img).unsqueeze(0).unsqueeze(0), 112 | size=(resized_h, resized_w), 113 | mode='bilinear').squeeze().data 114 | return out 115 | 116 | 117 | class ToNumpy: 118 | """Transform an torch.*Tensor to an numpy ndarray.""" 119 | 120 | def __call__(self, x): 121 | return x.numpy() 122 | 123 | def 
letterbox(img, mask, height, color=(123.7, 116.3, 103.5)): # resize a rectangular image to a padded square 124 | shape = img.shape[:2] # shape = [height, width] 125 | #print(shape) 126 | ratio = float(height) / max(shape) # ratio = old / new 127 | new_shape = (round(shape[1] * ratio), round(shape[0] * ratio)) 128 | dw = (height - new_shape[0]) / 2 # width padding 129 | dh = (height - new_shape[1]) / 2 # height padding 130 | top, bottom = round(dh - 0.1), round(dh + 0.1) 131 | left, right = round(dw - 0.1), round(dw + 0.1) 132 | img = cv2.resize(img, new_shape, interpolation=cv2.INTER_AREA) # resized, no border 133 | img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # padded square 134 | if mask is not None: 135 | mask = cv2.resize(mask, new_shape, interpolation=cv2.INTER_NEAREST) # resized, no border 136 | mask = cv2.copyMakeBorder(mask, top, bottom, left, right, cv2.BORDER_CONSTANT, value=0) # padded square 137 | return img, mask, ratio, dw, dh 138 | 139 | def random_affine(img, mask, targets, degrees=(-10, 10), translate=(.1, .1), scale=(.9, 1.1), shear=(-2, 2), 140 | borderValue=(123.7, 116.3, 103.5), all_bbox=None): 141 | border = 0 # width of added border (optional) 142 | height = max(img.shape[0], img.shape[1]) + border * 2 143 | # Rotation and Scale 144 | R = np.eye(3) 145 | Rht = np.eye(3) 146 | a = random.random() * (degrees[1] - degrees[0]) + degrees[0] 147 | # a += random.choice([-180, -90, 0, 90]) # 90deg rotations added to small rotations 148 | s = random.random() * (scale[1] - scale[0]) + scale[0] 149 | R[:2] = cv2.getRotationMatrix2D(angle=a, center=(img.shape[1] / 2, img.shape[0] / 2), scale=s) 150 | # Translation 151 | T = np.eye(3) 152 | r1 = random.random() 153 | r2 = random.random() 154 | T[0, 2] = (r1 * 2 - 1) * translate[0] * img.shape[0] + border # x translation (pixels) 155 | T[1, 2] = (r2 * 2 - 1) * translate[1] * img.shape[1] + border # y translation (pixels) 156 | 157 | # Shear 158 | S = np.eye(3) 159 | S[0, 1] = math.tan((random.random() * (shear[1] - shear[0]) + shear[0]) * math.pi / 180) # x shear (deg) 160 | S[1, 0] = math.tan((random.random() * (shear[1] - shear[0]) + shear[0]) * math.pi / 180) # y shear (deg) 161 | 162 | M = S @ T @ R # Combined rotation matrix. ORDER IS IMPORTANT HERE!! 
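# --- Illustrative aside (hedged sketch; names with the _ex suffix are hypothetical) ---
# M composes shear, translation, and rotation/scale as 3x3 homogeneous matrices;
# wrap_points() further down applies it to box corners as row vectors via `xy @ M.T`.
# The tiny standalone example below shows that convention with arbitrary example
# numbers and reuses the cv2/math/numpy imports already at the top of this module.
ex_angle, ex_scale = 5.0, 1.05                        # example rotation (deg) and scale
R_ex = np.eye(3)
R_ex[:2] = cv2.getRotationMatrix2D(angle=ex_angle, center=(256, 256), scale=ex_scale)
T_ex = np.eye(3)
T_ex[0, 2], T_ex[1, 2] = 10.0, -8.0                   # example pixel translation
S_ex = np.eye(3)
S_ex[0, 1] = math.tan(math.radians(2.0))              # example x shear
M_ex = S_ex @ T_ex @ R_ex                             # same composition order as above

box_ex = np.array([56, 36, 108, 210], dtype=np.float64)            # x1, y1, x2, y2
corners_ex = np.ones((4, 3))
corners_ex[:, :2] = box_ex[[0, 1, 2, 3, 0, 3, 2, 1]].reshape(4, 2)  # the four corners
warped_ex = (corners_ex @ M_ex.T)[:, :2]                            # warped corner coordinates
box_warped_ex = np.concatenate([warped_ex.min(0), warped_ex.max(0)])  # axis-aligned hull
# --- end of illustrative aside ---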
163 | imw = cv2.warpPerspective(img, M, dsize=(height, height), flags=cv2.INTER_LINEAR, 164 | borderValue=borderValue) # BGR order borderValue 165 | 166 | if mask is not None: 167 | maskw = cv2.warpPerspective(mask, M, dsize=(height, height), flags=cv2.INTER_NEAREST, 168 | borderValue=0) # BGR order borderValue 169 | else: 170 | maskw = None 171 | 172 | # Return warped points also 173 | if type(targets)==type([1]): 174 | targetlist=[] 175 | for bbox in targets: 176 | targetlist.append(wrap_points(bbox, M, height, a)) 177 | return imw, maskw, targetlist, M 178 | elif all_bbox is not None: 179 | targets = wrap_points(targets, M, height, a) 180 | for ii in range(all_bbox.shape[0]): 181 | all_bbox[ii,:] = wrap_points(all_bbox[ii,:], M, height, a) 182 | return imw, maskw, targets, all_bbox, M 183 | elif targets is not None: ## previous main 184 | targets = wrap_points(targets, M, height, a) 185 | return imw, maskw, targets, M 186 | else: 187 | return imw 188 | 189 | def wrap_points(targets, M, height, a): 190 | # n = targets.shape[0] 191 | # points = targets[:, 1:5].copy() 192 | points = targets.copy() 193 | # area0 = (points[:, 2] - points[:, 0]) * (points[:, 3] - points[:, 1]) 194 | area0 = (points[2] - points[0]) * (points[3] - points[1]) 195 | 196 | # warp points 197 | xy = np.ones((4, 3)) 198 | xy[:, :2] = points[[0, 1, 2, 3, 0, 3, 2, 1]].reshape(4, 2) # x1y1, x2y2, x1y2, x2y1 199 | xy = (xy @ M.T)[:, :2].reshape(1, 8) 200 | 201 | # create new boxes 202 | x = xy[:, [0, 2, 4, 6]] 203 | y = xy[:, [1, 3, 5, 7]] 204 | xy = np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, 1).T 205 | 206 | # apply angle-based reduction 207 | radians = a * math.pi / 180 208 | reduction = max(abs(math.sin(radians)), abs(math.cos(radians))) ** 0.5 209 | x = (xy[:, 2] + xy[:, 0]) / 2 210 | y = (xy[:, 3] + xy[:, 1]) / 2 211 | w = (xy[:, 2] - xy[:, 0]) * reduction 212 | h = (xy[:, 3] - xy[:, 1]) * reduction 213 | xy = np.concatenate((x - w / 2, y - h / 2, x + w / 2, y + h / 2)).reshape(4, 1).T 214 | 215 | # reject warped points outside of image 216 | np.clip(xy, 0, height, out=xy) 217 | w = xy[:, 2] - xy[:, 0] 218 | h = xy[:, 3] - xy[:, 1] 219 | area = w * h 220 | ar = np.maximum(w / (h + 1e-16), h / (w + 1e-16)) 221 | i = (w > 4) & (h > 4) & (area / (area0 + 1e-16) > 0.1) & (ar < 10) 222 | 223 | ## print(targets, xy) 224 | ## [ 56 36 108 210] [[ 47.80464857 15.6096533 106.30993434 196.71267693]] 225 | # targets = targets[i] 226 | # targets[:, 1:5] = xy[i] 227 | targets = xy[0] 228 | return targets -------------------------------------------------------------------------------- /utils/transformsv2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Generic Image Transform utillities. 5 | """ 6 | 7 | import cv2 8 | import random, math 9 | import numpy as np 10 | from collections import Iterable 11 | 12 | import torch.nn.functional as F 13 | from torch.autograd import Variable 14 | 15 | 16 | class ResizePad: 17 | """ 18 | Resize and pad an image to given size. 
19 | """ 20 | 21 | def __init__(self, size): 22 | if not isinstance(size, (int, Iterable)): 23 | raise TypeError('Got inappropriate size arg: {}'.format(size)) 24 | 25 | self.h, self.w = size 26 | 27 | def __call__(self, img): 28 | h, w = img.shape[:2] 29 | scale = min(self.h / h, self.w / w) 30 | resized_h = int(np.round(h * scale)) 31 | resized_w = int(np.round(w * scale)) 32 | pad_h = int(np.floor(self.h - resized_h) / 2) 33 | pad_w = int(np.floor(self.w - resized_w) / 2) 34 | 35 | resized_img = cv2.resize(img, (resized_w, resized_h)) 36 | 37 | # if img.ndim > 2: 38 | if img.ndim > 2: 39 | new_img = np.zeros( 40 | (self.h, self.w, img.shape[-1]), dtype=resized_img.dtype) 41 | else: 42 | resized_img = np.expand_dims(resized_img, -1) 43 | new_img = np.zeros((self.h, self.w, 1), dtype=resized_img.dtype) 44 | new_img[pad_h: pad_h + resized_h, 45 | pad_w: pad_w + resized_w, ...] = resized_img 46 | return new_img 47 | 48 | 49 | class CropResize: 50 | """Remove padding and resize image to its original size.""" 51 | 52 | def __call__(self, img, size): 53 | if not isinstance(size, (int, Iterable)): 54 | raise TypeError('Got inappropriate size arg: {}'.format(size)) 55 | im_h, im_w = img.data.shape[:2] 56 | input_h, input_w = size 57 | scale = max(input_h / im_h, input_w / im_w) 58 | # scale = torch.Tensor([[input_h / im_h, input_w / im_w]]).max() 59 | resized_h = int(np.round(im_h * scale)) 60 | # resized_h = torch.round(im_h * scale) 61 | resized_w = int(np.round(im_w * scale)) 62 | # resized_w = torch.round(im_w * scale) 63 | crop_h = int(np.floor(resized_h - input_h) / 2) 64 | # crop_h = torch.floor(resized_h - input_h) // 2 65 | crop_w = int(np.floor(resized_w - input_w) / 2) 66 | # crop_w = torch.floor(resized_w - input_w) // 2 67 | # resized_img = cv2.resize(img, (resized_w, resized_h)) 68 | resized_img = F.upsample( 69 | img.unsqueeze(0).unsqueeze(0), size=(resized_h, resized_w), 70 | mode='bilinear') 71 | 72 | resized_img = resized_img.squeeze().unsqueeze(0) 73 | 74 | return resized_img[0, crop_h: crop_h + input_h, 75 | crop_w: crop_w + input_w] 76 | 77 | 78 | class ResizeImage: 79 | """Resize the largest of the sides of the image to a given size""" 80 | def __init__(self, size): 81 | if not isinstance(size, (int, Iterable)): 82 | raise TypeError('Got inappropriate size arg: {}'.format(size)) 83 | 84 | self.size = size 85 | 86 | def __call__(self, img): 87 | im_h, im_w = img.shape[-2:] 88 | scale = min(self.size / im_h, self.size / im_w) 89 | resized_h = int(np.round(im_h * scale)) 90 | resized_w = int(np.round(im_w * scale)) 91 | out = F.upsample( 92 | Variable(img).unsqueeze(0), size=(resized_h, resized_w), 93 | mode='bilinear').squeeze().data 94 | return out 95 | 96 | 97 | class ResizeAnnotation: 98 | """Resize the largest of the sides of the annotation to a given size""" 99 | def __init__(self, size): 100 | if not isinstance(size, (int, Iterable)): 101 | raise TypeError('Got inappropriate size arg: {}'.format(size)) 102 | 103 | self.size = size 104 | 105 | def __call__(self, img): 106 | im_h, im_w = img.shape[-2:] 107 | scale = min(self.size / im_h, self.size / im_w) 108 | resized_h = int(np.round(im_h * scale)) 109 | resized_w = int(np.round(im_w * scale)) 110 | out = F.upsample( 111 | Variable(img).unsqueeze(0).unsqueeze(0), 112 | size=(resized_h, resized_w), 113 | mode='bilinear').squeeze().data 114 | return out 115 | 116 | 117 | class ToNumpy: 118 | """Transform an torch.*Tensor to an numpy ndarray.""" 119 | 120 | def __call__(self, x): 121 | return x.numpy() 122 | 123 | def 
letterbox(img, mask, height, color=(123.7, 116.3, 103.5)): # resize a rectangular image to a padded square 124 | shape = img.shape[:2] # shape = [height, width] 125 | ratio = float(height) / max(shape) # ratio = old / new 126 | new_shape = (round(shape[1] * ratio), round(shape[0] * ratio)) 127 | dw = (height - new_shape[0]) / 2 # width padding 128 | dh = (height - new_shape[1]) / 2 # height padding 129 | top, bottom = round(dh - 0.1), round(dh + 0.1) 130 | left, right = round(dw - 0.1), round(dw + 0.1) 131 | img = cv2.resize(img, new_shape, interpolation=cv2.INTER_AREA) # resized, no border 132 | img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # padded square 133 | if mask is not None: 134 | mask = cv2.resize(mask, new_shape, interpolation=cv2.INTER_NEAREST) # resized, no border 135 | mask = cv2.copyMakeBorder(mask, top, bottom, left, right, cv2.BORDER_CONSTANT, value=255) # padded square 136 | return img, mask, ratio, dw, dh 137 | 138 | def random_affine(img, mask, targets, degrees=(-10, 10), translate=(.1, .1), scale=(.9, 1.1), shear=(-2, 2), 139 | borderValue=(123.7, 116.3, 103.5), all_bbox=None): 140 | border = 0 # width of added border (optional) 141 | height = max(img.shape[0], img.shape[1]) + border * 2 142 | heightht = max(ht.shape[0], ht.shape[1]) + border * 2 143 | # Rotation and Scale 144 | R = np.eye(3) 145 | Rht = np.eye(3) 146 | a = random.random() * (degrees[1] - degrees[0]) + degrees[0] 147 | # a += random.choice([-180, -90, 0, 90]) # 90deg rotations added to small rotations 148 | s = random.random() * (scale[1] - scale[0]) + scale[0] 149 | R[:2] = cv2.getRotationMatrix2D(angle=a, center=(img.shape[1] / 2, img.shape[0] / 2), scale=s) 150 | Rht[:2] = cv2.getRotationMatrix2D(angle=a, center=(ht.shape[1] / 2, ht.shape[0] / 2), scale=s) 151 | # Translation 152 | T = np.eye(3) 153 | r1 = random.random() 154 | r2 = random.random() 155 | T[0, 2] = (r1 * 2 - 1) * translate[0] * img.shape[0] + border # x translation (pixels) 156 | T[1, 2] = (r2 * 2 - 1) * translate[1] * img.shape[1] + border # y translation (pixels) 157 | 158 | Tht = np.eye(3) 159 | Tht[0, 2] = (r1 * 2 - 1) * translate[0] * ht.shape[0] + border # x translation (pixels) 160 | Tht[1, 2] = (r2 * 2 - 1) * translate[1] * ht.shape[1] + border # y translation (pixels) 161 | # Shear 162 | S = np.eye(3) 163 | S[0, 1] = math.tan((random.random() * (shear[1] - shear[0]) + shear[0]) * math.pi / 180) # x shear (deg) 164 | S[1, 0] = math.tan((random.random() * (shear[1] - shear[0]) + shear[0]) * math.pi / 180) # y shear (deg) 165 | 166 | M = S @ T @ R # Combined rotation matrix. ORDER IS IMPORTANT HERE!! 
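# NOTE: in this transformsv2 variant, `ht` is used above (heightht, Rht) and below
# (Tht, Mht) but is not one of random_affine's parameters, so the function raises a
# NameError as written, and heightht/Mht are never returned. Presumably a paired
# heat-map image is meant to be warped with its own matrix; a hedged fix (an
# assumption, not the repository's confirmed interface) would add it to the
# signature, e.g. `def random_affine(img, mask, ht, targets, ...)`, and warp `ht`
# with Mht in the same way `img` is warped with M below.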
167 | Mht = S@Tht @ Rht 168 | imw = cv2.warpPerspective(img, M, dsize=(height, height), flags=cv2.INTER_LINEAR, 169 | borderValue=borderValue) # BGR order borderValue 170 | 171 | if mask is not None: 172 | maskw = cv2.warpPerspective(mask, M, dsize=(height, height), flags=cv2.INTER_NEAREST, 173 | borderValue=0) # BGR order borderValue 174 | else: 175 | maskw = None 176 | 177 | # Return warped points also 178 | if type(targets)==type([1]): 179 | targetlist=[] 180 | for bbox in targets: 181 | targetlist.append(wrap_points(bbox, M, height, a)) 182 | return imw, maskw, targetlist, M 183 | elif all_bbox is not None: 184 | targets = wrap_points(targets, M, height, a) 185 | for ii in range(all_bbox.shape[0]): 186 | all_bbox[ii,:] = wrap_points(all_bbox[ii,:], M, height, a) 187 | return imw, maskw, targets, all_bbox, M 188 | elif targets is not None: ## previous main 189 | targets = wrap_points(targets, M, height, a) 190 | return imw, maskw, targets, M 191 | else: 192 | return imw 193 | 194 | def wrap_points(targets, M, height, a): 195 | # n = targets.shape[0] 196 | # points = targets[:, 1:5].copy() 197 | points = targets.copy() 198 | # area0 = (points[:, 2] - points[:, 0]) * (points[:, 3] - points[:, 1]) 199 | area0 = (points[2] - points[0]) * (points[3] - points[1]) 200 | 201 | # warp points 202 | xy = np.ones((4, 3)) 203 | xy[:, :2] = points[[0, 1, 2, 3, 0, 3, 2, 1]].reshape(4, 2) # x1y1, x2y2, x1y2, x2y1 204 | xy = (xy @ M.T)[:, :2].reshape(1, 8) 205 | 206 | # create new boxes 207 | x = xy[:, [0, 2, 4, 6]] 208 | y = xy[:, [1, 3, 5, 7]] 209 | xy = np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, 1).T 210 | 211 | # apply angle-based reduction 212 | radians = a * math.pi / 180 213 | reduction = max(abs(math.sin(radians)), abs(math.cos(radians))) ** 0.5 214 | x = (xy[:, 2] + xy[:, 0]) / 2 215 | y = (xy[:, 3] + xy[:, 1]) / 2 216 | w = (xy[:, 2] - xy[:, 0]) * reduction 217 | h = (xy[:, 3] - xy[:, 1]) * reduction 218 | xy = np.concatenate((x - w / 2, y - h / 2, x + w / 2, y + h / 2)).reshape(4, 1).T 219 | 220 | # reject warped points outside of image 221 | np.clip(xy, 0, height, out=xy) 222 | w = xy[:, 2] - xy[:, 0] 223 | h = xy[:, 3] - xy[:, 1] 224 | area = w * h 225 | ar = np.maximum(w / (h + 1e-16), h / (w + 1e-16)) 226 | i = (w > 4) & (h > 4) & (area / (area0 + 1e-16) > 0.1) & (ar < 10) 227 | 228 | ## print(targets, xy) 229 | ## [ 56 36 108 210] [[ 47.80464857 15.6096533 106.30993434 196.71267693]] 230 | # targets = targets[i] 231 | # targets[:, 1:5] = xy[i] 232 | targets = xy[0] 233 | return targets -------------------------------------------------------------------------------- /model/yolov3.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | #batch=1 4 | #subdivisions=1 5 | # Training 6 | batch=16 7 | subdivisions=1 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | 
batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | 
batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | 
pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | [convolutional] 576 | batch_normalize=1 577 | size=3 578 | stride=1 579 | pad=1 580 | filters=1024 581 | activation=leaky 582 | 583 | [yoloconvolutional] 584 | batch_normalize=1 585 | filters=512 586 | size=1 587 | stride=1 588 | pad=1 589 | activation=leaky 590 | 591 | [convolutional] 592 | batch_normalize=1 593 | size=3 594 | stride=1 595 | pad=1 596 | filters=1024 597 | activation=leaky 598 | 599 | [convolutional] 600 | size=1 601 | stride=1 602 | pad=1 603 | filters=255 604 | activation=linear 605 | 606 | 607 | [yolo] 608 | mask = 6,7,8 609 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 610 | classes=80 611 | num=9 612 | jitter=.3 613 | ignore_thresh = .7 614 | truth_thresh = 1 615 | random=1 616 | 617 | 618 | [route] 619 | layers = -4 620 | 621 | [convolutional] 622 | batch_normalize=1 623 | filters=256 624 | size=1 625 | stride=1 626 | pad=1 627 | activation=leaky 628 | 629 | [upsample] 630 | stride=2 631 | 632 | [route] 633 | layers = -1, 61 634 | 635 | 636 | 637 | [convolutional] 638 | batch_normalize=1 639 | filters=256 640 | size=1 641 | stride=1 642 | pad=1 643 | activation=leaky 644 | 645 | [convolutional] 646 | batch_normalize=1 647 | size=3 648 | stride=1 649 | pad=1 650 | filters=512 651 | activation=leaky 652 | 653 | [convolutional] 654 | batch_normalize=1 655 | filters=256 656 | size=1 657 | stride=1 658 | pad=1 659 | activation=leaky 660 | 661 | [convolutional] 662 | batch_normalize=1 663 | size=3 664 | stride=1 665 | pad=1 666 | filters=512 667 | activation=leaky 668 | 669 | [yoloconvolutional] 670 | batch_normalize=1 671 | filters=256 672 | size=1 673 | stride=1 674 | pad=1 675 | activation=leaky 676 | 677 | [convolutional] 678 | batch_normalize=1 679 | size=3 680 | stride=1 681 | pad=1 682 | filters=512 683 | activation=leaky 684 | 685 | [convolutional] 686 | size=1 687 | stride=1 688 | pad=1 689 | filters=255 690 | activation=linear 691 | 692 | 693 | [yolo] 694 | mask = 3,4,5 695 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 696 | classes=80 697 | num=9 698 | jitter=.3 699 | ignore_thresh = .7 700 | truth_thresh = 1 701 | random=1 702 | 703 | 704 | 705 | [route] 706 | layers = -4 707 | 708 | 
[convolutional] 709 | batch_normalize=1 710 | filters=128 711 | size=1 712 | stride=1 713 | pad=1 714 | activation=leaky 715 | 716 | [upsample] 717 | stride=2 718 | 719 | [route] 720 | layers = -1, 36 721 | 722 | 723 | 724 | [convolutional] 725 | batch_normalize=1 726 | filters=128 727 | size=1 728 | stride=1 729 | pad=1 730 | activation=leaky 731 | 732 | [convolutional] 733 | batch_normalize=1 734 | size=3 735 | stride=1 736 | pad=1 737 | filters=256 738 | activation=leaky 739 | 740 | [convolutional] 741 | batch_normalize=1 742 | filters=128 743 | size=1 744 | stride=1 745 | pad=1 746 | activation=leaky 747 | 748 | [convolutional] 749 | batch_normalize=1 750 | size=3 751 | stride=1 752 | pad=1 753 | filters=256 754 | activation=leaky 755 | 756 | [yoloconvolutional] 757 | batch_normalize=1 758 | filters=128 759 | size=1 760 | stride=1 761 | pad=1 762 | activation=leaky 763 | 764 | [convolutional] 765 | batch_normalize=1 766 | size=3 767 | stride=1 768 | pad=1 769 | filters=256 770 | activation=leaky 771 | 772 | [convolutional] 773 | size=1 774 | stride=1 775 | pad=1 776 | filters=255 777 | activation=linear 778 | 779 | 780 | [yolo] 781 | mask = 0,1,2 782 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 783 | classes=80 784 | num=9 785 | jitter=.3 786 | ignore_thresh = .7 787 | truth_thresh = 1 788 | random=1 789 | -------------------------------------------------------------------------------- /dataset/data_loaderv2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | YouRefIt referring image PyTorch dataset. 5 | Define and group batches of images and queries. 6 | Based on: 7 | https://github.com/zyang-ur/ReSC/blob/master/dataset/data_loader.py 8 | """ 9 | from torchvision.transforms import Compose, ToTensor, Normalize 10 | import os 11 | import sys 12 | import cv2 13 | import json 14 | import uuid 15 | import tqdm 16 | import math 17 | import torch 18 | import random 19 | # import h5py 20 | import numpy as np 21 | import os.path as osp 22 | import scipy.io as sio 23 | import torch.utils.data as data 24 | from collections import OrderedDict 25 | sys.path.append('.') 26 | import operator 27 | import utils 28 | from utils import Corpus 29 | 30 | import argparse 31 | import collections 32 | import logging 33 | import json 34 | import re 35 | 36 | from pytorch_pretrained_bert.tokenization import BertTokenizer 37 | from pytorch_pretrained_bert.modeling import BertModel 38 | # from transformers import BertTokenizer,BertModel 39 | from utils.transforms import letterbox, random_affine 40 | 41 | sys.modules['utils'] = utils 42 | 43 | cv2.setNumThreads(0) 44 | 45 | def read_examples(input_line, unique_id): 46 | """Read a list of `InputExample`s from an input file.""" 47 | examples = [] 48 | # unique_id = 0 49 | line = input_line #reader.readline() 50 | # if not line: 51 | # break 52 | line = line.strip() 53 | text_a = None 54 | text_b = None 55 | m = re.match(r"^(.*) \|\|\| (.*)$", line) 56 | if m is None: 57 | text_a = line 58 | else: 59 | text_a = m.group(1) 60 | text_b = m.group(2) 61 | examples.append( 62 | InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b)) 63 | # unique_id += 1 64 | return examples 65 | 66 | ## Bert text encoding 67 | class InputExample(object): 68 | def __init__(self, unique_id, text_a, text_b): 69 | self.unique_id = unique_id 70 | self.text_a = text_a 71 | self.text_b = text_b 72 | 73 | class InputFeatures(object): 74 | """A single set of features of 
data.""" 75 | def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids): 76 | self.unique_id = unique_id 77 | self.tokens = tokens 78 | self.input_ids = input_ids 79 | self.input_mask = input_mask 80 | self.input_type_ids = input_type_ids 81 | 82 | def convert_examples_to_features(examples, seq_length, tokenizer): 83 | """Loads a data file into a list of `InputBatch`s.""" 84 | features = [] 85 | for (ex_index, example) in enumerate(examples): 86 | tokens_a = tokenizer.tokenize(example.text_a) 87 | 88 | tokens_b = None 89 | if example.text_b: 90 | tokens_b = tokenizer.tokenize(example.text_b) 91 | 92 | if tokens_b: 93 | # Modifies `tokens_a` and `tokens_b` in place so that the total 94 | # length is less than the specified length. 95 | # Account for [CLS], [SEP], [SEP] with "- 3" 96 | _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3) 97 | else: 98 | # Account for [CLS] and [SEP] with "- 2" 99 | if len(tokens_a) > seq_length - 2: 100 | tokens_a = tokens_a[0:(seq_length - 2)] 101 | tokens = [] 102 | input_type_ids = [] 103 | tokens.append("[CLS]") 104 | input_type_ids.append(0) 105 | for token in tokens_a: 106 | tokens.append(token) 107 | input_type_ids.append(0) 108 | tokens.append("[SEP]") 109 | input_type_ids.append(0) 110 | 111 | if tokens_b: 112 | for token in tokens_b: 113 | tokens.append(token) 114 | input_type_ids.append(1) 115 | tokens.append("[SEP]") 116 | input_type_ids.append(1) 117 | 118 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 119 | 120 | # The mask has 1 for real tokens and 0 for padding tokens. Only real 121 | # tokens are attended to. 122 | input_mask = [1] * len(input_ids) 123 | 124 | # Zero-pad up to the sequence length. 125 | while len(input_ids) < seq_length: 126 | input_ids.append(0) 127 | input_mask.append(0) 128 | input_type_ids.append(0) 129 | 130 | assert len(input_ids) == seq_length 131 | assert len(input_mask) == seq_length 132 | assert len(input_type_ids) == seq_length 133 | features.append( 134 | InputFeatures( 135 | unique_id=example.unique_id, 136 | tokens=tokens, 137 | input_ids=input_ids, 138 | input_mask=input_mask, 139 | input_type_ids=input_type_ids)) 140 | return features 141 | 142 | class DatasetNotFoundError(Exception): 143 | pass 144 | 145 | class ReferDataset(data.Dataset): 146 | SUPPORTED_DATASETS = { 147 | 'yourefit': {'splits': ('train', 'val', 'test')}, 148 | 'referit': {'splits': ('train', 'val', 'trainval', 'test')}, 149 | 'unc': { 150 | 'splits': ('train', 'val', 'trainval', 'testA', 'testB'), 151 | 'params': {'dataset': 'refcoco', 'split_by': 'unc'} 152 | }, 153 | 'unc+': { 154 | 'splits': ('train', 'val', 'trainval', 'testA', 'testB'), 155 | 'params': {'dataset': 'refcoco+', 'split_by': 'unc'} 156 | }, 157 | 'gref': { 158 | 'splits': ('train', 'val'), 159 | 'params': {'dataset': 'refcocog', 'split_by': 'google'} 160 | }, 161 | 'gref_umd': { 162 | 'splits': ('train', 'val', 'test'), 163 | 'params': {'dataset': 'refcocog', 'split_by': 'umd'} 164 | }, 165 | 'flickr': { 166 | 'splits': ('train', 'val', 'test')} 167 | } 168 | 169 | def __init__(self, data_root, split_root='data', dataset='referit', imsize=256, 170 | transform=None, augment=False, device=None, return_idx=False, testmode=False, 171 | split='train', max_query_len=128, lstm=False, bert_model='bert-base-uncased'): 172 | self.images = [] 173 | self.data_root = data_root 174 | self.split_root = split_root 175 | self.dataset = dataset 176 | self.imsize = imsize 177 | self.query_len = max_query_len 178 | self.lstm = lstm 179 | self.transform = transform 
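# Illustrative aside (hedged sketch): how the BERT helpers defined above turn a
# referring phrase into the fixed-length (word_id, word_mask) pair consumed in
# __getitem__ below. The phrase and seq_length=20 are arbitrary example values,
# not the repository's settings; the tokenizer call mirrors the one in __init__.
#
#   tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
#   examples = read_examples('the red mug on the table', unique_id=0)
#   feats = convert_examples_to_features(examples, seq_length=20, tokenizer=tokenizer)
#   word_id = feats[0].input_ids     # [CLS] tok ... [SEP] then zero padding, length 20
#   word_mask = feats[0].input_mask  # 1 for real tokens, 0 for the padding positions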
180 | self.testmode = testmode 181 | self.split = split 182 | self.device = device 183 | self.tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=True) 184 | self.augment=augment 185 | self.return_idx=return_idx 186 | 187 | if self.dataset == 'yourefit': 188 | self.dataset_root = osp.join(self.data_root, 'yourefit') 189 | self.im_dir = osp.join(self.dataset_root, 'images') 190 | elif self.dataset == 'referit': 191 | self.dataset_root = osp.join(self.data_root, 'referit') 192 | self.im_dir = osp.join(self.dataset_root, 'images') 193 | self.split_dir = osp.join(self.dataset_root, 'splits') 194 | elif self.dataset == 'flickr': 195 | self.dataset_root = osp.join(self.data_root, 'Flickr30k') 196 | self.im_dir = osp.join(self.dataset_root, 'flickr30k_images') 197 | else: ## refcoco, etc. 198 | self.dataset_root = osp.join(self.data_root, 'other') 199 | self.im_dir = osp.join( 200 | self.dataset_root, 'images', 'mscoco', 'images', 'train2014') 201 | self.split_dir = osp.join(self.dataset_root, 'splits') 202 | 203 | if not self.exists_dataset(): 204 | print('Please download index cache to data folder') 205 | exit(0) 206 | 207 | dataset_path = osp.join(self.split_root, self.dataset) 208 | valid_splits = self.SUPPORTED_DATASETS[self.dataset]['splits'] 209 | 210 | if self.lstm: 211 | self.corpus = Corpus() 212 | corpus_path = osp.join(dataset_path, 'corpus.pth') 213 | self.corpus = torch.load(corpus_path) 214 | 215 | if split not in valid_splits: 216 | raise ValueError( 217 | 'Dataset {0} does not have split {1}'.format( 218 | self.dataset, split)) 219 | 220 | splits = [split] 221 | if self.dataset != 'referit': 222 | splits = ['train', 'val'] if split == 'trainval' else [split] 223 | for split in splits: 224 | imgset_file = '{0}_{1}full.pth'.format(self.dataset, split) 225 | imgset_path = osp.join(dataset_path, imgset_file) 226 | self.images += torch.load(imgset_path) 227 | 228 | def exists_dataset(self): 229 | return osp.exists(osp.join(self.split_root, self.dataset)) 230 | 231 | 232 | def pull_item(self, idx): 233 | if self.dataset == 'flickr': 234 | img_file, bbox, phrase = self.images[idx] 235 | else: 236 | img_file, _, bbox, phrase, attri = self.images[idx] 237 | ## box format: to x1y1x2y2 238 | if not (self.dataset == 'referit' or self.dataset == 'flickr'): 239 | bbox = np.array(bbox, dtype=int) 240 | #bbox[2], bbox[3] = bbox[0]+bbox[2], bbox[1]+bbox[3] 241 | else: 242 | bbox = np.array(bbox, dtype=int) 243 | 244 | img_path = osp.join(self.im_dir, img_file) 245 | img = cv2.imread(img_path) 246 | 247 | htmapdir = self.im_dir.replace('images', 'paf') 248 | htmapfile = img_file.replace('.jpg', '_rendered.png') 249 | htmap_path = osp.join(htmapdir, htmapfile) 250 | htmap = cv2.imread(htmap_path) 251 | 252 | ht = np.asarray(htmap) 253 | ht = np.mean(ht, axis=2) 254 | ht = cv2.resize(ht, (256, 256)) 255 | 256 | ptdir = self.im_dir.replace('images', 'depimg') 257 | ptfile = img_file #.replace('.jpg', '.jpeg') 258 | pt_path = osp.join(ptdir, ptfile) 259 | pt = cv2.imread(pt_path) 260 | # print(pt.shape) 261 | # exit() 262 | # pt = cv2.resize(pt, (256,256)) 263 | # pt = np.reshape(pt, (3, 256, 256)) 264 | 265 | gestdir = self.im_dir.replace('images','gest') 266 | gestfile = img_file.replace('.jpg' , '.json') 267 | gest_path = osp.join(gestdir,gestfile) 268 | gest = json.load(open(gest_path)) 269 | 270 | ## duplicate channel if gray image 271 | if img.shape[-1] > 1: 272 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 273 | else: 274 | img = np.stack([img] * 3) 275 | 276 | return img, pt, 
ht, phrase, bbox, gest 277 | # return img, phrase, bbox, pt, ht 278 | 279 | def tokenize_phrase(self, phrase): 280 | return self.corpus.tokenize(phrase, self.query_len) 281 | 282 | def untokenize_word_vector(self, words): 283 | return self.corpus.dictionary[words] 284 | 285 | def __len__(self): 286 | return len(self.images) 287 | 288 | def __getitem__(self, idx): 289 | img, pt, ht, phrase, bbox, gest = self.pull_item(idx) 290 | # phrase = phrase.decode("utf-8").encode().lower() 291 | 292 | center_point = gest['candidate'] 293 | try: 294 | center_point = np.asarray(center_point)[:,0:2] 295 | if center_point[0,0] != 0: 296 | center_point = center_point [0,:] 297 | elif center_point[1,0] != 0: 298 | center_point = center_point [1,:] 299 | else : 300 | center_point = np.asarray([256,256]) 301 | # mask = center_point!=0 302 | # print(center_point.shape) 303 | # center_point = center_point[mask] 304 | # print(center_point.shape) 305 | # center_point = center_point [0:2,:] 306 | # center_point = np.mean(center_point , axis = 0) 307 | except IndexError: 308 | center_point = np.asarray([256,256]) 309 | 310 | phrase = phrase.lower() 311 | if self.augment: 312 | augment_flip, augment_hsv, augment_affine = True,True,True 313 | 314 | ## seems a bug in torch transformation resize, so separate in advance 315 | h,w = img.shape[0], img.shape[1] 316 | if self.augment: 317 | ## random horizontal flip 318 | if augment_flip and random.random() > 0.5: 319 | img = cv2.flip(img, 1) 320 | pt = cv2.flip(pt , 1 ) 321 | ht = cv2.flip(ht , 1 ) 322 | center_point[0] = w - center_point[0] - 1 323 | bbox[0], bbox[2] = w-bbox[2]-1, w-bbox[0]-1 324 | phrase = phrase.replace('right','*&^special^&*').replace('left','right').replace('*&^special^&*','left') 325 | ## random intensity, saturation change 326 | if augment_hsv: 327 | fraction = 0.50 328 | img_hsv = cv2.cvtColor(cv2.cvtColor(img, cv2.COLOR_RGB2BGR), cv2.COLOR_BGR2HSV) 329 | S = img_hsv[:, :, 1].astype(np.float32) 330 | V = img_hsv[:, :, 2].astype(np.float32) 331 | a = (random.random() * 2 - 1) * fraction + 1 332 | if a > 1: 333 | np.clip(S, a_min=0, a_max=255, out=S) 334 | a = (random.random() * 2 - 1) * fraction + 1 335 | V *= a 336 | if a > 1: 337 | np.clip(V, a_min=0, a_max=255, out=V) 338 | 339 | img_hsv[:, :, 1] = S.astype(np.uint8) 340 | img_hsv[:, :, 2] = V.astype(np.uint8) 341 | img = cv2.cvtColor(cv2.cvtColor(img_hsv, cv2.COLOR_HSV2BGR), cv2.COLOR_BGR2RGB) 342 | img, _, ratio, dw, dh = letterbox(img, None, self.imsize) 343 | bbox[0], bbox[2] = bbox[0]*ratio+dw, bbox[2]*ratio+dw 344 | bbox[1], bbox[3] = bbox[1]*ratio+dh, bbox[3]*ratio+dh 345 | ## random affine transformation 346 | if augment_affine: 347 | img, _, bbox, M, center_point, pt, gt = random_affine(center_point, pt, img, None, bbox, \ 348 | degrees=(-5, 5), translate=(0.10, 0.10), scale=(0.90, 1.10)) 349 | else: ## should be inference, or specified training 350 | img, _, ratio, dw, dh = letterbox(img, None, self.imsize) 351 | bbox[0], bbox[2] = bbox[0]*ratio+dw, bbox[2]*ratio+dw 352 | bbox[1], bbox[3] = bbox[1]*ratio+dh, bbox[3]*ratio+dh 353 | gt = np.asarray(torch.zeros([512,512])) 354 | gt[int(bbox[1]):int(bbox[3]), int(bbox[0]):int(bbox[2])] = 1 355 | ## Norm, to tensor 356 | # print(img.shape) 357 | if self.transform is not None: 358 | img = self.transform(img) 359 | pt = self.t(pt) 360 | #print(ht.shape) 361 | ht = self.t(ht) 362 | #print(ht.shape) 363 | if self.lstm: 364 | phrase = self.tokenize_phrase(phrase) 365 | word_id = phrase 366 | # word_mask = np.zeros(word_id.shape) 367 | word_mask 
= np.array(word_id>0,dtype=int) 368 | else: 369 | ## encode phrase to bert input 370 | examples = read_examples(phrase, idx) 371 | features = convert_examples_to_features( 372 | examples=examples, seq_length=self.query_len, tokenizer=self.tokenizer) 373 | word_id = features[0].input_ids 374 | word_mask = features[0].input_mask 375 | 376 | if self.testmode: 377 | return img, pt, ht, np.array(word_id, dtype=int), np.array(word_mask, dtype=int), \ 378 | np.array(bbox, dtype=np.float32), np.array(ratio, dtype=np.float32), \ 379 | np.array(dw, dtype=np.float32), np.array(dh, dtype=np.float32), self.images[idx][0], center_point, gt 380 | else: 381 | return img, pt, ht, np.array(word_id, dtype=int), np.array(word_mask, dtype=int), \ 382 | np.array(bbox, dtype=np.float32), center_point, gt -------------------------------------------------------------------------------- /utils/temp.py: -------------------------------------------------------------------------------- 1 | class grounding_model_multihop(nn.Module): 2 | def __init__(self, corpus=None, emb_size=256, jemb_drop_out=0.1, bert_model='bert-base-uncased', \ 3 | NFilm=2, fusion='prod', intmd=False, mstage=False, convlstm=False, \ 4 | coordmap=True, leaky=False, dataset=None, bert_emb=False, tunebert=False, use_sal=False, use_paf=False): 5 | super(grounding_model_multihop, self).__init__() 6 | self.coordmap = coordmap 7 | self.emb_size = emb_size 8 | self.NFilm = NFilm 9 | self.intmd = intmd 10 | self.mstage = mstage 11 | self.convlstm = convlstm 12 | self.tunebert = tunebert 13 | self.use_sal = use_sal 14 | self.use_paf = use_paf 15 | self.tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=True) 16 | 17 | if bert_model=='bert-base-uncased': 18 | self.textdim=768 19 | else: 20 | self.textdim=1024 21 | ## Visual model 22 | self.visumodel = Darknet(config_path='./model/yolov3.cfg') 23 | self.visumodel.load_weights('./saved_models/yolov3.weights') 24 | ## Text model 25 | self.textmodel = BertModel.from_pretrained(bert_model) 26 | 27 | self.mapping_visu = ConvBatchNormReLU(512 if self.convlstm else 256, emb_size, 1, 1, 0, 1, leaky=leaky) 28 | 29 | self.mapping_lang = torch.nn.Sequential( 30 | nn.Linear(self.textdim, emb_size), 31 | nn.ReLU(), 32 | nn.Dropout(jemb_drop_out), 33 | nn.Linear(emb_size, emb_size), 34 | nn.ReLU(),) 35 | 36 | textdim=emb_size 37 | 38 | self.film = FiLMedConvBlock_multihop(NFilm=3,textdim=textdim,visudim=emb_size,\ 39 | emb_size=emb_size,fusion=fusion,intmd=(intmd or mstage or convlstm)) 40 | 41 | self.film1 = FiLMedConvBlock_multihop(NFilm=1,textdim=textdim,visudim=emb_size,\ 42 | emb_size=emb_size,fusion=fusion,intmd=(intmd or mstage or convlstm)) 43 | 44 | if self.convlstm: 45 | output_emb = emb_size 46 | self.global_out = ConvLSTM(input_size=(32, 32), 47 | input_dim=emb_size, 48 | hidden_dim=[emb_size], 49 | kernel_size=(1, 1), 50 | num_layers=1, 51 | batch_first=True, 52 | bias=True, 53 | return_all_layers=False) 54 | 55 | self.fcn_out = torch.nn.Sequential( 56 | ConvBatchNormReLU(output_emb+8, output_emb//2, 1, 1, 0, 1, leaky=leaky), 57 | nn.Conv2d(output_emb//2, 3*5, kernel_size=1)) 58 | 59 | self.fcn_out1 = torch.nn.Sequential( 60 | ConvBatchNormReLU(2*output_emb+8, output_emb//2, 1, 1, 0, 1, leaky=leaky), 61 | nn.Conv2d(output_emb//2, 6*5, kernel_size=1)) 62 | #self.vl_transformer = VisionLanguageEncoder(d_model=512, nhead=8, num_encoder_layers=6,num_decoder_layers=6, dim_feedforward=2048, dropout=0.1,activation="relu", normalize_before=False) 63 | ''' 64 | #transformer 65 | decoder_layer = 
TransformerDecoderLayer(512, 8, 2048, 0.1, "relu", False) 66 | decoder_norm = nn.LayerNorm(512) 67 | self.decoder = TransformerDecoder(decoder_layer, 6, decoder_norm, return_intermediate=True,d_model=512) 68 | 69 | encoder_layer = TransformerEncoderLayer(512, 8, 2048, 0.1, "relu", False) 70 | encoder_norm = None 71 | self.encoder = TransformerEncoder(encoder_layer, 6, encoder_norm) 72 | ''' 73 | 74 | ## Mapping module 75 | ''' 76 | for i in self.parameters(): 77 | i.requires_grad=False 78 | ''' 79 | self.mapping_visu2 = ConvBatchNormReLU(512 if self.convlstm else 256+1, emb_size, 3, 1, 1, 1, leaky=leaky) 80 | self.mapping_visu1 = ConvBatchNormReLU(512+4 if self.convlstm else 256+1, emb_size, 3, 1, 1, 1, leaky=leaky) 81 | self.mp1 = nn.MaxPool2d(16, stride=16) 82 | self.mp2 = nn.AvgPool2d(4, stride=4) 83 | self.mp3 = nn.AvgPool2d(16, stride=16) 84 | self.mp4 = nn.AvgPool2d(2, stride=2) 85 | 86 | self.mapbodyfeature = MLP(512,512,512,2) 87 | 88 | self.linecode = MLP(512,128,3,2) 89 | 90 | self.poscode = MLP(3,128,512,2) 91 | 92 | 93 | #self.pattention = nn.Conv2d(512,1,1) 94 | 95 | #self.l_embed = nn.Embedding(22, 512) 96 | 97 | ## output head 98 | 99 | #self.maplast = ConvBatchNormReLU(output_emb+8, output_emb, 1, 1, 0, 1, leaky=leaky) 100 | 101 | output_emb = emb_size 102 | if self.mstage: 103 | self.fcn_out = nn.ModuleDict() 104 | modules = OrderedDict() 105 | for n in range(0,NFilm): 106 | modules["out%d"%n] = torch.nn.Sequential( 107 | ConvBatchNormReLU(output_emb, output_emb//2, 1, 1, 0, 1, leaky=leaky), 108 | nn.Conv2d(output_emb//2, 9*5, kernel_size=1)) 109 | self.fcn_out.update(modules) 110 | else: 111 | if self.intmd: 112 | output_emb = emb_size*NFilm 113 | if self.use_sal: 114 | self.conv1 = nn.Conv2d(1, 2, 4, 4) 115 | self.conv15 = nn.Conv2d(2, 4, 2, 2) 116 | self.conv2 = nn.Conv2d(4, 8, 2, 2) 117 | else: 118 | self.fcn_out = torch.nn.Sequential( 119 | ConvBatchNormReLU(output_emb+8, output_emb//2, 1, 1, 0, 1, leaky=leaky), 120 | nn.Conv2d(output_emb//2, 3*5, kernel_size=1)) 121 | self.fcn_out1 = torch.nn.Sequential( 122 | ConvBatchNormReLU(2*output_emb+8, output_emb//2, 1, 1, 0, 1, leaky=leaky), 123 | nn.Conv2d(output_emb//2, 6*5, kernel_size=1)) 124 | 125 | def _reset_parameters(self): 126 | for p in self.parameters(): 127 | if p.dim() > 1: 128 | nn.init.xavier_uniform_(p) 129 | 130 | def forward(self, image, seg, ht, dp, word_id, word_mask): 131 | ## Visual Module 132 | batch_size = image.size(0) 133 | ''' 134 | memory_mask = word_mask.view(batch_size,1,-1,1) 135 | memory_mask = memory_mask.repeat(1,8,1,1024) 136 | membed_mask = torch.ones(batch_size, 8, 3, 1024).cuda() 137 | memory_mask = torch.cat((memory_mask,membed_mask),dim=2) 138 | memory_mask = word_mask.view(batch_size*8,23,1024) 139 | print(memory_mask.size()) 140 | 141 | tgt_key_padding_mask = word_mask 142 | embed_mask = torch.ones(batch_size,3).cuda() 143 | tgt_key_padding_mask = torch.cat((tgt_key_padding_mask,embed_mask),dim=1) 144 | tgt_key_padding_mask = tgt_key_padding_mask.bool() 145 | ''' 146 | dp = dp.unsqueeze(1) 147 | seg = seg.unsqueeze(1) 148 | dp = dp.type(torch.FloatTensor).cuda() 149 | seg = seg.type(torch.FloatTensor).cuda() 150 | 151 | distxy = distancexy.repeat(batch_size,1,1,1).cuda() 152 | dist = torch.cat([distxy,dp],dim=1) 153 | 154 | seeg = seg.view(batch_size,1,-1) 155 | seeg = F.normalize(seeg,dim=2,p=1) 156 | 157 | dist = dist.view(batch_size,3,-1) 158 | dist = F.normalize(dist,dim=2,p=1) 159 | 160 | #===============================# 161 | distfeature = dist.permute(0,2,1) 162 | 
#distfeature = distfeature.permute(1,0,2) 163 | #distfeature = self.posecode(distfeature) 164 | #distfeature = distfeature.permute(1,0,2) 165 | 166 | #===============================# 167 | bodypositionseg = torch.mul(seeg,dist) 168 | bodyposition = torch.sum(bodypositionseg,dim=2) 169 | bodyposition = bodyposition.view(batch_size,1,3) 170 | 171 | #bodypfeature = self.poscode(bodyposition) 172 | #bodypfeature = bodypfeature.view(batch_size,1,-1) 173 | #bodypfeature = bodyposition.permute(1,0,2) 174 | #bodypfeature = gen_sineembed_for_position(bodypfeature*512) 175 | #bodypfeature = bodypfeature.permute(1,0,2) 176 | #bodypfeature = bodypfeature.view(batch_size,1,-1) 177 | 178 | #restdistfeature = distfeature - bodypfeature 179 | #restdistfeature = F.normalize(restdistfeature,dim=2,p=2) 180 | 181 | #restdistfeature = restdistfeature.permute(0,2,1) 182 | #restdistfeature = restdistfeature.view(batch_size,512,512,512) 183 | 184 | #distfeature = distfeature.permute(0,2,1) 185 | #distfeature = distfeature.view(batch_size,512,512,512) 186 | 187 | bodyp = bodyposition.view(batch_size,3,1) 188 | relatepos = torch.sub(dist,bodyp) 189 | relatepos = relatepos.view(batch_size,3,-1) 190 | relatepos = relatepos.permute(0,2,1) 191 | relateposfeature = self.poscode(relatepos) 192 | relateposfeature = F.normalize(relateposfeature,dim=2,p=2) 193 | relateposfeature = relateposfeature.permute(0,2,1) 194 | relateposfeature = relateposfeature.view(batch_size,512,512,512) 195 | 196 | relatepos = F.normalize(relatepos,dim=2,p=2) 197 | relatepos = relatepos.permute(0,2,1) 198 | relatepos = relatepos.view(batch_size,3,512,512) 199 | #restdist = restdist * seg 200 | 201 | #====================================================# 202 | raw_fvisu = self.visumodel(image) 203 | raw_fvisu = raw_fvisu[1] 204 | bodyinfo = raw_fvisu 205 | 206 | #bodypfeature = bodypfeature.view(batch_size,-1) 207 | #compute position informations 208 | ht = ht.type(torch.FloatTensor).cuda() 209 | ht = ht.view(batch_size,-1,3) 210 | ht = ht.permute(0,2,1) 211 | ht = ht.view(batch_size,3,512,512) 212 | ht = torch.mean(ht,dim=1) 213 | ht = ht.view(batch_size,1,512,512) 214 | ht = self.mp1(ht) 215 | 216 | rd = self.mp3(relatepos) 217 | 218 | bodyinfo = torch.cat((bodyinfo, ht),1) 219 | bodyinfo = torch.cat((bodyinfo, rd),1) 220 | 221 | bodyinfo = self.mapping_visu1(bodyinfo) 222 | bodyinfo = self.mp2(bodyinfo) 223 | bodyinfo = self.mapping_visu2(bodyinfo) 224 | bodyinfo = self.mp2(bodyinfo) 225 | bodyinfo = self.mp4(bodyinfo) 226 | 227 | bodyfeature = bodyinfo.view(batch_size,-1) 228 | #bodyfeature = torch.cat([bodyinfo,bodypfeature],dim=1) 229 | #bodypfeature = bodypfeature.view(batch_size,1,-1) 230 | bodyfeature = self.mapbodyfeature(bodyfeature).view(batch_size,-1) 231 | bodyfeature = F.normalize(bodyfeature,dim=1,p=2) 232 | 233 | line = self.linecode(bodyfeature) 234 | line = line.view(batch_size,1,3) 235 | 236 | lor = line.view(batch_size,3) 237 | 238 | ''' 239 | word_id = [] 240 | word_mask = [] 241 | for uu in range(batch_size): 242 | if(lor[uu,0]>0): 243 | word_idt = word_ida[uu,:] 244 | word_idt = word_idt.unsqueeze(0) 245 | word_id.append(word_idt) 246 | 247 | word_maskt = word_maska[uu,:] 248 | word_maskt = word_maskt.unsqueeze(0) 249 | word_mask.append(word_maskt) 250 | else: 251 | word_idt = word_idb[uu,:] 252 | word_idt = word_idt.unsqueeze(0) 253 | word_id.append(word_idt) 254 | 255 | word_maskt = word_maskb[uu,:] 256 | word_maskt = word_maskt.unsqueeze(0) 257 | word_mask.append(word_maskt) 258 | 259 | word_id = 
torch.cat(word_id,dim=0).contiguous() 260 | word_mask = torch.cat(word_mask,dim=0).contiguous() 261 | ''' 262 | 263 | relatepos = relatepos.view(batch_size,3,512,512) 264 | relateposfeature = relateposfeature.view(batch_size,512,512,512) 265 | 266 | relatepos = relatepos.view(batch_size,3,-1) 267 | relateposfeature = relateposfeature.view(batch_size,512,-1) 268 | line = F.normalize(line,dim=2,p=2) 269 | #pt3 = torch.matmul(line,restdist) 270 | 271 | bodyfeature = bodyfeature.view(batch_size,1,512) 272 | pt512 = torch.matmul(bodyfeature,relateposfeature) 273 | 274 | #pt3 = pt3.view(batch_size,1,512,512) 275 | #attention1 = pt3.view(batch_size,1,512,512) 276 | attention = pt512.view(batch_size,1,512,512) 277 | 278 | #===============================================# 279 | seg = seg.clamp(max=1) 280 | seg = -seg+1 281 | attention = attention * seg 282 | attention = attention.view(batch_size,1,-1) 283 | attention = F.softmax(attention,dim=2) 284 | attention = attention.view(batch_size,1,512,512) 285 | 286 | pt = attention 287 | pt = pt.view(batch_size,1,512,512) 288 | fvisu = self.mapping_visu(raw_fvisu) 289 | 290 | #restdistfeature = restdistfeature.view(batch_size,512,512,512) 291 | #raw_fvisu = fvisu 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | 317 | 318 | def attt_loss(line,relatepos,attention, bbox, eps=1e-3): 319 | """This function computes the Kullback-Leibler divergence between ground 320 | truth saliency maps and their predictions. Values are first divided by 321 | their sum for each image to yield a distribution that adds to 1. 322 | Args: 323 | y_true (tensor, float32): A 4d tensor that holds the ground truth 324 | saliency maps with values between 0 and 255. 325 | y_pred (tensor, float32): A 4d tensor that holds the predicted saliency 326 | maps with values between 0 and 1. 327 | eps (scalar, float, optional): A small factor to avoid numerical 328 | instabilities. Defaults to 1e-7. 329 | Returns: 330 | tensor, float32: A 0D tensor that holds the averaged error. 
331 | """ 332 | loss = 0 333 | batch = line.size(0) 334 | bbox = bbox.int() 335 | for ii in range(batch): 336 | 337 | region1 = attention[ii,0,bbox[ii][0]:max(bbox[ii][2],bbox[ii][0]+1),bbox[ii][1]:max(bbox[ii][3],bbox[ii][1]+1)].contiguous() 338 | region1.view(-1) 339 | region1 = torch.sum(region1) 340 | 341 | relatepos = relatepos.view(batch,3,512,512) 342 | region2 = relatepos[ii,:,bbox[ii][0]:max(bbox[ii][2],bbox[ii][0]+1),bbox[ii][1]:max(bbox[ii][3],bbox[ii][1]+1)].contiguous() 343 | region2 = region2.view(3,-1) 344 | region2 = torch.mean(region2,dim=1) 345 | region2 = region2.view(3) 346 | 347 | region2 = torch.sum(torch.abs(region2-line[ii])) 348 | #print(region) 349 | loss += region2+1-region1 #-region1 350 | loss = loss/batch 351 | return loss 352 | 353 | def depth_loss(input, dp, bbox, gi, gj, best_n_list): 354 | mseloss = torch.nn.MSELoss(reduction='mean' ) 355 | batch = input.size(0) 356 | dp = dp.view(batch,-1).float() 357 | dpmax,_ = torch.max(dp,dim=1) 358 | dpmax = dpmax.view(batch,-1).float() 359 | bbox = bbox.int() 360 | dp = dp/dpmax 361 | loss = 0 362 | dp = dp.view(batch,512,512) 363 | 364 | for ii in range(batch): 365 | pred_depth = F.sigmoid(input[ii,best_n_list[ii],-1,gj[ii],gi[ii]]) 366 | target_bbox = dp[ii,bbox[ii][0]:max(bbox[ii][2],bbox[ii][0]+1),bbox[ii][1]:max(bbox[ii][3],bbox[ii][1]+1)].contiguous() 367 | target_bbox = target_bbox.view(-1) 368 | target_bbox = torch.mean(target_bbox,dim=0) 369 | loss += mseloss(pred_depth,target_bbox) 370 | loss = loss/batch 371 | loss = loss.float() 372 | return loss 373 | 374 | 375 | 376 | 377 | 378 | 379 | -------------------------------------------------------------------------------- /model/grounding_model.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import torch.utils.model_zoo as model_zoo 7 | from torch.utils.data import TensorDataset, DataLoader, SequentialSampler 8 | from torch.utils.data.distributed import DistributedSampler 9 | 10 | from .darknet import * 11 | from .convlstm import * 12 | from .modulation import * 13 | 14 | import argparse 15 | import collections 16 | import logging 17 | import json 18 | import re 19 | import time 20 | from tqdm import tqdm 21 | from pytorch_pretrained_bert.tokenization import BertTokenizer 22 | from pytorch_pretrained_bert.modeling import BertModel 23 | 24 | def generate_coord(batch, height, width): 25 | # coord = Variable(torch.zeros(batch,8,height,width).cuda()) 26 | xv, yv = torch.meshgrid([torch.arange(0,height), torch.arange(0,width)]) 27 | xv_min = (xv.float()*2 - width)/width 28 | yv_min = (yv.float()*2 - height)/height 29 | xv_max = ((xv+1).float()*2 - width)/width 30 | yv_max = ((yv+1).float()*2 - height)/height 31 | xv_ctr = (xv_min+xv_max)/2 32 | yv_ctr = (yv_min+yv_max)/2 33 | hmap = torch.ones(height,width)*(1./height) 34 | wmap = torch.ones(height,width)*(1./width) 35 | coord = torch.autograd.Variable(torch.cat([xv_min.unsqueeze(0), yv_min.unsqueeze(0),\ 36 | xv_max.unsqueeze(0), yv_max.unsqueeze(0),\ 37 | xv_ctr.unsqueeze(0), yv_ctr.unsqueeze(0),\ 38 | hmap.unsqueeze(0), wmap.unsqueeze(0)], dim=0).cuda()) 39 | coord = coord.unsqueeze(0).repeat(batch,1,1,1) 40 | return coord 41 | 42 | class grounding_model_multihop(nn.Module): 43 | def __init__(self, corpus=None, emb_size=256, jemb_drop_out=0.1, bert_model='bert-base-uncased', \ 44 | NFilm=2, fusion='prod', intmd=False, mstage=False, convlstm=False, \ 
45 | coordmap=True, leaky=False, dataset=None, bert_emb=False, tunebert=False, use_sal=False, use_paf=False): 46 | super(grounding_model_multihop, self).__init__() 47 | self.coordmap = coordmap 48 | self.emb_size = emb_size 49 | self.NFilm = NFilm 50 | self.intmd = intmd 51 | self.mstage = mstage 52 | self.convlstm = convlstm 53 | self.tunebert = tunebert 54 | self.use_sal = use_sal 55 | self.use_paf = use_paf 56 | if bert_model=='bert-base-uncased': 57 | self.textdim=768 58 | else: 59 | self.textdim=1024 60 | ## Visual model 61 | self.visumodel = Darknet(config_path='model/yolov3.cfg') 62 | self.visumodel.load_weights('saved_models/yolov3.weights') 63 | self.trans = CLIPVisionTransformer(512,8,256) 64 | 65 | self.visumodel4t = Darknetfort(config_path='model/yolov3.cfg') 66 | self.visumodel4t.load_weights('saved_models/yolov3.weights') 67 | 68 | ## Text model 69 | self.textmodel = BertModel.from_pretrained(bert_model) 70 | 71 | ## Mapping module 72 | if self.use_paf: 73 | self.mapping_visu = ConvBatchNormReLU(512+3+1 if self.convlstm else 256+3, emb_size, 1, 1, 0, 1, leaky=leaky) 74 | self.mp1 = nn.MaxPool2d(16, stride=16) 75 | self.mp2 = nn.AvgPool2d(16, stride=16) 76 | else: 77 | self.mapping_visu = ConvBatchNormReLU(512 if self.convlstm else 256, emb_size, 1, 1, 0, 1, leaky=leaky) 78 | 79 | self.mapping_lang = torch.nn.Sequential( 80 | nn.Linear(self.textdim, emb_size), 81 | nn.ReLU(), 82 | nn.Dropout(jemb_drop_out), 83 | nn.Linear(emb_size, emb_size), 84 | nn.ReLU(),) 85 | self.mp3 = nn.MaxPool2d(8, stride=8) 86 | self.mp4 = nn.AvgPool2d(8, stride=8) 87 | self.mapping_visuf = ConvBatchNormReLU(256 + 4 +1, 256, 1, 1,0, 1, leaky=leaky) 88 | textdim=emb_size 89 | self.film = FiLMedConvBlock_multihop(NFilm=NFilm,textdim=textdim,visudim=emb_size,\ 90 | emb_size=emb_size,fusion=fusion,intmd=(intmd or mstage or convlstm)) 91 | 92 | ## output head 93 | output_emb = emb_size 94 | self.loc_avg = nn.AvgPool2d(16, stride=16) 95 | self.pt_avg = nn.AvgPool2d(16, stride=16) 96 | self.ht_avg = nn.AvgPool2d(16, stride=16) 97 | self.vis_map = ConvBatchNormReLU(512, 128, 3, 1, 1, 1, leaky=leaky) 98 | self.locationpool = torch.nn.Sequential( 99 | nn.AvgPool2d(8, stride=8), 100 | #ConvBatchNormReLU(3, 256, 1, 1, 0, 1, leaky=leaky) 101 | ) 102 | self.linear1 = torch.nn.Sequential( 103 | ConvBatchNormReLU(256,128,8, 8, 0, 1, leaky=leaky), 104 | ConvBatchNormReLU(128,32,9, 1, 4, 1, leaky=leaky), 105 | nn.MaxPool2d(8, stride=8) 106 | ) 107 | self.linear2 = nn.Linear(32, 3) 108 | self.language = nn.Linear(512, 1) 109 | self.stage0 = torch.nn.Sequential( 110 | ConvBatchNormReLU(135, 1024, 1, 1, 0, 1, leaky=leaky) 111 | ) 112 | self.stage1 = torch.nn.Sequential( 113 | ConvBatchNormReLU(1024, 1, 9, 1, 4, 1, leaky=leaky), 114 | torch.nn.Upsample(512,mode = 'bilinear' , align_corners = True), 115 | ) 116 | self.upsample = torch.nn.Upsample(512,mode = 'bilinear' , align_corners = True) 117 | self.tohyper = torch.nn.Sequential( 118 | ConvBatchNormReLU(768, 512, 1, 1, 0, 1, leaky=leaky) 119 | ) 120 | self.word_projection = nn.Sequential(nn.Linear(512, 256), 121 | nn.ReLU(), 122 | nn.Dropout(0.1), 123 | nn.Linear(256, 256), 124 | nn.ReLU()) 125 | #self.tstage0 = torch.nn.Con 126 | self.center = torch.nn.Sequential( 127 | nn.AvgPool2d(16, stride=16) 128 | ) 129 | if self.mstage: 130 | self.fcn_out = nn.ModuleDict() 131 | modules = OrderedDict() 132 | for n in range(0,NFilm): 133 | modules["out%d"%n] = torch.nn.Sequential( 134 | ConvBatchNormReLU(output_emb, output_emb//2, 1, 1, 0, 1, leaky=leaky), 135 | 
nn.Conv2d(output_emb//2, 9*5, kernel_size=1)) 136 | self.fcn_out.update(modules) 137 | else: 138 | if self.intmd: 139 | output_emb = emb_size*NFilm 140 | if self.convlstm: 141 | output_emb = emb_size 142 | self.global_out = ConvLSTM(input_size=(32, 32), 143 | input_dim=emb_size, 144 | hidden_dim=[emb_size], 145 | kernel_size=(1, 1), 146 | num_layers=1, 147 | batch_first=True, 148 | bias=True, 149 | return_all_layers=False) 150 | if self.use_sal: 151 | self.conv1 = torch.nn.Sequential( 152 | nn.AvgPool2d(16) 153 | ) 154 | self.fcn_out = torch.nn.Sequential( 155 | ConvBatchNormReLU(output_emb+1, output_emb//2, 1, 1, 0, 1, leaky=leaky), 156 | nn.Conv2d(output_emb//2, 9*5, kernel_size=1)) 157 | else: 158 | self.fcn_out = torch.nn.Sequential( 159 | ConvBatchNormReLU(output_emb, output_emb//2, 1, 1, 0, 1, leaky=leaky), 160 | nn.Conv2d(output_emb//2, 9*5, kernel_size=1)) 161 | self.test = Vector(512,16,512) 162 | self.vectmaxp = torch.nn.Sequential( 163 | nn.MaxPool2d(16, stride=16), 164 | nn.ReLU() 165 | ) 166 | self.ptmax = torch.nn.Sequential( 167 | #nn.MaxPool2d(8, stride=8), 168 | nn.ReLU() 169 | ) 170 | self.draw = torch.nn.Sequential( 171 | nn.ReLU() 172 | ) 173 | self.softmax = nn.Softmax(dim=-1) 174 | self.linear = nn.Linear(256, 1) 175 | def forward(self, image, dp, ht, word_id, word_mask, gest, bbox, gt, amask, sal,phrase): 176 | ## Visual Module 177 | 178 | batch_size = image.size(0) 179 | out = self.visumodel(image) 180 | intemide = self.visumodel(image)[1] 181 | gest = gest.type(torch.FloatTensor).cuda() 182 | amask = amask.type(torch.FloatTensor).cuda() 183 | dp = torch.mul(amask,dp) 184 | dp = F.normalize(dp.type(torch.FloatTensor).view(batch_size,-1),dim=1,p=float('INF')).view(batch_size,1,512,512).cuda() #* 1.5 185 | 186 | 187 | raw_fvisu4t = self.visumodel4t(image) 188 | 189 | xv, yv = torch.meshgrid([torch.arange(0,512), torch.arange(0,512)]) 190 | xv = (xv / 512 ).unsqueeze(0).unsqueeze(0).repeat(batch_size,1,1,1).cuda() 191 | yv = (yv / 512 ).unsqueeze(0).unsqueeze(0).repeat(batch_size,1,1,1).cuda() 192 | xyz = torch.cat( (xv,yv,dp), dim = 1).cuda() 193 | 194 | gestfraw = torch.mul(gest , amask) 195 | 196 | gestf = F.normalize(gestfraw.view(batch_size,1,-1),dim=2,p=1).view(batch_size,1,512,512).repeat(1,3,1,1) 197 | body = torch.mul(gestf , xyz).view(batch_size, 3, -1) 198 | body = torch.sum(body,dim=2) 199 | 200 | gtbo = F.normalize(gt.view(batch_size,1,-1),dim=2,p=1).view(batch_size,1,512,512).repeat(1,3,1,1) 201 | target = torch.mul(gtbo , xyz).view(batch_size, 3, -1) 202 | target = torch.sum(target,dim=2) - body 203 | 204 | xyz_cent = xyz - body.unsqueeze(2).unsqueeze(2).repeat(1,1,512,512) 205 | 206 | t = self.test(torch.cat( (ht.type(torch.FloatTensor).cuda(),xyz_cent) ,dim = 1)) 207 | vloss = 1 - torch.cosine_similarity(t, target, dim=1) 208 | vectmap = torch.cosine_similarity(xyz_cent , t.unsqueeze(2).unsqueeze(2).repeat(1,1,512,512) , dim = 1).unsqueeze(1) - 0.7 209 | 210 | # cv2.imwrite('output/'+rank+'img.jpg' , imagedraw*255) 211 | norm = torch.max(gestfraw.reshape(batch_size,-1), dim=1, keepdim = True)[0].detach().unsqueeze(2).unsqueeze(2).repeat(1,1,512,512) 212 | gestfraw = (gestfraw.unsqueeze(1))/norm 213 | maxgestvect = self.ptmax(vectmap ) #+self.ptmax(gestfraw) 214 | maxvecter = self.vectmaxp(vectmap ) 215 | mid = torch.cat((raw_fvisu4t[2], self.mp3(ht.type(torch.FloatTensor).cuda()), self.mp4(dp.type(torch.FloatTensor).cuda()), self.mp4(vectmap.type(torch.FloatTensor).cuda())),1) 216 | # 217 | mid = self.mapping_visuf(mid) 218 | raw_fvisu = 
torch.cat((intemide, self.mp1(ht.type(torch.FloatTensor).cuda()), self.mp2(dp.type(torch.FloatTensor).cuda())),1) 219 | fvisu = self.mapping_visu(raw_fvisu) * maxvecter.repeat(1,512,1,1).detach() 220 | raw_fvisu = F.normalize(fvisu, p=2, dim=1) 221 | size = (raw_fvisu.shape[2]) 222 | 223 | ## Language Module 224 | all_encoder_layers, _ = self.textmodel(word_id, \ 225 | token_type_ids=None, attention_mask=word_mask) 226 | ## Sentence feature at the first position [cls] 227 | raw_flang = (all_encoder_layers[-1][:,0,:] + all_encoder_layers[-2][:,0,:]\ 228 | + all_encoder_layers[-3][:,0,:] + all_encoder_layers[-4][:,0,:])/4 229 | raw_fword = (all_encoder_layers[-1] + all_encoder_layers[-2]\ 230 | + all_encoder_layers[-3] + all_encoder_layers[-4])/4 231 | if not self.tunebert: 232 | ## fix bert during training 233 | # raw_flang = raw_flang.detach() 234 | hidden = raw_flang.detach() 235 | raw_fword = raw_fword.detach() 236 | 237 | fword = Variable(torch.zeros(raw_fword.shape[0], raw_fword.shape[1], self.emb_size).cuda()) 238 | for ii in range(raw_fword.shape[0]): 239 | ntoken = (word_mask[ii] != 0).sum() 240 | fword[ii,:ntoken,:] = F.normalize(self.mapping_lang(raw_fword[ii,:ntoken,:]), p=2, dim=1) 241 | ## [CLS], [SEP] 242 | # fword[ii,1:ntoken-1,:] = F.normalize(self.mapping_lang(raw_fword[ii,1:ntoken-1,:].view(-1,self.textdim)), p=2, dim=1) 243 | raw_fword = fword 244 | x = self.trans(mid)[1].reshape(batch_size,256,-1).permute(0,2,1) 245 | x = self.linear(x) 246 | pt = self.upsample ( torch.softmax(x , dim = 1).squeeze(2).reshape(batch_size,1,64,64) ) 247 | 248 | gt = gt.unsqueeze(1) 249 | gest = 1 - torch.mul(gest , amask).clamp(max = 1,min=0) 250 | pt = torch.mul(pt, amask.unsqueeze(1)) 251 | pt = F.normalize(pt.view(batch_size,1,-1),dim=2,p=1).view(batch_size,1,512,512) 252 | loss3 = 1 - torch.sum(torch.mul(pt,gt).reshape(batch_size,-1) , -1) 253 | gt = F.normalize(gt.view(batch_size,1,-1),dim=2,p=1).view(batch_size,1,512,512) 254 | eps = 1e-7 255 | 256 | vect = torch.mul(pt , xyz).view(batch_size, 3, -1) 257 | vect = torch.sum(vect,dim=2) - body 258 | loss1 = torch.sum(torch.abs(vect - target)) 259 | loss3 += torch.sum( (torch.log( gt / (eps + pt) + eps ) * gt).reshape(batch_size,-1) , -1) * 0.1 260 | 261 | norm = torch.max(pt.reshape(batch_size,-1), dim=1, keepdim = True)[0].detach().unsqueeze(2).unsqueeze(2).repeat(1,1,512,512) 262 | pt = (pt/norm).detach() 263 | centerout = self.center(pt.type(torch.FloatTensor)).squeeze(1).cuda() 264 | 265 | coord = generate_coord(batch_size, raw_fvisu.size(2), raw_fvisu.size(3)) 266 | x, attnscore_list = self.film(raw_fvisu, raw_fword, coord,maxvecter,fsent=None,word_mask=word_mask) 267 | if self.mstage: 268 | outbox = [] 269 | for film_ii in range(len(x)): 270 | outbox.append(self.fcn_out["out%d"%film_ii](x[film_ii])) 271 | elif self.convlstm: 272 | x = torch.stack(x, dim=1) 273 | 274 | output, state = self.global_out(x) 275 | output, hidden, cell = output[-1], state[-1][0], state[-1][1] 276 | if self.use_sal: 277 | #pt = sal.type(torch.FloatTensor).cuda() 278 | pt_c = self.conv1(pt.type(torch.FloatTensor).cuda()) 279 | 280 | hidden = torch.cat((hidden, pt_c), 1) 281 | 282 | outbox = [self.fcn_out(hidden)] 283 | else: 284 | x = torch.stack(x, dim=1).view(batch_size, -1, raw_fvisu.size(2), raw_fvisu.size(3)) 285 | outbox = [self.fcn_out(x)] 286 | 287 | return outbox, attnscore_list, loss1, vloss, centerout,loss3,pt ## list of (B,N,H,W) 288 | 289 | 290 | if __name__ == "__main__": 291 | import sys 292 | import argparse 293 | sys.path.append('.') 294 | 
from dataset.data_loader import * 295 | from torch.autograd import Variable 296 | from torch.utils.data import DataLoader 297 | from torchvision.transforms import Compose, ToTensor, Normalize 298 | from utils.transforms import ResizeImage, ResizeAnnotation 299 | parser = argparse.ArgumentParser( 300 | description='Dataloader test') 301 | parser.add_argument('--size', default=416, type=int, 302 | help='image size') 303 | parser.add_argument('--data', type=str, default='./ln_data/', 304 | help='path to ReferIt splits data folder') 305 | parser.add_argument('--dataset', default='referit', type=str, 306 | help='referit/flickr/unc/unc+/gref') 307 | parser.add_argument('--split', default='train', type=str, 308 | help='name of the dataset split used to train') 309 | parser.add_argument('--time', default=20, type=int, 310 | help='maximum time steps (lang length) per batch') 311 | parser.add_argument('--emb_size', default=256, type=int, 312 | help='word embedding dimensions') 313 | # parser.add_argument('--lang_layers', default=3, type=int, 314 | # help='number of SRU/LSTM stacked layers') 315 | 316 | args = parser.parse_args() 317 | 318 | torch.manual_seed(13) 319 | np.random.seed(13) 320 | torch.backends.cudnn.deterministic = True 321 | torch.backends.cudnn.benchmark = False 322 | input_transform = Compose([ 323 | ToTensor(), 324 | # ResizeImage(args.size), 325 | Normalize( 326 | mean=[0.485, 0.456, 0.406], 327 | std=[0.229, 0.224, 0.225]) 328 | ]) 329 | 330 | refer = ReferDataset(data_root=args.data, 331 | dataset=args.dataset, 332 | split=args.split, 333 | imsize = args.size, 334 | transform=input_transform, 335 | max_query_len=args.time) 336 | 337 | train_loader = DataLoader(refer, batch_size=1, shuffle=True, 338 | pin_memory=True, num_workers=0) 339 | 340 | # model = textcam_yolo_light(emb_size=args.emb_size) 341 | 342 | for i in enumerate(train_loader): 343 | print(i) 344 | break 345 | -------------------------------------------------------------------------------- /dataset/data_loader.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | YouRefIt referring image PyTorch dataset. 5 | Define and group batches of images and queries. 
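Each sample pairs the RGB frame with its depth map ('depimg'), pose/PAF heatmap ('pafours'),
body-segmentation mask ('ln_data/bodysegment') and saliency map ('saliency'), together with
the tokenised referring expression (see pull_item below).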
6 | Based on: 7 | https://github.com/zyang-ur/ReSC/blob/master/dataset/data_loader.py 8 | """ 9 | from torchvision.transforms import Compose, ToTensor, Normalize 10 | import os 11 | import sys 12 | import cv2 13 | import json 14 | import uuid 15 | import tqdm 16 | import math 17 | import torch 18 | import random 19 | # import h5py 20 | import numpy as np 21 | import os.path as osp 22 | import scipy.io as sio 23 | import torch.utils.data as data 24 | from collections import OrderedDict 25 | sys.path.append('.') 26 | import operator 27 | import utils 28 | from utils import Corpus 29 | import clip 30 | import argparse 31 | import collections 32 | import logging 33 | import json 34 | import re 35 | 36 | np.set_printoptions(threshold=np.inf) 37 | from pytorch_pretrained_bert.tokenization import BertTokenizer 38 | from pytorch_pretrained_bert.modeling import BertModel 39 | # from transformers import BertTokenizer,BertModel 40 | from utils.transforms import letterbox, random_affine 41 | 42 | sys.modules['utils'] = utils 43 | cv2.setNumThreads(0) 44 | cv2.ocl.setUseOpenCL(True) 45 | 46 | def read_examples(input_line, unique_id): 47 | """Read a list of `InputExample`s from an input file.""" 48 | examples = [] 49 | # unique_id = 0 50 | line = input_line #reader.readline() 51 | # if not line: 52 | # break 53 | line = line.strip() 54 | text_a = None 55 | text_b = None 56 | m = re.match(r"^(.*) \|\|\| (.*)$", line) 57 | if m is None: 58 | text_a = line 59 | else: 60 | text_a = m.group(1) 61 | text_b = m.group(2) 62 | examples.append( 63 | InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b)) 64 | # unique_id += 1 65 | return examples 66 | 67 | ## Bert text encoding 68 | class InputExample(object): 69 | def __init__(self, unique_id, text_a, text_b): 70 | self.unique_id = unique_id 71 | self.text_a = text_a 72 | self.text_b = text_b 73 | 74 | class InputFeatures(object): 75 | """A single set of features of data.""" 76 | def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids): 77 | self.unique_id = unique_id 78 | self.tokens = tokens 79 | self.input_ids = input_ids 80 | self.input_mask = input_mask 81 | self.input_type_ids = input_type_ids 82 | 83 | def convert_examples_to_features(examples, seq_length, tokenizer): 84 | """Loads a data file into a list of `InputBatch`s.""" 85 | features = [] 86 | for (ex_index, example) in enumerate(examples): 87 | tokens_a = tokenizer.tokenize(example.text_a) 88 | 89 | tokens_b = None 90 | if example.text_b: 91 | tokens_b = tokenizer.tokenize(example.text_b) 92 | 93 | if tokens_b: 94 | # Modifies `tokens_a` and `tokens_b` in place so that the total 95 | # length is less than the specified length. 96 | # Account for [CLS], [SEP], [SEP] with "- 3" 97 | _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3) 98 | else: 99 | # Account for [CLS] and [SEP] with "- 2" 100 | if len(tokens_a) > seq_length - 2: 101 | tokens_a = tokens_a[0:(seq_length - 2)] 102 | tokens = [] 103 | input_type_ids = [] 104 | tokens.append("[CLS]") 105 | input_type_ids.append(0) 106 | for token in tokens_a: 107 | tokens.append(token) 108 | input_type_ids.append(0) 109 | tokens.append("[SEP]") 110 | input_type_ids.append(0) 111 | 112 | if tokens_b: 113 | for token in tokens_b: 114 | tokens.append(token) 115 | input_type_ids.append(1) 116 | tokens.append("[SEP]") 117 | input_type_ids.append(1) 118 | 119 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 120 | 121 | # The mask has 1 for real tokens and 0 for padding tokens. Only real 122 | # tokens are attended to. 
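# Illustrative example (added, token ids are hypothetical): for the phrase "the red mug"
# with seq_length = 8, the fields built in this function would look like
#   tokens         = [CLS] the red mug [SEP]
#   input_ids      = [101, 1996, 2417, 11240, 102, 0, 0, 0]
#   input_mask     = [  1,    1,    1,     1,   1, 0, 0, 0]   # 1 = real token, 0 = padding
#   input_type_ids = [  0,    0,    0,     0,   0, 0, 0, 0]   # single-segment query
# so only the first five positions are attended to by BERT.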
123 | input_mask = [1] * len(input_ids) 124 | 125 | # Zero-pad up to the sequence length. 126 | while len(input_ids) < seq_length: 127 | input_ids.append(0) 128 | input_mask.append(0) 129 | input_type_ids.append(0) 130 | 131 | assert len(input_ids) == seq_length 132 | assert len(input_mask) == seq_length 133 | assert len(input_type_ids) == seq_length 134 | features.append( 135 | InputFeatures( 136 | unique_id=example.unique_id, 137 | tokens=tokens, 138 | input_ids=input_ids, 139 | input_mask=input_mask, 140 | input_type_ids=input_type_ids)) 141 | return features 142 | 143 | class DatasetNotFoundError(Exception): 144 | pass 145 | 146 | class ReferDataset(data.Dataset): 147 | SUPPORTED_DATASETS = { 148 | 'yourefit': {'splits': ('train', 'val', 'test')}, 149 | 'referit': {'splits': ('train', 'val', 'trainval', 'test')}, 150 | 'unc': { 151 | 'splits': ('train', 'val', 'trainval', 'testA', 'testB'), 152 | 'params': {'dataset': 'refcoco', 'split_by': 'unc'} 153 | }, 154 | 'unc+': { 155 | 'splits': ('train', 'val', 'trainval', 'testA', 'testB'), 156 | 'params': {'dataset': 'refcoco+', 'split_by': 'unc'} 157 | }, 158 | 'gref': { 159 | 'splits': ('train', 'val'), 160 | 'params': {'dataset': 'refcocog', 'split_by': 'google'} 161 | }, 162 | 'gref_umd': { 163 | 'splits': ('train', 'val', 'test'), 164 | 'params': {'dataset': 'refcocog', 'split_by': 'umd'} 165 | }, 166 | 'flickr': { 167 | 'splits': ('train', 'val', 'test')} 168 | } 169 | 170 | def __init__(self, data_root, split_root='data', dataset='referit', imsize=256, 171 | transform=None, augment=False, device=None, return_idx=False, testmode=False, 172 | split='train', max_query_len=128, lstm=False, bert_model='bert-base-uncased'): 173 | self.images = [] 174 | self.data_root = data_root 175 | self.split_root = split_root 176 | self.dataset = dataset 177 | self.imsize = imsize 178 | self.query_len = max_query_len 179 | self.lstm = lstm 180 | self.transform = transform 181 | self.testmode = testmode 182 | self.split = split 183 | self.device = device 184 | self.t = input_transform = Compose([ 185 | ToTensor() 186 | ]) 187 | self.tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=True) 188 | self.augment=augment 189 | self.return_idx=return_idx 190 | self.num = 0 191 | if self.dataset == 'yourefit': 192 | self.dataset_root = osp.join(self.data_root, 'yourefit') 193 | self.im_dir = osp.join(self.dataset_root, 'images') 194 | elif self.dataset == 'referit': 195 | self.dataset_root = osp.join(self.data_root, 'referit') 196 | self.im_dir = osp.join(self.dataset_root, 'images') 197 | self.split_dir = osp.join(self.dataset_root, 'splits') 198 | elif self.dataset == 'flickr': 199 | self.dataset_root = osp.join(self.data_root, 'Flickr30k') 200 | self.im_dir = osp.join(self.dataset_root, 'flickr30k_images') 201 | else: ## refcoco, etc. 
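# Added note: the directory layout this branch expects under data_root (inferred from the
# osp.join calls below) is
#   <data_root>/other/images/mscoco/images/train2014/   # COCO train2014 images
#   <data_root>/other/splits/                           # split index files
# The yourefit / referit / flickr branches above follow the same pattern with their own
# image folders.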
202 | self.dataset_root = osp.join(self.data_root, 'other') 203 | self.im_dir = osp.join( 204 | self.dataset_root, 'images', 'mscoco', 'images', 'train2014') 205 | self.split_dir = osp.join(self.dataset_root, 'splits') 206 | 207 | if not self.exists_dataset(): 208 | print('Please download index cache to data folder') 209 | exit(0) 210 | 211 | dataset_path = osp.join(self.split_root, self.dataset) 212 | valid_splits = self.SUPPORTED_DATASETS[self.dataset]['splits'] 213 | 214 | if self.lstm: 215 | self.corpus = Corpus() 216 | corpus_path = osp.join(dataset_path, 'corpus.pth') 217 | self.corpus = torch.load(corpus_path) 218 | 219 | if split not in valid_splits: 220 | raise ValueError( 221 | 'Dataset {0} does not have split {1}'.format( 222 | self.dataset, split)) 223 | 224 | splits = [split] 225 | if self.dataset != 'referit': 226 | splits = ['train', 'val'] if split == 'trainval' else [split] 227 | for split in splits: 228 | imgset_file = '{0}_{1}full.pth'.format(self.dataset, split) 229 | imgset_path = osp.join(dataset_path, imgset_file) 230 | self.images += torch.load(imgset_path) 231 | 232 | def exists_dataset(self): 233 | return osp.exists(osp.join(self.split_root, self.dataset)) 234 | 235 | 236 | def pull_item(self, idx): 237 | if self.dataset == 'flickr': 238 | img_file, bbox, phrase = self.images[idx] 239 | else: 240 | img_file, _, bbox, phrase, attri = self.images[idx] 241 | ## box format: to x1y1x2y2 242 | if not (self.dataset == 'referit' or self.dataset == 'flickr'): 243 | bbox = np.array(bbox, dtype=int) 244 | #bbox[2], bbox[3] = bbox[0]+bbox[2], bbox[1]+bbox[3] 245 | else: 246 | bbox = np.array(bbox, dtype=int) 247 | 248 | img_path = osp.join(self.im_dir, img_file) 249 | img = cv2.imread(img_path) 250 | 251 | htmapdir = self.im_dir.replace('images', 'pafours') 252 | htmapfile = img_file #.replace('.jpg', '_rendered.png') 253 | htmap_path = osp.join(htmapdir, htmapfile) 254 | htmap = cv2.imread(htmap_path) 255 | 256 | ht = np.asarray(htmap) 257 | 258 | # #ht = np.mean(ht, axis=2) 259 | 260 | 261 | # ht = cv2.resize(ht, (512, 512)) 262 | 263 | ptdir = self.im_dir.replace('images', 'depimg') 264 | ptfile = img_file #.replace('.jpg', '_depth.png') 265 | pt_path = osp.join(ptdir, ptfile) 266 | pt = cv2.imread(pt_path) 267 | # print(pt.shape) 268 | # exit() 269 | # pt = cv2.resize(pt, (256,256)) 270 | # pt = np.reshape(pt, (3, 256, 256)) 271 | 272 | saldir = self.im_dir.replace('images', 'saliency') 273 | salfile = img_file.replace('.jpg', '.jpeg') 274 | sal_path = osp.join(saldir, salfile) 275 | sal = cv2.imread(sal_path) 276 | sal = cv2.resize(sal, (256,256)) 277 | #sal = np.reshape(sal, (3, 256, 256)) 278 | 279 | gestdir = 'ln_data/bodysegment' 280 | gestfile = img_file.replace('.jpg' , '_seg.png') 281 | gest_path = osp.join(gestdir,gestfile) 282 | gest = cv2.imread(gest_path) 283 | if gest.shape != img.shape: 284 | gest = cv2.resize(gest, img.shape[:2], interpolation=cv2.INTER_AREA) 285 | ## duplicate channel if gray image 286 | if img.shape[-1] > 1: 287 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 288 | else: 289 | img = np.stack([img] * 3) 290 | 291 | return img, pt, ht, phrase, bbox, gest, sal, img_file 292 | # return img, phrase, bbox, pt, ht 293 | 294 | def tokenize_phrase(self, phrase): 295 | return self.corpus.tokenize(phrase, self.query_len) 296 | 297 | def untokenize_word_vector(self, words): 298 | return self.corpus.dictionary[words] 299 | 300 | def __len__(self): 301 | return len(self.images) 302 | 303 | def __getitem__(self, idx): 304 | img, pt, ht, phrase, bbox, gest, 
sal, img_file = self.pull_item(idx) 305 | # phrase = phrase.decode("utf-8").encode().lower() 306 | 307 | 308 | phrase = phrase.lower() 309 | if self.augment: 310 | augment_flip, augment_hsv, augment_affine = True,True,True 311 | 312 | ## seems a bug in torch transformation resize, so separate in advance 313 | h,w = img.shape[0], img.shape[1] 314 | if self.augment: 315 | ## random horizontal flip 316 | if augment_flip and random.random() > 0.5: 317 | img = cv2.flip(img, 1) 318 | pt = cv2.flip(pt, 1 ) 319 | ht = cv2.flip(ht, 1 ) 320 | gest = cv2.flip(gest, 1) 321 | sal = cv2.flip(sal, 1 ) 322 | bbox[0], bbox[2] = w-bbox[2]-1, w-bbox[0]-1 323 | phrase = phrase.replace('right','*&^special^&*').replace('left','right').replace('*&^special^&*','left') 324 | 325 | ## random intensity, saturation change 326 | if augment_hsv: 327 | fraction = 0.5 328 | img_hsv = cv2.cvtColor(cv2.cvtColor(img, cv2.COLOR_RGB2BGR), cv2.COLOR_BGR2HSV) 329 | S = img_hsv[:, :, 1].astype(np.float32) 330 | V = img_hsv[:, :, 2].astype(np.float32) 331 | a = (random.random() * 2 - 1) * fraction + 1 332 | if a > 1: 333 | np.clip(S, a_min=0, a_max=255, out=S) 334 | a = (random.random() * 2 - 1) * fraction + 1 335 | V *= a 336 | if a > 1: 337 | np.clip(V, a_min=0, a_max=255, out=V) 338 | 339 | img_hsv[:, :, 1] = S.astype(np.uint8) 340 | img_hsv[:, :, 2] = V.astype(np.uint8) 341 | img = cv2.cvtColor(cv2.cvtColor(img_hsv, cv2.COLOR_HSV2BGR), cv2.COLOR_BGR2RGB) 342 | 343 | mask = np.ones_like(img) 344 | img, mask, ratio, dw, dh = letterbox(img, mask, self.imsize) 345 | #ht, _, ratio, dw, dh = letterbox(ht, None, self.imsize) 346 | gest, _, ratio, dw, dh = letterbox(gest, None, self.imsize) 347 | #sal, _, ratio, dw, dh = letterbox(sal, None, self.imsize) 348 | bbox[0], bbox[2] = bbox[0]*ratio+dw, bbox[2]*ratio+dw 349 | bbox[1], bbox[3] = bbox[1]*ratio+dh, bbox[3]*ratio+dh 350 | ## random affine transformation 351 | if augment_affine: 352 | gt = np.asarray(torch.zeros([512,512])) 353 | gt[int(bbox[1]):int(bbox[3]), int(bbox[0]):int(bbox[2])] = 1 354 | img, mask, bbox, M = random_affine(img, mask, bbox, \ 355 | degrees=(-5, 5), translate=(0.10, 0.10), scale=(0.90, 1.10)) 356 | pt = cv2.warpPerspective(pt, M, dsize=(512, 512), flags=cv2.INTER_LINEAR, 357 | borderValue=0) 358 | ht = cv2.warpPerspective(ht, M, dsize=(512, 512), flags=cv2.INTER_LINEAR, 359 | borderValue=0) 360 | gest = cv2.warpPerspective(gest, M, dsize=(512, 512), flags=cv2.INTER_NEAREST, 361 | borderValue=0) 362 | sal = cv2.warpPerspective(sal, M, dsize=(256, 256), flags=cv2.INTER_NEAREST, 363 | borderValue=0) 364 | gt = cv2.warpPerspective(gt, M, dsize=(512, 512), flags=cv2.INTER_NEAREST, 365 | borderValue=0) 366 | else: ## should be inference, or specified training 367 | mask = np.ones_like(img) 368 | img, mask, ratio, dw, dh = letterbox(img, mask, self.imsize) 369 | # ht, _, ratio, dw, dh = letterbox(ht, None, self.imsize) 370 | gest, _, ratio, dw, dh = letterbox(gest, None, self.imsize) 371 | bbox[0], bbox[2] = bbox[0]*ratio+dw, bbox[2]*ratio+dw 372 | bbox[1], bbox[3] = bbox[1]*ratio+dh, bbox[3]*ratio+dh 373 | gt = np.asarray(torch.zeros([512,512])) 374 | gt[int(bbox[1]):int(bbox[3]), int(bbox[0]):int(bbox[2])] = 1 375 | ## Norm, to tensor 376 | # print(img.shape) 377 | 378 | pt = pt[:,:,0] 379 | gest = gest[:,:,0] 380 | mask = mask[:,:,0] 381 | sal = np.reshape(sal, (3, 256, 256)) 382 | sal = sal[0,:,:] 383 | if self.transform is not None: 384 | 385 | img = self.transform(img) 386 | 387 | #pt = self.t(pt) 388 | #print(ht.shape) 389 | 390 | ht = self.transform(ht) 
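# Added note (assuming the Compose([ToTensor(), Normalize(...)]) transform passed in by the
# caller, e.g. the dataloader test at the bottom of model/grounding_model.py): img and ht are
# now channels-first float tensors, pt / gest / mask / gt remain single-channel numpy arrays,
# and sal is the 256x256 slice taken a few lines above; only the phrase encoding (LSTM corpus
# ids or BERT ids + mask) remains before the sample is returned.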
391 | 392 | #print(ht.shape) 393 | if self.lstm: 394 | phrase = self.tokenize_phrase(phrase) 395 | word_id = phrase 396 | # word_mask = np.zeros(word_id.shape) 397 | word_mask = np.array(word_id>0,dtype=int) 398 | else: 399 | ## encode phrase to bert input 400 | 401 | examples = read_examples(phrase, idx) 402 | features = convert_examples_to_features( 403 | examples=examples, seq_length=self.query_len, tokenizer=self.tokenizer) 404 | word_id = features[0].input_ids 405 | word_mask = features[0].input_mask 406 | #phrase = features[0].input_mask #clip.tokenize(phrase, context_length=20) 407 | if self.testmode: 408 | return img, pt, ht, gest, gt, mask, np.array(word_id, dtype=int), np.array(word_mask, dtype=int), \ 409 | np.array(bbox, dtype=np.float32), np.array(ratio, dtype=np.float32), \ 410 | np.array(dw, dtype=np.float32), np.array(dh, dtype=np.float32), self.images[idx][0],sal , phrase 411 | else: 412 | return img, pt, ht, gest, gt, mask, np.array(word_id, dtype=int), np.array(word_mask, dtype=int), \ 413 | np.array(bbox, dtype=np.float32),sal, phrase, img_file -------------------------------------------------------------------------------- /model/darknet.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import math 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | import numpy as np 9 | from collections import defaultdict, OrderedDict 10 | 11 | from PIL import Image 12 | 13 | # from utils.parse_config import * 14 | from utils.utils import * 15 | # import matplotlib.pyplot as plt 16 | # import matplotlib.patches as patches 17 | 18 | exist_id = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, \ 19 | 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, \ 20 | 23, 24, 25, 27, 28, 31, 32, 33, 34, 35, 36, \ 21 | 37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 48, \ 22 | 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, \ 23 | 60, 61, 62, 63, 64, 65, 67, 70, 72, 73, 74, \ 24 | 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, \ 25 | 87, 88, 89, 90] 26 | catmap_dict = OrderedDict() 27 | for ii in range(len(exist_id)): 28 | catmap_dict[exist_id[ii]] = ii 29 | 30 | def build_object_targets( 31 | pred_boxes, pred_conf, pred_cls, target, anchors, num_anchors, num_classes, grid_size, ignore_thres, img_dim 32 | ): 33 | nB = target.size(0) 34 | nA = num_anchors 35 | nC = num_classes 36 | nG = grid_size 37 | mask = torch.zeros(nB, nA, nG, nG) 38 | conf_mask = torch.ones(nB, nA, nG, nG) 39 | tx = torch.zeros(nB, nA, nG, nG) 40 | ty = torch.zeros(nB, nA, nG, nG) 41 | tw = torch.zeros(nB, nA, nG, nG) 42 | th = torch.zeros(nB, nA, nG, nG) 43 | tconf = torch.ByteTensor(nB, nA, nG, nG).fill_(0) 44 | tcls = torch.ByteTensor(nB, nA, nG, nG, nC).fill_(0) 45 | 46 | nGT = 0 47 | nCorrect = 0 48 | for b in range(nB): 49 | for t in range(target.shape[1]): 50 | if target[b, t].sum() == 0: 51 | continue 52 | nGT += 1 53 | # Convert to position relative to box 54 | gx = target[b, t, 1] * nG 55 | gy = target[b, t, 2] * nG 56 | gw = target[b, t, 3] * nG 57 | gh = target[b, t, 4] * nG 58 | # Get grid box indices 59 | gi = int(gx) 60 | gj = int(gy) 61 | # Get shape of gt box 62 | gt_box = torch.FloatTensor(np.array([0, 0, gw, gh])).unsqueeze(0) 63 | # Get shape of anchor box 64 | anchor_shapes = torch.FloatTensor(np.concatenate((np.zeros((len(anchors), 2)), np.array(anchors)), 1)) 65 | # Calculate iou between gt and anchor shapes 66 | anch_ious = bbox_iou(gt_box, anchor_shapes) 67 | # Where the overlap is larger than 
threshold set mask to zero (ignore) 68 | conf_mask[b, anch_ious > ignore_thres, gj, gi] = 0 69 | # Find the best matching anchor box 70 | best_n = np.argmax(anch_ious) 71 | # Get ground truth box 72 | gt_box = torch.FloatTensor(np.array([gx, gy, gw, gh])).unsqueeze(0) 73 | # Get the best prediction 74 | pred_box = pred_boxes[b, best_n, gj, gi].unsqueeze(0) 75 | # Masks 76 | mask[b, best_n, gj, gi] = 1 77 | conf_mask[b, best_n, gj, gi] = 1 78 | # Coordinates 79 | tx[b, best_n, gj, gi] = gx - gi 80 | ty[b, best_n, gj, gi] = gy - gj 81 | # Width and height 82 | tw[b, best_n, gj, gi] = math.log(gw / anchors[best_n][0] + 1e-16) 83 | th[b, best_n, gj, gi] = math.log(gh / anchors[best_n][1] + 1e-16) 84 | # One-hot encoding of label 85 | target_label = int(target[b, t, 0]) 86 | target_label = catmap_dict[target_label] 87 | tcls[b, best_n, gj, gi, target_label] = 1 88 | tconf[b, best_n, gj, gi] = 1 89 | 90 | # Calculate iou between ground truth and best matching prediction 91 | iou = bbox_iou(gt_box, pred_box, x1y1x2y2=False) 92 | pred_label = torch.argmax(pred_cls[b, best_n, gj, gi]) 93 | score = pred_conf[b, best_n, gj, gi] 94 | if iou > 0.5 and pred_label == target_label and score > 0.5: 95 | nCorrect += 1 96 | 97 | return nGT, nCorrect, mask, conf_mask, tx, ty, tw, th, tconf, tcls 98 | 99 | def parse_model_config(path): 100 | """Parses the yolo-v3 layer configuration file and returns module definitions""" 101 | file = open(path, 'r') 102 | lines = file.read().split('\n') 103 | lines = [x for x in lines if x and not x.startswith('#')] 104 | lines = [x.rstrip().lstrip() for x in lines] # get rid of fringe whitespaces 105 | module_defs = [] 106 | for line in lines: 107 | if line.startswith('['): # This marks the start of a new block 108 | module_defs.append({}) 109 | module_defs[-1]['type'] = line[1:-1].rstrip() 110 | if module_defs[-1]['type'] == 'convolutional' or module_defs[-1]['type'] == 'yoloconvolutional': 111 | module_defs[-1]['batch_normalize'] = 0 112 | else: 113 | key, value = line.split("=") 114 | value = value.strip() 115 | module_defs[-1][key.rstrip()] = value.strip() 116 | return module_defs 117 | 118 | class ConvBatchNormReLU(nn.Sequential): 119 | def __init__( 120 | self, 121 | in_channels, 122 | out_channels, 123 | kernel_size, 124 | stride, 125 | padding, 126 | dilation, 127 | leaky=False, 128 | relu=True, 129 | instance=False, 130 | ): 131 | super(ConvBatchNormReLU, self).__init__() 132 | self.add_module( 133 | "conv", 134 | nn.Conv2d( 135 | in_channels=in_channels, 136 | out_channels=out_channels, 137 | kernel_size=kernel_size, 138 | stride=stride, 139 | padding=padding, 140 | dilation=dilation, 141 | bias=False, 142 | ), 143 | ) 144 | if instance: 145 | self.add_module( 146 | "bn", 147 | nn.InstanceNorm2d(num_features=out_channels), 148 | ) 149 | else: 150 | self.add_module( 151 | "bn", 152 | nn.BatchNorm2d( 153 | num_features=out_channels, eps=1e-5, momentum=0.999, affine=True 154 | ), 155 | ) 156 | 157 | if leaky: 158 | self.add_module("relu", nn.LeakyReLU(0.1)) 159 | elif relu: 160 | self.add_module("relu", nn.ReLU()) 161 | 162 | def forward(self, x): 163 | return super(ConvBatchNormReLU, self).forward(x) 164 | 165 | class ConvBatchNormReLU_3d(nn.Sequential): 166 | def __init__( 167 | self, 168 | in_channels, 169 | out_channels, 170 | kernel_size, 171 | stride, 172 | padding, 173 | dilation, 174 | leaky=False, 175 | relu=True, 176 | ): 177 | super(ConvBatchNormReLU_3d, self).__init__() 178 | self.add_module( 179 | "conv", 180 | nn.Conv3d( 181 | in_channels=in_channels, 182 
| out_channels=out_channels, 183 | kernel_size=kernel_size, 184 | stride=stride, 185 | padding=padding, 186 | dilation=dilation, 187 | bias=False, 188 | ), 189 | ) 190 | self.add_module( 191 | "bn", 192 | nn.BatchNorm3d( 193 | num_features=out_channels, eps=1e-5, momentum=0.999, affine=True 194 | ), 195 | ) 196 | 197 | if leaky: 198 | self.add_module("relu", nn.LeakyReLU(0.1)) 199 | elif relu: 200 | self.add_module("relu", nn.ReLU()) 201 | 202 | def forward(self, x): 203 | return super(ConvBatchNormReLU_3d, self).forward(x) 204 | 205 | class MyUpsample2(nn.Module): 206 | def forward(self, x): 207 | return x[:, :, :, None, :, None].expand(-1, -1, -1, 2, -1, 2).reshape(x.size(0), x.size(1), x.size(2)*2, x.size(3)*2) 208 | 209 | def create_modules(module_defs): 210 | """ 211 | Constructs module list of layer blocks from module configuration in module_defs 212 | """ 213 | hyperparams = module_defs.pop(0) 214 | output_filters = [int(hyperparams["channels"])] 215 | module_list = nn.ModuleList() 216 | for i, module_def in enumerate(module_defs): 217 | modules = nn.Sequential() 218 | 219 | if module_def["type"] == "convolutional" or module_def["type"] == "yoloconvolutional": 220 | bn = int(module_def["batch_normalize"]) 221 | filters = int(module_def["filters"]) 222 | kernel_size = int(module_def["size"]) 223 | pad = (kernel_size - 1) // 2 if int(module_def["pad"]) else 0 224 | modules.add_module( 225 | "conv_%d" % i, 226 | nn.Conv2d( 227 | in_channels=output_filters[-1], 228 | out_channels=filters, 229 | kernel_size=kernel_size, 230 | stride=int(module_def["stride"]), 231 | padding=pad, 232 | bias=not bn, 233 | ), 234 | ) 235 | if bn: 236 | modules.add_module("batch_norm_%d" % i, nn.BatchNorm2d(filters)) 237 | if module_def["activation"] == "leaky": 238 | modules.add_module("leaky_%d" % i, nn.LeakyReLU(0.1)) 239 | 240 | elif module_def["type"] == "maxpool": 241 | kernel_size = int(module_def["size"]) 242 | stride = int(module_def["stride"]) 243 | if kernel_size == 2 and stride == 1: 244 | padding = nn.ZeroPad2d((0, 1, 0, 1)) 245 | modules.add_module("_debug_padding_%d" % i, padding) 246 | maxpool = nn.MaxPool2d( 247 | kernel_size=int(module_def["size"]), 248 | stride=int(module_def["stride"]), 249 | padding=int((kernel_size - 1) // 2), 250 | ) 251 | modules.add_module("maxpool_%d" % i, maxpool) 252 | 253 | elif module_def["type"] == "upsample": 254 | # upsample = nn.Upsample(scale_factor=int(module_def["stride"]), mode="nearest") 255 | assert(int(module_def["stride"])==2) 256 | upsample = MyUpsample2() 257 | modules.add_module("upsample_%d" % i, upsample) 258 | 259 | elif module_def["type"] == "route": 260 | layers = [int(x) for x in module_def["layers"].split(",")] 261 | filters = sum([output_filters[layer_i] for layer_i in layers]) 262 | modules.add_module("route_%d" % i, EmptyLayer()) 263 | 264 | elif module_def["type"] == "shortcut": 265 | filters = output_filters[int(module_def["from"])] 266 | modules.add_module("shortcut_%d" % i, EmptyLayer()) 267 | 268 | elif module_def["type"] == "yolo": 269 | anchor_idxs = [int(x) for x in module_def["mask"].split(",")] 270 | # Extract anchors 271 | anchors = [int(x) for x in module_def["anchors"].split(",")] 272 | anchors = [(anchors[i], anchors[i + 1]) for i in range(0, len(anchors), 2)] 273 | anchors = [anchors[i] for i in anchor_idxs] 274 | num_classes = int(module_def["classes"]) 275 | img_height = int(hyperparams["height"]) 276 | # Define detection layer 277 | # yolo_layer = YOLOLayer(anchors, num_classes, img_height) 278 | yolo_layer = 
YOLOLayer(anchors, num_classes, 256) 279 | modules.add_module("yolo_%d" % i, yolo_layer) 280 | # Register module list and number of output filters 281 | module_list.append(modules) 282 | output_filters.append(filters) 283 | 284 | return hyperparams, module_list 285 | 286 | class EmptyLayer(nn.Module): 287 | """Placeholder for 'route' and 'shortcut' layers""" 288 | 289 | def __init__(self): 290 | super(EmptyLayer, self).__init__() 291 | 292 | class YOLOLayer(nn.Module): 293 | """Detection layer""" 294 | 295 | def __init__(self, anchors, num_classes, img_dim): 296 | super(YOLOLayer, self).__init__() 297 | self.anchors = anchors 298 | self.num_anchors = len(anchors) 299 | self.num_classes = num_classes 300 | self.bbox_attrs = 5 + num_classes 301 | self.image_dim = img_dim 302 | self.ignore_thres = 0.5 303 | self.lambda_coord = 1 304 | 305 | self.mse_loss = nn.MSELoss(size_average=True) # Coordinate loss 306 | self.bce_loss = nn.BCELoss(size_average=True) # Confidence loss 307 | self.ce_loss = nn.CrossEntropyLoss() # Class loss 308 | 309 | def forward(self, x, targets=None): 310 | nA = self.num_anchors 311 | nB = x.size(0) 312 | nG = x.size(2) 313 | stride = self.image_dim / nG 314 | 315 | # Tensors for cuda support 316 | FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor 317 | LongTensor = torch.cuda.LongTensor if x.is_cuda else torch.LongTensor 318 | ByteTensor = torch.cuda.ByteTensor if x.is_cuda else torch.ByteTensor 319 | 320 | prediction = x.view(nB, nA, self.bbox_attrs, nG, nG).permute(0, 1, 3, 4, 2).contiguous() 321 | 322 | # Get outputs 323 | x = torch.sigmoid(prediction[..., 0]) # Center x 324 | y = torch.sigmoid(prediction[..., 1]) # Center y 325 | w = prediction[..., 2] # Width 326 | h = prediction[..., 3] # Height 327 | pred_conf = torch.sigmoid(prediction[..., 4]) # Conf 328 | pred_cls = torch.sigmoid(prediction[..., 5:]) # Cls pred. 
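# Worked example of the decoding done below (added for clarity, numbers are only
# illustrative): with nG = 13 and the anchor (116, 90), scaled by 416/nG = 32 to
# (3.625, 2.8125), a cell at grid position (gi, gj) = (4, 7) yields
#   bx = sigmoid(tx) + 4,   by = sigmoid(ty) + 7      # centre in grid units
#   bw = 3.625 * exp(tw),   bh = 2.8125 * exp(th)     # size in grid units
# and the inference branch later multiplies pred_boxes by `stride` to return to image
# coordinates.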
329 | 330 | # Calculate offsets for each grid 331 | grid_x = torch.arange(nG).repeat(nG, 1).view([1, 1, nG, nG]).type(FloatTensor) 332 | grid_y = torch.arange(nG).repeat(nG, 1).t().view([1, 1, nG, nG]).type(FloatTensor) 333 | # scaled_anchors = FloatTensor([(a_w / stride, a_h / stride) for a_w, a_h in self.anchors]) 334 | scaled_anchors = FloatTensor([(a_w / (416 / nG), a_h / (416 / nG)) for a_w, a_h in self.anchors]) 335 | anchor_w = scaled_anchors[:, 0:1].view((1, nA, 1, 1)) 336 | anchor_h = scaled_anchors[:, 1:2].view((1, nA, 1, 1)) 337 | 338 | # Add offset and scale with anchors 339 | pred_boxes = FloatTensor(prediction[..., :4].shape) 340 | pred_boxes[..., 0] = x.data + grid_x 341 | pred_boxes[..., 1] = y.data + grid_y 342 | pred_boxes[..., 2] = torch.exp(w.data) * anchor_w 343 | pred_boxes[..., 3] = torch.exp(h.data) * anchor_h 344 | 345 | # Training 346 | if targets is not None: 347 | targets = targets.clone() 348 | targets[:,:,1:] = targets[:,:,1:]/self.image_dim 349 | for b_i in range(targets.shape[0]): 350 | targets[b_i,:,1:] = xyxy2xywh(targets[b_i,:,1:]) 351 | 352 | if x.is_cuda: 353 | self.mse_loss = self.mse_loss.cuda() 354 | self.bce_loss = self.bce_loss.cuda() 355 | self.ce_loss = self.ce_loss.cuda() 356 | 357 | nGT, nCorrect, mask, conf_mask, tx, ty, tw, th, tconf, tcls = build_object_targets( 358 | pred_boxes=pred_boxes.cpu().data, 359 | pred_conf=pred_conf.cpu().data, 360 | pred_cls=pred_cls.cpu().data, 361 | target=targets.cpu().data, 362 | anchors=scaled_anchors.cpu().data, 363 | num_anchors=nA, 364 | num_classes=self.num_classes, 365 | grid_size=nG, 366 | ignore_thres=self.ignore_thres, 367 | img_dim=self.image_dim, 368 | ) 369 | 370 | nProposals = int((pred_conf > 0.5).sum().item()) 371 | recall = float(nCorrect / nGT) if nGT else 1 372 | precision = float(nCorrect / nProposals) if nProposals else 0 373 | 374 | # Handle masks 375 | mask = Variable(mask.type(ByteTensor)) 376 | conf_mask = Variable(conf_mask.type(ByteTensor)) 377 | 378 | # Handle target variables 379 | tx = Variable(tx.type(FloatTensor), requires_grad=False) 380 | ty = Variable(ty.type(FloatTensor), requires_grad=False) 381 | tw = Variable(tw.type(FloatTensor), requires_grad=False) 382 | th = Variable(th.type(FloatTensor), requires_grad=False) 383 | tconf = Variable(tconf.type(FloatTensor), requires_grad=False) 384 | tcls = Variable(tcls.type(LongTensor), requires_grad=False) 385 | 386 | # Get conf mask where gt and where there is no gt 387 | conf_mask_true = mask 388 | conf_mask_false = conf_mask - mask 389 | 390 | # Mask outputs to ignore non-existing objects 391 | loss_x = self.mse_loss(x[mask], tx[mask]) 392 | loss_y = self.mse_loss(y[mask], ty[mask]) 393 | loss_w = self.mse_loss(w[mask], tw[mask]) 394 | loss_h = self.mse_loss(h[mask], th[mask]) 395 | loss_conf = self.bce_loss(pred_conf[conf_mask_false], tconf[conf_mask_false]) + self.bce_loss( 396 | pred_conf[conf_mask_true], tconf[conf_mask_true] 397 | ) 398 | loss_cls = (1 / nB) * self.ce_loss(pred_cls[mask], torch.argmax(tcls[mask], 1)) 399 | loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls 400 | return ( 401 | loss, 402 | loss_x.item(), 403 | loss_y.item(), 404 | loss_w.item(), 405 | loss_h.item(), 406 | loss_conf.item(), 407 | loss_cls.item(), 408 | recall, 409 | precision, 410 | ) 411 | 412 | else: 413 | # If not in training phase return predictions 414 | output = torch.cat( 415 | ( 416 | pred_boxes.view(nB, -1, 4) * stride, 417 | pred_conf.view(nB, -1, 1), 418 | pred_cls.view(nB, -1, self.num_classes), 419 | ), 420 | -1, 421 
| ) 422 | return output 423 | 424 | class Darknet(nn.Module): 425 | """YOLOv3 object detection model""" 426 | 427 | def __init__(self, config_path='./model/yolov3.cfg', img_size=416, obj_out=False): 428 | super(Darknet, self).__init__() 429 | self.config_path = config_path 430 | self.obj_out = obj_out 431 | self.module_defs = parse_model_config(config_path) 432 | self.hyperparams, self.module_list = create_modules(self.module_defs) 433 | self.img_size = img_size 434 | self.seen = 0 435 | self.header_info = np.array([0, 0, 0, self.seen, 0]) 436 | self.loss_names = ["x", "y", "w", "h", "conf", "cls", "recall", "precision"] 437 | 438 | def forward(self, x, targets=None): 439 | batch = x.shape[0] 440 | is_training = targets is not None 441 | output, output_obj = [], [] 442 | self.losses = defaultdict(float) 443 | layer_outputs = [] 444 | for i, (module_def, module) in enumerate(zip(self.module_defs, self.module_list)): 445 | if module_def["type"] in ["convolutional", "upsample", "maxpool"]: 446 | x = module(x) 447 | elif module_def["type"] == "route": 448 | layer_i = [int(x) for x in module_def["layers"].split(",")] 449 | x = torch.cat([layer_outputs[i] for i in layer_i], 1) 450 | elif module_def["type"] == "shortcut": 451 | layer_i = int(module_def["from"]) 452 | x = layer_outputs[-1] + layer_outputs[layer_i] 453 | elif module_def["type"] == "yoloconvolutional": 454 | output.append(x) ## save final feature block 455 | x = module(x) 456 | elif module_def["type"] == "yolo": 457 | # Train phase: get loss 458 | if is_training: 459 | x, *losses = module[0](x, targets) 460 | for name, loss in zip(self.loss_names, losses): 461 | self.losses[name] += loss 462 | # Test phase: Get detections 463 | else: 464 | x = module(x) 465 | output_obj.append(x) 466 | # x = module(x) 467 | # output.append(x) 468 | layer_outputs.append(x) 469 | 470 | self.losses["recall"] /= 3 471 | self.losses["precision"] /= 3 472 | # return sum(output) if is_training else torch.cat(output, 1) 473 | # return torch.cat(output, 1) 474 | if self.obj_out: 475 | return output, sum(output_obj) if is_training else torch.cat(output_obj, 1), self.losses["precision"], self.losses["recall"] 476 | # return output, sum(output_obj)/(len(output_obj)*batch) if is_training else torch.cat(output_obj, 1) 477 | else: 478 | return output 479 | 480 | def load_weights(self, weights_path): 481 | """Parses and loads the weights stored in 'weights_path'""" 482 | 483 | # Open the weights file 484 | fp = open(weights_path, "rb") 485 | if self.config_path=='./model/yolo9000.cfg': 486 | header = np.fromfile(fp, dtype=np.int32, count=4) # First five are header values 487 | else: 488 | header = np.fromfile(fp, dtype=np.int32, count=5) # First five are header values 489 | # Needed to write header when saving weights 490 | self.header_info = header 491 | 492 | self.seen = header[3] 493 | weights = np.fromfile(fp, dtype=np.float32) # The rest are weights 494 | fp.close() 495 | 496 | ptr = 0 497 | for i, (module_def, module) in enumerate(zip(self.module_defs, self.module_list)): 498 | if module_def["type"] == "convolutional" or module_def["type"] == "yoloconvolutional": 499 | conv_layer = module[0] 500 | if module_def["batch_normalize"]: 501 | # Load BN bias, weights, running mean and running variance 502 | bn_layer = module[1] 503 | num_b = bn_layer.bias.numel() # Number of biases 504 | # Bias 505 | bn_b = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.bias) 506 | bn_layer.bias.data.copy_(bn_b) 507 | ptr += num_b 508 | # Weight 509 | bn_w = 
torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.weight) 510 | bn_layer.weight.data.copy_(bn_w) 511 | ptr += num_b 512 | # Running Mean 513 | bn_rm = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.running_mean) 514 | bn_layer.running_mean.data.copy_(bn_rm) 515 | ptr += num_b 516 | # Running Var 517 | bn_rv = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.running_var) 518 | bn_layer.running_var.data.copy_(bn_rv) 519 | ptr += num_b 520 | else: 521 | # Load conv. bias 522 | num_b = conv_layer.bias.numel() 523 | conv_b = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(conv_layer.bias) 524 | conv_layer.bias.data.copy_(conv_b) 525 | ptr += num_b 526 | # Load conv. weights 527 | num_w = conv_layer.weight.numel() 528 | conv_w = torch.from_numpy(weights[ptr : ptr + num_w]).view_as(conv_layer.weight) 529 | conv_layer.weight.data.copy_(conv_w) 530 | ptr += num_w 531 | 532 | """ 533 | @:param path - path of the new weights file 534 | @:param cutoff - save layers between 0 and cutoff (cutoff = -1 -> all are saved) 535 | """ 536 | 537 | def save_weights(self, path, cutoff=-1): 538 | 539 | fp = open(path, "wb") 540 | self.header_info[3] = self.seen 541 | self.header_info.tofile(fp) 542 | 543 | # Iterate through layers 544 | for i, (module_def, module) in enumerate(zip(self.module_defs[:cutoff], self.module_list[:cutoff])): 545 | if module_def["type"] == "convolutional": 546 | conv_layer = module[0] 547 | # If batch norm, load bn first 548 | if module_def["batch_normalize"]: 549 | bn_layer = module[1] 550 | bn_layer.bias.data.cpu().numpy().tofile(fp) 551 | bn_layer.weight.data.cpu().numpy().tofile(fp) 552 | bn_layer.running_mean.data.cpu().numpy().tofile(fp) 553 | bn_layer.running_var.data.cpu().numpy().tofile(fp) 554 | # Load conv bias 555 | else: 556 | conv_layer.bias.data.cpu().numpy().tofile(fp) 557 | # Load conv weights 558 | conv_layer.weight.data.cpu().numpy().tofile(fp) 559 | 560 | fp.close 561 | 562 | class Darknetfort(nn.Module): 563 | """YOLOv3 object detection model""" 564 | 565 | def __init__(self, config_path='./model/yolov3.cfg', img_size=416, obj_out=False): 566 | super(Darknetfort, self).__init__() 567 | self.config_path = config_path 568 | self.obj_out = obj_out 569 | self.module_defs = parse_model_config(config_path) 570 | self.hyperparams, self.module_list = create_modules(self.module_defs) 571 | self.img_size = img_size 572 | self.seen = 0 573 | self.header_info = np.array([0, 0, 0, self.seen, 0]) 574 | self.loss_names = ["x", "y", "w", "h", "conf", "cls", "recall", "precision"] 575 | self.layer_num = 12 576 | def forward(self, x, targets=None): 577 | batch = x.shape[0] 578 | is_training = targets is not None 579 | output, output_obj = [], [] 580 | self.losses = defaultdict(float) 581 | layer_outputs = [] 582 | layer = 0 583 | for i, (module_def, module) in enumerate(zip(self.module_defs, self.module_list)): 584 | if module_def["type"] in ["convolutional", "upsample", "maxpool"]: 585 | x = module(x) 586 | layer += 1 587 | elif module_def["type"] == "route": 588 | layer_i = [int(x) for x in module_def["layers"].split(",")] 589 | x = torch.cat([layer_outputs[i] for i in layer_i], 1) 590 | layer += 1 591 | elif module_def["type"] == "shortcut": 592 | layer_i = int(module_def["from"]) 593 | x = layer_outputs[-1] + layer_outputs[layer_i] 594 | layer += 1 595 | elif module_def["type"] == "yoloconvolutional": 596 | output.append(x) ## save final feature block 597 | x = module(x) 598 | layer += 1 599 | elif module_def["type"] == "yolo": 600 | # 
Train phase: get loss 601 | if is_training: 602 | x, *losses = module[0](x, targets) 603 | for name, loss in zip(self.loss_names, losses): 604 | self.losses[name] += loss 605 | # Test phase: Get detections 606 | else: 607 | x = module(x) 608 | output_obj.append(x) 609 | layer += 1 610 | # x = module(x) 611 | # output.append(x) 612 | layer_outputs.append(x) 613 | 614 | self.losses["recall"] /= 3 615 | self.losses["precision"] /= 3 616 | # return sum(output) if is_training else torch.cat(output, 1) 617 | # return torch.cat(output, 1) 618 | if self.obj_out: 619 | return output, sum(output_obj) if is_training else torch.cat(output_obj, 1), self.losses["precision"], self.losses["recall"] 620 | # return output, sum(output_obj)/(len(output_obj)*batch) if is_training else torch.cat(output_obj, 1) 621 | else: 622 | return output 623 | 624 | def load_weights(self, weights_path): 625 | """Parses and loads the weights stored in 'weights_path'""" 626 | 627 | # Open the weights file 628 | fp = open(weights_path, "rb") 629 | if self.config_path=='./model/yolo9000.cfg': 630 | header = np.fromfile(fp, dtype=np.int32, count=4) # First five are header values 631 | else: 632 | header = np.fromfile(fp, dtype=np.int32, count=5) # First five are header values 633 | # Needed to write header when saving weights 634 | self.header_info = header 635 | 636 | self.seen = header[3] 637 | weights = np.fromfile(fp, dtype=np.float32) # The rest are weights 638 | fp.close() 639 | 640 | ptr = 0 641 | for i, (module_def, module) in enumerate(zip(self.module_defs, self.module_list)): 642 | if module_def["type"] == "convolutional" or module_def["type"] == "yoloconvolutional": 643 | conv_layer = module[0] 644 | if module_def["batch_normalize"]: 645 | # Load BN bias, weights, running mean and running variance 646 | bn_layer = module[1] 647 | num_b = bn_layer.bias.numel() # Number of biases 648 | # Bias 649 | bn_b = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.bias) 650 | bn_layer.bias.data.copy_(bn_b) 651 | ptr += num_b 652 | # Weight 653 | bn_w = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.weight) 654 | bn_layer.weight.data.copy_(bn_w) 655 | ptr += num_b 656 | # Running Mean 657 | bn_rm = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.running_mean) 658 | bn_layer.running_mean.data.copy_(bn_rm) 659 | ptr += num_b 660 | # Running Var 661 | bn_rv = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.running_var) 662 | bn_layer.running_var.data.copy_(bn_rv) 663 | ptr += num_b 664 | else: 665 | # Load conv. bias 666 | num_b = conv_layer.bias.numel() 667 | conv_b = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(conv_layer.bias) 668 | conv_layer.bias.data.copy_(conv_b) 669 | ptr += num_b 670 | # Load conv. 
weights 671 | num_w = conv_layer.weight.numel() 672 | conv_w = torch.from_numpy(weights[ptr : ptr + num_w]).view_as(conv_layer.weight) 673 | conv_layer.weight.data.copy_(conv_w) 674 | ptr += num_w 675 | 676 | """ 677 | @:param path - path of the new weights file 678 | @:param cutoff - save layers between 0 and cutoff (cutoff = -1 -> all are saved) 679 | """ 680 | 681 | def save_weights(self, path, cutoff=-1): 682 | 683 | fp = open(path, "wb") 684 | self.header_info[3] = self.seen 685 | self.header_info.tofile(fp) 686 | 687 | # Iterate through layers 688 | for i, (module_def, module) in enumerate(zip(self.module_defs[:cutoff], self.module_list[:cutoff])): 689 | if module_def["type"] == "convolutional": 690 | conv_layer = module[0] 691 | # If batch norm, load bn first 692 | if module_def["batch_normalize"]: 693 | bn_layer = module[1] 694 | bn_layer.bias.data.cpu().numpy().tofile(fp) 695 | bn_layer.weight.data.cpu().numpy().tofile(fp) 696 | bn_layer.running_mean.data.cpu().numpy().tofile(fp) 697 | bn_layer.running_var.data.cpu().numpy().tofile(fp) 698 | # Load conv bias 699 | else: 700 | conv_layer.bias.data.cpu().numpy().tofile(fp) 701 | # Load conv weights 702 | conv_layer.weight.data.cpu().numpy().tofile(fp) 703 | 704 | fp.close 705 | 706 | 707 | if __name__ == "__main__": 708 | import torch 709 | import numpy as np 710 | torch.manual_seed(13) 711 | np.random.seed(13) 712 | torch.backends.cudnn.deterministic = True 713 | torch.backends.cudnn.benchmark = False 714 | 715 | model = Darknet() 716 | model.load_weights('./saved_models/yolov3.weights') 717 | # model.eval() 718 | 719 | image = torch.autograd.Variable(torch.randn(1, 3, 416, 416)) 720 | output1, output2, output3 = model(image) 721 | print(output1) 722 | # print(output1.size(), output2.size(), output3.size()) 723 | # print(model(image)) 724 | # print(len(output), output[0].size(), output[1].size(), output[2].size()) 725 | -------------------------------------------------------------------------------- /model/modulation.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | import math 3 | import random 4 | import pprint 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | from torch.autograd import Variable 9 | import torchvision.models 10 | from torch.nn.init import kaiming_normal, kaiming_uniform_ 11 | from .darknet import ConvBatchNormReLU, ConvBatchNormReLU_3d 12 | 13 | class Bottleneck(nn.Module): 14 | expansion = 4 15 | 16 | def __init__(self, inplanes, planes, stride=1): 17 | super().__init__() 18 | 19 | # all conv layers have stride 1. 
an avgpool is performed after the second convolution when stride > 1 20 | self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False) 21 | self.bn1 = nn.BatchNorm2d(planes) 22 | 23 | self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False) 24 | self.bn2 = nn.BatchNorm2d(planes) 25 | 26 | self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity() 27 | 28 | self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False) 29 | self.bn3 = nn.BatchNorm2d(planes * self.expansion) 30 | 31 | self.relu = nn.ReLU(inplace=True) 32 | self.downsample = None 33 | self.stride = stride 34 | 35 | if stride > 1 or inplanes != planes * Bottleneck.expansion: 36 | # downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1 37 | self.downsample = nn.Sequential(OrderedDict([ 38 | ("-1", nn.AvgPool2d(stride)), 39 | ("0", nn.Conv2d(inplanes, planes * self.expansion, 1, stride=1, bias=False)), 40 | ("1", nn.BatchNorm2d(planes * self.expansion)) 41 | ])) 42 | 43 | def forward(self, x: torch.Tensor): 44 | identity = x 45 | 46 | out = self.relu(self.bn1(self.conv1(x))) 47 | out = self.relu(self.bn2(self.conv2(out))) 48 | out = self.avgpool(out) 49 | out = self.bn3(self.conv3(out)) 50 | 51 | if self.downsample is not None: 52 | identity = self.downsample(x) 53 | 54 | out += identity 55 | out = self.relu(out) 56 | return out 57 | 58 | class AttentionPool2d(nn.Module): 59 | def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None): 60 | super().__init__() 61 | self.positional_embedding = nn.Parameter(torch.randn(spacial_dim ** 2 + 1, embed_dim) / embed_dim ** 0.5) 62 | self.k_proj = nn.Linear(embed_dim, embed_dim) 63 | self.q_proj = nn.Linear(embed_dim, embed_dim) 64 | self.v_proj = nn.Linear(embed_dim, embed_dim) 65 | self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim) 66 | self.num_heads = num_heads 67 | self.embed_dim = embed_dim 68 | self.spacial_dim = spacial_dim 69 | 70 | def forward(self, x): 71 | B, C, H, W = x.shape 72 | x = x.reshape(x.shape[0], x.shape[1], x.shape[2] * x.shape[3]).permute(2, 0, 1) # NCHW -> (HW)NC 73 | x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0) # (HW+1)NC 74 | 75 | cls_pos = self.positional_embedding[0:1, :] 76 | # spatial_pos = F.interpolate(self.positional_embedding[1:,].reshape(1, self.spacial_dim, self.spacial_dim, self.embed_dim).permute(0, 3, 1, 2), size=(H, W), mode='bilinear') 77 | spatial_pos = self.positional_embedding[1:].reshape(self.spacial_dim, self.spacial_dim, self.embed_dim)[:H, :W] 78 | spatial_pos = spatial_pos.reshape(-1, self.embed_dim) 79 | # spatial_pos = spatial_pos.reshape(self.embed_dim, H*W).permute(1, 0) 80 | positional_embedding = torch.cat([cls_pos, spatial_pos], dim=0) 81 | 82 | x = x + positional_embedding[:, None, :] 83 | x, _ = F.multi_head_attention_forward( 84 | query=x, key=x, value=x, 85 | embed_dim_to_check=x.shape[-1], 86 | num_heads=self.num_heads, 87 | q_proj_weight=self.q_proj.weight, 88 | k_proj_weight=self.k_proj.weight, 89 | v_proj_weight=self.v_proj.weight, 90 | in_proj_weight=None, 91 | in_proj_bias=torch.cat([self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]), 92 | bias_k=None, 93 | bias_v=None, 94 | add_zero_attn=False, 95 | dropout_p=0, 96 | out_proj_weight=self.c_proj.weight, 97 | out_proj_bias=self.c_proj.bias, 98 | use_separate_proj_weight=True, 99 | training=self.training, 100 | need_weights=False 101 | ) 102 | 103 | x = x.permute(1, 2, 0) 104 | global_feat = x[:, :, 0] 105 | feature_map = x[:, :, 1:].reshape(B, -1, H, W) 106 | 
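# Added note: global_feat is the attention output at the prepended mean-token position
# (a CLS-like summary of the whole map), while feature_map reshapes the remaining H*W token
# outputs back to a (B, C, H, W) spatial map for downstream use.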
return global_feat, feature_map 107 | 108 | class CLIPResNetWithAttention(nn.Module): 109 | """ 110 | A ResNet class that is similar to torchvision's but contains the following changes: 111 | - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool. 112 | - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1 113 | - The final pooling layer is a QKV attention instead of an average pool 114 | """ 115 | 116 | def __init__(self, layers, output_dim=1024, input_resolution=224, width=64, pretrained=None, att_level3=False, baseline=False, **kwargs): 117 | super().__init__() 118 | self.pretrained = pretrained 119 | self.output_dim = output_dim 120 | self.input_resolution = input_resolution 121 | 122 | # the 3-layer stem 123 | self.conv1 = nn.Conv2d(4, width // 2, kernel_size=3, stride=2, padding=1, bias=False) 124 | self.bn1 = nn.BatchNorm2d(width // 2) 125 | self.conv2 = nn.Conv2d(width // 2, width // 2, kernel_size=3, padding=1, bias=False) 126 | self.bn2 = nn.BatchNorm2d(width // 2) 127 | self.conv3 = nn.Conv2d(width // 2, width, kernel_size=3, padding=1, bias=False) 128 | self.bn3 = nn.BatchNorm2d(width) 129 | self.avgpool = nn.AvgPool2d(2) 130 | self.relu = nn.ReLU(inplace=True) 131 | self.reg = torch.nn.Sequential( 132 | nn.Conv2d(256, 1, kernel_size=1, padding=0, bias=False), 133 | nn.Sigmoid() 134 | ) 135 | # residual layers 136 | self._inplanes = width # this is a *mutable* variable used during construction 137 | self.layer1 = self._make_layer(width, layers[0]) 138 | self.layer2 = self._make_layer(width * 2, layers[1], stride=2) 139 | self.layer3 = self._make_layer(width * 4, layers[2], stride=2) 140 | self.layer4 = self._make_layer(width * 8, layers[3], stride=2) 141 | 142 | embed_dim = width * 32 # the ResNet feature dimension 143 | self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim, 32, output_dim) 144 | self.att_level3 = att_level3 145 | self.baseline = baseline 146 | 147 | def init_weights(self, pretrained=None): 148 | pretrained = pretrained or self.pretrained 149 | if isinstance(pretrained, str): 150 | checkpoint = torch.jit.load(pretrained, map_location='cpu').float().state_dict() 151 | 152 | state_dict = {} 153 | 154 | for k in checkpoint.keys(): 155 | if k.startswith('visual.'): 156 | new_k = k.replace('visual.', '') 157 | state_dict[new_k] = checkpoint[k] 158 | 159 | if 'positional_embedding' in new_k: 160 | if self.attnpool.positional_embedding.shape != state_dict[new_k].shape: 161 | print(f'Resize the pos_embed shape from {state_dict[new_k].shape} to {self.attnpool.positional_embedding.shape}') 162 | cls_pos = state_dict[new_k][0:1, :] 163 | H = W = self.input_resolution // 32 164 | spatial_pos = F.interpolate(state_dict[new_k][1:,].reshape(1, 7, 7, cls_pos.shape[1]).permute(0, 3, 1, 2), size=(H, W), mode='bilinear') 165 | spatial_pos = spatial_pos.reshape(cls_pos.shape[1], H*W).permute(1, 0) 166 | positional_embedding = torch.cat([cls_pos, spatial_pos], dim=0) 167 | state_dict[new_k] = positional_embedding 168 | assert self.attnpool.positional_embedding.shape == state_dict[new_k].shape 169 | 170 | u, w = self.load_state_dict(state_dict, False) 171 | print(u, w, 'are misaligned params in CLIPResNet') 172 | 173 | def _make_layer(self, planes, blocks, stride=1): 174 | layers = [Bottleneck(self._inplanes, planes, stride)] 175 | 176 | self._inplanes = planes * Bottleneck.expansion 177 | for _ in range(1, blocks): 178 | layers.append(Bottleneck(self._inplanes, planes)) 179 
| 180 | return nn.Sequential(*layers) 181 | 182 | def forward(self, x): 183 | def stem(x): 184 | for conv, bn in [(self.conv1, self.bn1), (self.conv2, self.bn2), (self.conv3, self.bn3)]: 185 | x = self.relu(bn(conv(x))) 186 | x = self.avgpool(x) 187 | return x 188 | 189 | x = x.type(self.conv1.weight.dtype) 190 | x = stem(x) 191 | 192 | outs = [] 193 | x = self.layer1(x) 194 | out1 = self.reg(x) 195 | outs.append(out1) 196 | x = self.layer2(x) 197 | outs.append(x) 198 | x = self.layer3(x) 199 | outs.append(x) 200 | x = self.layer4(x) 201 | outs.append(x) 202 | 203 | x_global, x_local = self.attnpool(x) 204 | outs.append([x_global, x_local]) 205 | if self.att_level3: 206 | new_outs = [outs[0], outs[1], outs[2], outs[4][1], outs[4]] 207 | if self.baseline: 208 | new_outs = new_outs[:-1] 209 | return tuple(new_outs) 210 | else: 211 | return tuple(outs) 212 | 213 | class CLIPTextEncoder(nn.Module): 214 | def __init__(self, context_length=77, 215 | vocab_size=49408, 216 | transformer_width=512, 217 | transformer_heads=8, 218 | transformer_layers=12, 219 | embed_dim=512, 220 | out_dim=256, 221 | pretrained=None, **kwargs): 222 | super().__init__() 223 | 224 | self.pretrained = pretrained 225 | 226 | self.context_length = context_length 227 | 228 | self.transformer = Transformer( 229 | width=transformer_width, 230 | layers=transformer_layers, 231 | heads=transformer_heads, 232 | attn_mask=self.build_attention_mask() 233 | ) 234 | 235 | self.vocab_size = vocab_size 236 | self.token_embedding = nn.Embedding(vocab_size, transformer_width) 237 | self.positional_embedding = nn.Parameter(torch.empty(self.context_length, transformer_width)) 238 | self.ln_final = LayerNorm(transformer_width) 239 | self.text_projection = nn.Parameter(torch.empty(transformer_width, embed_dim)) 240 | 241 | def init_weights(self, pretrained=None): 242 | pretrained = pretrained or self.pretrained 243 | if isinstance(pretrained, str): 244 | checkpoint = torch.jit.load(pretrained, map_location='cpu').float().state_dict() 245 | 246 | state_dict = {} 247 | 248 | for k in checkpoint.keys(): 249 | if k.startswith('transformer.'): 250 | state_dict[k] = checkpoint[k] 251 | 252 | if k == 'positional_embedding' or k == 'text_projection' or k.startswith('token_embedding') or k.startswith('ln_final'): 253 | if k == 'positional_embedding' and checkpoint[k].size(0) > self.context_length: 254 | checkpoint[k] = checkpoint[k][:self.context_length] 255 | print('positional_embedding is tuncated from 77 to', self.context_length) 256 | state_dict[k] = checkpoint[k] 257 | 258 | u, w = self.load_state_dict(state_dict, False) 259 | print(u, w, 'are misaligned params in text encoder') 260 | 261 | 262 | def build_attention_mask(self): 263 | # lazily create causal attention mask, with full attention between the vision tokens 264 | # pytorch uses additive attention mask; fill with -inf 265 | mask = torch.empty(self.context_length, self.context_length) 266 | mask.fill_(float("-inf")) 267 | mask.triu_(1) # zero out the lower diagonal 268 | return mask 269 | 270 | def forward(self, text): 271 | x = self.token_embedding(text) # [batch_size, n_ctx, d_model] 272 | #print(x.shape) 273 | #exit() 274 | x = x + self.positional_embedding 275 | 276 | x = x.permute(1, 0, 2) # NLD -> LND 277 | x = self.transformer(x) 278 | x = x.permute(1, 0, 2) # LND -> NLD 279 | x = self.ln_final(x) 280 | x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection 281 | #x = self.out_proj(x) 282 | return x 283 | 284 | class CLIPVisionTransformer(nn.Module): 285 | 
def __init__(self, input_resolution=224, patch_size=32, width=768, layers=3, heads=2, output_dim=512, out_indices=[0,1,2], pretrained=None, **kwargs): 286 | super().__init__() 287 | self.pretrained = pretrained 288 | self.input_resolution = input_resolution 289 | self.output_dim = output_dim 290 | self.conv1 = nn.Conv2d(in_channels=3, out_channels=width, kernel_size=patch_size, stride=patch_size, bias=False) 291 | 292 | scale = width ** -0.5 293 | self.class_embedding = nn.Parameter(scale * torch.randn(width)) 294 | self.positional_embedding = nn.Parameter(scale * torch.randn((input_resolution // patch_size) ** 2 + 1, width)) 295 | self.spatial_size = input_resolution // patch_size 296 | self.ln_pre = LayerNorm(width) 297 | 298 | self.transformer = Transformer(width, layers, heads) 299 | 300 | self.out_indices = out_indices 301 | 302 | self.ln_post = LayerNorm(width) 303 | self.proj = nn.Parameter(scale * torch.randn(width, output_dim)) 304 | 305 | embed_dim = width 306 | if patch_size == 16: 307 | self.fpn1 = nn.Sequential( 308 | nn.GroupNorm(1, embed_dim), 309 | nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2), 310 | nn.BatchNorm2d(embed_dim), 311 | nn.GELU(), 312 | nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2), 313 | ) 314 | 315 | self.fpn2 = nn.Sequential( 316 | nn.GroupNorm(1, embed_dim), 317 | nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2), 318 | ) 319 | 320 | self.fpn3 = nn.GroupNorm(1, embed_dim) 321 | 322 | self.fpn4 = nn.Sequential( 323 | nn.GroupNorm(1, embed_dim), 324 | nn.MaxPool2d(kernel_size=2, stride=2) 325 | ) 326 | 327 | elif patch_size == 8: 328 | self.fpn1 = nn.Sequential( 329 | nn.GroupNorm(1, embed_dim), 330 | nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2), 331 | ) 332 | 333 | self.fpn2 = nn.GroupNorm(1, embed_dim) 334 | 335 | self.fpn3 = nn.Sequential( 336 | nn.GroupNorm(1, embed_dim), 337 | nn.MaxPool2d(kernel_size=2, stride=2), 338 | ) 339 | 340 | self.fpn4 = nn.Sequential( 341 | nn.GroupNorm(1, embed_dim), 342 | nn.MaxPool2d(kernel_size=4, stride=4), 343 | ) 344 | 345 | 346 | def init_weights(self, pretrained=None): 347 | pretrained = pretrained or self.pretrained 348 | if isinstance(pretrained, str): 349 | checkpoint = torch.jit.load(pretrained, map_location='cpu').float().state_dict() 350 | 351 | state_dict = {} 352 | 353 | for k in checkpoint.keys(): 354 | if k.startswith('visual.'): 355 | new_k = k.replace('visual.', '') 356 | state_dict[new_k] = checkpoint[k] 357 | 358 | if 'positional_embedding' in state_dict.keys(): 359 | if self.positional_embedding.shape != state_dict['positional_embedding'].shape: 360 | print(f'Resize the pos_embed shape from {state_dict["positional_embedding"].shape} to {self.positional_embedding.shape}') 361 | cls_pos = state_dict["positional_embedding"][0:1, :] 362 | spatial_pos = F.interpolate(state_dict["positional_embedding"][1:,].reshape(1, 14, 14, 768).permute(0, 3, 1, 2), size=(self.spatial_size, self.spatial_size), mode='bilinear') 363 | spatial_pos = spatial_pos.reshape(768, self.spatial_size*self.spatial_size).permute(1, 0) 364 | positional_embedding = torch.cat([cls_pos, spatial_pos], dim=0) 365 | state_dict['positional_embedding'] = positional_embedding 366 | assert self.positional_embedding.shape == state_dict['positional_embedding'].shape 367 | 368 | u, w = self.load_state_dict(state_dict, False) 369 | #print(u[0]) 370 | print(u, w, 'are misaligned params in vision transformer') 371 | 372 | def forward(self, x: torch.Tensor): 373 | #x = 
self.conv1(x) # shape = [*, width, grid, grid] 374 | x = x 375 | #print(x.shape) 376 | B, C, H, W = x.shape 377 | 378 | x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2] 379 | x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width] 380 | x = torch.cat([self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1) # shape = [*, grid ** 2 + 1, width] 381 | 382 | 383 | pos = self.positional_embedding.to(x.dtype) 384 | cls_pos = pos[0,:] + self.class_embedding.to(x.dtype) 385 | spatial_pos = F.interpolate(pos[1:,].reshape(1, self.spatial_size, self.spatial_size, C).permute(0, 3, 1, 2), size=(H, W), mode='bilinear') 386 | spatial_pos = spatial_pos.reshape(1, C, H*W).permute(0, 2, 1) 387 | pos = torch.cat([cls_pos.reshape(1, 1, C), spatial_pos], dim=1) 388 | x = x + pos 389 | x = self.ln_pre(x) 390 | x = x.permute(1, 0, 2) # NLD -> LND 391 | 392 | features = [] 393 | for i, blk in enumerate(self.transformer.resblocks): 394 | x = blk(x) 395 | if i in self.out_indices: 396 | xp = x[1:,: , :].permute(1, 2, 0).reshape(B, -1, H, W) 397 | features.append(xp.contiguous()) 398 | 399 | ops = [self.fpn1, self.fpn2, self.fpn3, self.fpn4] 400 | for i in range(len(features)): 401 | features[i] = ops[i](features[i]) 402 | 403 | return tuple(features) 404 | 405 | 406 | class ResidualAttentionBlock(nn.Module): 407 | def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None, drop_path=0.): 408 | super().__init__() 409 | 410 | self.attn = nn.MultiheadAttention(d_model, n_head) 411 | self.ln_1 = LayerNorm(d_model) 412 | self.mlp = nn.Sequential(OrderedDict([ 413 | ("c_fc", nn.Linear(d_model, d_model * 4)), 414 | ("gelu", QuickGELU()), 415 | ("c_proj", nn.Linear(d_model * 4, d_model)) 416 | ])) 417 | self.ln_2 = LayerNorm(d_model) 418 | self.attn_mask = attn_mask 419 | 420 | self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() 421 | 422 | def attention(self, x: torch.Tensor): 423 | self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None 424 | return self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask)[0] 425 | 426 | def forward(self, x: torch.Tensor): 427 | x = x + self.drop_path(self.attention(self.ln_1(x))) 428 | x = x + self.drop_path(self.mlp(self.ln_2(x))) 429 | return x 430 | 431 | class QuickGELU(nn.Module): 432 | def forward(self, x: torch.Tensor): 433 | return x * torch.sigmoid(1.702 * x) 434 | class Transformer(nn.Module): 435 | def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None, drop_path_rate=0.): 436 | super().__init__() 437 | self.width = width 438 | self.layers = layers 439 | dpr = [x.item() for x in torch.linspace(0, drop_path_rate, layers)] # stochastic depth decay rule 440 | self.resblocks = nn.Sequential(*[ResidualAttentionBlock(width, heads, attn_mask, dpr[i]) for i in range(layers)]) 441 | 442 | def forward(self, x: torch.Tensor): 443 | return self.resblocks(x) 444 | 445 | def init_modules(modules, init='uniform'): 446 | if init.lower() == 'normal': 447 | init_params = kaiming_normal 448 | elif init.lower() == 'uniform': 449 | init_params = kaiming_uniform_ 450 | else: 451 | return 452 | for m in modules: 453 | if isinstance(m, (nn.Conv3d, nn.Conv2d, nn.Linear)): 454 | init_params(m.weight) 455 | 456 | def gelu(x): 457 | return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) 458 | 459 | class FiLM(nn.Module): 460 | """ 461 | A Feature-wise Linear Modulation Layer from 462 | 'FiLM: Visual Reasoning with a General Conditioning Layer' 463 | """ 464 | def forward(self, x, gammas, betas): 465 | # gammas = gammas.unsqueeze(2).unsqueeze(3).expand_as(x) 466 | # betas = betas.unsqueeze(2).unsqueeze(3).expand_as(x) 467 | return (gammas * x) + betas 468 | 469 | def mask_softmax(attn_score, word_mask, tempuature=10., clssep=False, lstm=False): 470 | if len(attn_score.shape)!=2: 471 | attn_score = attn_score.squeeze(2).squeeze(2) 472 | word_mask_cp = word_mask[:,:attn_score.shape[1]].clone() 473 | score = F.softmax(attn_score*tempuature, dim=1) 474 | if not clssep: 475 | for ii in range(word_mask_cp.shape[0]): 476 | if lstm: 477 | word_mask_cp[ii,word_mask_cp[ii,:].sum()-1]=0 478 | else: 479 | word_mask_cp[ii,0]=0 480 | word_mask_cp[ii,word_mask_cp[ii,:].sum()]=0 ## set one to 0 already 481 | mask_score = score * word_mask_cp.float() 482 | mask_score = mask_score/(mask_score.sum(1)+1e-8).view(mask_score.size(0), 1).expand(mask_score.size(0), mask_score.size(1)) 483 | return mask_score 484 | 485 | class FiLMedConvBlock_context(nn.Module): 486 | def __init__(self, with_residual=True, with_batchnorm=True, 487 | with_cond=[False], dropout=0, num_extra_channels=0, extra_channel_freq=1, 488 | with_input_proj=1, num_cond_maps=8, kernel_size=1, batchnorm_affine=False, 489 | num_layers=1, condition_method='bn-film', debug_every=float('inf'), 490 | textdim=768,visudim=512,contextdim=512,emb_size=512,fusion='prod',cont_map=False, 491 | lstm=False,baseline=False): 492 | super(FiLMedConvBlock_context, self).__init__() 493 | 494 | self.cont_map = cont_map ## mapping context with language feature 495 | self.lstm = lstm 496 | self.emb_size = emb_size 497 | self.with_residual = with_residual 498 | self.fusion = fusion 499 | self.baseline = baseline 500 | self.film = FiLM() 501 | 502 | if self.cont_map: 503 | self.sent_map = nn.Linear(768, emb_size) 504 | 
self.context_map = nn.Linear(emb_size, emb_size) 505 | if self.fusion == 'cat': 506 | self.attn_map = nn.Conv1d(textdim+visudim, emb_size//2, kernel_size=1) 507 | elif self.fusion == 'prod': 508 | assert(textdim==visudim) ## if product fusion 509 | self.attn_map = nn.Conv1d(visudim, emb_size//2, kernel_size=1) 510 | 511 | self.attn_score = nn.Conv1d(emb_size//2, 1, kernel_size=1) 512 | if self.baseline: 513 | self.fusion_layer = ConvBatchNormReLU(visudim+textdim+8, emb_size, 1, 1, 0, 1) 514 | else: 515 | self.gamme_decode = nn.Linear(textdim, 2 * emb_size) 516 | self.conv1 = nn.Conv2d(visudim+8, emb_size, kernel_size=1) 517 | # self.bn1 = nn.BatchNorm2d(emb_size) 518 | self.bn1 = nn.InstanceNorm2d(emb_size) 519 | init_modules(self.modules()) 520 | 521 | 522 | def forward(self, fvisu, fword, context_score, fcoord,gest, textattn=None,weight=None,fsent=None,word_mask=None): 523 | fword = fword.permute(0, 2, 1) 524 | B, Dvisu, H, W = fvisu.size() 525 | B, Dlang, N = fword.size() 526 | B, N = context_score.size() 527 | assert(Dvisu==Dlang) 528 | 529 | if self.cont_map and fsent is not None: 530 | fsent = F.normalize(F.relu(self.sent_map(fsent)), p=2, dim=1) 531 | fcont = torch.matmul(context_score.view(B,1,N),fword.permute(0,2,1)).squeeze(1) 532 | fcontext = F.relu(self.context_map(fsent*fcont)).unsqueeze(2).repeat(1,1,N) 533 | ## word attention 534 | tile_visu = torch.mean(fvisu.view(B, Dvisu, -1),dim=2,keepdim=True).repeat(1,1,N) 535 | if self.fusion == 'cat': 536 | context_tile = torch.cat([tile_visu,\ 537 | fword, fcontext], dim=1) 538 | elif self.fusion == 'prod': 539 | context_tile = tile_visu * \ 540 | fword * fcontext 541 | else: 542 | ## word attention 543 | tile_visu = torch.mean(fvisu.view(B, Dvisu, -1),dim=2,keepdim=True).repeat(1,1,N) 544 | if self.fusion == 'cat': 545 | context_tile = torch.cat([tile_visu,\ 546 | fword * context_score.view(B, 1, N).repeat(1, Dlang, 1,)], dim=1) 547 | elif self.fusion == 'prod': 548 | context_tile = tile_visu * \ 549 | fword * context_score.view(B, 1, N).repeat(1, Dlang, 1,) 550 | #print(context_tile.shape) 551 | #print(tile_visu.shape) 552 | 553 | attn_feat = F.tanh(self.attn_map(context_tile)) 554 | attn_score = self.attn_score(attn_feat).squeeze(1) 555 | mask_score = mask_softmax(attn_score,word_mask,lstm=self.lstm) 556 | attn_lang = torch.matmul(mask_score.view(B,1,N),fword.permute(0,2,1)) 557 | attn_lang = attn_lang.view(B,Dlang).squeeze(1) 558 | 559 | if self.baseline: 560 | fmodu = self.fusion_layer(torch.cat([fvisu,\ 561 | attn_lang.unsqueeze(2).unsqueeze(2).repeat(1,1,fvisu.shape[-1],fvisu.shape[-1]),fcoord],dim=1)) 562 | else: 563 | ## lang-> gamma, beta 564 | film_param = self.gamme_decode(attn_lang) 565 | film_param = film_param.view(B,2*self.emb_size,1,1).repeat(1,1,H,W) 566 | #print(film_param.shape) 567 | gammas, betas = torch.split(film_param, self.emb_size, dim=1) 568 | 569 | gammas, betas = F.tanh(gammas), F.tanh(betas) 570 | #gest = F.tanh(gest) 571 | # GEST LANGUAGE FUSION 572 | # gammas = gammas * gest.repeat(1,512,1,1).detach() 573 | # betas = betas * gest.repeat(1,512,1,1).detach() 574 | 575 | ## modulate visu feature 576 | fmodu = self.bn1(self.conv1(torch.cat([fvisu,fcoord],dim=1))) 577 | #print(fmodu.shape) 578 | #print(gammas.shape) 579 | #print(betas.shape) 580 | #exit() 581 | fmodu = self.film(fmodu, gammas, betas) 582 | fmodu = F.relu(fmodu) 583 | if self.with_residual: 584 | if weight is None: 585 | fmodu = fvisu + fmodu 586 | else: 587 | weight = weight.view(B,1,1,1).repeat(1, Dvisu, H, W) 588 | fmodu = 
(1-weight)*fvisu + weight*fmodu 589 | return fmodu, attn_lang, attn_score 590 | 591 | class LayerNorm(nn.LayerNorm): 592 | """Subclass torch's LayerNorm to handle fp16.""" 593 | 594 | def forward(self, x: torch.Tensor): 595 | orig_type = x.dtype 596 | ret = super().forward(x.type(torch.float32)) 597 | return ret.type(orig_type) 598 | 599 | class FiLMedConvBlock_multihop(nn.Module): 600 | def __init__(self, NFilm=2, with_residual=True, with_batchnorm=True, 601 | with_cond=[False], dropout=0, num_extra_channels=0, extra_channel_freq=1, 602 | with_input_proj=1, num_cond_maps=8, kernel_size=1, batchnorm_affine=False, 603 | num_layers=1, condition_method='bn-film', debug_every=float('inf'), 604 | textdim=768,visudim=512,emb_size=512,fusion='cat',intmd=False,lstm=False,erasing=0.): 605 | super(FiLMedConvBlock_multihop, self).__init__() 606 | 607 | self.NFilm = NFilm 608 | self.emb_size = emb_size 609 | self.with_residual = with_residual 610 | self.cont_size = emb_size 611 | self.fusion = fusion 612 | self.intmd = intmd 613 | self.lstm = lstm 614 | self.erasing = erasing 615 | if self.fusion=='cat': 616 | self.cont_size = emb_size*2 617 | 618 | self.modulesdict = nn.ModuleDict() 619 | modules = OrderedDict() 620 | modules["film0"] = FiLMedConvBlock_context(textdim=textdim,visudim=emb_size,contextdim=emb_size,emb_size=emb_size,fusion=fusion,lstm=self.lstm) 621 | for n in range(1,NFilm): 622 | modules["conv%d"%n] = ConvBatchNormReLU(emb_size, emb_size, 3, 1, 1, 1) 623 | modules["film%d"%n] = FiLMedConvBlock_context(textdim=textdim,visudim=emb_size,contextdim=self.cont_size,emb_size=emb_size,fusion=fusion,lstm=self.lstm) 624 | self.modulesdict.update(modules) 625 | 626 | def forward(self, fvisu, fword, fcoord,gest = None, weight=None,fsent=None,word_mask=None): 627 | B, Dvisu, H, W = fvisu.size() 628 | B, N, Dlang = fword.size() 629 | intmd_feat, attnscore_list = [], [] 630 | 631 | x, _, attn_score = self.modulesdict["film0"](fvisu, fword, Variable(torch.ones(B,N).cuda()), fcoord,gest, fsent=fsent,word_mask=word_mask) 632 | attnscore_list.append(attn_score.view(B,N,1,1)) 633 | if self.intmd: 634 | intmd_feat.append(x) 635 | if self.NFilm==1: 636 | intmd_feat = [x] 637 | for n in range(1,self.NFilm): 638 | score_list = [mask_softmax(score.squeeze(2).squeeze(2),word_mask,lstm=self.lstm) for score in attnscore_list] 639 | 640 | score = torch.clamp(torch.max(torch.stack(score_list, dim=1), dim=1, keepdim=False)[0],min=0.,max=1.) 
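            ## Added note: `score` above is the clamped element-wise max of the word-attention
            ## distributions produced by all previous hops; the next FiLM block receives
            ## (1 - score), so words that earlier hops already attended to are suppressed
            ## and each hop grounds a different part of the referring expression.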
641 |             x = self.modulesdict["conv%d"%n](x)
642 |             x, _, attn_score = self.modulesdict["film%d"%n](x, fword, (1-score), fcoord, gest, fsent=fsent, word_mask=word_mask)
643 |             attnscore_list.append(attn_score.view(B,N,1,1)) ## format match div loss in main func
644 |             if self.intmd:
645 |                 intmd_feat.append(x)
646 |             elif n==self.NFilm-1:
647 |                 intmd_feat = [x]
648 |         return intmd_feat, attnscore_list
649 | 
650 | class Vector(nn.Sequential):
651 |     ## forward() is fully overridden below, so this class behaves like a plain nn.Module
652 |     def __init__(self, input_resolution=224, patch_size=32, width=768, layers=3, heads=2, output_dim=3, out_indices=[0,1,2], pretrained=None, **kwargs):
653 |         super().__init__()
654 |         self.pretrained = pretrained
655 |         self.input_resolution = input_resolution
656 |         self.output_dim = output_dim
657 |         self.conv1 = nn.Conv2d(in_channels=6, out_channels=width, kernel_size=patch_size, stride=patch_size, bias=False)
658 | 
659 |         scale = width ** -0.5
660 |         self.class_embedding = nn.Parameter(scale * torch.randn(width))
661 |         self.positional_embedding = nn.Parameter(scale * torch.randn((input_resolution // patch_size) ** 2 + 1, width))
662 |         self.spatial_size = input_resolution // patch_size
663 |         self.ln_pre = LayerNorm(width)
664 | 
665 |         self.transformer = Transformer(width, layers, heads)
666 | 
667 |         self.out_indices = out_indices
668 | 
669 |         self.ln_post = LayerNorm(width)
670 |         self.proj = nn.Parameter(scale * torch.randn(width, output_dim))
671 | 
672 |     def forward(self, x: torch.Tensor):
673 |         x = self.conv1(x)  # shape = [*, width, grid, grid]
674 |         B, C, H, W = x.shape
675 | 
676 |         x = x.reshape(x.shape[0], x.shape[1], -1)  # shape = [*, width, grid ** 2]
677 |         x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]
678 |         x = torch.cat([self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1)  # shape = [*, grid ** 2 + 1, width]
679 | 
680 |         pos = self.positional_embedding.to(x.dtype)
681 |         cls_pos = pos[0,:] + self.class_embedding.to(x.dtype)
682 |         spatial_pos = F.interpolate(pos[1:,].reshape(1, self.spatial_size, self.spatial_size, C).permute(0, 3, 1, 2), size=(H, W), mode='bilinear')
683 |         spatial_pos = spatial_pos.reshape(1, C, H*W).permute(0, 2, 1)
684 |         pos = torch.cat([cls_pos.reshape(1, 1, C), spatial_pos], dim=1)
685 |         x = x + pos
686 |         x = self.ln_pre(x)
687 |         x = x.permute(1, 0, 2)  # NLD -> LND
688 |         x = self.transformer(x)
689 |         x = self.ln_post(x[0,:,:]) @ self.proj  # class token -> output_dim
690 |         return x
691 | 
692 | class MLP(nn.Module):
693 |     def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
694 |         super().__init__()
695 |         self.num_layers = num_layers
696 |         h = [hidden_dim] * (num_layers - 1)
697 |         self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
698 | 
699 |     def forward(self, x):
700 |         for i, layer in enumerate(self.layers):
701 |             x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
702 |         return x
703 | 
704 | 
705 | if __name__ == "__main__":
706 |     import torch
707 | 
708 |     vect = Vector()
709 | 
710 |     # Vector.conv1 expects a 6-channel input map; quick shape-only smoke test.
711 |     inp = torch.randn(1, 6, 512, 512)
712 |     output = vect(inp)
713 |     print(output.size())
714 | 
--------------------------------------------------------------------------------
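Usage note (added): the sketch below is a minimal, illustrative smoke test of the two encoders defined in the file above, CLIPTextEncoder and CLIPResNetWithAttention. It is not part of the repository. The import path `model.clip` is inferred from the `__pycache__` entries, the ResNet-50 layer layout `[3, 4, 6, 3]` and the CLIP BPE special-token ids (49406/49407) are assumptions, and no pretrained weights are loaded via `init_weights`, so only output shapes are meaningful.

    # Hypothetical smoke test -- not part of the repository.
    import torch
    from model.clip import CLIPTextEncoder, CLIPResNetWithAttention  # path inferred from __pycache__

    # Text encoder: integer token ids of shape [batch, context_length];
    # the forward pass pools the position of the largest token id (the end-of-text token).
    text_encoder = CLIPTextEncoder(context_length=77)
    tokens = torch.zeros(2, 77, dtype=torch.long)
    tokens[:, 0] = 49406   # assumed CLIP start-of-text id
    tokens[:, 1] = 49407   # assumed CLIP end-of-text id
    with torch.no_grad():
        text_feat = text_encoder(tokens)   # [2, 512] with the default embed_dim

    # Visual encoder: conv1 takes 4 input channels (RGB plus one extra map).
    visual = CLIPResNetWithAttention(layers=[3, 4, 6, 3], input_resolution=224)
    frames = torch.randn(2, 4, 224, 224)
    with torch.no_grad():
        outs = visual(frames)              # (reg map, layer2, layer3, layer4, [global, local])
    print(text_feat.shape, [o.shape if torch.is_tensor(o) else len(o) for o in outs])

The returned tuple follows the `forward` above: a 1-channel `reg` map from layer1, the raw layer2-layer4 feature maps, and the attention-pooled [global, local] pair from `attnpool`.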