├── doc
│   ├── frame.png
│   └── intro.png
├── saved_models
│   └── yolov3_weights.sh
├── model
│   ├── __pycache__
│   │   ├── clip.cpython-39.pyc
│   │   ├── loss.cpython-36.pyc
│   │   ├── loss.cpython-38.pyc
│   │   ├── loss.cpython-39.pyc
│   │   ├── convlstm.cpython-36.pyc
│   │   ├── convlstm.cpython-38.pyc
│   │   ├── convlstm.cpython-39.pyc
│   │   ├── darknet.cpython-36.pyc
│   │   ├── darknet.cpython-38.pyc
│   │   ├── darknet.cpython-39.pyc
│   │   ├── modulation.cpython-36.pyc
│   │   ├── modulation.cpython-38.pyc
│   │   ├── modulation.cpython-39.pyc
│   │   ├── grounding_model.cpython-36.pyc
│   │   ├── grounding_model.cpython-38.pyc
│   │   ├── grounding_model.cpython-39.pyc
│   │   └── grounding_modelbest.cpython-38.pyc
│   ├── convlstm.py
│   ├── loss.py
│   ├── yolov3.cfg
│   ├── grounding_model.py
│   ├── darknet.py
│   └── modulation.py
├── utils
│   ├── __pycache__
│   │   ├── utils.cpython-36.pyc
│   │   ├── utils.cpython-38.pyc
│   │   ├── utils.cpython-39.pyc
│   │   ├── __init__.cpython-36.pyc
│   │   ├── __init__.cpython-38.pyc
│   │   ├── __init__.cpython-39.pyc
│   │   ├── checkpoint.cpython-36.pyc
│   │   ├── checkpoint.cpython-38.pyc
│   │   ├── checkpoint.cpython-39.pyc
│   │   ├── transforms.cpython-36.pyc
│   │   ├── transforms.cpython-38.pyc
│   │   ├── transforms.cpython-39.pyc
│   │   ├── word_utils.cpython-36.pyc
│   │   ├── word_utils.cpython-38.pyc
│   │   ├── word_utils.cpython-39.pyc
│   │   ├── parsing_metrics.cpython-36.pyc
│   │   ├── parsing_metrics.cpython-38.pyc
│   │   └── parsing_metrics.cpython-39.pyc
│   ├── __init__.py
│   ├── losses.py
│   ├── misc_utils.py
│   ├── checkpoint.py
│   ├── word_utils.py
│   ├── utils.py
│   ├── parsing_metrics.py
│   ├── transforms.py
│   ├── transformsv2.py
│   └── temp.py
├── dataset
│   ├── __pycache__
│   │   ├── data_loader.cpython-36.pyc
│   │   ├── data_loader.cpython-38.pyc
│   │   └── data_loader.cpython-39.pyc
│   ├── data_loaderv2.py
│   └── data_loader.py
├── ln_data
│   └── README.md
├── README.md
└── evaluation_results.py
/doc/frame.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/doc/frame.png
--------------------------------------------------------------------------------
/doc/intro.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/doc/intro.png
--------------------------------------------------------------------------------
/saved_models/yolov3_weights.sh:
--------------------------------------------------------------------------------
1 | wget -P saved_models https://pjreddie.com/media/files/yolov3.weights
--------------------------------------------------------------------------------
/model/__pycache__/clip.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/model/__pycache__/clip.cpython-39.pyc
--------------------------------------------------------------------------------
/model/__pycache__/loss.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/model/__pycache__/loss.cpython-36.pyc
--------------------------------------------------------------------------------
/model/__pycache__/loss.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/model/__pycache__/loss.cpython-38.pyc
--------------------------------------------------------------------------------
/model/__pycache__/loss.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/model/__pycache__/loss.cpython-39.pyc
--------------------------------------------------------------------------------
/utils/__pycache__/utils.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/utils/__pycache__/utils.cpython-36.pyc
--------------------------------------------------------------------------------
/utils/__pycache__/utils.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/utils/__pycache__/utils.cpython-38.pyc
--------------------------------------------------------------------------------
/utils/__pycache__/utils.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/utils/__pycache__/utils.cpython-39.pyc
--------------------------------------------------------------------------------
/model/__pycache__/convlstm.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/model/__pycache__/convlstm.cpython-36.pyc
--------------------------------------------------------------------------------
/model/__pycache__/convlstm.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/model/__pycache__/convlstm.cpython-38.pyc
--------------------------------------------------------------------------------
/model/__pycache__/convlstm.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/model/__pycache__/convlstm.cpython-39.pyc
--------------------------------------------------------------------------------
/model/__pycache__/darknet.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/model/__pycache__/darknet.cpython-36.pyc
--------------------------------------------------------------------------------
/model/__pycache__/darknet.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/model/__pycache__/darknet.cpython-38.pyc
--------------------------------------------------------------------------------
/model/__pycache__/darknet.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/model/__pycache__/darknet.cpython-39.pyc
--------------------------------------------------------------------------------
/utils/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/utils/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/utils/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/utils/__pycache__/__init__.cpython-38.pyc
--------------------------------------------------------------------------------
/utils/__pycache__/__init__.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/utils/__pycache__/__init__.cpython-39.pyc
--------------------------------------------------------------------------------
/model/__pycache__/modulation.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/model/__pycache__/modulation.cpython-36.pyc
--------------------------------------------------------------------------------
/model/__pycache__/modulation.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/model/__pycache__/modulation.cpython-38.pyc
--------------------------------------------------------------------------------
/model/__pycache__/modulation.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/model/__pycache__/modulation.cpython-39.pyc
--------------------------------------------------------------------------------
/utils/__pycache__/checkpoint.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/utils/__pycache__/checkpoint.cpython-36.pyc
--------------------------------------------------------------------------------
/utils/__pycache__/checkpoint.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/utils/__pycache__/checkpoint.cpython-38.pyc
--------------------------------------------------------------------------------
/utils/__pycache__/checkpoint.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/utils/__pycache__/checkpoint.cpython-39.pyc
--------------------------------------------------------------------------------
/utils/__pycache__/transforms.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/utils/__pycache__/transforms.cpython-36.pyc
--------------------------------------------------------------------------------
/utils/__pycache__/transforms.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/utils/__pycache__/transforms.cpython-38.pyc
--------------------------------------------------------------------------------
/utils/__pycache__/transforms.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/utils/__pycache__/transforms.cpython-39.pyc
--------------------------------------------------------------------------------
/utils/__pycache__/word_utils.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/utils/__pycache__/word_utils.cpython-36.pyc
--------------------------------------------------------------------------------
/utils/__pycache__/word_utils.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/utils/__pycache__/word_utils.cpython-38.pyc
--------------------------------------------------------------------------------
/utils/__pycache__/word_utils.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/utils/__pycache__/word_utils.cpython-39.pyc
--------------------------------------------------------------------------------
/dataset/__pycache__/data_loader.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/dataset/__pycache__/data_loader.cpython-36.pyc
--------------------------------------------------------------------------------
/dataset/__pycache__/data_loader.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/dataset/__pycache__/data_loader.cpython-38.pyc
--------------------------------------------------------------------------------
/dataset/__pycache__/data_loader.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/dataset/__pycache__/data_loader.cpython-39.pyc
--------------------------------------------------------------------------------
/model/__pycache__/grounding_model.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/model/__pycache__/grounding_model.cpython-36.pyc
--------------------------------------------------------------------------------
/model/__pycache__/grounding_model.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/model/__pycache__/grounding_model.cpython-38.pyc
--------------------------------------------------------------------------------
/model/__pycache__/grounding_model.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/model/__pycache__/grounding_model.cpython-39.pyc
--------------------------------------------------------------------------------
/utils/__pycache__/parsing_metrics.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/utils/__pycache__/parsing_metrics.cpython-36.pyc
--------------------------------------------------------------------------------
/utils/__pycache__/parsing_metrics.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/utils/__pycache__/parsing_metrics.cpython-38.pyc
--------------------------------------------------------------------------------
/utils/__pycache__/parsing_metrics.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/utils/__pycache__/parsing_metrics.cpython-39.pyc
--------------------------------------------------------------------------------
/ln_data/README.md:
--------------------------------------------------------------------------------
1 | # Dataset
2 | Download the YouRefIt dataset from the [Dataset Request Page](https://yixchen.github.io/YouRefIt/request.html) and put it here.
--------------------------------------------------------------------------------
/model/__pycache__/grounding_modelbest.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/model/__pycache__/grounding_modelbest.cpython-38.pyc
--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | # -----------------------------------------------------------------------------
3 | # Copyright (c) Edgar Andrés Margffoy-Tuay, Emilio Botero and Juan Camilo Pérez
4 | #
5 | # Licensed under the terms of the MIT License
6 | # (see LICENSE for details)
7 | # -----------------------------------------------------------------------------
8 |
9 | """Misc data and other helping utillites."""
10 |
11 | from .word_utils import Corpus
12 | from .transforms import ResizeImage, ResizeAnnotation
13 |
14 | Corpus
15 | ResizeImage
16 | ResizeAnnotation
17 |
18 |
19 | class AverageMeter(object):
20 | """Computes and stores the average and current value"""
21 |
22 | def __init__(self):
23 | self.reset()
24 |
25 | def reset(self):
26 | self.val = 0
27 | self.avg = 0
28 | self.sum = 0
29 | self.count = 0
30 |
31 | def update(self, val, n=1):
32 | self.val = val
33 | self.sum += val * n
34 | self.count += n
35 | self.avg = self.sum / self.count
36 |
--------------------------------------------------------------------------------
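
A minimal sketch of how the `AverageMeter` defined above is typically driven to keep a running, sample-weighted average, assuming the repository root is on `PYTHONPATH` and the package's own imports (`torch`, `cv2`) are available; the per-batch losses are made-up numbers for illustration:

```python
from utils import AverageMeter

losses = AverageMeter()
for step, batch_loss in enumerate([0.9, 0.7, 0.4], start=1):  # made-up per-batch losses
    losses.update(batch_loss, n=32)  # n is the batch size, so avg is weighted by sample count
    print(f"step {step}: val={losses.val:.2f} avg={losses.avg:.2f}")
```
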
/utils/losses.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | Custom loss function definitions.
5 | """
6 |
7 | import torch.nn as nn
8 | import torch.nn.functional as F
9 |
10 |
11 | class IoULoss(nn.Module):
12 | """
13 | Creates a criterion that computes the Intersection over Union (IoU)
14 | between a segmentation mask and its ground truth.
15 |
16 | Rahman, M.A. and Wang, Y:
17 | Optimizing Intersection-Over-Union in Deep Neural Networks for
18 | Image Segmentation. International Symposium on Visual Computing (2016)
19 | http://www.cs.umanitoba.ca/~ywang/papers/isvc16.pdf
20 | """
21 |
22 | def __init__(self, size_average=True):
23 | super().__init__()
24 | self.size_average = size_average
25 |
26 | def forward(self, input, target):
27 | input = F.sigmoid(input)
28 | intersection = (input * target).sum()
29 | union = ((input + target) - (input * target)).sum()
30 | iou = intersection / union
31 | iou_dual = input.size(0) - iou
32 | if self.size_average:
33 | iou_dual = iou_dual / input.size(0)
34 | return iou_dual
35 |
--------------------------------------------------------------------------------
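
`IoULoss` applies a sigmoid to its input internally, computes a soft IoU over the whole batch, and returns `batch_size - IoU` (divided by the batch size when `size_average=True`). A small self-contained sketch with random tensors, just to show the expected shapes; the values are illustrative:

```python
import torch
from utils.losses import IoULoss

criterion = IoULoss(size_average=True)
pred_logits = torch.randn(2, 1, 32, 32, requires_grad=True)   # raw mask logits (pre-sigmoid)
target = (torch.rand(2, 1, 32, 32) > 0.5).float()             # binary ground-truth masks

loss = criterion(pred_logits, target)  # scalar: (batch_size - soft IoU) / batch_size
loss.backward()
print(loss.item())
```
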
/utils/misc_utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | Misc download and visualization helper functions and class wrappers.
5 | """
6 |
7 | import sys
8 | import time
9 | import torch
10 | from visdom import Visdom
11 |
12 |
13 | def reporthook(count, block_size, total_size):
14 | global start_time
15 | if count == 0:
16 | start_time = time.time()
17 | return
18 | duration = time.time() - start_time
19 | progress_size = int(count * block_size)
20 | speed = int(progress_size / (1024 * duration))
21 | percent = min(int(count * block_size * 100 / total_size), 100)
22 | sys.stdout.write("\r...%d%%, %d MB, %d KB/s, %d seconds passed" %
23 | (percent, progress_size / (1024 * 1024), speed, duration))
24 | sys.stdout.flush()
25 |
26 |
27 | class VisdomWrapper(Visdom):
28 | def __init__(self, *args, env=None, **kwargs):
29 | Visdom.__init__(self, *args, **kwargs)
30 | self.env = env
31 | self.plots = {}
32 |
33 | def init_line_plot(self, name,
34 | X=torch.zeros((1,)).cpu(),
35 | Y=torch.zeros((1,)).cpu(), **opts):
36 | self.plots[name] = self.line(X=X, Y=Y, env=self.env, opts=opts)
37 |
38 | def plot_line(self, name, **kwargs):
39 | self.line(win=self.plots[name], env=self.env, **kwargs)
40 |
--------------------------------------------------------------------------------
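
`reporthook` matches the callback signature of `urllib.request.urlretrieve`, so it can be plugged in directly to print a progress line while downloading, for example the YOLOv3 weights referenced in `saved_models/yolov3_weights.sh`. This sketch assumes `visdom` is installed, since the module imports it at the top:

```python
import urllib.request
from utils.misc_utils import reporthook

url = 'https://pjreddie.com/media/files/yolov3.weights'
urllib.request.urlretrieve(url, 'saved_models/yolov3.weights', reporthook)
print()  # terminate the carriage-return progress line
```
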
/utils/checkpoint.py:
--------------------------------------------------------------------------------
1 | import os
2 | import shutil
3 | import numpy as np
4 | import torch
5 | import torch.nn.functional as F
6 | from collections import OrderedDict
7 |
8 | def save_checkpoint(state, is_best, args, filename='default'):
9 | if filename=='default':
10 | filename = 'filmconv_nofpn32_%s_batch%d'%(args.dataset,args.batch_size)
11 |
12 | checkpoint_name = './saved_models/%s_checkpoint.pth.tar'%(filename)
13 | best_name = './saved_models/%s_model_best.pth.tar'%(filename)
14 | torch.save(state, checkpoint_name)
15 | if is_best:
16 | shutil.copyfile(checkpoint_name, best_name)
17 |
18 | def load_pretrain(model, args, logging):
19 | if os.path.isfile(args.pretrain):
20 | checkpoint = torch.load(args.pretrain)
21 | #print(checkpoint.items())
22 | pretrained_dict = checkpoint['state_dict']
23 | #print(pretrained_dict)
24 |
25 | # new_state_dict = OrderedDict()
26 |         # for k, v in pretrained_dict.items():  # k is e.g. "module.xxx.weight", v is the weight tensor
27 |         #     name = k[7:]  # strip the leading "module." and keep "xxx.weight"
28 | # new_state_dict[name] = v
29 |
30 | model_dict = model.state_dict()
31 | pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict}
32 | #pretrained_dict = {k: v for k, v in new_state_dict.items() if k in model_dict}
33 |
34 |
35 | assert (len([k for k, v in pretrained_dict.items()])!=0)
36 | model_dict.update(pretrained_dict)
37 |
38 | model.load_state_dict(model_dict)
39 | #model.load_state_dict(new_state_dict)
40 | print("=> loaded pretrain model at {}"
41 | .format(args.pretrain))
42 | logging.info("=> loaded pretrain model at {}"
43 | .format(args.pretrain))
44 | del checkpoint # dereference seems crucial
45 | torch.cuda.empty_cache()
46 | else:
47 | print(("=> no pretrained file found at '{}'".format(args.pretrain)))
48 | logging.info("=> no pretrained file found at '{}'".format(args.pretrain))
49 | return model
50 |
51 | def load_resume(model, args, logging):
52 | if os.path.isfile(args.resume):
53 | print(("=> loading checkpoint '{}'".format(args.resume)))
54 | logging.info("=> loading checkpoint '{}'".format(args.resume))
55 | checkpoint = torch.load(args.resume)
56 | args.start_epoch = checkpoint['epoch']
57 | best_loss = checkpoint['best_loss']
58 | model.load_state_dict(checkpoint['state_dict'])
59 | print(("=> loaded checkpoint (epoch {}) Loss{}"
60 | .format(checkpoint['epoch'], best_loss)))
61 | logging.info("=> loaded checkpoint (epoch {}) Loss{}"
62 | .format(checkpoint['epoch'], best_loss))
63 | del checkpoint # dereference seems crucial
64 | torch.cuda.empty_cache()
65 | else:
66 | print(("=> no checkpoint found at '{}'".format(args.resume)))
67 | logging.info(("=> no checkpoint found at '{}'".format(args.resume)))
68 | return model
--------------------------------------------------------------------------------
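
`save_checkpoint`, `load_pretrain` and `load_resume` take their paths and names from an `args` namespace (`dataset`, `batch_size`, `pretrain`, `resume`, `start_epoch`). A hedged sketch of how a training script might call them; the `Namespace` and the tiny `nn.Linear` below are stand-ins for the real argument parser and grounding model, and the pretrain path is hypothetical:

```python
import logging
from argparse import Namespace

import torch.nn as nn
from utils.checkpoint import save_checkpoint, load_pretrain

args = Namespace(dataset='yourefit', batch_size=8,
                 pretrain='saved_models/pretrain_example.pth.tar',  # hypothetical path
                 resume='', start_epoch=0)
model = nn.Linear(4, 4)  # stand-in for the real grounding model

# loads only the keys that match the current model; prints a message and is a no-op if the file is missing
model = load_pretrain(model, args, logging)

# after validation, write ./saved_models/<name>_checkpoint.pth.tar and, if best, copy to *_model_best.pth.tar
state = {'epoch': 1, 'state_dict': model.state_dict(), 'best_loss': 1.23}
save_checkpoint(state, is_best=True, args=args)
```
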
/README.md:
--------------------------------------------------------------------------------
1 | # Spatial and Visual Perspective-Taking via View Rotation and Relation Reasoning for Embodied Reference Understanding
2 |
3 | by [Cheng Shi](https://github.com/ChengShiest/) and [Sibei Yang](https://sibeiyang.github.io/)
4 |
5 | European Conference on Computer Vision (ECCV), 2022
6 |
7 | ## Introduction
8 |
9 | Embodied Reference Understanding studies reference understanding in an embodied fashion, where a receiver is required to locate a target object referred to by both the language and the gesture of a sender in a shared physical environment. Its main challenge lies in giving the receiver, who only has an egocentric view, access to spatial and visual information relative to the sender, so that it can judge how objects are oriented around and seen from the sender, i.e., spatial and visual perspective-taking. In this paper, we propose a REasoning from your Perspective (REP) method to tackle this challenge by modeling the relations between the receiver and the sender, as well as between the sender and the objects, via the proposed novel view rotation and relation reasoning. Specifically, view rotation first rotates the receiver to the position of the sender by constructing an embodied 3D coordinate system with the position of the sender as the origin. It then changes the orientation of the receiver to the orientation of the sender by encoding the body orientation and gesture of the sender. Relation reasoning models both the nonverbal and verbal relations between the sender and the objects by multi-modal cooperative reasoning over gesture, language, visual content, and spatial position.
10 |
11 |
12 | ![intro](doc/intro.png)
13 |
14 |
15 |
16 | ## Framework
17 |
18 |
19 | ![framework](doc/frame.png)
20 |
21 |
22 | ## Dataset
23 | Download the YouRefIt dataset from the [Dataset Request Page](https://yixchen.github.io/YouRefIt/request.html) and put it under ```./ln_data```.
24 |
25 | ## Model weights
26 | * [Yolov3](https://pjreddie.com/media/files/yolov3.weights): download the pretrained weights and place the file in ``./saved_models`` by running
27 | ```
28 | sh saved_models/yolov3_weights.sh
29 | ```
30 |
31 | Make sure to put the files in the following structure:
32 |
33 | ```
34 | |-- ROOT
35 | | |-- ln_data
36 | | |-- yourefit
37 | | |-- images
38 | | |-- paf
39 | | |-- saliency
40 | ```
41 |
42 | ## Training and Evaluation
43 | The training and evaluation scripts are the same as in [YouRefIt](https://github.com/yixchen/YouRefIt_ERU).
44 |
45 | ## Checklist
46 |
47 | + [x] code
48 | + [ ] pre-process data
49 |
50 | ### Citation
51 |
52 |     @inproceedings{shi2022spatial,
53 |       title={Spatial and Visual Perspective-Taking via View Rotation and Relation Reasoning for Embodied Reference Understanding},
54 |       author={Shi, Cheng and Yang, Sibei},
55 |       booktitle={Computer Vision--ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23--27, 2022, Proceedings, Part XXXVI},
56 |       pages={201--218},
57 |       year={2022},
58 |       organization={Springer}
59 |     }
60 |
61 | ### Acknowledgement
62 | Our code is built on [ReSC](https://github.com/zyang-ur/ReSC) and [YouRefIt](https://github.com/yixchen/YouRefIt_ERU); we thank the authors for their hard work.
63 |
64 |
65 |
--------------------------------------------------------------------------------
/utils/word_utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | Language-related data loading helper functions and class wrappers.
5 | """
6 |
7 | import re
8 | import torch
9 | import codecs
10 |
11 | UNK_TOKEN = '<unk>'
12 | PAD_TOKEN = '<pad>'
13 | END_TOKEN = '<eos>'
14 | SENTENCE_SPLIT_REGEX = re.compile(r'(\W+)')
15 |
16 |
17 | class Dictionary(object):
18 | def __init__(self):
19 | self.word2idx = {}
20 | self.idx2word = []
21 |
22 | def add_word(self, word):
23 | if word not in self.word2idx:
24 | self.idx2word.append(word)
25 | self.word2idx[word] = len(self.idx2word) - 1
26 | return self.word2idx[word]
27 |
28 | def __len__(self):
29 | return len(self.idx2word)
30 |
31 | def __getitem__(self, a):
32 | if isinstance(a, int):
33 | return self.idx2word[a]
34 | elif isinstance(a, list):
35 | return [self.idx2word[x] for x in a]
36 | elif isinstance(a, str):
37 | return self.word2idx[a]
38 | else:
39 |             raise TypeError("Query word/index argument must be int, list, or str")
40 |
41 | def __contains__(self, word):
42 | return word in self.word2idx
43 |
44 |
45 | class Corpus(object):
46 | def __init__(self):
47 | self.dictionary = Dictionary()
48 |
49 | def set_max_len(self, value):
50 | self.max_len = value
51 |
52 | def load_file(self, filename):
53 | with codecs.open(filename, 'r', 'utf-8') as f:
54 | for line in f:
55 | line = line.strip()
56 | self.add_to_corpus(line)
57 | self.dictionary.add_word(UNK_TOKEN)
58 | self.dictionary.add_word(PAD_TOKEN)
59 |
60 | def add_to_corpus(self, line):
61 | """Tokenizes a text line."""
62 | # Add words to the dictionary
63 | words = line.split()
64 | # tokens = len(words)
65 | for word in words:
66 | word = word.lower()
67 | self.dictionary.add_word(word)
68 |
69 | def tokenize(self, line, max_len=20):
70 | # Tokenize line contents
71 | words = SENTENCE_SPLIT_REGEX.split(line.strip())
72 | # words = [w.lower() for w in words if len(w) > 0]
73 | words = [w.lower() for w in words if (len(w) > 0 and w!=' ')] ## do not include space as a token
74 |
75 | if words[-1] == '.':
76 | words = words[:-1]
77 |
78 | if max_len > 0:
79 | if len(words) > max_len:
80 | words = words[:max_len]
81 | elif len(words) < max_len:
82 | # words = [PAD_TOKEN] * (max_len - len(words)) + words
83 | words = words + [END_TOKEN] + [PAD_TOKEN] * (max_len - len(words) - 1)
84 |
85 | tokens = len(words) ## for end token
86 | ids = torch.LongTensor(tokens)
87 | token = 0
88 | for word in words:
89 | if word not in self.dictionary:
90 | word = UNK_TOKEN
91 | # print(word, type(word), word.encode('ascii','ignore').decode('ascii'), type(word.encode('ascii','ignore').decode('ascii')))
92 | if type(word)!=type('a'):
93 | print(word, type(word), word.encode('ascii','ignore').decode('ascii'), type(word.encode('ascii','ignore').decode('ascii')))
94 | word = word.encode('ascii','ignore').decode('ascii')
95 | ids[token] = self.dictionary[word]
96 | token += 1
97 | # ids[token] = self.dictionary[END_TOKEN]
98 | return ids
99 |
100 | def __len__(self):
101 | return len(self.dictionary)
102 |
--------------------------------------------------------------------------------
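
`Corpus` builds its vocabulary from a plain-text file (one expression per line) via `load_file`, then turns a sentence into a fixed-length `LongTensor` with `tokenize`. A small sketch that uses a temporary file as a stand-in for the vocabulary file shipped with the dataset:

```python
import tempfile
from utils.word_utils import Corpus

# tiny stand-in vocabulary; the real corpus file comes with the dataset
with tempfile.NamedTemporaryFile('w', suffix='.txt', delete=False) as f:
    f.write("pick up the red cup on the table\n")
    f.write("the chair next to the window\n")
    vocab_path = f.name

corpus = Corpus()
corpus.load_file(vocab_path)

ids = corpus.tokenize("the red cup", max_len=20)  # padded with the end/pad tokens to length 20
print(len(corpus), ids.shape)                     # vocabulary size, torch.Size([20])
```
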
/evaluation_results.py:
--------------------------------------------------------------------------------
1 | import os
2 | import numpy as np
3 | import cv2
4 | import pickle5 as pickle
5 | import torch
6 | import json
7 | def bbox_iou(box1, box2, x1y1x2y2=True):
8 | """
9 | Returns the IoU of two bounding boxes
10 | """
11 | box1 = torch.tensor(box1)
12 | box2 = torch.tensor(box2)
13 | if x1y1x2y2:
14 | # Get the coordinates of bounding boxes
15 | b1_x1, b1_y1, b1_x2, b1_y2 = box1[ 0], box1[ 1], box1[ 2], box1[ 3]
16 | b2_x1, b2_y1, b2_x2, b2_y2 = box2[ 0], box2[ 1], box2[ 2], box2[ 3]
17 | else:
18 | # Transform from center and width to exact coordinates
19 | b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
20 | b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
21 | b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
22 | b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2
23 |
24 | # get the coordinates of the intersection rectangle
25 | inter_rect_x1 = torch.max(b1_x1, b2_x1)
26 | inter_rect_y1 = torch.max(b1_y1, b2_y1)
27 | inter_rect_x2 = torch.min(b1_x2, b2_x2)
28 | inter_rect_y2 = torch.min(b1_y2, b2_y2)
29 | # Intersection area
30 | inter_area = torch.clamp(inter_rect_x2 - inter_rect_x1, 0) * torch.clamp(inter_rect_y2 - inter_rect_y1, 0)
31 | # Union Area
32 | b1_area = (b1_x2 - b1_x1) * (b1_y2 - b1_y1)
33 | b2_area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1)
34 |
35 | # print(box1, box1.shape)
36 | # print(box2, box2.shape)
37 | return inter_area / (b1_area + b2_area - inter_area + 1e-16)
38 |
39 | # Given 2 bounding boxes, return their IoU
40 | def bb_IoU(bb1,bb2):
41 |
42 | Area1 = abs(bb1[2] - bb1[0]) * abs(bb1[3]-bb1[1])
43 | Area2 = abs(bb2[2] - bb2[0]) * abs(bb2[3]-bb2[1])
44 |
45 | xA = max(bb1[0],bb2[0])
46 | yA = max(bb1[1],bb2[1])
47 | xB = min(bb1[2],bb2[2])
48 | yB = min(bb1[3],bb2[3])
49 |
50 | intersection = max(0, xB - xA) * max(0, yB - yA)
51 | IoU = intersection / (Area1 + Area2 - intersection + 1e-16)
52 |
53 | return(IoU)
54 |
55 | def Area(bb1, image):
56 | area1 = abs(bb1[2] - bb1[0]) * abs(bb1[3]-bb1[1])
57 | return area1/image
58 |
59 | def evaluation(image_path, gt_path, predict_path):
60 | yolopred = dict()
61 |
62 | with open("ln_data/yourefit/test_id.txt", "r") as f:
63 | test_id_list = f.readlines()
64 | test_id_list = [x.strip('\n') for x in test_id_list]
65 | print(test_id_list)
66 |
67 | with open("ln_data/yourefit/train_id.txt", "r") as f:
68 | train_id_list = f.readlines()
69 | train_id_list = [x.strip('\n') for x in train_id_list]
70 |
71 |
72 |
73 | TP= dict()
74 | TP['all'] = np.zeros((3,))
75 | TP['s'] = np.zeros((3,))
76 | TP['m'] = np.zeros((3,))
77 | TP['l'] = np.zeros((3,))
78 |
79 | FP= dict()
80 | FP['all'] = np.zeros((3,))
81 | FP['s'] = np.zeros((3,))
82 | FP['m'] = np.zeros((3,))
83 | FP['l'] = np.zeros((3,))
84 | gt_boxes = []
85 | for ind, pattern in enumerate(test_id_list):
86 | img = cv2.imread(os.path.join(image_path, pattern+'.jpg'))
87 | H,W,_ = img.shape
88 | pickle_name = os.path.join(gt_path, pattern+'.p')
89 | gt = pickle.load(open( pickle_name, "rb" ))
90 | ground_truth_box = gt['bbox']
91 | gt_boxes.append(ground_truth_box)
92 | # read prediction file (Need to change based on input)
93 | pred_pickle = os.path.join(predict_path, pattern+'.jpg.p')
94 | pred = pickle.load(open(pred_pickle, "rb" ))
95 | predicted_box = pred[0]
96 | #
97 | yolopred[test_id_list[ind]] = predicted_box
98 | for ind, IoU in enumerate([0.25, 0.5, 0.75] ):
99 | if bbox_iou(predicted_box,ground_truth_box) >= IoU:
100 | TP['all'][ind] +=1
101 | if 100*Area(ground_truth_box, H*W) < 0.48:
102 | TP['s'][ind] += 1
103 | else:
104 | if 100*Area(ground_truth_box, H*W) < 1.75:
105 | TP['m'][ind] += 1
106 | else:
107 | TP['l'][ind] += 1
108 | else:
109 | FP['all'][ind] +=1
110 | if 100*Area(ground_truth_box, H*W) < 0.48:
111 | FP['s'][ind] += 1
112 | else:
113 | if 100*Area(ground_truth_box, H*W) < 1.75:
114 | FP['m'][ind] += 1
115 | else:
116 | FP['l'][ind] += 1
117 |
118 | for ind, IoU in enumerate([0.25, 0.5, 0.75]):
119 | print('Accuracy =',TP['all'][ind]/(TP['all'][ind]+FP['all'][ind]))
120 | print('Small Accuracy =',TP['s'][ind]/(TP['s'][ind]+FP['s'][ind]), 'in', TP['s'][ind]+FP['s'][ind], 'samples')
121 | print('Medium Accuracy =',TP['m'][ind]/(TP['m'][ind]+FP['m'][ind]), 'in', TP['m'][ind]+FP['m'][ind], 'samples')
122 | print('Large Accuracy =',TP['l'][ind]/(TP['l'][ind]+FP['l'][ind]), 'in', TP['l'][ind]+FP['l'][ind], 'samples')
123 |
124 | if __name__ == "__main__":
125 |
126 | image_path= 'ln_data/yourefit/images'
127 | gt_path= 'ln_data/yourefit/pickle'
128 | predict_path = 'test/test_final'
129 | evaluation(image_path, gt_path, predict_path)
130 |
--------------------------------------------------------------------------------
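
For reference, a worked example of the IoU and relative-area computations used by `evaluation` above, with hand-picked boxes in `[x1, y1, x2, y2]` format (the numbers are illustrative, and the module's own imports `cv2`, `torch` and `pickle5` need to be installed for the import to succeed):

```python
from evaluation_results import bb_IoU, Area

pred_box = [50, 50, 150, 150]    # 100x100 box
gt_box = [100, 100, 200, 200]    # 100x100 box overlapping the prediction by 50x50

iou = bb_IoU(pred_box, gt_box)   # 2500 / (10000 + 10000 - 2500) ~= 0.143, below the 0.25 threshold
print(round(iou, 4))

# size buckets used in evaluation(): "small" if the GT box covers < 0.48% of the image,
# "medium" if < 1.75%, and "large" otherwise
rel_area = 100 * Area(gt_box, 1080 * 1920)  # GT area as a percentage of a 1920x1080 frame
print(round(rel_area, 3))                   # ~0.482 -> lands in the "medium" bucket
```
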
/utils/utils.py:
--------------------------------------------------------------------------------
1 | import random
2 |
3 | import cv2
4 | import numpy as np
5 | import torch
6 | import torch.nn.functional as F
7 |
8 | class AverageMeter(object):
9 | """Computes and stores the average and current value"""
10 | def __init__(self):
11 | self.reset()
12 |
13 | def reset(self):
14 | self.val = 0
15 | self.avg = 0
16 | self.sum = 0
17 | self.count = 0
18 |
19 | def update(self, val, n=1):
20 | self.val = val
21 | self.sum += val * n
22 | self.count += n
23 | self.avg = self.sum / self.count
24 |
25 | def xyxy2xywh(x): # Convert bounding box format from [x1, y1, x2, y2] to [x, y, w, h]
26 | y = torch.zeros(x.shape) if x.dtype is torch.float32 else np.zeros(x.shape)
27 | y[:, 0] = (x[:, 0] + x[:, 2]) / 2
28 | y[:, 1] = (x[:, 1] + x[:, 3]) / 2
29 | y[:, 2] = x[:, 2] - x[:, 0]
30 | y[:, 3] = x[:, 3] - x[:, 1]
31 | return y
32 |
33 |
34 | def xywh2xyxy(x): # Convert bounding box format from [x, y, w, h] to [x1, y1, x2, y2]
35 | y = torch.zeros(x.shape) if x.dtype is torch.float32 else np.zeros(x.shape)
36 | y[:, 0] = (x[:, 0] - x[:, 2] / 2)
37 | y[:, 1] = (x[:, 1] - x[:, 3] / 2)
38 | y[:, 2] = (x[:, 0] + x[:, 2] / 2)
39 | y[:, 3] = (x[:, 1] + x[:, 3] / 2)
40 | return y
41 |
42 | def bbox_iou_numpy(box1, box2):
43 | """Computes IoU between bounding boxes.
44 | Parameters
45 | ----------
46 | box1 : ndarray
47 | (N, 4) shaped array with bboxes
48 | box2 : ndarray
49 | (M, 4) shaped array with bboxes
50 | Returns
51 | -------
52 | : ndarray
53 | (N, M) shaped array with IoUs
54 | """
55 | area = (box2[:, 2] - box2[:, 0]) * (box2[:, 3] - box2[:, 1])
56 |
57 | iw = np.minimum(np.expand_dims(box1[:, 2], axis=1), box2[:, 2]) - np.maximum(
58 | np.expand_dims(box1[:, 0], 1), box2[:, 0]
59 | )
60 | ih = np.minimum(np.expand_dims(box1[:, 3], axis=1), box2[:, 3]) - np.maximum(
61 | np.expand_dims(box1[:, 1], 1), box2[:, 1]
62 | )
63 |
64 | iw = np.maximum(iw, 0)
65 | ih = np.maximum(ih, 0)
66 |
67 | ua = np.expand_dims((box1[:, 2] - box1[:, 0]) * (box1[:, 3] - box1[:, 1]), axis=1) + area - iw * ih
68 |
69 | ua = np.maximum(ua, np.finfo(float).eps)
70 |
71 | intersection = iw * ih
72 |
73 | return intersection / ua
74 |
75 |
76 | def bbox_iou(box1, box2, x1y1x2y2=True):
77 | """
78 | Returns the IoU of two bounding boxes
79 | """
80 | if x1y1x2y2:
81 | # Get the coordinates of bounding boxes
82 | b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3]
83 | b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3]
84 | else:
85 | # Transform from center and width to exact coordinates
86 | b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2
87 | b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2
88 | b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2
89 | b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2
90 |
91 | # get the coordinates of the intersection rectangle
92 | inter_rect_x1 = torch.max(b1_x1, b2_x1)
93 | inter_rect_y1 = torch.max(b1_y1, b2_y1)
94 | inter_rect_x2 = torch.min(b1_x2, b2_x2)
95 | inter_rect_y2 = torch.min(b1_y2, b2_y2)
96 | # Intersection area
97 | inter_area = torch.clamp(inter_rect_x2 - inter_rect_x1, 0) * torch.clamp(inter_rect_y2 - inter_rect_y1, 0)
98 | # Union Area
99 | b1_area = (b1_x2 - b1_x1) * (b1_y2 - b1_y1)
100 | b2_area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1)
101 |
102 | # print(box1, box1.shape)
103 | # print(box2, box2.shape)
104 | return inter_area / (b1_area + b2_area - inter_area + 1e-16)
105 |
106 | def multiclass_metrics(pred, gt):
107 | """
108 | check precision and recall for predictions.
109 | Output: overall = {precision, recall, f1}
110 | """
111 | eps=1e-6
112 | overall = {'precision': -1, 'recall': -1, 'f1': -1}
113 | NP, NR, NC = 0, 0, 0 # num of pred, num of recall, num of correct
114 | for ii in range(pred.shape[0]):
115 | pred_ind = np.array(pred[ii]>0.5, dtype=int)
116 | gt_ind = np.array(gt[ii]>0.5, dtype=int)
117 | inter = pred_ind * gt_ind
118 | # add to overall
119 | NC += np.sum(inter)
120 | NP += np.sum(pred_ind)
121 | NR += np.sum(gt_ind)
122 | if NP > 0:
123 | overall['precision'] = float(NC)/NP
124 | if NR > 0:
125 | overall['recall'] = float(NC)/NR
126 | if NP > 0 and NR > 0:
127 | overall['f1'] = 2*overall['precision']*overall['recall']/(overall['precision']+overall['recall']+eps)
128 | return overall
129 |
130 | def compute_ap(recall, precision):
131 | """ Compute the average precision, given the recall and precision curves.
132 | Code originally from https://github.com/rbgirshick/py-faster-rcnn.
133 | # Arguments
134 | recall: The recall curve (list).
135 | precision: The precision curve (list).
136 | # Returns
137 | The average precision as computed in py-faster-rcnn.
138 | """
139 | # correct AP calculation
140 | # first append sentinel values at the end
141 | mrec = np.concatenate(([0.0], recall, [1.0]))
142 | mpre = np.concatenate(([0.0], precision, [0.0]))
143 |
144 | # compute the precision envelope
145 | for i in range(mpre.size - 1, 0, -1):
146 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i])
147 |
148 | # to calculate area under PR curve, look for points
149 | # where X axis (recall) changes value
150 | i = np.where(mrec[1:] != mrec[:-1])[0]
151 |
152 | # and sum (\Delta recall) * prec
153 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1])
154 | return ap
155 |
--------------------------------------------------------------------------------
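
A toy example of `compute_ap`: sentinel values are appended, the precision curve is made non-increasing from right to left, and precision is integrated over the recall steps. With the made-up curves below the result can be checked by hand:

```python
import numpy as np
from utils.utils import compute_ap

recall = np.array([0.1, 0.4, 0.8, 1.0])
precision = np.array([1.0, 0.8, 0.6, 0.5])

ap = compute_ap(recall, precision)
# 0.1*1.0 + 0.3*0.8 + 0.4*0.6 + 0.2*0.5 = 0.68
print(round(ap, 2))
```
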
/utils/parsing_metrics.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import numpy as np
3 | import os
4 |
5 | # from plot_util import plot_confusion_matrix
6 | # from makemask import *
7 |
8 | def _fast_hist(label_true, label_pred, n_class):
9 | mask = (label_true >= 0) & (label_true < n_class)
10 | hist = np.bincount(
11 | n_class * label_true[mask].astype(int) +
12 | label_pred[mask], minlength=n_class ** 2).reshape(n_class, n_class)
13 | return hist
14 |
15 | def label_accuracy_score(label_trues, label_preds, n_class, bg_thre=200):
16 | """Returns accuracy score evaluation result.
17 | - overall accuracy
18 | - mean accuracy
19 | - mean IU
20 | - fwavacc
21 | """
22 | hist = np.zeros((n_class, n_class))
23 | for lt, lp in zip(label_trues, label_preds):
24 | # hist += _fast_hist(lt.flatten(), lp.flatten(), n_class)
25 |         hist += _fast_hist(lt[lt<bg_thre].flatten(), lp[lt<bg_thre].flatten(), n_class)
26 |     acc = np.diag(hist).sum() / hist.sum()
27 |     acc_cls = np.diag(hist) / hist.sum(axis=1)
28 |     acc_cls = np.nanmean(acc_cls)
29 |     iu = np.diag(hist) / (hist.sum(axis=1) + hist.sum(axis=0) - np.diag(hist))
30 |     mean_iu = np.nanmean(iu)
31 |     freq = hist.sum(axis=1) / hist.sum()
32 |     fwavacc = (freq[freq > 0] * iu[freq > 0]).sum()
33 | return acc, acc_cls, mean_iu, fwavacc
34 |
35 | def label_confusion_matrix(label_trues, label_preds, n_class, bg_thre=200):
36 | # eps=1e-20
37 | hist=np.zeros((n_class,n_class),dtype=float)
38 | """ (8,256,256), (256,256) """
39 | for lt,lp in zip(label_trues, label_preds):
40 | # hist += _fast_hist(lt.flatten(), lp.flatten(), n_class)
41 |         hist += _fast_hist(lt[lt<bg_thre].flatten(), lp[lt<bg_thre].flatten(), n_class)
42 |     confusion = hist.copy()  # raw (un-normalized) confusion counts; callers normalize if needed
43 |     return confusion, hist
44 | 
45 | def hist_based_accu_cal(hist):
46 |     acc = np.diag(hist).sum() / hist.sum()
47 |     acc_cls = np.diag(hist) / hist.sum(axis=1)
48 |     acc_cls = np.nanmean(acc_cls)
49 |     iu = np.diag(hist) / (hist.sum(axis=1) + hist.sum(axis=0) - np.diag(hist))
50 |     mean_iu = np.nanmean(iu)
51 |     freq = hist.sum(axis=1) / hist.sum()
52 |     fwavacc = (freq[freq > 0] * iu[freq > 0]).sum()
72 | return acc, acc_cls, mean_iu, fwavacc, iu
73 |
74 | # if __name__ == '__main__':
75 | # """ Evaluating from saved png segmentation maps
76 | # 0.862723060822 0.608076070823 0.503493670787 0.76556929118
77 | # """
78 | # import csv
79 | # from PIL import Image
80 | # import matplotlib as mpl
81 | # mpl.use('Agg')
82 | # from matplotlib import pyplot as plt
83 | # eps=1e-20
84 |
85 | # class AverageMeter(object):
86 | # """Computes and stores the average and current value"""
87 | # def __init__(self):
88 | # self.reset()
89 |
90 | # def reset(self):
91 | # self.val = 0
92 | # self.avg = 0
93 | # self.sum = 0
94 | # self.count = 0
95 |
96 | # def update(self, val, n=1):
97 | # self.val = val
98 | # self.sum += val * n
99 | # self.count += n
100 | # self.avg = self.sum / self.count
101 | # def load_csv(csv_file):
102 | # img_list, kpt_list, conf_list=[],[],[]
103 | # with open(csv_file, 'rb') as f:
104 | # reader = csv.reader(f)
105 | # for row in reader:
106 | # img_list.append(row[0])
107 | # kpt_list.append([row[i] for i in range(1,len(row)) if i%3!=0])
108 | # conf_list.append([row[i] for i in range(1,len(row)) if i%3==0])
109 | # # print len(img_list),len(kpt_list[0]),len(conf_list[0])
110 | # return img_list,kpt_list,conf_list
111 |
112 | # n_class = 7
113 | # superpixel_smooth = False
114 | # # valfile = '../../ln_data/LIP/TrainVal_pose_annotations/lip_val_set.csv'
115 | # # pred_folder = '../../../git_code/LIP_JPPNet/output/parsing/val/'
116 | # # pred_folder = '../visulizations/refinenet_baseline/test_out/'
117 | # pred_folder = '../visulizations/refinenet_splittask/test_out/'
118 | # gt_folder = '../../ln_data/pascal_data/SegmentationPart/'
119 | # img_path = '../../ln_data/pascal_data/JPEGImages/'
120 |
121 | # file = '../../ln_data/pascal_data/val_id.txt'
122 | # missjoints = '../../ln_data/pascal_data/no_joint_list.txt'
123 | # img_list = [x.strip().split(' ')[0] for x in open(file)]
124 | # miss_list = [x.strip().split(' ')[0] for x in open(missjoints)]
125 |
126 | # conf_matrices = AverageMeter()
127 | # for index in range(len(img_list)):
128 | # img_name = img_list[index]
129 | # if img_name in miss_list:
130 | # continue
131 | # if not os.path.isfile(pred_folder + img_name + '.png'):
132 | # continue
133 | # pred_file = pred_folder + img_name + '.png'
134 | # pred = Image.open(pred_file)
135 | # gt_file = gt_folder + img_name + '.png'
136 | # gt = Image.open(gt_file)
137 | # pred, gt = np.array(pred, dtype=np.int32), np.array(gt, dtype=np.int32)
138 | # if superpixel_smooth:
139 | # img_file = img_path+img_name+'.jpg'
140 | # img = Image.open(img_file)
141 | # pred = superpixel_expand(np.array(img),pred)
142 | # confusion, _ = label_confusion_matrix(gt, pred, n_class)
143 | # conf_matrices.update(confusion,1)
144 | # acc, acc_cls, mean_iu, fwavacc, iu = hist_based_accu_cal(conf_matrices.avg)
145 | # print(acc, acc_cls, mean_iu, fwavacc)
146 | # print(iu)
147 |
148 | # ## SAVE CONFUSION MATRIX
149 | # figure=plt.figure()
150 | # class_name=['bg', 'head', 'torso', 'upper arm', 'lower arm', 'upper leg', 'lower leg']
151 | # conf_matrices = conf_matrices.avg
152 | # for i in range(n_class):
153 | # conf_matrices[i,:]=(conf_matrices[i,:]+eps)/sum(conf_matrices[i,:]+eps)
154 | # plot_confusion_matrix(conf_matrices, classes=class_name,
155 | # rotation=0, include_text=True,
156 | # title='Confusion matrix, without normalization')
157 | # plt.show()
158 | # plt.savefig('../saved_models/Baseline_refinenet_test.jpg')
159 | # plt.close('all')
160 |
--------------------------------------------------------------------------------
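
A quick sanity check of `label_accuracy_score` on a toy two-class case, assuming the repository root is on `PYTHONPATH`; labels at or above `bg_thre` would be masked out, which does not happen here:

```python
import numpy as np
from utils.parsing_metrics import label_accuracy_score

gt   = np.array([[0, 0, 1, 1]])   # one 1x4 "image" with two classes
pred = np.array([[0, 1, 1, 1]])   # one background pixel mislabelled as class 1

acc, acc_cls, mean_iu, fwavacc = label_accuracy_score([gt], [pred], n_class=2)
print(acc)      # 0.75: three of four pixels correct
print(mean_iu)  # (1/2 + 2/3) / 2 = 0.58...: per-class IoUs averaged
```
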
/model/convlstm.py:
--------------------------------------------------------------------------------
1 | import torch.nn as nn
2 | from torch.autograd import Variable
3 | import torch
4 |
5 | """
6 | https://github.com/ndrplz/ConvLSTM_pytorch
7 | """
8 |
9 | class ConvLSTMCell(nn.Module):
10 |
11 | def __init__(self, input_size, input_dim, hidden_dim, kernel_size, bias):
12 | """
13 | Initialize ConvLSTM cell.
14 |
15 | Parameters
16 | ----------
17 | input_size: (int, int)
18 | Height and width of input tensor as (height, width).
19 | input_dim: int
20 | Number of channels of input tensor.
21 | hidden_dim: int
22 | Number of channels of hidden state.
23 | kernel_size: (int, int)
24 | Size of the convolutional kernel.
25 | bias: bool
26 | Whether or not to add the bias.
27 | """
28 |
29 | super(ConvLSTMCell, self).__init__()
30 |
31 | self.height, self.width = input_size
32 | self.input_dim = input_dim
33 | self.hidden_dim = hidden_dim
34 |
35 | self.kernel_size = kernel_size
36 | self.padding = kernel_size[0] // 2, kernel_size[1] // 2
37 | self.bias = bias
38 |
39 | self.conv = nn.Conv2d(in_channels=self.input_dim + self.hidden_dim,
40 | out_channels=4 * self.hidden_dim,
41 | kernel_size=self.kernel_size,
42 | padding=self.padding,
43 | bias=self.bias)
44 |
45 | def forward(self, input_tensor, cur_state):
46 |
47 | h_cur, c_cur = cur_state
48 |
49 | combined = torch.cat([input_tensor, h_cur], dim=1) # concatenate along channel axis
50 |
51 | combined_conv = self.conv(combined)
52 | cc_i, cc_f, cc_o, cc_g = torch.split(combined_conv, self.hidden_dim, dim=1)
53 | i = torch.sigmoid(cc_i)
54 | f = torch.sigmoid(cc_f)
55 | o = torch.sigmoid(cc_o)
56 | g = torch.tanh(cc_g)
57 |
58 | c_next = f * c_cur + i * g
59 | h_next = o * torch.tanh(c_next)
60 |
61 | return h_next, c_next
62 |
63 | def init_hidden(self, batch_size):
64 | return (Variable(torch.zeros(batch_size, self.hidden_dim, self.height, self.width)).cuda(),
65 | Variable(torch.zeros(batch_size, self.hidden_dim, self.height, self.width)).cuda())
66 |
67 |
68 | class ConvLSTM(nn.Module):
69 |
70 | def __init__(self, input_size, input_dim, hidden_dim, kernel_size, num_layers,
71 | batch_first=False, bias=True, return_all_layers=False):
72 | super(ConvLSTM, self).__init__()
73 |
74 | self._check_kernel_size_consistency(kernel_size)
75 |
76 | # Make sure that both `kernel_size` and `hidden_dim` are lists having len == num_layers
77 | kernel_size = self._extend_for_multilayer(kernel_size, num_layers)
78 | hidden_dim = self._extend_for_multilayer(hidden_dim, num_layers)
79 | if not len(kernel_size) == len(hidden_dim) == num_layers:
80 | raise ValueError('Inconsistent list length.')
81 |
82 | self.height, self.width = input_size
83 |
84 | self.input_dim = input_dim
85 | self.hidden_dim = hidden_dim
86 | self.kernel_size = kernel_size
87 | self.num_layers = num_layers
88 | self.batch_first = batch_first
89 | self.bias = bias
90 | self.return_all_layers = return_all_layers
91 |
92 | cell_list = []
93 | for i in range(0, self.num_layers):
94 | cur_input_dim = self.input_dim if i == 0 else self.hidden_dim[i-1]
95 |
96 | cell_list.append(ConvLSTMCell(input_size=(self.height, self.width),
97 | input_dim=cur_input_dim,
98 | hidden_dim=self.hidden_dim[i],
99 | kernel_size=self.kernel_size[i],
100 | bias=self.bias))
101 |
102 | self.cell_list = nn.ModuleList(cell_list)
103 |
104 | def forward(self, input_tensor, hidden_state=None):
105 | """
106 |
107 | Parameters
108 | ----------
109 | input_tensor: todo
110 | 5-D Tensor either of shape (t, b, c, h, w) or (b, t, c, h, w)
111 | hidden_state: todo
112 | None. todo implement stateful
113 |
114 | Returns
115 | -------
116 | last_state_list, layer_output
117 | """
118 | if not self.batch_first:
119 | # (t, b, c, h, w) -> (b, t, c, h, w)
120 | input_tensor = input_tensor.permute(1, 0, 2, 3, 4)
121 |
122 | # Implement stateful ConvLSTM
123 | if hidden_state is not None:
124 | raise NotImplementedError()
125 | else:
126 | hidden_state = self._init_hidden(batch_size=input_tensor.size(0))
127 |
128 | layer_output_list = []
129 | last_state_list = []
130 |
131 | seq_len = input_tensor.size(1)
132 | cur_layer_input = input_tensor
133 |
134 | for layer_idx in range(self.num_layers):
135 |
136 | h, c = hidden_state[layer_idx]
137 | output_inner = []
138 | for t in range(seq_len):
139 |
140 | h, c = self.cell_list[layer_idx](input_tensor=cur_layer_input[:, t, :, :, :],
141 | cur_state=[h, c])
142 | output_inner.append(h)
143 |
144 | layer_output = torch.stack(output_inner, dim=1)
145 | cur_layer_input = layer_output
146 |
147 | layer_output_list.append(layer_output)
148 | last_state_list.append([h, c])
149 |
150 | if not self.return_all_layers:
151 | layer_output_list = layer_output_list[-1:]
152 | last_state_list = last_state_list[-1:]
153 |
154 | return layer_output_list, last_state_list
155 |
156 | def _init_hidden(self, batch_size):
157 | init_states = []
158 | for i in range(self.num_layers):
159 | init_states.append(self.cell_list[i].init_hidden(batch_size))
160 | return init_states
161 |
162 | @staticmethod
163 | def _check_kernel_size_consistency(kernel_size):
164 | if not (isinstance(kernel_size, tuple) or
165 | (isinstance(kernel_size, list) and all([isinstance(elem, tuple) for elem in kernel_size]))):
166 | raise ValueError('`kernel_size` must be tuple or list of tuples')
167 |
168 | @staticmethod
169 | def _extend_for_multilayer(param, num_layers):
170 | if not isinstance(param, list):
171 | param = [param] * num_layers
172 | return param
--------------------------------------------------------------------------------
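
A minimal sketch of running the `ConvLSTM` above on a dummy clip. Shapes follow the docstring (`batch_first=True` gives `(b, t, c, h, w)`), and a CUDA device is assumed because `init_hidden` allocates the hidden state with `.cuda()`:

```python
import torch
from model.convlstm import ConvLSTM

convlstm = ConvLSTM(input_size=(32, 32), input_dim=64, hidden_dim=[64, 32],
                    kernel_size=(3, 3), num_layers=2,
                    batch_first=True, bias=True, return_all_layers=False).cuda()

clip = torch.randn(2, 5, 64, 32, 32).cuda()   # (batch, time, channels, height, width)
layer_outputs, last_states = convlstm(clip)

print(layer_outputs[0].shape)                 # torch.Size([2, 5, 32, 32, 32]): last layer only
h, c = last_states[0]
print(h.shape, c.shape)                       # final hidden and cell state of the last layer
```
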
/model/loss.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import torch
3 | import torch.nn.functional as F
4 | from torch.autograd import Variable
5 | from model.modulation import mask_softmax
6 | from utils.utils import bbox_iou
7 | import math
8 | from torchvision.ops.boxes import box_area
9 | def lr_poly(base_lr, iter, max_iter, power):
10 | return base_lr * ((1 - float(iter) / max_iter) ** (power))
11 |
12 | def lr_cos(base_lr, iter, max_iter, warm_up=0.05):
13 | warm_up_epoch = int(max_iter*warm_up)
14 | if iter<=warm_up_epoch:
15 | lr = base_lr*(0.8*iter/warm_up_epoch+0.2)
16 | else:
17 | lr = 0.5*base_lr*(1+math.cos(math.pi*(iter-warm_up_epoch)/(max_iter-warm_up_epoch)))
18 | return lr
19 |
20 | def adjust_learning_rate(args, optimizer, i_iter):
21 | # print(optimizer.param_groups[0]['lr'], optimizer.param_groups[1]['lr'])
22 | if args.power==-1:
23 | lr = lr_cos(args.lr, i_iter, args.nb_epoch)
24 | elif args.power==-2:
25 | lr = args.lr*((0.5)**(i_iter//10))
26 | elif args.power==-3:
27 | lr = args.lr*((0.5)**(i_iter//30))
28 | elif args.power!=0.:
29 | lr = lr_poly(args.lr, i_iter, args.nb_epoch, args.power)
30 | else:
31 | # lr = args.lr*((0.1)**(i_iter//(args.nb_epoch//4)))
32 | lr = args.lr*((0.5)**(i_iter//(args.nb_epoch//10)))
33 | print(lr)
34 | optimizer.param_groups[0]['lr'] = lr
35 | if len(optimizer.param_groups) > 1:
36 | optimizer.param_groups[1]['lr'] = lr / 10
37 | if len(optimizer.param_groups) > 2:
38 | optimizer.param_groups[2]['lr'] = lr / 10
39 |
40 | def yolo_loss(input, target, gi, gj, best_n_list, w_coord=5., w_neg=1./5, size_average=True):
41 | mseloss = torch.nn.MSELoss(size_average=True)
42 | celoss = torch.nn.CrossEntropyLoss(size_average=True)
43 | batch = input.size(0)
44 |
45 | pred_bbox = Variable(torch.zeros(batch,4).cuda())
46 | gt_bbox = Variable(torch.zeros(batch,4).cuda())
47 | for ii in range(batch):
48 | pred_bbox[ii, 0:2] = F.sigmoid(input[ii,best_n_list[ii],0:2,gj[ii],gi[ii]])
49 | pred_bbox[ii, 2:4] = input[ii,best_n_list[ii],2:4,gj[ii],gi[ii]]
50 | gt_bbox[ii, :] = target[ii,best_n_list[ii],:4,gj[ii],gi[ii]]
51 | loss_x = mseloss(pred_bbox[:,0], gt_bbox[:,0])
52 | loss_y = mseloss(pred_bbox[:,1], gt_bbox[:,1])
53 | loss_w = mseloss(pred_bbox[:,2], gt_bbox[:,2])
54 | loss_h = mseloss(pred_bbox[:,3], gt_bbox[:,3])
55 |
56 | pred_conf_list, gt_conf_list = [], []
57 | pred_conf_list.append(input[:,:,4,:,:].contiguous().view(batch,-1))
58 | gt_conf_list.append(target[:,:,4,:,:].contiguous().view(batch,-1))
59 | pred_conf = torch.cat(pred_conf_list, dim=1)
60 | gt_conf = torch.cat(gt_conf_list, dim=1)
61 | loss_conf = celoss(pred_conf, gt_conf.max(1)[1])
62 |
63 |
64 | return (loss_x+loss_y+loss_w+loss_h)*w_coord + loss_conf
65 |
66 | def generalized_box_iou(boxes1, boxes2):
67 | """
68 | Generalized IoU from https://giou.stanford.edu/
69 |
70 | The boxes should be in [x0, y0, x1, y1] format
71 |
72 | Returns a [N, M] pairwise matrix, where N = len(boxes1)
73 | and M = len(boxes2)
74 | """
75 | # degenerate boxes gives inf / nan results
76 | # so do an early check
77 | if (boxes1[:, 2:] >= boxes1[:, :2]).all() and (boxes2[:, 2:] >= boxes2[:, :2]).all():
78 |
79 | iou, union = box_iou(boxes1, boxes2)
80 |
81 | lt = torch.min(boxes1[:, None, :2], boxes2[:, :2])
82 | rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])
83 |
84 | wh = (rb - lt).clamp(min=0) # [N,M,2]
85 | area = wh[:, :, 0] * wh[:, :, 1]
86 |
87 | return iou - (area - union) / (area)
88 | else:
89 | return torch.tensor([0.])
90 | def box_iou(boxes1, boxes2):
91 | area1 = box_area(boxes1)
92 | area2 = box_area(boxes2)
93 |
94 | lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2]
95 | rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2]
96 |
97 | wh = (rb - lt).clamp(min=0) # [N,M,2]
98 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M]
99 |
100 | union = area1[:, None] + area2 - inter
101 |
102 | iou = inter / union
103 | return iou, union
104 | def diverse_loss(score_list, word_mask, m=-1, coverage_reg=True):
105 | score_matrix = torch.stack([mask_softmax(score,word_mask) for score in score_list], dim=1) ## (B,Nfilm,N,H,W)
106 | cov_matrix = torch.bmm(score_matrix,score_matrix.permute(0,2,1)) ## (BHW,Nfilm,Nfilm)
107 | id_matrix = Variable(torch.eye(cov_matrix.shape[1]).unsqueeze(0).repeat(cov_matrix.shape[0],1,1).cuda())
108 | if m==-1.:
109 | div_reg = torch.sum(((cov_matrix*(1-id_matrix))**2).view(-1))/cov_matrix.shape[0]
110 | else:
111 | div_reg = torch.sum(((cov_matrix-m*id_matrix)**2).view(-1))/cov_matrix.shape[0]
112 | if coverage_reg:
113 | word_mask_cp = word_mask.clone()
114 | for ii in range(word_mask_cp.shape[0]):
115 | word_mask_cp[ii,0]=0
116 | word_mask_cp[ii,word_mask_cp[ii,:].sum()]=0 ## set one to 0 already
117 | cover_matrix = 1.-torch.clamp(torch.sum(score_matrix, dim=1, keepdim=False),min=0.,max=1.)
118 | cover_reg = torch.sum((cover_matrix*word_mask_cp.float()).view(-1))/cov_matrix.shape[0]
119 | div_reg += cover_reg
120 | return div_reg
121 |
122 | def build_target(raw_coord, pred, anchors_full, args):
123 | coord = Variable(torch.zeros(raw_coord.size(0), raw_coord.size(1)).cuda())
124 | batch, grid = raw_coord.size(0), args.size//args.gsize
125 | coord[:,0] = (raw_coord[:,0] + raw_coord[:,2])/(2*args.size)
126 | coord[:,1] = (raw_coord[:,1] + raw_coord[:,3])/(2*args.size)
127 | coord[:,2] = (raw_coord[:,2] - raw_coord[:,0])/(args.size)
128 | coord[:,3] = (raw_coord[:,3] - raw_coord[:,1])/(args.size)
129 | coord = coord * grid
130 | bbox=torch.zeros(coord.size(0),9,5,grid, grid)
131 | best_n_list, best_gi, best_gj = [],[],[]
132 |
133 | for ii in range(batch):
134 | batch, grid = raw_coord.size(0), args.size//args.gsize
135 | gi = coord[ii,0].long()
136 | gj = coord[ii,1].long()
137 | tx = coord[ii,0] - gi.float()
138 | ty = coord[ii,1] - gj.float()
139 |
140 | gw = coord[ii,2]
141 | gh = coord[ii,3]
142 |
143 | anchor_idxs = range(9)
144 | anchors = [anchors_full[i] for i in anchor_idxs]
145 | scaled_anchors = [ (x[0] / (args.anchor_imsize/grid), \
146 | x[1] / (args.anchor_imsize/grid)) for x in anchors]
147 |
148 | ## Get shape of gt box
149 | gt_box = torch.FloatTensor(np.array([0, 0, gw.cpu(), gh.cpu()],dtype=np.float32)).unsqueeze(0)
150 | ## Get shape of anchor box
151 | anchor_shapes = torch.FloatTensor(np.concatenate((np.zeros((len(scaled_anchors), 2)), np.array(scaled_anchors)), 1))
152 | ## Calculate iou between gt and anchor shapes
153 | # anch_ious = list(bbox_iou(gt_box, anchor_shapes))
154 | anch_ious = list(bbox_iou(gt_box, anchor_shapes,x1y1x2y2=False))
155 | ## Find the best matching anchor box
156 | best_n = np.argmax(np.array(anch_ious))
157 |
158 | tw = torch.log(gw / scaled_anchors[best_n][0] + 1e-16)
159 | th = torch.log(gh / scaled_anchors[best_n][1] + 1e-16)
160 |
161 | bbox[ii, best_n, :, gj, gi] = torch.stack([tx, ty, tw, th, torch.ones(1).cuda().squeeze()])
162 | best_n_list.append(int(best_n))
163 | best_gi.append(gi)
164 | best_gj.append(gj)
165 |
166 | bbox = Variable(bbox.cuda())
167 | return bbox, best_gi, best_gj, best_n_list
168 |
--------------------------------------------------------------------------------
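
A worked example for `generalized_box_iou` with hand-picked boxes in `[x0, y0, x1, y1]` format; GIoU is the IoU minus the fraction of the smallest enclosing box not covered by the union, so it goes negative for poorly overlapping boxes. Importing `model.loss` also pulls in `model.modulation`, `utils.utils` and `torchvision`, so those need to be importable:

```python
import torch
from model.loss import generalized_box_iou

boxes1 = torch.tensor([[0., 0., 2., 2.]])   # 2x2 box
boxes2 = torch.tensor([[1., 1., 3., 3.]])   # 2x2 box shifted by (1, 1)

# IoU = 1/7, enclosing box area = 9, union = 7, so GIoU = 1/7 - (9 - 7)/9 ~= -0.079
giou = generalized_box_iou(boxes1, boxes2)
print(giou)
```
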
/utils/transforms.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | Generic Image Transform utillities.
5 | """
6 | import torch
7 | import cv2
8 | import random, math
9 | import numpy as np
10 | from collections.abc import Iterable
11 |
12 | import torch.nn.functional as F
13 | from torch.autograd import Variable
14 |
15 |
16 | class ResizePad:
17 | """
18 | Resize and pad an image to given size.
19 | """
20 |
21 | def __init__(self, size):
22 | if not isinstance(size, (int, Iterable)):
23 | raise TypeError('Got inappropriate size arg: {}'.format(size))
24 |
25 | self.h, self.w = size
26 |
27 | def __call__(self, img):
28 | h, w = img.shape[:2]
29 | scale = min(self.h / h, self.w / w)
30 | resized_h = int(np.round(h * scale))
31 | resized_w = int(np.round(w * scale))
32 | pad_h = int(np.floor(self.h - resized_h) / 2)
33 | pad_w = int(np.floor(self.w - resized_w) / 2)
34 |
35 | resized_img = cv2.resize(img, (resized_w, resized_h))
36 |
37 | # if img.ndim > 2:
38 | if img.ndim > 2:
39 | new_img = np.zeros(
40 | (self.h, self.w, img.shape[-1]), dtype=resized_img.dtype)
41 | else:
42 | resized_img = np.expand_dims(resized_img, -1)
43 | new_img = np.zeros((self.h, self.w, 1), dtype=resized_img.dtype)
44 | new_img[pad_h: pad_h + resized_h,
45 | pad_w: pad_w + resized_w, ...] = resized_img
46 | return new_img
47 |
48 |
49 | class CropResize:
50 | """Remove padding and resize image to its original size."""
51 |
52 | def __call__(self, img, size):
53 | if not isinstance(size, (int, Iterable)):
54 | raise TypeError('Got inappropriate size arg: {}'.format(size))
55 | im_h, im_w = img.data.shape[:2]
56 | input_h, input_w = size
57 | scale = max(input_h / im_h, input_w / im_w)
58 | # scale = torch.Tensor([[input_h / im_h, input_w / im_w]]).max()
59 | resized_h = int(np.round(im_h * scale))
60 | # resized_h = torch.round(im_h * scale)
61 | resized_w = int(np.round(im_w * scale))
62 | # resized_w = torch.round(im_w * scale)
63 | crop_h = int(np.floor(resized_h - input_h) / 2)
64 | # crop_h = torch.floor(resized_h - input_h) // 2
65 | crop_w = int(np.floor(resized_w - input_w) / 2)
66 | # crop_w = torch.floor(resized_w - input_w) // 2
67 | # resized_img = cv2.resize(img, (resized_w, resized_h))
68 | resized_img = F.upsample(
69 | img.unsqueeze(0).unsqueeze(0), size=(resized_h, resized_w),
70 | mode='bilinear')
71 |
72 | resized_img = resized_img.squeeze().unsqueeze(0)
73 |
74 | return resized_img[0, crop_h: crop_h + input_h,
75 | crop_w: crop_w + input_w]
76 |
77 |
78 | class ResizeImage:
79 | """Resize the largest of the sides of the image to a given size"""
80 | def __init__(self, size):
81 | if not isinstance(size, (int, Iterable)):
82 | raise TypeError('Got inappropriate size arg: {}'.format(size))
83 |
84 | self.size = size
85 |
86 | def __call__(self, img):
87 | im_h, im_w = img.shape[-2:]
88 | scale = min(self.size / im_h, self.size / im_w)
89 | resized_h = int(np.round(im_h * scale))
90 | resized_w = int(np.round(im_w * scale))
91 | out = F.upsample(
92 | Variable(img).unsqueeze(0), size=(resized_h, resized_w),
93 | mode='bilinear').squeeze().data
94 | return out
95 |
96 |
97 | class ResizeAnnotation:
98 | """Resize the largest of the sides of the annotation to a given size"""
99 | def __init__(self, size):
100 | if not isinstance(size, (int, Iterable)):
101 | raise TypeError('Got inappropriate size arg: {}'.format(size))
102 |
103 | self.size = size
104 |
105 | def __call__(self, img):
106 | im_h, im_w = img.shape[-2:]
107 | scale = min(self.size / im_h, self.size / im_w)
108 | resized_h = int(np.round(im_h * scale))
109 | resized_w = int(np.round(im_w * scale))
110 | out = F.upsample(
111 | Variable(img).unsqueeze(0).unsqueeze(0),
112 | size=(resized_h, resized_w),
113 | mode='bilinear').squeeze().data
114 | return out
115 |
116 |
117 | class ToNumpy:
118 | """Transform an torch.*Tensor to an numpy ndarray."""
119 |
120 | def __call__(self, x):
121 | return x.numpy()
122 |
123 | def letterbox(img, mask, height, color=(123.7, 116.3, 103.5)): # resize a rectangular image to a padded square
124 | shape = img.shape[:2] # shape = [height, width]
125 | #print(shape)
126 | ratio = float(height) / max(shape) # ratio = old / new
127 | new_shape = (round(shape[1] * ratio), round(shape[0] * ratio))
128 | dw = (height - new_shape[0]) / 2 # width padding
129 | dh = (height - new_shape[1]) / 2 # height padding
130 | top, bottom = round(dh - 0.1), round(dh + 0.1)
131 | left, right = round(dw - 0.1), round(dw + 0.1)
132 | img = cv2.resize(img, new_shape, interpolation=cv2.INTER_AREA) # resized, no border
133 | img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # padded square
134 | if mask is not None:
135 | mask = cv2.resize(mask, new_shape, interpolation=cv2.INTER_NEAREST) # resized, no border
136 | mask = cv2.copyMakeBorder(mask, top, bottom, left, right, cv2.BORDER_CONSTANT, value=0) # padded square
137 | return img, mask, ratio, dw, dh
138 |
139 | def random_affine(img, mask, targets, degrees=(-10, 10), translate=(.1, .1), scale=(.9, 1.1), shear=(-2, 2),
140 | borderValue=(123.7, 116.3, 103.5), all_bbox=None):
141 | border = 0 # width of added border (optional)
142 | height = max(img.shape[0], img.shape[1]) + border * 2
143 | # Rotation and Scale
144 | R = np.eye(3)
145 | Rht = np.eye(3)
146 | a = random.random() * (degrees[1] - degrees[0]) + degrees[0]
147 | # a += random.choice([-180, -90, 0, 90]) # 90deg rotations added to small rotations
148 | s = random.random() * (scale[1] - scale[0]) + scale[0]
149 | R[:2] = cv2.getRotationMatrix2D(angle=a, center=(img.shape[1] / 2, img.shape[0] / 2), scale=s)
150 | # Translation
151 | T = np.eye(3)
152 | r1 = random.random()
153 | r2 = random.random()
154 | T[0, 2] = (r1 * 2 - 1) * translate[0] * img.shape[0] + border # x translation (pixels)
155 | T[1, 2] = (r2 * 2 - 1) * translate[1] * img.shape[1] + border # y translation (pixels)
156 |
157 | # Shear
158 | S = np.eye(3)
159 | S[0, 1] = math.tan((random.random() * (shear[1] - shear[0]) + shear[0]) * math.pi / 180) # x shear (deg)
160 | S[1, 0] = math.tan((random.random() * (shear[1] - shear[0]) + shear[0]) * math.pi / 180) # y shear (deg)
161 |
162 | M = S @ T @ R # Combined rotation matrix. ORDER IS IMPORTANT HERE!!
163 | imw = cv2.warpPerspective(img, M, dsize=(height, height), flags=cv2.INTER_LINEAR,
164 | borderValue=borderValue) # BGR order borderValue
165 |
166 | if mask is not None:
167 | maskw = cv2.warpPerspective(mask, M, dsize=(height, height), flags=cv2.INTER_NEAREST,
168 | borderValue=0) # BGR order borderValue
169 | else:
170 | maskw = None
171 |
172 | # Return warped points also
173 | if isinstance(targets, list):
174 | targetlist=[]
175 | for bbox in targets:
176 | targetlist.append(wrap_points(bbox, M, height, a))
177 | return imw, maskw, targetlist, M
178 | elif all_bbox is not None:
179 | targets = wrap_points(targets, M, height, a)
180 | for ii in range(all_bbox.shape[0]):
181 | all_bbox[ii,:] = wrap_points(all_bbox[ii,:], M, height, a)
182 | return imw, maskw, targets, all_bbox, M
183 | elif targets is not None: ## previous main
184 | targets = wrap_points(targets, M, height, a)
185 | return imw, maskw, targets, M
186 | else:
187 | return imw
188 |
189 | def wrap_points(targets, M, height, a):
190 | # n = targets.shape[0]
191 | # points = targets[:, 1:5].copy()
192 | points = targets.copy()
193 | # area0 = (points[:, 2] - points[:, 0]) * (points[:, 3] - points[:, 1])
194 | area0 = (points[2] - points[0]) * (points[3] - points[1])
195 |
196 | # warp points
197 | xy = np.ones((4, 3))
198 | xy[:, :2] = points[[0, 1, 2, 3, 0, 3, 2, 1]].reshape(4, 2) # x1y1, x2y2, x1y2, x2y1
199 | xy = (xy @ M.T)[:, :2].reshape(1, 8)
200 |
201 | # create new boxes
202 | x = xy[:, [0, 2, 4, 6]]
203 | y = xy[:, [1, 3, 5, 7]]
204 | xy = np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, 1).T
205 |
206 | # apply angle-based reduction
207 | radians = a * math.pi / 180
208 | reduction = max(abs(math.sin(radians)), abs(math.cos(radians))) ** 0.5
209 | x = (xy[:, 2] + xy[:, 0]) / 2
210 | y = (xy[:, 3] + xy[:, 1]) / 2
211 | w = (xy[:, 2] - xy[:, 0]) * reduction
212 | h = (xy[:, 3] - xy[:, 1]) * reduction
213 | xy = np.concatenate((x - w / 2, y - h / 2, x + w / 2, y + h / 2)).reshape(4, 1).T
214 |
215 | # reject warped points outside of image
216 | np.clip(xy, 0, height, out=xy)
217 | w = xy[:, 2] - xy[:, 0]
218 | h = xy[:, 3] - xy[:, 1]
219 | area = w * h
220 | ar = np.maximum(w / (h + 1e-16), h / (w + 1e-16))
221 | i = (w > 4) & (h > 4) & (area / (area0 + 1e-16) > 0.1) & (ar < 10)
222 |
223 | ## print(targets, xy)
224 | ## [ 56 36 108 210] [[ 47.80464857 15.6096533 106.30993434 196.71267693]]
225 | # targets = targets[i]
226 | # targets[:, 1:5] = xy[i]
227 | targets = xy[0]
228 | return targets
--------------------------------------------------------------------------------
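A minimal usage sketch for letterbox above, assuming it is imported from utils.transforms with the script run at the repository root; undo_letterbox is a hypothetical helper showing how the returned ratio/dw/dh map a box in the padded image back to the original image.

import numpy as np
from utils.transforms import letterbox

def undo_letterbox(box, ratio, dw, dh):
    # box is [x1, y1, x2, y2] in letterboxed (resized, padded) coordinates
    return np.array([(box[0] - dw) / ratio, (box[1] - dh) / ratio,
                     (box[2] - dw) / ratio, (box[3] - dh) / ratio])

dummy = np.zeros((300, 400, 3), dtype=np.uint8)        # H x W x C image
img, _, ratio, dw, dh = letterbox(dummy, None, 512)    # 512 x 512 padded square
print(img.shape, ratio, dw, dh)
print(undo_letterbox(np.array([100., 144., 200., 224.]), ratio, dw, dh))
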
/utils/transformsv2.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | Generic Image Transform utilities.
5 | """
6 |
7 | import cv2
8 | import random, math
9 | import numpy as np
10 | from collections.abc import Iterable
11 |
12 | import torch.nn.functional as F
13 | from torch.autograd import Variable
14 |
15 |
16 | class ResizePad:
17 | """
18 | Resize and pad an image to given size.
19 | """
20 |
21 | def __init__(self, size):
22 | if not isinstance(size, (int, Iterable)):
23 | raise TypeError('Got inappropriate size arg: {}'.format(size))
24 |
25 | self.h, self.w = size
26 |
27 | def __call__(self, img):
28 | h, w = img.shape[:2]
29 | scale = min(self.h / h, self.w / w)
30 | resized_h = int(np.round(h * scale))
31 | resized_w = int(np.round(w * scale))
32 | pad_h = int(np.floor(self.h - resized_h) / 2)
33 | pad_w = int(np.floor(self.w - resized_w) / 2)
34 |
35 | resized_img = cv2.resize(img, (resized_w, resized_h))
36 |
37 | # if img.ndim > 2:
38 | if img.ndim > 2:
39 | new_img = np.zeros(
40 | (self.h, self.w, img.shape[-1]), dtype=resized_img.dtype)
41 | else:
42 | resized_img = np.expand_dims(resized_img, -1)
43 | new_img = np.zeros((self.h, self.w, 1), dtype=resized_img.dtype)
44 | new_img[pad_h: pad_h + resized_h,
45 | pad_w: pad_w + resized_w, ...] = resized_img
46 | return new_img
47 |
48 |
49 | class CropResize:
50 | """Remove padding and resize image to its original size."""
51 |
52 | def __call__(self, img, size):
53 | if not isinstance(size, (int, Iterable)):
54 | raise TypeError('Got inappropriate size arg: {}'.format(size))
55 | im_h, im_w = img.data.shape[:2]
56 | input_h, input_w = size
57 | scale = max(input_h / im_h, input_w / im_w)
58 | # scale = torch.Tensor([[input_h / im_h, input_w / im_w]]).max()
59 | resized_h = int(np.round(im_h * scale))
60 | # resized_h = torch.round(im_h * scale)
61 | resized_w = int(np.round(im_w * scale))
62 | # resized_w = torch.round(im_w * scale)
63 | crop_h = int(np.floor(resized_h - input_h) / 2)
64 | # crop_h = torch.floor(resized_h - input_h) // 2
65 | crop_w = int(np.floor(resized_w - input_w) / 2)
66 | # crop_w = torch.floor(resized_w - input_w) // 2
67 | # resized_img = cv2.resize(img, (resized_w, resized_h))
68 | resized_img = F.upsample(
69 | img.unsqueeze(0).unsqueeze(0), size=(resized_h, resized_w),
70 | mode='bilinear')
71 |
72 | resized_img = resized_img.squeeze().unsqueeze(0)
73 |
74 | return resized_img[0, crop_h: crop_h + input_h,
75 | crop_w: crop_w + input_w]
76 |
77 |
78 | class ResizeImage:
79 | """Resize the largest of the sides of the image to a given size"""
80 | def __init__(self, size):
81 | if not isinstance(size, (int, Iterable)):
82 | raise TypeError('Got inappropriate size arg: {}'.format(size))
83 |
84 | self.size = size
85 |
86 | def __call__(self, img):
87 | im_h, im_w = img.shape[-2:]
88 | scale = min(self.size / im_h, self.size / im_w)
89 | resized_h = int(np.round(im_h * scale))
90 | resized_w = int(np.round(im_w * scale))
91 | out = F.upsample(
92 | Variable(img).unsqueeze(0), size=(resized_h, resized_w),
93 | mode='bilinear').squeeze().data
94 | return out
95 |
96 |
97 | class ResizeAnnotation:
98 | """Resize the largest of the sides of the annotation to a given size"""
99 | def __init__(self, size):
100 | if not isinstance(size, (int, Iterable)):
101 | raise TypeError('Got inappropriate size arg: {}'.format(size))
102 |
103 | self.size = size
104 |
105 | def __call__(self, img):
106 | im_h, im_w = img.shape[-2:]
107 | scale = min(self.size / im_h, self.size / im_w)
108 | resized_h = int(np.round(im_h * scale))
109 | resized_w = int(np.round(im_w * scale))
110 | out = F.upsample(
111 | Variable(img).unsqueeze(0).unsqueeze(0),
112 | size=(resized_h, resized_w),
113 | mode='bilinear').squeeze().data
114 | return out
115 |
116 |
117 | class ToNumpy:
118 | """Transform an torch.*Tensor to an numpy ndarray."""
119 |
120 | def __call__(self, x):
121 | return x.numpy()
122 |
123 | def letterbox(img, mask, height, color=(123.7, 116.3, 103.5)): # resize a rectangular image to a padded square
124 | shape = img.shape[:2] # shape = [height, width]
125 | ratio = float(height) / max(shape) # ratio = old / new
126 | new_shape = (round(shape[1] * ratio), round(shape[0] * ratio))
127 | dw = (height - new_shape[0]) / 2 # width padding
128 | dh = (height - new_shape[1]) / 2 # height padding
129 | top, bottom = round(dh - 0.1), round(dh + 0.1)
130 | left, right = round(dw - 0.1), round(dw + 0.1)
131 | img = cv2.resize(img, new_shape, interpolation=cv2.INTER_AREA) # resized, no border
132 | img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # padded square
133 | if mask is not None:
134 | mask = cv2.resize(mask, new_shape, interpolation=cv2.INTER_NEAREST) # resized, no border
135 | mask = cv2.copyMakeBorder(mask, top, bottom, left, right, cv2.BORDER_CONSTANT, value=255) # padded square
136 | return img, mask, ratio, dw, dh
137 |
138 | def random_affine(img, mask, ht, targets, degrees=(-10, 10), translate=(.1, .1), scale=(.9, 1.1), shear=(-2, 2),
139 | borderValue=(123.7, 116.3, 103.5), all_bbox=None):
140 | border = 0 # width of added border (optional)
141 | height = max(img.shape[0], img.shape[1]) + border * 2
142 | heightht = max(ht.shape[0], ht.shape[1]) + border * 2 # ht (auxiliary map, e.g. a pose heatmap) must be passed in; its warp matrix Mht is built below but never applied or returned in this version
143 | # Rotation and Scale
144 | R = np.eye(3)
145 | Rht = np.eye(3)
146 | a = random.random() * (degrees[1] - degrees[0]) + degrees[0]
147 | # a += random.choice([-180, -90, 0, 90]) # 90deg rotations added to small rotations
148 | s = random.random() * (scale[1] - scale[0]) + scale[0]
149 | R[:2] = cv2.getRotationMatrix2D(angle=a, center=(img.shape[1] / 2, img.shape[0] / 2), scale=s)
150 | Rht[:2] = cv2.getRotationMatrix2D(angle=a, center=(ht.shape[1] / 2, ht.shape[0] / 2), scale=s)
151 | # Translation
152 | T = np.eye(3)
153 | r1 = random.random()
154 | r2 = random.random()
155 | T[0, 2] = (r1 * 2 - 1) * translate[0] * img.shape[0] + border # x translation (pixels)
156 | T[1, 2] = (r2 * 2 - 1) * translate[1] * img.shape[1] + border # y translation (pixels)
157 |
158 | Tht = np.eye(3)
159 | Tht[0, 2] = (r1 * 2 - 1) * translate[0] * ht.shape[0] + border # x translation (pixels)
160 | Tht[1, 2] = (r2 * 2 - 1) * translate[1] * ht.shape[1] + border # y translation (pixels)
161 | # Shear
162 | S = np.eye(3)
163 | S[0, 1] = math.tan((random.random() * (shear[1] - shear[0]) + shear[0]) * math.pi / 180) # x shear (deg)
164 | S[1, 0] = math.tan((random.random() * (shear[1] - shear[0]) + shear[0]) * math.pi / 180) # y shear (deg)
165 |
166 | M = S @ T @ R # Combined rotation matrix. ORDER IS IMPORTANT HERE!!
167 | Mht = S@Tht @ Rht
168 | imw = cv2.warpPerspective(img, M, dsize=(height, height), flags=cv2.INTER_LINEAR,
169 | borderValue=borderValue) # BGR order borderValue
170 |
171 | if mask is not None:
172 | maskw = cv2.warpPerspective(mask, M, dsize=(height, height), flags=cv2.INTER_NEAREST,
173 | borderValue=0) # BGR order borderValue
174 | else:
175 | maskw = None
176 |
177 | # Return warped points also
178 | if isinstance(targets, list):
179 | targetlist=[]
180 | for bbox in targets:
181 | targetlist.append(wrap_points(bbox, M, height, a))
182 | return imw, maskw, targetlist, M
183 | elif all_bbox is not None:
184 | targets = wrap_points(targets, M, height, a)
185 | for ii in range(all_bbox.shape[0]):
186 | all_bbox[ii,:] = wrap_points(all_bbox[ii,:], M, height, a)
187 | return imw, maskw, targets, all_bbox, M
188 | elif targets is not None: ## previous main
189 | targets = wrap_points(targets, M, height, a)
190 | return imw, maskw, targets, M
191 | else:
192 | return imw
193 |
194 | def wrap_points(targets, M, height, a):
195 | # n = targets.shape[0]
196 | # points = targets[:, 1:5].copy()
197 | points = targets.copy()
198 | # area0 = (points[:, 2] - points[:, 0]) * (points[:, 3] - points[:, 1])
199 | area0 = (points[2] - points[0]) * (points[3] - points[1])
200 |
201 | # warp points
202 | xy = np.ones((4, 3))
203 | xy[:, :2] = points[[0, 1, 2, 3, 0, 3, 2, 1]].reshape(4, 2) # x1y1, x2y2, x1y2, x2y1
204 | xy = (xy @ M.T)[:, :2].reshape(1, 8)
205 |
206 | # create new boxes
207 | x = xy[:, [0, 2, 4, 6]]
208 | y = xy[:, [1, 3, 5, 7]]
209 | xy = np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, 1).T
210 |
211 | # apply angle-based reduction
212 | radians = a * math.pi / 180
213 | reduction = max(abs(math.sin(radians)), abs(math.cos(radians))) ** 0.5
214 | x = (xy[:, 2] + xy[:, 0]) / 2
215 | y = (xy[:, 3] + xy[:, 1]) / 2
216 | w = (xy[:, 2] - xy[:, 0]) * reduction
217 | h = (xy[:, 3] - xy[:, 1]) * reduction
218 | xy = np.concatenate((x - w / 2, y - h / 2, x + w / 2, y + h / 2)).reshape(4, 1).T
219 |
220 | # reject warped points outside of image
221 | np.clip(xy, 0, height, out=xy)
222 | w = xy[:, 2] - xy[:, 0]
223 | h = xy[:, 3] - xy[:, 1]
224 | area = w * h
225 | ar = np.maximum(w / (h + 1e-16), h / (w + 1e-16))
226 | i = (w > 4) & (h > 4) & (area / (area0 + 1e-16) > 0.1) & (ar < 10)
227 |
228 | ## print(targets, xy)
229 | ## [ 56 36 108 210] [[ 47.80464857 15.6096533 106.30993434 196.71267693]]
230 | # targets = targets[i]
231 | # targets[:, 1:5] = xy[i]
232 | targets = xy[0]
233 | return targets
--------------------------------------------------------------------------------
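transformsv2.py is largely a duplicate of transforms.py with an extra, incomplete heatmap branch in random_affine. For reference, a small sketch of the geometry both versions share, with made-up rotation/translation/shear values: compose M = S @ T @ R and push the four box corners through M, taking their axis-aligned envelope the way wrap_points does before its angle-based shrink.

import math
import cv2
import numpy as np

a, s = 5.0, 1.05                                         # example rotation (deg) and scale
R = np.eye(3)
R[:2] = cv2.getRotationMatrix2D(angle=a, center=(256, 256), scale=s)
T = np.eye(3)
T[0, 2], T[1, 2] = 10, -8                                # example pixel translation
S = np.eye(3)
S[0, 1] = math.tan(math.radians(1.5))                    # example x shear
M = S @ T @ R

box = np.array([56, 36, 108, 210], dtype=float)          # x1, y1, x2, y2
corners = np.array([[box[0], box[1]], [box[2], box[3]],
                    [box[0], box[3]], [box[2], box[1]]])
warped = np.hstack([corners, np.ones((4, 1))]) @ M.T     # homogeneous corners through M
x, y = warped[:, 0], warped[:, 1]
print([x.min(), y.min(), x.max(), y.max()])              # axis-aligned envelope of the warped box
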
/model/yolov3.cfg:
--------------------------------------------------------------------------------
1 | [net]
2 | # Testing
3 | #batch=1
4 | #subdivisions=1
5 | # Training
6 | batch=16
7 | subdivisions=1
8 | width=416
9 | height=416
10 | channels=3
11 | momentum=0.9
12 | decay=0.0005
13 | angle=0
14 | saturation = 1.5
15 | exposure = 1.5
16 | hue=.1
17 |
18 | learning_rate=0.001
19 | burn_in=1000
20 | max_batches = 500200
21 | policy=steps
22 | steps=400000,450000
23 | scales=.1,.1
24 |
25 | [convolutional]
26 | batch_normalize=1
27 | filters=32
28 | size=3
29 | stride=1
30 | pad=1
31 | activation=leaky
32 |
33 | # Downsample
34 |
35 | [convolutional]
36 | batch_normalize=1
37 | filters=64
38 | size=3
39 | stride=2
40 | pad=1
41 | activation=leaky
42 |
43 | [convolutional]
44 | batch_normalize=1
45 | filters=32
46 | size=1
47 | stride=1
48 | pad=1
49 | activation=leaky
50 |
51 | [convolutional]
52 | batch_normalize=1
53 | filters=64
54 | size=3
55 | stride=1
56 | pad=1
57 | activation=leaky
58 |
59 | [shortcut]
60 | from=-3
61 | activation=linear
62 |
63 | # Downsample
64 |
65 | [convolutional]
66 | batch_normalize=1
67 | filters=128
68 | size=3
69 | stride=2
70 | pad=1
71 | activation=leaky
72 |
73 | [convolutional]
74 | batch_normalize=1
75 | filters=64
76 | size=1
77 | stride=1
78 | pad=1
79 | activation=leaky
80 |
81 | [convolutional]
82 | batch_normalize=1
83 | filters=128
84 | size=3
85 | stride=1
86 | pad=1
87 | activation=leaky
88 |
89 | [shortcut]
90 | from=-3
91 | activation=linear
92 |
93 | [convolutional]
94 | batch_normalize=1
95 | filters=64
96 | size=1
97 | stride=1
98 | pad=1
99 | activation=leaky
100 |
101 | [convolutional]
102 | batch_normalize=1
103 | filters=128
104 | size=3
105 | stride=1
106 | pad=1
107 | activation=leaky
108 |
109 | [shortcut]
110 | from=-3
111 | activation=linear
112 |
113 | # Downsample
114 |
115 | [convolutional]
116 | batch_normalize=1
117 | filters=256
118 | size=3
119 | stride=2
120 | pad=1
121 | activation=leaky
122 |
123 | [convolutional]
124 | batch_normalize=1
125 | filters=128
126 | size=1
127 | stride=1
128 | pad=1
129 | activation=leaky
130 |
131 | [convolutional]
132 | batch_normalize=1
133 | filters=256
134 | size=3
135 | stride=1
136 | pad=1
137 | activation=leaky
138 |
139 | [shortcut]
140 | from=-3
141 | activation=linear
142 |
143 | [convolutional]
144 | batch_normalize=1
145 | filters=128
146 | size=1
147 | stride=1
148 | pad=1
149 | activation=leaky
150 |
151 | [convolutional]
152 | batch_normalize=1
153 | filters=256
154 | size=3
155 | stride=1
156 | pad=1
157 | activation=leaky
158 |
159 | [shortcut]
160 | from=-3
161 | activation=linear
162 |
163 | [convolutional]
164 | batch_normalize=1
165 | filters=128
166 | size=1
167 | stride=1
168 | pad=1
169 | activation=leaky
170 |
171 | [convolutional]
172 | batch_normalize=1
173 | filters=256
174 | size=3
175 | stride=1
176 | pad=1
177 | activation=leaky
178 |
179 | [shortcut]
180 | from=-3
181 | activation=linear
182 |
183 | [convolutional]
184 | batch_normalize=1
185 | filters=128
186 | size=1
187 | stride=1
188 | pad=1
189 | activation=leaky
190 |
191 | [convolutional]
192 | batch_normalize=1
193 | filters=256
194 | size=3
195 | stride=1
196 | pad=1
197 | activation=leaky
198 |
199 | [shortcut]
200 | from=-3
201 | activation=linear
202 |
203 |
204 | [convolutional]
205 | batch_normalize=1
206 | filters=128
207 | size=1
208 | stride=1
209 | pad=1
210 | activation=leaky
211 |
212 | [convolutional]
213 | batch_normalize=1
214 | filters=256
215 | size=3
216 | stride=1
217 | pad=1
218 | activation=leaky
219 |
220 | [shortcut]
221 | from=-3
222 | activation=linear
223 |
224 | [convolutional]
225 | batch_normalize=1
226 | filters=128
227 | size=1
228 | stride=1
229 | pad=1
230 | activation=leaky
231 |
232 | [convolutional]
233 | batch_normalize=1
234 | filters=256
235 | size=3
236 | stride=1
237 | pad=1
238 | activation=leaky
239 |
240 | [shortcut]
241 | from=-3
242 | activation=linear
243 |
244 | [convolutional]
245 | batch_normalize=1
246 | filters=128
247 | size=1
248 | stride=1
249 | pad=1
250 | activation=leaky
251 |
252 | [convolutional]
253 | batch_normalize=1
254 | filters=256
255 | size=3
256 | stride=1
257 | pad=1
258 | activation=leaky
259 |
260 | [shortcut]
261 | from=-3
262 | activation=linear
263 |
264 | [convolutional]
265 | batch_normalize=1
266 | filters=128
267 | size=1
268 | stride=1
269 | pad=1
270 | activation=leaky
271 |
272 | [convolutional]
273 | batch_normalize=1
274 | filters=256
275 | size=3
276 | stride=1
277 | pad=1
278 | activation=leaky
279 |
280 | [shortcut]
281 | from=-3
282 | activation=linear
283 |
284 | # Downsample
285 |
286 | [convolutional]
287 | batch_normalize=1
288 | filters=512
289 | size=3
290 | stride=2
291 | pad=1
292 | activation=leaky
293 |
294 | [convolutional]
295 | batch_normalize=1
296 | filters=256
297 | size=1
298 | stride=1
299 | pad=1
300 | activation=leaky
301 |
302 | [convolutional]
303 | batch_normalize=1
304 | filters=512
305 | size=3
306 | stride=1
307 | pad=1
308 | activation=leaky
309 |
310 | [shortcut]
311 | from=-3
312 | activation=linear
313 |
314 |
315 | [convolutional]
316 | batch_normalize=1
317 | filters=256
318 | size=1
319 | stride=1
320 | pad=1
321 | activation=leaky
322 |
323 | [convolutional]
324 | batch_normalize=1
325 | filters=512
326 | size=3
327 | stride=1
328 | pad=1
329 | activation=leaky
330 |
331 | [shortcut]
332 | from=-3
333 | activation=linear
334 |
335 |
336 | [convolutional]
337 | batch_normalize=1
338 | filters=256
339 | size=1
340 | stride=1
341 | pad=1
342 | activation=leaky
343 |
344 | [convolutional]
345 | batch_normalize=1
346 | filters=512
347 | size=3
348 | stride=1
349 | pad=1
350 | activation=leaky
351 |
352 | [shortcut]
353 | from=-3
354 | activation=linear
355 |
356 |
357 | [convolutional]
358 | batch_normalize=1
359 | filters=256
360 | size=1
361 | stride=1
362 | pad=1
363 | activation=leaky
364 |
365 | [convolutional]
366 | batch_normalize=1
367 | filters=512
368 | size=3
369 | stride=1
370 | pad=1
371 | activation=leaky
372 |
373 | [shortcut]
374 | from=-3
375 | activation=linear
376 |
377 | [convolutional]
378 | batch_normalize=1
379 | filters=256
380 | size=1
381 | stride=1
382 | pad=1
383 | activation=leaky
384 |
385 | [convolutional]
386 | batch_normalize=1
387 | filters=512
388 | size=3
389 | stride=1
390 | pad=1
391 | activation=leaky
392 |
393 | [shortcut]
394 | from=-3
395 | activation=linear
396 |
397 |
398 | [convolutional]
399 | batch_normalize=1
400 | filters=256
401 | size=1
402 | stride=1
403 | pad=1
404 | activation=leaky
405 |
406 | [convolutional]
407 | batch_normalize=1
408 | filters=512
409 | size=3
410 | stride=1
411 | pad=1
412 | activation=leaky
413 |
414 | [shortcut]
415 | from=-3
416 | activation=linear
417 |
418 |
419 | [convolutional]
420 | batch_normalize=1
421 | filters=256
422 | size=1
423 | stride=1
424 | pad=1
425 | activation=leaky
426 |
427 | [convolutional]
428 | batch_normalize=1
429 | filters=512
430 | size=3
431 | stride=1
432 | pad=1
433 | activation=leaky
434 |
435 | [shortcut]
436 | from=-3
437 | activation=linear
438 |
439 | [convolutional]
440 | batch_normalize=1
441 | filters=256
442 | size=1
443 | stride=1
444 | pad=1
445 | activation=leaky
446 |
447 | [convolutional]
448 | batch_normalize=1
449 | filters=512
450 | size=3
451 | stride=1
452 | pad=1
453 | activation=leaky
454 |
455 | [shortcut]
456 | from=-3
457 | activation=linear
458 |
459 | # Downsample
460 |
461 | [convolutional]
462 | batch_normalize=1
463 | filters=1024
464 | size=3
465 | stride=2
466 | pad=1
467 | activation=leaky
468 |
469 | [convolutional]
470 | batch_normalize=1
471 | filters=512
472 | size=1
473 | stride=1
474 | pad=1
475 | activation=leaky
476 |
477 | [convolutional]
478 | batch_normalize=1
479 | filters=1024
480 | size=3
481 | stride=1
482 | pad=1
483 | activation=leaky
484 |
485 | [shortcut]
486 | from=-3
487 | activation=linear
488 |
489 | [convolutional]
490 | batch_normalize=1
491 | filters=512
492 | size=1
493 | stride=1
494 | pad=1
495 | activation=leaky
496 |
497 | [convolutional]
498 | batch_normalize=1
499 | filters=1024
500 | size=3
501 | stride=1
502 | pad=1
503 | activation=leaky
504 |
505 | [shortcut]
506 | from=-3
507 | activation=linear
508 |
509 | [convolutional]
510 | batch_normalize=1
511 | filters=512
512 | size=1
513 | stride=1
514 | pad=1
515 | activation=leaky
516 |
517 | [convolutional]
518 | batch_normalize=1
519 | filters=1024
520 | size=3
521 | stride=1
522 | pad=1
523 | activation=leaky
524 |
525 | [shortcut]
526 | from=-3
527 | activation=linear
528 |
529 | [convolutional]
530 | batch_normalize=1
531 | filters=512
532 | size=1
533 | stride=1
534 | pad=1
535 | activation=leaky
536 |
537 | [convolutional]
538 | batch_normalize=1
539 | filters=1024
540 | size=3
541 | stride=1
542 | pad=1
543 | activation=leaky
544 |
545 | [shortcut]
546 | from=-3
547 | activation=linear
548 |
549 | ######################
550 |
551 | [convolutional]
552 | batch_normalize=1
553 | filters=512
554 | size=1
555 | stride=1
556 | pad=1
557 | activation=leaky
558 |
559 | [convolutional]
560 | batch_normalize=1
561 | size=3
562 | stride=1
563 | pad=1
564 | filters=1024
565 | activation=leaky
566 |
567 | [convolutional]
568 | batch_normalize=1
569 | filters=512
570 | size=1
571 | stride=1
572 | pad=1
573 | activation=leaky
574 |
575 | [convolutional]
576 | batch_normalize=1
577 | size=3
578 | stride=1
579 | pad=1
580 | filters=1024
581 | activation=leaky
582 |
583 | [yoloconvolutional]
584 | batch_normalize=1
585 | filters=512
586 | size=1
587 | stride=1
588 | pad=1
589 | activation=leaky
590 |
591 | [convolutional]
592 | batch_normalize=1
593 | size=3
594 | stride=1
595 | pad=1
596 | filters=1024
597 | activation=leaky
598 |
599 | [convolutional]
600 | size=1
601 | stride=1
602 | pad=1
603 | filters=255
604 | activation=linear
605 |
606 |
607 | [yolo]
608 | mask = 6,7,8
609 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
610 | classes=80
611 | num=9
612 | jitter=.3
613 | ignore_thresh = .7
614 | truth_thresh = 1
615 | random=1
616 |
617 |
618 | [route]
619 | layers = -4
620 |
621 | [convolutional]
622 | batch_normalize=1
623 | filters=256
624 | size=1
625 | stride=1
626 | pad=1
627 | activation=leaky
628 |
629 | [upsample]
630 | stride=2
631 |
632 | [route]
633 | layers = -1, 61
634 |
635 |
636 |
637 | [convolutional]
638 | batch_normalize=1
639 | filters=256
640 | size=1
641 | stride=1
642 | pad=1
643 | activation=leaky
644 |
645 | [convolutional]
646 | batch_normalize=1
647 | size=3
648 | stride=1
649 | pad=1
650 | filters=512
651 | activation=leaky
652 |
653 | [convolutional]
654 | batch_normalize=1
655 | filters=256
656 | size=1
657 | stride=1
658 | pad=1
659 | activation=leaky
660 |
661 | [convolutional]
662 | batch_normalize=1
663 | size=3
664 | stride=1
665 | pad=1
666 | filters=512
667 | activation=leaky
668 |
669 | [yoloconvolutional]
670 | batch_normalize=1
671 | filters=256
672 | size=1
673 | stride=1
674 | pad=1
675 | activation=leaky
676 |
677 | [convolutional]
678 | batch_normalize=1
679 | size=3
680 | stride=1
681 | pad=1
682 | filters=512
683 | activation=leaky
684 |
685 | [convolutional]
686 | size=1
687 | stride=1
688 | pad=1
689 | filters=255
690 | activation=linear
691 |
692 |
693 | [yolo]
694 | mask = 3,4,5
695 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
696 | classes=80
697 | num=9
698 | jitter=.3
699 | ignore_thresh = .7
700 | truth_thresh = 1
701 | random=1
702 |
703 |
704 |
705 | [route]
706 | layers = -4
707 |
708 | [convolutional]
709 | batch_normalize=1
710 | filters=128
711 | size=1
712 | stride=1
713 | pad=1
714 | activation=leaky
715 |
716 | [upsample]
717 | stride=2
718 |
719 | [route]
720 | layers = -1, 36
721 |
722 |
723 |
724 | [convolutional]
725 | batch_normalize=1
726 | filters=128
727 | size=1
728 | stride=1
729 | pad=1
730 | activation=leaky
731 |
732 | [convolutional]
733 | batch_normalize=1
734 | size=3
735 | stride=1
736 | pad=1
737 | filters=256
738 | activation=leaky
739 |
740 | [convolutional]
741 | batch_normalize=1
742 | filters=128
743 | size=1
744 | stride=1
745 | pad=1
746 | activation=leaky
747 |
748 | [convolutional]
749 | batch_normalize=1
750 | size=3
751 | stride=1
752 | pad=1
753 | filters=256
754 | activation=leaky
755 |
756 | [yoloconvolutional]
757 | batch_normalize=1
758 | filters=128
759 | size=1
760 | stride=1
761 | pad=1
762 | activation=leaky
763 |
764 | [convolutional]
765 | batch_normalize=1
766 | size=3
767 | stride=1
768 | pad=1
769 | filters=256
770 | activation=leaky
771 |
772 | [convolutional]
773 | size=1
774 | stride=1
775 | pad=1
776 | filters=255
777 | activation=linear
778 |
779 |
780 | [yolo]
781 | mask = 0,1,2
782 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326
783 | classes=80
784 | num=9
785 | jitter=.3
786 | ignore_thresh = .7
787 | truth_thresh = 1
788 | random=1
789 |
--------------------------------------------------------------------------------
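The cfg above follows the standard darknet block syntax (the non-standard [yoloconvolutional] sections appear to be repository-specific markers handled by model/darknet.py). Below is a minimal, generic sketch of how such a file is typically parsed into a list of blocks; it is only an illustration, not the repository's own parser.

def parse_cfg(path):
    """Split a darknet-style cfg into a list of {'type': ..., key: value} blocks."""
    blocks, block = [], {}
    with open(path) as f:
        for raw in f:
            line = raw.strip()
            if not line or line.startswith('#'):        # skip blanks and comments
                continue
            if line.startswith('['):                    # a new section such as [convolutional]
                if block:
                    blocks.append(block)
                block = {'type': line[1:-1].strip()}
            else:
                key, value = line.split('=', 1)
                block[key.strip()] = value.strip()
    if block:
        blocks.append(block)
    return blocks

# e.g. blocks = parse_cfg('model/yolov3.cfg'); blocks[0]['type'] == 'net'
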
/dataset/data_loaderv2.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | YouRefIt referring image PyTorch dataset.
5 | Define and group batches of images and queries.
6 | Based on:
7 | https://github.com/zyang-ur/ReSC/blob/master/dataset/data_loader.py
8 | """
9 | from torchvision.transforms import Compose, ToTensor, Normalize
10 | import os
11 | import sys
12 | import cv2
13 | import json
14 | import uuid
15 | import tqdm
16 | import math
17 | import torch
18 | import random
19 | # import h5py
20 | import numpy as np
21 | import os.path as osp
22 | import scipy.io as sio
23 | import torch.utils.data as data
24 | from collections import OrderedDict
25 | sys.path.append('.')
26 | import operator
27 | import utils
28 | from utils import Corpus
29 |
30 | import argparse
31 | import collections
32 | import logging
33 | import json
34 | import re
35 |
36 | from pytorch_pretrained_bert.tokenization import BertTokenizer
37 | from pytorch_pretrained_bert.modeling import BertModel
38 | # from transformers import BertTokenizer,BertModel
39 | from utils.transforms import letterbox, random_affine
40 |
41 | sys.modules['utils'] = utils
42 |
43 | cv2.setNumThreads(0)
44 |
45 | def read_examples(input_line, unique_id):
46 | """Read a list of `InputExample`s from an input file."""
47 | examples = []
48 | # unique_id = 0
49 | line = input_line #reader.readline()
50 | # if not line:
51 | # break
52 | line = line.strip()
53 | text_a = None
54 | text_b = None
55 | m = re.match(r"^(.*) \|\|\| (.*)$", line)
56 | if m is None:
57 | text_a = line
58 | else:
59 | text_a = m.group(1)
60 | text_b = m.group(2)
61 | examples.append(
62 | InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))
63 | # unique_id += 1
64 | return examples
65 |
66 | ## Bert text encoding
67 | class InputExample(object):
68 | def __init__(self, unique_id, text_a, text_b):
69 | self.unique_id = unique_id
70 | self.text_a = text_a
71 | self.text_b = text_b
72 |
73 | class InputFeatures(object):
74 | """A single set of features of data."""
75 | def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids):
76 | self.unique_id = unique_id
77 | self.tokens = tokens
78 | self.input_ids = input_ids
79 | self.input_mask = input_mask
80 | self.input_type_ids = input_type_ids
81 |
82 | def convert_examples_to_features(examples, seq_length, tokenizer):
83 | """Loads a data file into a list of `InputBatch`s."""
84 | features = []
85 | for (ex_index, example) in enumerate(examples):
86 | tokens_a = tokenizer.tokenize(example.text_a)
87 |
88 | tokens_b = None
89 | if example.text_b:
90 | tokens_b = tokenizer.tokenize(example.text_b)
91 |
92 | if tokens_b:
93 | # Modifies `tokens_a` and `tokens_b` in place so that the total
94 | # length is less than the specified length.
95 | # Account for [CLS], [SEP], [SEP] with "- 3"
96 | _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3)
97 | else:
98 | # Account for [CLS] and [SEP] with "- 2"
99 | if len(tokens_a) > seq_length - 2:
100 | tokens_a = tokens_a[0:(seq_length - 2)]
101 | tokens = []
102 | input_type_ids = []
103 | tokens.append("[CLS]")
104 | input_type_ids.append(0)
105 | for token in tokens_a:
106 | tokens.append(token)
107 | input_type_ids.append(0)
108 | tokens.append("[SEP]")
109 | input_type_ids.append(0)
110 |
111 | if tokens_b:
112 | for token in tokens_b:
113 | tokens.append(token)
114 | input_type_ids.append(1)
115 | tokens.append("[SEP]")
116 | input_type_ids.append(1)
117 |
118 | input_ids = tokenizer.convert_tokens_to_ids(tokens)
119 |
120 | # The mask has 1 for real tokens and 0 for padding tokens. Only real
121 | # tokens are attended to.
122 | input_mask = [1] * len(input_ids)
123 |
124 | # Zero-pad up to the sequence length.
125 | while len(input_ids) < seq_length:
126 | input_ids.append(0)
127 | input_mask.append(0)
128 | input_type_ids.append(0)
129 |
130 | assert len(input_ids) == seq_length
131 | assert len(input_mask) == seq_length
132 | assert len(input_type_ids) == seq_length
133 | features.append(
134 | InputFeatures(
135 | unique_id=example.unique_id,
136 | tokens=tokens,
137 | input_ids=input_ids,
138 | input_mask=input_mask,
139 | input_type_ids=input_type_ids))
140 | return features
141 |
142 | class DatasetNotFoundError(Exception):
143 | pass
144 |
145 | class ReferDataset(data.Dataset):
146 | SUPPORTED_DATASETS = {
147 | 'yourefit': {'splits': ('train', 'val', 'test')},
148 | 'referit': {'splits': ('train', 'val', 'trainval', 'test')},
149 | 'unc': {
150 | 'splits': ('train', 'val', 'trainval', 'testA', 'testB'),
151 | 'params': {'dataset': 'refcoco', 'split_by': 'unc'}
152 | },
153 | 'unc+': {
154 | 'splits': ('train', 'val', 'trainval', 'testA', 'testB'),
155 | 'params': {'dataset': 'refcoco+', 'split_by': 'unc'}
156 | },
157 | 'gref': {
158 | 'splits': ('train', 'val'),
159 | 'params': {'dataset': 'refcocog', 'split_by': 'google'}
160 | },
161 | 'gref_umd': {
162 | 'splits': ('train', 'val', 'test'),
163 | 'params': {'dataset': 'refcocog', 'split_by': 'umd'}
164 | },
165 | 'flickr': {
166 | 'splits': ('train', 'val', 'test')}
167 | }
168 |
169 | def __init__(self, data_root, split_root='data', dataset='referit', imsize=256,
170 | transform=None, augment=False, device=None, return_idx=False, testmode=False,
171 | split='train', max_query_len=128, lstm=False, bert_model='bert-base-uncased'):
172 | self.images = []
173 | self.data_root = data_root
174 | self.split_root = split_root
175 | self.dataset = dataset
176 | self.imsize = imsize
177 | self.query_len = max_query_len
178 | self.lstm = lstm
179 | self.transform = transform
180 | self.testmode = testmode
181 | self.split = split
182 | self.device = device
183 | self.tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=True)
184 | self.augment=augment
185 | self.return_idx=return_idx
186 |
187 | if self.dataset == 'yourefit':
188 | self.dataset_root = osp.join(self.data_root, 'yourefit')
189 | self.im_dir = osp.join(self.dataset_root, 'images')
190 | elif self.dataset == 'referit':
191 | self.dataset_root = osp.join(self.data_root, 'referit')
192 | self.im_dir = osp.join(self.dataset_root, 'images')
193 | self.split_dir = osp.join(self.dataset_root, 'splits')
194 | elif self.dataset == 'flickr':
195 | self.dataset_root = osp.join(self.data_root, 'Flickr30k')
196 | self.im_dir = osp.join(self.dataset_root, 'flickr30k_images')
197 | else: ## refcoco, etc.
198 | self.dataset_root = osp.join(self.data_root, 'other')
199 | self.im_dir = osp.join(
200 | self.dataset_root, 'images', 'mscoco', 'images', 'train2014')
201 | self.split_dir = osp.join(self.dataset_root, 'splits')
202 |
203 | if not self.exists_dataset():
204 | print('Please download index cache to data folder')
205 | exit(0)
206 |
207 | dataset_path = osp.join(self.split_root, self.dataset)
208 | valid_splits = self.SUPPORTED_DATASETS[self.dataset]['splits']
209 |
210 | if self.lstm:
211 | self.corpus = Corpus()
212 | corpus_path = osp.join(dataset_path, 'corpus.pth')
213 | self.corpus = torch.load(corpus_path)
214 |
215 | if split not in valid_splits:
216 | raise ValueError(
217 | 'Dataset {0} does not have split {1}'.format(
218 | self.dataset, split))
219 |
220 | splits = [split]
221 | if self.dataset != 'referit':
222 | splits = ['train', 'val'] if split == 'trainval' else [split]
223 | for split in splits:
224 | imgset_file = '{0}_{1}full.pth'.format(self.dataset, split)
225 | imgset_path = osp.join(dataset_path, imgset_file)
226 | self.images += torch.load(imgset_path)
227 |
228 | def exists_dataset(self):
229 | return osp.exists(osp.join(self.split_root, self.dataset))
230 |
231 |
232 | def pull_item(self, idx):
233 | if self.dataset == 'flickr':
234 | img_file, bbox, phrase = self.images[idx]
235 | else:
236 | img_file, _, bbox, phrase, attri = self.images[idx]
237 | ## box format: to x1y1x2y2
238 | if not (self.dataset == 'referit' or self.dataset == 'flickr'):
239 | bbox = np.array(bbox, dtype=int)
240 | #bbox[2], bbox[3] = bbox[0]+bbox[2], bbox[1]+bbox[3]
241 | else:
242 | bbox = np.array(bbox, dtype=int)
243 |
244 | img_path = osp.join(self.im_dir, img_file)
245 | img = cv2.imread(img_path)
246 |
247 | htmapdir = self.im_dir.replace('images', 'paf')
248 | htmapfile = img_file.replace('.jpg', '_rendered.png')
249 | htmap_path = osp.join(htmapdir, htmapfile)
250 | htmap = cv2.imread(htmap_path)
251 |
252 | ht = np.asarray(htmap)
253 | ht = np.mean(ht, axis=2)
254 | ht = cv2.resize(ht, (256, 256))
255 |
256 | ptdir = self.im_dir.replace('images', 'depimg')
257 | ptfile = img_file #.replace('.jpg', '.jpeg')
258 | pt_path = osp.join(ptdir, ptfile)
259 | pt = cv2.imread(pt_path)
260 | # print(pt.shape)
261 | # exit()
262 | # pt = cv2.resize(pt, (256,256))
263 | # pt = np.reshape(pt, (3, 256, 256))
264 |
265 | gestdir = self.im_dir.replace('images','gest')
266 | gestfile = img_file.replace('.jpg' , '.json')
267 | gest_path = osp.join(gestdir,gestfile)
268 | gest = json.load(open(gest_path))
269 |
270 | ## duplicate channel if gray image
271 | if img.shape[-1] > 1:
272 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
273 | else:
274 | img = np.stack([img] * 3)
275 |
276 | return img, pt, ht, phrase, bbox, gest
277 | # return img, phrase, bbox, pt, ht
278 |
279 | def tokenize_phrase(self, phrase):
280 | return self.corpus.tokenize(phrase, self.query_len)
281 |
282 | def untokenize_word_vector(self, words):
283 | return self.corpus.dictionary[words]
284 |
285 | def __len__(self):
286 | return len(self.images)
287 |
288 | def __getitem__(self, idx):
289 | img, pt, ht, phrase, bbox, gest = self.pull_item(idx)
290 | # phrase = phrase.decode("utf-8").encode().lower()
291 |
292 | center_point = gest['candidate']
293 | try:
294 | center_point = np.asarray(center_point)[:,0:2]
295 | if center_point[0,0] != 0:
296 | center_point = center_point [0,:]
297 | elif center_point[1,0] != 0:
298 | center_point = center_point [1,:]
299 | else :
300 | center_point = np.asarray([256,256])
301 | # mask = center_point!=0
302 | # print(center_point.shape)
303 | # center_point = center_point[mask]
304 | # print(center_point.shape)
305 | # center_point = center_point [0:2,:]
306 | # center_point = np.mean(center_point , axis = 0)
307 | except IndexError:
308 | center_point = np.asarray([256,256])
309 |
310 | phrase = phrase.lower()
311 | if self.augment:
312 | augment_flip, augment_hsv, augment_affine = True,True,True
313 |
314 | ## torch's transform resize appears buggy here, so the resize is handled separately in advance
315 | h,w = img.shape[0], img.shape[1]
316 | if self.augment:
317 | ## random horizontal flip
318 | if augment_flip and random.random() > 0.5:
319 | img = cv2.flip(img, 1)
320 | pt = cv2.flip(pt , 1 )
321 | ht = cv2.flip(ht , 1 )
322 | center_point[0] = w - center_point[0] - 1
323 | bbox[0], bbox[2] = w-bbox[2]-1, w-bbox[0]-1
324 | phrase = phrase.replace('right','*&^special^&*').replace('left','right').replace('*&^special^&*','left')
325 | ## random intensity, saturation change
326 | if augment_hsv:
327 | fraction = 0.50
328 | img_hsv = cv2.cvtColor(cv2.cvtColor(img, cv2.COLOR_RGB2BGR), cv2.COLOR_BGR2HSV)
329 | S = img_hsv[:, :, 1].astype(np.float32)
330 | V = img_hsv[:, :, 2].astype(np.float32)
331 | a = (random.random() * 2 - 1) * fraction + 1
332 | if a > 1:
333 | np.clip(S, a_min=0, a_max=255, out=S)
334 | a = (random.random() * 2 - 1) * fraction + 1
335 | V *= a
336 | if a > 1:
337 | np.clip(V, a_min=0, a_max=255, out=V)
338 |
339 | img_hsv[:, :, 1] = S.astype(np.uint8)
340 | img_hsv[:, :, 2] = V.astype(np.uint8)
341 | img = cv2.cvtColor(cv2.cvtColor(img_hsv, cv2.COLOR_HSV2BGR), cv2.COLOR_BGR2RGB)
342 | img, _, ratio, dw, dh = letterbox(img, None, self.imsize)
343 | bbox[0], bbox[2] = bbox[0]*ratio+dw, bbox[2]*ratio+dw
344 | bbox[1], bbox[3] = bbox[1]*ratio+dh, bbox[3]*ratio+dh
345 | ## random affine transformation
346 | if augment_affine:
347 | img, _, bbox, M, center_point, pt, gt = random_affine(center_point, pt, img, None, bbox, \
348 | degrees=(-5, 5), translate=(0.10, 0.10), scale=(0.90, 1.10))
349 | else: ## should be inference, or specified training
350 | img, _, ratio, dw, dh = letterbox(img, None, self.imsize)
351 | bbox[0], bbox[2] = bbox[0]*ratio+dw, bbox[2]*ratio+dw
352 | bbox[1], bbox[3] = bbox[1]*ratio+dh, bbox[3]*ratio+dh
353 | gt = np.asarray(torch.zeros([512,512]))
354 | gt[int(bbox[1]):int(bbox[3]), int(bbox[0]):int(bbox[2])] = 1
355 | ## Norm, to tensor
356 | # print(img.shape)
357 | if self.transform is not None:
358 | img = self.transform(img)
359 | pt = self.t(pt)
360 | #print(ht.shape)
361 | ht = self.t(ht)
362 | #print(ht.shape)
363 | if self.lstm:
364 | phrase = self.tokenize_phrase(phrase)
365 | word_id = phrase
366 | # word_mask = np.zeros(word_id.shape)
367 | word_mask = np.array(word_id>0,dtype=int)
368 | else:
369 | ## encode phrase to bert input
370 | examples = read_examples(phrase, idx)
371 | features = convert_examples_to_features(
372 | examples=examples, seq_length=self.query_len, tokenizer=self.tokenizer)
373 | word_id = features[0].input_ids
374 | word_mask = features[0].input_mask
375 |
376 | if self.testmode:
377 | return img, pt, ht, np.array(word_id, dtype=int), np.array(word_mask, dtype=int), \
378 | np.array(bbox, dtype=np.float32), np.array(ratio, dtype=np.float32), \
379 | np.array(dw, dtype=np.float32), np.array(dh, dtype=np.float32), self.images[idx][0], center_point, gt
380 | else:
381 | return img, pt, ht, np.array(word_id, dtype=int), np.array(word_mask, dtype=int), \
382 | np.array(bbox, dtype=np.float32), center_point, gt
--------------------------------------------------------------------------------
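A minimal sketch of the text path used by __getitem__ above, assuming the snippet is run from the repository root with the repository's dependencies (including pytorch_pretrained_bert) installed; the first call downloads the BERT vocabulary, and the phrase is a made-up example.

from pytorch_pretrained_bert.tokenization import BertTokenizer
from dataset.data_loaderv2 import read_examples, convert_examples_to_features

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
examples = read_examples('the red chair next to the window', unique_id=0)
features = convert_examples_to_features(examples=examples, seq_length=128, tokenizer=tokenizer)
word_id = features[0].input_ids      # [CLS] ... [SEP], zero-padded to 128 ids
word_mask = features[0].input_mask   # 1 for real tokens, 0 for padding
print(len(word_id), sum(word_mask))
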
/utils/temp.py:
--------------------------------------------------------------------------------
1 | class grounding_model_multihop(nn.Module):
2 | def __init__(self, corpus=None, emb_size=256, jemb_drop_out=0.1, bert_model='bert-base-uncased', \
3 | NFilm=2, fusion='prod', intmd=False, mstage=False, convlstm=False, \
4 | coordmap=True, leaky=False, dataset=None, bert_emb=False, tunebert=False, use_sal=False, use_paf=False):
5 | super(grounding_model_multihop, self).__init__()
6 | self.coordmap = coordmap
7 | self.emb_size = emb_size
8 | self.NFilm = NFilm
9 | self.intmd = intmd
10 | self.mstage = mstage
11 | self.convlstm = convlstm
12 | self.tunebert = tunebert
13 | self.use_sal = use_sal
14 | self.use_paf = use_paf
15 | self.tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=True)
16 |
17 | if bert_model=='bert-base-uncased':
18 | self.textdim=768
19 | else:
20 | self.textdim=1024
21 | ## Visual model
22 | self.visumodel = Darknet(config_path='./model/yolov3.cfg')
23 | self.visumodel.load_weights('./saved_models/yolov3.weights')
24 | ## Text model
25 | self.textmodel = BertModel.from_pretrained(bert_model)
26 |
27 | self.mapping_visu = ConvBatchNormReLU(512 if self.convlstm else 256, emb_size, 1, 1, 0, 1, leaky=leaky)
28 |
29 | self.mapping_lang = torch.nn.Sequential(
30 | nn.Linear(self.textdim, emb_size),
31 | nn.ReLU(),
32 | nn.Dropout(jemb_drop_out),
33 | nn.Linear(emb_size, emb_size),
34 | nn.ReLU(),)
35 |
36 | textdim=emb_size
37 |
38 | self.film = FiLMedConvBlock_multihop(NFilm=3,textdim=textdim,visudim=emb_size,\
39 | emb_size=emb_size,fusion=fusion,intmd=(intmd or mstage or convlstm))
40 |
41 | self.film1 = FiLMedConvBlock_multihop(NFilm=1,textdim=textdim,visudim=emb_size,\
42 | emb_size=emb_size,fusion=fusion,intmd=(intmd or mstage or convlstm))
43 |
44 | if self.convlstm:
45 | output_emb = emb_size
46 | self.global_out = ConvLSTM(input_size=(32, 32),
47 | input_dim=emb_size,
48 | hidden_dim=[emb_size],
49 | kernel_size=(1, 1),
50 | num_layers=1,
51 | batch_first=True,
52 | bias=True,
53 | return_all_layers=False)
54 |
55 | self.fcn_out = torch.nn.Sequential(
56 | ConvBatchNormReLU(output_emb+8, output_emb//2, 1, 1, 0, 1, leaky=leaky),
57 | nn.Conv2d(output_emb//2, 3*5, kernel_size=1))
58 |
59 | self.fcn_out1 = torch.nn.Sequential(
60 | ConvBatchNormReLU(2*output_emb+8, output_emb//2, 1, 1, 0, 1, leaky=leaky),
61 | nn.Conv2d(output_emb//2, 6*5, kernel_size=1))
62 | #self.vl_transformer = VisionLanguageEncoder(d_model=512, nhead=8, num_encoder_layers=6,num_decoder_layers=6, dim_feedforward=2048, dropout=0.1,activation="relu", normalize_before=False)
63 | '''
64 | #transformer
65 | decoder_layer = TransformerDecoderLayer(512, 8, 2048, 0.1, "relu", False)
66 | decoder_norm = nn.LayerNorm(512)
67 | self.decoder = TransformerDecoder(decoder_layer, 6, decoder_norm, return_intermediate=True,d_model=512)
68 |
69 | encoder_layer = TransformerEncoderLayer(512, 8, 2048, 0.1, "relu", False)
70 | encoder_norm = None
71 | self.encoder = TransformerEncoder(encoder_layer, 6, encoder_norm)
72 | '''
73 |
74 | ## Mapping module
75 | '''
76 | for i in self.parameters():
77 | i.requires_grad=False
78 | '''
79 | self.mapping_visu2 = ConvBatchNormReLU(512 if self.convlstm else 256+1, emb_size, 3, 1, 1, 1, leaky=leaky)
80 | self.mapping_visu1 = ConvBatchNormReLU(512+4 if self.convlstm else 256+1, emb_size, 3, 1, 1, 1, leaky=leaky)
81 | self.mp1 = nn.MaxPool2d(16, stride=16)
82 | self.mp2 = nn.AvgPool2d(4, stride=4)
83 | self.mp3 = nn.AvgPool2d(16, stride=16)
84 | self.mp4 = nn.AvgPool2d(2, stride=2)
85 |
86 | self.mapbodyfeature = MLP(512,512,512,2)
87 |
88 | self.linecode = MLP(512,128,3,2)
89 |
90 | self.poscode = MLP(3,128,512,2)
91 |
92 |
93 | #self.pattention = nn.Conv2d(512,1,1)
94 |
95 | #self.l_embed = nn.Embedding(22, 512)
96 |
97 | ## output head
98 |
99 | #self.maplast = ConvBatchNormReLU(output_emb+8, output_emb, 1, 1, 0, 1, leaky=leaky)
100 |
101 | output_emb = emb_size
102 | if self.mstage:
103 | self.fcn_out = nn.ModuleDict()
104 | modules = OrderedDict()
105 | for n in range(0,NFilm):
106 | modules["out%d"%n] = torch.nn.Sequential(
107 | ConvBatchNormReLU(output_emb, output_emb//2, 1, 1, 0, 1, leaky=leaky),
108 | nn.Conv2d(output_emb//2, 9*5, kernel_size=1))
109 | self.fcn_out.update(modules)
110 | else:
111 | if self.intmd:
112 | output_emb = emb_size*NFilm
113 | if self.use_sal:
114 | self.conv1 = nn.Conv2d(1, 2, 4, 4)
115 | self.conv15 = nn.Conv2d(2, 4, 2, 2)
116 | self.conv2 = nn.Conv2d(4, 8, 2, 2)
117 | else:
118 | self.fcn_out = torch.nn.Sequential(
119 | ConvBatchNormReLU(output_emb+8, output_emb//2, 1, 1, 0, 1, leaky=leaky),
120 | nn.Conv2d(output_emb//2, 3*5, kernel_size=1))
121 | self.fcn_out1 = torch.nn.Sequential(
122 | ConvBatchNormReLU(2*output_emb+8, output_emb//2, 1, 1, 0, 1, leaky=leaky),
123 | nn.Conv2d(output_emb//2, 6*5, kernel_size=1))
124 |
125 | def _reset_parameters(self):
126 | for p in self.parameters():
127 | if p.dim() > 1:
128 | nn.init.xavier_uniform_(p)
129 |
130 | def forward(self, image, seg, ht, dp, word_id, word_mask):
131 | ## Visual Module
132 | batch_size = image.size(0)
133 | '''
134 | memory_mask = word_mask.view(batch_size,1,-1,1)
135 | memory_mask = memory_mask.repeat(1,8,1,1024)
136 | membed_mask = torch.ones(batch_size, 8, 3, 1024).cuda()
137 | memory_mask = torch.cat((memory_mask,membed_mask),dim=2)
138 | memory_mask = word_mask.view(batch_size*8,23,1024)
139 | print(memory_mask.size())
140 |
141 | tgt_key_padding_mask = word_mask
142 | embed_mask = torch.ones(batch_size,3).cuda()
143 | tgt_key_padding_mask = torch.cat((tgt_key_padding_mask,embed_mask),dim=1)
144 | tgt_key_padding_mask = tgt_key_padding_mask.bool()
145 | '''
146 | dp = dp.unsqueeze(1)
147 | seg = seg.unsqueeze(1)
148 | dp = dp.type(torch.FloatTensor).cuda()
149 | seg = seg.type(torch.FloatTensor).cuda()
150 |
151 | distxy = distancexy.repeat(batch_size,1,1,1).cuda()
152 | dist = torch.cat([distxy,dp],dim=1)
153 |
154 | seeg = seg.view(batch_size,1,-1)
155 | seeg = F.normalize(seeg,dim=2,p=1)
156 |
157 | dist = dist.view(batch_size,3,-1)
158 | dist = F.normalize(dist,dim=2,p=1)
159 |
160 | #===============================#
161 | distfeature = dist.permute(0,2,1)
162 | #distfeature = distfeature.permute(1,0,2)
163 | #distfeature = self.posecode(distfeature)
164 | #distfeature = distfeature.permute(1,0,2)
165 |
166 | #===============================#
167 | bodypositionseg = torch.mul(seeg,dist)
168 | bodyposition = torch.sum(bodypositionseg,dim=2)
169 | bodyposition = bodyposition.view(batch_size,1,3)
170 |
171 | #bodypfeature = self.poscode(bodyposition)
172 | #bodypfeature = bodypfeature.view(batch_size,1,-1)
173 | #bodypfeature = bodyposition.permute(1,0,2)
174 | #bodypfeature = gen_sineembed_for_position(bodypfeature*512)
175 | #bodypfeature = bodypfeature.permute(1,0,2)
176 | #bodypfeature = bodypfeature.view(batch_size,1,-1)
177 |
178 | #restdistfeature = distfeature - bodypfeature
179 | #restdistfeature = F.normalize(restdistfeature,dim=2,p=2)
180 |
181 | #restdistfeature = restdistfeature.permute(0,2,1)
182 | #restdistfeature = restdistfeature.view(batch_size,512,512,512)
183 |
184 | #distfeature = distfeature.permute(0,2,1)
185 | #distfeature = distfeature.view(batch_size,512,512,512)
186 |
187 | bodyp = bodyposition.view(batch_size,3,1)
188 | relatepos = torch.sub(dist,bodyp)
189 | relatepos = relatepos.view(batch_size,3,-1)
190 | relatepos = relatepos.permute(0,2,1)
191 | relateposfeature = self.poscode(relatepos)
192 | relateposfeature = F.normalize(relateposfeature,dim=2,p=2)
193 | relateposfeature = relateposfeature.permute(0,2,1)
194 | relateposfeature = relateposfeature.view(batch_size,512,512,512)
195 |
196 | relatepos = F.normalize(relatepos,dim=2,p=2)
197 | relatepos = relatepos.permute(0,2,1)
198 | relatepos = relatepos.view(batch_size,3,512,512)
199 | #restdist = restdist * seg
200 |
201 | #====================================================#
202 | raw_fvisu = self.visumodel(image)
203 | raw_fvisu = raw_fvisu[1]
204 | bodyinfo = raw_fvisu
205 |
206 | #bodypfeature = bodypfeature.view(batch_size,-1)
207 | #compute position informations
208 | ht = ht.type(torch.FloatTensor).cuda()
209 | ht = ht.view(batch_size,-1,3)
210 | ht = ht.permute(0,2,1)
211 | ht = ht.view(batch_size,3,512,512)
212 | ht = torch.mean(ht,dim=1)
213 | ht = ht.view(batch_size,1,512,512)
214 | ht = self.mp1(ht)
215 |
216 | rd = self.mp3(relatepos)
217 |
218 | bodyinfo = torch.cat((bodyinfo, ht),1)
219 | bodyinfo = torch.cat((bodyinfo, rd),1)
220 |
221 | bodyinfo = self.mapping_visu1(bodyinfo)
222 | bodyinfo = self.mp2(bodyinfo)
223 | bodyinfo = self.mapping_visu2(bodyinfo)
224 | bodyinfo = self.mp2(bodyinfo)
225 | bodyinfo = self.mp4(bodyinfo)
226 |
227 | bodyfeature = bodyinfo.view(batch_size,-1)
228 | #bodyfeature = torch.cat([bodyinfo,bodypfeature],dim=1)
229 | #bodypfeature = bodypfeature.view(batch_size,1,-1)
230 | bodyfeature = self.mapbodyfeature(bodyfeature).view(batch_size,-1)
231 | bodyfeature = F.normalize(bodyfeature,dim=1,p=2)
232 |
233 | line = self.linecode(bodyfeature)
234 | line = line.view(batch_size,1,3)
235 |
236 | lor = line.view(batch_size,3)
237 |
238 | '''
239 | word_id = []
240 | word_mask = []
241 | for uu in range(batch_size):
242 | if(lor[uu,0]>0):
243 | word_idt = word_ida[uu,:]
244 | word_idt = word_idt.unsqueeze(0)
245 | word_id.append(word_idt)
246 |
247 | word_maskt = word_maska[uu,:]
248 | word_maskt = word_maskt.unsqueeze(0)
249 | word_mask.append(word_maskt)
250 | else:
251 | word_idt = word_idb[uu,:]
252 | word_idt = word_idt.unsqueeze(0)
253 | word_id.append(word_idt)
254 |
255 | word_maskt = word_maskb[uu,:]
256 | word_maskt = word_maskt.unsqueeze(0)
257 | word_mask.append(word_maskt)
258 |
259 | word_id = torch.cat(word_id,dim=0).contiguous()
260 | word_mask = torch.cat(word_mask,dim=0).contiguous()
261 | '''
262 |
263 | relatepos = relatepos.view(batch_size,3,512,512)
264 | relateposfeature = relateposfeature.view(batch_size,512,512,512)
265 |
266 | relatepos = relatepos.view(batch_size,3,-1)
267 | relateposfeature = relateposfeature.view(batch_size,512,-1)
268 | line = F.normalize(line,dim=2,p=2)
269 | #pt3 = torch.matmul(line,restdist)
270 |
271 | bodyfeature = bodyfeature.view(batch_size,1,512)
272 | pt512 = torch.matmul(bodyfeature,relateposfeature)
273 |
274 | #pt3 = pt3.view(batch_size,1,512,512)
275 | #attention1 = pt3.view(batch_size,1,512,512)
276 | attention = pt512.view(batch_size,1,512,512)
277 |
278 | #===============================================#
279 | seg = seg.clamp(max=1)
280 | seg = -seg+1
281 | attention = attention * seg
282 | attention = attention.view(batch_size,1,-1)
283 | attention = F.softmax(attention,dim=2)
284 | attention = attention.view(batch_size,1,512,512)
285 |
286 | pt = attention
287 | pt = pt.view(batch_size,1,512,512)
288 | fvisu = self.mapping_visu(raw_fvisu)
289 |
290 | #restdistfeature = restdistfeature.view(batch_size,512,512,512)
291 | #raw_fvisu = fvisu
292 |
293 |
294 |
295 |
296 |
297 |
298 |
299 |
300 |
301 |
302 |
303 |
304 |
305 |
306 |
307 |
308 |
309 |
310 |
311 |
312 |
313 |
314 |
315 |
316 |
317 |
318 | def attt_loss(line,relatepos,attention, bbox, eps=1e-3):
319 | """This function computes the Kullback-Leibler divergence between ground
320 | truth saliency maps and their predictions. Values are first divided by
321 | their sum for each image to yield a distribution that adds to 1.
322 | Args:
323 | y_true (tensor, float32): A 4d tensor that holds the ground truth
324 | saliency maps with values between 0 and 255.
325 | y_pred (tensor, float32): A 4d tensor that holds the predicted saliency
326 | maps with values between 0 and 1.
327 | eps (scalar, float, optional): A small factor to avoid numerical
328 | instabilities. Defaults to 1e-7.
329 | Returns:
330 | tensor, float32: A 0D tensor that holds the averaged error.
331 | """
332 | loss = 0
333 | batch = line.size(0)
334 | bbox = bbox.int()
335 | for ii in range(batch):
336 |
337 | region1 = attention[ii,0,bbox[ii][0]:max(bbox[ii][2],bbox[ii][0]+1),bbox[ii][1]:max(bbox[ii][3],bbox[ii][1]+1)].contiguous()
338 | region1.view(-1)
339 | region1 = torch.sum(region1)
340 |
341 | relatepos = relatepos.view(batch,3,512,512)
342 | region2 = relatepos[ii,:,bbox[ii][0]:max(bbox[ii][2],bbox[ii][0]+1),bbox[ii][1]:max(bbox[ii][3],bbox[ii][1]+1)].contiguous()
343 | region2 = region2.view(3,-1)
344 | region2 = torch.mean(region2,dim=1)
345 | region2 = region2.view(3)
346 |
347 | region2 = torch.sum(torch.abs(region2-line[ii]))
348 | #print(region)
349 | loss += region2+1-region1 #-region1
350 | loss = loss/batch
351 | return loss
352 |
353 | def depth_loss(input, dp, bbox, gi, gj, best_n_list):
354 | mseloss = torch.nn.MSELoss(reduction='mean' )
355 | batch = input.size(0)
356 | dp = dp.view(batch,-1).float()
357 | dpmax,_ = torch.max(dp,dim=1)
358 | dpmax = dpmax.view(batch,-1).float()
359 | bbox = bbox.int()
360 | dp = dp/dpmax
361 | loss = 0
362 | dp = dp.view(batch,512,512)
363 |
364 | for ii in range(batch):
365 | pred_depth = F.sigmoid(input[ii,best_n_list[ii],-1,gj[ii],gi[ii]])
366 | target_bbox = dp[ii,bbox[ii][0]:max(bbox[ii][2],bbox[ii][0]+1),bbox[ii][1]:max(bbox[ii][3],bbox[ii][1]+1)].contiguous()
367 | target_bbox = target_bbox.view(-1)
368 | target_bbox = torch.mean(target_bbox,dim=0)
369 | loss += mseloss(pred_depth,target_bbox)
370 | loss = loss/batch
371 | loss = loss.float()
372 | return loss
373 |
374 |
375 |
376 |
377 |
378 |
379 |
--------------------------------------------------------------------------------
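Both `attt_loss` and `depth_loss` above reduce a dense (B, 1, 512, 512) map over each sample's ground-truth box before comparing the result against a target. Below is a minimal, self-contained sketch of that shared box-region step; `box_region_reduce` is a hypothetical helper written for illustration and is not part of loss.py.

import torch

def box_region_reduce(maps, boxes, mode="sum"):
    """maps: (B, 1, H, W) float tensor; boxes: (B, 4) int tensor in x1y1x2y2 order."""
    boxes = boxes.int()
    out = []
    for ii in range(maps.size(0)):
        x1, y1, x2, y2 = boxes[ii].tolist()
        # the max(...) guards keep the slice non-empty for degenerate boxes,
        # mirroring the indexing used in attt_loss and depth_loss
        region = maps[ii, 0, x1:max(x2, x1 + 1), y1:max(y2, y1 + 1)].reshape(-1)
        out.append(region.sum() if mode == "sum" else region.mean())
    return torch.stack(out)

if __name__ == "__main__":
    maps = torch.rand(2, 1, 512, 512)
    boxes = torch.tensor([[10, 10, 50, 60], [100, 120, 200, 220]])
    print(box_region_reduce(maps, boxes, "sum").shape)  # torch.Size([2])

attt_loss sums the attention inside the box (and wants that sum large), while depth_loss averages the normalised depth inside the box and regresses the predicted depth onto that mean with an MSE term.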
/model/grounding_model.py:
--------------------------------------------------------------------------------
1 | from collections import OrderedDict
2 |
3 | import torch
4 | import torch.nn as nn
5 | import torch.nn.functional as F
6 | import torch.utils.model_zoo as model_zoo
7 | from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
8 | from torch.utils.data.distributed import DistributedSampler
9 |
10 | from .darknet import *
11 | from .convlstm import *
12 | from .modulation import *
13 |
14 | import argparse
15 | import collections
16 | import logging
17 | import json
18 | import re
19 | import time
20 | from tqdm import tqdm
21 | from pytorch_pretrained_bert.tokenization import BertTokenizer
22 | from pytorch_pretrained_bert.modeling import BertModel
23 |
24 | def generate_coord(batch, height, width):
25 | # coord = Variable(torch.zeros(batch,8,height,width).cuda())
26 | xv, yv = torch.meshgrid([torch.arange(0,height), torch.arange(0,width)])
27 | xv_min = (xv.float()*2 - width)/width
28 | yv_min = (yv.float()*2 - height)/height
29 | xv_max = ((xv+1).float()*2 - width)/width
30 | yv_max = ((yv+1).float()*2 - height)/height
31 | xv_ctr = (xv_min+xv_max)/2
32 | yv_ctr = (yv_min+yv_max)/2
33 | hmap = torch.ones(height,width)*(1./height)
34 | wmap = torch.ones(height,width)*(1./width)
35 | coord = torch.autograd.Variable(torch.cat([xv_min.unsqueeze(0), yv_min.unsqueeze(0),\
36 | xv_max.unsqueeze(0), yv_max.unsqueeze(0),\
37 | xv_ctr.unsqueeze(0), yv_ctr.unsqueeze(0),\
38 | hmap.unsqueeze(0), wmap.unsqueeze(0)], dim=0).cuda())
39 | coord = coord.unsqueeze(0).repeat(batch,1,1,1)
40 | return coord
41 |
42 | class grounding_model_multihop(nn.Module):
43 | def __init__(self, corpus=None, emb_size=256, jemb_drop_out=0.1, bert_model='bert-base-uncased', \
44 | NFilm=2, fusion='prod', intmd=False, mstage=False, convlstm=False, \
45 | coordmap=True, leaky=False, dataset=None, bert_emb=False, tunebert=False, use_sal=False, use_paf=False):
46 | super(grounding_model_multihop, self).__init__()
47 | self.coordmap = coordmap
48 | self.emb_size = emb_size
49 | self.NFilm = NFilm
50 | self.intmd = intmd
51 | self.mstage = mstage
52 | self.convlstm = convlstm
53 | self.tunebert = tunebert
54 | self.use_sal = use_sal
55 | self.use_paf = use_paf
56 | if bert_model=='bert-base-uncased':
57 | self.textdim=768
58 | else:
59 | self.textdim=1024
60 | ## Visual model
61 | self.visumodel = Darknet(config_path='model/yolov3.cfg')
62 | self.visumodel.load_weights('saved_models/yolov3.weights')
63 | self.trans = CLIPVisionTransformer(512,8,256)
64 |
65 | self.visumodel4t = Darknetfort(config_path='model/yolov3.cfg')
66 | self.visumodel4t.load_weights('saved_models/yolov3.weights')
67 |
68 | ## Text model
69 | self.textmodel = BertModel.from_pretrained(bert_model)
70 |
71 | ## Mapping module
72 | if self.use_paf:
73 | self.mapping_visu = ConvBatchNormReLU(512+3+1 if self.convlstm else 256+3, emb_size, 1, 1, 0, 1, leaky=leaky)
74 | self.mp1 = nn.MaxPool2d(16, stride=16)
75 | self.mp2 = nn.AvgPool2d(16, stride=16)
76 | else:
77 | self.mapping_visu = ConvBatchNormReLU(512 if self.convlstm else 256, emb_size, 1, 1, 0, 1, leaky=leaky)
78 |
79 | self.mapping_lang = torch.nn.Sequential(
80 | nn.Linear(self.textdim, emb_size),
81 | nn.ReLU(),
82 | nn.Dropout(jemb_drop_out),
83 | nn.Linear(emb_size, emb_size),
84 | nn.ReLU(),)
85 | self.mp3 = nn.MaxPool2d(8, stride=8)
86 | self.mp4 = nn.AvgPool2d(8, stride=8)
87 | self.mapping_visuf = ConvBatchNormReLU(256 + 4 +1, 256, 1, 1,0, 1, leaky=leaky)
88 | textdim=emb_size
89 | self.film = FiLMedConvBlock_multihop(NFilm=NFilm,textdim=textdim,visudim=emb_size,\
90 | emb_size=emb_size,fusion=fusion,intmd=(intmd or mstage or convlstm))
91 |
92 | ## output head
93 | output_emb = emb_size
94 | self.loc_avg = nn.AvgPool2d(16, stride=16)
95 | self.pt_avg = nn.AvgPool2d(16, stride=16)
96 | self.ht_avg = nn.AvgPool2d(16, stride=16)
97 | self.vis_map = ConvBatchNormReLU(512, 128, 3, 1, 1, 1, leaky=leaky)
98 | self.locationpool = torch.nn.Sequential(
99 | nn.AvgPool2d(8, stride=8),
100 | #ConvBatchNormReLU(3, 256, 1, 1, 0, 1, leaky=leaky)
101 | )
102 | self.linear1 = torch.nn.Sequential(
103 | ConvBatchNormReLU(256,128,8, 8, 0, 1, leaky=leaky),
104 | ConvBatchNormReLU(128,32,9, 1, 4, 1, leaky=leaky),
105 | nn.MaxPool2d(8, stride=8)
106 | )
107 | self.linear2 = nn.Linear(32, 3)
108 | self.language = nn.Linear(512, 1)
109 | self.stage0 = torch.nn.Sequential(
110 | ConvBatchNormReLU(135, 1024, 1, 1, 0, 1, leaky=leaky)
111 | )
112 | self.stage1 = torch.nn.Sequential(
113 | ConvBatchNormReLU(1024, 1, 9, 1, 4, 1, leaky=leaky),
114 |             torch.nn.Upsample(size=512, mode='bilinear', align_corners=True),
115 |         )
116 |         self.upsample = torch.nn.Upsample(size=512, mode='bilinear', align_corners=True)
117 | self.tohyper = torch.nn.Sequential(
118 | ConvBatchNormReLU(768, 512, 1, 1, 0, 1, leaky=leaky)
119 | )
120 | self.word_projection = nn.Sequential(nn.Linear(512, 256),
121 | nn.ReLU(),
122 | nn.Dropout(0.1),
123 | nn.Linear(256, 256),
124 | nn.ReLU())
125 | #self.tstage0 = torch.nn.Con
126 | self.center = torch.nn.Sequential(
127 | nn.AvgPool2d(16, stride=16)
128 | )
129 | if self.mstage:
130 | self.fcn_out = nn.ModuleDict()
131 | modules = OrderedDict()
132 | for n in range(0,NFilm):
133 | modules["out%d"%n] = torch.nn.Sequential(
134 | ConvBatchNormReLU(output_emb, output_emb//2, 1, 1, 0, 1, leaky=leaky),
135 | nn.Conv2d(output_emb//2, 9*5, kernel_size=1))
136 | self.fcn_out.update(modules)
137 | else:
138 | if self.intmd:
139 | output_emb = emb_size*NFilm
140 | if self.convlstm:
141 | output_emb = emb_size
142 | self.global_out = ConvLSTM(input_size=(32, 32),
143 | input_dim=emb_size,
144 | hidden_dim=[emb_size],
145 | kernel_size=(1, 1),
146 | num_layers=1,
147 | batch_first=True,
148 | bias=True,
149 | return_all_layers=False)
150 | if self.use_sal:
151 | self.conv1 = torch.nn.Sequential(
152 | nn.AvgPool2d(16)
153 | )
154 | self.fcn_out = torch.nn.Sequential(
155 | ConvBatchNormReLU(output_emb+1, output_emb//2, 1, 1, 0, 1, leaky=leaky),
156 | nn.Conv2d(output_emb//2, 9*5, kernel_size=1))
157 | else:
158 | self.fcn_out = torch.nn.Sequential(
159 | ConvBatchNormReLU(output_emb, output_emb//2, 1, 1, 0, 1, leaky=leaky),
160 | nn.Conv2d(output_emb//2, 9*5, kernel_size=1))
161 | self.test = Vector(512,16,512)
162 | self.vectmaxp = torch.nn.Sequential(
163 | nn.MaxPool2d(16, stride=16),
164 | nn.ReLU()
165 | )
166 | self.ptmax = torch.nn.Sequential(
167 | #nn.MaxPool2d(8, stride=8),
168 | nn.ReLU()
169 | )
170 | self.draw = torch.nn.Sequential(
171 | nn.ReLU()
172 | )
173 | self.softmax = nn.Softmax(dim=-1)
174 | self.linear = nn.Linear(256, 1)
175 | def forward(self, image, dp, ht, word_id, word_mask, gest, bbox, gt, amask, sal,phrase):
176 | ## Visual Module
177 |
178 | batch_size = image.size(0)
179 |         out = self.visumodel(image)
180 |         intemide = out[1]  # reuse the backbone output computed above instead of a second forward pass
181 | gest = gest.type(torch.FloatTensor).cuda()
182 | amask = amask.type(torch.FloatTensor).cuda()
183 | dp = torch.mul(amask,dp)
184 | dp = F.normalize(dp.type(torch.FloatTensor).view(batch_size,-1),dim=1,p=float('INF')).view(batch_size,1,512,512).cuda() #* 1.5
185 |
186 |
187 | raw_fvisu4t = self.visumodel4t(image)
188 |
189 | xv, yv = torch.meshgrid([torch.arange(0,512), torch.arange(0,512)])
190 | xv = (xv / 512 ).unsqueeze(0).unsqueeze(0).repeat(batch_size,1,1,1).cuda()
191 | yv = (yv / 512 ).unsqueeze(0).unsqueeze(0).repeat(batch_size,1,1,1).cuda()
192 | xyz = torch.cat( (xv,yv,dp), dim = 1).cuda()
193 |
194 | gestfraw = torch.mul(gest , amask)
195 |
196 | gestf = F.normalize(gestfraw.view(batch_size,1,-1),dim=2,p=1).view(batch_size,1,512,512).repeat(1,3,1,1)
197 | body = torch.mul(gestf , xyz).view(batch_size, 3, -1)
198 | body = torch.sum(body,dim=2)
199 |
200 | gtbo = F.normalize(gt.view(batch_size,1,-1),dim=2,p=1).view(batch_size,1,512,512).repeat(1,3,1,1)
201 | target = torch.mul(gtbo , xyz).view(batch_size, 3, -1)
202 | target = torch.sum(target,dim=2) - body
203 |
204 | xyz_cent = xyz - body.unsqueeze(2).unsqueeze(2).repeat(1,1,512,512)
205 |
206 | t = self.test(torch.cat( (ht.type(torch.FloatTensor).cuda(),xyz_cent) ,dim = 1))
207 | vloss = 1 - torch.cosine_similarity(t, target, dim=1)
208 | vectmap = torch.cosine_similarity(xyz_cent , t.unsqueeze(2).unsqueeze(2).repeat(1,1,512,512) , dim = 1).unsqueeze(1) - 0.7
209 |
210 | # cv2.imwrite('output/'+rank+'img.jpg' , imagedraw*255)
211 | norm = torch.max(gestfraw.reshape(batch_size,-1), dim=1, keepdim = True)[0].detach().unsqueeze(2).unsqueeze(2).repeat(1,1,512,512)
212 | gestfraw = (gestfraw.unsqueeze(1))/norm
213 | maxgestvect = self.ptmax(vectmap ) #+self.ptmax(gestfraw)
214 | maxvecter = self.vectmaxp(vectmap )
215 | mid = torch.cat((raw_fvisu4t[2], self.mp3(ht.type(torch.FloatTensor).cuda()), self.mp4(dp.type(torch.FloatTensor).cuda()), self.mp4(vectmap.type(torch.FloatTensor).cuda())),1)
216 | #
217 | mid = self.mapping_visuf(mid)
218 | raw_fvisu = torch.cat((intemide, self.mp1(ht.type(torch.FloatTensor).cuda()), self.mp2(dp.type(torch.FloatTensor).cuda())),1)
219 | fvisu = self.mapping_visu(raw_fvisu) * maxvecter.repeat(1,512,1,1).detach()
220 | raw_fvisu = F.normalize(fvisu, p=2, dim=1)
221 | size = (raw_fvisu.shape[2])
222 |
223 | ## Language Module
224 | all_encoder_layers, _ = self.textmodel(word_id, \
225 | token_type_ids=None, attention_mask=word_mask)
226 | ## Sentence feature at the first position [cls]
227 | raw_flang = (all_encoder_layers[-1][:,0,:] + all_encoder_layers[-2][:,0,:]\
228 | + all_encoder_layers[-3][:,0,:] + all_encoder_layers[-4][:,0,:])/4
229 | raw_fword = (all_encoder_layers[-1] + all_encoder_layers[-2]\
230 | + all_encoder_layers[-3] + all_encoder_layers[-4])/4
231 | if not self.tunebert:
232 | ## fix bert during training
233 | # raw_flang = raw_flang.detach()
234 | hidden = raw_flang.detach()
235 | raw_fword = raw_fword.detach()
236 |
237 | fword = Variable(torch.zeros(raw_fword.shape[0], raw_fword.shape[1], self.emb_size).cuda())
238 | for ii in range(raw_fword.shape[0]):
239 | ntoken = (word_mask[ii] != 0).sum()
240 | fword[ii,:ntoken,:] = F.normalize(self.mapping_lang(raw_fword[ii,:ntoken,:]), p=2, dim=1)
241 | ## [CLS], [SEP]
242 | # fword[ii,1:ntoken-1,:] = F.normalize(self.mapping_lang(raw_fword[ii,1:ntoken-1,:].view(-1,self.textdim)), p=2, dim=1)
243 | raw_fword = fword
244 | x = self.trans(mid)[1].reshape(batch_size,256,-1).permute(0,2,1)
245 | x = self.linear(x)
246 | pt = self.upsample ( torch.softmax(x , dim = 1).squeeze(2).reshape(batch_size,1,64,64) )
247 |
248 | gt = gt.unsqueeze(1)
249 | gest = 1 - torch.mul(gest , amask).clamp(max = 1,min=0)
250 | pt = torch.mul(pt, amask.unsqueeze(1))
251 | pt = F.normalize(pt.view(batch_size,1,-1),dim=2,p=1).view(batch_size,1,512,512)
252 | loss3 = 1 - torch.sum(torch.mul(pt,gt).reshape(batch_size,-1) , -1)
253 | gt = F.normalize(gt.view(batch_size,1,-1),dim=2,p=1).view(batch_size,1,512,512)
254 | eps = 1e-7
255 |
256 | vect = torch.mul(pt , xyz).view(batch_size, 3, -1)
257 | vect = torch.sum(vect,dim=2) - body
258 | loss1 = torch.sum(torch.abs(vect - target))
259 | loss3 += torch.sum( (torch.log( gt / (eps + pt) + eps ) * gt).reshape(batch_size,-1) , -1) * 0.1
260 |
261 | norm = torch.max(pt.reshape(batch_size,-1), dim=1, keepdim = True)[0].detach().unsqueeze(2).unsqueeze(2).repeat(1,1,512,512)
262 | pt = (pt/norm).detach()
263 | centerout = self.center(pt.type(torch.FloatTensor)).squeeze(1).cuda()
264 |
265 | coord = generate_coord(batch_size, raw_fvisu.size(2), raw_fvisu.size(3))
266 | x, attnscore_list = self.film(raw_fvisu, raw_fword, coord,maxvecter,fsent=None,word_mask=word_mask)
267 | if self.mstage:
268 | outbox = []
269 | for film_ii in range(len(x)):
270 | outbox.append(self.fcn_out["out%d"%film_ii](x[film_ii]))
271 | elif self.convlstm:
272 | x = torch.stack(x, dim=1)
273 |
274 | output, state = self.global_out(x)
275 | output, hidden, cell = output[-1], state[-1][0], state[-1][1]
276 | if self.use_sal:
277 | #pt = sal.type(torch.FloatTensor).cuda()
278 | pt_c = self.conv1(pt.type(torch.FloatTensor).cuda())
279 |
280 | hidden = torch.cat((hidden, pt_c), 1)
281 |
282 | outbox = [self.fcn_out(hidden)]
283 | else:
284 | x = torch.stack(x, dim=1).view(batch_size, -1, raw_fvisu.size(2), raw_fvisu.size(3))
285 | outbox = [self.fcn_out(x)]
286 |
287 | return outbox, attnscore_list, loss1, vloss, centerout,loss3,pt ## list of (B,N,H,W)
288 |
289 |
290 | if __name__ == "__main__":
291 | import sys
292 | import argparse
293 | sys.path.append('.')
294 | from dataset.data_loader import *
295 | from torch.autograd import Variable
296 | from torch.utils.data import DataLoader
297 | from torchvision.transforms import Compose, ToTensor, Normalize
298 | from utils.transforms import ResizeImage, ResizeAnnotation
299 | parser = argparse.ArgumentParser(
300 | description='Dataloader test')
301 | parser.add_argument('--size', default=416, type=int,
302 | help='image size')
303 | parser.add_argument('--data', type=str, default='./ln_data/',
304 | help='path to ReferIt splits data folder')
305 | parser.add_argument('--dataset', default='referit', type=str,
306 | help='referit/flickr/unc/unc+/gref')
307 | parser.add_argument('--split', default='train', type=str,
308 | help='name of the dataset split used to train')
309 | parser.add_argument('--time', default=20, type=int,
310 | help='maximum time steps (lang length) per batch')
311 | parser.add_argument('--emb_size', default=256, type=int,
312 | help='word embedding dimensions')
313 | # parser.add_argument('--lang_layers', default=3, type=int,
314 | # help='number of SRU/LSTM stacked layers')
315 |
316 | args = parser.parse_args()
317 |
318 | torch.manual_seed(13)
319 | np.random.seed(13)
320 | torch.backends.cudnn.deterministic = True
321 | torch.backends.cudnn.benchmark = False
322 | input_transform = Compose([
323 | ToTensor(),
324 | # ResizeImage(args.size),
325 | Normalize(
326 | mean=[0.485, 0.456, 0.406],
327 | std=[0.229, 0.224, 0.225])
328 | ])
329 |
330 | refer = ReferDataset(data_root=args.data,
331 | dataset=args.dataset,
332 | split=args.split,
333 | imsize = args.size,
334 | transform=input_transform,
335 | max_query_len=args.time)
336 |
337 | train_loader = DataLoader(refer, batch_size=1, shuffle=True,
338 | pin_memory=True, num_workers=0)
339 |
340 | # model = textcam_yolo_light(emb_size=args.emb_size)
341 |
342 | for i in enumerate(train_loader):
343 | print(i)
344 | break
345 |
--------------------------------------------------------------------------------
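The coordinate tensor returned by `generate_coord` above carries eight channels per cell: normalised min/max/centre x and y in [-1, 1], plus constant 1/height and 1/width maps, and the forward pass feeds it to the FiLM block together with the visual and word features. A minimal CPU-only sketch with the same formulas, for inspection without a GPU (`coord_map` is a hypothetical name, not part of this module):

import torch

def coord_map(batch, height, width):
    # per-cell min/max/centre coordinates normalised to [-1, 1],
    # plus constant 1/h and 1/w channels, stacked to (batch, 8, height, width)
    xv, yv = torch.meshgrid([torch.arange(0, height), torch.arange(0, width)])
    xv_min = (xv.float() * 2 - width) / width
    yv_min = (yv.float() * 2 - height) / height
    xv_max = ((xv + 1).float() * 2 - width) / width
    yv_max = ((yv + 1).float() * 2 - height) / height
    xv_ctr = (xv_min + xv_max) / 2
    yv_ctr = (yv_min + yv_max) / 2
    hmap = torch.ones(height, width) / height
    wmap = torch.ones(height, width) / width
    coord = torch.stack([xv_min, yv_min, xv_max, yv_max,
                         xv_ctr, yv_ctr, hmap, wmap], dim=0)
    return coord.unsqueeze(0).repeat(batch, 1, 1, 1)

print(coord_map(2, 32, 32).shape)  # torch.Size([2, 8, 32, 32])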
/dataset/data_loader.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | """
4 | YouRefIt referring image PyTorch dataset.
5 | Define and group batches of images and queries.
6 | Based on:
7 | https://github.com/zyang-ur/ReSC/blob/master/dataset/data_loader.py
8 | """
9 | from torchvision.transforms import Compose, ToTensor, Normalize
10 | import os
11 | import sys
12 | import cv2
13 | import json
14 | import uuid
15 | import tqdm
16 | import math
17 | import torch
18 | import random
19 | # import h5py
20 | import numpy as np
21 | import os.path as osp
22 | import scipy.io as sio
23 | import torch.utils.data as data
24 | from collections import OrderedDict
25 | sys.path.append('.')
26 | import operator
27 | import utils
28 | from utils import Corpus
29 | import clip
30 | import argparse
31 | import collections
32 | import logging
33 | import json
34 | import re
35 |
36 | np.set_printoptions(threshold=np.inf)
37 | from pytorch_pretrained_bert.tokenization import BertTokenizer
38 | from pytorch_pretrained_bert.modeling import BertModel
39 | # from transformers import BertTokenizer,BertModel
40 | from utils.transforms import letterbox, random_affine
41 |
42 | sys.modules['utils'] = utils
43 | cv2.setNumThreads(0)
44 | cv2.ocl.setUseOpenCL(True)
45 |
46 | def read_examples(input_line, unique_id):
47 | """Read a list of `InputExample`s from an input file."""
48 | examples = []
49 | # unique_id = 0
50 | line = input_line #reader.readline()
51 | # if not line:
52 | # break
53 | line = line.strip()
54 | text_a = None
55 | text_b = None
56 | m = re.match(r"^(.*) \|\|\| (.*)$", line)
57 | if m is None:
58 | text_a = line
59 | else:
60 | text_a = m.group(1)
61 | text_b = m.group(2)
62 | examples.append(
63 | InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))
64 | # unique_id += 1
65 | return examples
66 |
67 | ## Bert text encoding
68 | class InputExample(object):
69 | def __init__(self, unique_id, text_a, text_b):
70 | self.unique_id = unique_id
71 | self.text_a = text_a
72 | self.text_b = text_b
73 |
74 | class InputFeatures(object):
75 | """A single set of features of data."""
76 | def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids):
77 | self.unique_id = unique_id
78 | self.tokens = tokens
79 | self.input_ids = input_ids
80 | self.input_mask = input_mask
81 | self.input_type_ids = input_type_ids
82 |
83 | def convert_examples_to_features(examples, seq_length, tokenizer):
84 | """Loads a data file into a list of `InputBatch`s."""
85 | features = []
86 | for (ex_index, example) in enumerate(examples):
87 | tokens_a = tokenizer.tokenize(example.text_a)
88 |
89 | tokens_b = None
90 | if example.text_b:
91 | tokens_b = tokenizer.tokenize(example.text_b)
92 |
93 | if tokens_b:
94 | # Modifies `tokens_a` and `tokens_b` in place so that the total
95 | # length is less than the specified length.
96 | # Account for [CLS], [SEP], [SEP] with "- 3"
97 | _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3)
98 | else:
99 | # Account for [CLS] and [SEP] with "- 2"
100 | if len(tokens_a) > seq_length - 2:
101 | tokens_a = tokens_a[0:(seq_length - 2)]
102 | tokens = []
103 | input_type_ids = []
104 | tokens.append("[CLS]")
105 | input_type_ids.append(0)
106 | for token in tokens_a:
107 | tokens.append(token)
108 | input_type_ids.append(0)
109 | tokens.append("[SEP]")
110 | input_type_ids.append(0)
111 |
112 | if tokens_b:
113 | for token in tokens_b:
114 | tokens.append(token)
115 | input_type_ids.append(1)
116 | tokens.append("[SEP]")
117 | input_type_ids.append(1)
118 |
119 | input_ids = tokenizer.convert_tokens_to_ids(tokens)
120 |
121 | # The mask has 1 for real tokens and 0 for padding tokens. Only real
122 | # tokens are attended to.
123 | input_mask = [1] * len(input_ids)
124 |
125 | # Zero-pad up to the sequence length.
126 | while len(input_ids) < seq_length:
127 | input_ids.append(0)
128 | input_mask.append(0)
129 | input_type_ids.append(0)
130 |
131 | assert len(input_ids) == seq_length
132 | assert len(input_mask) == seq_length
133 | assert len(input_type_ids) == seq_length
134 | features.append(
135 | InputFeatures(
136 | unique_id=example.unique_id,
137 | tokens=tokens,
138 | input_ids=input_ids,
139 | input_mask=input_mask,
140 | input_type_ids=input_type_ids))
141 | return features
142 |
143 | class DatasetNotFoundError(Exception):
144 | pass
145 |
146 | class ReferDataset(data.Dataset):
147 | SUPPORTED_DATASETS = {
148 | 'yourefit': {'splits': ('train', 'val', 'test')},
149 | 'referit': {'splits': ('train', 'val', 'trainval', 'test')},
150 | 'unc': {
151 | 'splits': ('train', 'val', 'trainval', 'testA', 'testB'),
152 | 'params': {'dataset': 'refcoco', 'split_by': 'unc'}
153 | },
154 | 'unc+': {
155 | 'splits': ('train', 'val', 'trainval', 'testA', 'testB'),
156 | 'params': {'dataset': 'refcoco+', 'split_by': 'unc'}
157 | },
158 | 'gref': {
159 | 'splits': ('train', 'val'),
160 | 'params': {'dataset': 'refcocog', 'split_by': 'google'}
161 | },
162 | 'gref_umd': {
163 | 'splits': ('train', 'val', 'test'),
164 | 'params': {'dataset': 'refcocog', 'split_by': 'umd'}
165 | },
166 | 'flickr': {
167 | 'splits': ('train', 'val', 'test')}
168 | }
169 |
170 | def __init__(self, data_root, split_root='data', dataset='referit', imsize=256,
171 | transform=None, augment=False, device=None, return_idx=False, testmode=False,
172 | split='train', max_query_len=128, lstm=False, bert_model='bert-base-uncased'):
173 | self.images = []
174 | self.data_root = data_root
175 | self.split_root = split_root
176 | self.dataset = dataset
177 | self.imsize = imsize
178 | self.query_len = max_query_len
179 | self.lstm = lstm
180 | self.transform = transform
181 | self.testmode = testmode
182 | self.split = split
183 | self.device = device
184 |         self.t = Compose([
185 | ToTensor()
186 | ])
187 | self.tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=True)
188 | self.augment=augment
189 | self.return_idx=return_idx
190 | self.num = 0
191 | if self.dataset == 'yourefit':
192 | self.dataset_root = osp.join(self.data_root, 'yourefit')
193 | self.im_dir = osp.join(self.dataset_root, 'images')
194 | elif self.dataset == 'referit':
195 | self.dataset_root = osp.join(self.data_root, 'referit')
196 | self.im_dir = osp.join(self.dataset_root, 'images')
197 | self.split_dir = osp.join(self.dataset_root, 'splits')
198 | elif self.dataset == 'flickr':
199 | self.dataset_root = osp.join(self.data_root, 'Flickr30k')
200 | self.im_dir = osp.join(self.dataset_root, 'flickr30k_images')
201 | else: ## refcoco, etc.
202 | self.dataset_root = osp.join(self.data_root, 'other')
203 | self.im_dir = osp.join(
204 | self.dataset_root, 'images', 'mscoco', 'images', 'train2014')
205 | self.split_dir = osp.join(self.dataset_root, 'splits')
206 |
207 | if not self.exists_dataset():
208 | print('Please download index cache to data folder')
209 |             raise DatasetNotFoundError('index cache for {0} not found under {1}'.format(self.dataset, self.split_root))
210 |
211 | dataset_path = osp.join(self.split_root, self.dataset)
212 | valid_splits = self.SUPPORTED_DATASETS[self.dataset]['splits']
213 |
214 | if self.lstm:
215 | self.corpus = Corpus()
216 | corpus_path = osp.join(dataset_path, 'corpus.pth')
217 | self.corpus = torch.load(corpus_path)
218 |
219 | if split not in valid_splits:
220 | raise ValueError(
221 | 'Dataset {0} does not have split {1}'.format(
222 | self.dataset, split))
223 |
224 | splits = [split]
225 | if self.dataset != 'referit':
226 | splits = ['train', 'val'] if split == 'trainval' else [split]
227 | for split in splits:
228 | imgset_file = '{0}_{1}full.pth'.format(self.dataset, split)
229 | imgset_path = osp.join(dataset_path, imgset_file)
230 | self.images += torch.load(imgset_path)
231 |
232 | def exists_dataset(self):
233 | return osp.exists(osp.join(self.split_root, self.dataset))
234 |
235 |
236 | def pull_item(self, idx):
237 | if self.dataset == 'flickr':
238 | img_file, bbox, phrase = self.images[idx]
239 | else:
240 | img_file, _, bbox, phrase, attri = self.images[idx]
241 | ## box format: to x1y1x2y2
242 | if not (self.dataset == 'referit' or self.dataset == 'flickr'):
243 | bbox = np.array(bbox, dtype=int)
244 | #bbox[2], bbox[3] = bbox[0]+bbox[2], bbox[1]+bbox[3]
245 | else:
246 | bbox = np.array(bbox, dtype=int)
247 |
248 | img_path = osp.join(self.im_dir, img_file)
249 | img = cv2.imread(img_path)
250 |
251 | htmapdir = self.im_dir.replace('images', 'pafours')
252 | htmapfile = img_file #.replace('.jpg', '_rendered.png')
253 | htmap_path = osp.join(htmapdir, htmapfile)
254 | htmap = cv2.imread(htmap_path)
255 |
256 | ht = np.asarray(htmap)
257 |
258 | # #ht = np.mean(ht, axis=2)
259 |
260 |
261 | # ht = cv2.resize(ht, (512, 512))
262 |
263 | ptdir = self.im_dir.replace('images', 'depimg')
264 | ptfile = img_file #.replace('.jpg', '_depth.png')
265 | pt_path = osp.join(ptdir, ptfile)
266 | pt = cv2.imread(pt_path)
267 | # print(pt.shape)
268 | # exit()
269 | # pt = cv2.resize(pt, (256,256))
270 | # pt = np.reshape(pt, (3, 256, 256))
271 |
272 | saldir = self.im_dir.replace('images', 'saliency')
273 | salfile = img_file.replace('.jpg', '.jpeg')
274 | sal_path = osp.join(saldir, salfile)
275 | sal = cv2.imread(sal_path)
276 | sal = cv2.resize(sal, (256,256))
277 | #sal = np.reshape(sal, (3, 256, 256))
278 |
279 | gestdir = 'ln_data/bodysegment'
280 | gestfile = img_file.replace('.jpg' , '_seg.png')
281 | gest_path = osp.join(gestdir,gestfile)
282 | gest = cv2.imread(gest_path)
283 | if gest.shape != img.shape:
284 |             gest = cv2.resize(gest, (img.shape[1], img.shape[0]), interpolation=cv2.INTER_AREA)  # cv2.resize expects (width, height)
285 | ## duplicate channel if gray image
286 | if img.shape[-1] > 1:
287 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
288 | else:
289 | img = np.stack([img] * 3)
290 |
291 | return img, pt, ht, phrase, bbox, gest, sal, img_file
292 | # return img, phrase, bbox, pt, ht
293 |
294 | def tokenize_phrase(self, phrase):
295 | return self.corpus.tokenize(phrase, self.query_len)
296 |
297 | def untokenize_word_vector(self, words):
298 | return self.corpus.dictionary[words]
299 |
300 | def __len__(self):
301 | return len(self.images)
302 |
303 | def __getitem__(self, idx):
304 | img, pt, ht, phrase, bbox, gest, sal, img_file = self.pull_item(idx)
305 | # phrase = phrase.decode("utf-8").encode().lower()
306 |
307 |
308 | phrase = phrase.lower()
309 | if self.augment:
310 | augment_flip, augment_hsv, augment_affine = True,True,True
311 |
312 | ## seems a bug in torch transformation resize, so separate in advance
313 | h,w = img.shape[0], img.shape[1]
314 | if self.augment:
315 | ## random horizontal flip
316 | if augment_flip and random.random() > 0.5:
317 | img = cv2.flip(img, 1)
318 | pt = cv2.flip(pt, 1 )
319 | ht = cv2.flip(ht, 1 )
320 | gest = cv2.flip(gest, 1)
321 | sal = cv2.flip(sal, 1 )
322 | bbox[0], bbox[2] = w-bbox[2]-1, w-bbox[0]-1
323 | phrase = phrase.replace('right','*&^special^&*').replace('left','right').replace('*&^special^&*','left')
324 |
325 | ## random intensity, saturation change
326 | if augment_hsv:
327 | fraction = 0.5
328 | img_hsv = cv2.cvtColor(cv2.cvtColor(img, cv2.COLOR_RGB2BGR), cv2.COLOR_BGR2HSV)
329 | S = img_hsv[:, :, 1].astype(np.float32)
330 | V = img_hsv[:, :, 2].astype(np.float32)
331 |                 a = (random.random() * 2 - 1) * fraction + 1
332 |                 S *= a  # random saturation scale, mirroring the V channel below
333 |                 np.clip(S, a_min=0, a_max=255, out=S)
334 | a = (random.random() * 2 - 1) * fraction + 1
335 | V *= a
336 | if a > 1:
337 | np.clip(V, a_min=0, a_max=255, out=V)
338 |
339 | img_hsv[:, :, 1] = S.astype(np.uint8)
340 | img_hsv[:, :, 2] = V.astype(np.uint8)
341 | img = cv2.cvtColor(cv2.cvtColor(img_hsv, cv2.COLOR_HSV2BGR), cv2.COLOR_BGR2RGB)
342 |
343 | mask = np.ones_like(img)
344 | img, mask, ratio, dw, dh = letterbox(img, mask, self.imsize)
345 | #ht, _, ratio, dw, dh = letterbox(ht, None, self.imsize)
346 | gest, _, ratio, dw, dh = letterbox(gest, None, self.imsize)
347 | #sal, _, ratio, dw, dh = letterbox(sal, None, self.imsize)
348 | bbox[0], bbox[2] = bbox[0]*ratio+dw, bbox[2]*ratio+dw
349 | bbox[1], bbox[3] = bbox[1]*ratio+dh, bbox[3]*ratio+dh
350 | ## random affine transformation
351 | if augment_affine:
352 | gt = np.asarray(torch.zeros([512,512]))
353 | gt[int(bbox[1]):int(bbox[3]), int(bbox[0]):int(bbox[2])] = 1
354 | img, mask, bbox, M = random_affine(img, mask, bbox, \
355 | degrees=(-5, 5), translate=(0.10, 0.10), scale=(0.90, 1.10))
356 | pt = cv2.warpPerspective(pt, M, dsize=(512, 512), flags=cv2.INTER_LINEAR,
357 | borderValue=0)
358 | ht = cv2.warpPerspective(ht, M, dsize=(512, 512), flags=cv2.INTER_LINEAR,
359 | borderValue=0)
360 | gest = cv2.warpPerspective(gest, M, dsize=(512, 512), flags=cv2.INTER_NEAREST,
361 | borderValue=0)
362 | sal = cv2.warpPerspective(sal, M, dsize=(256, 256), flags=cv2.INTER_NEAREST,
363 | borderValue=0)
364 | gt = cv2.warpPerspective(gt, M, dsize=(512, 512), flags=cv2.INTER_NEAREST,
365 | borderValue=0)
366 | else: ## should be inference, or specified training
367 | mask = np.ones_like(img)
368 | img, mask, ratio, dw, dh = letterbox(img, mask, self.imsize)
369 | # ht, _, ratio, dw, dh = letterbox(ht, None, self.imsize)
370 | gest, _, ratio, dw, dh = letterbox(gest, None, self.imsize)
371 | bbox[0], bbox[2] = bbox[0]*ratio+dw, bbox[2]*ratio+dw
372 | bbox[1], bbox[3] = bbox[1]*ratio+dh, bbox[3]*ratio+dh
373 | gt = np.asarray(torch.zeros([512,512]))
374 | gt[int(bbox[1]):int(bbox[3]), int(bbox[0]):int(bbox[2])] = 1
375 | ## Norm, to tensor
376 | # print(img.shape)
377 |
378 | pt = pt[:,:,0]
379 | gest = gest[:,:,0]
380 | mask = mask[:,:,0]
381 | sal = np.reshape(sal, (3, 256, 256))
382 | sal = sal[0,:,:]
383 | if self.transform is not None:
384 |
385 | img = self.transform(img)
386 |
387 | #pt = self.t(pt)
388 | #print(ht.shape)
389 |
390 | ht = self.transform(ht)
391 |
392 | #print(ht.shape)
393 | if self.lstm:
394 | phrase = self.tokenize_phrase(phrase)
395 | word_id = phrase
396 | # word_mask = np.zeros(word_id.shape)
397 | word_mask = np.array(word_id>0,dtype=int)
398 | else:
399 | ## encode phrase to bert input
400 |
401 | examples = read_examples(phrase, idx)
402 | features = convert_examples_to_features(
403 | examples=examples, seq_length=self.query_len, tokenizer=self.tokenizer)
404 | word_id = features[0].input_ids
405 | word_mask = features[0].input_mask
406 | #phrase = features[0].input_mask #clip.tokenize(phrase, context_length=20)
407 | if self.testmode:
408 | return img, pt, ht, gest, gt, mask, np.array(word_id, dtype=int), np.array(word_mask, dtype=int), \
409 | np.array(bbox, dtype=np.float32), np.array(ratio, dtype=np.float32), \
410 | np.array(dw, dtype=np.float32), np.array(dh, dtype=np.float32), self.images[idx][0],sal , phrase
411 | else:
412 | return img, pt, ht, gest, gt, mask, np.array(word_id, dtype=int), np.array(word_mask, dtype=int), \
413 | np.array(bbox, dtype=np.float32),sal, phrase, img_file
--------------------------------------------------------------------------------
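When `lstm` is False, `__getitem__` above encodes each phrase through `read_examples` and `convert_examples_to_features`: the wordpieces are wrapped in [CLS]/[SEP], converted to ids, and zero-padded to `max_query_len`, with `input_mask` set to 1 over real tokens and 0 over padding. A minimal sketch of that path, assuming the same `pytorch_pretrained_bert` dependency; `encode_phrase` is a hypothetical helper, not part of the loader.

from pytorch_pretrained_bert.tokenization import BertTokenizer

def encode_phrase(phrase, query_len=20, bert_model='bert-base-uncased'):
    tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=True)
    # truncate to leave room for [CLS] and [SEP], as convert_examples_to_features does
    tokens = ['[CLS]'] + tokenizer.tokenize(phrase)[:query_len - 2] + ['[SEP]']
    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    input_mask = [1] * len(input_ids)
    pad = query_len - len(input_ids)
    return input_ids + [0] * pad, input_mask + [0] * pad

ids, mask = encode_phrase('the red mug on the table')
print(len(ids), len(mask), sum(mask))  # 20 20 <count of real tokens incl. [CLS]/[SEP]>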
/model/darknet.py:
--------------------------------------------------------------------------------
1 | from __future__ import division
2 |
3 | import math
4 | import torch
5 | import torch.nn as nn
6 | import torch.nn.functional as F
7 | from torch.autograd import Variable
8 | import numpy as np
9 | from collections import defaultdict, OrderedDict
10 |
11 | from PIL import Image
12 |
13 | # from utils.parse_config import *
14 | from utils.utils import *
15 | # import matplotlib.pyplot as plt
16 | # import matplotlib.patches as patches
17 |
18 | exist_id = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, \
19 | 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, \
20 | 23, 24, 25, 27, 28, 31, 32, 33, 34, 35, 36, \
21 | 37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 48, \
22 | 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, \
23 | 60, 61, 62, 63, 64, 65, 67, 70, 72, 73, 74, \
24 | 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, \
25 | 87, 88, 89, 90]
26 | catmap_dict = OrderedDict()
27 | for ii in range(len(exist_id)):
28 | catmap_dict[exist_id[ii]] = ii
29 |
30 | def build_object_targets(
31 | pred_boxes, pred_conf, pred_cls, target, anchors, num_anchors, num_classes, grid_size, ignore_thres, img_dim
32 | ):
33 | nB = target.size(0)
34 | nA = num_anchors
35 | nC = num_classes
36 | nG = grid_size
37 | mask = torch.zeros(nB, nA, nG, nG)
38 | conf_mask = torch.ones(nB, nA, nG, nG)
39 | tx = torch.zeros(nB, nA, nG, nG)
40 | ty = torch.zeros(nB, nA, nG, nG)
41 | tw = torch.zeros(nB, nA, nG, nG)
42 | th = torch.zeros(nB, nA, nG, nG)
43 | tconf = torch.ByteTensor(nB, nA, nG, nG).fill_(0)
44 | tcls = torch.ByteTensor(nB, nA, nG, nG, nC).fill_(0)
45 |
46 | nGT = 0
47 | nCorrect = 0
48 | for b in range(nB):
49 | for t in range(target.shape[1]):
50 | if target[b, t].sum() == 0:
51 | continue
52 | nGT += 1
53 | # Convert to position relative to box
54 | gx = target[b, t, 1] * nG
55 | gy = target[b, t, 2] * nG
56 | gw = target[b, t, 3] * nG
57 | gh = target[b, t, 4] * nG
58 | # Get grid box indices
59 | gi = int(gx)
60 | gj = int(gy)
61 | # Get shape of gt box
62 | gt_box = torch.FloatTensor(np.array([0, 0, gw, gh])).unsqueeze(0)
63 | # Get shape of anchor box
64 | anchor_shapes = torch.FloatTensor(np.concatenate((np.zeros((len(anchors), 2)), np.array(anchors)), 1))
65 | # Calculate iou between gt and anchor shapes
66 | anch_ious = bbox_iou(gt_box, anchor_shapes)
67 | # Where the overlap is larger than threshold set mask to zero (ignore)
68 | conf_mask[b, anch_ious > ignore_thres, gj, gi] = 0
69 | # Find the best matching anchor box
70 | best_n = np.argmax(anch_ious)
71 | # Get ground truth box
72 | gt_box = torch.FloatTensor(np.array([gx, gy, gw, gh])).unsqueeze(0)
73 | # Get the best prediction
74 | pred_box = pred_boxes[b, best_n, gj, gi].unsqueeze(0)
75 | # Masks
76 | mask[b, best_n, gj, gi] = 1
77 | conf_mask[b, best_n, gj, gi] = 1
78 | # Coordinates
79 | tx[b, best_n, gj, gi] = gx - gi
80 | ty[b, best_n, gj, gi] = gy - gj
81 | # Width and height
82 | tw[b, best_n, gj, gi] = math.log(gw / anchors[best_n][0] + 1e-16)
83 | th[b, best_n, gj, gi] = math.log(gh / anchors[best_n][1] + 1e-16)
84 | # One-hot encoding of label
85 | target_label = int(target[b, t, 0])
86 | target_label = catmap_dict[target_label]
87 | tcls[b, best_n, gj, gi, target_label] = 1
88 | tconf[b, best_n, gj, gi] = 1
89 |
90 | # Calculate iou between ground truth and best matching prediction
91 | iou = bbox_iou(gt_box, pred_box, x1y1x2y2=False)
92 | pred_label = torch.argmax(pred_cls[b, best_n, gj, gi])
93 | score = pred_conf[b, best_n, gj, gi]
94 | if iou > 0.5 and pred_label == target_label and score > 0.5:
95 | nCorrect += 1
96 |
97 | return nGT, nCorrect, mask, conf_mask, tx, ty, tw, th, tconf, tcls
98 |
99 | def parse_model_config(path):
100 | """Parses the yolo-v3 layer configuration file and returns module definitions"""
101 |     with open(path, 'r') as file:
102 |         lines = file.read().split('\n')
103 | lines = [x for x in lines if x and not x.startswith('#')]
104 | lines = [x.rstrip().lstrip() for x in lines] # get rid of fringe whitespaces
105 | module_defs = []
106 | for line in lines:
107 | if line.startswith('['): # This marks the start of a new block
108 | module_defs.append({})
109 | module_defs[-1]['type'] = line[1:-1].rstrip()
110 | if module_defs[-1]['type'] == 'convolutional' or module_defs[-1]['type'] == 'yoloconvolutional':
111 | module_defs[-1]['batch_normalize'] = 0
112 | else:
113 | key, value = line.split("=")
114 | value = value.strip()
115 | module_defs[-1][key.rstrip()] = value.strip()
116 | return module_defs
117 |
118 | class ConvBatchNormReLU(nn.Sequential):
119 | def __init__(
120 | self,
121 | in_channels,
122 | out_channels,
123 | kernel_size,
124 | stride,
125 | padding,
126 | dilation,
127 | leaky=False,
128 | relu=True,
129 | instance=False,
130 | ):
131 | super(ConvBatchNormReLU, self).__init__()
132 | self.add_module(
133 | "conv",
134 | nn.Conv2d(
135 | in_channels=in_channels,
136 | out_channels=out_channels,
137 | kernel_size=kernel_size,
138 | stride=stride,
139 | padding=padding,
140 | dilation=dilation,
141 | bias=False,
142 | ),
143 | )
144 | if instance:
145 | self.add_module(
146 | "bn",
147 | nn.InstanceNorm2d(num_features=out_channels),
148 | )
149 | else:
150 | self.add_module(
151 | "bn",
152 | nn.BatchNorm2d(
153 | num_features=out_channels, eps=1e-5, momentum=0.999, affine=True
154 | ),
155 | )
156 |
157 | if leaky:
158 | self.add_module("relu", nn.LeakyReLU(0.1))
159 | elif relu:
160 | self.add_module("relu", nn.ReLU())
161 |
162 | def forward(self, x):
163 | return super(ConvBatchNormReLU, self).forward(x)
164 |
165 | class ConvBatchNormReLU_3d(nn.Sequential):
166 | def __init__(
167 | self,
168 | in_channels,
169 | out_channels,
170 | kernel_size,
171 | stride,
172 | padding,
173 | dilation,
174 | leaky=False,
175 | relu=True,
176 | ):
177 | super(ConvBatchNormReLU_3d, self).__init__()
178 | self.add_module(
179 | "conv",
180 | nn.Conv3d(
181 | in_channels=in_channels,
182 | out_channels=out_channels,
183 | kernel_size=kernel_size,
184 | stride=stride,
185 | padding=padding,
186 | dilation=dilation,
187 | bias=False,
188 | ),
189 | )
190 | self.add_module(
191 | "bn",
192 | nn.BatchNorm3d(
193 | num_features=out_channels, eps=1e-5, momentum=0.999, affine=True
194 | ),
195 | )
196 |
197 | if leaky:
198 | self.add_module("relu", nn.LeakyReLU(0.1))
199 | elif relu:
200 | self.add_module("relu", nn.ReLU())
201 |
202 | def forward(self, x):
203 | return super(ConvBatchNormReLU_3d, self).forward(x)
204 |
205 | class MyUpsample2(nn.Module):
206 | def forward(self, x):
207 | return x[:, :, :, None, :, None].expand(-1, -1, -1, 2, -1, 2).reshape(x.size(0), x.size(1), x.size(2)*2, x.size(3)*2)
208 |
209 | def create_modules(module_defs):
210 | """
211 | Constructs module list of layer blocks from module configuration in module_defs
212 | """
213 | hyperparams = module_defs.pop(0)
214 | output_filters = [int(hyperparams["channels"])]
215 | module_list = nn.ModuleList()
216 | for i, module_def in enumerate(module_defs):
217 | modules = nn.Sequential()
218 |
219 | if module_def["type"] == "convolutional" or module_def["type"] == "yoloconvolutional":
220 | bn = int(module_def["batch_normalize"])
221 | filters = int(module_def["filters"])
222 | kernel_size = int(module_def["size"])
223 | pad = (kernel_size - 1) // 2 if int(module_def["pad"]) else 0
224 | modules.add_module(
225 | "conv_%d" % i,
226 | nn.Conv2d(
227 | in_channels=output_filters[-1],
228 | out_channels=filters,
229 | kernel_size=kernel_size,
230 | stride=int(module_def["stride"]),
231 | padding=pad,
232 | bias=not bn,
233 | ),
234 | )
235 | if bn:
236 | modules.add_module("batch_norm_%d" % i, nn.BatchNorm2d(filters))
237 | if module_def["activation"] == "leaky":
238 | modules.add_module("leaky_%d" % i, nn.LeakyReLU(0.1))
239 |
240 | elif module_def["type"] == "maxpool":
241 | kernel_size = int(module_def["size"])
242 | stride = int(module_def["stride"])
243 | if kernel_size == 2 and stride == 1:
244 | padding = nn.ZeroPad2d((0, 1, 0, 1))
245 | modules.add_module("_debug_padding_%d" % i, padding)
246 | maxpool = nn.MaxPool2d(
247 | kernel_size=int(module_def["size"]),
248 | stride=int(module_def["stride"]),
249 | padding=int((kernel_size - 1) // 2),
250 | )
251 | modules.add_module("maxpool_%d" % i, maxpool)
252 |
253 | elif module_def["type"] == "upsample":
254 | # upsample = nn.Upsample(scale_factor=int(module_def["stride"]), mode="nearest")
255 | assert(int(module_def["stride"])==2)
256 | upsample = MyUpsample2()
257 | modules.add_module("upsample_%d" % i, upsample)
258 |
259 | elif module_def["type"] == "route":
260 | layers = [int(x) for x in module_def["layers"].split(",")]
261 | filters = sum([output_filters[layer_i] for layer_i in layers])
262 | modules.add_module("route_%d" % i, EmptyLayer())
263 |
264 | elif module_def["type"] == "shortcut":
265 | filters = output_filters[int(module_def["from"])]
266 | modules.add_module("shortcut_%d" % i, EmptyLayer())
267 |
268 | elif module_def["type"] == "yolo":
269 | anchor_idxs = [int(x) for x in module_def["mask"].split(",")]
270 | # Extract anchors
271 | anchors = [int(x) for x in module_def["anchors"].split(",")]
272 | anchors = [(anchors[i], anchors[i + 1]) for i in range(0, len(anchors), 2)]
273 | anchors = [anchors[i] for i in anchor_idxs]
274 | num_classes = int(module_def["classes"])
275 | img_height = int(hyperparams["height"])
276 | # Define detection layer
277 | # yolo_layer = YOLOLayer(anchors, num_classes, img_height)
278 | yolo_layer = YOLOLayer(anchors, num_classes, 256)
279 | modules.add_module("yolo_%d" % i, yolo_layer)
280 | # Register module list and number of output filters
281 | module_list.append(modules)
282 | output_filters.append(filters)
283 |
284 | return hyperparams, module_list
285 |
286 | class EmptyLayer(nn.Module):
287 | """Placeholder for 'route' and 'shortcut' layers"""
288 |
289 | def __init__(self):
290 | super(EmptyLayer, self).__init__()
291 |
292 | class YOLOLayer(nn.Module):
293 | """Detection layer"""
294 |
295 | def __init__(self, anchors, num_classes, img_dim):
296 | super(YOLOLayer, self).__init__()
297 | self.anchors = anchors
298 | self.num_anchors = len(anchors)
299 | self.num_classes = num_classes
300 | self.bbox_attrs = 5 + num_classes
301 | self.image_dim = img_dim
302 | self.ignore_thres = 0.5
303 | self.lambda_coord = 1
304 |
305 |         self.mse_loss = nn.MSELoss(reduction='mean')  # Coordinate loss
306 |         self.bce_loss = nn.BCELoss(reduction='mean')  # Confidence loss
307 | self.ce_loss = nn.CrossEntropyLoss() # Class loss
308 |
309 | def forward(self, x, targets=None):
310 | nA = self.num_anchors
311 | nB = x.size(0)
312 | nG = x.size(2)
313 | stride = self.image_dim / nG
314 |
315 | # Tensors for cuda support
316 | FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor
317 | LongTensor = torch.cuda.LongTensor if x.is_cuda else torch.LongTensor
318 | ByteTensor = torch.cuda.ByteTensor if x.is_cuda else torch.ByteTensor
319 |
320 | prediction = x.view(nB, nA, self.bbox_attrs, nG, nG).permute(0, 1, 3, 4, 2).contiguous()
321 |
322 | # Get outputs
323 | x = torch.sigmoid(prediction[..., 0]) # Center x
324 | y = torch.sigmoid(prediction[..., 1]) # Center y
325 | w = prediction[..., 2] # Width
326 | h = prediction[..., 3] # Height
327 | pred_conf = torch.sigmoid(prediction[..., 4]) # Conf
328 | pred_cls = torch.sigmoid(prediction[..., 5:]) # Cls pred.
329 |
330 | # Calculate offsets for each grid
331 | grid_x = torch.arange(nG).repeat(nG, 1).view([1, 1, nG, nG]).type(FloatTensor)
332 | grid_y = torch.arange(nG).repeat(nG, 1).t().view([1, 1, nG, nG]).type(FloatTensor)
333 | # scaled_anchors = FloatTensor([(a_w / stride, a_h / stride) for a_w, a_h in self.anchors])
334 | scaled_anchors = FloatTensor([(a_w / (416 / nG), a_h / (416 / nG)) for a_w, a_h in self.anchors])
335 | anchor_w = scaled_anchors[:, 0:1].view((1, nA, 1, 1))
336 | anchor_h = scaled_anchors[:, 1:2].view((1, nA, 1, 1))
337 |
338 | # Add offset and scale with anchors
339 | pred_boxes = FloatTensor(prediction[..., :4].shape)
340 | pred_boxes[..., 0] = x.data + grid_x
341 | pred_boxes[..., 1] = y.data + grid_y
342 | pred_boxes[..., 2] = torch.exp(w.data) * anchor_w
343 | pred_boxes[..., 3] = torch.exp(h.data) * anchor_h
344 |
345 | # Training
346 | if targets is not None:
347 | targets = targets.clone()
348 | targets[:,:,1:] = targets[:,:,1:]/self.image_dim
349 | for b_i in range(targets.shape[0]):
350 | targets[b_i,:,1:] = xyxy2xywh(targets[b_i,:,1:])
351 |
352 | if x.is_cuda:
353 | self.mse_loss = self.mse_loss.cuda()
354 | self.bce_loss = self.bce_loss.cuda()
355 | self.ce_loss = self.ce_loss.cuda()
356 |
357 | nGT, nCorrect, mask, conf_mask, tx, ty, tw, th, tconf, tcls = build_object_targets(
358 | pred_boxes=pred_boxes.cpu().data,
359 | pred_conf=pred_conf.cpu().data,
360 | pred_cls=pred_cls.cpu().data,
361 | target=targets.cpu().data,
362 | anchors=scaled_anchors.cpu().data,
363 | num_anchors=nA,
364 | num_classes=self.num_classes,
365 | grid_size=nG,
366 | ignore_thres=self.ignore_thres,
367 | img_dim=self.image_dim,
368 | )
369 |
370 | nProposals = int((pred_conf > 0.5).sum().item())
371 | recall = float(nCorrect / nGT) if nGT else 1
372 | precision = float(nCorrect / nProposals) if nProposals else 0
373 |
374 | # Handle masks
375 | mask = Variable(mask.type(ByteTensor))
376 | conf_mask = Variable(conf_mask.type(ByteTensor))
377 |
378 | # Handle target variables
379 | tx = Variable(tx.type(FloatTensor), requires_grad=False)
380 | ty = Variable(ty.type(FloatTensor), requires_grad=False)
381 | tw = Variable(tw.type(FloatTensor), requires_grad=False)
382 | th = Variable(th.type(FloatTensor), requires_grad=False)
383 | tconf = Variable(tconf.type(FloatTensor), requires_grad=False)
384 | tcls = Variable(tcls.type(LongTensor), requires_grad=False)
385 |
386 | # Get conf mask where gt and where there is no gt
387 | conf_mask_true = mask
388 | conf_mask_false = conf_mask - mask
389 |
390 | # Mask outputs to ignore non-existing objects
391 | loss_x = self.mse_loss(x[mask], tx[mask])
392 | loss_y = self.mse_loss(y[mask], ty[mask])
393 | loss_w = self.mse_loss(w[mask], tw[mask])
394 | loss_h = self.mse_loss(h[mask], th[mask])
395 | loss_conf = self.bce_loss(pred_conf[conf_mask_false], tconf[conf_mask_false]) + self.bce_loss(
396 | pred_conf[conf_mask_true], tconf[conf_mask_true]
397 | )
398 | loss_cls = (1 / nB) * self.ce_loss(pred_cls[mask], torch.argmax(tcls[mask], 1))
399 | loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls
400 | return (
401 | loss,
402 | loss_x.item(),
403 | loss_y.item(),
404 | loss_w.item(),
405 | loss_h.item(),
406 | loss_conf.item(),
407 | loss_cls.item(),
408 | recall,
409 | precision,
410 | )
411 |
412 | else:
413 | # If not in training phase return predictions
414 | output = torch.cat(
415 | (
416 | pred_boxes.view(nB, -1, 4) * stride,
417 | pred_conf.view(nB, -1, 1),
418 | pred_cls.view(nB, -1, self.num_classes),
419 | ),
420 | -1,
421 | )
422 | return output
423 |
424 | class Darknet(nn.Module):
425 | """YOLOv3 object detection model"""
426 |
427 | def __init__(self, config_path='./model/yolov3.cfg', img_size=416, obj_out=False):
428 | super(Darknet, self).__init__()
429 | self.config_path = config_path
430 | self.obj_out = obj_out
431 | self.module_defs = parse_model_config(config_path)
432 | self.hyperparams, self.module_list = create_modules(self.module_defs)
433 | self.img_size = img_size
434 | self.seen = 0
435 | self.header_info = np.array([0, 0, 0, self.seen, 0])
436 | self.loss_names = ["x", "y", "w", "h", "conf", "cls", "recall", "precision"]
437 |
438 | def forward(self, x, targets=None):
439 | batch = x.shape[0]
440 | is_training = targets is not None
441 | output, output_obj = [], []
442 | self.losses = defaultdict(float)
443 | layer_outputs = []
444 | for i, (module_def, module) in enumerate(zip(self.module_defs, self.module_list)):
445 | if module_def["type"] in ["convolutional", "upsample", "maxpool"]:
446 | x = module(x)
447 | elif module_def["type"] == "route":
448 | layer_i = [int(x) for x in module_def["layers"].split(",")]
449 | x = torch.cat([layer_outputs[i] for i in layer_i], 1)
450 | elif module_def["type"] == "shortcut":
451 | layer_i = int(module_def["from"])
452 | x = layer_outputs[-1] + layer_outputs[layer_i]
453 | elif module_def["type"] == "yoloconvolutional":
454 | output.append(x) ## save final feature block
455 | x = module(x)
456 | elif module_def["type"] == "yolo":
457 | # Train phase: get loss
458 | if is_training:
459 | x, *losses = module[0](x, targets)
460 | for name, loss in zip(self.loss_names, losses):
461 | self.losses[name] += loss
462 | # Test phase: Get detections
463 | else:
464 | x = module(x)
465 | output_obj.append(x)
466 | # x = module(x)
467 | # output.append(x)
468 | layer_outputs.append(x)
469 |
470 | self.losses["recall"] /= 3
471 | self.losses["precision"] /= 3
472 | # return sum(output) if is_training else torch.cat(output, 1)
473 | # return torch.cat(output, 1)
474 | if self.obj_out:
475 | return output, sum(output_obj) if is_training else torch.cat(output_obj, 1), self.losses["precision"], self.losses["recall"]
476 | # return output, sum(output_obj)/(len(output_obj)*batch) if is_training else torch.cat(output_obj, 1)
477 | else:
478 | return output
479 |
480 | def load_weights(self, weights_path):
481 | """Parses and loads the weights stored in 'weights_path'"""
482 |
483 | # Open the weights file
484 | fp = open(weights_path, "rb")
485 | if self.config_path=='./model/yolo9000.cfg':
486 |             header = np.fromfile(fp, dtype=np.int32, count=4)  # First four are header values
487 | else:
488 | header = np.fromfile(fp, dtype=np.int32, count=5) # First five are header values
489 | # Needed to write header when saving weights
490 | self.header_info = header
491 |
492 | self.seen = header[3]
493 | weights = np.fromfile(fp, dtype=np.float32) # The rest are weights
494 | fp.close()
495 |
496 | ptr = 0
497 | for i, (module_def, module) in enumerate(zip(self.module_defs, self.module_list)):
498 | if module_def["type"] == "convolutional" or module_def["type"] == "yoloconvolutional":
499 | conv_layer = module[0]
500 | if module_def["batch_normalize"]:
501 | # Load BN bias, weights, running mean and running variance
502 | bn_layer = module[1]
503 | num_b = bn_layer.bias.numel() # Number of biases
504 | # Bias
505 | bn_b = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.bias)
506 | bn_layer.bias.data.copy_(bn_b)
507 | ptr += num_b
508 | # Weight
509 | bn_w = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.weight)
510 | bn_layer.weight.data.copy_(bn_w)
511 | ptr += num_b
512 | # Running Mean
513 | bn_rm = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.running_mean)
514 | bn_layer.running_mean.data.copy_(bn_rm)
515 | ptr += num_b
516 | # Running Var
517 | bn_rv = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.running_var)
518 | bn_layer.running_var.data.copy_(bn_rv)
519 | ptr += num_b
520 | else:
521 | # Load conv. bias
522 | num_b = conv_layer.bias.numel()
523 | conv_b = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(conv_layer.bias)
524 | conv_layer.bias.data.copy_(conv_b)
525 | ptr += num_b
526 | # Load conv. weights
527 | num_w = conv_layer.weight.numel()
528 | conv_w = torch.from_numpy(weights[ptr : ptr + num_w]).view_as(conv_layer.weight)
529 | conv_layer.weight.data.copy_(conv_w)
530 | ptr += num_w
531 |
532 | """
533 | @:param path - path of the new weights file
534 | @:param cutoff - save layers between 0 and cutoff (cutoff = -1 -> all are saved)
535 | """
536 |
537 | def save_weights(self, path, cutoff=-1):
538 |
539 | fp = open(path, "wb")
540 | self.header_info[3] = self.seen
541 | self.header_info.tofile(fp)
542 |
543 | # Iterate through layers
544 | for i, (module_def, module) in enumerate(zip(self.module_defs[:cutoff], self.module_list[:cutoff])):
545 | if module_def["type"] == "convolutional":
546 | conv_layer = module[0]
547 | # If batch norm, load bn first
548 | if module_def["batch_normalize"]:
549 | bn_layer = module[1]
550 | bn_layer.bias.data.cpu().numpy().tofile(fp)
551 | bn_layer.weight.data.cpu().numpy().tofile(fp)
552 | bn_layer.running_mean.data.cpu().numpy().tofile(fp)
553 | bn_layer.running_var.data.cpu().numpy().tofile(fp)
554 | # Load conv bias
555 | else:
556 | conv_layer.bias.data.cpu().numpy().tofile(fp)
557 | # Load conv weights
558 | conv_layer.weight.data.cpu().numpy().tofile(fp)
559 |
560 |         fp.close()
561 |
562 | class Darknetfort(nn.Module):
563 | """YOLOv3 object detection model"""
564 |
565 | def __init__(self, config_path='./model/yolov3.cfg', img_size=416, obj_out=False):
566 | super(Darknetfort, self).__init__()
567 | self.config_path = config_path
568 | self.obj_out = obj_out
569 | self.module_defs = parse_model_config(config_path)
570 | self.hyperparams, self.module_list = create_modules(self.module_defs)
571 | self.img_size = img_size
572 | self.seen = 0
573 | self.header_info = np.array([0, 0, 0, self.seen, 0])
574 | self.loss_names = ["x", "y", "w", "h", "conf", "cls", "recall", "precision"]
575 | self.layer_num = 12
576 | def forward(self, x, targets=None):
577 | batch = x.shape[0]
578 | is_training = targets is not None
579 | output, output_obj = [], []
580 | self.losses = defaultdict(float)
581 | layer_outputs = []
582 | layer = 0
583 | for i, (module_def, module) in enumerate(zip(self.module_defs, self.module_list)):
584 | if module_def["type"] in ["convolutional", "upsample", "maxpool"]:
585 | x = module(x)
586 | layer += 1
587 | elif module_def["type"] == "route":
588 | layer_i = [int(x) for x in module_def["layers"].split(",")]
589 | x = torch.cat([layer_outputs[i] for i in layer_i], 1)
590 | layer += 1
591 | elif module_def["type"] == "shortcut":
592 | layer_i = int(module_def["from"])
593 | x = layer_outputs[-1] + layer_outputs[layer_i]
594 | layer += 1
595 | elif module_def["type"] == "yoloconvolutional":
596 | output.append(x) ## save final feature block
597 | x = module(x)
598 | layer += 1
599 | elif module_def["type"] == "yolo":
600 | # Train phase: get loss
601 | if is_training:
602 | x, *losses = module[0](x, targets)
603 | for name, loss in zip(self.loss_names, losses):
604 | self.losses[name] += loss
605 | # Test phase: Get detections
606 | else:
607 | x = module(x)
608 | output_obj.append(x)
609 | layer += 1
610 | # x = module(x)
611 | # output.append(x)
612 | layer_outputs.append(x)
613 |
614 | self.losses["recall"] /= 3
615 | self.losses["precision"] /= 3
616 | # return sum(output) if is_training else torch.cat(output, 1)
617 | # return torch.cat(output, 1)
618 | if self.obj_out:
619 | return output, sum(output_obj) if is_training else torch.cat(output_obj, 1), self.losses["precision"], self.losses["recall"]
620 | # return output, sum(output_obj)/(len(output_obj)*batch) if is_training else torch.cat(output_obj, 1)
621 | else:
622 | return output
623 |
624 | def load_weights(self, weights_path):
625 | """Parses and loads the weights stored in 'weights_path'"""
626 |
627 | # Open the weights file
628 | fp = open(weights_path, "rb")
629 | if self.config_path=='./model/yolo9000.cfg':
630 |             header = np.fromfile(fp, dtype=np.int32, count=4)  # First four are header values
631 | else:
632 | header = np.fromfile(fp, dtype=np.int32, count=5) # First five are header values
633 | # Needed to write header when saving weights
634 | self.header_info = header
635 |
636 | self.seen = header[3]
637 | weights = np.fromfile(fp, dtype=np.float32) # The rest are weights
638 | fp.close()
639 |
640 | ptr = 0
641 | for i, (module_def, module) in enumerate(zip(self.module_defs, self.module_list)):
642 | if module_def["type"] == "convolutional" or module_def["type"] == "yoloconvolutional":
643 | conv_layer = module[0]
644 | if module_def["batch_normalize"]:
645 | # Load BN bias, weights, running mean and running variance
646 | bn_layer = module[1]
647 | num_b = bn_layer.bias.numel() # Number of biases
648 | # Bias
649 | bn_b = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.bias)
650 | bn_layer.bias.data.copy_(bn_b)
651 | ptr += num_b
652 | # Weight
653 | bn_w = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.weight)
654 | bn_layer.weight.data.copy_(bn_w)
655 | ptr += num_b
656 | # Running Mean
657 | bn_rm = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.running_mean)
658 | bn_layer.running_mean.data.copy_(bn_rm)
659 | ptr += num_b
660 | # Running Var
661 | bn_rv = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.running_var)
662 | bn_layer.running_var.data.copy_(bn_rv)
663 | ptr += num_b
664 | else:
665 | # Load conv. bias
666 | num_b = conv_layer.bias.numel()
667 | conv_b = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(conv_layer.bias)
668 | conv_layer.bias.data.copy_(conv_b)
669 | ptr += num_b
670 | # Load conv. weights
671 | num_w = conv_layer.weight.numel()
672 | conv_w = torch.from_numpy(weights[ptr : ptr + num_w]).view_as(conv_layer.weight)
673 | conv_layer.weight.data.copy_(conv_w)
674 | ptr += num_w
675 |
676 | """
677 | @:param path - path of the new weights file
678 | @:param cutoff - save layers between 0 and cutoff (cutoff = -1 -> all are saved)
679 | """
680 |
681 | def save_weights(self, path, cutoff=-1):
682 |
683 | fp = open(path, "wb")
684 | self.header_info[3] = self.seen
685 | self.header_info.tofile(fp)
686 |
687 | # Iterate through layers
688 | for i, (module_def, module) in enumerate(zip(self.module_defs[:cutoff], self.module_list[:cutoff])):
689 | if module_def["type"] == "convolutional":
690 | conv_layer = module[0]
691 | # If batch norm, load bn first
692 | if module_def["batch_normalize"]:
693 | bn_layer = module[1]
694 | bn_layer.bias.data.cpu().numpy().tofile(fp)
695 | bn_layer.weight.data.cpu().numpy().tofile(fp)
696 | bn_layer.running_mean.data.cpu().numpy().tofile(fp)
697 | bn_layer.running_var.data.cpu().numpy().tofile(fp)
698 | # Load conv bias
699 | else:
700 | conv_layer.bias.data.cpu().numpy().tofile(fp)
701 | # Load conv weights
702 | conv_layer.weight.data.cpu().numpy().tofile(fp)
703 |
704 |         fp.close()
705 |
706 |
707 | if __name__ == "__main__":
708 | import torch
709 | import numpy as np
710 | torch.manual_seed(13)
711 | np.random.seed(13)
712 | torch.backends.cudnn.deterministic = True
713 | torch.backends.cudnn.benchmark = False
714 |
715 | model = Darknet()
716 | model.load_weights('./saved_models/yolov3.weights')
717 | # model.eval()
718 |
719 | image = torch.autograd.Variable(torch.randn(1, 3, 416, 416))
720 | output1, output2, output3 = model(image)
721 | print(output1)
722 | # print(output1.size(), output2.size(), output3.size())
723 | # print(model(image))
724 | # print(len(output), output[0].size(), output[1].size(), output[2].size())
725 |
--------------------------------------------------------------------------------
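`load_weights` in both Darknet classes above walks a flat float32 array with a running pointer: for every convolutional block it copies the BatchNorm bias, weight, running mean and running variance (or the conv bias when batch normalisation is absent), followed by the conv kernel weights. A minimal sketch of that pointer walk for a single conv + BN pair; `load_conv_bn` is a hypothetical helper, not part of darknet.py.

import numpy as np
import torch
import torch.nn as nn

def load_conv_bn(weights, ptr, conv, bn):
    # copy in the exact on-disk order: bn bias, bn weight, running mean,
    # running var, then the convolution kernel
    for param in (bn.bias, bn.weight, bn.running_mean, bn.running_var):
        n = param.numel()
        param.data.copy_(torch.from_numpy(weights[ptr:ptr + n]).view_as(param))
        ptr += n
    n = conv.weight.numel()
    conv.weight.data.copy_(torch.from_numpy(weights[ptr:ptr + n]).view_as(conv.weight))
    return ptr + n

conv, bn = nn.Conv2d(3, 16, 3, bias=False), nn.BatchNorm2d(16)
flat = np.random.randn(4 * 16 + conv.weight.numel()).astype(np.float32)
print(load_conv_bn(flat, 0, conv, bn))  # 496, the index of the next unread value

Keeping the copies in exactly this order is what lets a single offset track the file; any reordering would silently misalign every subsequent layer.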
/model/modulation.py:
--------------------------------------------------------------------------------
1 | from collections import OrderedDict
2 | import math
3 | import random
4 | import pprint
5 | import torch
6 | import torch.nn as nn
7 | import torch.nn.functional as F
8 | from torch.autograd import Variable
9 | import torchvision.models
10 | from torch.nn.init import kaiming_normal, kaiming_uniform_
11 | from .darknet import ConvBatchNormReLU, ConvBatchNormReLU_3d
12 |
13 | class Bottleneck(nn.Module):
14 | expansion = 4
15 |
16 | def __init__(self, inplanes, planes, stride=1):
17 | super().__init__()
18 |
19 | # all conv layers have stride 1. an avgpool is performed after the second convolution when stride > 1
20 | self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False)
21 | self.bn1 = nn.BatchNorm2d(planes)
22 |
23 | self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False)
24 | self.bn2 = nn.BatchNorm2d(planes)
25 |
26 | self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity()
27 |
28 | self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False)
29 | self.bn3 = nn.BatchNorm2d(planes * self.expansion)
30 |
31 | self.relu = nn.ReLU(inplace=True)
32 | self.downsample = None
33 | self.stride = stride
34 |
35 | if stride > 1 or inplanes != planes * Bottleneck.expansion:
36 | # downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1
37 | self.downsample = nn.Sequential(OrderedDict([
38 | ("-1", nn.AvgPool2d(stride)),
39 | ("0", nn.Conv2d(inplanes, planes * self.expansion, 1, stride=1, bias=False)),
40 | ("1", nn.BatchNorm2d(planes * self.expansion))
41 | ]))
42 |
43 | def forward(self, x: torch.Tensor):
44 | identity = x
45 |
46 | out = self.relu(self.bn1(self.conv1(x)))
47 | out = self.relu(self.bn2(self.conv2(out)))
48 | out = self.avgpool(out)
49 | out = self.bn3(self.conv3(out))
50 |
51 | if self.downsample is not None:
52 | identity = self.downsample(x)
53 |
54 | out += identity
55 | out = self.relu(out)
56 | return out
57 |
58 | class AttentionPool2d(nn.Module):
59 | def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None):
60 | super().__init__()
61 | self.positional_embedding = nn.Parameter(torch.randn(spacial_dim ** 2 + 1, embed_dim) / embed_dim ** 0.5)
62 | self.k_proj = nn.Linear(embed_dim, embed_dim)
63 | self.q_proj = nn.Linear(embed_dim, embed_dim)
64 | self.v_proj = nn.Linear(embed_dim, embed_dim)
65 | self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim)
66 | self.num_heads = num_heads
67 | self.embed_dim = embed_dim
68 | self.spacial_dim = spacial_dim
69 |
70 | def forward(self, x):
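        # Attention pooling over the spatial grid: a mean-pooled "class" token is
        # prepended, one multi-head attention pass is run over all positions, and the
        # method returns the pooled global feature plus the attended per-pixel map.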
71 | B, C, H, W = x.shape
72 | x = x.reshape(x.shape[0], x.shape[1], x.shape[2] * x.shape[3]).permute(2, 0, 1) # NCHW -> (HW)NC
73 | x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0) # (HW+1)NC
74 |
75 | cls_pos = self.positional_embedding[0:1, :]
76 | # spatial_pos = F.interpolate(self.positional_embedding[1:,].reshape(1, self.spacial_dim, self.spacial_dim, self.embed_dim).permute(0, 3, 1, 2), size=(H, W), mode='bilinear')
77 | spatial_pos = self.positional_embedding[1:].reshape(self.spacial_dim, self.spacial_dim, self.embed_dim)[:H, :W]
78 | spatial_pos = spatial_pos.reshape(-1, self.embed_dim)
79 | # spatial_pos = spatial_pos.reshape(self.embed_dim, H*W).permute(1, 0)
80 | positional_embedding = torch.cat([cls_pos, spatial_pos], dim=0)
81 |
82 | x = x + positional_embedding[:, None, :]
83 | x, _ = F.multi_head_attention_forward(
84 | query=x, key=x, value=x,
85 | embed_dim_to_check=x.shape[-1],
86 | num_heads=self.num_heads,
87 | q_proj_weight=self.q_proj.weight,
88 | k_proj_weight=self.k_proj.weight,
89 | v_proj_weight=self.v_proj.weight,
90 | in_proj_weight=None,
91 | in_proj_bias=torch.cat([self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]),
92 | bias_k=None,
93 | bias_v=None,
94 | add_zero_attn=False,
95 | dropout_p=0,
96 | out_proj_weight=self.c_proj.weight,
97 | out_proj_bias=self.c_proj.bias,
98 | use_separate_proj_weight=True,
99 | training=self.training,
100 | need_weights=False
101 | )
102 |
103 | x = x.permute(1, 2, 0)
104 | global_feat = x[:, :, 0]
105 | feature_map = x[:, :, 1:].reshape(B, -1, H, W)
106 | return global_feat, feature_map
107 |
108 | class CLIPResNetWithAttention(nn.Module):
109 | """
110 | A ResNet class that is similar to torchvision's but contains the following changes:
111 | - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool.
112 | - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1
113 | - The final pooling layer is a QKV attention instead of an average pool
114 | """
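    # Illustrative instantiation (the argument values below are assumptions, not the
    # project's configuration): ResNet-50-style depths with the default width.
    #   backbone = CLIPResNetWithAttention(layers=[3, 4, 6, 3], output_dim=1024, input_resolution=416)
    #   outs = backbone(torch.randn(2, 4, 416, 416))  # the stem's conv1 expects 4 input channels
    #   x_global, x_local = outs[-1]                  # attention-pooled global feature and spatial map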
115 |
116 | def __init__(self, layers, output_dim=1024, input_resolution=224, width=64, pretrained=None, att_level3=False, baseline=False, **kwargs):
117 | super().__init__()
118 | self.pretrained = pretrained
119 | self.output_dim = output_dim
120 | self.input_resolution = input_resolution
121 |
122 | # the 3-layer stem
123 | self.conv1 = nn.Conv2d(4, width // 2, kernel_size=3, stride=2, padding=1, bias=False)
124 | self.bn1 = nn.BatchNorm2d(width // 2)
125 | self.conv2 = nn.Conv2d(width // 2, width // 2, kernel_size=3, padding=1, bias=False)
126 | self.bn2 = nn.BatchNorm2d(width // 2)
127 | self.conv3 = nn.Conv2d(width // 2, width, kernel_size=3, padding=1, bias=False)
128 | self.bn3 = nn.BatchNorm2d(width)
129 | self.avgpool = nn.AvgPool2d(2)
130 | self.relu = nn.ReLU(inplace=True)
131 | self.reg = torch.nn.Sequential(
132 | nn.Conv2d(256, 1, kernel_size=1, padding=0, bias=False),
133 | nn.Sigmoid()
134 | )
135 | # residual layers
136 | self._inplanes = width # this is a *mutable* variable used during construction
137 | self.layer1 = self._make_layer(width, layers[0])
138 | self.layer2 = self._make_layer(width * 2, layers[1], stride=2)
139 | self.layer3 = self._make_layer(width * 4, layers[2], stride=2)
140 | self.layer4 = self._make_layer(width * 8, layers[3], stride=2)
141 |
142 | embed_dim = width * 32 # the ResNet feature dimension
143 | self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim, 32, output_dim)
144 | self.att_level3 = att_level3
145 | self.baseline = baseline
146 |
147 | def init_weights(self, pretrained=None):
148 | pretrained = pretrained or self.pretrained
149 | if isinstance(pretrained, str):
150 | checkpoint = torch.jit.load(pretrained, map_location='cpu').float().state_dict()
151 |
152 | state_dict = {}
153 |
154 | for k in checkpoint.keys():
155 | if k.startswith('visual.'):
156 | new_k = k.replace('visual.', '')
157 | state_dict[new_k] = checkpoint[k]
158 |
159 | if 'positional_embedding' in new_k:
160 | if self.attnpool.positional_embedding.shape != state_dict[new_k].shape:
161 | print(f'Resize the pos_embed shape from {state_dict[new_k].shape} to {self.attnpool.positional_embedding.shape}')
162 | cls_pos = state_dict[new_k][0:1, :]
163 | H = W = self.input_resolution // 32
164 | spatial_pos = F.interpolate(state_dict[new_k][1:,].reshape(1, 7, 7, cls_pos.shape[1]).permute(0, 3, 1, 2), size=(H, W), mode='bilinear')
165 | spatial_pos = spatial_pos.reshape(cls_pos.shape[1], H*W).permute(1, 0)
166 | positional_embedding = torch.cat([cls_pos, spatial_pos], dim=0)
167 | state_dict[new_k] = positional_embedding
168 | assert self.attnpool.positional_embedding.shape == state_dict[new_k].shape
169 |
170 | u, w = self.load_state_dict(state_dict, False)
171 | print(u, w, 'are misaligned params in CLIPResNet')
172 |
173 | def _make_layer(self, planes, blocks, stride=1):
174 | layers = [Bottleneck(self._inplanes, planes, stride)]
175 |
176 | self._inplanes = planes * Bottleneck.expansion
177 | for _ in range(1, blocks):
178 | layers.append(Bottleneck(self._inplanes, planes))
179 |
180 | return nn.Sequential(*layers)
181 |
182 | def forward(self, x):
183 | def stem(x):
184 | for conv, bn in [(self.conv1, self.bn1), (self.conv2, self.bn2), (self.conv3, self.bn3)]:
185 | x = self.relu(bn(conv(x)))
186 | x = self.avgpool(x)
187 | return x
188 |
189 | x = x.type(self.conv1.weight.dtype)
190 | x = stem(x)
191 |
192 | outs = []
193 | x = self.layer1(x)
194 | out1 = self.reg(x)
195 | outs.append(out1)
196 | x = self.layer2(x)
197 | outs.append(x)
198 | x = self.layer3(x)
199 | outs.append(x)
200 | x = self.layer4(x)
201 | outs.append(x)
202 |
203 | x_global, x_local = self.attnpool(x)
204 | outs.append([x_global, x_local])
205 | if self.att_level3:
206 | new_outs = [outs[0], outs[1], outs[2], outs[4][1], outs[4]]
207 | if self.baseline:
208 | new_outs = new_outs[:-1]
209 | return tuple(new_outs)
210 | else:
211 | return tuple(outs)
212 |
213 | class CLIPTextEncoder(nn.Module):
214 | def __init__(self, context_length=77,
215 | vocab_size=49408,
216 | transformer_width=512,
217 | transformer_heads=8,
218 | transformer_layers=12,
219 | embed_dim=512,
220 | out_dim=256,
221 | pretrained=None, **kwargs):
222 | super().__init__()
223 |
224 | self.pretrained = pretrained
225 |
226 | self.context_length = context_length
227 |
228 | self.transformer = Transformer(
229 | width=transformer_width,
230 | layers=transformer_layers,
231 | heads=transformer_heads,
232 | attn_mask=self.build_attention_mask()
233 | )
234 |
235 | self.vocab_size = vocab_size
236 | self.token_embedding = nn.Embedding(vocab_size, transformer_width)
237 | self.positional_embedding = nn.Parameter(torch.empty(self.context_length, transformer_width))
238 | self.ln_final = LayerNorm(transformer_width)
239 | self.text_projection = nn.Parameter(torch.empty(transformer_width, embed_dim))
240 |
241 | def init_weights(self, pretrained=None):
242 | pretrained = pretrained or self.pretrained
243 | if isinstance(pretrained, str):
244 | checkpoint = torch.jit.load(pretrained, map_location='cpu').float().state_dict()
245 |
246 | state_dict = {}
247 |
248 | for k in checkpoint.keys():
249 | if k.startswith('transformer.'):
250 | state_dict[k] = checkpoint[k]
251 |
252 | if k == 'positional_embedding' or k == 'text_projection' or k.startswith('token_embedding') or k.startswith('ln_final'):
253 | if k == 'positional_embedding' and checkpoint[k].size(0) > self.context_length:
254 | checkpoint[k] = checkpoint[k][:self.context_length]
255 |                         print('positional_embedding is truncated from 77 to', self.context_length)
256 | state_dict[k] = checkpoint[k]
257 |
258 | u, w = self.load_state_dict(state_dict, False)
259 | print(u, w, 'are misaligned params in text encoder')
260 |
261 |
262 | def build_attention_mask(self):
263 | # lazily create causal attention mask, with full attention between the vision tokens
264 | # pytorch uses additive attention mask; fill with -inf
265 | mask = torch.empty(self.context_length, self.context_length)
266 | mask.fill_(float("-inf"))
267 |         mask.triu_(1)  # zero out the diagonal and lower triangle; -inf remains strictly above the diagonal
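        # e.g. for context_length = 3 the additive mask is
        #   [[0., -inf, -inf],
        #    [0.,   0., -inf],
        #    [0.,   0.,   0.]]
        # so token i can only attend to tokens j <= i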
268 | return mask
269 |
270 | def forward(self, text):
271 | x = self.token_embedding(text) # [batch_size, n_ctx, d_model]
272 | #print(x.shape)
273 | #exit()
274 | x = x + self.positional_embedding
275 |
276 | x = x.permute(1, 0, 2) # NLD -> LND
277 | x = self.transformer(x)
278 | x = x.permute(1, 0, 2) # LND -> NLD
279 | x = self.ln_final(x)
280 | x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection
281 | #x = self.out_proj(x)
282 | return x
283 |
284 | class CLIPVisionTransformer(nn.Module):
285 | def __init__(self, input_resolution=224, patch_size=32, width=768, layers=3, heads=2, output_dim=512, out_indices=[0,1,2], pretrained=None, **kwargs):
286 | super().__init__()
287 | self.pretrained = pretrained
288 | self.input_resolution = input_resolution
289 | self.output_dim = output_dim
290 | self.conv1 = nn.Conv2d(in_channels=3, out_channels=width, kernel_size=patch_size, stride=patch_size, bias=False)
291 |
292 | scale = width ** -0.5
293 | self.class_embedding = nn.Parameter(scale * torch.randn(width))
294 | self.positional_embedding = nn.Parameter(scale * torch.randn((input_resolution // patch_size) ** 2 + 1, width))
295 | self.spatial_size = input_resolution // patch_size
296 | self.ln_pre = LayerNorm(width)
297 |
298 | self.transformer = Transformer(width, layers, heads)
299 |
300 | self.out_indices = out_indices
301 |
302 | self.ln_post = LayerNorm(width)
303 | self.proj = nn.Parameter(scale * torch.randn(width, output_dim))
304 |
305 | embed_dim = width
306 | if patch_size == 16:
307 | self.fpn1 = nn.Sequential(
308 | nn.GroupNorm(1, embed_dim),
309 | nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2),
310 | nn.BatchNorm2d(embed_dim),
311 | nn.GELU(),
312 | nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2),
313 | )
314 |
315 | self.fpn2 = nn.Sequential(
316 | nn.GroupNorm(1, embed_dim),
317 | nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2),
318 | )
319 |
320 | self.fpn3 = nn.GroupNorm(1, embed_dim)
321 |
322 | self.fpn4 = nn.Sequential(
323 | nn.GroupNorm(1, embed_dim),
324 | nn.MaxPool2d(kernel_size=2, stride=2)
325 | )
326 |
327 | elif patch_size == 8:
328 | self.fpn1 = nn.Sequential(
329 | nn.GroupNorm(1, embed_dim),
330 | nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2),
331 | )
332 |
333 | self.fpn2 = nn.GroupNorm(1, embed_dim)
334 |
335 | self.fpn3 = nn.Sequential(
336 | nn.GroupNorm(1, embed_dim),
337 | nn.MaxPool2d(kernel_size=2, stride=2),
338 | )
339 |
340 | self.fpn4 = nn.Sequential(
341 | nn.GroupNorm(1, embed_dim),
342 | nn.MaxPool2d(kernel_size=4, stride=4),
343 | )
344 |
345 |
346 | def init_weights(self, pretrained=None):
347 | pretrained = pretrained or self.pretrained
348 | if isinstance(pretrained, str):
349 | checkpoint = torch.jit.load(pretrained, map_location='cpu').float().state_dict()
350 |
351 | state_dict = {}
352 |
353 | for k in checkpoint.keys():
354 | if k.startswith('visual.'):
355 | new_k = k.replace('visual.', '')
356 | state_dict[new_k] = checkpoint[k]
357 |
358 | if 'positional_embedding' in state_dict.keys():
359 | if self.positional_embedding.shape != state_dict['positional_embedding'].shape:
360 | print(f'Resize the pos_embed shape from {state_dict["positional_embedding"].shape} to {self.positional_embedding.shape}')
361 | cls_pos = state_dict["positional_embedding"][0:1, :]
362 | spatial_pos = F.interpolate(state_dict["positional_embedding"][1:,].reshape(1, 14, 14, 768).permute(0, 3, 1, 2), size=(self.spatial_size, self.spatial_size), mode='bilinear')
363 | spatial_pos = spatial_pos.reshape(768, self.spatial_size*self.spatial_size).permute(1, 0)
364 | positional_embedding = torch.cat([cls_pos, spatial_pos], dim=0)
365 | state_dict['positional_embedding'] = positional_embedding
366 | assert self.positional_embedding.shape == state_dict['positional_embedding'].shape
367 |
368 | u, w = self.load_state_dict(state_dict, False)
369 | #print(u[0])
370 | print(u, w, 'are misaligned params in vision transformer')
371 |
372 | def forward(self, x: torch.Tensor):
373 |         # The patch-embedding conv is bypassed here: the input is assumed to already
374 |         # be an embedded feature map of shape [*, width, grid, grid].
375 |         # x = self.conv1(x)
376 | B, C, H, W = x.shape
377 |
378 | x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2]
379 | x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width]
380 | x = torch.cat([self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1) # shape = [*, grid ** 2 + 1, width]
381 |
382 |
383 | pos = self.positional_embedding.to(x.dtype)
384 | cls_pos = pos[0,:] + self.class_embedding.to(x.dtype)
385 | spatial_pos = F.interpolate(pos[1:,].reshape(1, self.spatial_size, self.spatial_size, C).permute(0, 3, 1, 2), size=(H, W), mode='bilinear')
386 | spatial_pos = spatial_pos.reshape(1, C, H*W).permute(0, 2, 1)
387 | pos = torch.cat([cls_pos.reshape(1, 1, C), spatial_pos], dim=1)
388 | x = x + pos
389 | x = self.ln_pre(x)
390 | x = x.permute(1, 0, 2) # NLD -> LND
391 |
392 | features = []
393 | for i, blk in enumerate(self.transformer.resblocks):
394 | x = blk(x)
395 | if i in self.out_indices:
396 | xp = x[1:,: , :].permute(1, 2, 0).reshape(B, -1, H, W)
397 | features.append(xp.contiguous())
398 |
399 | ops = [self.fpn1, self.fpn2, self.fpn3, self.fpn4]
400 | for i in range(len(features)):
401 | features[i] = ops[i](features[i])
402 |
403 | return tuple(features)
404 |
405 |
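# DropPath is referenced by ResidualAttentionBlock below but is neither imported nor
# defined in this file (it normally comes from timm). A minimal stochastic-depth
# sketch is provided here so that drop_path > 0 does not raise a NameError; with the
# default drop_path_rate of 0 it is never instantiated.
class DropPath(nn.Module):
    """Randomly drop the residual branch per sample (stochastic depth)."""
    def __init__(self, drop_prob=0.):
        super().__init__()
        self.drop_prob = drop_prob

    def forward(self, x):
        if self.drop_prob == 0. or not self.training:
            return x
        keep_prob = 1 - self.drop_prob
        # one Bernoulli sample per slice along the first dimension, broadcast over the rest
        shape = (x.shape[0],) + (1,) * (x.ndim - 1)
        mask = x.new_empty(shape).bernoulli_(keep_prob)
        return x * mask / keep_prob
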
406 | class ResidualAttentionBlock(nn.Module):
407 | def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None, drop_path=0.):
408 | super().__init__()
409 |
410 | self.attn = nn.MultiheadAttention(d_model, n_head)
411 | self.ln_1 = LayerNorm(d_model)
412 | self.mlp = nn.Sequential(OrderedDict([
413 | ("c_fc", nn.Linear(d_model, d_model * 4)),
414 | ("gelu", QuickGELU()),
415 | ("c_proj", nn.Linear(d_model * 4, d_model))
416 | ]))
417 | self.ln_2 = LayerNorm(d_model)
418 | self.attn_mask = attn_mask
419 |
420 | self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
421 |
422 | def attention(self, x: torch.Tensor):
423 | self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None
424 | return self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask)[0]
425 |
426 | def forward(self, x: torch.Tensor):
427 | x = x + self.drop_path(self.attention(self.ln_1(x)))
428 | x = x + self.drop_path(self.mlp(self.ln_2(x)))
429 | return x
430 |
431 | class QuickGELU(nn.Module):
432 | def forward(self, x: torch.Tensor):
433 | return x * torch.sigmoid(1.702 * x)
434 | class Transformer(nn.Module):
435 | def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None, drop_path_rate=0.):
436 | super().__init__()
437 | self.width = width
438 | self.layers = layers
439 | dpr = [x.item() for x in torch.linspace(0, drop_path_rate, layers)] # stochastic depth decay rule
440 | self.resblocks = nn.Sequential(*[ResidualAttentionBlock(width, heads, attn_mask, dpr[i]) for i in range(layers)])
441 |
442 | def forward(self, x: torch.Tensor):
443 | return self.resblocks(x)
444 |
445 | def init_modules(modules, init='uniform'):
446 | if init.lower() == 'normal':
447 |         init_params = kaiming_normal_
448 | elif init.lower() == 'uniform':
449 | init_params = kaiming_uniform_
450 | else:
451 | return
452 | for m in modules:
453 | if isinstance(m, (nn.Conv3d, nn.Conv2d, nn.Linear)):
454 | init_params(m.weight)
455 |
456 | def gelu(x):
457 | return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
458 |
459 | class FiLM(nn.Module):
460 | """
461 | A Feature-wise Linear Modulation Layer from
462 | 'FiLM: Visual Reasoning with a General Conditioning Layer'
463 | """
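    # Feature-wise affine modulation: y = gammas * x + betas. The callers in this file
    # pre-expand gammas/betas to x's spatial size, so no broadcasting is needed here
    # (the commented-out unsqueeze/expand below handles the unexpanded case).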
464 | def forward(self, x, gammas, betas):
465 | # gammas = gammas.unsqueeze(2).unsqueeze(3).expand_as(x)
466 | # betas = betas.unsqueeze(2).unsqueeze(3).expand_as(x)
467 | return (gammas * x) + betas
468 |
469 | def mask_softmax(attn_score, word_mask, tempuature=10., clssep=False, lstm=False):
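    # Masked softmax over word positions: scores are sharpened by `tempuature`, padding
    # positions are zeroed out via `word_mask`, and unless clssep=True the first and
    # last valid tokens (CLS/SEP-style markers; only the last when lstm=True) are also
    # dropped before the distribution is renormalized to sum to 1.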
470 | if len(attn_score.shape)!=2:
471 | attn_score = attn_score.squeeze(2).squeeze(2)
472 | word_mask_cp = word_mask[:,:attn_score.shape[1]].clone()
473 | score = F.softmax(attn_score*tempuature, dim=1)
474 | if not clssep:
475 | for ii in range(word_mask_cp.shape[0]):
476 | if lstm:
477 | word_mask_cp[ii,word_mask_cp[ii,:].sum()-1]=0
478 | else:
479 | word_mask_cp[ii,0]=0
480 |                 word_mask_cp[ii,word_mask_cp[ii,:].sum()]=0  ## index 0 was just zeroed, so .sum() now points at the last valid (SEP-style) token
481 | mask_score = score * word_mask_cp.float()
482 | mask_score = mask_score/(mask_score.sum(1)+1e-8).view(mask_score.size(0), 1).expand(mask_score.size(0), mask_score.size(1))
483 | return mask_score
484 |
485 | class FiLMedConvBlock_context(nn.Module):
486 | def __init__(self, with_residual=True, with_batchnorm=True,
487 | with_cond=[False], dropout=0, num_extra_channels=0, extra_channel_freq=1,
488 | with_input_proj=1, num_cond_maps=8, kernel_size=1, batchnorm_affine=False,
489 | num_layers=1, condition_method='bn-film', debug_every=float('inf'),
490 | textdim=768,visudim=512,contextdim=512,emb_size=512,fusion='prod',cont_map=False,
491 | lstm=False,baseline=False):
492 | super(FiLMedConvBlock_context, self).__init__()
493 |
494 | self.cont_map = cont_map ## mapping context with language feature
495 | self.lstm = lstm
496 | self.emb_size = emb_size
497 | self.with_residual = with_residual
498 | self.fusion = fusion
499 | self.baseline = baseline
500 | self.film = FiLM()
501 |
502 | if self.cont_map:
503 | self.sent_map = nn.Linear(768, emb_size)
504 | self.context_map = nn.Linear(emb_size, emb_size)
505 | if self.fusion == 'cat':
506 | self.attn_map = nn.Conv1d(textdim+visudim, emb_size//2, kernel_size=1)
507 | elif self.fusion == 'prod':
508 | assert(textdim==visudim) ## if product fusion
509 | self.attn_map = nn.Conv1d(visudim, emb_size//2, kernel_size=1)
510 |
511 | self.attn_score = nn.Conv1d(emb_size//2, 1, kernel_size=1)
512 | if self.baseline:
513 | self.fusion_layer = ConvBatchNormReLU(visudim+textdim+8, emb_size, 1, 1, 0, 1)
514 | else:
515 | self.gamme_decode = nn.Linear(textdim, 2 * emb_size)
516 | self.conv1 = nn.Conv2d(visudim+8, emb_size, kernel_size=1)
517 | # self.bn1 = nn.BatchNorm2d(emb_size)
518 | self.bn1 = nn.InstanceNorm2d(emb_size)
519 | init_modules(self.modules())
520 |
521 |
522 | def forward(self, fvisu, fword, context_score, fcoord,gest, textattn=None,weight=None,fsent=None,word_mask=None):
523 | fword = fword.permute(0, 2, 1)
524 | B, Dvisu, H, W = fvisu.size()
525 | B, Dlang, N = fword.size()
526 | B, N = context_score.size()
527 | assert(Dvisu==Dlang)
528 |
529 | if self.cont_map and fsent is not None:
530 | fsent = F.normalize(F.relu(self.sent_map(fsent)), p=2, dim=1)
531 | fcont = torch.matmul(context_score.view(B,1,N),fword.permute(0,2,1)).squeeze(1)
532 | fcontext = F.relu(self.context_map(fsent*fcont)).unsqueeze(2).repeat(1,1,N)
533 | ## word attention
534 | tile_visu = torch.mean(fvisu.view(B, Dvisu, -1),dim=2,keepdim=True).repeat(1,1,N)
535 | if self.fusion == 'cat':
536 | context_tile = torch.cat([tile_visu,\
537 | fword, fcontext], dim=1)
538 | elif self.fusion == 'prod':
539 | context_tile = tile_visu * \
540 | fword * fcontext
541 | else:
542 | ## word attention
543 | tile_visu = torch.mean(fvisu.view(B, Dvisu, -1),dim=2,keepdim=True).repeat(1,1,N)
544 | if self.fusion == 'cat':
545 | context_tile = torch.cat([tile_visu,\
546 | fword * context_score.view(B, 1, N).repeat(1, Dlang, 1,)], dim=1)
547 | elif self.fusion == 'prod':
548 | context_tile = tile_visu * \
549 | fword * context_score.view(B, 1, N).repeat(1, Dlang, 1,)
550 | #print(context_tile.shape)
551 | #print(tile_visu.shape)
552 |
553 |         attn_feat = torch.tanh(self.attn_map(context_tile))
554 | attn_score = self.attn_score(attn_feat).squeeze(1)
555 | mask_score = mask_softmax(attn_score,word_mask,lstm=self.lstm)
556 | attn_lang = torch.matmul(mask_score.view(B,1,N),fword.permute(0,2,1))
557 | attn_lang = attn_lang.view(B,Dlang).squeeze(1)
558 |
559 | if self.baseline:
560 | fmodu = self.fusion_layer(torch.cat([fvisu,\
561 | attn_lang.unsqueeze(2).unsqueeze(2).repeat(1,1,fvisu.shape[-1],fvisu.shape[-1]),fcoord],dim=1))
562 | else:
563 | ## lang-> gamma, beta
564 | film_param = self.gamme_decode(attn_lang)
565 | film_param = film_param.view(B,2*self.emb_size,1,1).repeat(1,1,H,W)
566 | #print(film_param.shape)
567 | gammas, betas = torch.split(film_param, self.emb_size, dim=1)
568 |
569 |             gammas, betas = torch.tanh(gammas), torch.tanh(betas)
570 | #gest = F.tanh(gest)
571 | # GEST LANGUAGE FUSION
572 | # gammas = gammas * gest.repeat(1,512,1,1).detach()
573 | # betas = betas * gest.repeat(1,512,1,1).detach()
574 |
575 | ## modulate visu feature
576 | fmodu = self.bn1(self.conv1(torch.cat([fvisu,fcoord],dim=1)))
577 | #print(fmodu.shape)
578 | #print(gammas.shape)
579 | #print(betas.shape)
580 | #exit()
581 | fmodu = self.film(fmodu, gammas, betas)
582 | fmodu = F.relu(fmodu)
583 | if self.with_residual:
584 | if weight is None:
585 | fmodu = fvisu + fmodu
586 | else:
587 | weight = weight.view(B,1,1,1).repeat(1, Dvisu, H, W)
588 | fmodu = (1-weight)*fvisu + weight*fmodu
589 | return fmodu, attn_lang, attn_score
590 |
591 | class LayerNorm(nn.LayerNorm):
592 | """Subclass torch's LayerNorm to handle fp16."""
593 |
594 | def forward(self, x: torch.Tensor):
595 | orig_type = x.dtype
596 | ret = super().forward(x.type(torch.float32))
597 | return ret.type(orig_type)
598 |
599 | class FiLMedConvBlock_multihop(nn.Module):
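    # Stacks NFilm FiLMedConvBlock_context hops: after each hop, the accumulated word
    # attention is subtracted from the context (the (1 - score) term in forward), so
    # later hops are pushed towards words that earlier hops did not attend to.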
600 | def __init__(self, NFilm=2, with_residual=True, with_batchnorm=True,
601 | with_cond=[False], dropout=0, num_extra_channels=0, extra_channel_freq=1,
602 | with_input_proj=1, num_cond_maps=8, kernel_size=1, batchnorm_affine=False,
603 | num_layers=1, condition_method='bn-film', debug_every=float('inf'),
604 | textdim=768,visudim=512,emb_size=512,fusion='cat',intmd=False,lstm=False,erasing=0.):
605 | super(FiLMedConvBlock_multihop, self).__init__()
606 |
607 | self.NFilm = NFilm
608 | self.emb_size = emb_size
609 | self.with_residual = with_residual
610 | self.cont_size = emb_size
611 | self.fusion = fusion
612 | self.intmd = intmd
613 | self.lstm = lstm
614 | self.erasing = erasing
615 | if self.fusion=='cat':
616 | self.cont_size = emb_size*2
617 |
618 | self.modulesdict = nn.ModuleDict()
619 | modules = OrderedDict()
620 | modules["film0"] = FiLMedConvBlock_context(textdim=textdim,visudim=emb_size,contextdim=emb_size,emb_size=emb_size,fusion=fusion,lstm=self.lstm)
621 | for n in range(1,NFilm):
622 | modules["conv%d"%n] = ConvBatchNormReLU(emb_size, emb_size, 3, 1, 1, 1)
623 | modules["film%d"%n] = FiLMedConvBlock_context(textdim=textdim,visudim=emb_size,contextdim=self.cont_size,emb_size=emb_size,fusion=fusion,lstm=self.lstm)
624 | self.modulesdict.update(modules)
625 |
626 | def forward(self, fvisu, fword, fcoord,gest = None, weight=None,fsent=None,word_mask=None):
627 | B, Dvisu, H, W = fvisu.size()
628 | B, N, Dlang = fword.size()
629 | intmd_feat, attnscore_list = [], []
630 |
631 | x, _, attn_score = self.modulesdict["film0"](fvisu, fword, Variable(torch.ones(B,N).cuda()), fcoord,gest, fsent=fsent,word_mask=word_mask)
632 | attnscore_list.append(attn_score.view(B,N,1,1))
633 | if self.intmd:
634 | intmd_feat.append(x)
635 | if self.NFilm==1:
636 | intmd_feat = [x]
637 | for n in range(1,self.NFilm):
638 | score_list = [mask_softmax(score.squeeze(2).squeeze(2),word_mask,lstm=self.lstm) for score in attnscore_list]
639 |
640 | score = torch.clamp(torch.max(torch.stack(score_list, dim=1), dim=1, keepdim=False)[0],min=0.,max=1.)
641 | x = self.modulesdict["conv%d"%n](x)
642 | x, _, attn_score = self.modulesdict["film%d"%n](x, fword, (1-score), fcoord,gest, fsent=fsent,word_mask=word_mask)
643 | attnscore_list.append(attn_score.view(B,N,1,1)) ## format match div loss in main func
644 | if self.intmd:
645 | intmd_feat.append(x)
646 | elif n==self.NFilm-1:
647 | intmd_feat = [x]
648 | return intmd_feat, attnscore_list
649 |
650 | class Vector(nn.Module):
651 | def __init__(self, input_resolution=224, patch_size=32, width=768, layers=3, heads=2, output_dim=3, out_indices=[0,1,2], pretrained=None, **kwargs):
652 | super().__init__()
653 | self.pretrained = pretrained
654 | self.input_resolution = input_resolution
655 | self.output_dim = output_dim
656 | self.conv1 = nn.Conv2d(in_channels=6, out_channels=width, kernel_size=patch_size, stride=patch_size, bias=False)
657 |
658 | scale = width ** -0.5
659 | self.class_embedding = nn.Parameter(scale * torch.randn(width))
660 | self.positional_embedding = nn.Parameter(scale * torch.randn((input_resolution // patch_size) ** 2 + 1, width))
661 | self.spatial_size = input_resolution // patch_size
662 | self.ln_pre = LayerNorm(width)
663 |
664 | self.transformer = Transformer(width, layers, heads)
665 |
666 | self.out_indices = out_indices
667 |
668 | self.ln_post = LayerNorm(width)
669 | self.proj = nn.Parameter(scale * torch.randn(width, output_dim))
670 |
671 | embed_dim = width
672 |
673 | def forward(self, x: torch.Tensor):
674 | x = self.conv1(x) # shape = [*, width, grid, grid]
675 | B, C, H, W = x.shape
676 |
677 | x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2]
678 | x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width]
679 | x = torch.cat([self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1) # shape = [*, grid ** 2 + 1, width]
680 |
681 |
682 | pos = self.positional_embedding.to(x.dtype)
683 | cls_pos = pos[0,:] + self.class_embedding.to(x.dtype)
684 | spatial_pos = F.interpolate(pos[1:,].reshape(1, self.spatial_size, self.spatial_size, C).permute(0, 3, 1, 2), size=(H, W), mode='bilinear')
685 | spatial_pos = spatial_pos.reshape(1, C, H*W).permute(0, 2, 1)
686 | pos = torch.cat([cls_pos.reshape(1, 1, C), spatial_pos], dim=1)
687 | x = x + pos
688 | x = self.ln_pre(x)
689 | x = x.permute(1, 0, 2) # NLD -> LND
690 | x = self.transformer(x)
691 | x = self.ln_post(x[0,:,:]) @ self.proj
692 | return x
693 |
694 | class MLP(nn.Module):
695 | def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
696 | super().__init__()
697 | self.num_layers = num_layers
698 | h = [hidden_dim] * (num_layers - 1)
699 | self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
700 |
701 | def forward(self, x):
702 | for i, layer in enumerate(self.layers):
703 | x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
704 | return x
705 |
706 |
707 |
708 |
709 |
710 | if __name__ == "__main__":
711 | import torch
712 | import numpy as np
713 |
714 | vect = Vector()
715 |
716 |     # Vector's conv1 expects a 6-channel input (e.g. two stacked 3-channel maps)
717 |     inp = torch.randn(1, 6, 512, 512)
718 |     output = vect(inp)
719 |     print(output)
720 | # print(output1.size(), output2.size(), output3.size())
721 | # print(model(image))
722 | # print(len(output), output[0].size(), output[1].size(), output[2].size())
723 |
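724 |     # Smoke test of the MLP head above; the dimensions are illustrative
725 |     # assumptions rather than values used elsewhere in this repository.
726 |     mlp = MLP(input_dim=256, hidden_dim=256, output_dim=4, num_layers=3)
727 |     print(mlp(torch.randn(2, 256)).shape)  # torch.Size([2, 4])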
--------------------------------------------------------------------------------