├── doc ├── frame.png └── intro.png ├── saved_models └── yolov3_weights.sh ├── model ├── __pycache__ │ ├── clip.cpython-39.pyc │ ├── loss.cpython-36.pyc │ ├── loss.cpython-38.pyc │ ├── loss.cpython-39.pyc │ ├── convlstm.cpython-36.pyc │ ├── convlstm.cpython-38.pyc │ ├── convlstm.cpython-39.pyc │ ├── darknet.cpython-36.pyc │ ├── darknet.cpython-38.pyc │ ├── darknet.cpython-39.pyc │ ├── modulation.cpython-36.pyc │ ├── modulation.cpython-38.pyc │ ├── modulation.cpython-39.pyc │ ├── grounding_model.cpython-36.pyc │ ├── grounding_model.cpython-38.pyc │ ├── grounding_model.cpython-39.pyc │ └── grounding_modelbest.cpython-38.pyc ├── convlstm.py ├── loss.py ├── yolov3.cfg ├── grounding_model.py ├── darknet.py └── modulation.py ├── utils ├── __pycache__ │ ├── utils.cpython-36.pyc │ ├── utils.cpython-38.pyc │ ├── utils.cpython-39.pyc │ ├── __init__.cpython-36.pyc │ ├── __init__.cpython-38.pyc │ ├── __init__.cpython-39.pyc │ ├── checkpoint.cpython-36.pyc │ ├── checkpoint.cpython-38.pyc │ ├── checkpoint.cpython-39.pyc │ ├── transforms.cpython-36.pyc │ ├── transforms.cpython-38.pyc │ ├── transforms.cpython-39.pyc │ ├── word_utils.cpython-36.pyc │ ├── word_utils.cpython-38.pyc │ ├── word_utils.cpython-39.pyc │ ├── parsing_metrics.cpython-36.pyc │ ├── parsing_metrics.cpython-38.pyc │ └── parsing_metrics.cpython-39.pyc ├── __init__.py ├── losses.py ├── misc_utils.py ├── checkpoint.py ├── word_utils.py ├── utils.py ├── parsing_metrics.py ├── transforms.py ├── transformsv2.py └── temp.py ├── dataset ├── __pycache__ │ ├── data_loader.cpython-36.pyc │ ├── data_loader.cpython-38.pyc │ └── data_loader.cpython-39.pyc ├── data_loaderv2.py └── data_loader.py ├── ln_data └── README.md ├── README.md └── evaluation_results.py /doc/frame.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/doc/frame.png -------------------------------------------------------------------------------- /doc/intro.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/doc/intro.png -------------------------------------------------------------------------------- /saved_models/yolov3_weights.sh: -------------------------------------------------------------------------------- 1 | #wget -P saved_models https://pjreddie.com/media/files/yolov3.weights -------------------------------------------------------------------------------- /model/__pycache__/clip.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/model/__pycache__/clip.cpython-39.pyc -------------------------------------------------------------------------------- /model/__pycache__/loss.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/model/__pycache__/loss.cpython-36.pyc -------------------------------------------------------------------------------- /model/__pycache__/loss.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/model/__pycache__/loss.cpython-38.pyc -------------------------------------------------------------------------------- /model/__pycache__/loss.cpython-39.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/model/__pycache__/loss.cpython-39.pyc -------------------------------------------------------------------------------- /utils/__pycache__/utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/utils/__pycache__/utils.cpython-36.pyc -------------------------------------------------------------------------------- /utils/__pycache__/utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/utils/__pycache__/utils.cpython-38.pyc -------------------------------------------------------------------------------- /utils/__pycache__/utils.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/utils/__pycache__/utils.cpython-39.pyc -------------------------------------------------------------------------------- /model/__pycache__/convlstm.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/model/__pycache__/convlstm.cpython-36.pyc -------------------------------------------------------------------------------- /model/__pycache__/convlstm.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/model/__pycache__/convlstm.cpython-38.pyc -------------------------------------------------------------------------------- /model/__pycache__/convlstm.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/model/__pycache__/convlstm.cpython-39.pyc -------------------------------------------------------------------------------- /model/__pycache__/darknet.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/model/__pycache__/darknet.cpython-36.pyc -------------------------------------------------------------------------------- /model/__pycache__/darknet.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/model/__pycache__/darknet.cpython-38.pyc -------------------------------------------------------------------------------- /model/__pycache__/darknet.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/model/__pycache__/darknet.cpython-39.pyc -------------------------------------------------------------------------------- /utils/__pycache__/__init__.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/utils/__pycache__/__init__.cpython-36.pyc -------------------------------------------------------------------------------- /utils/__pycache__/__init__.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/utils/__pycache__/__init__.cpython-38.pyc -------------------------------------------------------------------------------- /utils/__pycache__/__init__.cpython-39.pyc: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/utils/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- /model/__pycache__/modulation.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/model/__pycache__/modulation.cpython-36.pyc -------------------------------------------------------------------------------- /model/__pycache__/modulation.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/model/__pycache__/modulation.cpython-38.pyc -------------------------------------------------------------------------------- /model/__pycache__/modulation.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/model/__pycache__/modulation.cpython-39.pyc -------------------------------------------------------------------------------- /utils/__pycache__/checkpoint.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/utils/__pycache__/checkpoint.cpython-36.pyc -------------------------------------------------------------------------------- /utils/__pycache__/checkpoint.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/utils/__pycache__/checkpoint.cpython-38.pyc -------------------------------------------------------------------------------- /utils/__pycache__/checkpoint.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/utils/__pycache__/checkpoint.cpython-39.pyc -------------------------------------------------------------------------------- /utils/__pycache__/transforms.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/utils/__pycache__/transforms.cpython-36.pyc -------------------------------------------------------------------------------- /utils/__pycache__/transforms.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/utils/__pycache__/transforms.cpython-38.pyc -------------------------------------------------------------------------------- /utils/__pycache__/transforms.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/utils/__pycache__/transforms.cpython-39.pyc -------------------------------------------------------------------------------- /utils/__pycache__/word_utils.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/utils/__pycache__/word_utils.cpython-36.pyc -------------------------------------------------------------------------------- /utils/__pycache__/word_utils.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/utils/__pycache__/word_utils.cpython-38.pyc 
-------------------------------------------------------------------------------- /utils/__pycache__/word_utils.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/utils/__pycache__/word_utils.cpython-39.pyc -------------------------------------------------------------------------------- /dataset/__pycache__/data_loader.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/dataset/__pycache__/data_loader.cpython-36.pyc -------------------------------------------------------------------------------- /dataset/__pycache__/data_loader.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/dataset/__pycache__/data_loader.cpython-38.pyc -------------------------------------------------------------------------------- /dataset/__pycache__/data_loader.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/dataset/__pycache__/data_loader.cpython-39.pyc -------------------------------------------------------------------------------- /model/__pycache__/grounding_model.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/model/__pycache__/grounding_model.cpython-36.pyc -------------------------------------------------------------------------------- /model/__pycache__/grounding_model.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/model/__pycache__/grounding_model.cpython-38.pyc -------------------------------------------------------------------------------- /model/__pycache__/grounding_model.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/model/__pycache__/grounding_model.cpython-39.pyc -------------------------------------------------------------------------------- /utils/__pycache__/parsing_metrics.cpython-36.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/utils/__pycache__/parsing_metrics.cpython-36.pyc -------------------------------------------------------------------------------- /utils/__pycache__/parsing_metrics.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/utils/__pycache__/parsing_metrics.cpython-38.pyc -------------------------------------------------------------------------------- /utils/__pycache__/parsing_metrics.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/utils/__pycache__/parsing_metrics.cpython-39.pyc -------------------------------------------------------------------------------- /ln_data/README.md: -------------------------------------------------------------------------------- 1 | # Dataset 2 | Download the YouRefIt dataset from [Dataset Request Page](https://yixchen.github.io/YouRefIt/request.html) and put here. 
-------------------------------------------------------------------------------- /model/__pycache__/grounding_modelbest.cpython-38.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/SooLab/REP-ERU/HEAD/model/__pycache__/grounding_modelbest.cpython-38.pyc -------------------------------------------------------------------------------- /utils/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | # ----------------------------------------------------------------------------- 3 | # Copyright (c) Edgar Andrés Margffoy-Tuay, Emilio Botero and Juan Camilo Pérez 4 | # 5 | # Licensed under the terms of the MIT License 6 | # (see LICENSE for details) 7 | # ----------------------------------------------------------------------------- 8 | 9 | """Misc data and other helping utillites.""" 10 | 11 | from .word_utils import Corpus 12 | from .transforms import ResizeImage, ResizeAnnotation 13 | 14 | Corpus 15 | ResizeImage 16 | ResizeAnnotation 17 | 18 | 19 | class AverageMeter(object): 20 | """Computes and stores the average and current value""" 21 | 22 | def __init__(self): 23 | self.reset() 24 | 25 | def reset(self): 26 | self.val = 0 27 | self.avg = 0 28 | self.sum = 0 29 | self.count = 0 30 | 31 | def update(self, val, n=1): 32 | self.val = val 33 | self.sum += val * n 34 | self.count += n 35 | self.avg = self.sum / self.count 36 | -------------------------------------------------------------------------------- /utils/losses.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Custom loss function definitions. 5 | """ 6 | 7 | import torch.nn as nn 8 | import torch.nn.functional as F 9 | 10 | 11 | class IoULoss(nn.Module): 12 | """ 13 | Creates a criterion that computes the Intersection over Union (IoU) 14 | between a segmentation mask and its ground truth. 15 | 16 | Rahman, M.A. and Wang, Y: 17 | Optimizing Intersection-Over-Union in Deep Neural Networks for 18 | Image Segmentation. International Symposium on Visual Computing (2016) 19 | http://www.cs.umanitoba.ca/~ywang/papers/isvc16.pdf 20 | """ 21 | 22 | def __init__(self, size_average=True): 23 | super().__init__() 24 | self.size_average = size_average 25 | 26 | def forward(self, input, target): 27 | input = F.sigmoid(input) 28 | intersection = (input * target).sum() 29 | union = ((input + target) - (input * target)).sum() 30 | iou = intersection / union 31 | iou_dual = input.size(0) - iou 32 | if self.size_average: 33 | iou_dual = iou_dual / input.size(0) 34 | return iou_dual 35 | -------------------------------------------------------------------------------- /utils/misc_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Misc download and visualization helper functions and class wrappers. 
5 | """ 6 | 7 | import sys 8 | import time 9 | import torch 10 | from visdom import Visdom 11 | 12 | 13 | def reporthook(count, block_size, total_size): 14 | global start_time 15 | if count == 0: 16 | start_time = time.time() 17 | return 18 | duration = time.time() - start_time 19 | progress_size = int(count * block_size) 20 | speed = int(progress_size / (1024 * duration)) 21 | percent = min(int(count * block_size * 100 / total_size), 100) 22 | sys.stdout.write("\r...%d%%, %d MB, %d KB/s, %d seconds passed" % 23 | (percent, progress_size / (1024 * 1024), speed, duration)) 24 | sys.stdout.flush() 25 | 26 | 27 | class VisdomWrapper(Visdom): 28 | def __init__(self, *args, env=None, **kwargs): 29 | Visdom.__init__(self, *args, **kwargs) 30 | self.env = env 31 | self.plots = {} 32 | 33 | def init_line_plot(self, name, 34 | X=torch.zeros((1,)).cpu(), 35 | Y=torch.zeros((1,)).cpu(), **opts): 36 | self.plots[name] = self.line(X=X, Y=Y, env=self.env, opts=opts) 37 | 38 | def plot_line(self, name, **kwargs): 39 | self.line(win=self.plots[name], env=self.env, **kwargs) 40 | -------------------------------------------------------------------------------- /utils/checkpoint.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import numpy as np 4 | import torch 5 | import torch.nn.functional as F 6 | from collections import OrderedDict 7 | 8 | def save_checkpoint(state, is_best, args, filename='default'): 9 | if filename=='default': 10 | filename = 'filmconv_nofpn32_%s_batch%d'%(args.dataset,args.batch_size) 11 | 12 | checkpoint_name = './saved_models/%s_checkpoint.pth.tar'%(filename) 13 | best_name = './saved_models/%s_model_best.pth.tar'%(filename) 14 | torch.save(state, checkpoint_name) 15 | if is_best: 16 | shutil.copyfile(checkpoint_name, best_name) 17 | 18 | def load_pretrain(model, args, logging): 19 | if os.path.isfile(args.pretrain): 20 | checkpoint = torch.load(args.pretrain) 21 | #print(checkpoint.items()) 22 | pretrained_dict = checkpoint['state_dict'] 23 | #print(pretrained_dict) 24 | 25 | # new_state_dict = OrderedDict() 26 | # for k, v in pretrained_dict.items(): # k为module.xxx.weight, v为权重 27 | # name = k[7:] # 截取`module.`后面的xxx.weight 28 | # new_state_dict[name] = v 29 | 30 | model_dict = model.state_dict() 31 | pretrained_dict = {k: v for k, v in pretrained_dict.items() if k in model_dict} 32 | #pretrained_dict = {k: v for k, v in new_state_dict.items() if k in model_dict} 33 | 34 | 35 | assert (len([k for k, v in pretrained_dict.items()])!=0) 36 | model_dict.update(pretrained_dict) 37 | 38 | model.load_state_dict(model_dict) 39 | #model.load_state_dict(new_state_dict) 40 | print("=> loaded pretrain model at {}" 41 | .format(args.pretrain)) 42 | logging.info("=> loaded pretrain model at {}" 43 | .format(args.pretrain)) 44 | del checkpoint # dereference seems crucial 45 | torch.cuda.empty_cache() 46 | else: 47 | print(("=> no pretrained file found at '{}'".format(args.pretrain))) 48 | logging.info("=> no pretrained file found at '{}'".format(args.pretrain)) 49 | return model 50 | 51 | def load_resume(model, args, logging): 52 | if os.path.isfile(args.resume): 53 | print(("=> loading checkpoint '{}'".format(args.resume))) 54 | logging.info("=> loading checkpoint '{}'".format(args.resume)) 55 | checkpoint = torch.load(args.resume) 56 | args.start_epoch = checkpoint['epoch'] 57 | best_loss = checkpoint['best_loss'] 58 | model.load_state_dict(checkpoint['state_dict']) 59 | print(("=> loaded checkpoint (epoch {}) Loss{}" 60 
| .format(checkpoint['epoch'], best_loss))) 61 | logging.info("=> loaded checkpoint (epoch {}) Loss{}" 62 | .format(checkpoint['epoch'], best_loss)) 63 | del checkpoint # dereference seems crucial 64 | torch.cuda.empty_cache() 65 | else: 66 | print(("=> no checkpoint found at '{}'".format(args.resume))) 67 | logging.info(("=> no checkpoint found at '{}'".format(args.resume))) 68 | return model -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Spatial and Visual Perspective-Taking via View Rotation and Relation Reasoning for Embodied Reference Understanding 2 | 3 | by [Cheng Shi](https://github.com/ChengShiest/) and [Sibei Yang](https://sibeiyang.github.io/) 4 | 5 | European Conference on Computer Vision (ECCV), 2022 6 | 7 | ## Introduction 8 | 9 | Embodied Reference Understanding studies the reference understanding in an embodied fashion, where a receiver requires to locate a target object referred to by both language and gesture of the sender in a shared physical environment. Its main challenge lies in how to make the receiver with the egocentric view access spatial and visual information relative to the sender to judge how objects are oriented around and seen from the sender, i.e., spatial and visual perspective-taking. In this paper, we propose a REasoning from your Perspective (REP) method to tackle the challenge by modeling relations between the receiver and the sender as well as the sender and the objects via the proposed novel view rotation and relation reasoning. Specifically, view rotation first rotates the receiver to the position of the sender by constructing an embodied 3D coordinate system with the position of the sender as the origin. Then, it changes the orientation of the receiver to the orientation of the sender by encoding the body orientation and gesture of the sender. Relation reasoning models both the nonverbal and verbal relations between the sender and the objects by multi-modal cooperative reasoning in gesture, language, visual content, and spatial position. 10 | 11 | 12 |
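In essence, the view-rotation step is a change of reference frame. The sketch below is only an illustration of that geometric idea and not the code under `model/`; `object_pos`, `sender_pos` and `sender_dir` are hypothetical inputs standing in for the quantities described above.

```
import numpy as np

def view_rotate(object_pos, sender_pos, sender_dir):
    """Re-express points in an embodied frame centred on the sender.

    object_pos: (N, 3) object centres in the receiver's (camera) frame.
    sender_pos: (3,) sender position in the same frame.
    sender_dir: (3,) unit vector of the sender's body/pointing orientation.
    """
    # 1) Translate so the sender becomes the origin of the embodied frame.
    rel = object_pos - sender_pos[None, :]
    # 2) Rotate about the vertical (z) axis so the sender faces the +y axis.
    yaw = np.arctan2(sender_dir[0], sender_dir[1])
    c, s = np.cos(yaw), np.sin(yaw)
    R = np.array([[c, -s, 0.0],
                  [s,  c, 0.0],
                  [0.0, 0.0, 1.0]])
    return rel @ R.T

# With the sender at (1, 1, 0) facing +y, an object at (2, 1, 0) maps to
# (1, 0, 0), i.e. one unit to the sender's right in the rotated frame.
print(view_rotate(np.array([[2.0, 1.0, 0.0]]),
                  np.array([1.0, 1.0, 0.0]),
                  np.array([0.0, 1.0, 0.0])))
```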

13 | ![intro](doc/intro.png) 14 | 15 | 16 | ## Framework 17 | 18 | ![framework](doc/frame.png) 19 | 20 |

21 | 22 | ## Dataset 23 | Download the YouRefIt dataset from [Dataset Request Page](https://yixchen.github.io/YouRefIt/request.html) and put under ```./ln_data``` 24 | 25 | ## Model weights 26 | * [Yolov3](https://pjreddie.com/media/files/yolov3.weights): download the pretrained model and place the file in ``./saved_models`` by 27 | ``` 28 | sh saved_models/yolov3_weights.sh 29 | ``` 30 | 31 | Make sure to put the files in the following structure: 32 | 33 | ``` 34 | |-- ROOT 35 | | |-- ln_data 36 | | |-- yourefit 37 | | |-- images 38 | | |-- paf 39 | | |-- saliency 40 | ``` 41 | 42 | ## Training and Evaluation 43 | The training and evaluation script is the same as [YouRefIt](https://github.com/yixchen/YouRefIt_ERU) 44 | 45 | ## Checklist 46 | 47 | + [x] code 48 | + [ ] pre-process data 49 | 50 | ### Citation 51 | 52 | @inproceedings{shi2022spatial, 53 | title={Spatial and Visual Perspective-Taking via View Rotation and Relation Reasoning for Embodied Reference Understanding}, 54 | author={Shi, Cheng and Yang, Sibei}, 55 | booktitle={Computer Vision--ECCV 2022: 17th European Conference, Tel Aviv, Israel, October 23--27, 2022, Proceedings, Part XXXVI}, 56 | pages={201--218}, 57 | year={2022}, 58 | organization={Springer} 59 | } 60 | 61 | ### Acknowledgement 62 | Our code is built on [ReSC](https://github.com/zyang-ur/ReSC) and [YouRefIt](https://github.com/yixchen/YouRefIt_ERU), we thank the authors for their hard work. 63 | 64 | 65 | -------------------------------------------------------------------------------- /utils/word_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Language-related data loading helper functions and class wrappers. 5 | """ 6 | 7 | import re 8 | import torch 9 | import codecs 10 | 11 | UNK_TOKEN = '' 12 | PAD_TOKEN = '' 13 | END_TOKEN = '' 14 | SENTENCE_SPLIT_REGEX = re.compile(r'(\W+)') 15 | 16 | 17 | class Dictionary(object): 18 | def __init__(self): 19 | self.word2idx = {} 20 | self.idx2word = [] 21 | 22 | def add_word(self, word): 23 | if word not in self.word2idx: 24 | self.idx2word.append(word) 25 | self.word2idx[word] = len(self.idx2word) - 1 26 | return self.word2idx[word] 27 | 28 | def __len__(self): 29 | return len(self.idx2word) 30 | 31 | def __getitem__(self, a): 32 | if isinstance(a, int): 33 | return self.idx2word[a] 34 | elif isinstance(a, list): 35 | return [self.idx2word[x] for x in a] 36 | elif isinstance(a, str): 37 | return self.word2idx[a] 38 | else: 39 | raise TypeError("Query word/index argument must be int or str") 40 | 41 | def __contains__(self, word): 42 | return word in self.word2idx 43 | 44 | 45 | class Corpus(object): 46 | def __init__(self): 47 | self.dictionary = Dictionary() 48 | 49 | def set_max_len(self, value): 50 | self.max_len = value 51 | 52 | def load_file(self, filename): 53 | with codecs.open(filename, 'r', 'utf-8') as f: 54 | for line in f: 55 | line = line.strip() 56 | self.add_to_corpus(line) 57 | self.dictionary.add_word(UNK_TOKEN) 58 | self.dictionary.add_word(PAD_TOKEN) 59 | 60 | def add_to_corpus(self, line): 61 | """Tokenizes a text line.""" 62 | # Add words to the dictionary 63 | words = line.split() 64 | # tokens = len(words) 65 | for word in words: 66 | word = word.lower() 67 | self.dictionary.add_word(word) 68 | 69 | def tokenize(self, line, max_len=20): 70 | # Tokenize line contents 71 | words = SENTENCE_SPLIT_REGEX.split(line.strip()) 72 | # words = [w.lower() for w in words if len(w) > 0] 73 | words = [w.lower() for w in words 
if (len(w) > 0 and w!=' ')] ## do not include space as a token 74 | 75 | if words[-1] == '.': 76 | words = words[:-1] 77 | 78 | if max_len > 0: 79 | if len(words) > max_len: 80 | words = words[:max_len] 81 | elif len(words) < max_len: 82 | # words = [PAD_TOKEN] * (max_len - len(words)) + words 83 | words = words + [END_TOKEN] + [PAD_TOKEN] * (max_len - len(words) - 1) 84 | 85 | tokens = len(words) ## for end token 86 | ids = torch.LongTensor(tokens) 87 | token = 0 88 | for word in words: 89 | if word not in self.dictionary: 90 | word = UNK_TOKEN 91 | # print(word, type(word), word.encode('ascii','ignore').decode('ascii'), type(word.encode('ascii','ignore').decode('ascii'))) 92 | if type(word)!=type('a'): 93 | print(word, type(word), word.encode('ascii','ignore').decode('ascii'), type(word.encode('ascii','ignore').decode('ascii'))) 94 | word = word.encode('ascii','ignore').decode('ascii') 95 | ids[token] = self.dictionary[word] 96 | token += 1 97 | # ids[token] = self.dictionary[END_TOKEN] 98 | return ids 99 | 100 | def __len__(self): 101 | return len(self.dictionary) 102 | -------------------------------------------------------------------------------- /evaluation_results.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | import cv2 4 | import pickle5 as pickle 5 | import torch 6 | import json 7 | def bbox_iou(box1, box2, x1y1x2y2=True): 8 | """ 9 | Returns the IoU of two bounding boxes 10 | """ 11 | box1 = torch.tensor(box1) 12 | box2 = torch.tensor(box2) 13 | if x1y1x2y2: 14 | # Get the coordinates of bounding boxes 15 | b1_x1, b1_y1, b1_x2, b1_y2 = box1[ 0], box1[ 1], box1[ 2], box1[ 3] 16 | b2_x1, b2_y1, b2_x2, b2_y2 = box2[ 0], box2[ 1], box2[ 2], box2[ 3] 17 | else: 18 | # Transform from center and width to exact coordinates 19 | b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2 20 | b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2 21 | b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2 22 | b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2 23 | 24 | # get the coordinates of the intersection rectangle 25 | inter_rect_x1 = torch.max(b1_x1, b2_x1) 26 | inter_rect_y1 = torch.max(b1_y1, b2_y1) 27 | inter_rect_x2 = torch.min(b1_x2, b2_x2) 28 | inter_rect_y2 = torch.min(b1_y2, b2_y2) 29 | # Intersection area 30 | inter_area = torch.clamp(inter_rect_x2 - inter_rect_x1, 0) * torch.clamp(inter_rect_y2 - inter_rect_y1, 0) 31 | # Union Area 32 | b1_area = (b1_x2 - b1_x1) * (b1_y2 - b1_y1) 33 | b2_area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1) 34 | 35 | # print(box1, box1.shape) 36 | # print(box2, box2.shape) 37 | return inter_area / (b1_area + b2_area - inter_area + 1e-16) 38 | 39 | # Given 2 bounding boxes, return their IoU 40 | def bb_IoU(bb1,bb2): 41 | 42 | Area1 = abs(bb1[2] - bb1[0]) * abs(bb1[3]-bb1[1]) 43 | Area2 = abs(bb2[2] - bb2[0]) * abs(bb2[3]-bb2[1]) 44 | 45 | xA = max(bb1[0],bb2[0]) 46 | yA = max(bb1[1],bb2[1]) 47 | xB = min(bb1[2],bb2[2]) 48 | yB = min(bb1[3],bb2[3]) 49 | 50 | intersection = max(0, xB - xA) * max(0, yB - yA) 51 | IoU = intersection / (Area1 + Area2 - intersection + 1e-16) 52 | 53 | return(IoU) 54 | 55 | def Area(bb1, image): 56 | area1 = abs(bb1[2] - bb1[0]) * abs(bb1[3]-bb1[1]) 57 | return area1/image 58 | 59 | def evaluation(image_path, gt_path, predict_path): 60 | yolopred = dict() 61 | 62 | with open("ln_data/yourefit/test_id.txt", "r") as f: 63 | test_id_list = f.readlines() 64 | test_id_list = 
[x.strip('\n') for x in test_id_list] 65 | print(test_id_list) 66 | 67 | with open("ln_data/yourefit/train_id.txt", "r") as f: 68 | train_id_list = f.readlines() 69 | train_id_list = [x.strip('\n') for x in train_id_list] 70 | 71 | 72 | 73 | TP= dict() 74 | TP['all'] = np.zeros((3,)) 75 | TP['s'] = np.zeros((3,)) 76 | TP['m'] = np.zeros((3,)) 77 | TP['l'] = np.zeros((3,)) 78 | 79 | FP= dict() 80 | FP['all'] = np.zeros((3,)) 81 | FP['s'] = np.zeros((3,)) 82 | FP['m'] = np.zeros((3,)) 83 | FP['l'] = np.zeros((3,)) 84 | gt_boxes = [] 85 | for ind, pattern in enumerate(test_id_list): 86 | img = cv2.imread(os.path.join(image_path, pattern+'.jpg')) 87 | H,W,_ = img.shape 88 | pickle_name = os.path.join(gt_path, pattern+'.p') 89 | gt = pickle.load(open( pickle_name, "rb" )) 90 | ground_truth_box = gt['bbox'] 91 | gt_boxes.append(ground_truth_box) 92 | # read prediction file (Need to change based on input) 93 | pred_pickle = os.path.join(predict_path, pattern+'.jpg.p') 94 | pred = pickle.load(open(pred_pickle, "rb" )) 95 | predicted_box = pred[0] 96 | # 97 | yolopred[test_id_list[ind]] = predicted_box 98 | for ind, IoU in enumerate([0.25, 0.5, 0.75] ): 99 | if bbox_iou(predicted_box,ground_truth_box) >= IoU: 100 | TP['all'][ind] +=1 101 | if 100*Area(ground_truth_box, H*W) < 0.48: 102 | TP['s'][ind] += 1 103 | else: 104 | if 100*Area(ground_truth_box, H*W) < 1.75: 105 | TP['m'][ind] += 1 106 | else: 107 | TP['l'][ind] += 1 108 | else: 109 | FP['all'][ind] +=1 110 | if 100*Area(ground_truth_box, H*W) < 0.48: 111 | FP['s'][ind] += 1 112 | else: 113 | if 100*Area(ground_truth_box, H*W) < 1.75: 114 | FP['m'][ind] += 1 115 | else: 116 | FP['l'][ind] += 1 117 | 118 | for ind, IoU in enumerate([0.25, 0.5, 0.75]): 119 | print('Accuracy =',TP['all'][ind]/(TP['all'][ind]+FP['all'][ind])) 120 | print('Small Accuracy =',TP['s'][ind]/(TP['s'][ind]+FP['s'][ind]), 'in', TP['s'][ind]+FP['s'][ind], 'samples') 121 | print('Medium Accuracy =',TP['m'][ind]/(TP['m'][ind]+FP['m'][ind]), 'in', TP['m'][ind]+FP['m'][ind], 'samples') 122 | print('Large Accuracy =',TP['l'][ind]/(TP['l'][ind]+FP['l'][ind]), 'in', TP['l'][ind]+FP['l'][ind], 'samples') 123 | 124 | if __name__ == "__main__": 125 | 126 | image_path= 'ln_data/yourefit/images' 127 | gt_path= 'ln_data/yourefit/pickle' 128 | predict_path = 'test/test_final' 129 | evaluation(image_path, gt_path, predict_path) 130 | -------------------------------------------------------------------------------- /utils/utils.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import cv2 4 | import numpy as np 5 | import torch 6 | import torch.nn.functional as F 7 | 8 | class AverageMeter(object): 9 | """Computes and stores the average and current value""" 10 | def __init__(self): 11 | self.reset() 12 | 13 | def reset(self): 14 | self.val = 0 15 | self.avg = 0 16 | self.sum = 0 17 | self.count = 0 18 | 19 | def update(self, val, n=1): 20 | self.val = val 21 | self.sum += val * n 22 | self.count += n 23 | self.avg = self.sum / self.count 24 | 25 | def xyxy2xywh(x): # Convert bounding box format from [x1, y1, x2, y2] to [x, y, w, h] 26 | y = torch.zeros(x.shape) if x.dtype is torch.float32 else np.zeros(x.shape) 27 | y[:, 0] = (x[:, 0] + x[:, 2]) / 2 28 | y[:, 1] = (x[:, 1] + x[:, 3]) / 2 29 | y[:, 2] = x[:, 2] - x[:, 0] 30 | y[:, 3] = x[:, 3] - x[:, 1] 31 | return y 32 | 33 | 34 | def xywh2xyxy(x): # Convert bounding box format from [x, y, w, h] to [x1, y1, x2, y2] 35 | y = torch.zeros(x.shape) if x.dtype is torch.float32 else 
np.zeros(x.shape) 36 | y[:, 0] = (x[:, 0] - x[:, 2] / 2) 37 | y[:, 1] = (x[:, 1] - x[:, 3] / 2) 38 | y[:, 2] = (x[:, 0] + x[:, 2] / 2) 39 | y[:, 3] = (x[:, 1] + x[:, 3] / 2) 40 | return y 41 | 42 | def bbox_iou_numpy(box1, box2): 43 | """Computes IoU between bounding boxes. 44 | Parameters 45 | ---------- 46 | box1 : ndarray 47 | (N, 4) shaped array with bboxes 48 | box2 : ndarray 49 | (M, 4) shaped array with bboxes 50 | Returns 51 | ------- 52 | : ndarray 53 | (N, M) shaped array with IoUs 54 | """ 55 | area = (box2[:, 2] - box2[:, 0]) * (box2[:, 3] - box2[:, 1]) 56 | 57 | iw = np.minimum(np.expand_dims(box1[:, 2], axis=1), box2[:, 2]) - np.maximum( 58 | np.expand_dims(box1[:, 0], 1), box2[:, 0] 59 | ) 60 | ih = np.minimum(np.expand_dims(box1[:, 3], axis=1), box2[:, 3]) - np.maximum( 61 | np.expand_dims(box1[:, 1], 1), box2[:, 1] 62 | ) 63 | 64 | iw = np.maximum(iw, 0) 65 | ih = np.maximum(ih, 0) 66 | 67 | ua = np.expand_dims((box1[:, 2] - box1[:, 0]) * (box1[:, 3] - box1[:, 1]), axis=1) + area - iw * ih 68 | 69 | ua = np.maximum(ua, np.finfo(float).eps) 70 | 71 | intersection = iw * ih 72 | 73 | return intersection / ua 74 | 75 | 76 | def bbox_iou(box1, box2, x1y1x2y2=True): 77 | """ 78 | Returns the IoU of two bounding boxes 79 | """ 80 | if x1y1x2y2: 81 | # Get the coordinates of bounding boxes 82 | b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3] 83 | b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3] 84 | else: 85 | # Transform from center and width to exact coordinates 86 | b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2 87 | b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2 88 | b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2 89 | b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] + box2[:, 3] / 2 90 | 91 | # get the coordinates of the intersection rectangle 92 | inter_rect_x1 = torch.max(b1_x1, b2_x1) 93 | inter_rect_y1 = torch.max(b1_y1, b2_y1) 94 | inter_rect_x2 = torch.min(b1_x2, b2_x2) 95 | inter_rect_y2 = torch.min(b1_y2, b2_y2) 96 | # Intersection area 97 | inter_area = torch.clamp(inter_rect_x2 - inter_rect_x1, 0) * torch.clamp(inter_rect_y2 - inter_rect_y1, 0) 98 | # Union Area 99 | b1_area = (b1_x2 - b1_x1) * (b1_y2 - b1_y1) 100 | b2_area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1) 101 | 102 | # print(box1, box1.shape) 103 | # print(box2, box2.shape) 104 | return inter_area / (b1_area + b2_area - inter_area + 1e-16) 105 | 106 | def multiclass_metrics(pred, gt): 107 | """ 108 | check precision and recall for predictions. 109 | Output: overall = {precision, recall, f1} 110 | """ 111 | eps=1e-6 112 | overall = {'precision': -1, 'recall': -1, 'f1': -1} 113 | NP, NR, NC = 0, 0, 0 # num of pred, num of recall, num of correct 114 | for ii in range(pred.shape[0]): 115 | pred_ind = np.array(pred[ii]>0.5, dtype=int) 116 | gt_ind = np.array(gt[ii]>0.5, dtype=int) 117 | inter = pred_ind * gt_ind 118 | # add to overall 119 | NC += np.sum(inter) 120 | NP += np.sum(pred_ind) 121 | NR += np.sum(gt_ind) 122 | if NP > 0: 123 | overall['precision'] = float(NC)/NP 124 | if NR > 0: 125 | overall['recall'] = float(NC)/NR 126 | if NP > 0 and NR > 0: 127 | overall['f1'] = 2*overall['precision']*overall['recall']/(overall['precision']+overall['recall']+eps) 128 | return overall 129 | 130 | def compute_ap(recall, precision): 131 | """ Compute the average precision, given the recall and precision curves. 132 | Code originally from https://github.com/rbgirshick/py-faster-rcnn. 
133 | # Arguments 134 | recall: The recall curve (list). 135 | precision: The precision curve (list). 136 | # Returns 137 | The average precision as computed in py-faster-rcnn. 138 | """ 139 | # correct AP calculation 140 | # first append sentinel values at the end 141 | mrec = np.concatenate(([0.0], recall, [1.0])) 142 | mpre = np.concatenate(([0.0], precision, [0.0])) 143 | 144 | # compute the precision envelope 145 | for i in range(mpre.size - 1, 0, -1): 146 | mpre[i - 1] = np.maximum(mpre[i - 1], mpre[i]) 147 | 148 | # to calculate area under PR curve, look for points 149 | # where X axis (recall) changes value 150 | i = np.where(mrec[1:] != mrec[:-1])[0] 151 | 152 | # and sum (\Delta recall) * prec 153 | ap = np.sum((mrec[i + 1] - mrec[i]) * mpre[i + 1]) 154 | return ap 155 | -------------------------------------------------------------------------------- /utils/parsing_metrics.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import os 4 | 5 | # from plot_util import plot_confusion_matrix 6 | # from makemask import * 7 | 8 | def _fast_hist(label_true, label_pred, n_class): 9 | mask = (label_true >= 0) & (label_true < n_class) 10 | hist = np.bincount( 11 | n_class * label_true[mask].astype(int) + 12 | label_pred[mask], minlength=n_class ** 2).reshape(n_class, n_class) 13 | return hist 14 | 15 | def label_accuracy_score(label_trues, label_preds, n_class, bg_thre=200): 16 | """Returns accuracy score evaluation result. 17 | - overall accuracy 18 | - mean accuracy 19 | - mean IU 20 | - fwavacc 21 | """ 22 | hist = np.zeros((n_class, n_class)) 23 | for lt, lp in zip(label_trues, label_preds): 24 | # hist += _fast_hist(lt.flatten(), lp.flatten(), n_class) 25 | hist += _fast_hist(lt[lt 0] * iu[freq > 0]).sum() 33 | return acc, acc_cls, mean_iu, fwavacc 34 | 35 | def label_confusion_matrix(label_trues, label_preds, n_class, bg_thre=200): 36 | # eps=1e-20 37 | hist=np.zeros((n_class,n_class),dtype=float) 38 | """ (8,256,256), (256,256) """ 39 | for lt,lp in zip(label_trues, label_preds): 40 | # hist += _fast_hist(lt.flatten(), lp.flatten(), n_class) 41 | hist += _fast_hist(lt[lt 0] * iu[freq > 0]).sum() 72 | return acc, acc_cls, mean_iu, fwavacc, iu 73 | 74 | # if __name__ == '__main__': 75 | # """ Evaluating from saved png segmentation maps 76 | # 0.862723060822 0.608076070823 0.503493670787 0.76556929118 77 | # """ 78 | # import csv 79 | # from PIL import Image 80 | # import matplotlib as mpl 81 | # mpl.use('Agg') 82 | # from matplotlib import pyplot as plt 83 | # eps=1e-20 84 | 85 | # class AverageMeter(object): 86 | # """Computes and stores the average and current value""" 87 | # def __init__(self): 88 | # self.reset() 89 | 90 | # def reset(self): 91 | # self.val = 0 92 | # self.avg = 0 93 | # self.sum = 0 94 | # self.count = 0 95 | 96 | # def update(self, val, n=1): 97 | # self.val = val 98 | # self.sum += val * n 99 | # self.count += n 100 | # self.avg = self.sum / self.count 101 | # def load_csv(csv_file): 102 | # img_list, kpt_list, conf_list=[],[],[] 103 | # with open(csv_file, 'rb') as f: 104 | # reader = csv.reader(f) 105 | # for row in reader: 106 | # img_list.append(row[0]) 107 | # kpt_list.append([row[i] for i in range(1,len(row)) if i%3!=0]) 108 | # conf_list.append([row[i] for i in range(1,len(row)) if i%3==0]) 109 | # # print len(img_list),len(kpt_list[0]),len(conf_list[0]) 110 | # return img_list,kpt_list,conf_list 111 | 112 | # n_class = 7 113 | # superpixel_smooth = False 114 | # # valfile = 
'../../ln_data/LIP/TrainVal_pose_annotations/lip_val_set.csv' 115 | # # pred_folder = '../../../git_code/LIP_JPPNet/output/parsing/val/' 116 | # # pred_folder = '../visulizations/refinenet_baseline/test_out/' 117 | # pred_folder = '../visulizations/refinenet_splittask/test_out/' 118 | # gt_folder = '../../ln_data/pascal_data/SegmentationPart/' 119 | # img_path = '../../ln_data/pascal_data/JPEGImages/' 120 | 121 | # file = '../../ln_data/pascal_data/val_id.txt' 122 | # missjoints = '../../ln_data/pascal_data/no_joint_list.txt' 123 | # img_list = [x.strip().split(' ')[0] for x in open(file)] 124 | # miss_list = [x.strip().split(' ')[0] for x in open(missjoints)] 125 | 126 | # conf_matrices = AverageMeter() 127 | # for index in range(len(img_list)): 128 | # img_name = img_list[index] 129 | # if img_name in miss_list: 130 | # continue 131 | # if not os.path.isfile(pred_folder + img_name + '.png'): 132 | # continue 133 | # pred_file = pred_folder + img_name + '.png' 134 | # pred = Image.open(pred_file) 135 | # gt_file = gt_folder + img_name + '.png' 136 | # gt = Image.open(gt_file) 137 | # pred, gt = np.array(pred, dtype=np.int32), np.array(gt, dtype=np.int32) 138 | # if superpixel_smooth: 139 | # img_file = img_path+img_name+'.jpg' 140 | # img = Image.open(img_file) 141 | # pred = superpixel_expand(np.array(img),pred) 142 | # confusion, _ = label_confusion_matrix(gt, pred, n_class) 143 | # conf_matrices.update(confusion,1) 144 | # acc, acc_cls, mean_iu, fwavacc, iu = hist_based_accu_cal(conf_matrices.avg) 145 | # print(acc, acc_cls, mean_iu, fwavacc) 146 | # print(iu) 147 | 148 | # ## SAVE CONFUSION MATRIX 149 | # figure=plt.figure() 150 | # class_name=['bg', 'head', 'torso', 'upper arm', 'lower arm', 'upper leg', 'lower leg'] 151 | # conf_matrices = conf_matrices.avg 152 | # for i in range(n_class): 153 | # conf_matrices[i,:]=(conf_matrices[i,:]+eps)/sum(conf_matrices[i,:]+eps) 154 | # plot_confusion_matrix(conf_matrices, classes=class_name, 155 | # rotation=0, include_text=True, 156 | # title='Confusion matrix, without normalization') 157 | # plt.show() 158 | # plt.savefig('../saved_models/Baseline_refinenet_test.jpg') 159 | # plt.close('all') 160 | -------------------------------------------------------------------------------- /model/convlstm.py: -------------------------------------------------------------------------------- 1 | import torch.nn as nn 2 | from torch.autograd import Variable 3 | import torch 4 | 5 | """ 6 | https://github.com/ndrplz/ConvLSTM_pytorch 7 | """ 8 | 9 | class ConvLSTMCell(nn.Module): 10 | 11 | def __init__(self, input_size, input_dim, hidden_dim, kernel_size, bias): 12 | """ 13 | Initialize ConvLSTM cell. 14 | 15 | Parameters 16 | ---------- 17 | input_size: (int, int) 18 | Height and width of input tensor as (height, width). 19 | input_dim: int 20 | Number of channels of input tensor. 21 | hidden_dim: int 22 | Number of channels of hidden state. 23 | kernel_size: (int, int) 24 | Size of the convolutional kernel. 25 | bias: bool 26 | Whether or not to add the bias. 
27 | """ 28 | 29 | super(ConvLSTMCell, self).__init__() 30 | 31 | self.height, self.width = input_size 32 | self.input_dim = input_dim 33 | self.hidden_dim = hidden_dim 34 | 35 | self.kernel_size = kernel_size 36 | self.padding = kernel_size[0] // 2, kernel_size[1] // 2 37 | self.bias = bias 38 | 39 | self.conv = nn.Conv2d(in_channels=self.input_dim + self.hidden_dim, 40 | out_channels=4 * self.hidden_dim, 41 | kernel_size=self.kernel_size, 42 | padding=self.padding, 43 | bias=self.bias) 44 | 45 | def forward(self, input_tensor, cur_state): 46 | 47 | h_cur, c_cur = cur_state 48 | 49 | combined = torch.cat([input_tensor, h_cur], dim=1) # concatenate along channel axis 50 | 51 | combined_conv = self.conv(combined) 52 | cc_i, cc_f, cc_o, cc_g = torch.split(combined_conv, self.hidden_dim, dim=1) 53 | i = torch.sigmoid(cc_i) 54 | f = torch.sigmoid(cc_f) 55 | o = torch.sigmoid(cc_o) 56 | g = torch.tanh(cc_g) 57 | 58 | c_next = f * c_cur + i * g 59 | h_next = o * torch.tanh(c_next) 60 | 61 | return h_next, c_next 62 | 63 | def init_hidden(self, batch_size): 64 | return (Variable(torch.zeros(batch_size, self.hidden_dim, self.height, self.width)).cuda(), 65 | Variable(torch.zeros(batch_size, self.hidden_dim, self.height, self.width)).cuda()) 66 | 67 | 68 | class ConvLSTM(nn.Module): 69 | 70 | def __init__(self, input_size, input_dim, hidden_dim, kernel_size, num_layers, 71 | batch_first=False, bias=True, return_all_layers=False): 72 | super(ConvLSTM, self).__init__() 73 | 74 | self._check_kernel_size_consistency(kernel_size) 75 | 76 | # Make sure that both `kernel_size` and `hidden_dim` are lists having len == num_layers 77 | kernel_size = self._extend_for_multilayer(kernel_size, num_layers) 78 | hidden_dim = self._extend_for_multilayer(hidden_dim, num_layers) 79 | if not len(kernel_size) == len(hidden_dim) == num_layers: 80 | raise ValueError('Inconsistent list length.') 81 | 82 | self.height, self.width = input_size 83 | 84 | self.input_dim = input_dim 85 | self.hidden_dim = hidden_dim 86 | self.kernel_size = kernel_size 87 | self.num_layers = num_layers 88 | self.batch_first = batch_first 89 | self.bias = bias 90 | self.return_all_layers = return_all_layers 91 | 92 | cell_list = [] 93 | for i in range(0, self.num_layers): 94 | cur_input_dim = self.input_dim if i == 0 else self.hidden_dim[i-1] 95 | 96 | cell_list.append(ConvLSTMCell(input_size=(self.height, self.width), 97 | input_dim=cur_input_dim, 98 | hidden_dim=self.hidden_dim[i], 99 | kernel_size=self.kernel_size[i], 100 | bias=self.bias)) 101 | 102 | self.cell_list = nn.ModuleList(cell_list) 103 | 104 | def forward(self, input_tensor, hidden_state=None): 105 | """ 106 | 107 | Parameters 108 | ---------- 109 | input_tensor: todo 110 | 5-D Tensor either of shape (t, b, c, h, w) or (b, t, c, h, w) 111 | hidden_state: todo 112 | None. 
todo implement stateful 113 | 114 | Returns 115 | ------- 116 | last_state_list, layer_output 117 | """ 118 | if not self.batch_first: 119 | # (t, b, c, h, w) -> (b, t, c, h, w) 120 | input_tensor = input_tensor.permute(1, 0, 2, 3, 4) 121 | 122 | # Implement stateful ConvLSTM 123 | if hidden_state is not None: 124 | raise NotImplementedError() 125 | else: 126 | hidden_state = self._init_hidden(batch_size=input_tensor.size(0)) 127 | 128 | layer_output_list = [] 129 | last_state_list = [] 130 | 131 | seq_len = input_tensor.size(1) 132 | cur_layer_input = input_tensor 133 | 134 | for layer_idx in range(self.num_layers): 135 | 136 | h, c = hidden_state[layer_idx] 137 | output_inner = [] 138 | for t in range(seq_len): 139 | 140 | h, c = self.cell_list[layer_idx](input_tensor=cur_layer_input[:, t, :, :, :], 141 | cur_state=[h, c]) 142 | output_inner.append(h) 143 | 144 | layer_output = torch.stack(output_inner, dim=1) 145 | cur_layer_input = layer_output 146 | 147 | layer_output_list.append(layer_output) 148 | last_state_list.append([h, c]) 149 | 150 | if not self.return_all_layers: 151 | layer_output_list = layer_output_list[-1:] 152 | last_state_list = last_state_list[-1:] 153 | 154 | return layer_output_list, last_state_list 155 | 156 | def _init_hidden(self, batch_size): 157 | init_states = [] 158 | for i in range(self.num_layers): 159 | init_states.append(self.cell_list[i].init_hidden(batch_size)) 160 | return init_states 161 | 162 | @staticmethod 163 | def _check_kernel_size_consistency(kernel_size): 164 | if not (isinstance(kernel_size, tuple) or 165 | (isinstance(kernel_size, list) and all([isinstance(elem, tuple) for elem in kernel_size]))): 166 | raise ValueError('`kernel_size` must be tuple or list of tuples') 167 | 168 | @staticmethod 169 | def _extend_for_multilayer(param, num_layers): 170 | if not isinstance(param, list): 171 | param = [param] * num_layers 172 | return param -------------------------------------------------------------------------------- /model/loss.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import torch 3 | import torch.nn.functional as F 4 | from torch.autograd import Variable 5 | from model.modulation import mask_softmax 6 | from utils.utils import bbox_iou 7 | import math 8 | from torchvision.ops.boxes import box_area 9 | def lr_poly(base_lr, iter, max_iter, power): 10 | return base_lr * ((1 - float(iter) / max_iter) ** (power)) 11 | 12 | def lr_cos(base_lr, iter, max_iter, warm_up=0.05): 13 | warm_up_epoch = int(max_iter*warm_up) 14 | if iter<=warm_up_epoch: 15 | lr = base_lr*(0.8*iter/warm_up_epoch+0.2) 16 | else: 17 | lr = 0.5*base_lr*(1+math.cos(math.pi*(iter-warm_up_epoch)/(max_iter-warm_up_epoch))) 18 | return lr 19 | 20 | def adjust_learning_rate(args, optimizer, i_iter): 21 | # print(optimizer.param_groups[0]['lr'], optimizer.param_groups[1]['lr']) 22 | if args.power==-1: 23 | lr = lr_cos(args.lr, i_iter, args.nb_epoch) 24 | elif args.power==-2: 25 | lr = args.lr*((0.5)**(i_iter//10)) 26 | elif args.power==-3: 27 | lr = args.lr*((0.5)**(i_iter//30)) 28 | elif args.power!=0.: 29 | lr = lr_poly(args.lr, i_iter, args.nb_epoch, args.power) 30 | else: 31 | # lr = args.lr*((0.1)**(i_iter//(args.nb_epoch//4))) 32 | lr = args.lr*((0.5)**(i_iter//(args.nb_epoch//10))) 33 | print(lr) 34 | optimizer.param_groups[0]['lr'] = lr 35 | if len(optimizer.param_groups) > 1: 36 | optimizer.param_groups[1]['lr'] = lr / 10 37 | if len(optimizer.param_groups) > 2: 38 | optimizer.param_groups[2]['lr'] = lr / 
10 39 | 40 | def yolo_loss(input, target, gi, gj, best_n_list, w_coord=5., w_neg=1./5, size_average=True): 41 | mseloss = torch.nn.MSELoss(size_average=True) 42 | celoss = torch.nn.CrossEntropyLoss(size_average=True) 43 | batch = input.size(0) 44 | 45 | pred_bbox = Variable(torch.zeros(batch,4).cuda()) 46 | gt_bbox = Variable(torch.zeros(batch,4).cuda()) 47 | for ii in range(batch): 48 | pred_bbox[ii, 0:2] = F.sigmoid(input[ii,best_n_list[ii],0:2,gj[ii],gi[ii]]) 49 | pred_bbox[ii, 2:4] = input[ii,best_n_list[ii],2:4,gj[ii],gi[ii]] 50 | gt_bbox[ii, :] = target[ii,best_n_list[ii],:4,gj[ii],gi[ii]] 51 | loss_x = mseloss(pred_bbox[:,0], gt_bbox[:,0]) 52 | loss_y = mseloss(pred_bbox[:,1], gt_bbox[:,1]) 53 | loss_w = mseloss(pred_bbox[:,2], gt_bbox[:,2]) 54 | loss_h = mseloss(pred_bbox[:,3], gt_bbox[:,3]) 55 | 56 | pred_conf_list, gt_conf_list = [], [] 57 | pred_conf_list.append(input[:,:,4,:,:].contiguous().view(batch,-1)) 58 | gt_conf_list.append(target[:,:,4,:,:].contiguous().view(batch,-1)) 59 | pred_conf = torch.cat(pred_conf_list, dim=1) 60 | gt_conf = torch.cat(gt_conf_list, dim=1) 61 | loss_conf = celoss(pred_conf, gt_conf.max(1)[1]) 62 | 63 | 64 | return (loss_x+loss_y+loss_w+loss_h)*w_coord + loss_conf 65 | 66 | def generalized_box_iou(boxes1, boxes2): 67 | """ 68 | Generalized IoU from https://giou.stanford.edu/ 69 | 70 | The boxes should be in [x0, y0, x1, y1] format 71 | 72 | Returns a [N, M] pairwise matrix, where N = len(boxes1) 73 | and M = len(boxes2) 74 | """ 75 | # degenerate boxes gives inf / nan results 76 | # so do an early check 77 | if (boxes1[:, 2:] >= boxes1[:, :2]).all() and (boxes2[:, 2:] >= boxes2[:, :2]).all(): 78 | 79 | iou, union = box_iou(boxes1, boxes2) 80 | 81 | lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) 82 | rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) 83 | 84 | wh = (rb - lt).clamp(min=0) # [N,M,2] 85 | area = wh[:, :, 0] * wh[:, :, 1] 86 | 87 | return iou - (area - union) / (area) 88 | else: 89 | return torch.tensor([0.]) 90 | def box_iou(boxes1, boxes2): 91 | area1 = box_area(boxes1) 92 | area2 = box_area(boxes2) 93 | 94 | lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] 95 | rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] 96 | 97 | wh = (rb - lt).clamp(min=0) # [N,M,2] 98 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] 99 | 100 | union = area1[:, None] + area2 - inter 101 | 102 | iou = inter / union 103 | return iou, union 104 | def diverse_loss(score_list, word_mask, m=-1, coverage_reg=True): 105 | score_matrix = torch.stack([mask_softmax(score,word_mask) for score in score_list], dim=1) ## (B,Nfilm,N,H,W) 106 | cov_matrix = torch.bmm(score_matrix,score_matrix.permute(0,2,1)) ## (BHW,Nfilm,Nfilm) 107 | id_matrix = Variable(torch.eye(cov_matrix.shape[1]).unsqueeze(0).repeat(cov_matrix.shape[0],1,1).cuda()) 108 | if m==-1.: 109 | div_reg = torch.sum(((cov_matrix*(1-id_matrix))**2).view(-1))/cov_matrix.shape[0] 110 | else: 111 | div_reg = torch.sum(((cov_matrix-m*id_matrix)**2).view(-1))/cov_matrix.shape[0] 112 | if coverage_reg: 113 | word_mask_cp = word_mask.clone() 114 | for ii in range(word_mask_cp.shape[0]): 115 | word_mask_cp[ii,0]=0 116 | word_mask_cp[ii,word_mask_cp[ii,:].sum()]=0 ## set one to 0 already 117 | cover_matrix = 1.-torch.clamp(torch.sum(score_matrix, dim=1, keepdim=False),min=0.,max=1.) 
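# Coverage term: cover_matrix is close to 1 for words that no modulation stage
# attends to, so masking it with word_mask_cp below penalises only the real
# (non-padding) query words that every stage ignored.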
118 | cover_reg = torch.sum((cover_matrix*word_mask_cp.float()).view(-1))/cov_matrix.shape[0] 119 | div_reg += cover_reg 120 | return div_reg 121 | 122 | def build_target(raw_coord, pred, anchors_full, args): 123 | coord = Variable(torch.zeros(raw_coord.size(0), raw_coord.size(1)).cuda()) 124 | batch, grid = raw_coord.size(0), args.size//args.gsize 125 | coord[:,0] = (raw_coord[:,0] + raw_coord[:,2])/(2*args.size) 126 | coord[:,1] = (raw_coord[:,1] + raw_coord[:,3])/(2*args.size) 127 | coord[:,2] = (raw_coord[:,2] - raw_coord[:,0])/(args.size) 128 | coord[:,3] = (raw_coord[:,3] - raw_coord[:,1])/(args.size) 129 | coord = coord * grid 130 | bbox=torch.zeros(coord.size(0),9,5,grid, grid) 131 | best_n_list, best_gi, best_gj = [],[],[] 132 | 133 | for ii in range(batch): 134 | batch, grid = raw_coord.size(0), args.size//args.gsize 135 | gi = coord[ii,0].long() 136 | gj = coord[ii,1].long() 137 | tx = coord[ii,0] - gi.float() 138 | ty = coord[ii,1] - gj.float() 139 | 140 | gw = coord[ii,2] 141 | gh = coord[ii,3] 142 | 143 | anchor_idxs = range(9) 144 | anchors = [anchors_full[i] for i in anchor_idxs] 145 | scaled_anchors = [ (x[0] / (args.anchor_imsize/grid), \ 146 | x[1] / (args.anchor_imsize/grid)) for x in anchors] 147 | 148 | ## Get shape of gt box 149 | gt_box = torch.FloatTensor(np.array([0, 0, gw.cpu(), gh.cpu()],dtype=np.float32)).unsqueeze(0) 150 | ## Get shape of anchor box 151 | anchor_shapes = torch.FloatTensor(np.concatenate((np.zeros((len(scaled_anchors), 2)), np.array(scaled_anchors)), 1)) 152 | ## Calculate iou between gt and anchor shapes 153 | # anch_ious = list(bbox_iou(gt_box, anchor_shapes)) 154 | anch_ious = list(bbox_iou(gt_box, anchor_shapes,x1y1x2y2=False)) 155 | ## Find the best matching anchor box 156 | best_n = np.argmax(np.array(anch_ious)) 157 | 158 | tw = torch.log(gw / scaled_anchors[best_n][0] + 1e-16) 159 | th = torch.log(gh / scaled_anchors[best_n][1] + 1e-16) 160 | 161 | bbox[ii, best_n, :, gj, gi] = torch.stack([tx, ty, tw, th, torch.ones(1).cuda().squeeze()]) 162 | best_n_list.append(int(best_n)) 163 | best_gi.append(gi) 164 | best_gj.append(gj) 165 | 166 | bbox = Variable(bbox.cuda()) 167 | return bbox, best_gi, best_gj, best_n_list 168 | -------------------------------------------------------------------------------- /utils/transforms.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Generic Image Transform utillities. 5 | """ 6 | import torch 7 | import cv2 8 | import random, math 9 | import numpy as np 10 | from collections import Iterable 11 | 12 | import torch.nn.functional as F 13 | from torch.autograd import Variable 14 | 15 | 16 | class ResizePad: 17 | """ 18 | Resize and pad an image to given size. 
19 | """ 20 | 21 | def __init__(self, size): 22 | if not isinstance(size, (int, Iterable)): 23 | raise TypeError('Got inappropriate size arg: {}'.format(size)) 24 | 25 | self.h, self.w = size 26 | 27 | def __call__(self, img): 28 | h, w = img.shape[:2] 29 | scale = min(self.h / h, self.w / w) 30 | resized_h = int(np.round(h * scale)) 31 | resized_w = int(np.round(w * scale)) 32 | pad_h = int(np.floor(self.h - resized_h) / 2) 33 | pad_w = int(np.floor(self.w - resized_w) / 2) 34 | 35 | resized_img = cv2.resize(img, (resized_w, resized_h)) 36 | 37 | # if img.ndim > 2: 38 | if img.ndim > 2: 39 | new_img = np.zeros( 40 | (self.h, self.w, img.shape[-1]), dtype=resized_img.dtype) 41 | else: 42 | resized_img = np.expand_dims(resized_img, -1) 43 | new_img = np.zeros((self.h, self.w, 1), dtype=resized_img.dtype) 44 | new_img[pad_h: pad_h + resized_h, 45 | pad_w: pad_w + resized_w, ...] = resized_img 46 | return new_img 47 | 48 | 49 | class CropResize: 50 | """Remove padding and resize image to its original size.""" 51 | 52 | def __call__(self, img, size): 53 | if not isinstance(size, (int, Iterable)): 54 | raise TypeError('Got inappropriate size arg: {}'.format(size)) 55 | im_h, im_w = img.data.shape[:2] 56 | input_h, input_w = size 57 | scale = max(input_h / im_h, input_w / im_w) 58 | # scale = torch.Tensor([[input_h / im_h, input_w / im_w]]).max() 59 | resized_h = int(np.round(im_h * scale)) 60 | # resized_h = torch.round(im_h * scale) 61 | resized_w = int(np.round(im_w * scale)) 62 | # resized_w = torch.round(im_w * scale) 63 | crop_h = int(np.floor(resized_h - input_h) / 2) 64 | # crop_h = torch.floor(resized_h - input_h) // 2 65 | crop_w = int(np.floor(resized_w - input_w) / 2) 66 | # crop_w = torch.floor(resized_w - input_w) // 2 67 | # resized_img = cv2.resize(img, (resized_w, resized_h)) 68 | resized_img = F.upsample( 69 | img.unsqueeze(0).unsqueeze(0), size=(resized_h, resized_w), 70 | mode='bilinear') 71 | 72 | resized_img = resized_img.squeeze().unsqueeze(0) 73 | 74 | return resized_img[0, crop_h: crop_h + input_h, 75 | crop_w: crop_w + input_w] 76 | 77 | 78 | class ResizeImage: 79 | """Resize the largest of the sides of the image to a given size""" 80 | def __init__(self, size): 81 | if not isinstance(size, (int, Iterable)): 82 | raise TypeError('Got inappropriate size arg: {}'.format(size)) 83 | 84 | self.size = size 85 | 86 | def __call__(self, img): 87 | im_h, im_w = img.shape[-2:] 88 | scale = min(self.size / im_h, self.size / im_w) 89 | resized_h = int(np.round(im_h * scale)) 90 | resized_w = int(np.round(im_w * scale)) 91 | out = F.upsample( 92 | Variable(img).unsqueeze(0), size=(resized_h, resized_w), 93 | mode='bilinear').squeeze().data 94 | return out 95 | 96 | 97 | class ResizeAnnotation: 98 | """Resize the largest of the sides of the annotation to a given size""" 99 | def __init__(self, size): 100 | if not isinstance(size, (int, Iterable)): 101 | raise TypeError('Got inappropriate size arg: {}'.format(size)) 102 | 103 | self.size = size 104 | 105 | def __call__(self, img): 106 | im_h, im_w = img.shape[-2:] 107 | scale = min(self.size / im_h, self.size / im_w) 108 | resized_h = int(np.round(im_h * scale)) 109 | resized_w = int(np.round(im_w * scale)) 110 | out = F.upsample( 111 | Variable(img).unsqueeze(0).unsqueeze(0), 112 | size=(resized_h, resized_w), 113 | mode='bilinear').squeeze().data 114 | return out 115 | 116 | 117 | class ToNumpy: 118 | """Transform an torch.*Tensor to an numpy ndarray.""" 119 | 120 | def __call__(self, x): 121 | return x.numpy() 122 | 123 | def 
letterbox(img, mask, height, color=(123.7, 116.3, 103.5)): # resize a rectangular image to a padded square 124 | shape = img.shape[:2] # shape = [height, width] 125 | #print(shape) 126 | ratio = float(height) / max(shape) # ratio = old / new 127 | new_shape = (round(shape[1] * ratio), round(shape[0] * ratio)) 128 | dw = (height - new_shape[0]) / 2 # width padding 129 | dh = (height - new_shape[1]) / 2 # height padding 130 | top, bottom = round(dh - 0.1), round(dh + 0.1) 131 | left, right = round(dw - 0.1), round(dw + 0.1) 132 | img = cv2.resize(img, new_shape, interpolation=cv2.INTER_AREA) # resized, no border 133 | img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # padded square 134 | if mask is not None: 135 | mask = cv2.resize(mask, new_shape, interpolation=cv2.INTER_NEAREST) # resized, no border 136 | mask = cv2.copyMakeBorder(mask, top, bottom, left, right, cv2.BORDER_CONSTANT, value=0) # padded square 137 | return img, mask, ratio, dw, dh 138 | 139 | def random_affine(img, mask, targets, degrees=(-10, 10), translate=(.1, .1), scale=(.9, 1.1), shear=(-2, 2), 140 | borderValue=(123.7, 116.3, 103.5), all_bbox=None): 141 | border = 0 # width of added border (optional) 142 | height = max(img.shape[0], img.shape[1]) + border * 2 143 | # Rotation and Scale 144 | R = np.eye(3) 145 | Rht = np.eye(3) 146 | a = random.random() * (degrees[1] - degrees[0]) + degrees[0] 147 | # a += random.choice([-180, -90, 0, 90]) # 90deg rotations added to small rotations 148 | s = random.random() * (scale[1] - scale[0]) + scale[0] 149 | R[:2] = cv2.getRotationMatrix2D(angle=a, center=(img.shape[1] / 2, img.shape[0] / 2), scale=s) 150 | # Translation 151 | T = np.eye(3) 152 | r1 = random.random() 153 | r2 = random.random() 154 | T[0, 2] = (r1 * 2 - 1) * translate[0] * img.shape[0] + border # x translation (pixels) 155 | T[1, 2] = (r2 * 2 - 1) * translate[1] * img.shape[1] + border # y translation (pixels) 156 | 157 | # Shear 158 | S = np.eye(3) 159 | S[0, 1] = math.tan((random.random() * (shear[1] - shear[0]) + shear[0]) * math.pi / 180) # x shear (deg) 160 | S[1, 0] = math.tan((random.random() * (shear[1] - shear[0]) + shear[0]) * math.pi / 180) # y shear (deg) 161 | 162 | M = S @ T @ R # Combined rotation matrix. ORDER IS IMPORTANT HERE!! 
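# --- Illustrative aside (hedged sketch; names with the _ex suffix are hypothetical) ---
# M composes shear, translation, and rotation/scale as 3x3 homogeneous matrices;
# wrap_points() further down applies it to box corners as row vectors via `xy @ M.T`.
# The tiny standalone example below shows that convention with arbitrary example
# numbers and reuses the cv2/math/numpy imports already at the top of this module.
ex_angle, ex_scale = 5.0, 1.05                        # example rotation (deg) and scale
R_ex = np.eye(3)
R_ex[:2] = cv2.getRotationMatrix2D(angle=ex_angle, center=(256, 256), scale=ex_scale)
T_ex = np.eye(3)
T_ex[0, 2], T_ex[1, 2] = 10.0, -8.0                   # example pixel translation
S_ex = np.eye(3)
S_ex[0, 1] = math.tan(math.radians(2.0))              # example x shear
M_ex = S_ex @ T_ex @ R_ex                             # same composition order as above

box_ex = np.array([56, 36, 108, 210], dtype=np.float64)            # x1, y1, x2, y2
corners_ex = np.ones((4, 3))
corners_ex[:, :2] = box_ex[[0, 1, 2, 3, 0, 3, 2, 1]].reshape(4, 2)  # the four corners
warped_ex = (corners_ex @ M_ex.T)[:, :2]                            # warped corner coordinates
box_warped_ex = np.concatenate([warped_ex.min(0), warped_ex.max(0)])  # axis-aligned hull
# --- end of illustrative aside ---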
163 | imw = cv2.warpPerspective(img, M, dsize=(height, height), flags=cv2.INTER_LINEAR, 164 | borderValue=borderValue) # BGR order borderValue 165 | 166 | if mask is not None: 167 | maskw = cv2.warpPerspective(mask, M, dsize=(height, height), flags=cv2.INTER_NEAREST, 168 | borderValue=0) # BGR order borderValue 169 | else: 170 | maskw = None 171 | 172 | # Return warped points also 173 | if type(targets)==type([1]): 174 | targetlist=[] 175 | for bbox in targets: 176 | targetlist.append(wrap_points(bbox, M, height, a)) 177 | return imw, maskw, targetlist, M 178 | elif all_bbox is not None: 179 | targets = wrap_points(targets, M, height, a) 180 | for ii in range(all_bbox.shape[0]): 181 | all_bbox[ii,:] = wrap_points(all_bbox[ii,:], M, height, a) 182 | return imw, maskw, targets, all_bbox, M 183 | elif targets is not None: ## previous main 184 | targets = wrap_points(targets, M, height, a) 185 | return imw, maskw, targets, M 186 | else: 187 | return imw 188 | 189 | def wrap_points(targets, M, height, a): 190 | # n = targets.shape[0] 191 | # points = targets[:, 1:5].copy() 192 | points = targets.copy() 193 | # area0 = (points[:, 2] - points[:, 0]) * (points[:, 3] - points[:, 1]) 194 | area0 = (points[2] - points[0]) * (points[3] - points[1]) 195 | 196 | # warp points 197 | xy = np.ones((4, 3)) 198 | xy[:, :2] = points[[0, 1, 2, 3, 0, 3, 2, 1]].reshape(4, 2) # x1y1, x2y2, x1y2, x2y1 199 | xy = (xy @ M.T)[:, :2].reshape(1, 8) 200 | 201 | # create new boxes 202 | x = xy[:, [0, 2, 4, 6]] 203 | y = xy[:, [1, 3, 5, 7]] 204 | xy = np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, 1).T 205 | 206 | # apply angle-based reduction 207 | radians = a * math.pi / 180 208 | reduction = max(abs(math.sin(radians)), abs(math.cos(radians))) ** 0.5 209 | x = (xy[:, 2] + xy[:, 0]) / 2 210 | y = (xy[:, 3] + xy[:, 1]) / 2 211 | w = (xy[:, 2] - xy[:, 0]) * reduction 212 | h = (xy[:, 3] - xy[:, 1]) * reduction 213 | xy = np.concatenate((x - w / 2, y - h / 2, x + w / 2, y + h / 2)).reshape(4, 1).T 214 | 215 | # reject warped points outside of image 216 | np.clip(xy, 0, height, out=xy) 217 | w = xy[:, 2] - xy[:, 0] 218 | h = xy[:, 3] - xy[:, 1] 219 | area = w * h 220 | ar = np.maximum(w / (h + 1e-16), h / (w + 1e-16)) 221 | i = (w > 4) & (h > 4) & (area / (area0 + 1e-16) > 0.1) & (ar < 10) 222 | 223 | ## print(targets, xy) 224 | ## [ 56 36 108 210] [[ 47.80464857 15.6096533 106.30993434 196.71267693]] 225 | # targets = targets[i] 226 | # targets[:, 1:5] = xy[i] 227 | targets = xy[0] 228 | return targets -------------------------------------------------------------------------------- /utils/transformsv2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Generic Image Transform utillities. 5 | """ 6 | 7 | import cv2 8 | import random, math 9 | import numpy as np 10 | from collections import Iterable 11 | 12 | import torch.nn.functional as F 13 | from torch.autograd import Variable 14 | 15 | 16 | class ResizePad: 17 | """ 18 | Resize and pad an image to given size. 
19 | """ 20 | 21 | def __init__(self, size): 22 | if not isinstance(size, (int, Iterable)): 23 | raise TypeError('Got inappropriate size arg: {}'.format(size)) 24 | 25 | self.h, self.w = size 26 | 27 | def __call__(self, img): 28 | h, w = img.shape[:2] 29 | scale = min(self.h / h, self.w / w) 30 | resized_h = int(np.round(h * scale)) 31 | resized_w = int(np.round(w * scale)) 32 | pad_h = int(np.floor(self.h - resized_h) / 2) 33 | pad_w = int(np.floor(self.w - resized_w) / 2) 34 | 35 | resized_img = cv2.resize(img, (resized_w, resized_h)) 36 | 37 | # if img.ndim > 2: 38 | if img.ndim > 2: 39 | new_img = np.zeros( 40 | (self.h, self.w, img.shape[-1]), dtype=resized_img.dtype) 41 | else: 42 | resized_img = np.expand_dims(resized_img, -1) 43 | new_img = np.zeros((self.h, self.w, 1), dtype=resized_img.dtype) 44 | new_img[pad_h: pad_h + resized_h, 45 | pad_w: pad_w + resized_w, ...] = resized_img 46 | return new_img 47 | 48 | 49 | class CropResize: 50 | """Remove padding and resize image to its original size.""" 51 | 52 | def __call__(self, img, size): 53 | if not isinstance(size, (int, Iterable)): 54 | raise TypeError('Got inappropriate size arg: {}'.format(size)) 55 | im_h, im_w = img.data.shape[:2] 56 | input_h, input_w = size 57 | scale = max(input_h / im_h, input_w / im_w) 58 | # scale = torch.Tensor([[input_h / im_h, input_w / im_w]]).max() 59 | resized_h = int(np.round(im_h * scale)) 60 | # resized_h = torch.round(im_h * scale) 61 | resized_w = int(np.round(im_w * scale)) 62 | # resized_w = torch.round(im_w * scale) 63 | crop_h = int(np.floor(resized_h - input_h) / 2) 64 | # crop_h = torch.floor(resized_h - input_h) // 2 65 | crop_w = int(np.floor(resized_w - input_w) / 2) 66 | # crop_w = torch.floor(resized_w - input_w) // 2 67 | # resized_img = cv2.resize(img, (resized_w, resized_h)) 68 | resized_img = F.upsample( 69 | img.unsqueeze(0).unsqueeze(0), size=(resized_h, resized_w), 70 | mode='bilinear') 71 | 72 | resized_img = resized_img.squeeze().unsqueeze(0) 73 | 74 | return resized_img[0, crop_h: crop_h + input_h, 75 | crop_w: crop_w + input_w] 76 | 77 | 78 | class ResizeImage: 79 | """Resize the largest of the sides of the image to a given size""" 80 | def __init__(self, size): 81 | if not isinstance(size, (int, Iterable)): 82 | raise TypeError('Got inappropriate size arg: {}'.format(size)) 83 | 84 | self.size = size 85 | 86 | def __call__(self, img): 87 | im_h, im_w = img.shape[-2:] 88 | scale = min(self.size / im_h, self.size / im_w) 89 | resized_h = int(np.round(im_h * scale)) 90 | resized_w = int(np.round(im_w * scale)) 91 | out = F.upsample( 92 | Variable(img).unsqueeze(0), size=(resized_h, resized_w), 93 | mode='bilinear').squeeze().data 94 | return out 95 | 96 | 97 | class ResizeAnnotation: 98 | """Resize the largest of the sides of the annotation to a given size""" 99 | def __init__(self, size): 100 | if not isinstance(size, (int, Iterable)): 101 | raise TypeError('Got inappropriate size arg: {}'.format(size)) 102 | 103 | self.size = size 104 | 105 | def __call__(self, img): 106 | im_h, im_w = img.shape[-2:] 107 | scale = min(self.size / im_h, self.size / im_w) 108 | resized_h = int(np.round(im_h * scale)) 109 | resized_w = int(np.round(im_w * scale)) 110 | out = F.upsample( 111 | Variable(img).unsqueeze(0).unsqueeze(0), 112 | size=(resized_h, resized_w), 113 | mode='bilinear').squeeze().data 114 | return out 115 | 116 | 117 | class ToNumpy: 118 | """Transform an torch.*Tensor to an numpy ndarray.""" 119 | 120 | def __call__(self, x): 121 | return x.numpy() 122 | 123 | def 
letterbox(img, mask, height, color=(123.7, 116.3, 103.5)): # resize a rectangular image to a padded square 124 | shape = img.shape[:2] # shape = [height, width] 125 | ratio = float(height) / max(shape) # ratio = old / new 126 | new_shape = (round(shape[1] * ratio), round(shape[0] * ratio)) 127 | dw = (height - new_shape[0]) / 2 # width padding 128 | dh = (height - new_shape[1]) / 2 # height padding 129 | top, bottom = round(dh - 0.1), round(dh + 0.1) 130 | left, right = round(dw - 0.1), round(dw + 0.1) 131 | img = cv2.resize(img, new_shape, interpolation=cv2.INTER_AREA) # resized, no border 132 | img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # padded square 133 | if mask is not None: 134 | mask = cv2.resize(mask, new_shape, interpolation=cv2.INTER_NEAREST) # resized, no border 135 | mask = cv2.copyMakeBorder(mask, top, bottom, left, right, cv2.BORDER_CONSTANT, value=255) # padded square 136 | return img, mask, ratio, dw, dh 137 | 138 | def random_affine(img, mask, targets, degrees=(-10, 10), translate=(.1, .1), scale=(.9, 1.1), shear=(-2, 2), 139 | borderValue=(123.7, 116.3, 103.5), all_bbox=None): 140 | border = 0 # width of added border (optional) 141 | height = max(img.shape[0], img.shape[1]) + border * 2 142 | heightht = max(ht.shape[0], ht.shape[1]) + border * 2 143 | # Rotation and Scale 144 | R = np.eye(3) 145 | Rht = np.eye(3) 146 | a = random.random() * (degrees[1] - degrees[0]) + degrees[0] 147 | # a += random.choice([-180, -90, 0, 90]) # 90deg rotations added to small rotations 148 | s = random.random() * (scale[1] - scale[0]) + scale[0] 149 | R[:2] = cv2.getRotationMatrix2D(angle=a, center=(img.shape[1] / 2, img.shape[0] / 2), scale=s) 150 | Rht[:2] = cv2.getRotationMatrix2D(angle=a, center=(ht.shape[1] / 2, ht.shape[0] / 2), scale=s) 151 | # Translation 152 | T = np.eye(3) 153 | r1 = random.random() 154 | r2 = random.random() 155 | T[0, 2] = (r1 * 2 - 1) * translate[0] * img.shape[0] + border # x translation (pixels) 156 | T[1, 2] = (r2 * 2 - 1) * translate[1] * img.shape[1] + border # y translation (pixels) 157 | 158 | Tht = np.eye(3) 159 | Tht[0, 2] = (r1 * 2 - 1) * translate[0] * ht.shape[0] + border # x translation (pixels) 160 | Tht[1, 2] = (r2 * 2 - 1) * translate[1] * ht.shape[1] + border # y translation (pixels) 161 | # Shear 162 | S = np.eye(3) 163 | S[0, 1] = math.tan((random.random() * (shear[1] - shear[0]) + shear[0]) * math.pi / 180) # x shear (deg) 164 | S[1, 0] = math.tan((random.random() * (shear[1] - shear[0]) + shear[0]) * math.pi / 180) # y shear (deg) 165 | 166 | M = S @ T @ R # Combined rotation matrix. ORDER IS IMPORTANT HERE!! 
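# NOTE: in this transformsv2 variant, `ht` is used above (heightht, Rht) and below
# (Tht, Mht) but is not one of random_affine's parameters, so the function raises a
# NameError as written, and heightht/Mht are never returned. Presumably a paired
# heat-map image is meant to be warped with its own matrix; a hedged fix (an
# assumption, not the repository's confirmed interface) would add it to the
# signature, e.g. `def random_affine(img, mask, ht, targets, ...)`, and warp `ht`
# with Mht in the same way `img` is warped with M below.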
167 | Mht = S@Tht @ Rht 168 | imw = cv2.warpPerspective(img, M, dsize=(height, height), flags=cv2.INTER_LINEAR, 169 | borderValue=borderValue) # BGR order borderValue 170 | 171 | if mask is not None: 172 | maskw = cv2.warpPerspective(mask, M, dsize=(height, height), flags=cv2.INTER_NEAREST, 173 | borderValue=0) # BGR order borderValue 174 | else: 175 | maskw = None 176 | 177 | # Return warped points also 178 | if type(targets)==type([1]): 179 | targetlist=[] 180 | for bbox in targets: 181 | targetlist.append(wrap_points(bbox, M, height, a)) 182 | return imw, maskw, targetlist, M 183 | elif all_bbox is not None: 184 | targets = wrap_points(targets, M, height, a) 185 | for ii in range(all_bbox.shape[0]): 186 | all_bbox[ii,:] = wrap_points(all_bbox[ii,:], M, height, a) 187 | return imw, maskw, targets, all_bbox, M 188 | elif targets is not None: ## previous main 189 | targets = wrap_points(targets, M, height, a) 190 | return imw, maskw, targets, M 191 | else: 192 | return imw 193 | 194 | def wrap_points(targets, M, height, a): 195 | # n = targets.shape[0] 196 | # points = targets[:, 1:5].copy() 197 | points = targets.copy() 198 | # area0 = (points[:, 2] - points[:, 0]) * (points[:, 3] - points[:, 1]) 199 | area0 = (points[2] - points[0]) * (points[3] - points[1]) 200 | 201 | # warp points 202 | xy = np.ones((4, 3)) 203 | xy[:, :2] = points[[0, 1, 2, 3, 0, 3, 2, 1]].reshape(4, 2) # x1y1, x2y2, x1y2, x2y1 204 | xy = (xy @ M.T)[:, :2].reshape(1, 8) 205 | 206 | # create new boxes 207 | x = xy[:, [0, 2, 4, 6]] 208 | y = xy[:, [1, 3, 5, 7]] 209 | xy = np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, 1).T 210 | 211 | # apply angle-based reduction 212 | radians = a * math.pi / 180 213 | reduction = max(abs(math.sin(radians)), abs(math.cos(radians))) ** 0.5 214 | x = (xy[:, 2] + xy[:, 0]) / 2 215 | y = (xy[:, 3] + xy[:, 1]) / 2 216 | w = (xy[:, 2] - xy[:, 0]) * reduction 217 | h = (xy[:, 3] - xy[:, 1]) * reduction 218 | xy = np.concatenate((x - w / 2, y - h / 2, x + w / 2, y + h / 2)).reshape(4, 1).T 219 | 220 | # reject warped points outside of image 221 | np.clip(xy, 0, height, out=xy) 222 | w = xy[:, 2] - xy[:, 0] 223 | h = xy[:, 3] - xy[:, 1] 224 | area = w * h 225 | ar = np.maximum(w / (h + 1e-16), h / (w + 1e-16)) 226 | i = (w > 4) & (h > 4) & (area / (area0 + 1e-16) > 0.1) & (ar < 10) 227 | 228 | ## print(targets, xy) 229 | ## [ 56 36 108 210] [[ 47.80464857 15.6096533 106.30993434 196.71267693]] 230 | # targets = targets[i] 231 | # targets[:, 1:5] = xy[i] 232 | targets = xy[0] 233 | return targets -------------------------------------------------------------------------------- /model/yolov3.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | #batch=1 4 | #subdivisions=1 5 | # Training 6 | batch=16 7 | subdivisions=1 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | 
batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | 
batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | 
pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | [convolutional] 576 | batch_normalize=1 577 | size=3 578 | stride=1 579 | pad=1 580 | filters=1024 581 | activation=leaky 582 | 583 | [yoloconvolutional] 584 | batch_normalize=1 585 | filters=512 586 | size=1 587 | stride=1 588 | pad=1 589 | activation=leaky 590 | 591 | [convolutional] 592 | batch_normalize=1 593 | size=3 594 | stride=1 595 | pad=1 596 | filters=1024 597 | activation=leaky 598 | 599 | [convolutional] 600 | size=1 601 | stride=1 602 | pad=1 603 | filters=255 604 | activation=linear 605 | 606 | 607 | [yolo] 608 | mask = 6,7,8 609 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 610 | classes=80 611 | num=9 612 | jitter=.3 613 | ignore_thresh = .7 614 | truth_thresh = 1 615 | random=1 616 | 617 | 618 | [route] 619 | layers = -4 620 | 621 | [convolutional] 622 | batch_normalize=1 623 | filters=256 624 | size=1 625 | stride=1 626 | pad=1 627 | activation=leaky 628 | 629 | [upsample] 630 | stride=2 631 | 632 | [route] 633 | layers = -1, 61 634 | 635 | 636 | 637 | [convolutional] 638 | batch_normalize=1 639 | filters=256 640 | size=1 641 | stride=1 642 | pad=1 643 | activation=leaky 644 | 645 | [convolutional] 646 | batch_normalize=1 647 | size=3 648 | stride=1 649 | pad=1 650 | filters=512 651 | activation=leaky 652 | 653 | [convolutional] 654 | batch_normalize=1 655 | filters=256 656 | size=1 657 | stride=1 658 | pad=1 659 | activation=leaky 660 | 661 | [convolutional] 662 | batch_normalize=1 663 | size=3 664 | stride=1 665 | pad=1 666 | filters=512 667 | activation=leaky 668 | 669 | [yoloconvolutional] 670 | batch_normalize=1 671 | filters=256 672 | size=1 673 | stride=1 674 | pad=1 675 | activation=leaky 676 | 677 | [convolutional] 678 | batch_normalize=1 679 | size=3 680 | stride=1 681 | pad=1 682 | filters=512 683 | activation=leaky 684 | 685 | [convolutional] 686 | size=1 687 | stride=1 688 | pad=1 689 | filters=255 690 | activation=linear 691 | 692 | 693 | [yolo] 694 | mask = 3,4,5 695 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 696 | classes=80 697 | num=9 698 | jitter=.3 699 | ignore_thresh = .7 700 | truth_thresh = 1 701 | random=1 702 | 703 | 704 | 705 | [route] 706 | layers = -4 707 | 708 | 
[convolutional] 709 | batch_normalize=1 710 | filters=128 711 | size=1 712 | stride=1 713 | pad=1 714 | activation=leaky 715 | 716 | [upsample] 717 | stride=2 718 | 719 | [route] 720 | layers = -1, 36 721 | 722 | 723 | 724 | [convolutional] 725 | batch_normalize=1 726 | filters=128 727 | size=1 728 | stride=1 729 | pad=1 730 | activation=leaky 731 | 732 | [convolutional] 733 | batch_normalize=1 734 | size=3 735 | stride=1 736 | pad=1 737 | filters=256 738 | activation=leaky 739 | 740 | [convolutional] 741 | batch_normalize=1 742 | filters=128 743 | size=1 744 | stride=1 745 | pad=1 746 | activation=leaky 747 | 748 | [convolutional] 749 | batch_normalize=1 750 | size=3 751 | stride=1 752 | pad=1 753 | filters=256 754 | activation=leaky 755 | 756 | [yoloconvolutional] 757 | batch_normalize=1 758 | filters=128 759 | size=1 760 | stride=1 761 | pad=1 762 | activation=leaky 763 | 764 | [convolutional] 765 | batch_normalize=1 766 | size=3 767 | stride=1 768 | pad=1 769 | filters=256 770 | activation=leaky 771 | 772 | [convolutional] 773 | size=1 774 | stride=1 775 | pad=1 776 | filters=255 777 | activation=linear 778 | 779 | 780 | [yolo] 781 | mask = 0,1,2 782 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 783 | classes=80 784 | num=9 785 | jitter=.3 786 | ignore_thresh = .7 787 | truth_thresh = 1 788 | random=1 789 | -------------------------------------------------------------------------------- /dataset/data_loaderv2.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | YouRefIt referring image PyTorch dataset. 5 | Define and group batches of images and queries. 6 | Based on: 7 | https://github.com/zyang-ur/ReSC/blob/master/dataset/data_loader.py 8 | """ 9 | from torchvision.transforms import Compose, ToTensor, Normalize 10 | import os 11 | import sys 12 | import cv2 13 | import json 14 | import uuid 15 | import tqdm 16 | import math 17 | import torch 18 | import random 19 | # import h5py 20 | import numpy as np 21 | import os.path as osp 22 | import scipy.io as sio 23 | import torch.utils.data as data 24 | from collections import OrderedDict 25 | sys.path.append('.') 26 | import operator 27 | import utils 28 | from utils import Corpus 29 | 30 | import argparse 31 | import collections 32 | import logging 33 | import json 34 | import re 35 | 36 | from pytorch_pretrained_bert.tokenization import BertTokenizer 37 | from pytorch_pretrained_bert.modeling import BertModel 38 | # from transformers import BertTokenizer,BertModel 39 | from utils.transforms import letterbox, random_affine 40 | 41 | sys.modules['utils'] = utils 42 | 43 | cv2.setNumThreads(0) 44 | 45 | def read_examples(input_line, unique_id): 46 | """Read a list of `InputExample`s from an input file.""" 47 | examples = [] 48 | # unique_id = 0 49 | line = input_line #reader.readline() 50 | # if not line: 51 | # break 52 | line = line.strip() 53 | text_a = None 54 | text_b = None 55 | m = re.match(r"^(.*) \|\|\| (.*)$", line) 56 | if m is None: 57 | text_a = line 58 | else: 59 | text_a = m.group(1) 60 | text_b = m.group(2) 61 | examples.append( 62 | InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b)) 63 | # unique_id += 1 64 | return examples 65 | 66 | ## Bert text encoding 67 | class InputExample(object): 68 | def __init__(self, unique_id, text_a, text_b): 69 | self.unique_id = unique_id 70 | self.text_a = text_a 71 | self.text_b = text_b 72 | 73 | class InputFeatures(object): 74 | """A single set of features of 
data.""" 75 | def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids): 76 | self.unique_id = unique_id 77 | self.tokens = tokens 78 | self.input_ids = input_ids 79 | self.input_mask = input_mask 80 | self.input_type_ids = input_type_ids 81 | 82 | def convert_examples_to_features(examples, seq_length, tokenizer): 83 | """Loads a data file into a list of `InputBatch`s.""" 84 | features = [] 85 | for (ex_index, example) in enumerate(examples): 86 | tokens_a = tokenizer.tokenize(example.text_a) 87 | 88 | tokens_b = None 89 | if example.text_b: 90 | tokens_b = tokenizer.tokenize(example.text_b) 91 | 92 | if tokens_b: 93 | # Modifies `tokens_a` and `tokens_b` in place so that the total 94 | # length is less than the specified length. 95 | # Account for [CLS], [SEP], [SEP] with "- 3" 96 | _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3) 97 | else: 98 | # Account for [CLS] and [SEP] with "- 2" 99 | if len(tokens_a) > seq_length - 2: 100 | tokens_a = tokens_a[0:(seq_length - 2)] 101 | tokens = [] 102 | input_type_ids = [] 103 | tokens.append("[CLS]") 104 | input_type_ids.append(0) 105 | for token in tokens_a: 106 | tokens.append(token) 107 | input_type_ids.append(0) 108 | tokens.append("[SEP]") 109 | input_type_ids.append(0) 110 | 111 | if tokens_b: 112 | for token in tokens_b: 113 | tokens.append(token) 114 | input_type_ids.append(1) 115 | tokens.append("[SEP]") 116 | input_type_ids.append(1) 117 | 118 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 119 | 120 | # The mask has 1 for real tokens and 0 for padding tokens. Only real 121 | # tokens are attended to. 122 | input_mask = [1] * len(input_ids) 123 | 124 | # Zero-pad up to the sequence length. 125 | while len(input_ids) < seq_length: 126 | input_ids.append(0) 127 | input_mask.append(0) 128 | input_type_ids.append(0) 129 | 130 | assert len(input_ids) == seq_length 131 | assert len(input_mask) == seq_length 132 | assert len(input_type_ids) == seq_length 133 | features.append( 134 | InputFeatures( 135 | unique_id=example.unique_id, 136 | tokens=tokens, 137 | input_ids=input_ids, 138 | input_mask=input_mask, 139 | input_type_ids=input_type_ids)) 140 | return features 141 | 142 | class DatasetNotFoundError(Exception): 143 | pass 144 | 145 | class ReferDataset(data.Dataset): 146 | SUPPORTED_DATASETS = { 147 | 'yourefit': {'splits': ('train', 'val', 'test')}, 148 | 'referit': {'splits': ('train', 'val', 'trainval', 'test')}, 149 | 'unc': { 150 | 'splits': ('train', 'val', 'trainval', 'testA', 'testB'), 151 | 'params': {'dataset': 'refcoco', 'split_by': 'unc'} 152 | }, 153 | 'unc+': { 154 | 'splits': ('train', 'val', 'trainval', 'testA', 'testB'), 155 | 'params': {'dataset': 'refcoco+', 'split_by': 'unc'} 156 | }, 157 | 'gref': { 158 | 'splits': ('train', 'val'), 159 | 'params': {'dataset': 'refcocog', 'split_by': 'google'} 160 | }, 161 | 'gref_umd': { 162 | 'splits': ('train', 'val', 'test'), 163 | 'params': {'dataset': 'refcocog', 'split_by': 'umd'} 164 | }, 165 | 'flickr': { 166 | 'splits': ('train', 'val', 'test')} 167 | } 168 | 169 | def __init__(self, data_root, split_root='data', dataset='referit', imsize=256, 170 | transform=None, augment=False, device=None, return_idx=False, testmode=False, 171 | split='train', max_query_len=128, lstm=False, bert_model='bert-base-uncased'): 172 | self.images = [] 173 | self.data_root = data_root 174 | self.split_root = split_root 175 | self.dataset = dataset 176 | self.imsize = imsize 177 | self.query_len = max_query_len 178 | self.lstm = lstm 179 | self.transform = transform 
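# Illustrative aside (hedged sketch): how the BERT helpers defined above turn a
# referring phrase into the fixed-length (word_id, word_mask) pair consumed in
# __getitem__ below. The phrase and seq_length=20 are arbitrary example values,
# not the repository's settings; the tokenizer call mirrors the one in __init__.
#
#   tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
#   examples = read_examples('the red mug on the table', unique_id=0)
#   feats = convert_examples_to_features(examples, seq_length=20, tokenizer=tokenizer)
#   word_id = feats[0].input_ids     # [CLS] tok ... [SEP] then zero padding, length 20
#   word_mask = feats[0].input_mask  # 1 for real tokens, 0 for the padding positions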
180 | self.testmode = testmode 181 | self.split = split 182 | self.device = device 183 | self.tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=True) 184 | self.augment=augment 185 | self.return_idx=return_idx 186 | 187 | if self.dataset == 'yourefit': 188 | self.dataset_root = osp.join(self.data_root, 'yourefit') 189 | self.im_dir = osp.join(self.dataset_root, 'images') 190 | elif self.dataset == 'referit': 191 | self.dataset_root = osp.join(self.data_root, 'referit') 192 | self.im_dir = osp.join(self.dataset_root, 'images') 193 | self.split_dir = osp.join(self.dataset_root, 'splits') 194 | elif self.dataset == 'flickr': 195 | self.dataset_root = osp.join(self.data_root, 'Flickr30k') 196 | self.im_dir = osp.join(self.dataset_root, 'flickr30k_images') 197 | else: ## refcoco, etc. 198 | self.dataset_root = osp.join(self.data_root, 'other') 199 | self.im_dir = osp.join( 200 | self.dataset_root, 'images', 'mscoco', 'images', 'train2014') 201 | self.split_dir = osp.join(self.dataset_root, 'splits') 202 | 203 | if not self.exists_dataset(): 204 | print('Please download index cache to data folder') 205 | exit(0) 206 | 207 | dataset_path = osp.join(self.split_root, self.dataset) 208 | valid_splits = self.SUPPORTED_DATASETS[self.dataset]['splits'] 209 | 210 | if self.lstm: 211 | self.corpus = Corpus() 212 | corpus_path = osp.join(dataset_path, 'corpus.pth') 213 | self.corpus = torch.load(corpus_path) 214 | 215 | if split not in valid_splits: 216 | raise ValueError( 217 | 'Dataset {0} does not have split {1}'.format( 218 | self.dataset, split)) 219 | 220 | splits = [split] 221 | if self.dataset != 'referit': 222 | splits = ['train', 'val'] if split == 'trainval' else [split] 223 | for split in splits: 224 | imgset_file = '{0}_{1}full.pth'.format(self.dataset, split) 225 | imgset_path = osp.join(dataset_path, imgset_file) 226 | self.images += torch.load(imgset_path) 227 | 228 | def exists_dataset(self): 229 | return osp.exists(osp.join(self.split_root, self.dataset)) 230 | 231 | 232 | def pull_item(self, idx): 233 | if self.dataset == 'flickr': 234 | img_file, bbox, phrase = self.images[idx] 235 | else: 236 | img_file, _, bbox, phrase, attri = self.images[idx] 237 | ## box format: to x1y1x2y2 238 | if not (self.dataset == 'referit' or self.dataset == 'flickr'): 239 | bbox = np.array(bbox, dtype=int) 240 | #bbox[2], bbox[3] = bbox[0]+bbox[2], bbox[1]+bbox[3] 241 | else: 242 | bbox = np.array(bbox, dtype=int) 243 | 244 | img_path = osp.join(self.im_dir, img_file) 245 | img = cv2.imread(img_path) 246 | 247 | htmapdir = self.im_dir.replace('images', 'paf') 248 | htmapfile = img_file.replace('.jpg', '_rendered.png') 249 | htmap_path = osp.join(htmapdir, htmapfile) 250 | htmap = cv2.imread(htmap_path) 251 | 252 | ht = np.asarray(htmap) 253 | ht = np.mean(ht, axis=2) 254 | ht = cv2.resize(ht, (256, 256)) 255 | 256 | ptdir = self.im_dir.replace('images', 'depimg') 257 | ptfile = img_file #.replace('.jpg', '.jpeg') 258 | pt_path = osp.join(ptdir, ptfile) 259 | pt = cv2.imread(pt_path) 260 | # print(pt.shape) 261 | # exit() 262 | # pt = cv2.resize(pt, (256,256)) 263 | # pt = np.reshape(pt, (3, 256, 256)) 264 | 265 | gestdir = self.im_dir.replace('images','gest') 266 | gestfile = img_file.replace('.jpg' , '.json') 267 | gest_path = osp.join(gestdir,gestfile) 268 | gest = json.load(open(gest_path)) 269 | 270 | ## duplicate channel if gray image 271 | if img.shape[-1] > 1: 272 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 273 | else: 274 | img = np.stack([img] * 3) 275 | 276 | return img, pt, 
ht, phrase, bbox, gest 277 | # return img, phrase, bbox, pt, ht 278 | 279 | def tokenize_phrase(self, phrase): 280 | return self.corpus.tokenize(phrase, self.query_len) 281 | 282 | def untokenize_word_vector(self, words): 283 | return self.corpus.dictionary[words] 284 | 285 | def __len__(self): 286 | return len(self.images) 287 | 288 | def __getitem__(self, idx): 289 | img, pt, ht, phrase, bbox, gest = self.pull_item(idx) 290 | # phrase = phrase.decode("utf-8").encode().lower() 291 | 292 | center_point = gest['candidate'] 293 | try: 294 | center_point = np.asarray(center_point)[:,0:2] 295 | if center_point[0,0] != 0: 296 | center_point = center_point [0,:] 297 | elif center_point[1,0] != 0: 298 | center_point = center_point [1,:] 299 | else : 300 | center_point = np.asarray([256,256]) 301 | # mask = center_point!=0 302 | # print(center_point.shape) 303 | # center_point = center_point[mask] 304 | # print(center_point.shape) 305 | # center_point = center_point [0:2,:] 306 | # center_point = np.mean(center_point , axis = 0) 307 | except IndexError: 308 | center_point = np.asarray([256,256]) 309 | 310 | phrase = phrase.lower() 311 | if self.augment: 312 | augment_flip, augment_hsv, augment_affine = True,True,True 313 | 314 | ## seems a bug in torch transformation resize, so separate in advance 315 | h,w = img.shape[0], img.shape[1] 316 | if self.augment: 317 | ## random horizontal flip 318 | if augment_flip and random.random() > 0.5: 319 | img = cv2.flip(img, 1) 320 | pt = cv2.flip(pt , 1 ) 321 | ht = cv2.flip(ht , 1 ) 322 | center_point[0] = w - center_point[0] - 1 323 | bbox[0], bbox[2] = w-bbox[2]-1, w-bbox[0]-1 324 | phrase = phrase.replace('right','*&^special^&*').replace('left','right').replace('*&^special^&*','left') 325 | ## random intensity, saturation change 326 | if augment_hsv: 327 | fraction = 0.50 328 | img_hsv = cv2.cvtColor(cv2.cvtColor(img, cv2.COLOR_RGB2BGR), cv2.COLOR_BGR2HSV) 329 | S = img_hsv[:, :, 1].astype(np.float32) 330 | V = img_hsv[:, :, 2].astype(np.float32) 331 | a = (random.random() * 2 - 1) * fraction + 1 332 | if a > 1: 333 | np.clip(S, a_min=0, a_max=255, out=S) 334 | a = (random.random() * 2 - 1) * fraction + 1 335 | V *= a 336 | if a > 1: 337 | np.clip(V, a_min=0, a_max=255, out=V) 338 | 339 | img_hsv[:, :, 1] = S.astype(np.uint8) 340 | img_hsv[:, :, 2] = V.astype(np.uint8) 341 | img = cv2.cvtColor(cv2.cvtColor(img_hsv, cv2.COLOR_HSV2BGR), cv2.COLOR_BGR2RGB) 342 | img, _, ratio, dw, dh = letterbox(img, None, self.imsize) 343 | bbox[0], bbox[2] = bbox[0]*ratio+dw, bbox[2]*ratio+dw 344 | bbox[1], bbox[3] = bbox[1]*ratio+dh, bbox[3]*ratio+dh 345 | ## random affine transformation 346 | if augment_affine: 347 | img, _, bbox, M, center_point, pt, gt = random_affine(center_point, pt, img, None, bbox, \ 348 | degrees=(-5, 5), translate=(0.10, 0.10), scale=(0.90, 1.10)) 349 | else: ## should be inference, or specified training 350 | img, _, ratio, dw, dh = letterbox(img, None, self.imsize) 351 | bbox[0], bbox[2] = bbox[0]*ratio+dw, bbox[2]*ratio+dw 352 | bbox[1], bbox[3] = bbox[1]*ratio+dh, bbox[3]*ratio+dh 353 | gt = np.asarray(torch.zeros([512,512])) 354 | gt[int(bbox[1]):int(bbox[3]), int(bbox[0]):int(bbox[2])] = 1 355 | ## Norm, to tensor 356 | # print(img.shape) 357 | if self.transform is not None: 358 | img = self.transform(img) 359 | pt = self.t(pt) 360 | #print(ht.shape) 361 | ht = self.t(ht) 362 | #print(ht.shape) 363 | if self.lstm: 364 | phrase = self.tokenize_phrase(phrase) 365 | word_id = phrase 366 | # word_mask = np.zeros(word_id.shape) 367 | word_mask 
= np.array(word_id>0,dtype=int) 368 | else: 369 | ## encode phrase to bert input 370 | examples = read_examples(phrase, idx) 371 | features = convert_examples_to_features( 372 | examples=examples, seq_length=self.query_len, tokenizer=self.tokenizer) 373 | word_id = features[0].input_ids 374 | word_mask = features[0].input_mask 375 | 376 | if self.testmode: 377 | return img, pt, ht, np.array(word_id, dtype=int), np.array(word_mask, dtype=int), \ 378 | np.array(bbox, dtype=np.float32), np.array(ratio, dtype=np.float32), \ 379 | np.array(dw, dtype=np.float32), np.array(dh, dtype=np.float32), self.images[idx][0], center_point, gt 380 | else: 381 | return img, pt, ht, np.array(word_id, dtype=int), np.array(word_mask, dtype=int), \ 382 | np.array(bbox, dtype=np.float32), center_point, gt -------------------------------------------------------------------------------- /utils/temp.py: -------------------------------------------------------------------------------- 1 | class grounding_model_multihop(nn.Module): 2 | def __init__(self, corpus=None, emb_size=256, jemb_drop_out=0.1, bert_model='bert-base-uncased', \ 3 | NFilm=2, fusion='prod', intmd=False, mstage=False, convlstm=False, \ 4 | coordmap=True, leaky=False, dataset=None, bert_emb=False, tunebert=False, use_sal=False, use_paf=False): 5 | super(grounding_model_multihop, self).__init__() 6 | self.coordmap = coordmap 7 | self.emb_size = emb_size 8 | self.NFilm = NFilm 9 | self.intmd = intmd 10 | self.mstage = mstage 11 | self.convlstm = convlstm 12 | self.tunebert = tunebert 13 | self.use_sal = use_sal 14 | self.use_paf = use_paf 15 | self.tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=True) 16 | 17 | if bert_model=='bert-base-uncased': 18 | self.textdim=768 19 | else: 20 | self.textdim=1024 21 | ## Visual model 22 | self.visumodel = Darknet(config_path='./model/yolov3.cfg') 23 | self.visumodel.load_weights('./saved_models/yolov3.weights') 24 | ## Text model 25 | self.textmodel = BertModel.from_pretrained(bert_model) 26 | 27 | self.mapping_visu = ConvBatchNormReLU(512 if self.convlstm else 256, emb_size, 1, 1, 0, 1, leaky=leaky) 28 | 29 | self.mapping_lang = torch.nn.Sequential( 30 | nn.Linear(self.textdim, emb_size), 31 | nn.ReLU(), 32 | nn.Dropout(jemb_drop_out), 33 | nn.Linear(emb_size, emb_size), 34 | nn.ReLU(),) 35 | 36 | textdim=emb_size 37 | 38 | self.film = FiLMedConvBlock_multihop(NFilm=3,textdim=textdim,visudim=emb_size,\ 39 | emb_size=emb_size,fusion=fusion,intmd=(intmd or mstage or convlstm)) 40 | 41 | self.film1 = FiLMedConvBlock_multihop(NFilm=1,textdim=textdim,visudim=emb_size,\ 42 | emb_size=emb_size,fusion=fusion,intmd=(intmd or mstage or convlstm)) 43 | 44 | if self.convlstm: 45 | output_emb = emb_size 46 | self.global_out = ConvLSTM(input_size=(32, 32), 47 | input_dim=emb_size, 48 | hidden_dim=[emb_size], 49 | kernel_size=(1, 1), 50 | num_layers=1, 51 | batch_first=True, 52 | bias=True, 53 | return_all_layers=False) 54 | 55 | self.fcn_out = torch.nn.Sequential( 56 | ConvBatchNormReLU(output_emb+8, output_emb//2, 1, 1, 0, 1, leaky=leaky), 57 | nn.Conv2d(output_emb//2, 3*5, kernel_size=1)) 58 | 59 | self.fcn_out1 = torch.nn.Sequential( 60 | ConvBatchNormReLU(2*output_emb+8, output_emb//2, 1, 1, 0, 1, leaky=leaky), 61 | nn.Conv2d(output_emb//2, 6*5, kernel_size=1)) 62 | #self.vl_transformer = VisionLanguageEncoder(d_model=512, nhead=8, num_encoder_layers=6,num_decoder_layers=6, dim_feedforward=2048, dropout=0.1,activation="relu", normalize_before=False) 63 | ''' 64 | #transformer 65 | decoder_layer = 
TransformerDecoderLayer(512, 8, 2048, 0.1, "relu", False) 66 | decoder_norm = nn.LayerNorm(512) 67 | self.decoder = TransformerDecoder(decoder_layer, 6, decoder_norm, return_intermediate=True,d_model=512) 68 | 69 | encoder_layer = TransformerEncoderLayer(512, 8, 2048, 0.1, "relu", False) 70 | encoder_norm = None 71 | self.encoder = TransformerEncoder(encoder_layer, 6, encoder_norm) 72 | ''' 73 | 74 | ## Mapping module 75 | ''' 76 | for i in self.parameters(): 77 | i.requires_grad=False 78 | ''' 79 | self.mapping_visu2 = ConvBatchNormReLU(512 if self.convlstm else 256+1, emb_size, 3, 1, 1, 1, leaky=leaky) 80 | self.mapping_visu1 = ConvBatchNormReLU(512+4 if self.convlstm else 256+1, emb_size, 3, 1, 1, 1, leaky=leaky) 81 | self.mp1 = nn.MaxPool2d(16, stride=16) 82 | self.mp2 = nn.AvgPool2d(4, stride=4) 83 | self.mp3 = nn.AvgPool2d(16, stride=16) 84 | self.mp4 = nn.AvgPool2d(2, stride=2) 85 | 86 | self.mapbodyfeature = MLP(512,512,512,2) 87 | 88 | self.linecode = MLP(512,128,3,2) 89 | 90 | self.poscode = MLP(3,128,512,2) 91 | 92 | 93 | #self.pattention = nn.Conv2d(512,1,1) 94 | 95 | #self.l_embed = nn.Embedding(22, 512) 96 | 97 | ## output head 98 | 99 | #self.maplast = ConvBatchNormReLU(output_emb+8, output_emb, 1, 1, 0, 1, leaky=leaky) 100 | 101 | output_emb = emb_size 102 | if self.mstage: 103 | self.fcn_out = nn.ModuleDict() 104 | modules = OrderedDict() 105 | for n in range(0,NFilm): 106 | modules["out%d"%n] = torch.nn.Sequential( 107 | ConvBatchNormReLU(output_emb, output_emb//2, 1, 1, 0, 1, leaky=leaky), 108 | nn.Conv2d(output_emb//2, 9*5, kernel_size=1)) 109 | self.fcn_out.update(modules) 110 | else: 111 | if self.intmd: 112 | output_emb = emb_size*NFilm 113 | if self.use_sal: 114 | self.conv1 = nn.Conv2d(1, 2, 4, 4) 115 | self.conv15 = nn.Conv2d(2, 4, 2, 2) 116 | self.conv2 = nn.Conv2d(4, 8, 2, 2) 117 | else: 118 | self.fcn_out = torch.nn.Sequential( 119 | ConvBatchNormReLU(output_emb+8, output_emb//2, 1, 1, 0, 1, leaky=leaky), 120 | nn.Conv2d(output_emb//2, 3*5, kernel_size=1)) 121 | self.fcn_out1 = torch.nn.Sequential( 122 | ConvBatchNormReLU(2*output_emb+8, output_emb//2, 1, 1, 0, 1, leaky=leaky), 123 | nn.Conv2d(output_emb//2, 6*5, kernel_size=1)) 124 | 125 | def _reset_parameters(self): 126 | for p in self.parameters(): 127 | if p.dim() > 1: 128 | nn.init.xavier_uniform_(p) 129 | 130 | def forward(self, image, seg, ht, dp, word_id, word_mask): 131 | ## Visual Module 132 | batch_size = image.size(0) 133 | ''' 134 | memory_mask = word_mask.view(batch_size,1,-1,1) 135 | memory_mask = memory_mask.repeat(1,8,1,1024) 136 | membed_mask = torch.ones(batch_size, 8, 3, 1024).cuda() 137 | memory_mask = torch.cat((memory_mask,membed_mask),dim=2) 138 | memory_mask = word_mask.view(batch_size*8,23,1024) 139 | print(memory_mask.size()) 140 | 141 | tgt_key_padding_mask = word_mask 142 | embed_mask = torch.ones(batch_size,3).cuda() 143 | tgt_key_padding_mask = torch.cat((tgt_key_padding_mask,embed_mask),dim=1) 144 | tgt_key_padding_mask = tgt_key_padding_mask.bool() 145 | ''' 146 | dp = dp.unsqueeze(1) 147 | seg = seg.unsqueeze(1) 148 | dp = dp.type(torch.FloatTensor).cuda() 149 | seg = seg.type(torch.FloatTensor).cuda() 150 | 151 | distxy = distancexy.repeat(batch_size,1,1,1).cuda() 152 | dist = torch.cat([distxy,dp],dim=1) 153 | 154 | seeg = seg.view(batch_size,1,-1) 155 | seeg = F.normalize(seeg,dim=2,p=1) 156 | 157 | dist = dist.view(batch_size,3,-1) 158 | dist = F.normalize(dist,dim=2,p=1) 159 | 160 | #===============================# 161 | distfeature = dist.permute(0,2,1) 162 | 
#distfeature = distfeature.permute(1,0,2) 163 | #distfeature = self.posecode(distfeature) 164 | #distfeature = distfeature.permute(1,0,2) 165 | 166 | #===============================# 167 | bodypositionseg = torch.mul(seeg,dist) 168 | bodyposition = torch.sum(bodypositionseg,dim=2) 169 | bodyposition = bodyposition.view(batch_size,1,3) 170 | 171 | #bodypfeature = self.poscode(bodyposition) 172 | #bodypfeature = bodypfeature.view(batch_size,1,-1) 173 | #bodypfeature = bodyposition.permute(1,0,2) 174 | #bodypfeature = gen_sineembed_for_position(bodypfeature*512) 175 | #bodypfeature = bodypfeature.permute(1,0,2) 176 | #bodypfeature = bodypfeature.view(batch_size,1,-1) 177 | 178 | #restdistfeature = distfeature - bodypfeature 179 | #restdistfeature = F.normalize(restdistfeature,dim=2,p=2) 180 | 181 | #restdistfeature = restdistfeature.permute(0,2,1) 182 | #restdistfeature = restdistfeature.view(batch_size,512,512,512) 183 | 184 | #distfeature = distfeature.permute(0,2,1) 185 | #distfeature = distfeature.view(batch_size,512,512,512) 186 | 187 | bodyp = bodyposition.view(batch_size,3,1) 188 | relatepos = torch.sub(dist,bodyp) 189 | relatepos = relatepos.view(batch_size,3,-1) 190 | relatepos = relatepos.permute(0,2,1) 191 | relateposfeature = self.poscode(relatepos) 192 | relateposfeature = F.normalize(relateposfeature,dim=2,p=2) 193 | relateposfeature = relateposfeature.permute(0,2,1) 194 | relateposfeature = relateposfeature.view(batch_size,512,512,512) 195 | 196 | relatepos = F.normalize(relatepos,dim=2,p=2) 197 | relatepos = relatepos.permute(0,2,1) 198 | relatepos = relatepos.view(batch_size,3,512,512) 199 | #restdist = restdist * seg 200 | 201 | #====================================================# 202 | raw_fvisu = self.visumodel(image) 203 | raw_fvisu = raw_fvisu[1] 204 | bodyinfo = raw_fvisu 205 | 206 | #bodypfeature = bodypfeature.view(batch_size,-1) 207 | #compute position informations 208 | ht = ht.type(torch.FloatTensor).cuda() 209 | ht = ht.view(batch_size,-1,3) 210 | ht = ht.permute(0,2,1) 211 | ht = ht.view(batch_size,3,512,512) 212 | ht = torch.mean(ht,dim=1) 213 | ht = ht.view(batch_size,1,512,512) 214 | ht = self.mp1(ht) 215 | 216 | rd = self.mp3(relatepos) 217 | 218 | bodyinfo = torch.cat((bodyinfo, ht),1) 219 | bodyinfo = torch.cat((bodyinfo, rd),1) 220 | 221 | bodyinfo = self.mapping_visu1(bodyinfo) 222 | bodyinfo = self.mp2(bodyinfo) 223 | bodyinfo = self.mapping_visu2(bodyinfo) 224 | bodyinfo = self.mp2(bodyinfo) 225 | bodyinfo = self.mp4(bodyinfo) 226 | 227 | bodyfeature = bodyinfo.view(batch_size,-1) 228 | #bodyfeature = torch.cat([bodyinfo,bodypfeature],dim=1) 229 | #bodypfeature = bodypfeature.view(batch_size,1,-1) 230 | bodyfeature = self.mapbodyfeature(bodyfeature).view(batch_size,-1) 231 | bodyfeature = F.normalize(bodyfeature,dim=1,p=2) 232 | 233 | line = self.linecode(bodyfeature) 234 | line = line.view(batch_size,1,3) 235 | 236 | lor = line.view(batch_size,3) 237 | 238 | ''' 239 | word_id = [] 240 | word_mask = [] 241 | for uu in range(batch_size): 242 | if(lor[uu,0]>0): 243 | word_idt = word_ida[uu,:] 244 | word_idt = word_idt.unsqueeze(0) 245 | word_id.append(word_idt) 246 | 247 | word_maskt = word_maska[uu,:] 248 | word_maskt = word_maskt.unsqueeze(0) 249 | word_mask.append(word_maskt) 250 | else: 251 | word_idt = word_idb[uu,:] 252 | word_idt = word_idt.unsqueeze(0) 253 | word_id.append(word_idt) 254 | 255 | word_maskt = word_maskb[uu,:] 256 | word_maskt = word_maskt.unsqueeze(0) 257 | word_mask.append(word_maskt) 258 | 259 | word_id = 
torch.cat(word_id,dim=0).contiguous() 260 | word_mask = torch.cat(word_mask,dim=0).contiguous() 261 | ''' 262 | 263 | relatepos = relatepos.view(batch_size,3,512,512) 264 | relateposfeature = relateposfeature.view(batch_size,512,512,512) 265 | 266 | relatepos = relatepos.view(batch_size,3,-1) 267 | relateposfeature = relateposfeature.view(batch_size,512,-1) 268 | line = F.normalize(line,dim=2,p=2) 269 | #pt3 = torch.matmul(line,restdist) 270 | 271 | bodyfeature = bodyfeature.view(batch_size,1,512) 272 | pt512 = torch.matmul(bodyfeature,relateposfeature) 273 | 274 | #pt3 = pt3.view(batch_size,1,512,512) 275 | #attention1 = pt3.view(batch_size,1,512,512) 276 | attention = pt512.view(batch_size,1,512,512) 277 | 278 | #===============================================# 279 | seg = seg.clamp(max=1) 280 | seg = -seg+1 281 | attention = attention * seg 282 | attention = attention.view(batch_size,1,-1) 283 | attention = F.softmax(attention,dim=2) 284 | attention = attention.view(batch_size,1,512,512) 285 | 286 | pt = attention 287 | pt = pt.view(batch_size,1,512,512) 288 | fvisu = self.mapping_visu(raw_fvisu) 289 | 290 | #restdistfeature = restdistfeature.view(batch_size,512,512,512) 291 | #raw_fvisu = fvisu 292 | 293 | 294 | 295 | 296 | 297 | 298 | 299 | 300 | 301 | 302 | 303 | 304 | 305 | 306 | 307 | 308 | 309 | 310 | 311 | 312 | 313 | 314 | 315 | 316 | 317 | 318 | def attt_loss(line,relatepos,attention, bbox, eps=1e-3): 319 | """This function computes the Kullback-Leibler divergence between ground 320 | truth saliency maps and their predictions. Values are first divided by 321 | their sum for each image to yield a distribution that adds to 1. 322 | Args: 323 | y_true (tensor, float32): A 4d tensor that holds the ground truth 324 | saliency maps with values between 0 and 255. 325 | y_pred (tensor, float32): A 4d tensor that holds the predicted saliency 326 | maps with values between 0 and 1. 327 | eps (scalar, float, optional): A small factor to avoid numerical 328 | instabilities. Defaults to 1e-7. 329 | Returns: 330 | tensor, float32: A 0D tensor that holds the averaged error. 
331 | """ 332 | loss = 0 333 | batch = line.size(0) 334 | bbox = bbox.int() 335 | for ii in range(batch): 336 | 337 | region1 = attention[ii,0,bbox[ii][0]:max(bbox[ii][2],bbox[ii][0]+1),bbox[ii][1]:max(bbox[ii][3],bbox[ii][1]+1)].contiguous() 338 | region1.view(-1) 339 | region1 = torch.sum(region1) 340 | 341 | relatepos = relatepos.view(batch,3,512,512) 342 | region2 = relatepos[ii,:,bbox[ii][0]:max(bbox[ii][2],bbox[ii][0]+1),bbox[ii][1]:max(bbox[ii][3],bbox[ii][1]+1)].contiguous() 343 | region2 = region2.view(3,-1) 344 | region2 = torch.mean(region2,dim=1) 345 | region2 = region2.view(3) 346 | 347 | region2 = torch.sum(torch.abs(region2-line[ii])) 348 | #print(region) 349 | loss += region2+1-region1 #-region1 350 | loss = loss/batch 351 | return loss 352 | 353 | def depth_loss(input, dp, bbox, gi, gj, best_n_list): 354 | mseloss = torch.nn.MSELoss(reduction='mean' ) 355 | batch = input.size(0) 356 | dp = dp.view(batch,-1).float() 357 | dpmax,_ = torch.max(dp,dim=1) 358 | dpmax = dpmax.view(batch,-1).float() 359 | bbox = bbox.int() 360 | dp = dp/dpmax 361 | loss = 0 362 | dp = dp.view(batch,512,512) 363 | 364 | for ii in range(batch): 365 | pred_depth = F.sigmoid(input[ii,best_n_list[ii],-1,gj[ii],gi[ii]]) 366 | target_bbox = dp[ii,bbox[ii][0]:max(bbox[ii][2],bbox[ii][0]+1),bbox[ii][1]:max(bbox[ii][3],bbox[ii][1]+1)].contiguous() 367 | target_bbox = target_bbox.view(-1) 368 | target_bbox = torch.mean(target_bbox,dim=0) 369 | loss += mseloss(pred_depth,target_bbox) 370 | loss = loss/batch 371 | loss = loss.float() 372 | return loss 373 | 374 | 375 | 376 | 377 | 378 | 379 | -------------------------------------------------------------------------------- /model/grounding_model.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import torch.utils.model_zoo as model_zoo 7 | from torch.utils.data import TensorDataset, DataLoader, SequentialSampler 8 | from torch.utils.data.distributed import DistributedSampler 9 | 10 | from .darknet import * 11 | from .convlstm import * 12 | from .modulation import * 13 | 14 | import argparse 15 | import collections 16 | import logging 17 | import json 18 | import re 19 | import time 20 | from tqdm import tqdm 21 | from pytorch_pretrained_bert.tokenization import BertTokenizer 22 | from pytorch_pretrained_bert.modeling import BertModel 23 | 24 | def generate_coord(batch, height, width): 25 | # coord = Variable(torch.zeros(batch,8,height,width).cuda()) 26 | xv, yv = torch.meshgrid([torch.arange(0,height), torch.arange(0,width)]) 27 | xv_min = (xv.float()*2 - width)/width 28 | yv_min = (yv.float()*2 - height)/height 29 | xv_max = ((xv+1).float()*2 - width)/width 30 | yv_max = ((yv+1).float()*2 - height)/height 31 | xv_ctr = (xv_min+xv_max)/2 32 | yv_ctr = (yv_min+yv_max)/2 33 | hmap = torch.ones(height,width)*(1./height) 34 | wmap = torch.ones(height,width)*(1./width) 35 | coord = torch.autograd.Variable(torch.cat([xv_min.unsqueeze(0), yv_min.unsqueeze(0),\ 36 | xv_max.unsqueeze(0), yv_max.unsqueeze(0),\ 37 | xv_ctr.unsqueeze(0), yv_ctr.unsqueeze(0),\ 38 | hmap.unsqueeze(0), wmap.unsqueeze(0)], dim=0).cuda()) 39 | coord = coord.unsqueeze(0).repeat(batch,1,1,1) 40 | return coord 41 | 42 | class grounding_model_multihop(nn.Module): 43 | def __init__(self, corpus=None, emb_size=256, jemb_drop_out=0.1, bert_model='bert-base-uncased', \ 44 | NFilm=2, fusion='prod', intmd=False, mstage=False, convlstm=False, \ 
45 | coordmap=True, leaky=False, dataset=None, bert_emb=False, tunebert=False, use_sal=False, use_paf=False): 46 | super(grounding_model_multihop, self).__init__() 47 | self.coordmap = coordmap 48 | self.emb_size = emb_size 49 | self.NFilm = NFilm 50 | self.intmd = intmd 51 | self.mstage = mstage 52 | self.convlstm = convlstm 53 | self.tunebert = tunebert 54 | self.use_sal = use_sal 55 | self.use_paf = use_paf 56 | if bert_model=='bert-base-uncased': 57 | self.textdim=768 58 | else: 59 | self.textdim=1024 60 | ## Visual model 61 | self.visumodel = Darknet(config_path='model/yolov3.cfg') 62 | self.visumodel.load_weights('saved_models/yolov3.weights') 63 | self.trans = CLIPVisionTransformer(512,8,256) 64 | 65 | self.visumodel4t = Darknetfort(config_path='model/yolov3.cfg') 66 | self.visumodel4t.load_weights('saved_models/yolov3.weights') 67 | 68 | ## Text model 69 | self.textmodel = BertModel.from_pretrained(bert_model) 70 | 71 | ## Mapping module 72 | if self.use_paf: 73 | self.mapping_visu = ConvBatchNormReLU(512+3+1 if self.convlstm else 256+3, emb_size, 1, 1, 0, 1, leaky=leaky) 74 | self.mp1 = nn.MaxPool2d(16, stride=16) 75 | self.mp2 = nn.AvgPool2d(16, stride=16) 76 | else: 77 | self.mapping_visu = ConvBatchNormReLU(512 if self.convlstm else 256, emb_size, 1, 1, 0, 1, leaky=leaky) 78 | 79 | self.mapping_lang = torch.nn.Sequential( 80 | nn.Linear(self.textdim, emb_size), 81 | nn.ReLU(), 82 | nn.Dropout(jemb_drop_out), 83 | nn.Linear(emb_size, emb_size), 84 | nn.ReLU(),) 85 | self.mp3 = nn.MaxPool2d(8, stride=8) 86 | self.mp4 = nn.AvgPool2d(8, stride=8) 87 | self.mapping_visuf = ConvBatchNormReLU(256 + 4 +1, 256, 1, 1,0, 1, leaky=leaky) 88 | textdim=emb_size 89 | self.film = FiLMedConvBlock_multihop(NFilm=NFilm,textdim=textdim,visudim=emb_size,\ 90 | emb_size=emb_size,fusion=fusion,intmd=(intmd or mstage or convlstm)) 91 | 92 | ## output head 93 | output_emb = emb_size 94 | self.loc_avg = nn.AvgPool2d(16, stride=16) 95 | self.pt_avg = nn.AvgPool2d(16, stride=16) 96 | self.ht_avg = nn.AvgPool2d(16, stride=16) 97 | self.vis_map = ConvBatchNormReLU(512, 128, 3, 1, 1, 1, leaky=leaky) 98 | self.locationpool = torch.nn.Sequential( 99 | nn.AvgPool2d(8, stride=8), 100 | #ConvBatchNormReLU(3, 256, 1, 1, 0, 1, leaky=leaky) 101 | ) 102 | self.linear1 = torch.nn.Sequential( 103 | ConvBatchNormReLU(256,128,8, 8, 0, 1, leaky=leaky), 104 | ConvBatchNormReLU(128,32,9, 1, 4, 1, leaky=leaky), 105 | nn.MaxPool2d(8, stride=8) 106 | ) 107 | self.linear2 = nn.Linear(32, 3) 108 | self.language = nn.Linear(512, 1) 109 | self.stage0 = torch.nn.Sequential( 110 | ConvBatchNormReLU(135, 1024, 1, 1, 0, 1, leaky=leaky) 111 | ) 112 | self.stage1 = torch.nn.Sequential( 113 | ConvBatchNormReLU(1024, 1, 9, 1, 4, 1, leaky=leaky), 114 | torch.nn.Upsample(512,mode = 'bilinear' , align_corners = True), 115 | ) 116 | self.upsample = torch.nn.Upsample(512,mode = 'bilinear' , align_corners = True) 117 | self.tohyper = torch.nn.Sequential( 118 | ConvBatchNormReLU(768, 512, 1, 1, 0, 1, leaky=leaky) 119 | ) 120 | self.word_projection = nn.Sequential(nn.Linear(512, 256), 121 | nn.ReLU(), 122 | nn.Dropout(0.1), 123 | nn.Linear(256, 256), 124 | nn.ReLU()) 125 | #self.tstage0 = torch.nn.Con 126 | self.center = torch.nn.Sequential( 127 | nn.AvgPool2d(16, stride=16) 128 | ) 129 | if self.mstage: 130 | self.fcn_out = nn.ModuleDict() 131 | modules = OrderedDict() 132 | for n in range(0,NFilm): 133 | modules["out%d"%n] = torch.nn.Sequential( 134 | ConvBatchNormReLU(output_emb, output_emb//2, 1, 1, 0, 1, leaky=leaky), 135 | 
nn.Conv2d(output_emb//2, 9*5, kernel_size=1)) 136 | self.fcn_out.update(modules) 137 | else: 138 | if self.intmd: 139 | output_emb = emb_size*NFilm 140 | if self.convlstm: 141 | output_emb = emb_size 142 | self.global_out = ConvLSTM(input_size=(32, 32), 143 | input_dim=emb_size, 144 | hidden_dim=[emb_size], 145 | kernel_size=(1, 1), 146 | num_layers=1, 147 | batch_first=True, 148 | bias=True, 149 | return_all_layers=False) 150 | if self.use_sal: 151 | self.conv1 = torch.nn.Sequential( 152 | nn.AvgPool2d(16) 153 | ) 154 | self.fcn_out = torch.nn.Sequential( 155 | ConvBatchNormReLU(output_emb+1, output_emb//2, 1, 1, 0, 1, leaky=leaky), 156 | nn.Conv2d(output_emb//2, 9*5, kernel_size=1)) 157 | else: 158 | self.fcn_out = torch.nn.Sequential( 159 | ConvBatchNormReLU(output_emb, output_emb//2, 1, 1, 0, 1, leaky=leaky), 160 | nn.Conv2d(output_emb//2, 9*5, kernel_size=1)) 161 | self.test = Vector(512,16,512) 162 | self.vectmaxp = torch.nn.Sequential( 163 | nn.MaxPool2d(16, stride=16), 164 | nn.ReLU() 165 | ) 166 | self.ptmax = torch.nn.Sequential( 167 | #nn.MaxPool2d(8, stride=8), 168 | nn.ReLU() 169 | ) 170 | self.draw = torch.nn.Sequential( 171 | nn.ReLU() 172 | ) 173 | self.softmax = nn.Softmax(dim=-1) 174 | self.linear = nn.Linear(256, 1) 175 | def forward(self, image, dp, ht, word_id, word_mask, gest, bbox, gt, amask, sal,phrase): 176 | ## Visual Module 177 | 178 | batch_size = image.size(0) 179 | out = self.visumodel(image) 180 | intemide = self.visumodel(image)[1] 181 | gest = gest.type(torch.FloatTensor).cuda() 182 | amask = amask.type(torch.FloatTensor).cuda() 183 | dp = torch.mul(amask,dp) 184 | dp = F.normalize(dp.type(torch.FloatTensor).view(batch_size,-1),dim=1,p=float('INF')).view(batch_size,1,512,512).cuda() #* 1.5 185 | 186 | 187 | raw_fvisu4t = self.visumodel4t(image) 188 | 189 | xv, yv = torch.meshgrid([torch.arange(0,512), torch.arange(0,512)]) 190 | xv = (xv / 512 ).unsqueeze(0).unsqueeze(0).repeat(batch_size,1,1,1).cuda() 191 | yv = (yv / 512 ).unsqueeze(0).unsqueeze(0).repeat(batch_size,1,1,1).cuda() 192 | xyz = torch.cat( (xv,yv,dp), dim = 1).cuda() 193 | 194 | gestfraw = torch.mul(gest , amask) 195 | 196 | gestf = F.normalize(gestfraw.view(batch_size,1,-1),dim=2,p=1).view(batch_size,1,512,512).repeat(1,3,1,1) 197 | body = torch.mul(gestf , xyz).view(batch_size, 3, -1) 198 | body = torch.sum(body,dim=2) 199 | 200 | gtbo = F.normalize(gt.view(batch_size,1,-1),dim=2,p=1).view(batch_size,1,512,512).repeat(1,3,1,1) 201 | target = torch.mul(gtbo , xyz).view(batch_size, 3, -1) 202 | target = torch.sum(target,dim=2) - body 203 | 204 | xyz_cent = xyz - body.unsqueeze(2).unsqueeze(2).repeat(1,1,512,512) 205 | 206 | t = self.test(torch.cat( (ht.type(torch.FloatTensor).cuda(),xyz_cent) ,dim = 1)) 207 | vloss = 1 - torch.cosine_similarity(t, target, dim=1) 208 | vectmap = torch.cosine_similarity(xyz_cent , t.unsqueeze(2).unsqueeze(2).repeat(1,1,512,512) , dim = 1).unsqueeze(1) - 0.7 209 | 210 | # cv2.imwrite('output/'+rank+'img.jpg' , imagedraw*255) 211 | norm = torch.max(gestfraw.reshape(batch_size,-1), dim=1, keepdim = True)[0].detach().unsqueeze(2).unsqueeze(2).repeat(1,1,512,512) 212 | gestfraw = (gestfraw.unsqueeze(1))/norm 213 | maxgestvect = self.ptmax(vectmap ) #+self.ptmax(gestfraw) 214 | maxvecter = self.vectmaxp(vectmap ) 215 | mid = torch.cat((raw_fvisu4t[2], self.mp3(ht.type(torch.FloatTensor).cuda()), self.mp4(dp.type(torch.FloatTensor).cuda()), self.mp4(vectmap.type(torch.FloatTensor).cuda())),1) 216 | # 217 | mid = self.mapping_visuf(mid) 218 | raw_fvisu = 
torch.cat((intemide, self.mp1(ht.type(torch.FloatTensor).cuda()), self.mp2(dp.type(torch.FloatTensor).cuda())),1) 219 | fvisu = self.mapping_visu(raw_fvisu) * maxvecter.repeat(1,512,1,1).detach() 220 | raw_fvisu = F.normalize(fvisu, p=2, dim=1) 221 | size = (raw_fvisu.shape[2]) 222 | 223 | ## Language Module 224 | all_encoder_layers, _ = self.textmodel(word_id, \ 225 | token_type_ids=None, attention_mask=word_mask) 226 | ## Sentence feature at the first position [cls] 227 | raw_flang = (all_encoder_layers[-1][:,0,:] + all_encoder_layers[-2][:,0,:]\ 228 | + all_encoder_layers[-3][:,0,:] + all_encoder_layers[-4][:,0,:])/4 229 | raw_fword = (all_encoder_layers[-1] + all_encoder_layers[-2]\ 230 | + all_encoder_layers[-3] + all_encoder_layers[-4])/4 231 | if not self.tunebert: 232 | ## fix bert during training 233 | # raw_flang = raw_flang.detach() 234 | hidden = raw_flang.detach() 235 | raw_fword = raw_fword.detach() 236 | 237 | fword = Variable(torch.zeros(raw_fword.shape[0], raw_fword.shape[1], self.emb_size).cuda()) 238 | for ii in range(raw_fword.shape[0]): 239 | ntoken = (word_mask[ii] != 0).sum() 240 | fword[ii,:ntoken,:] = F.normalize(self.mapping_lang(raw_fword[ii,:ntoken,:]), p=2, dim=1) 241 | ## [CLS], [SEP] 242 | # fword[ii,1:ntoken-1,:] = F.normalize(self.mapping_lang(raw_fword[ii,1:ntoken-1,:].view(-1,self.textdim)), p=2, dim=1) 243 | raw_fword = fword 244 | x = self.trans(mid)[1].reshape(batch_size,256,-1).permute(0,2,1) 245 | x = self.linear(x) 246 | pt = self.upsample ( torch.softmax(x , dim = 1).squeeze(2).reshape(batch_size,1,64,64) ) 247 | 248 | gt = gt.unsqueeze(1) 249 | gest = 1 - torch.mul(gest , amask).clamp(max = 1,min=0) 250 | pt = torch.mul(pt, amask.unsqueeze(1)) 251 | pt = F.normalize(pt.view(batch_size,1,-1),dim=2,p=1).view(batch_size,1,512,512) 252 | loss3 = 1 - torch.sum(torch.mul(pt,gt).reshape(batch_size,-1) , -1) 253 | gt = F.normalize(gt.view(batch_size,1,-1),dim=2,p=1).view(batch_size,1,512,512) 254 | eps = 1e-7 255 | 256 | vect = torch.mul(pt , xyz).view(batch_size, 3, -1) 257 | vect = torch.sum(vect,dim=2) - body 258 | loss1 = torch.sum(torch.abs(vect - target)) 259 | loss3 += torch.sum( (torch.log( gt / (eps + pt) + eps ) * gt).reshape(batch_size,-1) , -1) * 0.1 260 | 261 | norm = torch.max(pt.reshape(batch_size,-1), dim=1, keepdim = True)[0].detach().unsqueeze(2).unsqueeze(2).repeat(1,1,512,512) 262 | pt = (pt/norm).detach() 263 | centerout = self.center(pt.type(torch.FloatTensor)).squeeze(1).cuda() 264 | 265 | coord = generate_coord(batch_size, raw_fvisu.size(2), raw_fvisu.size(3)) 266 | x, attnscore_list = self.film(raw_fvisu, raw_fword, coord,maxvecter,fsent=None,word_mask=word_mask) 267 | if self.mstage: 268 | outbox = [] 269 | for film_ii in range(len(x)): 270 | outbox.append(self.fcn_out["out%d"%film_ii](x[film_ii])) 271 | elif self.convlstm: 272 | x = torch.stack(x, dim=1) 273 | 274 | output, state = self.global_out(x) 275 | output, hidden, cell = output[-1], state[-1][0], state[-1][1] 276 | if self.use_sal: 277 | #pt = sal.type(torch.FloatTensor).cuda() 278 | pt_c = self.conv1(pt.type(torch.FloatTensor).cuda()) 279 | 280 | hidden = torch.cat((hidden, pt_c), 1) 281 | 282 | outbox = [self.fcn_out(hidden)] 283 | else: 284 | x = torch.stack(x, dim=1).view(batch_size, -1, raw_fvisu.size(2), raw_fvisu.size(3)) 285 | outbox = [self.fcn_out(x)] 286 | 287 | return outbox, attnscore_list, loss1, vloss, centerout,loss3,pt ## list of (B,N,H,W) 288 | 289 | 290 | if __name__ == "__main__": 291 | import sys 292 | import argparse 293 | sys.path.append('.') 294 | 
from dataset.data_loader import * 295 | from torch.autograd import Variable 296 | from torch.utils.data import DataLoader 297 | from torchvision.transforms import Compose, ToTensor, Normalize 298 | from utils.transforms import ResizeImage, ResizeAnnotation 299 | parser = argparse.ArgumentParser( 300 | description='Dataloader test') 301 | parser.add_argument('--size', default=416, type=int, 302 | help='image size') 303 | parser.add_argument('--data', type=str, default='./ln_data/', 304 | help='path to ReferIt splits data folder') 305 | parser.add_argument('--dataset', default='referit', type=str, 306 | help='referit/flickr/unc/unc+/gref') 307 | parser.add_argument('--split', default='train', type=str, 308 | help='name of the dataset split used to train') 309 | parser.add_argument('--time', default=20, type=int, 310 | help='maximum time steps (lang length) per batch') 311 | parser.add_argument('--emb_size', default=256, type=int, 312 | help='word embedding dimensions') 313 | # parser.add_argument('--lang_layers', default=3, type=int, 314 | # help='number of SRU/LSTM stacked layers') 315 | 316 | args = parser.parse_args() 317 | 318 | torch.manual_seed(13) 319 | np.random.seed(13) 320 | torch.backends.cudnn.deterministic = True 321 | torch.backends.cudnn.benchmark = False 322 | input_transform = Compose([ 323 | ToTensor(), 324 | # ResizeImage(args.size), 325 | Normalize( 326 | mean=[0.485, 0.456, 0.406], 327 | std=[0.229, 0.224, 0.225]) 328 | ]) 329 | 330 | refer = ReferDataset(data_root=args.data, 331 | dataset=args.dataset, 332 | split=args.split, 333 | imsize = args.size, 334 | transform=input_transform, 335 | max_query_len=args.time) 336 | 337 | train_loader = DataLoader(refer, batch_size=1, shuffle=True, 338 | pin_memory=True, num_workers=0) 339 | 340 | # model = textcam_yolo_light(emb_size=args.emb_size) 341 | 342 | for i in enumerate(train_loader): 343 | print(i) 344 | break 345 | -------------------------------------------------------------------------------- /dataset/data_loader.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | YouRefIt referring image PyTorch dataset. 5 | Define and group batches of images and queries. 
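Each sample pairs the RGB frame with its depth map ('depimg'), pose/PAF heatmap ('pafours'),
body-segmentation mask ('ln_data/bodysegment') and saliency map ('saliency'), together with
the tokenised referring expression (see pull_item below).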
6 | Based on: 7 | https://github.com/zyang-ur/ReSC/blob/master/dataset/data_loader.py 8 | """ 9 | from torchvision.transforms import Compose, ToTensor, Normalize 10 | import os 11 | import sys 12 | import cv2 13 | import json 14 | import uuid 15 | import tqdm 16 | import math 17 | import torch 18 | import random 19 | # import h5py 20 | import numpy as np 21 | import os.path as osp 22 | import scipy.io as sio 23 | import torch.utils.data as data 24 | from collections import OrderedDict 25 | sys.path.append('.') 26 | import operator 27 | import utils 28 | from utils import Corpus 29 | import clip 30 | import argparse 31 | import collections 32 | import logging 33 | import json 34 | import re 35 | 36 | np.set_printoptions(threshold=np.inf) 37 | from pytorch_pretrained_bert.tokenization import BertTokenizer 38 | from pytorch_pretrained_bert.modeling import BertModel 39 | # from transformers import BertTokenizer,BertModel 40 | from utils.transforms import letterbox, random_affine 41 | 42 | sys.modules['utils'] = utils 43 | cv2.setNumThreads(0) 44 | cv2.ocl.setUseOpenCL(True) 45 | 46 | def read_examples(input_line, unique_id): 47 | """Read a list of `InputExample`s from an input file.""" 48 | examples = [] 49 | # unique_id = 0 50 | line = input_line #reader.readline() 51 | # if not line: 52 | # break 53 | line = line.strip() 54 | text_a = None 55 | text_b = None 56 | m = re.match(r"^(.*) \|\|\| (.*)$", line) 57 | if m is None: 58 | text_a = line 59 | else: 60 | text_a = m.group(1) 61 | text_b = m.group(2) 62 | examples.append( 63 | InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b)) 64 | # unique_id += 1 65 | return examples 66 | 67 | ## Bert text encoding 68 | class InputExample(object): 69 | def __init__(self, unique_id, text_a, text_b): 70 | self.unique_id = unique_id 71 | self.text_a = text_a 72 | self.text_b = text_b 73 | 74 | class InputFeatures(object): 75 | """A single set of features of data.""" 76 | def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids): 77 | self.unique_id = unique_id 78 | self.tokens = tokens 79 | self.input_ids = input_ids 80 | self.input_mask = input_mask 81 | self.input_type_ids = input_type_ids 82 | 83 | def convert_examples_to_features(examples, seq_length, tokenizer): 84 | """Loads a data file into a list of `InputBatch`s.""" 85 | features = [] 86 | for (ex_index, example) in enumerate(examples): 87 | tokens_a = tokenizer.tokenize(example.text_a) 88 | 89 | tokens_b = None 90 | if example.text_b: 91 | tokens_b = tokenizer.tokenize(example.text_b) 92 | 93 | if tokens_b: 94 | # Modifies `tokens_a` and `tokens_b` in place so that the total 95 | # length is less than the specified length. 96 | # Account for [CLS], [SEP], [SEP] with "- 3" 97 | _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3) 98 | else: 99 | # Account for [CLS] and [SEP] with "- 2" 100 | if len(tokens_a) > seq_length - 2: 101 | tokens_a = tokens_a[0:(seq_length - 2)] 102 | tokens = [] 103 | input_type_ids = [] 104 | tokens.append("[CLS]") 105 | input_type_ids.append(0) 106 | for token in tokens_a: 107 | tokens.append(token) 108 | input_type_ids.append(0) 109 | tokens.append("[SEP]") 110 | input_type_ids.append(0) 111 | 112 | if tokens_b: 113 | for token in tokens_b: 114 | tokens.append(token) 115 | input_type_ids.append(1) 116 | tokens.append("[SEP]") 117 | input_type_ids.append(1) 118 | 119 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 120 | 121 | # The mask has 1 for real tokens and 0 for padding tokens. Only real 122 | # tokens are attended to. 
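# Illustrative example (added, token ids are hypothetical): for the phrase "the red mug"
# with seq_length = 8, the fields built in this function would look like
#   tokens         = [CLS] the red mug [SEP]
#   input_ids      = [101, 1996, 2417, 11240, 102, 0, 0, 0]
#   input_mask     = [  1,    1,    1,     1,   1, 0, 0, 0]   # 1 = real token, 0 = padding
#   input_type_ids = [  0,    0,    0,     0,   0, 0, 0, 0]   # single-segment query
# so only the first five positions are attended to by BERT.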
123 | input_mask = [1] * len(input_ids) 124 | 125 | # Zero-pad up to the sequence length. 126 | while len(input_ids) < seq_length: 127 | input_ids.append(0) 128 | input_mask.append(0) 129 | input_type_ids.append(0) 130 | 131 | assert len(input_ids) == seq_length 132 | assert len(input_mask) == seq_length 133 | assert len(input_type_ids) == seq_length 134 | features.append( 135 | InputFeatures( 136 | unique_id=example.unique_id, 137 | tokens=tokens, 138 | input_ids=input_ids, 139 | input_mask=input_mask, 140 | input_type_ids=input_type_ids)) 141 | return features 142 | 143 | class DatasetNotFoundError(Exception): 144 | pass 145 | 146 | class ReferDataset(data.Dataset): 147 | SUPPORTED_DATASETS = { 148 | 'yourefit': {'splits': ('train', 'val', 'test')}, 149 | 'referit': {'splits': ('train', 'val', 'trainval', 'test')}, 150 | 'unc': { 151 | 'splits': ('train', 'val', 'trainval', 'testA', 'testB'), 152 | 'params': {'dataset': 'refcoco', 'split_by': 'unc'} 153 | }, 154 | 'unc+': { 155 | 'splits': ('train', 'val', 'trainval', 'testA', 'testB'), 156 | 'params': {'dataset': 'refcoco+', 'split_by': 'unc'} 157 | }, 158 | 'gref': { 159 | 'splits': ('train', 'val'), 160 | 'params': {'dataset': 'refcocog', 'split_by': 'google'} 161 | }, 162 | 'gref_umd': { 163 | 'splits': ('train', 'val', 'test'), 164 | 'params': {'dataset': 'refcocog', 'split_by': 'umd'} 165 | }, 166 | 'flickr': { 167 | 'splits': ('train', 'val', 'test')} 168 | } 169 | 170 | def __init__(self, data_root, split_root='data', dataset='referit', imsize=256, 171 | transform=None, augment=False, device=None, return_idx=False, testmode=False, 172 | split='train', max_query_len=128, lstm=False, bert_model='bert-base-uncased'): 173 | self.images = [] 174 | self.data_root = data_root 175 | self.split_root = split_root 176 | self.dataset = dataset 177 | self.imsize = imsize 178 | self.query_len = max_query_len 179 | self.lstm = lstm 180 | self.transform = transform 181 | self.testmode = testmode 182 | self.split = split 183 | self.device = device 184 | self.t = input_transform = Compose([ 185 | ToTensor() 186 | ]) 187 | self.tokenizer = BertTokenizer.from_pretrained(bert_model, do_lower_case=True) 188 | self.augment=augment 189 | self.return_idx=return_idx 190 | self.num = 0 191 | if self.dataset == 'yourefit': 192 | self.dataset_root = osp.join(self.data_root, 'yourefit') 193 | self.im_dir = osp.join(self.dataset_root, 'images') 194 | elif self.dataset == 'referit': 195 | self.dataset_root = osp.join(self.data_root, 'referit') 196 | self.im_dir = osp.join(self.dataset_root, 'images') 197 | self.split_dir = osp.join(self.dataset_root, 'splits') 198 | elif self.dataset == 'flickr': 199 | self.dataset_root = osp.join(self.data_root, 'Flickr30k') 200 | self.im_dir = osp.join(self.dataset_root, 'flickr30k_images') 201 | else: ## refcoco, etc. 
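# Added note: the directory layout this branch expects under data_root (inferred from the
# osp.join calls below) is
#   <data_root>/other/images/mscoco/images/train2014/   # COCO train2014 images
#   <data_root>/other/splits/                           # split index files
# The yourefit / referit / flickr branches above follow the same pattern with their own
# image folders.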
202 | self.dataset_root = osp.join(self.data_root, 'other') 203 | self.im_dir = osp.join( 204 | self.dataset_root, 'images', 'mscoco', 'images', 'train2014') 205 | self.split_dir = osp.join(self.dataset_root, 'splits') 206 | 207 | if not self.exists_dataset(): 208 | print('Please download index cache to data folder') 209 | exit(0) 210 | 211 | dataset_path = osp.join(self.split_root, self.dataset) 212 | valid_splits = self.SUPPORTED_DATASETS[self.dataset]['splits'] 213 | 214 | if self.lstm: 215 | self.corpus = Corpus() 216 | corpus_path = osp.join(dataset_path, 'corpus.pth') 217 | self.corpus = torch.load(corpus_path) 218 | 219 | if split not in valid_splits: 220 | raise ValueError( 221 | 'Dataset {0} does not have split {1}'.format( 222 | self.dataset, split)) 223 | 224 | splits = [split] 225 | if self.dataset != 'referit': 226 | splits = ['train', 'val'] if split == 'trainval' else [split] 227 | for split in splits: 228 | imgset_file = '{0}_{1}full.pth'.format(self.dataset, split) 229 | imgset_path = osp.join(dataset_path, imgset_file) 230 | self.images += torch.load(imgset_path) 231 | 232 | def exists_dataset(self): 233 | return osp.exists(osp.join(self.split_root, self.dataset)) 234 | 235 | 236 | def pull_item(self, idx): 237 | if self.dataset == 'flickr': 238 | img_file, bbox, phrase = self.images[idx] 239 | else: 240 | img_file, _, bbox, phrase, attri = self.images[idx] 241 | ## box format: to x1y1x2y2 242 | if not (self.dataset == 'referit' or self.dataset == 'flickr'): 243 | bbox = np.array(bbox, dtype=int) 244 | #bbox[2], bbox[3] = bbox[0]+bbox[2], bbox[1]+bbox[3] 245 | else: 246 | bbox = np.array(bbox, dtype=int) 247 | 248 | img_path = osp.join(self.im_dir, img_file) 249 | img = cv2.imread(img_path) 250 | 251 | htmapdir = self.im_dir.replace('images', 'pafours') 252 | htmapfile = img_file #.replace('.jpg', '_rendered.png') 253 | htmap_path = osp.join(htmapdir, htmapfile) 254 | htmap = cv2.imread(htmap_path) 255 | 256 | ht = np.asarray(htmap) 257 | 258 | # #ht = np.mean(ht, axis=2) 259 | 260 | 261 | # ht = cv2.resize(ht, (512, 512)) 262 | 263 | ptdir = self.im_dir.replace('images', 'depimg') 264 | ptfile = img_file #.replace('.jpg', '_depth.png') 265 | pt_path = osp.join(ptdir, ptfile) 266 | pt = cv2.imread(pt_path) 267 | # print(pt.shape) 268 | # exit() 269 | # pt = cv2.resize(pt, (256,256)) 270 | # pt = np.reshape(pt, (3, 256, 256)) 271 | 272 | saldir = self.im_dir.replace('images', 'saliency') 273 | salfile = img_file.replace('.jpg', '.jpeg') 274 | sal_path = osp.join(saldir, salfile) 275 | sal = cv2.imread(sal_path) 276 | sal = cv2.resize(sal, (256,256)) 277 | #sal = np.reshape(sal, (3, 256, 256)) 278 | 279 | gestdir = 'ln_data/bodysegment' 280 | gestfile = img_file.replace('.jpg' , '_seg.png') 281 | gest_path = osp.join(gestdir,gestfile) 282 | gest = cv2.imread(gest_path) 283 | if gest.shape != img.shape: 284 | gest = cv2.resize(gest, img.shape[:2], interpolation=cv2.INTER_AREA) 285 | ## duplicate channel if gray image 286 | if img.shape[-1] > 1: 287 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 288 | else: 289 | img = np.stack([img] * 3) 290 | 291 | return img, pt, ht, phrase, bbox, gest, sal, img_file 292 | # return img, phrase, bbox, pt, ht 293 | 294 | def tokenize_phrase(self, phrase): 295 | return self.corpus.tokenize(phrase, self.query_len) 296 | 297 | def untokenize_word_vector(self, words): 298 | return self.corpus.dictionary[words] 299 | 300 | def __len__(self): 301 | return len(self.images) 302 | 303 | def __getitem__(self, idx): 304 | img, pt, ht, phrase, bbox, gest, 
sal, img_file = self.pull_item(idx) 305 | # phrase = phrase.decode("utf-8").encode().lower() 306 | 307 | 308 | phrase = phrase.lower() 309 | if self.augment: 310 | augment_flip, augment_hsv, augment_affine = True,True,True 311 | 312 | ## seems a bug in torch transformation resize, so separate in advance 313 | h,w = img.shape[0], img.shape[1] 314 | if self.augment: 315 | ## random horizontal flip 316 | if augment_flip and random.random() > 0.5: 317 | img = cv2.flip(img, 1) 318 | pt = cv2.flip(pt, 1 ) 319 | ht = cv2.flip(ht, 1 ) 320 | gest = cv2.flip(gest, 1) 321 | sal = cv2.flip(sal, 1 ) 322 | bbox[0], bbox[2] = w-bbox[2]-1, w-bbox[0]-1 323 | phrase = phrase.replace('right','*&^special^&*').replace('left','right').replace('*&^special^&*','left') 324 | 325 | ## random intensity, saturation change 326 | if augment_hsv: 327 | fraction = 0.5 328 | img_hsv = cv2.cvtColor(cv2.cvtColor(img, cv2.COLOR_RGB2BGR), cv2.COLOR_BGR2HSV) 329 | S = img_hsv[:, :, 1].astype(np.float32) 330 | V = img_hsv[:, :, 2].astype(np.float32) 331 | a = (random.random() * 2 - 1) * fraction + 1 332 | if a > 1: 333 | np.clip(S, a_min=0, a_max=255, out=S) 334 | a = (random.random() * 2 - 1) * fraction + 1 335 | V *= a 336 | if a > 1: 337 | np.clip(V, a_min=0, a_max=255, out=V) 338 | 339 | img_hsv[:, :, 1] = S.astype(np.uint8) 340 | img_hsv[:, :, 2] = V.astype(np.uint8) 341 | img = cv2.cvtColor(cv2.cvtColor(img_hsv, cv2.COLOR_HSV2BGR), cv2.COLOR_BGR2RGB) 342 | 343 | mask = np.ones_like(img) 344 | img, mask, ratio, dw, dh = letterbox(img, mask, self.imsize) 345 | #ht, _, ratio, dw, dh = letterbox(ht, None, self.imsize) 346 | gest, _, ratio, dw, dh = letterbox(gest, None, self.imsize) 347 | #sal, _, ratio, dw, dh = letterbox(sal, None, self.imsize) 348 | bbox[0], bbox[2] = bbox[0]*ratio+dw, bbox[2]*ratio+dw 349 | bbox[1], bbox[3] = bbox[1]*ratio+dh, bbox[3]*ratio+dh 350 | ## random affine transformation 351 | if augment_affine: 352 | gt = np.asarray(torch.zeros([512,512])) 353 | gt[int(bbox[1]):int(bbox[3]), int(bbox[0]):int(bbox[2])] = 1 354 | img, mask, bbox, M = random_affine(img, mask, bbox, \ 355 | degrees=(-5, 5), translate=(0.10, 0.10), scale=(0.90, 1.10)) 356 | pt = cv2.warpPerspective(pt, M, dsize=(512, 512), flags=cv2.INTER_LINEAR, 357 | borderValue=0) 358 | ht = cv2.warpPerspective(ht, M, dsize=(512, 512), flags=cv2.INTER_LINEAR, 359 | borderValue=0) 360 | gest = cv2.warpPerspective(gest, M, dsize=(512, 512), flags=cv2.INTER_NEAREST, 361 | borderValue=0) 362 | sal = cv2.warpPerspective(sal, M, dsize=(256, 256), flags=cv2.INTER_NEAREST, 363 | borderValue=0) 364 | gt = cv2.warpPerspective(gt, M, dsize=(512, 512), flags=cv2.INTER_NEAREST, 365 | borderValue=0) 366 | else: ## should be inference, or specified training 367 | mask = np.ones_like(img) 368 | img, mask, ratio, dw, dh = letterbox(img, mask, self.imsize) 369 | # ht, _, ratio, dw, dh = letterbox(ht, None, self.imsize) 370 | gest, _, ratio, dw, dh = letterbox(gest, None, self.imsize) 371 | bbox[0], bbox[2] = bbox[0]*ratio+dw, bbox[2]*ratio+dw 372 | bbox[1], bbox[3] = bbox[1]*ratio+dh, bbox[3]*ratio+dh 373 | gt = np.asarray(torch.zeros([512,512])) 374 | gt[int(bbox[1]):int(bbox[3]), int(bbox[0]):int(bbox[2])] = 1 375 | ## Norm, to tensor 376 | # print(img.shape) 377 | 378 | pt = pt[:,:,0] 379 | gest = gest[:,:,0] 380 | mask = mask[:,:,0] 381 | sal = np.reshape(sal, (3, 256, 256)) 382 | sal = sal[0,:,:] 383 | if self.transform is not None: 384 | 385 | img = self.transform(img) 386 | 387 | #pt = self.t(pt) 388 | #print(ht.shape) 389 | 390 | ht = self.transform(ht) 
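# Added note (assuming the Compose([ToTensor(), Normalize(...)]) transform passed in by the
# caller, e.g. the dataloader test at the bottom of model/grounding_model.py): img and ht are
# now channels-first float tensors, pt / gest / mask / gt remain single-channel numpy arrays,
# and sal is the 256x256 slice taken a few lines above; only the phrase encoding (LSTM corpus
# ids or BERT ids + mask) remains before the sample is returned.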
391 | 392 | #print(ht.shape) 393 | if self.lstm: 394 | phrase = self.tokenize_phrase(phrase) 395 | word_id = phrase 396 | # word_mask = np.zeros(word_id.shape) 397 | word_mask = np.array(word_id>0,dtype=int) 398 | else: 399 | ## encode phrase to bert input 400 | 401 | examples = read_examples(phrase, idx) 402 | features = convert_examples_to_features( 403 | examples=examples, seq_length=self.query_len, tokenizer=self.tokenizer) 404 | word_id = features[0].input_ids 405 | word_mask = features[0].input_mask 406 | #phrase = features[0].input_mask #clip.tokenize(phrase, context_length=20) 407 | if self.testmode: 408 | return img, pt, ht, gest, gt, mask, np.array(word_id, dtype=int), np.array(word_mask, dtype=int), \ 409 | np.array(bbox, dtype=np.float32), np.array(ratio, dtype=np.float32), \ 410 | np.array(dw, dtype=np.float32), np.array(dh, dtype=np.float32), self.images[idx][0],sal , phrase 411 | else: 412 | return img, pt, ht, gest, gt, mask, np.array(word_id, dtype=int), np.array(word_mask, dtype=int), \ 413 | np.array(bbox, dtype=np.float32),sal, phrase, img_file -------------------------------------------------------------------------------- /model/darknet.py: -------------------------------------------------------------------------------- 1 | from __future__ import division 2 | 3 | import math 4 | import torch 5 | import torch.nn as nn 6 | import torch.nn.functional as F 7 | from torch.autograd import Variable 8 | import numpy as np 9 | from collections import defaultdict, OrderedDict 10 | 11 | from PIL import Image 12 | 13 | # from utils.parse_config import * 14 | from utils.utils import * 15 | # import matplotlib.pyplot as plt 16 | # import matplotlib.patches as patches 17 | 18 | exist_id = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, \ 19 | 11, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, \ 20 | 23, 24, 25, 27, 28, 31, 32, 33, 34, 35, 36, \ 21 | 37, 38, 39, 40, 41, 42, 43, 44, 46, 47, 48, \ 22 | 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, \ 23 | 60, 61, 62, 63, 64, 65, 67, 70, 72, 73, 74, \ 24 | 75, 76, 77, 78, 79, 80, 81, 82, 84, 85, 86, \ 25 | 87, 88, 89, 90] 26 | catmap_dict = OrderedDict() 27 | for ii in range(len(exist_id)): 28 | catmap_dict[exist_id[ii]] = ii 29 | 30 | def build_object_targets( 31 | pred_boxes, pred_conf, pred_cls, target, anchors, num_anchors, num_classes, grid_size, ignore_thres, img_dim 32 | ): 33 | nB = target.size(0) 34 | nA = num_anchors 35 | nC = num_classes 36 | nG = grid_size 37 | mask = torch.zeros(nB, nA, nG, nG) 38 | conf_mask = torch.ones(nB, nA, nG, nG) 39 | tx = torch.zeros(nB, nA, nG, nG) 40 | ty = torch.zeros(nB, nA, nG, nG) 41 | tw = torch.zeros(nB, nA, nG, nG) 42 | th = torch.zeros(nB, nA, nG, nG) 43 | tconf = torch.ByteTensor(nB, nA, nG, nG).fill_(0) 44 | tcls = torch.ByteTensor(nB, nA, nG, nG, nC).fill_(0) 45 | 46 | nGT = 0 47 | nCorrect = 0 48 | for b in range(nB): 49 | for t in range(target.shape[1]): 50 | if target[b, t].sum() == 0: 51 | continue 52 | nGT += 1 53 | # Convert to position relative to box 54 | gx = target[b, t, 1] * nG 55 | gy = target[b, t, 2] * nG 56 | gw = target[b, t, 3] * nG 57 | gh = target[b, t, 4] * nG 58 | # Get grid box indices 59 | gi = int(gx) 60 | gj = int(gy) 61 | # Get shape of gt box 62 | gt_box = torch.FloatTensor(np.array([0, 0, gw, gh])).unsqueeze(0) 63 | # Get shape of anchor box 64 | anchor_shapes = torch.FloatTensor(np.concatenate((np.zeros((len(anchors), 2)), np.array(anchors)), 1)) 65 | # Calculate iou between gt and anchor shapes 66 | anch_ious = bbox_iou(gt_box, anchor_shapes) 67 | # Where the overlap is larger than 
threshold set mask to zero (ignore) 68 | conf_mask[b, anch_ious > ignore_thres, gj, gi] = 0 69 | # Find the best matching anchor box 70 | best_n = np.argmax(anch_ious) 71 | # Get ground truth box 72 | gt_box = torch.FloatTensor(np.array([gx, gy, gw, gh])).unsqueeze(0) 73 | # Get the best prediction 74 | pred_box = pred_boxes[b, best_n, gj, gi].unsqueeze(0) 75 | # Masks 76 | mask[b, best_n, gj, gi] = 1 77 | conf_mask[b, best_n, gj, gi] = 1 78 | # Coordinates 79 | tx[b, best_n, gj, gi] = gx - gi 80 | ty[b, best_n, gj, gi] = gy - gj 81 | # Width and height 82 | tw[b, best_n, gj, gi] = math.log(gw / anchors[best_n][0] + 1e-16) 83 | th[b, best_n, gj, gi] = math.log(gh / anchors[best_n][1] + 1e-16) 84 | # One-hot encoding of label 85 | target_label = int(target[b, t, 0]) 86 | target_label = catmap_dict[target_label] 87 | tcls[b, best_n, gj, gi, target_label] = 1 88 | tconf[b, best_n, gj, gi] = 1 89 | 90 | # Calculate iou between ground truth and best matching prediction 91 | iou = bbox_iou(gt_box, pred_box, x1y1x2y2=False) 92 | pred_label = torch.argmax(pred_cls[b, best_n, gj, gi]) 93 | score = pred_conf[b, best_n, gj, gi] 94 | if iou > 0.5 and pred_label == target_label and score > 0.5: 95 | nCorrect += 1 96 | 97 | return nGT, nCorrect, mask, conf_mask, tx, ty, tw, th, tconf, tcls 98 | 99 | def parse_model_config(path): 100 | """Parses the yolo-v3 layer configuration file and returns module definitions""" 101 | file = open(path, 'r') 102 | lines = file.read().split('\n') 103 | lines = [x for x in lines if x and not x.startswith('#')] 104 | lines = [x.rstrip().lstrip() for x in lines] # get rid of fringe whitespaces 105 | module_defs = [] 106 | for line in lines: 107 | if line.startswith('['): # This marks the start of a new block 108 | module_defs.append({}) 109 | module_defs[-1]['type'] = line[1:-1].rstrip() 110 | if module_defs[-1]['type'] == 'convolutional' or module_defs[-1]['type'] == 'yoloconvolutional': 111 | module_defs[-1]['batch_normalize'] = 0 112 | else: 113 | key, value = line.split("=") 114 | value = value.strip() 115 | module_defs[-1][key.rstrip()] = value.strip() 116 | return module_defs 117 | 118 | class ConvBatchNormReLU(nn.Sequential): 119 | def __init__( 120 | self, 121 | in_channels, 122 | out_channels, 123 | kernel_size, 124 | stride, 125 | padding, 126 | dilation, 127 | leaky=False, 128 | relu=True, 129 | instance=False, 130 | ): 131 | super(ConvBatchNormReLU, self).__init__() 132 | self.add_module( 133 | "conv", 134 | nn.Conv2d( 135 | in_channels=in_channels, 136 | out_channels=out_channels, 137 | kernel_size=kernel_size, 138 | stride=stride, 139 | padding=padding, 140 | dilation=dilation, 141 | bias=False, 142 | ), 143 | ) 144 | if instance: 145 | self.add_module( 146 | "bn", 147 | nn.InstanceNorm2d(num_features=out_channels), 148 | ) 149 | else: 150 | self.add_module( 151 | "bn", 152 | nn.BatchNorm2d( 153 | num_features=out_channels, eps=1e-5, momentum=0.999, affine=True 154 | ), 155 | ) 156 | 157 | if leaky: 158 | self.add_module("relu", nn.LeakyReLU(0.1)) 159 | elif relu: 160 | self.add_module("relu", nn.ReLU()) 161 | 162 | def forward(self, x): 163 | return super(ConvBatchNormReLU, self).forward(x) 164 | 165 | class ConvBatchNormReLU_3d(nn.Sequential): 166 | def __init__( 167 | self, 168 | in_channels, 169 | out_channels, 170 | kernel_size, 171 | stride, 172 | padding, 173 | dilation, 174 | leaky=False, 175 | relu=True, 176 | ): 177 | super(ConvBatchNormReLU_3d, self).__init__() 178 | self.add_module( 179 | "conv", 180 | nn.Conv3d( 181 | in_channels=in_channels, 182 
| out_channels=out_channels, 183 | kernel_size=kernel_size, 184 | stride=stride, 185 | padding=padding, 186 | dilation=dilation, 187 | bias=False, 188 | ), 189 | ) 190 | self.add_module( 191 | "bn", 192 | nn.BatchNorm3d( 193 | num_features=out_channels, eps=1e-5, momentum=0.999, affine=True 194 | ), 195 | ) 196 | 197 | if leaky: 198 | self.add_module("relu", nn.LeakyReLU(0.1)) 199 | elif relu: 200 | self.add_module("relu", nn.ReLU()) 201 | 202 | def forward(self, x): 203 | return super(ConvBatchNormReLU_3d, self).forward(x) 204 | 205 | class MyUpsample2(nn.Module): 206 | def forward(self, x): 207 | return x[:, :, :, None, :, None].expand(-1, -1, -1, 2, -1, 2).reshape(x.size(0), x.size(1), x.size(2)*2, x.size(3)*2) 208 | 209 | def create_modules(module_defs): 210 | """ 211 | Constructs module list of layer blocks from module configuration in module_defs 212 | """ 213 | hyperparams = module_defs.pop(0) 214 | output_filters = [int(hyperparams["channels"])] 215 | module_list = nn.ModuleList() 216 | for i, module_def in enumerate(module_defs): 217 | modules = nn.Sequential() 218 | 219 | if module_def["type"] == "convolutional" or module_def["type"] == "yoloconvolutional": 220 | bn = int(module_def["batch_normalize"]) 221 | filters = int(module_def["filters"]) 222 | kernel_size = int(module_def["size"]) 223 | pad = (kernel_size - 1) // 2 if int(module_def["pad"]) else 0 224 | modules.add_module( 225 | "conv_%d" % i, 226 | nn.Conv2d( 227 | in_channels=output_filters[-1], 228 | out_channels=filters, 229 | kernel_size=kernel_size, 230 | stride=int(module_def["stride"]), 231 | padding=pad, 232 | bias=not bn, 233 | ), 234 | ) 235 | if bn: 236 | modules.add_module("batch_norm_%d" % i, nn.BatchNorm2d(filters)) 237 | if module_def["activation"] == "leaky": 238 | modules.add_module("leaky_%d" % i, nn.LeakyReLU(0.1)) 239 | 240 | elif module_def["type"] == "maxpool": 241 | kernel_size = int(module_def["size"]) 242 | stride = int(module_def["stride"]) 243 | if kernel_size == 2 and stride == 1: 244 | padding = nn.ZeroPad2d((0, 1, 0, 1)) 245 | modules.add_module("_debug_padding_%d" % i, padding) 246 | maxpool = nn.MaxPool2d( 247 | kernel_size=int(module_def["size"]), 248 | stride=int(module_def["stride"]), 249 | padding=int((kernel_size - 1) // 2), 250 | ) 251 | modules.add_module("maxpool_%d" % i, maxpool) 252 | 253 | elif module_def["type"] == "upsample": 254 | # upsample = nn.Upsample(scale_factor=int(module_def["stride"]), mode="nearest") 255 | assert(int(module_def["stride"])==2) 256 | upsample = MyUpsample2() 257 | modules.add_module("upsample_%d" % i, upsample) 258 | 259 | elif module_def["type"] == "route": 260 | layers = [int(x) for x in module_def["layers"].split(",")] 261 | filters = sum([output_filters[layer_i] for layer_i in layers]) 262 | modules.add_module("route_%d" % i, EmptyLayer()) 263 | 264 | elif module_def["type"] == "shortcut": 265 | filters = output_filters[int(module_def["from"])] 266 | modules.add_module("shortcut_%d" % i, EmptyLayer()) 267 | 268 | elif module_def["type"] == "yolo": 269 | anchor_idxs = [int(x) for x in module_def["mask"].split(",")] 270 | # Extract anchors 271 | anchors = [int(x) for x in module_def["anchors"].split(",")] 272 | anchors = [(anchors[i], anchors[i + 1]) for i in range(0, len(anchors), 2)] 273 | anchors = [anchors[i] for i in anchor_idxs] 274 | num_classes = int(module_def["classes"]) 275 | img_height = int(hyperparams["height"]) 276 | # Define detection layer 277 | # yolo_layer = YOLOLayer(anchors, num_classes, img_height) 278 | yolo_layer = 
YOLOLayer(anchors, num_classes, 256) 279 | modules.add_module("yolo_%d" % i, yolo_layer) 280 | # Register module list and number of output filters 281 | module_list.append(modules) 282 | output_filters.append(filters) 283 | 284 | return hyperparams, module_list 285 | 286 | class EmptyLayer(nn.Module): 287 | """Placeholder for 'route' and 'shortcut' layers""" 288 | 289 | def __init__(self): 290 | super(EmptyLayer, self).__init__() 291 | 292 | class YOLOLayer(nn.Module): 293 | """Detection layer""" 294 | 295 | def __init__(self, anchors, num_classes, img_dim): 296 | super(YOLOLayer, self).__init__() 297 | self.anchors = anchors 298 | self.num_anchors = len(anchors) 299 | self.num_classes = num_classes 300 | self.bbox_attrs = 5 + num_classes 301 | self.image_dim = img_dim 302 | self.ignore_thres = 0.5 303 | self.lambda_coord = 1 304 | 305 | self.mse_loss = nn.MSELoss(size_average=True) # Coordinate loss 306 | self.bce_loss = nn.BCELoss(size_average=True) # Confidence loss 307 | self.ce_loss = nn.CrossEntropyLoss() # Class loss 308 | 309 | def forward(self, x, targets=None): 310 | nA = self.num_anchors 311 | nB = x.size(0) 312 | nG = x.size(2) 313 | stride = self.image_dim / nG 314 | 315 | # Tensors for cuda support 316 | FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor 317 | LongTensor = torch.cuda.LongTensor if x.is_cuda else torch.LongTensor 318 | ByteTensor = torch.cuda.ByteTensor if x.is_cuda else torch.ByteTensor 319 | 320 | prediction = x.view(nB, nA, self.bbox_attrs, nG, nG).permute(0, 1, 3, 4, 2).contiguous() 321 | 322 | # Get outputs 323 | x = torch.sigmoid(prediction[..., 0]) # Center x 324 | y = torch.sigmoid(prediction[..., 1]) # Center y 325 | w = prediction[..., 2] # Width 326 | h = prediction[..., 3] # Height 327 | pred_conf = torch.sigmoid(prediction[..., 4]) # Conf 328 | pred_cls = torch.sigmoid(prediction[..., 5:]) # Cls pred. 
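# Worked example of the decoding done below (added for clarity, numbers are only
# illustrative): with nG = 13 and the anchor (116, 90), scaled by 416/nG = 32 to
# (3.625, 2.8125), a cell at grid position (gi, gj) = (4, 7) yields
#   bx = sigmoid(tx) + 4,   by = sigmoid(ty) + 7      # centre in grid units
#   bw = 3.625 * exp(tw),   bh = 2.8125 * exp(th)     # size in grid units
# and the inference branch later multiplies pred_boxes by `stride` to return to image
# coordinates.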
329 | 330 | # Calculate offsets for each grid 331 | grid_x = torch.arange(nG).repeat(nG, 1).view([1, 1, nG, nG]).type(FloatTensor) 332 | grid_y = torch.arange(nG).repeat(nG, 1).t().view([1, 1, nG, nG]).type(FloatTensor) 333 | # scaled_anchors = FloatTensor([(a_w / stride, a_h / stride) for a_w, a_h in self.anchors]) 334 | scaled_anchors = FloatTensor([(a_w / (416 / nG), a_h / (416 / nG)) for a_w, a_h in self.anchors]) 335 | anchor_w = scaled_anchors[:, 0:1].view((1, nA, 1, 1)) 336 | anchor_h = scaled_anchors[:, 1:2].view((1, nA, 1, 1)) 337 | 338 | # Add offset and scale with anchors 339 | pred_boxes = FloatTensor(prediction[..., :4].shape) 340 | pred_boxes[..., 0] = x.data + grid_x 341 | pred_boxes[..., 1] = y.data + grid_y 342 | pred_boxes[..., 2] = torch.exp(w.data) * anchor_w 343 | pred_boxes[..., 3] = torch.exp(h.data) * anchor_h 344 | 345 | # Training 346 | if targets is not None: 347 | targets = targets.clone() 348 | targets[:,:,1:] = targets[:,:,1:]/self.image_dim 349 | for b_i in range(targets.shape[0]): 350 | targets[b_i,:,1:] = xyxy2xywh(targets[b_i,:,1:]) 351 | 352 | if x.is_cuda: 353 | self.mse_loss = self.mse_loss.cuda() 354 | self.bce_loss = self.bce_loss.cuda() 355 | self.ce_loss = self.ce_loss.cuda() 356 | 357 | nGT, nCorrect, mask, conf_mask, tx, ty, tw, th, tconf, tcls = build_object_targets( 358 | pred_boxes=pred_boxes.cpu().data, 359 | pred_conf=pred_conf.cpu().data, 360 | pred_cls=pred_cls.cpu().data, 361 | target=targets.cpu().data, 362 | anchors=scaled_anchors.cpu().data, 363 | num_anchors=nA, 364 | num_classes=self.num_classes, 365 | grid_size=nG, 366 | ignore_thres=self.ignore_thres, 367 | img_dim=self.image_dim, 368 | ) 369 | 370 | nProposals = int((pred_conf > 0.5).sum().item()) 371 | recall = float(nCorrect / nGT) if nGT else 1 372 | precision = float(nCorrect / nProposals) if nProposals else 0 373 | 374 | # Handle masks 375 | mask = Variable(mask.type(ByteTensor)) 376 | conf_mask = Variable(conf_mask.type(ByteTensor)) 377 | 378 | # Handle target variables 379 | tx = Variable(tx.type(FloatTensor), requires_grad=False) 380 | ty = Variable(ty.type(FloatTensor), requires_grad=False) 381 | tw = Variable(tw.type(FloatTensor), requires_grad=False) 382 | th = Variable(th.type(FloatTensor), requires_grad=False) 383 | tconf = Variable(tconf.type(FloatTensor), requires_grad=False) 384 | tcls = Variable(tcls.type(LongTensor), requires_grad=False) 385 | 386 | # Get conf mask where gt and where there is no gt 387 | conf_mask_true = mask 388 | conf_mask_false = conf_mask - mask 389 | 390 | # Mask outputs to ignore non-existing objects 391 | loss_x = self.mse_loss(x[mask], tx[mask]) 392 | loss_y = self.mse_loss(y[mask], ty[mask]) 393 | loss_w = self.mse_loss(w[mask], tw[mask]) 394 | loss_h = self.mse_loss(h[mask], th[mask]) 395 | loss_conf = self.bce_loss(pred_conf[conf_mask_false], tconf[conf_mask_false]) + self.bce_loss( 396 | pred_conf[conf_mask_true], tconf[conf_mask_true] 397 | ) 398 | loss_cls = (1 / nB) * self.ce_loss(pred_cls[mask], torch.argmax(tcls[mask], 1)) 399 | loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls 400 | return ( 401 | loss, 402 | loss_x.item(), 403 | loss_y.item(), 404 | loss_w.item(), 405 | loss_h.item(), 406 | loss_conf.item(), 407 | loss_cls.item(), 408 | recall, 409 | precision, 410 | ) 411 | 412 | else: 413 | # If not in training phase return predictions 414 | output = torch.cat( 415 | ( 416 | pred_boxes.view(nB, -1, 4) * stride, 417 | pred_conf.view(nB, -1, 1), 418 | pred_cls.view(nB, -1, self.num_classes), 419 | ), 420 | -1, 421 
| ) 422 | return output 423 | 424 | class Darknet(nn.Module): 425 | """YOLOv3 object detection model""" 426 | 427 | def __init__(self, config_path='./model/yolov3.cfg', img_size=416, obj_out=False): 428 | super(Darknet, self).__init__() 429 | self.config_path = config_path 430 | self.obj_out = obj_out 431 | self.module_defs = parse_model_config(config_path) 432 | self.hyperparams, self.module_list = create_modules(self.module_defs) 433 | self.img_size = img_size 434 | self.seen = 0 435 | self.header_info = np.array([0, 0, 0, self.seen, 0]) 436 | self.loss_names = ["x", "y", "w", "h", "conf", "cls", "recall", "precision"] 437 | 438 | def forward(self, x, targets=None): 439 | batch = x.shape[0] 440 | is_training = targets is not None 441 | output, output_obj = [], [] 442 | self.losses = defaultdict(float) 443 | layer_outputs = [] 444 | for i, (module_def, module) in enumerate(zip(self.module_defs, self.module_list)): 445 | if module_def["type"] in ["convolutional", "upsample", "maxpool"]: 446 | x = module(x) 447 | elif module_def["type"] == "route": 448 | layer_i = [int(x) for x in module_def["layers"].split(",")] 449 | x = torch.cat([layer_outputs[i] for i in layer_i], 1) 450 | elif module_def["type"] == "shortcut": 451 | layer_i = int(module_def["from"]) 452 | x = layer_outputs[-1] + layer_outputs[layer_i] 453 | elif module_def["type"] == "yoloconvolutional": 454 | output.append(x) ## save final feature block 455 | x = module(x) 456 | elif module_def["type"] == "yolo": 457 | # Train phase: get loss 458 | if is_training: 459 | x, *losses = module[0](x, targets) 460 | for name, loss in zip(self.loss_names, losses): 461 | self.losses[name] += loss 462 | # Test phase: Get detections 463 | else: 464 | x = module(x) 465 | output_obj.append(x) 466 | # x = module(x) 467 | # output.append(x) 468 | layer_outputs.append(x) 469 | 470 | self.losses["recall"] /= 3 471 | self.losses["precision"] /= 3 472 | # return sum(output) if is_training else torch.cat(output, 1) 473 | # return torch.cat(output, 1) 474 | if self.obj_out: 475 | return output, sum(output_obj) if is_training else torch.cat(output_obj, 1), self.losses["precision"], self.losses["recall"] 476 | # return output, sum(output_obj)/(len(output_obj)*batch) if is_training else torch.cat(output_obj, 1) 477 | else: 478 | return output 479 | 480 | def load_weights(self, weights_path): 481 | """Parses and loads the weights stored in 'weights_path'""" 482 | 483 | # Open the weights file 484 | fp = open(weights_path, "rb") 485 | if self.config_path=='./model/yolo9000.cfg': 486 | header = np.fromfile(fp, dtype=np.int32, count=4) # First five are header values 487 | else: 488 | header = np.fromfile(fp, dtype=np.int32, count=5) # First five are header values 489 | # Needed to write header when saving weights 490 | self.header_info = header 491 | 492 | self.seen = header[3] 493 | weights = np.fromfile(fp, dtype=np.float32) # The rest are weights 494 | fp.close() 495 | 496 | ptr = 0 497 | for i, (module_def, module) in enumerate(zip(self.module_defs, self.module_list)): 498 | if module_def["type"] == "convolutional" or module_def["type"] == "yoloconvolutional": 499 | conv_layer = module[0] 500 | if module_def["batch_normalize"]: 501 | # Load BN bias, weights, running mean and running variance 502 | bn_layer = module[1] 503 | num_b = bn_layer.bias.numel() # Number of biases 504 | # Bias 505 | bn_b = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.bias) 506 | bn_layer.bias.data.copy_(bn_b) 507 | ptr += num_b 508 | # Weight 509 | bn_w = 
torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.weight) 510 | bn_layer.weight.data.copy_(bn_w) 511 | ptr += num_b 512 | # Running Mean 513 | bn_rm = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.running_mean) 514 | bn_layer.running_mean.data.copy_(bn_rm) 515 | ptr += num_b 516 | # Running Var 517 | bn_rv = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.running_var) 518 | bn_layer.running_var.data.copy_(bn_rv) 519 | ptr += num_b 520 | else: 521 | # Load conv. bias 522 | num_b = conv_layer.bias.numel() 523 | conv_b = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(conv_layer.bias) 524 | conv_layer.bias.data.copy_(conv_b) 525 | ptr += num_b 526 | # Load conv. weights 527 | num_w = conv_layer.weight.numel() 528 | conv_w = torch.from_numpy(weights[ptr : ptr + num_w]).view_as(conv_layer.weight) 529 | conv_layer.weight.data.copy_(conv_w) 530 | ptr += num_w 531 | 532 | """ 533 | @:param path - path of the new weights file 534 | @:param cutoff - save layers between 0 and cutoff (cutoff = -1 -> all are saved) 535 | """ 536 | 537 | def save_weights(self, path, cutoff=-1): 538 | 539 | fp = open(path, "wb") 540 | self.header_info[3] = self.seen 541 | self.header_info.tofile(fp) 542 | 543 | # Iterate through layers 544 | for i, (module_def, module) in enumerate(zip(self.module_defs[:cutoff], self.module_list[:cutoff])): 545 | if module_def["type"] == "convolutional": 546 | conv_layer = module[0] 547 | # If batch norm, load bn first 548 | if module_def["batch_normalize"]: 549 | bn_layer = module[1] 550 | bn_layer.bias.data.cpu().numpy().tofile(fp) 551 | bn_layer.weight.data.cpu().numpy().tofile(fp) 552 | bn_layer.running_mean.data.cpu().numpy().tofile(fp) 553 | bn_layer.running_var.data.cpu().numpy().tofile(fp) 554 | # Load conv bias 555 | else: 556 | conv_layer.bias.data.cpu().numpy().tofile(fp) 557 | # Load conv weights 558 | conv_layer.weight.data.cpu().numpy().tofile(fp) 559 | 560 | fp.close 561 | 562 | class Darknetfort(nn.Module): 563 | """YOLOv3 object detection model""" 564 | 565 | def __init__(self, config_path='./model/yolov3.cfg', img_size=416, obj_out=False): 566 | super(Darknetfort, self).__init__() 567 | self.config_path = config_path 568 | self.obj_out = obj_out 569 | self.module_defs = parse_model_config(config_path) 570 | self.hyperparams, self.module_list = create_modules(self.module_defs) 571 | self.img_size = img_size 572 | self.seen = 0 573 | self.header_info = np.array([0, 0, 0, self.seen, 0]) 574 | self.loss_names = ["x", "y", "w", "h", "conf", "cls", "recall", "precision"] 575 | self.layer_num = 12 576 | def forward(self, x, targets=None): 577 | batch = x.shape[0] 578 | is_training = targets is not None 579 | output, output_obj = [], [] 580 | self.losses = defaultdict(float) 581 | layer_outputs = [] 582 | layer = 0 583 | for i, (module_def, module) in enumerate(zip(self.module_defs, self.module_list)): 584 | if module_def["type"] in ["convolutional", "upsample", "maxpool"]: 585 | x = module(x) 586 | layer += 1 587 | elif module_def["type"] == "route": 588 | layer_i = [int(x) for x in module_def["layers"].split(",")] 589 | x = torch.cat([layer_outputs[i] for i in layer_i], 1) 590 | layer += 1 591 | elif module_def["type"] == "shortcut": 592 | layer_i = int(module_def["from"]) 593 | x = layer_outputs[-1] + layer_outputs[layer_i] 594 | layer += 1 595 | elif module_def["type"] == "yoloconvolutional": 596 | output.append(x) ## save final feature block 597 | x = module(x) 598 | layer += 1 599 | elif module_def["type"] == "yolo": 600 | # 
Train phase: get loss 601 | if is_training: 602 | x, *losses = module[0](x, targets) 603 | for name, loss in zip(self.loss_names, losses): 604 | self.losses[name] += loss 605 | # Test phase: Get detections 606 | else: 607 | x = module(x) 608 | output_obj.append(x) 609 | layer += 1 610 | # x = module(x) 611 | # output.append(x) 612 | layer_outputs.append(x) 613 | 614 | self.losses["recall"] /= 3 615 | self.losses["precision"] /= 3 616 | # return sum(output) if is_training else torch.cat(output, 1) 617 | # return torch.cat(output, 1) 618 | if self.obj_out: 619 | return output, sum(output_obj) if is_training else torch.cat(output_obj, 1), self.losses["precision"], self.losses["recall"] 620 | # return output, sum(output_obj)/(len(output_obj)*batch) if is_training else torch.cat(output_obj, 1) 621 | else: 622 | return output 623 | 624 | def load_weights(self, weights_path): 625 | """Parses and loads the weights stored in 'weights_path'""" 626 | 627 | # Open the weights file 628 | fp = open(weights_path, "rb") 629 | if self.config_path=='./model/yolo9000.cfg': 630 | header = np.fromfile(fp, dtype=np.int32, count=4) # First five are header values 631 | else: 632 | header = np.fromfile(fp, dtype=np.int32, count=5) # First five are header values 633 | # Needed to write header when saving weights 634 | self.header_info = header 635 | 636 | self.seen = header[3] 637 | weights = np.fromfile(fp, dtype=np.float32) # The rest are weights 638 | fp.close() 639 | 640 | ptr = 0 641 | for i, (module_def, module) in enumerate(zip(self.module_defs, self.module_list)): 642 | if module_def["type"] == "convolutional" or module_def["type"] == "yoloconvolutional": 643 | conv_layer = module[0] 644 | if module_def["batch_normalize"]: 645 | # Load BN bias, weights, running mean and running variance 646 | bn_layer = module[1] 647 | num_b = bn_layer.bias.numel() # Number of biases 648 | # Bias 649 | bn_b = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.bias) 650 | bn_layer.bias.data.copy_(bn_b) 651 | ptr += num_b 652 | # Weight 653 | bn_w = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.weight) 654 | bn_layer.weight.data.copy_(bn_w) 655 | ptr += num_b 656 | # Running Mean 657 | bn_rm = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.running_mean) 658 | bn_layer.running_mean.data.copy_(bn_rm) 659 | ptr += num_b 660 | # Running Var 661 | bn_rv = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.running_var) 662 | bn_layer.running_var.data.copy_(bn_rv) 663 | ptr += num_b 664 | else: 665 | # Load conv. bias 666 | num_b = conv_layer.bias.numel() 667 | conv_b = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(conv_layer.bias) 668 | conv_layer.bias.data.copy_(conv_b) 669 | ptr += num_b 670 | # Load conv. 
weights 671 | num_w = conv_layer.weight.numel() 672 | conv_w = torch.from_numpy(weights[ptr : ptr + num_w]).view_as(conv_layer.weight) 673 | conv_layer.weight.data.copy_(conv_w) 674 | ptr += num_w 675 | 676 | """ 677 | @:param path - path of the new weights file 678 | @:param cutoff - save layers between 0 and cutoff (cutoff = -1 -> all are saved) 679 | """ 680 | 681 | def save_weights(self, path, cutoff=-1): 682 | 683 | fp = open(path, "wb") 684 | self.header_info[3] = self.seen 685 | self.header_info.tofile(fp) 686 | 687 | # Iterate through layers 688 | for i, (module_def, module) in enumerate(zip(self.module_defs[:cutoff], self.module_list[:cutoff])): 689 | if module_def["type"] == "convolutional": 690 | conv_layer = module[0] 691 | # If batch norm, load bn first 692 | if module_def["batch_normalize"]: 693 | bn_layer = module[1] 694 | bn_layer.bias.data.cpu().numpy().tofile(fp) 695 | bn_layer.weight.data.cpu().numpy().tofile(fp) 696 | bn_layer.running_mean.data.cpu().numpy().tofile(fp) 697 | bn_layer.running_var.data.cpu().numpy().tofile(fp) 698 | # Load conv bias 699 | else: 700 | conv_layer.bias.data.cpu().numpy().tofile(fp) 701 | # Load conv weights 702 | conv_layer.weight.data.cpu().numpy().tofile(fp) 703 | 704 | fp.close 705 | 706 | 707 | if __name__ == "__main__": 708 | import torch 709 | import numpy as np 710 | torch.manual_seed(13) 711 | np.random.seed(13) 712 | torch.backends.cudnn.deterministic = True 713 | torch.backends.cudnn.benchmark = False 714 | 715 | model = Darknet() 716 | model.load_weights('./saved_models/yolov3.weights') 717 | # model.eval() 718 | 719 | image = torch.autograd.Variable(torch.randn(1, 3, 416, 416)) 720 | output1, output2, output3 = model(image) 721 | print(output1) 722 | # print(output1.size(), output2.size(), output3.size()) 723 | # print(model(image)) 724 | # print(len(output), output[0].size(), output[1].size(), output[2].size()) 725 | -------------------------------------------------------------------------------- /model/modulation.py: -------------------------------------------------------------------------------- 1 | from collections import OrderedDict 2 | import math 3 | import random 4 | import pprint 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | from torch.autograd import Variable 9 | import torchvision.models 10 | from torch.nn.init import kaiming_normal, kaiming_uniform_ 11 | from .darknet import ConvBatchNormReLU, ConvBatchNormReLU_3d 12 | 13 | class Bottleneck(nn.Module): 14 | expansion = 4 15 | 16 | def __init__(self, inplanes, planes, stride=1): 17 | super().__init__() 18 | 19 | # all conv layers have stride 1. 
an avgpool is performed after the second convolution when stride > 1 20 | self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False) 21 | self.bn1 = nn.BatchNorm2d(planes) 22 | 23 | self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False) 24 | self.bn2 = nn.BatchNorm2d(planes) 25 | 26 | self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity() 27 | 28 | self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False) 29 | self.bn3 = nn.BatchNorm2d(planes * self.expansion) 30 | 31 | self.relu = nn.ReLU(inplace=True) 32 | self.downsample = None 33 | self.stride = stride 34 | 35 | if stride > 1 or inplanes != planes * Bottleneck.expansion: 36 | # downsampling layer is prepended with an avgpool, and the subsequent convolution has stride 1 37 | self.downsample = nn.Sequential(OrderedDict([ 38 | ("-1", nn.AvgPool2d(stride)), 39 | ("0", nn.Conv2d(inplanes, planes * self.expansion, 1, stride=1, bias=False)), 40 | ("1", nn.BatchNorm2d(planes * self.expansion)) 41 | ])) 42 | 43 | def forward(self, x: torch.Tensor): 44 | identity = x 45 | 46 | out = self.relu(self.bn1(self.conv1(x))) 47 | out = self.relu(self.bn2(self.conv2(out))) 48 | out = self.avgpool(out) 49 | out = self.bn3(self.conv3(out)) 50 | 51 | if self.downsample is not None: 52 | identity = self.downsample(x) 53 | 54 | out += identity 55 | out = self.relu(out) 56 | return out 57 | 58 | class AttentionPool2d(nn.Module): 59 | def __init__(self, spacial_dim: int, embed_dim: int, num_heads: int, output_dim: int = None): 60 | super().__init__() 61 | self.positional_embedding = nn.Parameter(torch.randn(spacial_dim ** 2 + 1, embed_dim) / embed_dim ** 0.5) 62 | self.k_proj = nn.Linear(embed_dim, embed_dim) 63 | self.q_proj = nn.Linear(embed_dim, embed_dim) 64 | self.v_proj = nn.Linear(embed_dim, embed_dim) 65 | self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim) 66 | self.num_heads = num_heads 67 | self.embed_dim = embed_dim 68 | self.spacial_dim = spacial_dim 69 | 70 | def forward(self, x): 71 | B, C, H, W = x.shape 72 | x = x.reshape(x.shape[0], x.shape[1], x.shape[2] * x.shape[3]).permute(2, 0, 1) # NCHW -> (HW)NC 73 | x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0) # (HW+1)NC 74 | 75 | cls_pos = self.positional_embedding[0:1, :] 76 | # spatial_pos = F.interpolate(self.positional_embedding[1:,].reshape(1, self.spacial_dim, self.spacial_dim, self.embed_dim).permute(0, 3, 1, 2), size=(H, W), mode='bilinear') 77 | spatial_pos = self.positional_embedding[1:].reshape(self.spacial_dim, self.spacial_dim, self.embed_dim)[:H, :W] 78 | spatial_pos = spatial_pos.reshape(-1, self.embed_dim) 79 | # spatial_pos = spatial_pos.reshape(self.embed_dim, H*W).permute(1, 0) 80 | positional_embedding = torch.cat([cls_pos, spatial_pos], dim=0) 81 | 82 | x = x + positional_embedding[:, None, :] 83 | x, _ = F.multi_head_attention_forward( 84 | query=x, key=x, value=x, 85 | embed_dim_to_check=x.shape[-1], 86 | num_heads=self.num_heads, 87 | q_proj_weight=self.q_proj.weight, 88 | k_proj_weight=self.k_proj.weight, 89 | v_proj_weight=self.v_proj.weight, 90 | in_proj_weight=None, 91 | in_proj_bias=torch.cat([self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]), 92 | bias_k=None, 93 | bias_v=None, 94 | add_zero_attn=False, 95 | dropout_p=0, 96 | out_proj_weight=self.c_proj.weight, 97 | out_proj_bias=self.c_proj.bias, 98 | use_separate_proj_weight=True, 99 | training=self.training, 100 | need_weights=False 101 | ) 102 | 103 | x = x.permute(1, 2, 0) 104 | global_feat = x[:, :, 0] 105 | feature_map = x[:, :, 1:].reshape(B, -1, H, W) 106 | 
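# Added note: global_feat is the attention output at the prepended mean-token position
# (a CLS-like summary of the whole map), while feature_map reshapes the remaining H*W token
# outputs back to a (B, C, H, W) spatial map for downstream use.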
return global_feat, feature_map 107 | 108 | class CLIPResNetWithAttention(nn.Module): 109 | """ 110 | A ResNet class that is similar to torchvision's but contains the following changes: 111 | - There are now 3 "stem" convolutions as opposed to 1, with an average pool instead of a max pool. 112 | - Performs anti-aliasing strided convolutions, where an avgpool is prepended to convolutions with stride > 1 113 | - The final pooling layer is a QKV attention instead of an average pool 114 | """ 115 | 116 | def __init__(self, layers, output_dim=1024, input_resolution=224, width=64, pretrained=None, att_level3=False, baseline=False, **kwargs): 117 | super().__init__() 118 | self.pretrained = pretrained 119 | self.output_dim = output_dim 120 | self.input_resolution = input_resolution 121 | 122 | # the 3-layer stem 123 | self.conv1 = nn.Conv2d(4, width // 2, kernel_size=3, stride=2, padding=1, bias=False) 124 | self.bn1 = nn.BatchNorm2d(width // 2) 125 | self.conv2 = nn.Conv2d(width // 2, width // 2, kernel_size=3, padding=1, bias=False) 126 | self.bn2 = nn.BatchNorm2d(width // 2) 127 | self.conv3 = nn.Conv2d(width // 2, width, kernel_size=3, padding=1, bias=False) 128 | self.bn3 = nn.BatchNorm2d(width) 129 | self.avgpool = nn.AvgPool2d(2) 130 | self.relu = nn.ReLU(inplace=True) 131 | self.reg = torch.nn.Sequential( 132 | nn.Conv2d(256, 1, kernel_size=1, padding=0, bias=False), 133 | nn.Sigmoid() 134 | ) 135 | # residual layers 136 | self._inplanes = width # this is a *mutable* variable used during construction 137 | self.layer1 = self._make_layer(width, layers[0]) 138 | self.layer2 = self._make_layer(width * 2, layers[1], stride=2) 139 | self.layer3 = self._make_layer(width * 4, layers[2], stride=2) 140 | self.layer4 = self._make_layer(width * 8, layers[3], stride=2) 141 | 142 | embed_dim = width * 32 # the ResNet feature dimension 143 | self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim, 32, output_dim) 144 | self.att_level3 = att_level3 145 | self.baseline = baseline 146 | 147 | def init_weights(self, pretrained=None): 148 | pretrained = pretrained or self.pretrained 149 | if isinstance(pretrained, str): 150 | checkpoint = torch.jit.load(pretrained, map_location='cpu').float().state_dict() 151 | 152 | state_dict = {} 153 | 154 | for k in checkpoint.keys(): 155 | if k.startswith('visual.'): 156 | new_k = k.replace('visual.', '') 157 | state_dict[new_k] = checkpoint[k] 158 | 159 | if 'positional_embedding' in new_k: 160 | if self.attnpool.positional_embedding.shape != state_dict[new_k].shape: 161 | print(f'Resize the pos_embed shape from {state_dict[new_k].shape} to {self.attnpool.positional_embedding.shape}') 162 | cls_pos = state_dict[new_k][0:1, :] 163 | H = W = self.input_resolution // 32 164 | spatial_pos = F.interpolate(state_dict[new_k][1:,].reshape(1, 7, 7, cls_pos.shape[1]).permute(0, 3, 1, 2), size=(H, W), mode='bilinear') 165 | spatial_pos = spatial_pos.reshape(cls_pos.shape[1], H*W).permute(1, 0) 166 | positional_embedding = torch.cat([cls_pos, spatial_pos], dim=0) 167 | state_dict[new_k] = positional_embedding 168 | assert self.attnpool.positional_embedding.shape == state_dict[new_k].shape 169 | 170 | u, w = self.load_state_dict(state_dict, False) 171 | print(u, w, 'are misaligned params in CLIPResNet') 172 | 173 | def _make_layer(self, planes, blocks, stride=1): 174 | layers = [Bottleneck(self._inplanes, planes, stride)] 175 | 176 | self._inplanes = planes * Bottleneck.expansion 177 | for _ in range(1, blocks): 178 | layers.append(Bottleneck(self._inplanes, planes)) 179 
| 180 | return nn.Sequential(*layers) 181 | 182 | def forward(self, x): 183 | def stem(x): 184 | for conv, bn in [(self.conv1, self.bn1), (self.conv2, self.bn2), (self.conv3, self.bn3)]: 185 | x = self.relu(bn(conv(x))) 186 | x = self.avgpool(x) 187 | return x 188 | 189 | x = x.type(self.conv1.weight.dtype) 190 | x = stem(x) 191 | 192 | outs = [] 193 | x = self.layer1(x) 194 | out1 = self.reg(x) 195 | outs.append(out1) 196 | x = self.layer2(x) 197 | outs.append(x) 198 | x = self.layer3(x) 199 | outs.append(x) 200 | x = self.layer4(x) 201 | outs.append(x) 202 | 203 | x_global, x_local = self.attnpool(x) 204 | outs.append([x_global, x_local]) 205 | if self.att_level3: 206 | new_outs = [outs[0], outs[1], outs[2], outs[4][1], outs[4]] 207 | if self.baseline: 208 | new_outs = new_outs[:-1] 209 | return tuple(new_outs) 210 | else: 211 | return tuple(outs) 212 | 213 | class CLIPTextEncoder(nn.Module): 214 | def __init__(self, context_length=77, 215 | vocab_size=49408, 216 | transformer_width=512, 217 | transformer_heads=8, 218 | transformer_layers=12, 219 | embed_dim=512, 220 | out_dim=256, 221 | pretrained=None, **kwargs): 222 | super().__init__() 223 | 224 | self.pretrained = pretrained 225 | 226 | self.context_length = context_length 227 | 228 | self.transformer = Transformer( 229 | width=transformer_width, 230 | layers=transformer_layers, 231 | heads=transformer_heads, 232 | attn_mask=self.build_attention_mask() 233 | ) 234 | 235 | self.vocab_size = vocab_size 236 | self.token_embedding = nn.Embedding(vocab_size, transformer_width) 237 | self.positional_embedding = nn.Parameter(torch.empty(self.context_length, transformer_width)) 238 | self.ln_final = LayerNorm(transformer_width) 239 | self.text_projection = nn.Parameter(torch.empty(transformer_width, embed_dim)) 240 | 241 | def init_weights(self, pretrained=None): 242 | pretrained = pretrained or self.pretrained 243 | if isinstance(pretrained, str): 244 | checkpoint = torch.jit.load(pretrained, map_location='cpu').float().state_dict() 245 | 246 | state_dict = {} 247 | 248 | for k in checkpoint.keys(): 249 | if k.startswith('transformer.'): 250 | state_dict[k] = checkpoint[k] 251 | 252 | if k == 'positional_embedding' or k == 'text_projection' or k.startswith('token_embedding') or k.startswith('ln_final'): 253 | if k == 'positional_embedding' and checkpoint[k].size(0) > self.context_length: 254 | checkpoint[k] = checkpoint[k][:self.context_length] 255 | print('positional_embedding is tuncated from 77 to', self.context_length) 256 | state_dict[k] = checkpoint[k] 257 | 258 | u, w = self.load_state_dict(state_dict, False) 259 | print(u, w, 'are misaligned params in text encoder') 260 | 261 | 262 | def build_attention_mask(self): 263 | # lazily create causal attention mask, with full attention between the vision tokens 264 | # pytorch uses additive attention mask; fill with -inf 265 | mask = torch.empty(self.context_length, self.context_length) 266 | mask.fill_(float("-inf")) 267 | mask.triu_(1) # zero out the lower diagonal 268 | return mask 269 | 270 | def forward(self, text): 271 | x = self.token_embedding(text) # [batch_size, n_ctx, d_model] 272 | #print(x.shape) 273 | #exit() 274 | x = x + self.positional_embedding 275 | 276 | x = x.permute(1, 0, 2) # NLD -> LND 277 | x = self.transformer(x) 278 | x = x.permute(1, 0, 2) # LND -> NLD 279 | x = self.ln_final(x) 280 | x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)] @ self.text_projection 281 | #x = self.out_proj(x) 282 | return x 283 | 284 | class CLIPVisionTransformer(nn.Module): 285 | 
def __init__(self, input_resolution=224, patch_size=32, width=768, layers=3, heads=2, output_dim=512, out_indices=[0,1,2], pretrained=None, **kwargs): 286 | super().__init__() 287 | self.pretrained = pretrained 288 | self.input_resolution = input_resolution 289 | self.output_dim = output_dim 290 | self.conv1 = nn.Conv2d(in_channels=3, out_channels=width, kernel_size=patch_size, stride=patch_size, bias=False) 291 | 292 | scale = width ** -0.5 293 | self.class_embedding = nn.Parameter(scale * torch.randn(width)) 294 | self.positional_embedding = nn.Parameter(scale * torch.randn((input_resolution // patch_size) ** 2 + 1, width)) 295 | self.spatial_size = input_resolution // patch_size 296 | self.ln_pre = LayerNorm(width) 297 | 298 | self.transformer = Transformer(width, layers, heads) 299 | 300 | self.out_indices = out_indices 301 | 302 | self.ln_post = LayerNorm(width) 303 | self.proj = nn.Parameter(scale * torch.randn(width, output_dim)) 304 | 305 | embed_dim = width 306 | if patch_size == 16: 307 | self.fpn1 = nn.Sequential( 308 | nn.GroupNorm(1, embed_dim), 309 | nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2), 310 | nn.BatchNorm2d(embed_dim), 311 | nn.GELU(), 312 | nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2), 313 | ) 314 | 315 | self.fpn2 = nn.Sequential( 316 | nn.GroupNorm(1, embed_dim), 317 | nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2), 318 | ) 319 | 320 | self.fpn3 = nn.GroupNorm(1, embed_dim) 321 | 322 | self.fpn4 = nn.Sequential( 323 | nn.GroupNorm(1, embed_dim), 324 | nn.MaxPool2d(kernel_size=2, stride=2) 325 | ) 326 | 327 | elif patch_size == 8: 328 | self.fpn1 = nn.Sequential( 329 | nn.GroupNorm(1, embed_dim), 330 | nn.ConvTranspose2d(embed_dim, embed_dim, kernel_size=2, stride=2), 331 | ) 332 | 333 | self.fpn2 = nn.GroupNorm(1, embed_dim) 334 | 335 | self.fpn3 = nn.Sequential( 336 | nn.GroupNorm(1, embed_dim), 337 | nn.MaxPool2d(kernel_size=2, stride=2), 338 | ) 339 | 340 | self.fpn4 = nn.Sequential( 341 | nn.GroupNorm(1, embed_dim), 342 | nn.MaxPool2d(kernel_size=4, stride=4), 343 | ) 344 | 345 | 346 | def init_weights(self, pretrained=None): 347 | pretrained = pretrained or self.pretrained 348 | if isinstance(pretrained, str): 349 | checkpoint = torch.jit.load(pretrained, map_location='cpu').float().state_dict() 350 | 351 | state_dict = {} 352 | 353 | for k in checkpoint.keys(): 354 | if k.startswith('visual.'): 355 | new_k = k.replace('visual.', '') 356 | state_dict[new_k] = checkpoint[k] 357 | 358 | if 'positional_embedding' in state_dict.keys(): 359 | if self.positional_embedding.shape != state_dict['positional_embedding'].shape: 360 | print(f'Resize the pos_embed shape from {state_dict["positional_embedding"].shape} to {self.positional_embedding.shape}') 361 | cls_pos = state_dict["positional_embedding"][0:1, :] 362 | spatial_pos = F.interpolate(state_dict["positional_embedding"][1:,].reshape(1, 14, 14, 768).permute(0, 3, 1, 2), size=(self.spatial_size, self.spatial_size), mode='bilinear') 363 | spatial_pos = spatial_pos.reshape(768, self.spatial_size*self.spatial_size).permute(1, 0) 364 | positional_embedding = torch.cat([cls_pos, spatial_pos], dim=0) 365 | state_dict['positional_embedding'] = positional_embedding 366 | assert self.positional_embedding.shape == state_dict['positional_embedding'].shape 367 | 368 | u, w = self.load_state_dict(state_dict, False) 369 | #print(u[0]) 370 | print(u, w, 'are misaligned params in vision transformer') 371 | 372 | def forward(self, x: torch.Tensor): 373 | #x = 
self.conv1(x) # shape = [*, width, grid, grid] 374 | x = x 375 | #print(x.shape) 376 | B, C, H, W = x.shape 377 | 378 | x = x.reshape(x.shape[0], x.shape[1], -1) # shape = [*, width, grid ** 2] 379 | x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width] 380 | x = torch.cat([self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1) # shape = [*, grid ** 2 + 1, width] 381 | 382 | 383 | pos = self.positional_embedding.to(x.dtype) 384 | cls_pos = pos[0,:] + self.class_embedding.to(x.dtype) 385 | spatial_pos = F.interpolate(pos[1:,].reshape(1, self.spatial_size, self.spatial_size, C).permute(0, 3, 1, 2), size=(H, W), mode='bilinear') 386 | spatial_pos = spatial_pos.reshape(1, C, H*W).permute(0, 2, 1) 387 | pos = torch.cat([cls_pos.reshape(1, 1, C), spatial_pos], dim=1) 388 | x = x + pos 389 | x = self.ln_pre(x) 390 | x = x.permute(1, 0, 2) # NLD -> LND 391 | 392 | features = [] 393 | for i, blk in enumerate(self.transformer.resblocks): 394 | x = blk(x) 395 | if i in self.out_indices: 396 | xp = x[1:,: , :].permute(1, 2, 0).reshape(B, -1, H, W) 397 | features.append(xp.contiguous()) 398 | 399 | ops = [self.fpn1, self.fpn2, self.fpn3, self.fpn4] 400 | for i in range(len(features)): 401 | features[i] = ops[i](features[i]) 402 | 403 | return tuple(features) 404 | 405 | 406 | class ResidualAttentionBlock(nn.Module): 407 | def __init__(self, d_model: int, n_head: int, attn_mask: torch.Tensor = None, drop_path=0.): 408 | super().__init__() 409 | 410 | self.attn = nn.MultiheadAttention(d_model, n_head) 411 | self.ln_1 = LayerNorm(d_model) 412 | self.mlp = nn.Sequential(OrderedDict([ 413 | ("c_fc", nn.Linear(d_model, d_model * 4)), 414 | ("gelu", QuickGELU()), 415 | ("c_proj", nn.Linear(d_model * 4, d_model)) 416 | ])) 417 | self.ln_2 = LayerNorm(d_model) 418 | self.attn_mask = attn_mask 419 | 420 | self.drop_path = DropPath(drop_path) if drop_path > 0. 
else nn.Identity() 421 | 422 | def attention(self, x: torch.Tensor): 423 | self.attn_mask = self.attn_mask.to(dtype=x.dtype, device=x.device) if self.attn_mask is not None else None 424 | return self.attn(x, x, x, need_weights=False, attn_mask=self.attn_mask)[0] 425 | 426 | def forward(self, x: torch.Tensor): 427 | x = x + self.drop_path(self.attention(self.ln_1(x))) 428 | x = x + self.drop_path(self.mlp(self.ln_2(x))) 429 | return x 430 | 431 | class QuickGELU(nn.Module): 432 | def forward(self, x: torch.Tensor): 433 | return x * torch.sigmoid(1.702 * x) 434 | class Transformer(nn.Module): 435 | def __init__(self, width: int, layers: int, heads: int, attn_mask: torch.Tensor = None, drop_path_rate=0.): 436 | super().__init__() 437 | self.width = width 438 | self.layers = layers 439 | dpr = [x.item() for x in torch.linspace(0, drop_path_rate, layers)] # stochastic depth decay rule 440 | self.resblocks = nn.Sequential(*[ResidualAttentionBlock(width, heads, attn_mask, dpr[i]) for i in range(layers)]) 441 | 442 | def forward(self, x: torch.Tensor): 443 | return self.resblocks(x) 444 | 445 | def init_modules(modules, init='uniform'): 446 | if init.lower() == 'normal': 447 | init_params = kaiming_normal 448 | elif init.lower() == 'uniform': 449 | init_params = kaiming_uniform_ 450 | else: 451 | return 452 | for m in modules: 453 | if isinstance(m, (nn.Conv3d, nn.Conv2d, nn.Linear)): 454 | init_params(m.weight) 455 | 456 | def gelu(x): 457 | return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3)))) 458 | 459 | class FiLM(nn.Module): 460 | """ 461 | A Feature-wise Linear Modulation Layer from 462 | 'FiLM: Visual Reasoning with a General Conditioning Layer' 463 | """ 464 | def forward(self, x, gammas, betas): 465 | # gammas = gammas.unsqueeze(2).unsqueeze(3).expand_as(x) 466 | # betas = betas.unsqueeze(2).unsqueeze(3).expand_as(x) 467 | return (gammas * x) + betas 468 | 469 | def mask_softmax(attn_score, word_mask, tempuature=10., clssep=False, lstm=False): 470 | if len(attn_score.shape)!=2: 471 | attn_score = attn_score.squeeze(2).squeeze(2) 472 | word_mask_cp = word_mask[:,:attn_score.shape[1]].clone() 473 | score = F.softmax(attn_score*tempuature, dim=1) 474 | if not clssep: 475 | for ii in range(word_mask_cp.shape[0]): 476 | if lstm: 477 | word_mask_cp[ii,word_mask_cp[ii,:].sum()-1]=0 478 | else: 479 | word_mask_cp[ii,0]=0 480 | word_mask_cp[ii,word_mask_cp[ii,:].sum()]=0 ## set one to 0 already 481 | mask_score = score * word_mask_cp.float() 482 | mask_score = mask_score/(mask_score.sum(1)+1e-8).view(mask_score.size(0), 1).expand(mask_score.size(0), mask_score.size(1)) 483 | return mask_score 484 | 485 | class FiLMedConvBlock_context(nn.Module): 486 | def __init__(self, with_residual=True, with_batchnorm=True, 487 | with_cond=[False], dropout=0, num_extra_channels=0, extra_channel_freq=1, 488 | with_input_proj=1, num_cond_maps=8, kernel_size=1, batchnorm_affine=False, 489 | num_layers=1, condition_method='bn-film', debug_every=float('inf'), 490 | textdim=768,visudim=512,contextdim=512,emb_size=512,fusion='prod',cont_map=False, 491 | lstm=False,baseline=False): 492 | super(FiLMedConvBlock_context, self).__init__() 493 | 494 | self.cont_map = cont_map ## mapping context with language feature 495 | self.lstm = lstm 496 | self.emb_size = emb_size 497 | self.with_residual = with_residual 498 | self.fusion = fusion 499 | self.baseline = baseline 500 | self.film = FiLM() 501 | 502 | if self.cont_map: 503 | self.sent_map = nn.Linear(768, emb_size) 504 | 
self.context_map = nn.Linear(emb_size, emb_size) 505 | if self.fusion == 'cat': 506 | self.attn_map = nn.Conv1d(textdim+visudim, emb_size//2, kernel_size=1) 507 | elif self.fusion == 'prod': 508 | assert(textdim==visudim) ## if product fusion 509 | self.attn_map = nn.Conv1d(visudim, emb_size//2, kernel_size=1) 510 | 511 | self.attn_score = nn.Conv1d(emb_size//2, 1, kernel_size=1) 512 | if self.baseline: 513 | self.fusion_layer = ConvBatchNormReLU(visudim+textdim+8, emb_size, 1, 1, 0, 1) 514 | else: 515 | self.gamme_decode = nn.Linear(textdim, 2 * emb_size) 516 | self.conv1 = nn.Conv2d(visudim+8, emb_size, kernel_size=1) 517 | # self.bn1 = nn.BatchNorm2d(emb_size) 518 | self.bn1 = nn.InstanceNorm2d(emb_size) 519 | init_modules(self.modules()) 520 | 521 | 522 | def forward(self, fvisu, fword, context_score, fcoord,gest, textattn=None,weight=None,fsent=None,word_mask=None): 523 | fword = fword.permute(0, 2, 1) 524 | B, Dvisu, H, W = fvisu.size() 525 | B, Dlang, N = fword.size() 526 | B, N = context_score.size() 527 | assert(Dvisu==Dlang) 528 | 529 | if self.cont_map and fsent is not None: 530 | fsent = F.normalize(F.relu(self.sent_map(fsent)), p=2, dim=1) 531 | fcont = torch.matmul(context_score.view(B,1,N),fword.permute(0,2,1)).squeeze(1) 532 | fcontext = F.relu(self.context_map(fsent*fcont)).unsqueeze(2).repeat(1,1,N) 533 | ## word attention 534 | tile_visu = torch.mean(fvisu.view(B, Dvisu, -1),dim=2,keepdim=True).repeat(1,1,N) 535 | if self.fusion == 'cat': 536 | context_tile = torch.cat([tile_visu,\ 537 | fword, fcontext], dim=1) 538 | elif self.fusion == 'prod': 539 | context_tile = tile_visu * \ 540 | fword * fcontext 541 | else: 542 | ## word attention 543 | tile_visu = torch.mean(fvisu.view(B, Dvisu, -1),dim=2,keepdim=True).repeat(1,1,N) 544 | if self.fusion == 'cat': 545 | context_tile = torch.cat([tile_visu,\ 546 | fword * context_score.view(B, 1, N).repeat(1, Dlang, 1,)], dim=1) 547 | elif self.fusion == 'prod': 548 | context_tile = tile_visu * \ 549 | fword * context_score.view(B, 1, N).repeat(1, Dlang, 1,) 550 | #print(context_tile.shape) 551 | #print(tile_visu.shape) 552 | 553 | attn_feat = F.tanh(self.attn_map(context_tile)) 554 | attn_score = self.attn_score(attn_feat).squeeze(1) 555 | mask_score = mask_softmax(attn_score,word_mask,lstm=self.lstm) 556 | attn_lang = torch.matmul(mask_score.view(B,1,N),fword.permute(0,2,1)) 557 | attn_lang = attn_lang.view(B,Dlang).squeeze(1) 558 | 559 | if self.baseline: 560 | fmodu = self.fusion_layer(torch.cat([fvisu,\ 561 | attn_lang.unsqueeze(2).unsqueeze(2).repeat(1,1,fvisu.shape[-1],fvisu.shape[-1]),fcoord],dim=1)) 562 | else: 563 | ## lang-> gamma, beta 564 | film_param = self.gamme_decode(attn_lang) 565 | film_param = film_param.view(B,2*self.emb_size,1,1).repeat(1,1,H,W) 566 | #print(film_param.shape) 567 | gammas, betas = torch.split(film_param, self.emb_size, dim=1) 568 | 569 | gammas, betas = F.tanh(gammas), F.tanh(betas) 570 | #gest = F.tanh(gest) 571 | # GEST LANGUAGE FUSION 572 | # gammas = gammas * gest.repeat(1,512,1,1).detach() 573 | # betas = betas * gest.repeat(1,512,1,1).detach() 574 | 575 | ## modulate visu feature 576 | fmodu = self.bn1(self.conv1(torch.cat([fvisu,fcoord],dim=1))) 577 | #print(fmodu.shape) 578 | #print(gammas.shape) 579 | #print(betas.shape) 580 | #exit() 581 | fmodu = self.film(fmodu, gammas, betas) 582 | fmodu = F.relu(fmodu) 583 | if self.with_residual: 584 | if weight is None: 585 | fmodu = fvisu + fmodu 586 | else: 587 | weight = weight.view(B,1,1,1).repeat(1, Dvisu, H, W) 588 | fmodu = 
(1-weight)*fvisu + weight*fmodu 589 | return fmodu, attn_lang, attn_score 590 | 591 | class LayerNorm(nn.LayerNorm): 592 | """Subclass torch's LayerNorm to handle fp16.""" 593 | 594 | def forward(self, x: torch.Tensor): 595 | orig_type = x.dtype 596 | ret = super().forward(x.type(torch.float32)) 597 | return ret.type(orig_type) 598 | 599 | class FiLMedConvBlock_multihop(nn.Module): 600 | def __init__(self, NFilm=2, with_residual=True, with_batchnorm=True, 601 | with_cond=[False], dropout=0, num_extra_channels=0, extra_channel_freq=1, 602 | with_input_proj=1, num_cond_maps=8, kernel_size=1, batchnorm_affine=False, 603 | num_layers=1, condition_method='bn-film', debug_every=float('inf'), 604 | textdim=768,visudim=512,emb_size=512,fusion='cat',intmd=False,lstm=False,erasing=0.): 605 | super(FiLMedConvBlock_multihop, self).__init__() 606 | 607 | self.NFilm = NFilm 608 | self.emb_size = emb_size 609 | self.with_residual = with_residual 610 | self.cont_size = emb_size 611 | self.fusion = fusion 612 | self.intmd = intmd 613 | self.lstm = lstm 614 | self.erasing = erasing 615 | if self.fusion=='cat': 616 | self.cont_size = emb_size*2 617 | 618 | self.modulesdict = nn.ModuleDict() 619 | modules = OrderedDict() 620 | modules["film0"] = FiLMedConvBlock_context(textdim=textdim,visudim=emb_size,contextdim=emb_size,emb_size=emb_size,fusion=fusion,lstm=self.lstm) 621 | for n in range(1,NFilm): 622 | modules["conv%d"%n] = ConvBatchNormReLU(emb_size, emb_size, 3, 1, 1, 1) 623 | modules["film%d"%n] = FiLMedConvBlock_context(textdim=textdim,visudim=emb_size,contextdim=self.cont_size,emb_size=emb_size,fusion=fusion,lstm=self.lstm) 624 | self.modulesdict.update(modules) 625 | 626 | def forward(self, fvisu, fword, fcoord,gest = None, weight=None,fsent=None,word_mask=None): 627 | B, Dvisu, H, W = fvisu.size() 628 | B, N, Dlang = fword.size() 629 | intmd_feat, attnscore_list = [], [] 630 | 631 | x, _, attn_score = self.modulesdict["film0"](fvisu, fword, Variable(torch.ones(B,N).cuda()), fcoord,gest, fsent=fsent,word_mask=word_mask) 632 | attnscore_list.append(attn_score.view(B,N,1,1)) 633 | if self.intmd: 634 | intmd_feat.append(x) 635 | if self.NFilm==1: 636 | intmd_feat = [x] 637 | for n in range(1,self.NFilm): 638 | score_list = [mask_softmax(score.squeeze(2).squeeze(2),word_mask,lstm=self.lstm) for score in attnscore_list] 639 | 640 | score = torch.clamp(torch.max(torch.stack(score_list, dim=1), dim=1, keepdim=False)[0],min=0.,max=1.) 
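            ## Added note: `score` above is the clamped element-wise max of the word-attention
            ## distributions produced by all previous hops; the next FiLM block receives
            ## (1 - score), so words that earlier hops already attended to are suppressed
            ## and each hop grounds a different part of the referring expression.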
641 |             x = self.modulesdict["conv%d"%n](x)
642 |             x, _, attn_score = self.modulesdict["film%d"%n](x, fword, (1-score), fcoord, gest, fsent=fsent, word_mask=word_mask)
643 |             attnscore_list.append(attn_score.view(B,N,1,1)) ## format match div loss in main func
644 |             if self.intmd:
645 |                 intmd_feat.append(x)
646 |             elif n==self.NFilm-1:
647 |                 intmd_feat = [x]
648 |         return intmd_feat, attnscore_list
649 | 
650 | class Vector(nn.Sequential):
651 |     ## forward() is fully overridden below, so this class behaves like a plain nn.Module
652 |     def __init__(self, input_resolution=224, patch_size=32, width=768, layers=3, heads=2, output_dim=3, out_indices=[0,1,2], pretrained=None, **kwargs):
653 |         super().__init__()
654 |         self.pretrained = pretrained
655 |         self.input_resolution = input_resolution
656 |         self.output_dim = output_dim
657 |         self.conv1 = nn.Conv2d(in_channels=6, out_channels=width, kernel_size=patch_size, stride=patch_size, bias=False)
658 | 
659 |         scale = width ** -0.5
660 |         self.class_embedding = nn.Parameter(scale * torch.randn(width))
661 |         self.positional_embedding = nn.Parameter(scale * torch.randn((input_resolution // patch_size) ** 2 + 1, width))
662 |         self.spatial_size = input_resolution // patch_size
663 |         self.ln_pre = LayerNorm(width)
664 | 
665 |         self.transformer = Transformer(width, layers, heads)
666 | 
667 |         self.out_indices = out_indices
668 | 
669 |         self.ln_post = LayerNorm(width)
670 |         self.proj = nn.Parameter(scale * torch.randn(width, output_dim))
671 | 
672 |     def forward(self, x: torch.Tensor):
673 |         x = self.conv1(x)  # shape = [*, width, grid, grid]
674 |         B, C, H, W = x.shape
675 | 
676 |         x = x.reshape(x.shape[0], x.shape[1], -1)  # shape = [*, width, grid ** 2]
677 |         x = x.permute(0, 2, 1)  # shape = [*, grid ** 2, width]
678 |         x = torch.cat([self.class_embedding.to(x.dtype) + torch.zeros(x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x], dim=1)  # shape = [*, grid ** 2 + 1, width]
679 | 
680 |         pos = self.positional_embedding.to(x.dtype)
681 |         cls_pos = pos[0,:] + self.class_embedding.to(x.dtype)
682 |         spatial_pos = F.interpolate(pos[1:,].reshape(1, self.spatial_size, self.spatial_size, C).permute(0, 3, 1, 2), size=(H, W), mode='bilinear')
683 |         spatial_pos = spatial_pos.reshape(1, C, H*W).permute(0, 2, 1)
684 |         pos = torch.cat([cls_pos.reshape(1, 1, C), spatial_pos], dim=1)
685 |         x = x + pos
686 |         x = self.ln_pre(x)
687 |         x = x.permute(1, 0, 2)  # NLD -> LND
688 |         x = self.transformer(x)
689 |         x = self.ln_post(x[0,:,:]) @ self.proj  # class token -> output_dim
690 |         return x
691 | 
692 | class MLP(nn.Module):
693 |     def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
694 |         super().__init__()
695 |         self.num_layers = num_layers
696 |         h = [hidden_dim] * (num_layers - 1)
697 |         self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
698 | 
699 |     def forward(self, x):
700 |         for i, layer in enumerate(self.layers):
701 |             x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
702 |         return x
703 | 
704 | 
705 | if __name__ == "__main__":
706 |     import torch
707 | 
708 |     vect = Vector()
709 | 
710 |     # Vector.conv1 expects a 6-channel input map; quick shape-only smoke test.
711 |     inp = torch.randn(1, 6, 512, 512)
712 |     output = vect(inp)
713 |     print(output.size())
714 | 
--------------------------------------------------------------------------------
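Usage note (added): the sketch below is a minimal, illustrative smoke test of the two encoders defined in the file above, CLIPTextEncoder and CLIPResNetWithAttention. It is not part of the repository. The import path `model.clip` is inferred from the `__pycache__` entries, the ResNet-50 layer layout `[3, 4, 6, 3]` and the CLIP BPE special-token ids (49406/49407) are assumptions, and no pretrained weights are loaded via `init_weights`, so only output shapes are meaningful.

    # Hypothetical smoke test -- not part of the repository.
    import torch
    from model.clip import CLIPTextEncoder, CLIPResNetWithAttention  # path inferred from __pycache__

    # Text encoder: integer token ids of shape [batch, context_length];
    # the forward pass pools the position of the largest token id (the end-of-text token).
    text_encoder = CLIPTextEncoder(context_length=77)
    tokens = torch.zeros(2, 77, dtype=torch.long)
    tokens[:, 0] = 49406   # assumed CLIP start-of-text id
    tokens[:, 1] = 49407   # assumed CLIP end-of-text id
    with torch.no_grad():
        text_feat = text_encoder(tokens)   # [2, 512] with the default embed_dim

    # Visual encoder: conv1 takes 4 input channels (RGB plus one extra map).
    visual = CLIPResNetWithAttention(layers=[3, 4, 6, 3], input_resolution=224)
    frames = torch.randn(2, 4, 224, 224)
    with torch.no_grad():
        outs = visual(frames)              # (reg map, layer2, layer3, layer4, [global, local])
    print(text_feat.shape, [o.shape if torch.is_tensor(o) else len(o) for o in outs])

The returned tuple follows the `forward` above: a 1-channel `reg` map from layer1, the raw layer2-layer4 feature maps, and the attention-pooled [global, local] pair from `attnpool`.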