├── .gitignore ├── README.md ├── configs ├── lbyl_bert_gref_batch64.json ├── lbyl_bert_referit_batch64.json ├── lbyl_bert_unc+_batch64.json ├── lbyl_bert_unc_batch64.json ├── lbyl_lstm_gref_batch64.json ├── lbyl_lstm_referit_batch64.json ├── lbyl_lstm_unc+_batch64.json └── lbyl_lstm_unc_batch64.json ├── core ├── __init__.py ├── base.py ├── config.py ├── dbs │ ├── __init__.py │ ├── base.py │ ├── dataset.py │ ├── referring.py │ ├── utils.py │ └── word_utils.py ├── groundors.py ├── models │ ├── context │ │ ├── _pconv │ │ │ ├── conv4.py │ │ │ └── conv8.py │ │ └── module.py │ ├── lang_encoder │ │ ├── RNNencoder.py │ │ └── __init__.py │ ├── net │ │ ├── __init__.py │ │ ├── baseline.py │ │ ├── darknet.py │ │ ├── lbylnet.py │ │ └── yolov3.cfg │ └── utils │ │ ├── __init__.py │ │ ├── losses.py │ │ └── word_utils.py ├── nnet │ ├── __init__.py │ └── nnet_factory.py ├── optimizer │ └── lr_scheduler.py ├── paths.py ├── sampler │ ├── collate_fn.py │ ├── sampler.py │ └── utils.py ├── test │ ├── __init__.py │ └── test.py ├── utils │ ├── __init__.py │ ├── data_parallel.py │ ├── meter.py │ ├── misc.py │ ├── scatter_gather.py │ ├── timer.py │ └── tqdm.py └── vis_utils.py ├── data └── refer │ └── download_data.sh ├── demo.py ├── evaluate.py ├── ext └── yolov3.cfg ├── imgs ├── demo.jpeg ├── demo_out.jpg └── landmarks.png ├── logs ├── lbyl_bert_gref_batch64.out ├── lbyl_bert_referit_batch64.out ├── lbyl_bert_unc+_batch64.out ├── lbyl_bert_unc_batch64.out ├── lbyl_lstm_gref_batch64.out ├── lbyl_lstm_referit_batch64.out ├── lbyl_lstm_unc+_batch64.out └── lbyl_lstm_unc_batch64.out ├── requirements.txt ├── results ├── lbyl_bert_gref_batch64 │ └── gref │ │ └── 30 │ │ └── val │ │ ├── metrics.json │ │ └── results.json ├── lbyl_bert_referit_batch64 │ └── referit │ │ └── 100 │ │ └── test │ │ ├── metrics.json │ │ └── results.json ├── lbyl_bert_unc+_batch64 │ └── unc+ │ │ └── 100 │ │ ├── testA │ │ ├── metrics.json │ │ └── results.json │ │ ├── testB │ │ ├── metrics.json │ │ └── results.json │ │ └── val │ │ ├── metrics.json │ │ └── results.json ├── lbyl_bert_unc_batch64 │ └── unc │ │ └── 100 │ │ ├── testA │ │ ├── metrics.json │ │ └── results.json │ │ ├── testB │ │ ├── metrics.json │ │ └── results.json │ │ └── val │ │ ├── metrics.json │ │ └── results.json ├── lbyl_lstm_gref_batch64 │ └── gref │ │ └── 30 │ │ └── val │ │ ├── metrics.json │ │ └── results.json ├── lbyl_lstm_referit_batch64 │ └── referit │ │ └── 100 │ │ ├── test │ │ ├── metrics.json │ │ └── results.json │ │ └── validation │ │ ├── metrics.json │ │ └── results.json ├── lbyl_lstm_unc+_batch64 │ └── unc+ │ │ └── 100 │ │ ├── testA │ │ ├── metrics.json │ │ └── results.json │ │ ├── testB │ │ ├── metrics.json │ │ └── results.json │ │ └── val │ │ ├── metrics.json │ │ └── results.json └── lbyl_lstm_unc_batch64 │ └── unc │ └── 100 │ ├── testA │ ├── metrics.json │ └── results.json │ ├── testB │ ├── metrics.json │ └── results.json │ └── val │ ├── metrics.json │ └── results.json └── train.py /.gitignore: -------------------------------------------------------------------------------- 1 | data/refer/data 2 | data/refer/ln_data 3 | cache 4 | .vscode 5 | ext/yolov3.weights 6 | ext/lib 7 | /*.out 8 | results/**/visulize/ 9 | **/*.pyc 10 | **/__pycache__/** -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # LBYL-Net 2 | 3 | 4 | This repo implements paper [*Look Before You Leap: Learning Landmark Features For One-Stage Visual 
Grounding*](https://arxiv.org/abs/2104.04386), CVPR 2021.

The core of this paper is the [Landmark Convolution](https://github.com/hbb1/landmarkconv).

-----------
### Getting Started
#### Prerequisites
- python 3.7
- pytorch 1.0.0
- cuda 10.0
- gcc 4.9.2 or above

#### Installation
1. Clone the repo and install the dependencies.
```bash
git clone https://github.com/svip-lab/LBYLNet.git
cd LBYLNet
pip install -r requirements.txt
```
2. You also need to install our [landmark feature convolution](https://github.com/hbb1/landmarkconv):
```bash
cd ext
git clone https://github.com/hbb1/landmarkconv.git
cd landmarkconv/lib/layers
python setup.py install --user
```
3. We follow the dataset structure of [DMS](https://github.com/BCV-Uniandes/DMS) and [FAOA](https://github.com/zyang-ur/onestage_grounding). For convenience, we have packed the datasets together, including ReferitGame, RefCOCO, RefCOCO+, and RefCOCOg.
```bash
bash data/refer/download_data.sh ./data/refer
```
4. Download the generated index files and place them in `./data/refer`. They are available at [[Gdrive]](https://drive.google.com/open?id=1cZI562MABLtAzM6YU4WmKPFFguuVr0lZ) and [[One Drive]](https://uofr-my.sharepoint.com/:f:/g/personal/zyang39_ur_rochester_edu/Epw5WQ_mJ-tOlAbK5LxsnrsBElWwvNdU7aus0UIzWtwgKQ?e=XHQm7F).
5. Download the pretrained YOLOv3 model.
```bash
wget -P ext https://pjreddie.com/media/files/yolov3.weights
```
-----------
### Training and Evaluation
By default, we train with 2 GPUs and a batch size of 64 using DDP (distributed data parallel).
We provide several configurations and training logs for reproducing our results. If you want to use different hyperparameters or models, you may create your own configs. Here are some examples:
- For distributed training with multiple GPUs:
```bash
CUDA_VISIBLE_DEVICES=0,1 python train.py lbyl_lstm_referit_batch64 --workers 8 --distributed --world_size 1 --dist_url "tcp://127.0.0.1:60006"
```
- If you use a single GPU or do not want distributed training (make sure to adjust the batch size in the corresponding config file to match your devices):
```bash
CUDA_VISIBLE_DEVICES=0, python train.py lbyl_lstm_referit_batch64 --workers 8
```

- For evaluation:
```bash
CUDA_VISIBLE_DEVICES=0, python evaluate.py lbyl_lstm_referit_batch64 --testiter 100 --split val
```
--------
### Trained Models
We retrained our models with this *re-organized* codebase and provide their checkpoints and logs for reproducing the results. To use our trained models, download them from the [[Gdrive]](https://drive.google.com/drive/folders/1ICLArOUtWAx_W9nfn7uwobdtIkmN_RoA?usp=sharing) and save them into the `cache` directory. The checkpoint path is then expected to be `./cache/nnet/<config_name>/<dataset>/<config_name>_100.pkl`.

`Notice`: The reproduced performances are occasionally higher or lower (within a reasonable range) than the results reported in the paper.

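For reference, the following sketch shows where a downloaded checkpoint is expected to live. It is illustrative only: it simply mirrors the `snapshot_dir`/`snapshot_file` pattern defined in `core/config.py`, with the ReferIt BERT model and iteration 100 chosen as an example.

```python
# Illustrative only: build the expected checkpoint path for one released model,
# following the snapshot_dir/snapshot_file pattern in core/config.py.
import os

cfg_file, dataset, test_iter = "lbyl_bert_referit_batch64", "referit", 100
snapshot_dir = os.path.join("cache", "nnet", cfg_file, dataset)
snapshot_file = os.path.join(snapshot_dir, "{}_{}.pkl".format(cfg_file, test_iter))
print(snapshot_file)  # cache/nnet/lbyl_bert_referit_batch64/referit/lbyl_bert_referit_batch64_100.pkl
```
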
In this repo, we provide the performance of our LBYL-Nets below. You can also find more details under `/results` and `/logs`.

- Performance on ReferitGame (Pr@0.5).

| Dataset     | Language | Split | Paper | Reproduce |
|-------------|----------|-------|-------|-----------|
| ReferitGame | LSTM     | test  | 65.48 | 65.98     |
|             | BERT     | test  | 67.47 | 68.48     |

- Performance on RefCOCO (Pr@0.5).

| Dataset | Language | Split | Paper | Reproduce |
|---------|----------|-------|-------|-----------|
| RefCOCO | LSTM     | testA | 82.18 | 82.48     |
|         |          | testB | 71.91 | 71.76     |
|         | BERT     | testA | 82.91 | 82.82     |
|         |          | testB | 74.15 | 72.82     |

- Performance on RefCOCO+ (Pr@0.5).

| Dataset  | Language | Split | Paper | Reproduce |
|----------|----------|-------|-------|-----------|
| RefCOCO+ | LSTM     | val   | 66.64 | 66.71     |
|          |          | testA | 73.21 | 72.63     |
|          |          | testB | 56.23 | 55.88     |
|          | BERT     | val   | 68.64 | 68.76     |
|          |          | testA | 73.38 | 73.73     |
|          |          | testB | 59.49 | 59.62     |

- Performance on RefCOCOg (Pr@0.5).

| Dataset  | Language | Split | Paper | Reproduce |
|----------|----------|-------|-------|-----------|
| RefCOCOg | LSTM     | val   | 58.72 | 60.03     |
|          | BERT     | val   | 62.70 | 63.20     |

---------

### Demo
We also provide a demo script to test whether the repo is correctly installed. After installing the repo and downloading the pretrained weights, you should be able to use LBYL-Net to ground your own images.

```bash
python demo.py
```
You can change the model, image, or phrase in `demo.py`. You will find the output image at `imgs/demo_out.jpg`.
```python
#!/usr/bin/env python
import cv2
import torch
from core.test.test import _visualize
from core.groundors import Net
# pick one model
cfg_file = "lbyl_bert_unc+_batch64"
detector = Net(cfg_file, iter=100)
# inference
image = cv2.imread('imgs/demo.jpeg')
phrase = 'the green gaint'
bbox = detector(image, phrase)
_visualize(image, pred_bbox=bbox, phrase=phrase, save_path='imgs/demo_out.jpg', color=(1, 174, 245), draw_phrase=True)
```

**Input:**

![demo input](imgs/demo.jpeg)

**Output:**

![demo output](imgs/demo_out.jpg)

--------------------------------
### Acknowledgements
This repo is organized following [CornerNet-Lite](https://github.com/princeton-vl/CornerNet-Lite), and the code is partially borrowed from [FAOA](https://github.com/zyang-ur/onestage_grounding) (*e.g.* data preparation) and [MAttNet](https://github.com/lichengunc/MAttNet) (*e.g.* LSTM). We thank them for their great work.

----------------------------------

### Citations:
If you use any part of this repo in your research, please cite our paper:

```
@InProceedings{huang2021look,
    title={Look Before You Leap: Learning Landmark Features for One-Stage Visual Grounding},
    author={Huang, Binbin and Lian, Dongze and Luo, Weixin and Gao, Shenghua},
    booktitle={IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
    month = {June},
    year={2021},
}
```
--------------------------------------------------------------------------------
/configs/lbyl_bert_gref_batch64.json:
--------------------------------------------------------------------------------
{
    "system": {
        "dataset": "gref",
        "model": "LBYLNet",
        "language": "bert-base-uncased",
        "batch_size": 64,
        "anchor_based": true,
        "context": "LandmarkP4",
        "ctx_dim" : 128,

        "train_split": "train",
        "val_split": "val",
        "test_split": "testA",
        "val_iter": 1,

        "opt_algo": "adam",
        "lr_scheduler": "cosin_lr",
        "warm_up": true,
        "warm_up_epoch": 5,
        "gamma": 0.1,
        "decay_rate": 10,
        "warm_up_from_lr": 0.000001,
        "learning_rate": 0.0001,

        "nb_epoch": 30,
        "print_freq": 100,
        "stepsize": 80,
        "snapshot": 5,
        "chunk_sizes": [32],
        "data_dir": "./data",
        "corpus_dir": "./data/refer/"
    },

    "db": {
        "random_color": true,
        "random_flip": true,
        "random_affine": true,
        "input_size": [256, 256],
        "output_sizes": [32, 32],

        "vocab_size": 0,
        "word_embedding_size": 512,
        "word_vec_size": 512,
        "hidden_size": 512,
        "bidirectional": true,
        "input_dropout_p": 0.5,
        "dropout_p": 0.2,
        "n_layers": 1,
        "max_query_len": 128,
        "variable_length": true,

        "joint_embedding_size": 256,
        "joint_out_dim": 256,
        "joint_embedding_dropout": 0.1,
        "joint_mlp_layers": 2
    }
}
--------------------------------------------------------------------------------
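These configuration files are consumed by merging their `"system"` block into `SystemConfig` (see `core/config.py`), while the `"db"` block is passed to the dataset and model. A minimal sketch, assuming it is run from the repository root and mirroring what `core/groundors.py` does:

```python
# Minimal sketch: load one of the configs above and merge its "system" block
# into SystemConfig, as core/groundors.py does. Any config in configs/ works.
import json
from core.config import SystemConfig

cfg_name = "lbyl_bert_gref_batch64"
with open("configs/{}.json".format(cfg_name), "r") as f:
    cfg = json.load(f)

cfg["system"]["snapshot_name"] = cfg_name            # same convention as core/groundors.py
system_config = SystemConfig().update_config(cfg["system"])
print(system_config.dataset, system_config.batch_size, system_config.nb_epoch)  # gref 64 30
```

`update_config` prints a warning for unrecognized keys instead of failing, so typos in a config file show up at start-up.
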
/configs/lbyl_bert_referit_batch64.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": { 3 | "dataset": "referit", 4 | "model": "LBYLNet", 5 | "language": "bert-base-uncased", 6 | "batch_size": 64, 7 | "anchor_based": true, 8 | "context": "LandmarkP4", 9 | "ctx_dim" : 128, 10 | 11 | "train_split": "train", 12 | "val_split": "val", 13 | "test_split": "test", 14 | "val_iter": 1, 15 | 16 | "opt_algo": "adam", 17 | "lr_scheduler": "cosin_lr", 18 | "warm_up": true, 19 | "warm_up_epoch": 5, 20 | "gamma": 0.1, 21 | "decay_rate": 10, 22 | "warm_up_from_lr": 0.000001, 23 | "learning_rate": 0.0001, 24 | 25 | "nb_epoch": 100, 26 | "print_freq": 100, 27 | "stepsize": 80, 28 | "snapshot": 5, 29 | "chunk_sizes": [32], 30 | "data_dir": "./data", 31 | "corpus_dir": "./data/refer/" 32 | }, 33 | 34 | "db": { 35 | "random_color": true, 36 | "random_flip": true, 37 | "random_affine": true, 38 | "input_size": [256, 256], 39 | "output_sizes": [32, 32], 40 | 41 | "vocab_size": 0, 42 | "word_embedding_size": 512, 43 | "word_vec_size": 512, 44 | "hidden_size": 512, 45 | "bidirectional": true, 46 | "input_dropout_p": 0.5, 47 | "dropout_p": 0.2, 48 | "n_layers": 1, 49 | "max_query_len": 128, 50 | "variable_length": true, 51 | 52 | "joint_embedding_size": 256, 53 | "joint_out_dim": 256, 54 | "joint_embedding_dropout": 0.1, 55 | "joint_mlp_layers": 2 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /configs/lbyl_bert_unc+_batch64.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": { 3 | "dataset": "unc+", 4 | "model": "LBYLNet", 5 | "language": "bert-base-uncased", 6 | "batch_size": 64, 7 | "anchor_based": true, 8 | "context": "LandmarkP4", 9 | "ctx_dim" : 128, 10 | 11 | "train_split": "train", 12 | "val_split": "val", 13 | "test_split": "testA", 14 | "val_iter": 1, 15 | 16 | "opt_algo": "adam", 17 | "lr_scheduler": "cosin_lr", 18 | "warm_up": true, 19 | "warm_up_epoch": 5, 20 | "gamma": 0.1, 21 | "decay_rate": 10, 22 | "warm_up_from_lr": 0.000001, 23 | "learning_rate": 0.0001, 24 | 25 | "nb_epoch": 100, 26 | "print_freq": 100, 27 | "stepsize": 80, 28 | "snapshot": 5, 29 | "chunk_sizes": [32], 30 | "data_dir": "./data", 31 | "corpus_dir": "./data/refer/" 32 | }, 33 | 34 | "db": { 35 | "random_color": true, 36 | "random_flip": true, 37 | "random_affine": true, 38 | "input_size": [256, 256], 39 | "output_sizes": [32, 32], 40 | 41 | "vocab_size": 0, 42 | "word_embedding_size": 512, 43 | "word_vec_size": 512, 44 | "hidden_size": 512, 45 | "bidirectional": true, 46 | "input_dropout_p": 0.5, 47 | "dropout_p": 0.2, 48 | "n_layers": 1, 49 | "max_query_len": 128, 50 | "variable_length": true, 51 | 52 | "joint_embedding_size": 256, 53 | "joint_out_dim": 256, 54 | "joint_embedding_dropout": 0.1, 55 | "joint_mlp_layers": 2 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /configs/lbyl_bert_unc_batch64.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": { 3 | "dataset": "unc", 4 | "model": "LBYLNet", 5 | "language": "bert-base-uncased", 6 | "batch_size": 64, 7 | "anchor_based": true, 8 | "context": "LandmarkP4", 9 | "ctx_dim" : 128, 10 | 11 | "train_split": "train", 12 | "val_split": "val", 13 | "test_split": "testA", 14 | "val_iter": 1, 15 | 16 | "opt_algo": "adam", 17 | "lr_scheduler": "cosin_lr", 18 | "warm_up": true, 19 | "warm_up_epoch": 5, 20 | "gamma": 0.1, 21 | 
"decay_rate": 10, 22 | "warm_up_from_lr": 0.000001, 23 | "learning_rate": 0.0001, 24 | 25 | "nb_epoch": 100, 26 | "print_freq": 100, 27 | "stepsize": 80, 28 | "snapshot": 5, 29 | "chunk_sizes": [32], 30 | "data_dir": "./data", 31 | "corpus_dir": "./data/refer/" 32 | }, 33 | 34 | "db": { 35 | "random_color": true, 36 | "random_flip": true, 37 | "random_affine": true, 38 | "input_size": [256, 256], 39 | "output_sizes": [32, 32], 40 | 41 | "vocab_size": 0, 42 | "word_embedding_size": 512, 43 | "word_vec_size": 512, 44 | "hidden_size": 512, 45 | "bidirectional": true, 46 | "input_dropout_p": 0.5, 47 | "dropout_p": 0.2, 48 | "n_layers": 1, 49 | "max_query_len": 128, 50 | "variable_length": true, 51 | 52 | "joint_embedding_size": 256, 53 | "joint_out_dim": 256, 54 | "joint_embedding_dropout": 0.1, 55 | "joint_mlp_layers": 2 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /configs/lbyl_lstm_gref_batch64.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": { 3 | "dataset": "gref", 4 | "model": "LBYLNet", 5 | "language": "lstm", 6 | "batch_size": 64, 7 | "anchor_based": true, 8 | "context": "LandmarkP4", 9 | "ctx_dim" : 128, 10 | 11 | "train_split": "train", 12 | "val_split": "val", 13 | "test_split": "testA", 14 | "val_iter": 1, 15 | 16 | "opt_algo": "adam", 17 | "lr_scheduler": "cosin_lr", 18 | "warm_up": true, 19 | "warm_up_epoch": 5, 20 | "gamma": 0.1, 21 | "decay_rate": 10, 22 | "warm_up_from_lr": 0.000001, 23 | "learning_rate": 0.0001, 24 | 25 | "nb_epoch": 100, 26 | "print_freq": 100, 27 | "stepsize": 80, 28 | "snapshot": 5, 29 | "chunk_sizes": [32], 30 | "data_dir": "./data", 31 | "corpus_dir": "./data/refer/" 32 | }, 33 | 34 | "db": { 35 | "random_color": true, 36 | "random_flip": true, 37 | "random_affine": true, 38 | "input_size": [256, 256], 39 | "output_sizes": [32, 32], 40 | 41 | "vocab_size": 0, 42 | "word_embedding_size": 512, 43 | "word_vec_size": 512, 44 | "hidden_size": 512, 45 | "bidirectional": true, 46 | "input_dropout_p": 0.5, 47 | "dropout_p": 0.2, 48 | "n_layers": 1, 49 | "max_query_len": 128, 50 | "variable_length": true, 51 | 52 | "joint_embedding_size": 256, 53 | "joint_out_dim": 256, 54 | "joint_embedding_dropout": 0.1, 55 | "joint_mlp_layers": 2 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /configs/lbyl_lstm_referit_batch64.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": { 3 | "dataset": "referit", 4 | "model": "LBYLNet", 5 | "language": "lstm", 6 | "batch_size": 64, 7 | "anchor_based": true, 8 | "context": "LandmarkP4", 9 | "ctx_dim" : 128, 10 | 11 | "train_split": "train", 12 | "val_split": "val", 13 | "test_split": "test", 14 | "val_iter": 1, 15 | 16 | "opt_algo": "adam", 17 | "lr_scheduler": "cosin_lr", 18 | "warm_up": true, 19 | "warm_up_epoch": 5, 20 | "gamma": 0.1, 21 | "decay_rate": 10, 22 | "warm_up_from_lr": 0.000001, 23 | "learning_rate": 0.0001, 24 | 25 | "nb_epoch": 100, 26 | "print_freq": 100, 27 | "stepsize": 80, 28 | "snapshot": 5, 29 | "chunk_sizes": [32], 30 | "data_dir": "./data", 31 | "corpus_dir": "./data/refer/" 32 | }, 33 | 34 | "db": { 35 | "random_color": true, 36 | "random_flip": true, 37 | "random_affine": true, 38 | "input_size": [256, 256], 39 | "output_sizes": [32, 32], 40 | 41 | "vocab_size": 0, 42 | "word_embedding_size": 512, 43 | "word_vec_size": 512, 44 | "hidden_size": 512, 45 | "bidirectional": true, 46 | 
"input_dropout_p": 0.5, 47 | "dropout_p": 0.2, 48 | "n_layers": 1, 49 | "max_query_len": 128, 50 | "variable_length": true, 51 | 52 | "joint_embedding_size": 256, 53 | "joint_out_dim": 256, 54 | "joint_embedding_dropout": 0.1, 55 | "joint_mlp_layers": 2 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /configs/lbyl_lstm_unc+_batch64.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": { 3 | "dataset": "unc+", 4 | "model": "LBYLNet", 5 | "language": "lstm", 6 | "batch_size": 64, 7 | "anchor_based": true, 8 | "context": "LandmarkP4", 9 | "ctx_dim" : 128, 10 | 11 | "train_split": "train", 12 | "val_split": "val", 13 | "test_split": "testA", 14 | "val_iter": 1, 15 | 16 | "opt_algo": "adam", 17 | "lr_scheduler": "cosin_lr", 18 | "warm_up": true, 19 | "warm_up_epoch": 5, 20 | "gamma": 0.1, 21 | "decay_rate": 10, 22 | "warm_up_from_lr": 0.000001, 23 | "learning_rate": 0.0001, 24 | 25 | "nb_epoch": 100, 26 | "print_freq": 100, 27 | "stepsize": 80, 28 | "snapshot": 5, 29 | "chunk_sizes": [32], 30 | "data_dir": "./data", 31 | "corpus_dir": "./data/refer/" 32 | }, 33 | 34 | "db": { 35 | "random_color": true, 36 | "random_flip": true, 37 | "random_affine": true, 38 | "input_size": [256, 256], 39 | "output_sizes": [32, 32], 40 | 41 | "vocab_size": 0, 42 | "word_embedding_size": 512, 43 | "word_vec_size": 512, 44 | "hidden_size": 512, 45 | "bidirectional": true, 46 | "input_dropout_p": 0.5, 47 | "dropout_p": 0.2, 48 | "n_layers": 1, 49 | "max_query_len": 128, 50 | "variable_length": true, 51 | 52 | "joint_embedding_size": 256, 53 | "joint_out_dim": 256, 54 | "joint_embedding_dropout": 0.1, 55 | "joint_mlp_layers": 2 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /configs/lbyl_lstm_unc_batch64.json: -------------------------------------------------------------------------------- 1 | { 2 | "system": { 3 | "dataset": "unc", 4 | "model": "LBYLNet", 5 | "language": "lstm", 6 | "batch_size": 64, 7 | "anchor_based": true, 8 | "context": "LandmarkP4", 9 | "ctx_dim" : 128, 10 | 11 | "train_split": "train", 12 | "val_split": "val", 13 | "test_split": "testA", 14 | "val_iter": 1, 15 | 16 | "opt_algo": "adam", 17 | "lr_scheduler": "cosin_lr", 18 | "warm_up": true, 19 | "warm_up_epoch": 5, 20 | "gamma": 0.1, 21 | "decay_rate": 10, 22 | "warm_up_from_lr": 0.000001, 23 | "learning_rate": 0.0001, 24 | 25 | "nb_epoch": 100, 26 | "print_freq": 100, 27 | "stepsize": 80, 28 | "snapshot": 5, 29 | "chunk_sizes": [32], 30 | "data_dir": "./data", 31 | "corpus_dir": "./data/refer/" 32 | }, 33 | 34 | "db": { 35 | "random_color": true, 36 | "random_flip": true, 37 | "random_affine": true, 38 | "input_size": [256, 256], 39 | "output_sizes": [32, 32], 40 | 41 | "vocab_size": 0, 42 | "word_embedding_size": 512, 43 | "word_vec_size": 512, 44 | "hidden_size": 512, 45 | "bidirectional": true, 46 | "input_dropout_p": 0.5, 47 | "dropout_p": 0.2, 48 | "n_layers": 1, 49 | "max_query_len": 128, 50 | "variable_length": true, 51 | 52 | "joint_embedding_size": 256, 53 | "joint_out_dim": 256, 54 | "joint_embedding_dropout": 0.1, 55 | "joint_mlp_layers": 2 56 | } 57 | } 58 | -------------------------------------------------------------------------------- /core/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/svip-lab/LBYLNet/47a0e83b6db60d7efe1f74acdf4cb210ffd9554d/core/__init__.py 
-------------------------------------------------------------------------------- /core/base.py: -------------------------------------------------------------------------------- 1 | import json 2 | 3 | from .nnet.py_factory import NetworkFactory 4 | 5 | class Base(object): 6 | def __init__(self, db, nnet, func, model=None): 7 | super(Base, self).__init__() 8 | 9 | self._db = db 10 | self._nnet = nnet 11 | self._func = func 12 | 13 | if model is not None: 14 | self._nnet.load_pretrained_params(model) 15 | 16 | self._nnet.cuda() 17 | self._nnet.eval_mode() 18 | 19 | def _inference(self, image, *args, **kwargs): 20 | return self._func(self._db, self._nnet, image.copy(), *args, **kwargs) 21 | 22 | def __call__(self, image, *args, **kwargs): 23 | categories = self._db.configs["categories"] 24 | bboxes = self._inference(image, *args, **kwargs) 25 | return {self._db.cls2name(j): bboxes[j] for j in range(1, categories + 1)} 26 | 27 | def load_cfg(cfg_file): 28 | with open(cfg_file, "r") as f: 29 | cfg = json.load(f) 30 | 31 | cfg_sys = cfg["system"] 32 | cfg_db = cfg["db"] 33 | return cfg_sys, cfg_db 34 | 35 | def load_nnet(cfg_sys, model): 36 | return NetworkFactory(cfg_sys, model) 37 | -------------------------------------------------------------------------------- /core/config.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | 4 | class SystemConfig(object): 5 | def __init__(self): 6 | self._configs = {} 7 | self._configs["dataset"] = None 8 | self._configs["language"] = "lstm" 9 | self._configs["model"] = "LBYLNet" 10 | self._configs["anchor_based"] = False 11 | self._configs["context"] = "LandmarkP4" 12 | self._configs["ctx_dim"] = 128 13 | self._configs["visu_weight"] = "yolov3.weights" 14 | # optimizer 15 | self._configs["lr_scheduler"] = "cosin_lr" 16 | self._configs["warm_up"] = True 17 | self._configs["warm_up_epoch"] = 0 18 | self._configs["warm_up_from_lr"] = 0.0001 19 | self._configs["learning_rate"] = 0.001 20 | self._configs["decay_rate"] = 10 21 | # multi step lr 22 | self._configs["milestone"] = [30, 80] 23 | self._configs["gamma"] = 0.1 24 | 25 | self._configs["opt_algo"] = "adam" 26 | # Training Config 27 | self._configs["display"] = 5 28 | self._configs["nb_epoch"] = 100 29 | self._configs["print_freq"] = 100 30 | self._configs["snapshot"] = 10 31 | self._configs["stepsize"] = 10 32 | self._configs["val_iter"] = 20 33 | self._configs["batch_size"] = 1 34 | self._configs["snapshot_name"] = None 35 | self._configs["prefetch_size"] = 100 36 | self._configs["pretrain"] = None 37 | self._configs["chunk_sizes"] = None 38 | 39 | # Directories 40 | self._configs["corpus_dir"] = "./data/refer/" 41 | self._configs["data_dir"] = "./data" 42 | self._configs["cache_dir"] = "./cache" 43 | self._configs["config_dir"] = "./config" 44 | self._configs["result_dir"] = "./results" 45 | 46 | # Split 47 | self._configs["train_split"] = "training" 48 | self._configs["val_split"] = "validation" 49 | self._configs["test_split"] = "test" 50 | 51 | # Rng 52 | self._configs["data_rng"] = np.random.RandomState(123) 53 | self._configs["nnet_rng"] = np.random.RandomState(317) 54 | 55 | @property 56 | def lstm(self): 57 | return self._configs['language'] == 'lstm' 58 | 59 | @property 60 | def lang_encoder(self): 61 | return self._configs['language'] 62 | 63 | @property 64 | def model(self): 65 | return self._configs["model"] 66 | 67 | @property 68 | def ctx_dim(self): 69 | return self._configs["ctx_dim"] 70 | 71 | def 
freeze_epoch(self): 72 | return self._configs["frezze_epoch"] 73 | 74 | @property 75 | def visu_weight(self): 76 | return self._configs["visu_weight"] 77 | 78 | @property 79 | def warm_up_lr(self): 80 | return self._configs["warm_up_from_lr"] 81 | @property 82 | def warm_up(self): 83 | return self._configs["warm_up"] 84 | 85 | @property 86 | def context(self): 87 | return self._configs["context"] 88 | 89 | @property 90 | def corpus_dir(self): 91 | return self._configs["corpus_dir"] 92 | 93 | @property 94 | def print_freq(self): 95 | return self._configs["print_freq"] 96 | 97 | @property 98 | def nb_epoch(self): 99 | return self._configs["nb_epoch"] 100 | 101 | @property 102 | def chunk_sizes(self): 103 | return self._configs["chunk_sizes"] 104 | 105 | @property 106 | def train_split(self): 107 | return self._configs["train_split"] 108 | 109 | @property 110 | def val_split(self): 111 | return self._configs["val_split"] 112 | 113 | @property 114 | def test_split(self): 115 | return self._configs["test_split"] 116 | 117 | @property 118 | def full(self): 119 | return self._configs 120 | 121 | @property 122 | def sampling_function(self): 123 | return self._configs["sampling_function"] 124 | 125 | @property 126 | def data_rng(self): 127 | return self._configs["data_rng"] 128 | 129 | @property 130 | def nnet_rng(self): 131 | return self._configs["nnet_rng"] 132 | 133 | @property 134 | def opt_algo(self): 135 | return self._configs["opt_algo"] 136 | 137 | @property 138 | def prefetch_size(self): 139 | return self._configs["prefetch_size"] 140 | 141 | @property 142 | def pretrain(self): 143 | return self._configs["pretrain"] 144 | 145 | @property 146 | def result_dir(self): 147 | result_dir = os.path.join(self._configs["result_dir"], self.snapshot_name, self.dataset) 148 | if not os.path.exists(result_dir): 149 | os.makedirs(result_dir) 150 | return result_dir 151 | 152 | @property 153 | def dataset(self): 154 | return self._configs["dataset"] 155 | 156 | @property 157 | def snapshot_name(self): 158 | return self._configs["snapshot_name"] 159 | 160 | @property 161 | def snapshot_dir(self): 162 | snapshot_dir = os.path.join(self.cache_dir, "nnet", self.snapshot_name, self.dataset) 163 | 164 | if not os.path.exists(snapshot_dir): 165 | os.makedirs(snapshot_dir) 166 | return snapshot_dir 167 | 168 | @property 169 | def snapshot_file(self): 170 | snapshot_file = os.path.join(self.snapshot_dir, self.snapshot_name + "_{}.pkl") 171 | return snapshot_file 172 | 173 | @property 174 | def config_dir(self): 175 | return self._configs["config_dir"] 176 | 177 | @property 178 | def batch_size(self): 179 | return self._configs["batch_size"] 180 | 181 | @property 182 | def learning_rate(self): 183 | return self._configs["learning_rate"] 184 | 185 | @property 186 | def stepsize(self): 187 | return self._configs["stepsize"] 188 | 189 | @property 190 | def snapshot(self): 191 | return self._configs["snapshot"] 192 | 193 | @property 194 | def display(self): 195 | return self._configs["display"] 196 | 197 | @property 198 | def val_iter(self): 199 | return self._configs["val_iter"] 200 | 201 | @property 202 | def data_dir(self): 203 | return self._configs["data_dir"] 204 | 205 | @property 206 | def cache_dir(self): 207 | if not os.path.exists(self._configs["cache_dir"]): 208 | os.makedirs(self._configs["cache_dir"]) 209 | return self._configs["cache_dir"] 210 | 211 | def update_config(self, new): 212 | unrecognized = [] 213 | for key in new: 214 | if key in self._configs: 215 | self._configs[key] = new[key] 216 | else: 
217 | unrecognized.append(key) 218 | if len(unrecognized): 219 | print("warning : unrecognized sys keys {}".format(unrecognized)) 220 | return self 221 | -------------------------------------------------------------------------------- /core/dbs/__init__.py: -------------------------------------------------------------------------------- 1 | from .dataset import Sampler 2 | datasets = { 3 | "refer": Sampler 4 | } 5 | 6 | -------------------------------------------------------------------------------- /core/dbs/base.py: -------------------------------------------------------------------------------- 1 | import os 2 | import numpy as np 3 | 4 | class BASE(object): 5 | def __init__(self): 6 | self._split = None 7 | self._db_inds = [] 8 | self._image_ids = [] 9 | self._mean = np.zeros((3, ), dtype=np.float32) 10 | self._std = np.ones((3, ), dtype=np.float32) 11 | self._eig_val = np.ones((3, ), dtype=np.float32) 12 | self._eig_vec = np.zeros((3, 3), dtype=np.float32) 13 | self._configs = {} 14 | self._configs["data_aug"] = True 15 | self._data_rng = None 16 | 17 | @property 18 | def configs(self): 19 | return self._configs 20 | 21 | @property 22 | def mean(self): 23 | return self._mean 24 | 25 | @property 26 | def std(self): 27 | return self._std 28 | 29 | @property 30 | def eig_val(self): 31 | return self._eig_val 32 | 33 | @property 34 | def eig_vec(self): 35 | return self._eig_vec 36 | 37 | @property 38 | def db_inds(self): 39 | return self._db_inds 40 | 41 | @property 42 | def split(self): 43 | return self._split 44 | 45 | def update_config(self, new): 46 | unrecognized = [] 47 | for key in new: 48 | if key in self._configs: 49 | self._configs[key] = new[key] 50 | else: 51 | unrecognized.append(key) 52 | if len(unrecognized): 53 | print("warning: unrecognized db keys {}".format(unrecognized)) 54 | 55 | def shuffle_inds(self, quiet=False): 56 | if self._data_rng is None: 57 | self._data_rng = np.random.RandomState(os.getpid()) 58 | 59 | if not quiet: 60 | print("shuffling indices...") 61 | rand_perm = self._data_rng.permutation(len(self._db_inds)) 62 | self._db_inds = self._db_inds[rand_perm] 63 | -------------------------------------------------------------------------------- /core/dbs/dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import os.path as osp 3 | import json 4 | import numpy as np 5 | import torch 6 | import sys 7 | from .referring import REFERDB 8 | from ..paths import get_file_path 9 | 10 | from . 
import utils 11 | from .utils import Corpus 12 | sys.modules['utils'] = utils 13 | import pdb 14 | 15 | 16 | class Sampler(REFERDB): 17 | def __init__(self, db_config, split=None, sys_config=None): 18 | super(Sampler, self).__init__(db_config) 19 | self._mean = np.array([0.40789654, 0.44719302, 0.47026115], dtype=np.float32) 20 | self._std = np.array([0.28863828, 0.27408164, 0.27809835], dtype=np.float32) 21 | self._eig_val = np.array([0.2141788, 0.01817699, 0.00341571], dtype=np.float32) 22 | self._eig_vec = np.array([ 23 | [-0.58752847, -0.69563484, 0.41340352], 24 | [-0.5832747, 0.00994535, -0.81221408], 25 | [-0.56089297, 0.71832671, 0.41158938] 26 | ], dtype=np.float32) 27 | 28 | self._dataset = sys_config.dataset 29 | self.data_dir = sys_config.data_dir 30 | 31 | # setting datasource 32 | if self._dataset == 'referit': 33 | self._data_root = osp.join(sys_config.data_dir, 'refer', 'ln_data', 'referit') 34 | self._im_dir = osp.join(self._data_root, 'images') 35 | self._split_dir = osp.join(self._data_root, 'splits') 36 | elif self._dataset == 'flickr': 37 | self._data_root = osp.join(sys_config.data_dir, 'refer', 'ln_data', 'Flickr30k') 38 | self._im_dir = osp.join(self.data_dir, 'flickr30k_images') 39 | else: #refer coco etc. 40 | self._data_root = osp.join(sys_config.data_dir, 'refer', 'ln_data', 'other') 41 | self._im_dir = osp.join(self._data_root, 'images', 'mscoco', 'images', 'train2014') 42 | 43 | dataset_root = osp.join(sys_config.data_dir, 'refer', 'data', self._dataset) 44 | valid_splits = SUPPORTED_DATASETS[self._dataset]['splits'] 45 | 46 | if split not in valid_splits: 47 | raise ValueError( 48 | 'Dataset {0} does not have split {1}'.format( 49 | self._dataset, split)) 50 | 51 | # setting database 52 | self.database = [] 53 | splits = [split] 54 | if self._dataset != 'referit': 55 | splits = ['train', 'val'] if split == 'trainval' else [split] 56 | for split in splits: 57 | imgset_file = '{0}_{1}.pth'.format(self._dataset, split) 58 | imgset_path = osp.join(dataset_root, imgset_file) 59 | self.database += torch.load(imgset_path, map_location = "cpu") 60 | 61 | # processing database 62 | if self._dataset == 'flickr': 63 | self.img_names, self.bboxs, self.phrases = zip(*self.database) 64 | else: 65 | self.img_names, _, self.bboxs, self.phrases, _ = zip(*self.database) 66 | 67 | self._db_inds = np.arange(len(self.database)) 68 | self.corpus = torch.load(db_config["corpus_path"], map_location='cpu') 69 | 70 | self.covert_bbox = [] 71 | if not (self._dataset == 'referit' or self._dataset == 'flickr'): # for refcoco, etc 72 | # covert x1y1wh to x1y1x2y2 73 | for bbox in self.bboxs: 74 | bbox = np.array(bbox, dtype=int) 75 | bbox[2] += bbox[0] 76 | bbox[3] += bbox[1] 77 | self.covert_bbox.append(bbox) 78 | else: 79 | for bbox in self.bboxs: # for referit, flickr 80 | bbox = np.array(bbox, dtype=int) 81 | self.covert_bbox.append(bbox) 82 | 83 | def image_path(self, ind): # notice: db index is the actual index of data. 
84 | return osp.join(self._im_dir, self.img_names[ind]) 85 | 86 | def annotation_box(self, ind): 87 | return self.covert_bbox[ind].copy() 88 | 89 | def phrase(self, ind): 90 | return self.phrases[ind] 91 | 92 | 93 | # Meta Information 94 | SUPPORTED_DATASETS = { 95 | 'referit': {'splits': ('train', 'val', 'trainval', 'test')}, 96 | 'unc': { 97 | 'splits': ('train', 'val', 'trainval', 'testA', 'testB'), 98 | 'params': {'dataset': 'refcoco', 'split_by': 'unc'} 99 | }, 100 | 'unc+': { 101 | 'splits': ('train', 'val', 'trainval', 'testA', 'testB'), 102 | 'params': {'dataset': 'refcoco+', 'split_by': 'unc'} 103 | }, 104 | 'gref': { 105 | 'splits': ('train', 'val'), 106 | 'params': {'dataset': 'refcocog', 'split_by': 'google'} 107 | }, 108 | 'flickr': { 109 | 'splits': ('train', 'val', 'test')} 110 | } 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | -------------------------------------------------------------------------------- /core/dbs/referring.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .base import BASE 4 | 5 | class REFERDB(BASE): 6 | def __init__(self, db_config): 7 | super(REFERDB, self).__init__() 8 | # Configs for 9 | self._configs["data_aug"] = True 10 | self._configs["random_flip"] = True 11 | self._configs["random_affine"] = True 12 | self._configs["random_color"] = True 13 | self._configs["random_lighting"] = True 14 | self._configs["input_size"] = [256, 256] 15 | self._configs["output_sizes"] = [32,32] 16 | 17 | # Configs for both training and testing 18 | self._configs["anchors"] = None 19 | 20 | # Configs for language model 21 | self._configs["vocab_size"] = 0 22 | self._configs["word_embedding_size"] = 512 23 | self._configs["word_vec_size"] = 512 24 | self._configs["hidden_size"] = 512 25 | self._configs["bidirectional"] = True 26 | self._configs["input_dropout_p"] = 0.5 27 | self._configs["dropout_p"] = 0.2 28 | self._configs["n_layers"] = 1 29 | self._configs["max_query_len"] = 128 30 | self._configs["variable_length"] = True 31 | self._configs["joint_embedding_size"] = 256 32 | self._configs["joint_out_dim"] = 256 33 | self._configs["joint_embedding_dropout"] = 0.1 34 | self._configs["joint_mlp_layers"] = 2 35 | self._configs["corpus_path"] = None 36 | self.update_config(db_config) 37 | -------------------------------------------------------------------------------- /core/dbs/utils.py: -------------------------------------------------------------------------------- 1 | from .word_utils import Corpus 2 | Corpus 3 | 4 | -------------------------------------------------------------------------------- /core/dbs/word_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Language-related data loading helper functions and class wrappers. 
5 | """ 6 | 7 | import re 8 | import torch 9 | import codecs 10 | 11 | UNK_TOKEN = '' 12 | PAD_TOKEN = '' 13 | END_TOKEN = '' 14 | SENTENCE_SPLIT_REGEX = re.compile(r'(\W+)') 15 | 16 | 17 | class Dictionary(object): 18 | def __init__(self): 19 | self.word2idx = {} 20 | self.idx2word = [] 21 | 22 | def add_word(self, word): 23 | if word not in self.word2idx: 24 | self.idx2word.append(word) 25 | self.word2idx[word] = len(self.idx2word) - 1 26 | return self.word2idx[word] 27 | 28 | def __len__(self): 29 | return len(self.idx2word) 30 | 31 | def __getitem__(self, a): 32 | if isinstance(a, int): 33 | return self.idx2word[a] 34 | elif isinstance(a, list): 35 | return [self.idx2word[x] for x in a] 36 | elif isinstance(a, str): 37 | return self.word2idx[a] 38 | else: 39 | raise TypeError("Query word/index argument must be int or str") 40 | 41 | def __contains__(self, word): 42 | return word in self.word2idx 43 | 44 | 45 | class Corpus(object): 46 | def __init__(self): 47 | self.dictionary = Dictionary() 48 | 49 | def set_max_len(self, value): 50 | self.max_len = value 51 | 52 | def load_file(self, filename): 53 | with codecs.open(filename, 'r', 'utf-8') as f: 54 | for line in f: 55 | line = line.strip() 56 | self.add_to_corpus(line) 57 | self.dictionary.add_word(UNK_TOKEN) 58 | self.dictionary.add_word(PAD_TOKEN) 59 | 60 | def add_to_corpus(self, line): 61 | """Tokenizes a text line.""" 62 | # Add words to the dictionary 63 | words = line.split() 64 | # tokens = len(words) 65 | for word in words: 66 | word = word.lower() 67 | self.dictionary.add_word(word) 68 | 69 | def tokenize(self, line, max_len=20): 70 | # Tokenize line contents 71 | words = SENTENCE_SPLIT_REGEX.split(line.strip()) 72 | # words = [w.lower() for w in words if len(w) > 0] 73 | words = [w.lower() for w in words if (len(w) > 0 and w!=' ')] ## do not include space as a token 74 | 75 | if words[-1] == '.': 76 | words = words[:-1] 77 | 78 | if max_len > 0: 79 | if len(words) > max_len: 80 | words = words[:max_len] 81 | elif len(words) < max_len: 82 | # words = [PAD_TOKEN] * (max_len - len(words)) + words 83 | words = words + [END_TOKEN] + [PAD_TOKEN] * (max_len - len(words) - 1) 84 | 85 | tokens = len(words) ## for end token 86 | ids = torch.LongTensor(tokens) 87 | token = 0 88 | for word in words: 89 | if word not in self.dictionary: 90 | word = UNK_TOKEN 91 | # print(word, type(word), word.encode('ascii','ignore').decode('ascii'), type(word.encode('ascii','ignore').decode('ascii'))) 92 | if type(word)!=type('a'): 93 | print(word, type(word), word.encode('ascii','ignore').decode('ascii'), type(word.encode('ascii','ignore').decode('ascii'))) 94 | word = word.encode('ascii','ignore').decode('ascii') 95 | ids[token] = self.dictionary[word] 96 | token += 1 97 | # ids[token] = self.dictionary[END_TOKEN] 98 | return ids 99 | 100 | def __len__(self): 101 | return len(self.dictionary) 102 | -------------------------------------------------------------------------------- /core/groundors.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import cv2 3 | import pdb 4 | import torch 5 | # from core.groundors import YoloRefer 6 | import os.path as osp 7 | import json 8 | import pdb 9 | import numpy as np 10 | from core.dbs import datasets 11 | from core.sampler.sampler import Referring 12 | from core.sampler.collate_fn import collate_fn 13 | from core.sampler.utils import letterbox, normalize_, resize_image_ 14 | from core.config import SystemConfig 15 | from core.nnet.nnet_factory 
import NetworkFactory 16 | from core.utils import make_anchors 17 | from core.test.test import _decode, _bbox_iou, _visualize, _decode_anchorbased 18 | from core.models.net import LBYLNet as Model 19 | from core.paths import get_file_path 20 | 21 | torch.backends.cudnn.benchmark = False 22 | 23 | class Net(object): 24 | def __init__(self, cfg_file, iter): 25 | with open(osp.join("./configs", cfg_file + ".json"), "r") as f: 26 | config = json.load(f) 27 | 28 | config["system"]["snapshot_name"] = cfg_file 29 | system_config = SystemConfig().update_config(config["system"]) 30 | system_config.lr = 0.001 31 | self.system_config = system_config 32 | anchors = make_anchors(system_config.dataset, 416) 33 | config["db"]["anchors"] = anchors 34 | config["db"]["corpus_path"] = get_file_path("..", "data", "refer", "data", config["system"]["dataset"], "corpus.pth") 35 | self.config = config 36 | model = Model(system_config, config['db']) 37 | self.model = model 38 | self.nnet = NetworkFactory(system_config, model) 39 | self.nnet.load_params(iter) 40 | self.nnet.eval_mode() 41 | split = system_config.val_split 42 | self.db = datasets['refer'](config["db"], split=split, sys_config=system_config) 43 | self.input_size = self.db.configs["input_size"] 44 | self.dataset = Referring(self.db, system_config, data_aug=False, \ 45 | debug=False, shuffle=False, test=True) 46 | self.original_shape = None 47 | print("initailize...") 48 | 49 | def tokenize(self, phrase): 50 | phrase = self.dataset._tokenize_phrase(phrase) 51 | phrase = torch.stack([phrase]) 52 | return phrase 53 | 54 | def postprocess(self, out): 55 | bbox = _decode_anchorbased(out)[0] 56 | height, width = self.original_shape[0:2] 57 | reshape_ratio = min(self.input_size[0] / height, \ 58 | self.input_size[1] / width) 59 | resize_shape = round(height * reshape_ratio), round(width * reshape_ratio) 60 | dh = (self.input_size[0] - resize_shape[0]) / 2 # height padding 61 | dw = (self.input_size[1] - resize_shape[1]) / 2 # width padding 62 | bbox[0:4:2] = (bbox[0:4:2] - dw) / reshape_ratio 63 | bbox[1:4:2] = (bbox[1:4:2] - dh) / reshape_ratio 64 | bbox[0:4:2] = np.clip(bbox[0:4:2], 0, width-1) 65 | bbox[1:4:2] = np.clip(bbox[1:4:2], 0, height-1) 66 | return bbox 67 | 68 | def prepocess_image(self, image): 69 | if not image.shape[-1] > 1: 70 | image = np.stack([image] * 3) # duplicate channel if gray image 71 | 72 | self.original_shape = image.shape 73 | dummy_bbox = [0, 100, 0, 100] 74 | image, bbox = resize_image_(image, dummy_bbox.copy(), self.input_size, \ 75 | padding_color=tuple((self.db.mean * 255).tolist())) 76 | 77 | image = image.astype(np.float32) / 255. 
78 | normalize_(image, self.db.mean, self.db.std) 79 | image = image.transpose((2, 0, 1)) 80 | image = np.stack([image]) 81 | image = torch.from_numpy(image) 82 | return image 83 | 84 | @torch.no_grad() 85 | def __call__(self, image, phrase): 86 | image = self.prepocess_image(image) 87 | phrase = self.tokenize(phrase) 88 | out = self.nnet.test(image, phrase) 89 | bbox = self.postprocess(out) 90 | return bbox -------------------------------------------------------------------------------- /core/models/context/_pconv/conv4.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import pdb 4 | from torch.autograd import Function 5 | import torch.nn as nn 6 | from landmarkconv import _C 7 | 8 | class TopLeftPoolFunction(Function): 9 | @staticmethod 10 | def forward(ctx, input, guide): 11 | output, maxout = _C.tl_pool_forward(input, guide) 12 | ctx.save_for_backward(input, output, guide, maxout) 13 | return output 14 | 15 | @staticmethod 16 | def backward(ctx, grad_output): 17 | input, output, guide, maxout = ctx.saved_variables 18 | grad_input, grad_guide =_C.tl_pool_backward(input, guide, output, maxout, grad_output) 19 | return grad_input, grad_guide 20 | 21 | class TopRightPoolFunction(Function): 22 | @staticmethod 23 | def forward(ctx, input, guide): 24 | output, maxout = _C.tr_pool_forward(input, guide) 25 | ctx.save_for_backward(input, output, guide, maxout) 26 | return output 27 | 28 | @staticmethod 29 | def backward(ctx, grad_output): 30 | input, output, guide, maxout = ctx.saved_variables 31 | grad_input, grad_guide =_C.tr_pool_backward(input, guide, output, maxout, grad_output) 32 | return grad_input, grad_guide 33 | 34 | class BottomRightPoolFunction(Function): 35 | @staticmethod 36 | def forward(ctx, input, guide): 37 | output, maxout = _C.br_pool_forward(input, guide) 38 | ctx.save_for_backward(input, output, guide, maxout) 39 | return output 40 | 41 | @staticmethod 42 | def backward(ctx, grad_output): 43 | input, output, guide, maxout = ctx.saved_variables 44 | grad_input, grad_guide = _C.br_pool_backward(input, guide, output, maxout, grad_output) 45 | return grad_input, grad_guide 46 | 47 | class BottomLeftPoolFunction(Function): 48 | @staticmethod 49 | def forward(ctx, input, guide): 50 | output, maxout = _C.bl_pool_forward(input, guide) 51 | ctx.save_for_backward(input, output, guide, maxout) 52 | return output 53 | 54 | @staticmethod 55 | def backward(ctx, grad_output): 56 | input, output, guide, maxout = ctx.saved_variables 57 | grad_input, grad_guide =_C.bl_pool_backward(input, guide, output, maxout, grad_output) 58 | return grad_input, grad_guide 59 | 60 | class TopLeftPool(nn.Module): 61 | def forward(self, x, guide): 62 | x = x.contiguous() 63 | guide = guide.expand_as(x).contiguous() 64 | return TopLeftPoolFunction.apply(x, guide) 65 | 66 | class TopRightPool(nn.Module): 67 | def forward(self, x, guide): 68 | x = x.contiguous() 69 | guide = guide.expand_as(x).contiguous() 70 | return TopRightPoolFunction.apply(x, guide) 71 | 72 | class BottomRightPool(nn.Module): 73 | def forward(self, x, guide): 74 | x = x.contiguous() 75 | guide = guide.expand_as(x).contiguous() 76 | return BottomRightPoolFunction.apply(x, guide) 77 | 78 | class BottomLeftPool(nn.Module): 79 | def forward(self, x, guide): 80 | x = x.contiguous() 81 | guide = guide.expand_as(x).contiguous() 82 | return BottomLeftPoolFunction.apply(x, guide) -------------------------------------------------------------------------------- 
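The four directional pooling modules defined in `conv4.py` above are used by `LandmarkP4` in `core/models/context/module.py`, which calls them with an all-ones guide map. A minimal usage sketch, assuming the compiled `landmarkconv` extension (built under `ext/landmarkconv`) is installed and the tensors live on a CUDA device; the shapes below are purely illustrative:

```python
# Minimal sketch: apply one directional landmark pool the same way LandmarkP4
# in core/models/context/module.py does, i.e. with an all-ones guide map.
import torch
from core.models.context._pconv.conv4 import TopLeftPool

pool = TopLeftPool()
x = torch.randn(2, 128, 32, 32, device="cuda", requires_grad=True)
guide = torch.ones_like(x)   # module.py passes torch.ones_like(feature) as the guide
out = pool(x, guide)         # aggregates features over the top-left region
out.sum().backward()         # gradients flow through the custom autograd Function
print(out.shape)             # expected to match the input: torch.Size([2, 128, 32, 32])
```
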
/core/models/context/_pconv/conv8.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import pdb 4 | from torch.autograd import gradcheck 5 | from torch.autograd import Function 6 | import torch.nn as nn 7 | from landmarkconv import _C 8 | 9 | class I1PoolFunction(Function): 10 | @staticmethod 11 | def forward(ctx, input, guide): 12 | output, maxout = _C.I1_pool_forward(input, guide) 13 | ctx.save_for_backward(input, output, guide, maxout) 14 | return output 15 | 16 | @staticmethod 17 | def backward(ctx, grad_output): 18 | input, output, guide, maxout = ctx.saved_variables 19 | grad_input, grad_guide =_C.I1_pool_backward(input, guide, output, maxout, grad_output) 20 | return grad_input, grad_guide 21 | 22 | 23 | class I1Pool(nn.Module): 24 | def forward(self, x, guide): 25 | x = x.contiguous() 26 | guide = guide.expand_as(x).contiguous() 27 | return I1PoolFunction.apply(x, guide) 28 | 29 | 30 | class I2PoolFunction(Function): 31 | @staticmethod 32 | def forward(ctx, input, guide): 33 | output, maxout = _C.I2_pool_forward(input, guide) 34 | ctx.save_for_backward(input, output, guide, maxout) 35 | return output 36 | 37 | @staticmethod 38 | def backward(ctx, grad_output): 39 | input, output, guide, maxout = ctx.saved_variables 40 | grad_input, grad_guide =_C.I2_pool_backward(input, guide, output, maxout, grad_output) 41 | return grad_input, grad_guide 42 | 43 | 44 | class I2Pool(nn.Module): 45 | def forward(self, x, guide): 46 | x = x.contiguous() 47 | guide = guide.expand_as(x).contiguous() 48 | return I2PoolFunction.apply(x, guide) 49 | 50 | 51 | class I3PoolFunction(Function): 52 | @staticmethod 53 | def forward(ctx, input, guide): 54 | output, maxout = _C.I3_pool_forward(input, guide) 55 | ctx.save_for_backward(input, output, guide, maxout) 56 | return output 57 | 58 | @staticmethod 59 | def backward(ctx, grad_output): 60 | input, output, guide, maxout = ctx.saved_variables 61 | grad_input, grad_guide =_C.I3_pool_backward(input, guide, output, maxout, grad_output) 62 | return grad_input, grad_guide 63 | 64 | 65 | class I3Pool(nn.Module): 66 | def forward(self, x, guide): 67 | x = x.contiguous() 68 | guide = guide.expand_as(x).contiguous() 69 | return I3PoolFunction.apply(x, guide) 70 | 71 | 72 | class I4PoolFunction(Function): 73 | @staticmethod 74 | def forward(ctx, input, guide): 75 | output, maxout = _C.I4_pool_forward(input, guide) 76 | ctx.save_for_backward(input, output, guide, maxout) 77 | return output 78 | 79 | @staticmethod 80 | def backward(ctx, grad_output): 81 | input, output, guide, maxout = ctx.saved_variables 82 | grad_input, grad_guide =_C.I4_pool_backward(input, guide, output, maxout, grad_output) 83 | return grad_input, grad_guide 84 | 85 | 86 | class I4Pool(nn.Module): 87 | def forward(self, x, guide): 88 | x = x.contiguous() 89 | guide = guide.expand_as(x).contiguous() 90 | return I4PoolFunction.apply(x, guide) 91 | 92 | class I5PoolFunction(Function): 93 | @staticmethod 94 | def forward(ctx, input, guide): 95 | output, maxout = _C.I5_pool_forward(input, guide) 96 | ctx.save_for_backward(input, output, guide, maxout) 97 | return output 98 | 99 | @staticmethod 100 | def backward(ctx, grad_output): 101 | input, output, guide, maxout = ctx.saved_variables 102 | grad_input, grad_guide =_C.I5_pool_backward(input, guide, output, maxout, grad_output) 103 | return grad_input, grad_guide 104 | 105 | 106 | class I5Pool(nn.Module): 107 | def forward(self, x, guide): 108 | x = x.contiguous() 109 | guide 
= guide.expand_as(x).contiguous() 110 | return I5PoolFunction.apply(x, guide) 111 | 112 | 113 | class I6PoolFunction(Function): 114 | @staticmethod 115 | def forward(ctx, input, guide): 116 | output, maxout = _C.I6_pool_forward(input, guide) 117 | ctx.save_for_backward(input, output, guide, maxout) 118 | return output 119 | 120 | @staticmethod 121 | def backward(ctx, grad_output): 122 | input, output, guide, maxout = ctx.saved_variables 123 | grad_input, grad_guide =_C.I6_pool_backward(input, guide, output, maxout, grad_output) 124 | return grad_input, grad_guide 125 | 126 | 127 | class I6Pool(nn.Module): 128 | def forward(self, x, guide): 129 | x = x.contiguous() 130 | guide = guide.expand_as(x).contiguous() 131 | return I6PoolFunction.apply(x, guide) 132 | 133 | 134 | class I7PoolFunction(Function): 135 | @staticmethod 136 | def forward(ctx, input, guide): 137 | output, maxout = _C.I7_pool_forward(input, guide) 138 | ctx.save_for_backward(input, output, guide, maxout) 139 | return output 140 | 141 | @staticmethod 142 | def backward(ctx, grad_output): 143 | input, output, guide, maxout = ctx.saved_variables 144 | grad_input, grad_guide =_C.I7_pool_backward(input, guide, output, maxout, grad_output) 145 | return grad_input, grad_guide 146 | 147 | 148 | class I7Pool(nn.Module): 149 | def forward(self, x, guide): 150 | x = x.contiguous() 151 | guide = guide.expand_as(x).contiguous() 152 | return I7PoolFunction.apply(x, guide) 153 | 154 | 155 | class I8PoolFunction(Function): 156 | @staticmethod 157 | def forward(ctx, input, guide): 158 | output, maxout = _C.I8_pool_forward(input, guide) 159 | ctx.save_for_backward(input, output, guide, maxout) 160 | return output 161 | 162 | @staticmethod 163 | def backward(ctx, grad_output): 164 | input, output, guide, maxout = ctx.saved_variables 165 | grad_input, grad_guide =_C.I8_pool_backward(input, guide, output, maxout, grad_output) 166 | return grad_input, grad_guide 167 | 168 | 169 | class I8Pool(nn.Module): 170 | def forward(self, x, guide): 171 | x = x.contiguous() 172 | guide = guide.expand_as(x).contiguous() 173 | return I8PoolFunction.apply(x, guide) -------------------------------------------------------------------------------- /core/models/context/module.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import pdb 5 | 6 | class convolution(nn.Module): 7 | def __init__(self, k, inp_dim, out_dim, stride=1, with_bn=True): 8 | super(convolution, self).__init__() 9 | 10 | pad = (k - 1) // 2 11 | self.conv = nn.Conv2d(inp_dim, out_dim, (k, k), padding=(pad, pad), stride=(stride, stride), bias=not with_bn) 12 | self.bn = nn.BatchNorm2d(out_dim) if with_bn else nn.Sequential() 13 | self.relu = nn.ReLU(inplace=True) 14 | 15 | def forward(self, x): 16 | conv = self.conv(x) 17 | bn = self.bn(conv) 18 | relu = self.relu(bn) 19 | return relu 20 | 21 | class residual(nn.Module): 22 | """ 23 | residual block 24 | """ 25 | def __init__(self, inp_dim, out_dim, k=3, stride=1): 26 | super(residual, self).__init__() 27 | p = (k - 1) // 2 28 | 29 | self.conv1 = nn.Conv2d(inp_dim, out_dim, (k, k), padding=(p, p), stride=(stride, stride), bias=False) 30 | self.bn1 = nn.BatchNorm2d(out_dim) 31 | self.relu1 = nn.ReLU(inplace=True) 32 | 33 | self.conv2 = nn.Conv2d(out_dim, out_dim, (k, k), padding=(p, p), bias=False) 34 | self.bn2 = nn.BatchNorm2d(out_dim) 35 | 36 | self.skip = nn.Sequential( 37 | nn.Conv2d(inp_dim, out_dim, (1, 1), stride=(stride, 
stride), bias=False), 38 | nn.BatchNorm2d(out_dim) 39 | ) if stride != 1 or inp_dim != out_dim else nn.Sequential() 40 | self.relu = nn.ReLU(inplace=True) 41 | 42 | def forward(self, x): 43 | conv1 = self.conv1(x) 44 | bn1 = self.bn1(conv1) 45 | relu1 = self.relu1(bn1) 46 | 47 | conv2 = self.conv2(relu1) 48 | bn2 = self.bn2(conv2) 49 | 50 | skip = self.skip(x) # if downsampling, resize the feature map x 51 | return self.relu(bn2 + skip) 52 | 53 | 54 | class Nonlocal(nn.Module): 55 | def __init__(self, dim, mapdim): 56 | super(Nonlocal, self).__init__() 57 | self.mapdim = dim 58 | self._init_layers(dim, mapdim) 59 | 60 | def _init_layers(self, dim, mapdim): 61 | self.conv1 = convolution(3, dim, dim) # b c h w 62 | self.conv2 = convolution(3, dim, dim) 63 | self.phi = nn.Conv2d(dim, dim, 3, padding=1) 64 | self.theta = nn.Conv2d(dim, dim, 3, padding=1) 65 | self.g = nn.Conv2d(dim, dim, 3, padding=1) 66 | self.W = nn.Conv2d(dim, dim, 1) 67 | nn.init.constant_(self.W.weight, 0) 68 | nn.init.constant_(self.W.bias, 0) 69 | # pdb.set_trace() 70 | 71 | def forward(self, x): 72 | x1 = self.conv1(x) 73 | # pdb.set_trace() 74 | b, c, h, w = x.size() 75 | g_x = self.g(x1).view(b, self.mapdim, -1) 76 | g_x = g_x.permute(0, 2, 1) 77 | theta_x = self.theta(x1).view(b, self.mapdim, -1) 78 | theta_x = theta_x.permute(0, 2, 1) 79 | phi_x = self.phi(x1).view(b, self.mapdim, -1) 80 | f = torch.matmul(theta_x, phi_x) 81 | f_div_C = F.softmax(f, dim=-1) 82 | 83 | y = torch.matmul(f_div_C, g_x) 84 | y = y.permute(0, 2, 1).contiguous() 85 | y = y.view(b, self.mapdim, h, w) 86 | 87 | z = self.W(y) + x 88 | z = self.conv2(z) 89 | return z 90 | 91 | class Dilated(nn.Module): 92 | def __init__(self, dim, mapdim, dilate=3): 93 | super(Dilated, self).__init__() 94 | self.mapdim = mapdim 95 | self._init_layers(dim, mapdim, dilate) 96 | 97 | def _init_layers(self, dim, mapdim, dilate): 98 | self.p1_conv = convolution(3, dim, dim) # b c h w 99 | self.p2_conv = nn.Conv2d(dim, dim, (3, 3), padding=(dilate, dilate), dilation=(dilate, dilate), bias=False) 100 | self.p2_bn = nn.BatchNorm2d(dim) 101 | self.conv1 = nn.Conv2d(dim, dim, (1,1), bias=False) 102 | self.bn1 = nn.BatchNorm2d(dim) 103 | self.relu1 = nn.ReLU(inplace=True) 104 | self.conv2 = convolution(3, dim, dim) 105 | 106 | def forward(self,x): 107 | conv = self.p1_conv(x) 108 | p2_conv = self.p2_conv(conv) 109 | p2_bn = self.p2_bn(p2_conv) 110 | conv1 = self.conv1(x) 111 | bn1 = self.bn1(conv1) 112 | relu1 = self.relu1(bn1+p2_bn) 113 | return self.conv2(relu1) 114 | 115 | class LandmarkP4(nn.Module): 116 | def __init__(self, dim, mapdim): 117 | super(LandmarkP4, self).__init__() 118 | # print('using pconv4 pooling checked ...') 119 | self.mapdim = mapdim 120 | self._init_layers(dim) 121 | 122 | def _init_layers(self, dim): 123 | # print(dim) 124 | # map to mapdim 125 | self.p1_conv1 = convolution(3, dim, self.mapdim) 126 | self.p2_conv1 = convolution(3, dim, self.mapdim) 127 | self.p3_conv1 = convolution(3, dim, self.mapdim) 128 | self.p4_conv1 = convolution(3, dim, self.mapdim) 129 | # map back to dim 130 | self.p_conv1 = nn.Conv2d(self.mapdim * 4, dim, (3,3), padding=(1, 1), bias=False) 131 | self.p_bn1 = nn.BatchNorm2d(dim) 132 | 133 | self.conv1 = nn.Conv2d(dim, dim, (1,1), bias=False) 134 | self.bn1 = nn.BatchNorm2d(dim) 135 | self.relu1 = nn.ReLU(inplace=True) 136 | self.conv2 = convolution(3, dim, dim) 137 | from ._pconv.conv4 import TopLeftPool, TopRightPool, BottomLeftPool, BottomRightPool 138 | self.pool1 = TopRightPool() 139 | self.pool2 = TopLeftPool() 
140 | self.pool3 = BottomLeftPool() 141 | self.pool4 = BottomRightPool() 142 | 143 | def forward(self, x): 144 | p1_conv1 = self.p1_conv1(x) 145 | p2_conv1 = self.p2_conv1(x) 146 | p3_conv1 = self.p3_conv1(x) 147 | p4_conv1 = self.p4_conv1(x) 148 | p1 = self.pool1(p1_conv1, torch.ones_like(p1_conv1)) 149 | p2 = self.pool2(p2_conv1, torch.ones_like(p2_conv1)) 150 | p3 = self.pool3(p3_conv1, torch.ones_like(p3_conv1)) 151 | p4 = self.pool4(p4_conv1, torch.ones_like(p4_conv1)) 152 | 153 | pool_feat = torch.cat([p1, p2, p3, p4], dim=1) 154 | p_conv1 = self.p_conv1(pool_feat) 155 | p_bn1 = self.p_bn1(p_conv1) 156 | conv1 = self.conv1(x) 157 | bn1 = self.bn1(conv1) 158 | relu1 = self.relu1(p_bn1 + bn1) 159 | conv2 = self.conv2(relu1) 160 | return conv2 161 | 162 | class LandmarkP1(nn.Module): 163 | def __init__(self, dim, mapdim): 164 | super(LandmarkP1, self).__init__() 165 | # print('using pconv4 pooling checked ...') 166 | self.mapdim = mapdim 167 | self._init_layers(dim) 168 | 169 | def _init_layers(self, dim): 170 | # map to mapdim 171 | self.p1_conv1 = convolution(3, dim, self.mapdim) 172 | # map back to dim 173 | self.p_conv1 = nn.Conv2d(self.mapdim, dim, (3,3), padding=(1, 1), bias=False) 174 | self.p_bn1 = nn.BatchNorm2d(dim) 175 | 176 | self.conv1 = nn.Conv2d(dim, dim, (1,1), bias=False) 177 | self.bn1 = nn.BatchNorm2d(dim) 178 | self.relu1 = nn.ReLU(inplace=True) 179 | self.conv2 = convolution(3, dim, dim) 180 | from ._pconv.conv4 import TopLeftPool, TopRightPool, BottomLeftPool, BottomRightPool 181 | self.pool1 = TopRightPool() 182 | self.pool2 = TopLeftPool() 183 | self.pool3 = BottomLeftPool() 184 | self.pool4 = BottomRightPool() 185 | 186 | def forward(self, x): 187 | p1_conv1 = self.p1_conv1(x) 188 | p2_conv1 = self.p1_conv1(x) 189 | p3_conv1 = self.p1_conv1(x) 190 | p4_conv1 = self.p1_conv1(x) 191 | # bottom right region 192 | p1 = self.pool1(p1_conv1, torch.ones_like(p1_conv1)) 193 | p2 = self.pool2(p2_conv1, torch.ones_like(p2_conv1)) 194 | p3 = self.pool3(p3_conv1, torch.ones_like(p3_conv1)) 195 | p4 = self.pool4(p4_conv1, torch.ones_like(p4_conv1)) 196 | p1 = torch.max(p1, p2) 197 | p2 = torch.max(p3, p4) 198 | p1 = torch.max(p1, p2) 199 | pool_feat = p1 200 | p_conv1 = self.p_conv1(pool_feat) 201 | p_bn1 = self.p_bn1(p_conv1) 202 | conv1 = self.conv1(x) 203 | bn1 = self.bn1(conv1) 204 | relu1 = self.relu1(p_bn1 + bn1) 205 | conv2 = self.conv2(relu1) 206 | return conv2 207 | 208 | 209 | class LandmarkP2(nn.Module): 210 | def __init__(self, dim, mapdim): 211 | super(LandmarkP2, self).__init__() 212 | # print('using pconv4 pooling checked ...') 213 | self.mapdim = mapdim 214 | self._init_layers(dim) 215 | 216 | def _init_layers(self, dim): 217 | # map to mapdim 218 | self.p1_conv1 = convolution(3, dim, self.mapdim) 219 | self.p2_conv1 = convolution(3, dim, self.mapdim) 220 | # map back to dim 221 | self.p_conv1 = nn.Conv2d(self.mapdim * 2, dim, (3,3), padding=(1, 1), bias=False) 222 | self.p_bn1 = nn.BatchNorm2d(dim) 223 | 224 | self.conv1 = nn.Conv2d(dim, dim, (1,1), bias=False) 225 | self.bn1 = nn.BatchNorm2d(dim) 226 | self.relu1 = nn.ReLU(inplace=True) 227 | self.conv2 = convolution(3, dim, dim) 228 | from ._pconv.conv4 import TopLeftPool, TopRightPool, BottomLeftPool, BottomRightPool 229 | self.pool1 = TopRightPool() 230 | self.pool2 = TopLeftPool() 231 | self.pool3 = BottomLeftPool() 232 | self.pool4 = BottomRightPool() 233 | 234 | def forward(self, x): 235 | p1_conv1 = self.p1_conv1(x) 236 | p2_conv1 = self.p1_conv1(x) 237 | p3_conv1 = self.p2_conv1(x) 238 | p4_conv1 = 
self.p2_conv1(x) 239 | # bottom right region 240 | p1 = self.pool1(p1_conv1, torch.ones_like(p1_conv1)) 241 | p2 = self.pool2(p2_conv1, torch.ones_like(p2_conv1)) 242 | p3 = self.pool3(p3_conv1, torch.ones_like(p3_conv1)) 243 | p4 = self.pool4(p4_conv1, torch.ones_like(p4_conv1)) 244 | p1 = torch.max(p1, p2) 245 | p2 = torch.max(p3, p4) 246 | 247 | pool_feat = torch.cat([p1, p2], dim=1) 248 | p_conv1 = self.p_conv1(pool_feat) 249 | p_bn1 = self.p_bn1(p_conv1) 250 | conv1 = self.conv1(x) 251 | bn1 = self.bn1(conv1) 252 | relu1 = self.relu1(p_bn1 + bn1) 253 | conv2 = self.conv2(relu1) 254 | return conv2 255 | 256 | class LandmarkP2x(nn.Module): 257 | def __init__(self, dim, mapdim): 258 | super(LandmarkP2x, self).__init__() 259 | # print('using pconv4 pooling checked ...') 260 | self.mapdim = mapdim 261 | self._init_layers(dim) 262 | 263 | def _init_layers(self, dim): 264 | # map to mapdim 265 | self.p1_conv1 = convolution(3, dim, self.mapdim) 266 | self.p2_conv1 = convolution(3, dim, self.mapdim) 267 | # map back to dim 268 | self.p_conv1 = nn.Conv2d(self.mapdim * 2, dim, (3,3), padding=(1, 1), bias=False) 269 | self.p_bn1 = nn.BatchNorm2d(dim) 270 | self.conv1 = nn.Conv2d(dim, dim, (1,1), bias=False) 271 | self.bn1 = nn.BatchNorm2d(dim) 272 | self.relu1 = nn.ReLU(inplace=True) 273 | self.conv2 = convolution(3, dim, dim) 274 | from ._pconv.conv4 import TopLeftPool, TopRightPool, BottomLeftPool, BottomRightPool 275 | self.pool1 = TopRightPool() 276 | self.pool2 = TopLeftPool() 277 | self.pool3 = BottomLeftPool() 278 | self.pool4 = BottomRightPool() 279 | 280 | def forward(self, x): 281 | p1_conv1 = self.p1_conv1(x) 282 | p2_conv1 = self.p2_conv1(x) 283 | p3_conv1 = self.p2_conv1(x) 284 | p4_conv1 = self.p1_conv1(x) 285 | # bottom right region 286 | p1 = self.pool1(p1_conv1, torch.ones_like(p1_conv1)) 287 | p2 = self.pool2(p2_conv1, torch.ones_like(p2_conv1)) 288 | p3 = self.pool3(p3_conv1, torch.ones_like(p3_conv1)) 289 | p4 = self.pool4(p4_conv1, torch.ones_like(p4_conv1)) 290 | p1 = torch.max(p1, p4) 291 | p2 = torch.max(p2, p3) 292 | pool_feat = torch.cat([p1, p2], dim=1) 293 | p_conv1 = self.p_conv1(pool_feat) 294 | p_bn1 = self.p_bn1(p_conv1) 295 | conv1 = self.conv1(x) 296 | bn1 = self.bn1(conv1) 297 | relu1 = self.relu1(p_bn1 + bn1) 298 | conv2 = self.conv2(relu1) 299 | return conv2 300 | 301 | class LandmarkP8(nn.Module): 302 | def __init__(self, dim, mapdim): 303 | super(LandmarkP8, self).__init__() 304 | # print('using pconv8 pooling checked ...') 305 | self.mapdim = mapdim 306 | self._init_layers(dim) 307 | 308 | def _init_layers(self, dim): 309 | # map to mapdim 310 | self.p1_conv1 = convolution(3, dim, self.mapdim) 311 | self.p2_conv1 = convolution(3, dim, self.mapdim) 312 | self.p3_conv1 = convolution(3, dim, self.mapdim) 313 | self.p4_conv1 = convolution(3, dim, self.mapdim) 314 | self.p5_conv1 = convolution(3, dim, self.mapdim) 315 | self.p6_conv1 = convolution(3, dim, self.mapdim) 316 | self.p7_conv1 = convolution(3, dim, self.mapdim) 317 | self.p8_conv1 = convolution(3, dim, self.mapdim) 318 | 319 | # map back to dim 320 | self.p_conv1 = nn.Conv2d(self.mapdim * 8, dim, (3,3), padding=(1, 1), bias=False) 321 | self.p_bn1 = nn.BatchNorm2d(dim) 322 | 323 | self.conv1 = nn.Conv2d(dim, dim, (1,1), bias=False) 324 | self.bn1 = nn.BatchNorm2d(dim) 325 | self.relu1 = nn.ReLU(inplace=True) 326 | self.conv2 = convolution(3, dim, dim) 327 | 328 | from ._pconv.conv8 import I1Pool, I2Pool, I3Pool, I4Pool, I5Pool, I6Pool, I7Pool, I8Pool 329 | self.pool1 = I1Pool() 330 | self.pool2 = 
I2Pool() 331 | self.pool3 = I3Pool() 332 | self.pool4 = I4Pool() 333 | self.pool5 = I5Pool() 334 | self.pool6 = I6Pool() 335 | self.pool7 = I7Pool() 336 | self.pool8 = I8Pool() 337 | 338 | def forward(self, x, hook=None, hook_feat=None, hookdir=2): 339 | p1_conv1 = self.p1_conv1(x) 340 | p2_conv1 = self.p2_conv1(x) 341 | p3_conv1 = self.p3_conv1(x) 342 | p4_conv1 = self.p4_conv1(x) 343 | p5_conv1 = self.p5_conv1(x) 344 | p6_conv1 = self.p6_conv1(x) 345 | p7_conv1 = self.p7_conv1(x) 346 | p8_conv1 = self.p8_conv1(x) 347 | 348 | p1 = self.pool1(p1_conv1, torch.ones_like(p1_conv1)) 349 | p2 = self.pool2(p2_conv1, torch.ones_like(p2_conv1)) 350 | p3 = self.pool3(p3_conv1, torch.ones_like(p3_conv1)) 351 | p4 = self.pool4(p4_conv1, torch.ones_like(p4_conv1)) 352 | p5 = self.pool5(p5_conv1, torch.ones_like(p5_conv1)) 353 | p6 = self.pool6(p6_conv1, torch.ones_like(p6_conv1)) 354 | p7 = self.pool7(p7_conv1, torch.ones_like(p7_conv1)) 355 | p8 = self.pool8(p8_conv1, torch.ones_like(p8_conv1)) 356 | 357 | pool_feat = torch.cat([p1, p2, p3, p4, p5, p6, p7, p8], dim=1) 358 | p_conv1 = self.p_conv1(pool_feat) 359 | p_bn1 = self.p_bn1(p_conv1) 360 | conv1 = self.conv1(x) 361 | bn1 = self.bn1(conv1) 362 | relu1 = self.relu1(p_bn1 + bn1) 363 | conv2 = self.conv2(relu1) 364 | return conv2 365 | 366 | 367 | class LandmarkP4x(nn.Module): 368 | def __init__(self, dim, mapdim): 369 | super(LandmarkP4x, self).__init__() 370 | # print('using pconv4x pooling checked ...') 371 | self.mapdim = mapdim 372 | self._init_layers(dim) 373 | 374 | def _init_layers(self, dim): 375 | # map to mapdim 376 | self.p1_conv1 = convolution(3, dim, self.mapdim) 377 | self.p2_conv1 = convolution(3, dim, self.mapdim) 378 | self.p3_conv1 = convolution(3, dim, self.mapdim) 379 | self.p4_conv1 = convolution(3, dim, self.mapdim) 380 | 381 | # map back to dim 382 | self.p_conv1 = nn.Conv2d(self.mapdim * 4, dim, (3,3), padding=(1, 1), bias=False) 383 | self.p_bn1 = nn.BatchNorm2d(dim) 384 | 385 | self.conv1 = nn.Conv2d(dim, dim, (1,1), bias=False) 386 | self.bn1 = nn.BatchNorm2d(dim) 387 | self.relu1 = nn.ReLU(inplace=True) 388 | self.conv2 = convolution(3, dim, dim) 389 | 390 | from ._pconv.conv8 import I1Pool, I2Pool, I3Pool, I4Pool, I5Pool, I6Pool, I7Pool, I8Pool 391 | self.pool1 = I1Pool() 392 | self.pool2 = I2Pool() 393 | self.pool3 = I3Pool() 394 | self.pool4 = I4Pool() 395 | self.pool5 = I5Pool() 396 | self.pool6 = I6Pool() 397 | self.pool7 = I7Pool() 398 | self.pool8 = I8Pool() 399 | 400 | def forward(self, x): 401 | p1_conv1 = self.p1_conv1(x) 402 | p2_conv1 = self.p2_conv1(x) 403 | p3_conv1 = self.p2_conv1(x) 404 | p4_conv1 = self.p3_conv1(x) 405 | p5_conv1 = self.p3_conv1(x) 406 | p6_conv1 = self.p4_conv1(x) 407 | p7_conv1 = self.p4_conv1(x) 408 | p8_conv1 = self.p1_conv1(x) 409 | 410 | p1 = self.pool1(p1_conv1, torch.ones_like(p1_conv1)) 411 | p2 = self.pool2(p2_conv1, torch.ones_like(p2_conv1)) 412 | p3 = self.pool3(p3_conv1, torch.ones_like(p3_conv1)) 413 | p4 = self.pool4(p4_conv1, torch.ones_like(p4_conv1)) 414 | p5 = self.pool5(p5_conv1, torch.ones_like(p5_conv1)) 415 | p6 = self.pool6(p6_conv1, torch.ones_like(p6_conv1)) 416 | p7 = self.pool7(p7_conv1, torch.ones_like(p7_conv1)) 417 | p8 = self.pool8(p8_conv1, torch.ones_like(p8_conv1)) 418 | p1 = torch.max(p1, p8) 419 | p2 = torch.max(p2, p3) 420 | p3 = torch.max(p4, p5) 421 | p4 = torch.max(p6, p7) 422 | pool_feat = torch.cat([p1, p2, p3, p4], dim=1) 423 | p_conv1 = self.p_conv1(pool_feat) 424 | p_bn1 = self.p_bn1(p_conv1) 425 | conv1 = self.conv1(x) 426 | bn1 = 
self.bn1(conv1) 427 | relu1 = self.relu1(p_bn1 + bn1) 428 | conv2 = self.conv2(relu1) 429 | return conv2 -------------------------------------------------------------------------------- /core/models/lang_encoder/RNNencoder.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import os.path as osp 4 | import numpy as np 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | from pytorch_pretrained_bert.tokenization import BertTokenizer 9 | from pytorch_pretrained_bert.modeling import BertModel 10 | from .. import utils 11 | from ..utils import Corpus 12 | sys.modules['utils'] = utils 13 | import pdb 14 | 15 | class BertEncoder(nn.Module): 16 | def __init__(self, cfg_db, cfg_sys=None): 17 | super(BertEncoder, self).__init__() 18 | self.cfg_db = cfg_db 19 | self.bert_name = cfg_sys.lang_encoder 20 | self.model = BertModel.from_pretrained(self.bert_name) 21 | if self.bert_name == 'bert-base-uncased': 22 | self.lang_dim = 768 23 | else: 24 | self.lang_dim = 1024 25 | self.num_layers = 4 26 | 27 | def forward(self, input, mask=None): 28 | max_len = (input!=0).sum(1).max().item() 29 | encoded_layers, _= self.model(input[:, :max_len], attention_mask=mask[:, :max_len]) 30 | features = None 31 | features = torch.stack(encoded_layers[-4:], 1).mean(1) 32 | # features have shape [len(phrase), seq_len, lang_dim] 33 | features = features / self.num_layers 34 | hidden = ((features * mask[:, :max_len].unsqueeze(-1).float()).sum(1) / mask.sum(-1).unsqueeze(-1).float()) 35 | embedded = features[:, :, :] 36 | ret = { 37 | 'hidden':hidden, 38 | 'embedded': embedded, 39 | 'masks': mask 40 | } 41 | return ret 42 | 43 | 44 | 45 | class RNNEncoder(nn.Module): 46 | def __init__(self, cfg_db, cfg_sys=None): 47 | super(RNNEncoder, self).__init__() 48 | self.cfg_db = cfg_db 49 | 50 | self.variable_length = cfg_db['variable_length'] 51 | self.word_embedding_size = cfg_db['word_embedding_size'] 52 | self.word_vec_size = cfg_db['word_vec_size'] 53 | self.hidden_size = cfg_db['hidden_size'] 54 | self.bidirectional = cfg_db['bidirectional'] 55 | self.input_dropout_p = cfg_db['input_dropout_p'] 56 | self.dropout_p = cfg_db['dropout_p'] 57 | self.n_layers = cfg_db['n_layers'] 58 | self.rnn_type = 'lstm' # by default LSTM 59 | self.corpus_path = cfg_db['corpus_path'] 60 | self.vocab_size = len(torch.load(self.corpus_path, map_location = "cpu")) 61 | 62 | # encoder language 63 | self.embedding = nn.Embedding(self.vocab_size, self.word_embedding_size) 64 | self.input_dropout = nn.Dropout(self.input_dropout_p) 65 | self.mlp = nn.Sequential(nn.Linear(self.word_embedding_size, self.word_vec_size), nn.ReLU()) 66 | self.rnn = getattr(nn, self.rnn_type.upper())(self.word_vec_size, 67 | self.hidden_size, 68 | self.n_layers, 69 | batch_first=True, 70 | bidirectional=self.bidirectional, 71 | dropout = self.dropout_p) 72 | self.num_dirs = 2 if self.bidirectional else 1 73 | 74 | def forward(self, input, mask=None): 75 | word_id = input 76 | max_len = (word_id!=0).sum(1).max().item() 77 | word_id = word_id[:, :max_len] # mask zero 78 | # embedding 79 | output, hidden, embedded, final_output = self.RNNEncode(word_id) 80 | return { 81 | 'hidden': hidden, 82 | 'output': output, 83 | 'embedded': embedded, 84 | 'final_output': final_output, 85 | } 86 | 87 | def RNNEncode(self, input_labels): 88 | """ 89 | Inputs: 90 | - input_labels: Variable long (batch, seq_len) 91 | Outputs: 92 | - output : Variable float (batch, max_len, hidden_size * num_dirs) 93 
| - hidden : Variable float (batch, num_layers * num_dirs * hidden_size) 94 | - embedded: Variable float (batch, max_len, word_vec_size) 95 | """ 96 | device = input_labels.device 97 | if self.variable_length: 98 | input_lengths_list, sorted_lengths_list, sort_idxs, recover_idxs = self.sort_inputs(input_labels) 99 | input_labels = input_labels[sort_idxs] 100 | 101 | embedded = self.embedding(input_labels) #(n, seq_len, word_embedding_size) 102 | embedded = self.input_dropout(embedded) #(n, seq_len, word_embedding_size) 103 | embedded = self.mlp(embedded) #(n, seq_len, word_vec_size) 104 | 105 | if self.variable_length: 106 | embedded = nn.utils.rnn.pack_padded_sequence(embedded, \ 107 | sorted_lengths_list,\ 108 | batch_first=True) 109 | # forward rnn 110 | self.rnn.flatten_parameters() 111 | output, hidden = self.rnn(embedded) 112 | 113 | # recover 114 | if self.variable_length: 115 | # recover embedded 116 | embedded, _ = nn.utils.rnn.pad_packed_sequence(embedded, batch_first=True) # (batch, max_len, word_vec_size) 117 | embedded = embedded[recover_idxs] 118 | 119 | # recover output 120 | output, _ = nn.utils.rnn.pad_packed_sequence(output, batch_first=True) # (batch, max_len, hidden_size * num_dir) 121 | output = output[recover_idxs] 122 | 123 | # recover hidden 124 | if self.rnn_type == 'lstm': 125 | hidden = hidden[0] # hidden state 126 | hidden = hidden[:, recover_idxs, :] # (num_layers * num_dirs, batch, hidden_size) 127 | hidden = hidden.transpose(0,1).contiguous() # (batch, num_layers * num_dirs, hidden_size) 128 | hidden = hidden.view(hidden.size(0), -1) # (batch, num_layers * num_dirs * hidden_size) 129 | 130 | # finnal output 131 | finnal_output = [] 132 | for ii in range(output.shape[0]): 133 | finnal_output.append(output[ii, int(input_lengths_list[ii]-1), :]) 134 | finnal_output = torch.stack(finnal_output, dim=0) # (batch, number_dirs * hidden_size) 135 | 136 | return output, hidden, embedded, finnal_output 137 | 138 | def sort_inputs(self, input_labels): # sort input labels by descending 139 | device = input_labels.device 140 | input_lengths = (input_labels!=0).sum(1) 141 | input_lengths_list = input_lengths.data.cpu().numpy().tolist() 142 | sorted_input_lengths_list = np.sort(input_lengths_list)[::-1].tolist() # list of sorted input_lengths 143 | sort_idxs = np.argsort(input_lengths_list)[::-1].tolist() 144 | s2r = {s:r for r, s in enumerate(sort_idxs)} 145 | recover_idxs = [s2r[s] for s in range(len(input_lengths_list))] 146 | assert max(input_lengths_list) == input_labels.size(1) 147 | # move to long tensor 148 | sort_idxs = input_labels.data.new(sort_idxs).long().to(device) # Variable long 149 | recover_idxs = input_labels.data.new(recover_idxs).long().to(device) # Variable long 150 | return input_lengths_list, sorted_input_lengths_list, sort_idxs, recover_idxs 151 | -------------------------------------------------------------------------------- /core/models/lang_encoder/__init__.py: -------------------------------------------------------------------------------- 1 | from .RNNencoder import BertEncoder, RNNEncoder -------------------------------------------------------------------------------- /core/models/net/__init__.py: -------------------------------------------------------------------------------- 1 | from .lbylnet import LBYLNet 2 | from .baseline import Baseline -------------------------------------------------------------------------------- /core/models/net/baseline.py: -------------------------------------------------------------------------------- 1 | import 
torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | import pdb 6 | import re 7 | from ...paths import get_file_path 8 | from ..context.module import convolution, residual 9 | from ..lang_encoder.RNNencoder import BertEncoder, RNNEncoder 10 | from ..utils.losses import Loss 11 | from .darknet import Darknet 12 | 13 | 14 | class Baseline(nn.Module): 15 | def __init__(self, cfg_sys, cfg_db): 16 | super(Baseline, self).__init__() 17 | self.init_configs(cfg_sys, cfg_db) 18 | # visu_encoder 19 | weight_path = get_file_path("..", "ext", "yolov3.weights") 20 | darknet_cfg_path = get_file_path("..", "ext", "yolov3.cfg") 21 | self.visu_encoder = Darknet(config_path=darknet_cfg_path) 22 | self.visu_encoder.load_weights(weight_path) 23 | # language encoder 24 | if self.lstm: 25 | self.lang_encoder = RNNEncoder(self.cfg_db) 26 | else: 27 | self.lang_encoder = BertEncoder(self.cfg_db) 28 | 29 | # fusion 30 | self.mapping_visu = nn.ModuleList([self._make_conv(dim, self.joint_embedding_size, 1) \ 31 | for dim in [1024, 512, 256]]) 32 | self.mapping_lang = self._make_mlp(self.lang_dim, self.joint_embedding_size, self.joint_embedding_dropout) 33 | self.norms = nn.ModuleList([nn.InstanceNorm2d(self.joint_inp_dim) for _ in [1024, 512, 256]]) 34 | self.joint_fusion = nn.ModuleList([self._make_conv(self.joint_inp_dim, self.joint_out_dim, 1) \ 35 | for _ in [1024, 512, 256]]) 36 | # localization 37 | self.out_funcs = nn.ModuleList([self._make_pred(self.joint_out_dim, 3*5) 38 | for _ in range(3)]) 39 | self.loss = Loss(off_weight=5., anchors=self.anchors, input_size=self.input_size[0], alpha=cfg_sys.alpha) 40 | 41 | def forward(self, images, phrases, masks=None, test=False): 42 | device = images.device 43 | batch_size = images.shape[0] 44 | visu_feats = self.visu_encoder(images) # fpn head 45 | coord_feats = [self._make_coord(batch_size, x.shape[2], x.shape[3]) for x in visu_feats] 46 | lang_feat = self.lang_encoder(phrases, mask=phrases.gt(0)) 47 | lang_feat = lang_feat['hidden'] 48 | lang_feat = self.mapping_lang(lang_feat) 49 | 50 | # concat conv 51 | visu_feat = [] 52 | for ii, feat in enumerate(visu_feats): 53 | coord_feat = coord_feats[ii].to(device) 54 | lang_feat = self._normalize(lang_feat) 55 | lang_feat = lang_feat.view(lang_feat.shape[0], lang_feat.shape[1], 1, 1) # tile to match feature map 56 | feat = self.norms[ii](feat) 57 | feat = torch.cat([self.mapping_visu[ii](feat), lang_feat.repeat(1, 1, feat.shape[2], feat.shape[3]), 58 | coord_feat], dim=1) 59 | visu_feat.append(feat) 60 | 61 | # joint fusion 62 | joint_feats = [fusion(feat) for feat, fusion in zip(visu_feat, self.joint_fusion)] 63 | # make prediction 64 | outs = [func(feat) for func, feat in zip(self.out_funcs, joint_feats)] 65 | if not test: 66 | return outs 67 | else: 68 | return self.loss(outs, targets=None) 69 | 70 | 71 | def init_configs(self, cfg_sys, cfg_db): 72 | self.cfg_sys = cfg_sys 73 | self.cfg_db = cfg_db 74 | self.lstm = cfg_sys.lstm 75 | # loading parameter 76 | self.anchors = self.cfg_db["anchors"] 77 | self.joint_embedding_size = self.cfg_db['joint_embedding_size'] 78 | self.joint_embedding_dropout = self.cfg_db['joint_embedding_dropout'] 79 | self.joint_mlp_layers = self.cfg_db['joint_mlp_layers'] 80 | self.n_layers = self.cfg_db['n_layers'] 81 | self.output_sizes = self.cfg_db['output_sizes'] 82 | self.hidden_size = self.cfg_db['hidden_size'] 83 | self.input_size = self.cfg_db['input_size'] 84 | self.num_dirs = 2 if self.cfg_db['bidirectional'] else 1 85 | self.lang_dim = 
self.hidden_size * self.num_dirs 86 | self.coord_dim = 8 87 | self.joint_inp_dim = self.coord_dim + self.joint_embedding_size * 2 # concat 88 | self.joint_out_dim = self.cfg_db['joint_out_dim'] 89 | self.pooldim = self.cfg_sys.pooldim 90 | if not self.lstm and self.cfg_db['rnn_type']== 'bert-base-uncased': 91 | self.lang_dim = 768 92 | else: 93 | self.lang_dim = 1024 94 | 95 | 96 | def _make_pred(self, input_dim, output_dim): 97 | pred = nn.Sequential( 98 | convolution(3, input_dim, input_dim, with_bn=False), 99 | nn.Conv2d(input_dim, output_dim, (1, 1)) 100 | ) 101 | if self.cfg_sys.balance_init: 102 | nn.init.normal_(pred[0].conv.weight, mean=0, std=0.01) 103 | nn.init.constant_(pred[0].conv.bias, 0.0) 104 | nn.init.constant_(pred[1].weight, 0.0) 105 | pi = 0.001 106 | nn.init.constant_(pred[1].bias, -np.log((1-pi)/pi)) 107 | return pred 108 | 109 | def _make_mlp(self, input_dim, output_dim, drop): 110 | return nn.Sequential(nn.Linear(input_dim, output_dim), 111 | nn.BatchNorm1d(output_dim), 112 | nn.ReLU(inplace=True), 113 | nn.Dropout(drop), 114 | nn.Linear(output_dim, output_dim), 115 | nn.BatchNorm1d(output_dim), 116 | nn.ReLU(inplace=True)) 117 | 118 | def _make_conv(self, input_dim, output_dim, k, stride=1): 119 | pad = (k - 1) // 2 120 | return nn.Sequential( 121 | nn.Conv2d(input_dim, output_dim, (k, k), padding=(pad, pad), stride=(stride, stride)), 122 | nn.BatchNorm2d(output_dim), 123 | nn.ReLU(inplace=True) 124 | ) 125 | 126 | def _make_coord(self, batch, height, width): 127 | xv, yv = torch.meshgrid([torch.arange(0,height), torch.arange(0,width)]) 128 | xv_min = (xv.float()*2 - width)/width 129 | yv_min = (yv.float()*2 - height)/height 130 | xv_max = ((xv+1).float()*2 - width)/width 131 | yv_max = ((yv+1).float()*2 - height)/height 132 | xv_ctr = (xv_min+xv_max)/2 133 | yv_ctr = (yv_min+yv_max)/2 134 | hmap = torch.ones(height, width)*(1./height) 135 | wmap = torch.ones(height, width)*(1./width) 136 | coord = torch.autograd.Variable(torch.cat([xv_min.unsqueeze(0), yv_min.unsqueeze(0),\ 137 | xv_max.unsqueeze(0), yv_max.unsqueeze(0),\ 138 | xv_ctr.unsqueeze(0), yv_ctr.unsqueeze(0),\ 139 | hmap.unsqueeze(0), wmap.unsqueeze(0)], dim=0)) 140 | coord = coord.unsqueeze(0).repeat(batch,1,1,1) 141 | return coord 142 | 143 | def _normalize(self, feat, p=2, dim=1): 144 | return F.normalize(feat, p, dim) -------------------------------------------------------------------------------- /core/models/net/lbylnet.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | import pdb 6 | import re 7 | from ...paths import get_file_path 8 | # from ..context.module import convolution, residual 9 | from ..lang_encoder.RNNencoder import BertEncoder, RNNEncoder 10 | from ..utils.losses import Loss 11 | from .darknet import Darknet 12 | from ..context.module import (convolution, residual, LandmarkP1,LandmarkP2, LandmarkP2x, LandmarkP4, LandmarkP4x, LandmarkP8, Nonlocal, Dilated) 13 | 14 | context = { 15 | 'LandmarkP8': LandmarkP8, 16 | 'LandmarkP4': LandmarkP4, 17 | 'LandmarkP4x':LandmarkP4x, 18 | 'LandmarkP2': LandmarkP2, 19 | 'LandmarkP2x':LandmarkP2x, 20 | 'Nonlocal': Nonlocal, 21 | 'Dilated' : Dilated, 22 | } 23 | 24 | class LBYLNet(nn.Module): 25 | def __init__(self, cfg_sys, cfg_db): 26 | super(LBYLNet, self).__init__() 27 | self.init_configs(cfg_sys, cfg_db) 28 | 29 | # visu_encoder 30 | weight_path = get_file_path("..", "ext", "yolov3.weights") 31 | 
darknet_cfg_path = get_file_path("..", "ext", "yolov3.cfg") 32 | self.visu_encoder = Darknet(config_path=darknet_cfg_path) 33 | self.visu_encoder.load_weights(weight_path) 34 | 35 | # lang_encoder 36 | if self.lstm: 37 | self.lang_encoder = RNNEncoder(self.cfg_db) 38 | else: 39 | self.lang_encoder = BertEncoder(self.cfg_db, cfg_sys=cfg_sys) 40 | 41 | # fusion module 42 | self.mapping_visu = nn.ModuleList([self._make_conv(dim, self.joint_embedding_size, 1) \ 43 | for dim in [1024, 512, 256]]) 44 | self.norms = nn.ModuleList([nn.InstanceNorm2d(self.joint_inp_dim) for _ in [1024, 512, 256]]) 45 | self.mapping_lang = self._make_mlp(self.lang_dim, self.joint_embedding_size, self.joint_embedding_dropout) 46 | self.gamma = nn.ModuleList(nn.Linear(self.joint_embedding_size, self.joint_inp_dim) for _ in [1024, 512, 256]) 47 | self.beta = nn.ModuleList(nn.Linear(self.joint_embedding_size, self.joint_inp_dim) for _ in [1024, 512, 256]) 48 | 49 | self.joint_fusion = nn.ModuleList([self._make_conv(self.joint_inp_dim, self.joint_out_dim, 1) \ 50 | for _ in [1024, 512, 256]]) 51 | 52 | # landmark feature convolution module 53 | self.maxpool = nn.MaxPool2d(kernel_size=2, stride=2) 54 | self.upsample = nn.Upsample(scale_factor=2) 55 | self.refine = self._make_conv(self.joint_out_dim, self.joint_out_dim, 1) 56 | 57 | self.context_block = None 58 | if self.cfg_sys.context: 59 | self.context_block = context.get(cfg_sys.context)(self.joint_out_dim, mapdim=self.map_dim) 60 | # print("using context module {}".format(self.context_block)) 61 | 62 | self.out_funcs = nn.ModuleList([self._make_pred(self.joint_out_dim, 3*5) 63 | for _ in range(3)]) 64 | # localization module 65 | self.loss = Loss(off_weight=5., anchors=self.anchors, input_size=self.input_size[0]) 66 | 67 | 68 | def forward(self, images, phrases, masks=None, test=False): 69 | # visual module 70 | device = images.device 71 | batch_size = images.shape[0] 72 | # visual language encoders 73 | visu_feats = self.visu_encoder(images) # fpn P3->P5 74 | coord_feats = [self._make_coord(batch_size, x.shape[2], x.shape[3]) for x in visu_feats] 75 | lang_feat = self.lang_encoder(phrases, mask=phrases.gt(0)) 76 | # only use a global representation of language 77 | # you can also use more complex modeling using word-level representations 78 | # to improve performance, motivated by lots of previous works. 79 | # Usage: lang_feat = lang_feat['words'] shape [seq_len, dim] 80 | lang_feat = lang_feat['hidden'] 81 | lang_feat = self.mapping_lang(lang_feat) 82 | 83 | # use FILM to enhance visual-language fusion 84 | gamma = [F.tanh(gamma(lang_feat)) for gamma in self.gamma] 85 | beta = [F.tanh(beta(lang_feat)) for beta in self.beta] 86 | visu_feat = [] 87 | for ii, feat in enumerate(visu_feats): 88 | coord_feat = coord_feats[ii].to(device) 89 | feat = torch.cat([self.mapping_visu[ii](feat), coord_feat], dim=1) 90 | feat = self.norms[ii](feat) 91 | b = beta[ii].view(batch_size, -1, 1, 1).expand_as(feat) 92 | g = gamma[ii].view(batch_size, -1, 1, 1).expand_as(feat) 93 | feat = F.relu(g * feat + b) 94 | visu_feat.append(feat) 95 | 96 | # prior to modeling context, we tackle the scale problem for modeling context by simply sum them. 97 | joint_feats = [fusion(feat) for feat, fusion in zip(visu_feat, self.joint_fusion)] 98 | lower_feat = self.upsample(joint_feats[0]) 99 | higher_feat = self.maxpool(joint_feats[-1]) 100 | inter_feat = self.refine((joint_feats[1] + lower_feat + higher_feat) / 3.) 101 | 102 | # any context-aware module you want. 
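# note: self.context_block is built in __init__ from the `context` dict at the top of this file,
# so cfg_sys.context selects LandmarkP2/P2x/P4/P4x/P8, Nonlocal or Dilated; when no context module
# is configured it stays None and the step below is skipped, leaving the fused feature unchanged.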
103 | if self.context_block: 104 | inter_feat = self.context_block(inter_feat) 105 | 106 | # distribute back to FPN accounting for scale problem for detection. 107 | joint_feats[0] = joint_feats[0] + self.maxpool(inter_feat) 108 | joint_feats[1] = joint_feats[1] + inter_feat 109 | joint_feats[2] = joint_feats[2] + self.upsample(inter_feat) 110 | 111 | # make predictions. 112 | outs = [func(feat) for func, feat in zip(self.out_funcs, joint_feats)] 113 | if not test: 114 | return outs 115 | else: 116 | return self.loss(outs, targets=None) 117 | 118 | def init_configs(self, cfg_sys, cfg_db): 119 | self.cfg_sys = cfg_sys 120 | self.cfg_db = cfg_db 121 | self.lstm = cfg_sys.lstm 122 | # loading parameter 123 | self.lang = self.cfg_sys.lang_encoder 124 | self.anchors = self.cfg_db["anchors"] 125 | self.joint_embedding_size = self.cfg_db['joint_embedding_size'] 126 | self.joint_embedding_dropout = self.cfg_db['joint_embedding_dropout'] 127 | self.joint_mlp_layers = self.cfg_db['joint_mlp_layers'] 128 | self.n_layers = self.cfg_db['n_layers'] 129 | self.output_sizes = self.cfg_db['output_sizes'] 130 | self.hidden_size = self.cfg_db['hidden_size'] 131 | self.input_size = self.cfg_db['input_size'] 132 | self.num_dirs = 2 if self.cfg_db['bidirectional'] else 1 133 | self.lang_dim = self.hidden_size * self.num_dirs 134 | self.coord_dim = 8 135 | self.joint_inp_dim = self.coord_dim + self.joint_embedding_size 136 | self.joint_out_dim = self.cfg_db['joint_out_dim'] 137 | self.map_dim = self.cfg_sys.ctx_dim 138 | if not self.lstm and self.lang == 'bert-base-uncased': 139 | self.lang_dim = 768 140 | else: 141 | self.lang_dim = 1024 142 | 143 | 144 | def _make_pred(self, input_dim, output_dim): 145 | pred = nn.Sequential( 146 | convolution(3, input_dim, input_dim, with_bn=False), 147 | nn.Conv2d(input_dim, output_dim, (1, 1)) 148 | ) 149 | return pred 150 | 151 | def _make_mlp(self, input_dim, output_dim, drop): 152 | return nn.Sequential(nn.Linear(input_dim, output_dim), 153 | nn.BatchNorm1d(output_dim), 154 | nn.ReLU(inplace=True), 155 | nn.Dropout(drop), 156 | nn.Linear(output_dim, output_dim), 157 | nn.BatchNorm1d(output_dim), 158 | nn.ReLU(inplace=True)) 159 | 160 | def _make_conv(self, input_dim, output_dim, k, stride=1): 161 | pad = (k - 1) // 2 162 | return nn.Sequential( 163 | nn.Conv2d(input_dim, output_dim, (k, k), padding=(pad, pad), stride=(stride, stride)), 164 | nn.BatchNorm2d(output_dim), 165 | nn.ReLU(inplace=True) 166 | ) 167 | 168 | def _make_coord(self, batch, height, width): 169 | xv, yv = torch.meshgrid([torch.arange(0,height), torch.arange(0,width)]) 170 | xv_min = (xv.float()*2 - width)/width 171 | yv_min = (yv.float()*2 - height)/height 172 | xv_max = ((xv+1).float()*2 - width)/width 173 | yv_max = ((yv+1).float()*2 - height)/height 174 | xv_ctr = (xv_min+xv_max)/2 175 | yv_ctr = (yv_min+yv_max)/2 176 | hmap = torch.ones(height, width)*(1./height) 177 | wmap = torch.ones(height, width)*(1./width) 178 | coord = torch.autograd.Variable(torch.cat([xv_min.unsqueeze(0), yv_min.unsqueeze(0),\ 179 | xv_max.unsqueeze(0), yv_max.unsqueeze(0),\ 180 | xv_ctr.unsqueeze(0), yv_ctr.unsqueeze(0),\ 181 | hmap.unsqueeze(0), wmap.unsqueeze(0)], dim=0)) 182 | coord = coord.unsqueeze(0).repeat(batch,1,1,1) 183 | return coord 184 | 185 | def _normalize(self, feat, p=2, dim=1): 186 | return F.normalize(feat, p, dim) -------------------------------------------------------------------------------- /core/models/net/yolov3.cfg: -------------------------------------------------------------------------------- 
1 | [net] 2 | # Testing 3 | #batch=1 4 | #subdivisions=1 5 | # Training 6 | batch=16 7 | subdivisions=1 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | 
[convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 | 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | 
pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | [convolutional] 576 | batch_normalize=1 577 | size=3 578 | stride=1 579 | pad=1 580 | filters=1024 581 | activation=leaky 582 | 583 | [yoloconvolutional] 584 | batch_normalize=1 585 | filters=512 586 | size=1 587 | stride=1 588 | pad=1 589 | activation=leaky 590 | 591 | [convolutional] 592 | batch_normalize=1 593 | size=3 594 | stride=1 595 | pad=1 596 | filters=1024 597 | activation=leaky 598 | 599 | [convolutional] 600 | size=1 601 | stride=1 602 | pad=1 603 | filters=255 604 | activation=linear 605 | 606 | 607 | [yolo] 608 | mask = 6,7,8 609 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 610 | classes=80 611 | num=9 612 | jitter=.3 613 | ignore_thresh = .7 614 | truth_thresh = 1 615 | random=1 616 | 617 | 618 | [route] 619 | layers = -4 620 | 621 | [convolutional] 622 | batch_normalize=1 623 | filters=256 624 | size=1 625 | stride=1 626 | pad=1 627 | activation=leaky 628 | 629 | [upsample] 630 | stride=2 631 | 632 | [route] 633 | layers = -1, 61 634 | 635 | 636 | 637 | [convolutional] 638 | batch_normalize=1 639 | filters=256 640 | size=1 641 | stride=1 642 | pad=1 643 | activation=leaky 644 | 645 | [convolutional] 646 | batch_normalize=1 647 | size=3 648 | stride=1 649 | pad=1 650 | filters=512 651 | activation=leaky 652 | 653 | [convolutional] 654 | batch_normalize=1 655 | filters=256 656 | size=1 657 | stride=1 658 | pad=1 659 | activation=leaky 660 | 
661 | [convolutional] 662 | batch_normalize=1 663 | size=3 664 | stride=1 665 | pad=1 666 | filters=512 667 | activation=leaky 668 | 669 | [yoloconvolutional] 670 | batch_normalize=1 671 | filters=256 672 | size=1 673 | stride=1 674 | pad=1 675 | activation=leaky 676 | 677 | [convolutional] 678 | batch_normalize=1 679 | size=3 680 | stride=1 681 | pad=1 682 | filters=512 683 | activation=leaky 684 | 685 | [convolutional] 686 | size=1 687 | stride=1 688 | pad=1 689 | filters=255 690 | activation=linear 691 | 692 | 693 | [yolo] 694 | mask = 3,4,5 695 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 696 | classes=80 697 | num=9 698 | jitter=.3 699 | ignore_thresh = .7 700 | truth_thresh = 1 701 | random=1 702 | 703 | 704 | 705 | [route] 706 | layers = -4 707 | 708 | [convolutional] 709 | batch_normalize=1 710 | filters=128 711 | size=1 712 | stride=1 713 | pad=1 714 | activation=leaky 715 | 716 | [upsample] 717 | stride=2 718 | 719 | [route] 720 | layers = -1, 36 721 | 722 | 723 | 724 | [convolutional] 725 | batch_normalize=1 726 | filters=128 727 | size=1 728 | stride=1 729 | pad=1 730 | activation=leaky 731 | 732 | [convolutional] 733 | batch_normalize=1 734 | size=3 735 | stride=1 736 | pad=1 737 | filters=256 738 | activation=leaky 739 | 740 | [convolutional] 741 | batch_normalize=1 742 | filters=128 743 | size=1 744 | stride=1 745 | pad=1 746 | activation=leaky 747 | 748 | [convolutional] 749 | batch_normalize=1 750 | size=3 751 | stride=1 752 | pad=1 753 | filters=256 754 | activation=leaky 755 | 756 | [yoloconvolutional] 757 | batch_normalize=1 758 | filters=128 759 | size=1 760 | stride=1 761 | pad=1 762 | activation=leaky 763 | 764 | [convolutional] 765 | batch_normalize=1 766 | size=3 767 | stride=1 768 | pad=1 769 | filters=256 770 | activation=leaky 771 | 772 | [convolutional] 773 | size=1 774 | stride=1 775 | pad=1 776 | filters=255 777 | activation=linear 778 | 779 | 780 | [yolo] 781 | mask = 0,1,2 782 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 783 | classes=80 784 | num=9 785 | jitter=.3 786 | ignore_thresh = .7 787 | truth_thresh = 1 788 | random=1 789 | -------------------------------------------------------------------------------- /core/models/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .word_utils import Corpus 2 | Corpus -------------------------------------------------------------------------------- /core/models/utils/losses.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | import numpy as np 5 | import pdb 6 | 7 | def _sigmoid(x): 8 | return torch.clamp(x.sigmoid_(), min=1e-4, max=1-1e-4) 9 | 10 | def bbox_wh_iou(wh1, wh2): 11 | wh2 = wh2.t() 12 | w1, h1 = wh1[0], wh1[1] 13 | w2, h2 = wh2[0], wh2[1] 14 | inter_area = torch.min(w1, w2) * torch.min(h1, h2) 15 | union_area = (w1 * h1 + 1e-16) + w2 * h2 - inter_area 16 | return inter_area / union_area 17 | 18 | def bbox_iou(box1, box2, x1y1x2y2=True): 19 | """ 20 | Returns the IoU of two bounding boxes 21 | """ 22 | if not x1y1x2y2: 23 | # Transform from center and width to exact coordinates 24 | b1_x1, b1_x2 = box1[:, 0] - box1[:, 2] / 2, box1[:, 0] + box1[:, 2] / 2 25 | b1_y1, b1_y2 = box1[:, 1] - box1[:, 3] / 2, box1[:, 1] + box1[:, 3] / 2 26 | b2_x1, b2_x2 = box2[:, 0] - box2[:, 2] / 2, box2[:, 0] + box2[:, 2] / 2 27 | b2_y1, b2_y2 = box2[:, 1] - box2[:, 3] / 2, box2[:, 1] 
+ box2[:, 3] / 2 28 | else: 29 | # Get the coordinates of bounding boxes 30 | b1_x1, b1_y1, b1_x2, b1_y2 = box1[:, 0], box1[:, 1], box1[:, 2], box1[:, 3] 31 | b2_x1, b2_y1, b2_x2, b2_y2 = box2[:, 0], box2[:, 1], box2[:, 2], box2[:, 3] 32 | 33 | # get the corrdinates of the intersection rectangle 34 | inter_rect_x1 = torch.max(b1_x1, b2_x1) 35 | inter_rect_y1 = torch.max(b1_y1, b2_y1) 36 | inter_rect_x2 = torch.min(b1_x2, b2_x2) 37 | inter_rect_y2 = torch.min(b1_y2, b2_y2) 38 | # Intersection area 39 | inter_area = torch.clamp(inter_rect_x2 - inter_rect_x1 + 1, min=0) * torch.clamp( 40 | inter_rect_y2 - inter_rect_y1 + 1, min=0 41 | ) 42 | # Union Area 43 | b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1) 44 | b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1) 45 | 46 | iou = inter_area / (b1_area + b2_area - inter_area + 1e-16) 47 | return iou 48 | 49 | def _build_target(pred_bboxes, targets, anchors): 50 | # pred_bboxes (batch_size, num_bboxes, 5) 51 | # targets (batch_size, targets, anchors) 52 | device = pred_bboxes.device 53 | anchors = anchors.to(device) 54 | # anchor = torch.from_numpy(anchors).float().to(device) 55 | nB = pred_bboxes.size(0) 56 | nA = pred_bboxes.size(1) 57 | nG = pred_bboxes.size(2) # number of grid 58 | 59 | # iou_scores = torch.FloatTensor(nB, nA, nG, nG).fill_(0).to(device) 60 | tx = torch.FloatTensor(nB, nA, nG, nG).fill_(0).to(device) 61 | ty = torch.FloatTensor(nB, nA, nG, nG).fill_(0).to(device) 62 | tw = torch.FloatTensor(nB, nA, nG, nG).fill_(0).to(device) 63 | th = torch.FloatTensor(nB, nA, nG, nG).fill_(0).to(device) 64 | obj_mask = torch.ByteTensor(nB, nA, nG, nG).fill_(0).to(device) 65 | 66 | targets = targets * nG 67 | gxy = (targets[:, :2] + targets[:, 2:]).float() / 2 68 | gwh = (targets[:, 2:] - targets[:, :2]).float() 69 | 70 | # find max overlap in original scales 71 | # calculate iou with bbox prior in the original scales 72 | ious = torch.stack([bbox_wh_iou(anchors[i], gwh) for i in range(len(anchors))]) 73 | best_ious, best_n = ious.max(0) 74 | gx, gy = gxy.t() 75 | gw, gh = gwh.t() 76 | gi, gj = gxy.long().t() 77 | 78 | tx[range(0, nB), best_n, gj, gi] = gx - gx.floor() 79 | ty[range(0, nB), best_n, gj, gi] = gy - gy.floor() 80 | # Width and height 81 | tw[range(0, nB), best_n, gj, gi] = torch.log(gw / anchors[best_n][:, 0] + 1e-16) 82 | th[range(0, nB), best_n, gj, gi] = torch.log(gh / anchors[best_n][:, 1] + 1e-16) 83 | obj_mask[range(0, nB), best_n, gj, gi] = 1 84 | 85 | 86 | return gi, gj, gw, gh, best_ious, best_n, tx, ty, tw, th, obj_mask# iou_scores[b, best_n, gj, gi] = bbox_iou(pred_boxes[b, best_n, gj, gi], target_boxes, x1y1x2y2=False) 87 | 88 | class Loss(nn.Module): 89 | def __init__(self, off_weight, anchors, input_size): 90 | super(Loss, self).__init__() 91 | self.img_dim = input_size 92 | self.off_weight = off_weight 93 | self.anchors = np.array([list(anchor) for anchor in anchors]) 94 | self.anchors = self.anchors.reshape(3, 3, 2) 95 | self.anchors = [self.anchors[i] for i in range(0, 3)] 96 | self.num_anchors = len(anchors) 97 | self.mse_loss = nn.MSELoss(reduction='none') 98 | self.ce_loss = nn.CrossEntropyLoss(reduction='none') 99 | self.grid_size = 0 100 | 101 | def reshape(self, x): 102 | batch_size, _, h, w = x.shape 103 | return (x.view(batch_size, 3, 5, h, w) \ 104 | .permute(0, 1, 3, 4, 2) \ 105 | .contiguous() \ 106 | .view(batch_size, 3, h, w, 5) \ 107 | .view(batch_size, 3*h*w, 5) 108 | ) 109 | 110 | def loss(self, outs, targets): 111 | xs, best_ious, txs, tys, tws, ths, obj_masks = [],[],[],[],[],[],[] 112 | 
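# note: the loop below calls target_generator once per FPN level to build that level's regression
# targets; the per-level best-anchor IoUs are stacked afterwards so that only the level whose
# anchor prior best matches the ground-truth box keeps its positive cell (the other obj_masks are zeroed).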
113 | for i, (x, a) in enumerate(zip(outs, self.anchors)): 114 | _, _, _, _, best_iou, best_n, tx, ty, tw, th, obj_mask \ 115 | = self.target_generator(x, a, targets) 116 | batch_size, _, h, w = x.shape 117 | x = self.reshape(x) 118 | xs.append(x) 119 | best_ious.append(best_iou) 120 | txs.append(tx.view(batch_size, -1)) 121 | tys.append(ty.view(batch_size, -1)) 122 | tws.append(tw.view(batch_size, -1)) 123 | ths.append(th.view(batch_size, -1)) 124 | obj_masks.append(obj_mask.view(batch_size, -1)) 125 | 126 | index = torch.stack(best_ious, dim=1).max(dim=1)[1] 127 | for batch_id, mask_id in enumerate(index): 128 | for i in range(len(obj_masks)): 129 | if i == mask_id: 130 | continue 131 | obj_masks[i][batch_id, ...] = 0 132 | 133 | obj_masks = torch.cat(obj_masks, dim=1) 134 | if hasattr(obj_mask, 'bool'): 135 | obj_masks = obj_masks.bool() 136 | 137 | assert(obj_masks.sum().item() == len(index)) 138 | txs = torch.cat(txs, dim=1) 139 | tys = torch.cat(tys, dim=1) 140 | tws = torch.cat(tws, dim=1) 141 | ths = torch.cat(ths, dim=1) 142 | xs = torch.cat(xs, dim=1) 143 | 144 | loss_x = self.mse_loss(_sigmoid(xs[..., 0][obj_masks]), txs[obj_masks]) 145 | loss_y = self.mse_loss(_sigmoid(xs[..., 1][obj_masks]), tys[obj_masks]) 146 | loss_w = self.mse_loss(xs[..., 2][obj_masks], tws[obj_masks]) 147 | loss_h = self.mse_loss(xs[..., 3][obj_masks], ths[obj_masks]) 148 | off_loss = (loss_x + loss_y + loss_w + loss_h).mean() 149 | 150 | loss_conf = self.ce_loss(xs[..., 4], obj_masks.max(1)[1]).mean() 151 | loss = off_loss * self.off_weight + loss_conf 152 | return loss.unsqueeze(0), loss_conf.unsqueeze(0), self.off_weight * off_loss.unsqueeze(0) 153 | 154 | def forward(self, outs, targets): 155 | # map to (0, 1) 156 | if targets is None: 157 | return [self.target_generator(x, a) for x, a in zip(outs, self.anchors)] 158 | 159 | targets = targets.float() / self.img_dim 160 | return self.loss(outs, targets) 161 | 162 | def compute_grid_offsets(self, grid_size, anchors, cuda=True): 163 | self.grid_size = grid_size 164 | g = self.grid_size 165 | self.stride = self.img_dim / self.grid_size 166 | # Calculate offsets for each grid 167 | self.grid_x = torch.arange(g).repeat(g, 1).view([1, 1, g, g]).to(self.device).float() 168 | self.grid_y = torch.arange(g).repeat(g, 1).t().view([1, 1, g, g]).to(self.device).float() 169 | self.scaled_anchors = torch.FloatTensor([(a_w / self.stride, a_h / self.stride) for a_w, a_h in anchors]).to(self.device) 170 | self.anchor_w = self.scaled_anchors[:, 0:1].view((1, len(anchors), 1, 1)) 171 | self.anchor_h = self.scaled_anchors[:, 1:2].view((1, len(anchors), 1, 1)) 172 | 173 | 174 | def target_generator(self, x, layer_anchors, targets=None): 175 | # x [b, 3, 5, h, w] 176 | self.device = x.device 177 | # targets = targets.to(self.device) 178 | batch_size, _, h, w = x.shape 179 | grid_size = h 180 | prediction = ( 181 | x.view(batch_size, 3, 5, h, w) 182 | .permute(0, 1, 3, 4, 2) 183 | .contiguous() 184 | ).float() 185 | 186 | x = torch.sigmoid(prediction[..., 0]) # center x 187 | y = torch.sigmoid(prediction[..., 1]) # center y 188 | w = prediction[..., 2] # Width 189 | h = prediction[..., 3] # Height 190 | pred_conf = prediction[..., 4] 191 | 192 | # If grid size does not match current we compute new offsets 193 | if grid_size != self.grid_size: 194 | self.compute_grid_offsets(grid_size, layer_anchors, cuda=x.is_cuda) 195 | 196 | # Add offset and scale with anchors 197 | pred_boxes = torch.FloatTensor(prediction[..., :4].shape).to(self.device) 198 | pred_boxes[..., 0] = x.data 
+ self.grid_x 199 | pred_boxes[..., 1] = y.data + self.grid_y 200 | pred_boxes[..., 2] = torch.exp(w.data) * self.anchor_w 201 | pred_boxes[..., 3] = torch.exp(h.data) * self.anchor_h 202 | 203 | output = torch.cat( 204 | ( 205 | pred_boxes.view(batch_size, -1, 4) * self.stride, 206 | pred_conf.view(batch_size, -1, 1) 207 | ), 208 | -1, 209 | ) 210 | if targets is None: 211 | return output 212 | 213 | gi, gj, gw, gh, best_ious, best_n, tx, ty, tw, th, object_mask = _build_target(pred_boxes, targets, self.scaled_anchors) 214 | 215 | return gi, gj, gw, gh, best_ious, best_n, tx, ty, tw, th, object_mask 216 | -------------------------------------------------------------------------------- /core/models/utils/word_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Language-related data loading helper functions and class wrappers. 5 | """ 6 | 7 | import re 8 | import torch 9 | import codecs 10 | 11 | UNK_TOKEN = '' 12 | PAD_TOKEN = '' 13 | END_TOKEN = '' 14 | SENTENCE_SPLIT_REGEX = re.compile(r'(\W+)') 15 | 16 | 17 | class Dictionary(object): 18 | def __init__(self): 19 | self.word2idx = {} 20 | self.idx2word = [] 21 | 22 | def add_word(self, word): 23 | if word not in self.word2idx: 24 | self.idx2word.append(word) 25 | self.word2idx[word] = len(self.idx2word) - 1 26 | return self.word2idx[word] 27 | 28 | def __len__(self): 29 | return len(self.idx2word) 30 | 31 | def __getitem__(self, a): 32 | if isinstance(a, int): 33 | return self.idx2word[a] 34 | elif isinstance(a, list): 35 | return [self.idx2word[x] for x in a] 36 | elif isinstance(a, str): 37 | return self.word2idx[a] 38 | else: 39 | raise TypeError("Query word/index argument must be int or str") 40 | 41 | def __contains__(self, word): 42 | return word in self.word2idx 43 | 44 | 45 | class Corpus(object): 46 | def __init__(self): 47 | self.dictionary = Dictionary() 48 | 49 | def set_max_len(self, value): 50 | self.max_len = value 51 | 52 | def load_file(self, filename): 53 | with codecs.open(filename, 'r', 'utf-8') as f: 54 | for line in f: 55 | line = line.strip() 56 | self.add_to_corpus(line) 57 | self.dictionary.add_word(UNK_TOKEN) 58 | self.dictionary.add_word(PAD_TOKEN) 59 | 60 | def add_to_corpus(self, line): 61 | """Tokenizes a text line.""" 62 | # Add words to the dictionary 63 | words = line.split() 64 | # tokens = len(words) 65 | for word in words: 66 | word = word.lower() 67 | self.dictionary.add_word(word) 68 | 69 | def tokenize(self, line, max_len=20): 70 | # Tokenize line contents 71 | words = SENTENCE_SPLIT_REGEX.split(line.strip()) 72 | # words = [w.lower() for w in words if len(w) > 0] 73 | words = [w.lower() for w in words if (len(w) > 0 and w!=' ')] ## do not include space as a token 74 | 75 | if words[-1] == '.': 76 | words = words[:-1] 77 | 78 | if max_len > 0: 79 | if len(words) > max_len: 80 | words = words[:max_len] 81 | elif len(words) < max_len: 82 | # words = [PAD_TOKEN] * (max_len - len(words)) + words 83 | words = words + [END_TOKEN] + [PAD_TOKEN] * (max_len - len(words) - 1) 84 | 85 | tokens = len(words) ## for end token 86 | ids = torch.LongTensor(tokens) 87 | token = 0 88 | for word in words: 89 | if word not in self.dictionary: 90 | word = UNK_TOKEN 91 | # print(word, type(word), word.encode('ascii','ignore').decode('ascii'), type(word.encode('ascii','ignore').decode('ascii'))) 92 | if type(word)!=type('a'): 93 | print(word, type(word), word.encode('ascii','ignore').decode('ascii'), 
type(word.encode('ascii','ignore').decode('ascii'))) 94 | word = word.encode('ascii','ignore').decode('ascii') 95 | ids[token] = self.dictionary[word] 96 | token += 1 97 | # ids[token] = self.dictionary[END_TOKEN] 98 | return ids 99 | 100 | def __len__(self): 101 | return len(self.dictionary) 102 | -------------------------------------------------------------------------------- /core/nnet/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/svip-lab/LBYLNet/47a0e83b6db60d7efe1f74acdf4cb210ffd9554d/core/nnet/__init__.py -------------------------------------------------------------------------------- /core/nnet/nnet_factory.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import pickle 4 | import importlib 5 | import torch.nn as nn 6 | import pdb 7 | import logging 8 | from ..utils.data_parallel import DataParallel 9 | torch.manual_seed(317) 10 | 11 | class Network(nn.Module): 12 | def __init__(self, model, loss): 13 | super(Network, self).__init__() 14 | 15 | self.model = model 16 | self.loss = loss 17 | 18 | def forward(self, images, phrases, labels, **kwargs): 19 | preds = self.model(images, phrases, **kwargs) 20 | loss, focal_loss, off_loss = self.loss(preds, labels, **kwargs) 21 | return loss, focal_loss, off_loss 22 | 23 | # for model backward compatibility 24 | # previously model was wrapped by DataParallel module 25 | class DummyModule(nn.Module): 26 | def __init__(self, model): 27 | super(DummyModule, self).__init__() 28 | self.module = model 29 | 30 | def forward(self, *xs, **kwargs): 31 | return self.module(*xs, **kwargs) 32 | 33 | class NetworkFactory(object): 34 | def __init__(self, system_config, model, distributed=False, gpu=None): 35 | super(NetworkFactory, self).__init__() 36 | 37 | self.system_config = system_config 38 | 39 | self.gpu = gpu 40 | self.model = DummyModule(model) 41 | self.loss = model.loss 42 | self.network = Network(self.model, self.loss) 43 | 44 | if distributed: 45 | from apex.parallel import DistributedDataParallel, convert_syncbn_model 46 | torch.cuda.set_device(gpu) 47 | print("gpu {} build distributed Dataparallel".format(gpu)) 48 | self.network = self.network.cuda(gpu) 49 | self.network = convert_syncbn_model(self.network) 50 | self.network = DistributedDataParallel(self.network) 51 | print("gpu {} build distributed model finished".format(gpu)) 52 | else: 53 | # self.network = DataParallel(self.network, chunk_sizes=system_config.chunk_sizes) 54 | self.network = nn.DataParallel(self.network).cuda(gpu) 55 | total_params = 0 56 | for params in self.model.parameters(): 57 | num_params = 1 58 | for x in params.size(): 59 | num_params *= x 60 | total_params += num_params 61 | 62 | self.logger = logging.getLogger() 63 | self.logger.info("ground model total parameters: {:.2f} M".format(total_params / 1000000)) 64 | 65 | 66 | if system_config.opt_algo == "adam": 67 | self.optimizer = torch.optim.Adam( 68 | filter(lambda p: p.requires_grad, self.model.parameters()), 69 | lr= system_config.lr 70 | ) 71 | elif system_config.opt_algo == "sgd": 72 | self.optimizer = torch.optim.SGD( 73 | filter(lambda p: p.requires_grad, self.model.parameters()), 74 | lr=system_config.lr, 75 | momentum=0.9, weight_decay=0.001 76 | ) 77 | elif system_config.opt_algo == 'rmsprop': 78 | if system_config.debug: 79 | pdb.set_trace() 80 | 81 | self.optimizer = torch.optim.RMSprop( 82 | filter(lambda p: p.requires_grad, self.model.parameters()), 
83 | lr=system_config.lr, 84 | weight_decay=0.0005 85 | ) 86 | # visu_params = self.model.module.visu_encoder.parameters() 87 | # visu_params = [param for param in self.model.module.visu_encoder.parameters()] 88 | # rest_params = [param for param in self.model.module.parameters() if param not in visu_params] 89 | # visu_params = list(visu_params) 90 | # self.optimizer = torch.optim.RMSprop( 91 | # [{'params':rest_params}, 92 | # {'params':visu_params, 'lr':system_config.lr / 10.}], 93 | # filter(lambda p: p.requires_grad, self.model.parameters()), 94 | # lr = system_config.lr, 95 | # weight_decay =0.0005 96 | # ) 97 | else: 98 | raise ValueError("unknown optimizer") 99 | 100 | def cuda(self): 101 | self.model.cuda() 102 | 103 | def train_mode(self): 104 | self.network.train() 105 | 106 | def eval_mode(self): 107 | self.network.eval() 108 | 109 | def _t_cuda(self, xs): 110 | if type(xs) is list: 111 | return [x.cuda(self.gpu, non_blocking=True) for x in xs] 112 | return xs.cuda(self.gpu, non_blocking=True) 113 | 114 | def train(self, images, phrases, labels, **kwargs): 115 | images = images.contiguous() 116 | images = self._t_cuda(images) 117 | phrases = self._t_cuda(phrases) 118 | if isinstance(labels, list): 119 | labels = [self._t_cuda(label) for label in labels] 120 | else: 121 | labels = self._t_cuda(labels) 122 | self.optimizer.zero_grad() 123 | loss, focal_loss, off_loss = self.network(images, phrases, labels) 124 | loss = loss.mean() 125 | focal_loss = focal_loss.mean() 126 | off_loss = off_loss.mean() 127 | loss.backward() 128 | self.optimizer.step() 129 | 130 | return loss, focal_loss, off_loss 131 | 132 | def validate(self, images, phrases, labels, **kwargs): 133 | with torch.no_grad(): 134 | images = images.contiguous() 135 | images = self._t_cuda(images) 136 | phrases = self._t_cuda(phrases) 137 | # labels = [self._t_cuda(label) for label in labels] 138 | if isinstance(labels, list): 139 | labels = [self._t_cuda(label) for label in labels] 140 | else: 141 | labels = self._t_cuda(labels) 142 | 143 | loss, focal_loss, off_loss = self.network(images, phrases, labels) 144 | loss = loss.mean() 145 | focal_loss = focal_loss.mean() 146 | off_loss = off_loss.mean() 147 | return loss, focal_loss, off_loss 148 | 149 | def test(self, images, phrases, **kwargs): 150 | with torch.no_grad(): 151 | images = images.contiguous() 152 | images = self._t_cuda(images) 153 | phrases = self._t_cuda(phrases) 154 | return self.model(images, phrases, test=True) 155 | 156 | 157 | def set_lr(self, lr): 158 | self.logger.info("setting learning rate to: {}".format(lr)) 159 | for param_group in self.optimizer.param_groups: 160 | param_group["lr"] = lr 161 | param_group['initial_lr'] = lr 162 | 163 | def get_lr(self): 164 | # for param_group in self.optimizer.param_groups: 165 | return self.optimizer.param_groups[0]["lr"] 166 | # return lr 167 | 168 | def load_pretrained_params(self, pretrained_model): 169 | self.logger.info("loading from {}".format(pretrained_model)) 170 | with open(pretrained_model, "rb") as f: 171 | params = torch.load(f, map_location="cpu") 172 | self.model.load_state_dict(params) 173 | 174 | def load_params(self, epoch): 175 | cache_file = self.system_config.snapshot_file.format(epoch) 176 | self.logger.info("loading model from {}".format(cache_file)) 177 | with open(cache_file, "rb") as f: 178 | check_pt = torch.load(f, map_location="cpu") 179 | if type(check_pt) == dict: 180 | params = check_pt['params'] 181 | else: 182 | params = check_pt 183 | # params = torch.load(f, 
map_location="cpu") 184 | self.model.load_state_dict(params) 185 | 186 | def save_params(self, epoch): 187 | cache_file = self.system_config.snapshot_file.format(epoch) 188 | self.logger.info("saving model to {}".format(epoch)) 189 | with open(cache_file, "wb") as f: 190 | params = self.model.state_dict() 191 | check_pt = { 192 | 'params': params, 193 | 'optimizer': self.optimizer.state_dict() 194 | } 195 | torch.save(check_pt, f) 196 | 197 | def lr_poly(self, base_lr, iter, max_iter, power=0.9): 198 | return base_lr * ((1 - float(iter) / max_iter) ** (power)) 199 | 200 | def lr_step(self, lr, decay_rate): 201 | return lr / decay_rate -------------------------------------------------------------------------------- /core/optimizer/lr_scheduler.py: -------------------------------------------------------------------------------- 1 | from torch.optim.lr_scheduler import MultiStepLR, StepLR, CosineAnnealingLR 2 | from torch.optim.lr_scheduler import _LRScheduler 3 | from torch.optim.lr_scheduler import ReduceLROnPlateau 4 | import pdb 5 | 6 | def make_scheduler(optimizer, system_config, last_epoch=-1, logger=None): 7 | warm_up = system_config._configs['warm_up'] 8 | max_epoch = system_config._configs['nb_epoch'] 9 | warm_up_epoch = system_config._configs["warm_up_epoch"] if warm_up else 0 10 | 11 | if system_config._configs["lr_scheduler"] == "step_lr": 12 | lr_scheduler = StepLR(optimizer, 13 | step_size=system_config._configs["stepsize"], 14 | gamma=system_config._configs["gamma"], 15 | last_epoch=last_epoch) 16 | elif system_config._configs["lr_scheduler"] == "multi_step_lr": 17 | lr_scheduler = MultiStepLR(optimizer, 18 | milestones= system_config._configs["milestones"], 19 | gamma= system_config._configs["gamma"], 20 | last_epoch=last_epoch) 21 | elif system_config._configs["lr_scheduler"] == "poly_lr": 22 | lr_scheduler = PolyScheduler(optimizer, 23 | max_epoch= system_config._configs["nb_epoch"], 24 | power=0.9, 25 | last_epoch=last_epoch) 26 | elif system_config._configs["lr_scheduler"] == "cosin_lr": 27 | max_epoch = max_epoch - warm_up_epoch 28 | lr_scheduler = CosineAnnealingLR(optimizer, T_max=max_epoch) 29 | 30 | if system_config._configs['warm_up']: 31 | warm_up_lr = system_config._configs["warm_up_from_lr"] 32 | target_lr = system_config._configs["learning_rate"] 33 | warm_up_epoch = system_config._configs["warm_up_epoch"] 34 | multiplier = target_lr / warm_up_lr 35 | warm_up_scheduler = GradualWarmupScheduler(optimizer, 36 | multiplier, 37 | warm_up_epoch, 38 | after_scheduler=lr_scheduler, 39 | last_epoch=last_epoch) 40 | lr_scheduler = warm_up_scheduler 41 | info = 'Warm up learning from {:.9f} to {:.9f} in the first {} epoches'\ 42 | .format(warm_up_lr, 43 | target_lr, 44 | warm_up_epoch) 45 | print(info) 46 | info = "after that {} are applied.".format(system_config._configs["lr_scheduler"]) 47 | print(info) 48 | 49 | return lr_scheduler 50 | 51 | 52 | class PolyScheduler(_LRScheduler): 53 | def __init__(self, optimizer, max_epoch, power, last_epoch=-1): 54 | self.max_epoch = max_epoch 55 | self.power = power 56 | super(PolyScheduler, self).__init__(optimizer, last_epoch) 57 | def get_lr(self): 58 | if self.last_epoch > self.max_epoch: 59 | return [group['lr'] for group in self.optimizer.param_groups] 60 | else: 61 | return [base_lr*(1-self.last_epoch/self.max_epoch)**self.power for base_lr in self.base_lrs] 62 | 63 | 64 | class GradualWarmupScheduler(_LRScheduler): 65 | """ Gradually warm-up(increasing) learning rate in optimizer. 
66 | Proposed in 'Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour'. 67 | Args: 68 | optimizer (Optimizer): Wrapped optimizer. 69 | multiplier: target learning rate = base_lr * multiplier if multiplier > 1.0; if multiplier == 1.0, the lr starts from 0 and ends at base_lr. 70 | total_epoch: the target learning rate is reached gradually at total_epoch 71 | after_scheduler: scheduler to use after total_epoch (e.g. ReduceLROnPlateau) 72 | """ 73 | 74 | def __init__(self, optimizer, multiplier, total_epoch, after_scheduler=None, last_epoch=-1): 75 | self.multiplier = multiplier 76 | if self.multiplier < 1.: 77 | raise ValueError('multiplier should be greater than or equal to 1.') 78 | self.total_epoch = total_epoch 79 | self.after_scheduler = after_scheduler 80 | self.finished = False 81 | super(GradualWarmupScheduler, self).__init__(optimizer, last_epoch=last_epoch) 82 | 83 | def get_lr(self): 84 | if self.last_epoch > self.total_epoch: 85 | if self.after_scheduler: 86 | if not self.finished: 87 | self.after_scheduler.base_lrs = [base_lr * self.multiplier for base_lr in self.base_lrs] 88 | self.finished = True 89 | return self.after_scheduler.get_lr() 90 | return [base_lr * self.multiplier for base_lr in self.base_lrs] 91 | 92 | if self.multiplier == 1.0: 93 | return [base_lr * (float(self.last_epoch) / self.total_epoch) for base_lr in self.base_lrs] 94 | else: 95 | return [base_lr * ((self.multiplier - 1.) * self.last_epoch / self.total_epoch + 1.) for base_lr in self.base_lrs] 96 | 97 | def step_ReduceLROnPlateau(self, metrics, epoch=None): 98 | if epoch is None: 99 | epoch = self.last_epoch + 1 100 | self.last_epoch = epoch if epoch != 0 else 1 # ReduceLROnPlateau is called at the end of an epoch, whereas the others are called at the beginning 101 | if self.last_epoch <= self.total_epoch: 102 | warmup_lr = [base_lr * ((self.multiplier - 1.) * self.last_epoch / self.total_epoch + 1.) 
for base_lr in self.base_lrs] 103 | for param_group, lr in zip(self.optimizer.param_groups, warmup_lr): 104 | param_group['lr'] = lr 105 | else: 106 | if epoch is None: 107 | self.after_scheduler.step(metrics, None) 108 | else: 109 | self.after_scheduler.step(metrics, epoch - self.total_epoch) 110 | 111 | def step(self, epoch=None, metrics=None): 112 | if type(self.after_scheduler) != ReduceLROnPlateau: 113 | if self.finished and self.after_scheduler: 114 | if epoch is None: 115 | self.after_scheduler.step(None) 116 | else: 117 | self.after_scheduler.step(epoch - self.total_epoch) 118 | else: 119 | return super(GradualWarmupScheduler, self).step(epoch) 120 | else: 121 | self.step_ReduceLROnPlateau(metrics, epoch) -------------------------------------------------------------------------------- /core/paths.py: -------------------------------------------------------------------------------- 1 | import pkg_resources 2 | 3 | _package_name = __name__ 4 | 5 | def get_file_path(*paths): 6 | path = "/".join(paths) 7 | return pkg_resources.resource_filename(_package_name, path) 8 | -------------------------------------------------------------------------------- /core/sampler/collate_fn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import numpy as np 3 | import pdb 4 | 5 | def collate_fn(batch_data): 6 | images, phrases, bboxes = tuple(zip(*tuple(batch_data))) 7 | images = np.stack(images) 8 | images = torch.from_numpy(images) 9 | phrases = torch.stack(phrases) 10 | bboxes = np.stack(bboxes) 11 | bboxes = torch.from_numpy(bboxes) 12 | 13 | return { 14 | "images": images, 15 | "phrases": phrases, 16 | "labels": bboxes 17 | } 18 | 19 | def collate_fn_bert(batch_data): 20 | images, bboxes, inputs, masks = tuple(zip(*tuple(batch_data))) 21 | images = np.stack(images) 22 | images = torch.from_numpy(images) 23 | bboxes = np.stack(bboxes) 24 | bboxes = torch.from_numpy(bboxes) 25 | inputs = torch.tensor(np.stack(inputs), dtype=torch.long) 26 | masks = torch.tensor(np.stack(masks), dtype=torch.long) 27 | return { 28 | 'images': images, 29 | 'phrases': inputs, 30 | 'labels': bboxes, 31 | # 'masks': masks 32 | } -------------------------------------------------------------------------------- /core/sampler/sampler.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import cv2 3 | import random 4 | import math 5 | import numpy as np 6 | import torch 7 | import pdb 8 | # import logging 9 | 10 | from torch.utils.data import Dataset 11 | 12 | from .utils import (normalize_, color_jittering_, \ 13 | lighting_, random_flip_, random_affine_, clip_bbox_, \ 14 | show_example, random_crop_, resize_image_, valid_affine) 15 | 16 | from .utils import convert_examples_to_features, read_examples 17 | from pytorch_pretrained_bert.tokenization import BertTokenizer 18 | 19 | class Referring(Dataset): 20 | def __init__(self, db, system_configs, data_aug=True, debug=False, shuffle=False, test=False): 21 | super(Referring, self).__init__() 22 | self.test = test 23 | self._db = db 24 | self._sys_config = system_configs 25 | self.lstm = system_configs.lstm 26 | self.data_rng = system_configs.data_rng 27 | self.data_aug = data_aug 28 | self.debug = debug 29 | self.input_size = self._db.configs["input_size"] 30 | self.output_size = self._db.configs["output_sizes"] 31 | # self.rand_scales = self._db.configs["rand_scales"] 32 | self.rand_color = self._db.configs["random_color"] 33 | self.random_flip = 
self._db.configs["random_flip"] 34 | self.random_aff = self._db.configs["random_affine"] 35 | self.lighting = self._db.configs["random_lighting"] 36 | self.query_len = self._db.configs["max_query_len"] 37 | self.corpus = self._db.corpus 38 | self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True) 39 | if shuffle: 40 | self._db.shuffle_inds() 41 | 42 | def __len__(self): 43 | return len(self._db.db_inds) 44 | 45 | def _tokenize_phrase(self, phrase): 46 | return self.corpus.tokenize(phrase, self.query_len) 47 | 48 | def __getitem__(self, k_ind): 49 | db_ind = self._db.db_inds[k_ind] 50 | while True: 51 | # reading images 52 | image_path = self._db.image_path(db_ind) 53 | image = cv2.imread(image_path) 54 | # image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) 55 | # else: 56 | if not image.shape[-1] > 1: 57 | image = np.stack([image] * 3) # duplicate channel if gray image 58 | 59 | # original scale 60 | original_shape = image.shape 61 | # reading bbox annnotation 62 | bbox = self._db.annotation_box(db_ind) 63 | # reading phrase 64 | phrase = self._db.phrase(db_ind) 65 | phrase = phrase.lower() 66 | 67 | if self.data_aug: 68 | if self.random_flip and random.random() > 0.5: 69 | image, phrase, bbox = random_flip_(image, phrase, bbox.copy()) # should ensure bbox read-only 70 | 71 | # resize images 72 | image, bbox = resize_image_(image, bbox.copy(), self.input_size) 73 | if self.random_aff: 74 | aff_image, aff_bbox = random_affine_(image, bbox.copy()) 75 | if valid_affine(aff_bbox, aff_image.shape[:2]): 76 | # only keep valid_affine 77 | image = aff_image 78 | bbox = aff_bbox 79 | 80 | if self.debug and k_ind % 5000 == 0: 81 | show_example(image, bbox, phrase, name="input_sample{}".format(k_ind)) 82 | 83 | image = image.astype(np.float32) / 255. 84 | if self.rand_color: 85 | color_jittering_(self.data_rng, image) 86 | if self.lighting: 87 | lighting_(self.data_rng, image, 0.1, self._db.eig_val, self._db.eig_vec) 88 | normalize_(image, self._db.mean, self._db.std) 89 | else: ## should be inference, or specified training 90 | image, bbox = resize_image_(image, bbox.copy(), self.input_size, \ 91 | padding_color=tuple((self._db.mean * 255).tolist())) 92 | image = image.astype(np.float32) / 255. 
93 | normalize_(image, self._db.mean, self._db.std) 94 | 95 | bbox = clip_bbox_(bbox.copy(), image.shape[0:2]) 96 | 97 | if not ((bbox[2] - bbox[0] > 0) and (bbox[3] - bbox[1] > 0)): 98 | # show_example(image, bbox.copy(), phrase, name="failure_case_{}".format(k_ind)) 99 | # if failure, choose next image 100 | db_ind = random.choice(self._db.db_inds) 101 | continue 102 | 103 | image = image.transpose((2, 0, 1)) 104 | if not self.lstm: # for BERT 105 | examples = read_examples(phrase, db_ind) 106 | features = convert_examples_to_features(examples=examples, \ 107 | seq_length=self.query_len, tokenizer=self.tokenizer) 108 | word_id = features[0].input_ids 109 | word_mask = features[0].input_mask 110 | if self.test: 111 | word_id = torch.tensor(word_id, dtype=torch.long) 112 | return image, word_id, original_shape 113 | else: 114 | return image, bbox, word_id, word_mask 115 | else: # for lstm 116 | phrase = self._tokenize_phrase(phrase) 117 | if self.test: 118 | return image, phrase, original_shape 119 | else: 120 | return image, phrase, bbox 121 | -------------------------------------------------------------------------------- /core/test/__init__.py: -------------------------------------------------------------------------------- 1 | from .test import * 2 | # def test_func(sys_config, db, nnet, result_dir, debug=False): 3 | # return globals()[sys_config.sampling_function](db, nnet, result_dir, debug=debug) 4 | -------------------------------------------------------------------------------- /core/test/test.py: -------------------------------------------------------------------------------- 1 | import os 2 | import cv2 3 | import json 4 | import numpy as np 5 | import torch 6 | 7 | from tqdm import tqdm 8 | 9 | from ..utils import Timer 10 | from ..vis_utils import draw_bboxes 11 | # from ..sample.utils import crop_image 12 | from ..sampler.sampler import Referring 13 | import pdb 14 | 15 | 16 | def _decode_anchorbased(out): 17 | # feature 18 | outs = torch.cat(out, dim=1) 19 | confidence, index = outs[..., 4].max(1) 20 | x = outs[range(outs.shape[0]), index, 0] 21 | y = outs[range(outs.shape[0]), index, 1] 22 | w = outs[range(outs.shape[0]), index, 2] 23 | h = outs[range(outs.shape[0]), index, 3] 24 | bboxes = torch.stack((x-w/2, y-h/2, x+w/2, y+h/2), dim=1).cpu().data.numpy() 25 | bboxes = [bbox for bbox in bboxes] 26 | return bboxes 27 | 28 | 29 | def _topk(scores, k=1): 30 | batch, _, height, width = scores.size() 31 | topk_scores, topk_inds = torch.topk(scores.view(batch, -1), k) 32 | topk_ys = (topk_inds / width).int().float() 33 | topk_xs = (topk_inds % width).int().float() 34 | return topk_ys, topk_xs 35 | 36 | def _decode(out): 37 | tl_heats, br_heats, tl_regrs, br_regrs = out 38 | batch, _, height, width = tl_heats.shape 39 | tl_ys, tl_xs = _topk(tl_heats, k=1) 40 | br_ys, br_xs = _topk(br_heats, k=1) 41 | bboxes = [] 42 | for ind in range(batch): 43 | # TODO: only supports k = 1 here 44 | tly = tl_ys[ind][0] 45 | tlx = tl_xs[ind][0] 46 | bry = br_ys[ind][0] 47 | brx = br_xs[ind][0] 48 | tl_off_x, tl_off_y = tl_regrs[ind, :, tly.to(torch.int), tlx.to(torch.int)] 49 | br_off_x, br_off_y = br_regrs[ind, :, bry.to(torch.int), brx.to(torch.int)] 50 | bbox = tlx + tl_off_x, tly + tl_off_y, brx + br_off_x, bry + br_off_y 51 | bbox = [x.cpu().data.item() for x in bbox] 52 | bbox = np.array(bbox) 53 | bboxes.append(bbox) 54 | 55 | return bboxes 56 | 57 | def _bbox_iou(bbox1, bbox2): 58 | iou = np.zeros((4, )) 59 | iou[:2] = np.where(bbox1[:2]>bbox2[:2],bbox1[:2], bbox2[:2]) # element wise max 
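# A hedged sketch (an assumption, not the original source): the remainder of _bbox_iou presumably
# completes the standard IoU computation by taking the element-wise min of the bottom-right corners,
# clamping the overlap, and dividing the intersection area by the union. The helper names below
# (inter_wh, inter_area, area1, area2) are illustrative only.
#   iou[2:] = np.where(bbox1[2:] < bbox2[2:], bbox1[2:], bbox2[2:])  # element wise min
#   inter_wh = np.maximum(iou[2:] - iou[:2], 0.0)
#   inter_area = inter_wh[0] * inter_wh[1]
#   area1 = (bbox1[2] - bbox1[0]) * (bbox1[3] - bbox1[1])
#   area2 = (bbox2[2] - bbox2[0]) * (bbox2[3] - bbox2[1])
#   return inter_area / (area1 + area2 - inter_area)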
60 | iou[2:] = np.where(bbox1[2:]>> net = torch.nn.DataParallel(model, device_ids=[0, 1, 2]) 38 | >>> output = net(input_var) 39 | """ 40 | 41 | # TODO: update notes/cuda.rst when this class handles 8+ GPUs well 42 | 43 | def __init__(self, module, device_ids=None, output_device=None, dim=0, chunk_sizes=None): 44 | super(DataParallel, self).__init__() 45 | 46 | if not torch.cuda.is_available(): 47 | self.module = module 48 | self.device_ids = [] 49 | return 50 | 51 | if device_ids is None: 52 | device_ids = list(range(torch.cuda.device_count())) 53 | if output_device is None: 54 | output_device = device_ids[0] 55 | self.dim = dim 56 | self.module = module 57 | self.device_ids = device_ids 58 | self.chunk_sizes = chunk_sizes 59 | self.output_device = output_device 60 | if len(self.device_ids) == 1: 61 | self.module.cuda(device_ids[0]) 62 | 63 | def forward(self, *inputs, **kwargs): 64 | if not self.device_ids: 65 | return self.module(*inputs, **kwargs) 66 | inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids, self.chunk_sizes) 67 | if len(self.device_ids) == 1: 68 | return self.module(*inputs[0], **kwargs[0]) 69 | replicas = self.replicate(self.module, self.device_ids[:len(inputs)]) 70 | outputs = self.parallel_apply(replicas, inputs, kwargs) 71 | return self.gather(outputs, self.output_device) 72 | 73 | def replicate(self, module, device_ids): 74 | return replicate(module, device_ids) 75 | 76 | def scatter(self, inputs, kwargs, device_ids, chunk_sizes): 77 | return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim, chunk_sizes=self.chunk_sizes) 78 | 79 | def parallel_apply(self, replicas, inputs, kwargs): 80 | return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)]) 81 | 82 | def gather(self, outputs, output_device): 83 | return gather(outputs, output_device, dim=self.dim) 84 | 85 | 86 | def data_parallel(module, inputs, device_ids=None, output_device=None, dim=0, module_kwargs=None): 87 | r"""Evaluates module(input) in parallel across the GPUs given in device_ids. 88 | 89 | This is the functional version of the DataParallel module. 90 | 91 | Args: 92 | module: the module to evaluate in parallel 93 | inputs: inputs to the module 94 | device_ids: GPU ids on which to replicate module 95 | output_device: GPU location of the output Use -1 to indicate the CPU. 
96 | (default: device_ids[0]) 97 | Returns: 98 | a Variable containing the result of module(input) located on 99 | output_device 100 | """ 101 | if not isinstance(inputs, tuple): 102 | inputs = (inputs,) 103 | 104 | if device_ids is None: 105 | device_ids = list(range(torch.cuda.device_count())) 106 | 107 | if output_device is None: 108 | output_device = device_ids[0] 109 | 110 | inputs, module_kwargs = scatter_kwargs(inputs, module_kwargs, device_ids, dim) 111 | if len(device_ids) == 1: 112 | return module(*inputs[0], **module_kwargs[0]) 113 | used_device_ids = device_ids[:len(inputs)] 114 | replicas = replicate(module, used_device_ids) 115 | outputs = parallel_apply(replicas, inputs, module_kwargs, used_device_ids) 116 | return gather(outputs, output_device, dim) 117 | -------------------------------------------------------------------------------- /core/utils/meter.py: -------------------------------------------------------------------------------- 1 | class AverageMeter(object): 2 | """Computes and stores the average and current value""" 3 | def __init__(self): 4 | self.reset() 5 | 6 | def reset(self): 7 | self.val = 0 8 | self.avg = 0 9 | self.sum = 0 10 | self.count = 0 11 | 12 | def update(self, val, n=1): 13 | self.val = val 14 | self.sum += val * n 15 | self.count += n 16 | self.avg = self.sum / self.count -------------------------------------------------------------------------------- /core/utils/misc.py: -------------------------------------------------------------------------------- 1 | import logging 2 | def get_root_logger(rank, filename=None, log_level=logging.INFO): 3 | logger = logging.getLogger() 4 | if not logger.hasHandlers(): 5 | logging.basicConfig( 6 | filename=filename if rank == 0 else None, 7 | format='%(asctime)s - %(levelname)s - %(message)s', 8 | level=log_level) 9 | if rank != 0: 10 | logger.setLevel('ERROR') 11 | return logger 12 | 13 | def make_anchors(dataset, input_size=416): 14 | if dataset=='refeit': 15 | anchors = '30,36, 78,46, 48,86, 149,79, 82,148, 331,93, 156,207, 381,163, 329,285' 16 | elif dataset=='flickr': 17 | anchors = '29,26, 55,58, 137,71, 82,121, 124,205, 204,132, 209,263, 369,169, 352,294' 18 | else: 19 | anchors = '10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326' 20 | anchors = [float(x) for x in anchors.split(',')] 21 | anchors = [x * input_size / 416 for x in anchors] 22 | anchors_full = [(anchors[i], anchors[i + 1]) for i in range(0, len(anchors), 2)][::-1] 23 | return anchors_full 24 | -------------------------------------------------------------------------------- /core/utils/scatter_gather.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch.autograd import Variable 3 | from torch.nn.parallel._functions import Scatter, Gather 4 | 5 | 6 | def scatter(inputs, target_gpus, dim=0, chunk_sizes=None): 7 | r""" 8 | Slices variables into approximately equal chunks and 9 | distributes them across given GPUs. Duplicates 10 | references to objects that are not variables. Does not 11 | support Tensors. 12 | """ 13 | def scatter_map(obj): 14 | if isinstance(obj, Variable): 15 | return Scatter.apply(target_gpus, chunk_sizes, dim, obj) 16 | assert not torch.is_tensor(obj), "Tensors not supported in scatter." 
17 | if isinstance(obj, tuple): 18 | return list(zip(*map(scatter_map, obj))) 19 | if isinstance(obj, list): 20 | return list(map(list, zip(*map(scatter_map, obj)))) 21 | if isinstance(obj, dict): 22 | return list(map(type(obj), zip(*map(scatter_map, obj.items())))) 23 | return [obj for targets in target_gpus] 24 | 25 | return scatter_map(inputs) 26 | 27 | 28 | def scatter_kwargs(inputs, kwargs, target_gpus, dim=0, chunk_sizes=None): 29 | r"""Scatter with support for kwargs dictionary""" 30 | inputs = scatter(inputs, target_gpus, dim, chunk_sizes) if inputs else [] 31 | kwargs = scatter(kwargs, target_gpus, dim, chunk_sizes) if kwargs else [] 32 | if len(inputs) < len(kwargs): 33 | inputs.extend([() for _ in range(len(kwargs) - len(inputs))]) 34 | elif len(kwargs) < len(inputs): 35 | kwargs.extend([{} for _ in range(len(inputs) - len(kwargs))]) 36 | inputs = tuple(inputs) 37 | kwargs = tuple(kwargs) 38 | return inputs, kwargs 39 | -------------------------------------------------------------------------------- /core/utils/timer.py: -------------------------------------------------------------------------------- 1 | import time 2 | 3 | class Timer(object): 4 | """A simple timer.""" 5 | def __init__(self): 6 | self.total_time = 0. 7 | self.calls = 0 8 | self.start_time = 0. 9 | self.diff = 0. 10 | self.average_time = 0. 11 | 12 | def tic(self): 13 | # using time.time instead of time.clock because time time.clock 14 | # does not normalize for multithreading 15 | self.start_time = time.time() 16 | 17 | def toc(self, average=True): 18 | self.diff = time.time() - self.start_time 19 | self.total_time += self.diff 20 | self.calls += 1 21 | self.average_time = self.total_time / self.calls 22 | if average: 23 | return self.average_time 24 | else: 25 | return self.diff 26 | -------------------------------------------------------------------------------- /core/utils/tqdm.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import numpy as np 3 | import contextlib 4 | 5 | from tqdm import tqdm 6 | 7 | class TqdmFile(object): 8 | dummy_file = None 9 | def __init__(self, dummy_file): 10 | self.dummy_file = dummy_file 11 | 12 | def write(self, x): 13 | if len(x.rstrip()) > 0: 14 | tqdm.write(x, file=self.dummy_file) 15 | 16 | @contextlib.contextmanager 17 | def stdout_to_tqdm(): 18 | save_stdout = sys.stdout 19 | try: 20 | sys.stdout = TqdmFile(sys.stdout) 21 | yield save_stdout 22 | except Exception as exc: 23 | raise exc 24 | finally: 25 | sys.stdout = save_stdout 26 | -------------------------------------------------------------------------------- /core/vis_utils.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | import numpy as np 3 | 4 | def draw_bboxes(image, bboxes, font_size=0.5, thresh=0.5, colors=None): 5 | """Draws bounding boxes on an image. 6 | 7 | Args: 8 | image: An image in OpenCV format 9 | bboxes: A dictionary representing bounding boxes of different object 10 | categories, where the keys are the names of the categories and the 11 | values are the bounding boxes. The bounding boxes of category should be 12 | stored in a 2D NumPy array, where each row is a bounding box (x1, y1, 13 | x2, y2, score). 14 | font_size: (Optional) Font size of the category names. 15 | thresh: (Optional) Only bounding boxes with scores above the threshold 16 | will be drawn. 17 | colors: (Optional) Color of bounding boxes for each category. 
If it is 18 | not provided, this function will use random color for each category. 19 | 20 | Returns: 21 | An image with bounding boxes. 22 | """ 23 | 24 | image = image.copy() 25 | for cat_name in bboxes: 26 | keep_inds = bboxes[cat_name][:, -1] > thresh 27 | cat_size = cv2.getTextSize(cat_name, cv2.FONT_HERSHEY_SIMPLEX, font_size, 2)[0] 28 | 29 | if colors is None: 30 | color = np.random.random((3, )) * 0.6 + 0.4 31 | color = (color * 255).astype(np.int32).tolist() 32 | else: 33 | color = colors[cat_name] 34 | 35 | for bbox in bboxes[cat_name][keep_inds]: 36 | bbox = bbox[0:4].astype(np.int32) 37 | cv2.rectangle(image, 38 | (bbox[0], bbox[1]), 39 | (bbox[2], bbox[3]), 40 | color, 2 41 | ) 42 | 43 | if bbox[1] - cat_size[1] - 2 < 0: 44 | cv2.rectangle(image, 45 | (bbox[0], bbox[1] + 2), 46 | (bbox[0] + cat_size[0], bbox[1] + cat_size[1] + 2), 47 | color, -1 48 | ) 49 | cv2.putText(image, cat_name, 50 | (bbox[0], bbox[1] + cat_size[1] + 2), 51 | cv2.FONT_HERSHEY_SIMPLEX, font_size, (0, 0, 0), thickness=1 52 | ) 53 | else: 54 | cv2.rectangle(image, 55 | (bbox[0], bbox[1] - cat_size[1] - 2), 56 | (bbox[0] + cat_size[0], bbox[1] - 2), 57 | color, -1 58 | ) 59 | cv2.putText(image, cat_name, 60 | (bbox[0], bbox[1] - 2), 61 | cv2.FONT_HERSHEY_SIMPLEX, font_size, (0, 0, 0), thickness=1 62 | ) 63 | return image 64 | -------------------------------------------------------------------------------- /data/refer/download_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | version="0.1" 4 | 5 | # This is an optional arguments-only example of Argbash potential 6 | # 7 | # ARG_OPTIONAL_SINGLE([path],[p],[path onto which files are to be downloaded],[data]) 8 | # ARG_VERSION([echo test v$version]) 9 | # ARG_HELP([The general script's help msg]) 10 | # ARGBASH_GO() 11 | # needed because of Argbash --> m4_ignore([ 12 | ### START OF CODE GENERATED BY Argbash v2.5.0 one line above ### 13 | # Argbash is a bash code generator used to get arguments parsing right. 14 | # Argbash is FREE SOFTWARE, see https://argbash.io for more info 15 | # Generated online by https://argbash.io/generate 16 | 17 | die() 18 | { 19 | local _ret=$2 20 | test -n "$_ret" || _ret=1 21 | test "$_PRINT_HELP" = yes && print_help >&2 22 | echo "$1" >&2 23 | exit ${_ret} 24 | } 25 | 26 | begins_with_short_option() 27 | { 28 | local first_option all_short_options 29 | all_short_options='pvh' 30 | first_option="${1:0:1}" 31 | test "$all_short_options" = "${all_short_options/$first_option/}" && return 1 || return 0 32 | } 33 | 34 | 35 | 36 | # THE DEFAULTS INITIALIZATION - OPTIONALS 37 | _arg_path="referit_data" 38 | 39 | print_help () 40 | { 41 | printf "%s\n" "download ReferIt data script" 42 | printf 'Usage: %s [-p|--path ] [-v|--version] [-h|--help]\n' "$0" 43 | printf "\t%s\n" "-p,--path: path onto which files are to be downloaded (default: '"referit_data"')" 44 | printf "\t%s\n" "-v,--version: Prints version" 45 | printf "\t%s\n" "-h,--help: Prints help" 46 | } 47 | 48 | parse_commandline () 49 | { 50 | while test $# -gt 0 51 | do 52 | _key="$1" 53 | case "$_key" in 54 | -p|--path) 55 | test $# -lt 2 && die "Missing value for the optional argument '$_key'." 
1 56 | _arg_path="$2" 57 | shift 58 | ;; 59 | --path=*) 60 | _arg_path="${_key##--path=}" 61 | ;; 62 | -p*) 63 | _arg_path="${_key##-p}" 64 | ;; 65 | -v|--version) 66 | echo test v$version 67 | exit 0 68 | ;; 69 | -v*) 70 | echo test v$version 71 | exit 0 72 | ;; 73 | -h|--help) 74 | print_help 75 | exit 0 76 | ;; 77 | -h*) 78 | print_help 79 | exit 0 80 | ;; 81 | *) 82 | _PRINT_HELP=yes die "FATAL ERROR: Got an unexpected argument '$1'" 1 83 | ;; 84 | esac 85 | shift 86 | done 87 | } 88 | 89 | parse_commandline "$@" 90 | 91 | # OTHER STUFF GENERATED BY Argbash 92 | 93 | ### END OF CODE GENERATED BY Argbash (sortof) ### ]) 94 | # [ <-- needed because of Argbash 95 | 96 | 97 | echo "Save data to: $_arg_path" 98 | 99 | 100 | REFERIT_SPLITS_URL="https://s3-sa-east-1.amazonaws.com/query-objseg/referit_splits.tar.bz2" 101 | REFERIT_DATA_URL="http://www.eecs.berkeley.edu/~ronghang/projects/cvpr16_text_obj_retrieval/referitdata.tar.gz" 102 | COCO_DATA_URL="http://images.cocodataset.org/zips/train2014.zip" 103 | 104 | REFCOCO_URL="http://bvisionweb1.cs.unc.edu/licheng/referit/data/refcoco.zip" 105 | REFCOCO_PLUS_URL="http://bvisionweb1.cs.unc.edu/licheng/referit/data/refcoco+.zip" 106 | REFCOCOG_URL="http://bvisionweb1.cs.unc.edu/licheng/referit/data/refcocog.zip" 107 | 108 | REFERIT_FILE=${REFERIT_DATA_URL#*cvpr16_text_obj_retrieval/} 109 | SPLIT_FILE=${REFERIT_SPLITS_URL#*query-objseg/} 110 | COCO_FILE=${COCO_DATA_URL#*zips/} 111 | 112 | 113 | if [ ! -d $_arg_path ]; then 114 | mkdir $_arg_path 115 | cd $_arg_path 116 | 117 | mkdir referit 118 | cd referit 119 | 120 | printf "Downloading ReferIt dataset (This may take a while...)" 121 | aria2c -x 8 $REFERIT_DATA_URL 122 | 123 | 124 | printf "Uncompressing data..." 125 | tar -xzvf $REFERIT_FILE 126 | rm $REFERIT_FILE 127 | 128 | mkdir splits 129 | cd splits 130 | 131 | printf "Downloading ReferIt Splits..." 132 | aria2c -x 8 $REFERIT_SPLITS_URL 133 | 134 | tar -xjvf $SPLIT_FILE 135 | rm $SPLIT_FILE 136 | 137 | cd ../.. 138 | 139 | mkdir -p other/images/mscoco/images 140 | cd other/images/mscoco/images 141 | 142 | printf "Downloading MS COCO 2014 train images (This may take a while...)" 143 | aria2c -x 8 $COCO_DATA_URL 144 | 145 | unzip $COCO_FILE 146 | rm $COCO_FILE 147 | 148 | cd ../../.. 149 | printf "Downloading refcoco, refcocog and refcoco+ splits..." 
150 | aria2c -x 8 $REFCOCO_URL 151 | aria2c -x 8 $REFCOCO_PLUS_URL 152 | aria2c -x 8 $REFCOCOG_URL 153 | 154 | unzip "*.zip" 155 | rm *.zip 156 | fi -------------------------------------------------------------------------------- /demo.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import cv2 3 | import torch 4 | from core.test.test import _visualize 5 | from core.groundors import Net 6 | # pick one model 7 | cfg_file = "lbyl_bert_unc+_batch64" 8 | detector = Net(cfg_file, iter=100) 9 | # inference 10 | image = cv2.imread('imgs/demo.jpeg') 11 | phrase = 'the green gaint' 12 | bbox = detector(image, phrase) 13 | _visualize(image, pred_bbox=bbox, phrase=phrase, save_path='imgs/demo_out.jpg', color=(1, 174, 245), draw_phrase=True) -------------------------------------------------------------------------------- /evaluate.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import os.path as osp 4 | import json 5 | import torch 6 | import pprint 7 | import argparse 8 | import importlib 9 | import cv2 10 | import time 11 | from tqdm import tqdm 12 | import numpy as np 13 | import pdb 14 | from core.dbs import datasets 15 | from core.test.test import _decode, _bbox_iou, _visualize, _decode_anchorbased 16 | from core.sampler.sampler import Referring 17 | from core.sampler.collate_fn import collate_fn, collate_fn_bert 18 | from core.sampler.utils import letterbox 19 | from core.config import SystemConfig 20 | from core.nnet.nnet_factory import NetworkFactory 21 | from core.utils import make_anchors 22 | from core.paths import get_file_path 23 | torch.backends.cudnn.benchmark = False 24 | import matplotlib.pyplot as plt 25 | from matplotlib import cm 26 | from core.models.net.lbylnet import LBYLNet 27 | 28 | def parse_args(): 29 | parser = argparse.ArgumentParser(description="Evaluation Script") 30 | parser.add_argument("cfg_file", help="config file", type=str) 31 | parser.add_argument("--testiter", dest="testiter", 32 | help="test at iteration i", 33 | default=None, type=int) 34 | parser.add_argument("--split", dest="split", 35 | help="which split to use", 36 | default="validation", type=str) 37 | parser.add_argument("--suffix", dest="suffix", default=None, type=str) 38 | parser.add_argument("--debug", action="store_true") 39 | parser.add_argument("--batch_size", default=64, type=int) 40 | parser.add_argument("--visulize", action="store_true") 41 | args = parser.parse_args() 42 | return args 43 | 44 | def make_dirs(directories): 45 | for directory in directories: 46 | if not osp.exists(directory): 47 | os.makedirs(directory) 48 | 49 | def test(db, system_config, model, args): 50 | split = args.split 51 | testiter = args.testiter 52 | # debug = args.debug 53 | suffix = args.suffix 54 | bs = args.batch_size 55 | result_dir = system_config.result_dir 56 | test_iter = system_config.nb_epoch if testiter is None else testiter 57 | result_dir = osp.join(result_dir, str(testiter), split) 58 | 59 | system_config.lr = system_config.learning_rate 60 | 61 | if suffix is not None: 62 | result_dir = osp.join(result_dir, suffix) 63 | 64 | make_dirs([result_dir]) 65 | print("loading parameters at epoch: {}".format(test_iter)) 66 | print("building neural network...") 67 | nnet = NetworkFactory(system_config, model) 68 | print("loading parameters...") 69 | nnet.load_params(test_iter) 70 | 71 | dataset = Referring(db, system_config, data_aug=False, \ 72 | debug=False, 
shuffle=False, test=True) 73 | 74 | nnet.cuda() 75 | nnet.eval_mode() 76 | input_size = db.configs['input_size'] 77 | output_size = db.configs['output_sizes'] 78 | result = {} 79 | ap_metric = {} 80 | total_time = 0.0 81 | for i in range(9): 82 | ap_metric[0.5+0.05*i] = 0.0 83 | 84 | batch_size = bs 85 | for k_ind in tqdm(range(0, len(db.db_inds), batch_size)): 86 | 87 | 88 | end = time.time() 89 | if k_ind + batch_size > len(db.db_inds)-1: 90 | end_batch = len(db.db_inds) - 1 91 | else: 92 | end_batch = k_ind + batch_size 93 | 94 | db_inds = db.db_inds[k_ind:end_batch] 95 | image_paths = [db.image_path(ind) for ind in db_inds] 96 | expressions = [db.phrase(ind) for ind in db_inds] 97 | gt_bboxes = [db.annotation_box(ind) for ind in db_inds] 98 | batch = [dataset[ind] for ind in db_inds] 99 | images, phrases, original_shapes = tuple(zip(*tuple(batch))) 100 | images = np.stack(images) 101 | images = torch.from_numpy(images) 102 | phrases = torch.stack(phrases) 103 | 104 | out = nnet.test(images, phrases) 105 | bboxes = _decode_anchorbased(out) 106 | for ind, bbox in enumerate(bboxes): 107 | # revert to original scales 108 | height, width = original_shapes[ind][0:2] 109 | reshape_ratio = min(input_size[0] / height, \ 110 | input_size[1] / width) 111 | resize_shape = round(height * reshape_ratio), round(width * reshape_ratio) 112 | dh = (input_size[0] - resize_shape[0]) / 2 # height padding 113 | dw = (input_size[1] - resize_shape[1]) / 2 # width padding 114 | 115 | bbox[0:4:2] = (bbox[0:4:2] - dw) / reshape_ratio 116 | bbox[1:4:2] = (bbox[1:4:2] - dh) / reshape_ratio 117 | bbox[0:4:2] = np.clip(bbox[0:4:2], 0, width-1) 118 | bbox[1:4:2] = np.clip(bbox[1:4:2], 0, height-1) 119 | iou = _bbox_iou(bbox, gt_bboxes[ind]) 120 | 121 | for i in range(0, 9): 122 | if iou > 0.5+0.05*i: 123 | ap_metric[0.5+0.05*i] += 1.0 124 | 125 | value = { 126 | 'iou': iou, 127 | 'image_name': osp.basename(image_paths[ind]), 128 | 'pred_bbox': bbox.tolist(), 129 | 'gt_bbox': gt_bboxes[ind].tolist(), 130 | } 131 | result[k_ind+ind] = value 132 | if (k_ind + ind) % 1000 == 0: 133 | visu_dir = osp.join(result_dir, 'visulize') 134 | if not osp.exists(visu_dir): 135 | os.mkdir(visu_dir) 136 | 137 | image = cv2.imread(image_paths[ind]) 138 | _visualize(image, bbox, gt_bboxes[ind], expressions[ind], \ 139 | save_path=osp.join(visu_dir, "{}-{}".format(k_ind+ind, osp.basename(image_paths[ind])))) 140 | 141 | 142 | for i in range(0, 9): 143 | key = 0.5+0.05*i 144 | ap_metric[key] /= len(db.db_inds) 145 | print("Average Precision (AP) @[ IoU={:.2f}\t| area=\tall | = {:.4f}".format(key, ap_metric[key])) 146 | with open(osp.join(result_dir, "metrics.json"), "w") as f: 147 | json.dump(ap_metric, f) 148 | with open(osp.join(result_dir, "results.json"), "w") as f: 149 | json.dump(result, f) 150 | 151 | print("{:.2f} s / image ".format(total_time / len(db.db_inds))) 152 | print("evaluate finish...") 153 | 154 | 155 | def main(args): 156 | if args.suffix is None: 157 | cfg_file = osp.join("./configs", args.cfg_file + ".json") 158 | else: 159 | cfg_file = osp.join("./configs", args.cfg_file + "-{}.json".format(args.suffix)) 160 | print("cfg_file: {}".format(cfg_file)) 161 | 162 | with open(cfg_file, "r") as f: 163 | config = json.load(f) 164 | 165 | config["system"]["snapshot_name"] = args.cfg_file 166 | system_config = SystemConfig().update_config(config["system"]) 167 | anchors = make_anchors(system_config.dataset, 416) 168 | config["db"]["anchors"] = anchors 169 | config["db"]["corpus_path"] = get_file_path("..", "data", "refer", 
"data", config["system"]["dataset"], "corpus.pth") 170 | 171 | model = LBYLNet(system_config, config["db"]) 172 | train_split = system_config.train_split 173 | val_split = system_config.val_split 174 | test_split = system_config.test_split 175 | 176 | if args.split not in ["training", "validation", "testing"]: 177 | split = args.split 178 | else: 179 | split = { 180 | "training": train_split, 181 | "validation": val_split, 182 | "testing": test_split 183 | }[args.split] 184 | 185 | print("loading all datasets...") 186 | dataset = system_config.dataset 187 | print("split: {}".format(split)) 188 | testing_db = datasets['refer'](config["db"], split=split, sys_config=system_config) 189 | 190 | print("system config...") 191 | pprint.pprint(system_config.full) 192 | 193 | print("db config...") 194 | pprint.pprint(testing_db.configs) 195 | 196 | test(testing_db, system_config, model, args) 197 | 198 | if __name__ == "__main__": 199 | args = parse_args() 200 | main(args) 201 | -------------------------------------------------------------------------------- /ext/yolov3.cfg: -------------------------------------------------------------------------------- 1 | [net] 2 | # Testing 3 | #batch=1 4 | #subdivisions=1 5 | # Training 6 | batch=16 7 | subdivisions=1 8 | width=416 9 | height=416 10 | channels=3 11 | momentum=0.9 12 | decay=0.0005 13 | angle=0 14 | saturation = 1.5 15 | exposure = 1.5 16 | hue=.1 17 | 18 | learning_rate=0.001 19 | burn_in=1000 20 | max_batches = 500200 21 | policy=steps 22 | steps=400000,450000 23 | scales=.1,.1 24 | 25 | [convolutional] 26 | batch_normalize=1 27 | filters=32 28 | size=3 29 | stride=1 30 | pad=1 31 | activation=leaky 32 | 33 | # Downsample 34 | 35 | [convolutional] 36 | batch_normalize=1 37 | filters=64 38 | size=3 39 | stride=2 40 | pad=1 41 | activation=leaky 42 | 43 | [convolutional] 44 | batch_normalize=1 45 | filters=32 46 | size=1 47 | stride=1 48 | pad=1 49 | activation=leaky 50 | 51 | [convolutional] 52 | batch_normalize=1 53 | filters=64 54 | size=3 55 | stride=1 56 | pad=1 57 | activation=leaky 58 | 59 | [shortcut] 60 | from=-3 61 | activation=linear 62 | 63 | # Downsample 64 | 65 | [convolutional] 66 | batch_normalize=1 67 | filters=128 68 | size=3 69 | stride=2 70 | pad=1 71 | activation=leaky 72 | 73 | [convolutional] 74 | batch_normalize=1 75 | filters=64 76 | size=1 77 | stride=1 78 | pad=1 79 | activation=leaky 80 | 81 | [convolutional] 82 | batch_normalize=1 83 | filters=128 84 | size=3 85 | stride=1 86 | pad=1 87 | activation=leaky 88 | 89 | [shortcut] 90 | from=-3 91 | activation=linear 92 | 93 | [convolutional] 94 | batch_normalize=1 95 | filters=64 96 | size=1 97 | stride=1 98 | pad=1 99 | activation=leaky 100 | 101 | [convolutional] 102 | batch_normalize=1 103 | filters=128 104 | size=3 105 | stride=1 106 | pad=1 107 | activation=leaky 108 | 109 | [shortcut] 110 | from=-3 111 | activation=linear 112 | 113 | # Downsample 114 | 115 | [convolutional] 116 | batch_normalize=1 117 | filters=256 118 | size=3 119 | stride=2 120 | pad=1 121 | activation=leaky 122 | 123 | [convolutional] 124 | batch_normalize=1 125 | filters=128 126 | size=1 127 | stride=1 128 | pad=1 129 | activation=leaky 130 | 131 | [convolutional] 132 | batch_normalize=1 133 | filters=256 134 | size=3 135 | stride=1 136 | pad=1 137 | activation=leaky 138 | 139 | [shortcut] 140 | from=-3 141 | activation=linear 142 | 143 | [convolutional] 144 | batch_normalize=1 145 | filters=128 146 | size=1 147 | stride=1 148 | pad=1 149 | activation=leaky 150 | 151 | [convolutional] 152 | 
batch_normalize=1 153 | filters=256 154 | size=3 155 | stride=1 156 | pad=1 157 | activation=leaky 158 | 159 | [shortcut] 160 | from=-3 161 | activation=linear 162 | 163 | [convolutional] 164 | batch_normalize=1 165 | filters=128 166 | size=1 167 | stride=1 168 | pad=1 169 | activation=leaky 170 | 171 | [convolutional] 172 | batch_normalize=1 173 | filters=256 174 | size=3 175 | stride=1 176 | pad=1 177 | activation=leaky 178 | 179 | [shortcut] 180 | from=-3 181 | activation=linear 182 | 183 | [convolutional] 184 | batch_normalize=1 185 | filters=128 186 | size=1 187 | stride=1 188 | pad=1 189 | activation=leaky 190 | 191 | [convolutional] 192 | batch_normalize=1 193 | filters=256 194 | size=3 195 | stride=1 196 | pad=1 197 | activation=leaky 198 | 199 | [shortcut] 200 | from=-3 201 | activation=linear 202 | 203 | 204 | [convolutional] 205 | batch_normalize=1 206 | filters=128 207 | size=1 208 | stride=1 209 | pad=1 210 | activation=leaky 211 | 212 | [convolutional] 213 | batch_normalize=1 214 | filters=256 215 | size=3 216 | stride=1 217 | pad=1 218 | activation=leaky 219 | 220 | [shortcut] 221 | from=-3 222 | activation=linear 223 | 224 | [convolutional] 225 | batch_normalize=1 226 | filters=128 227 | size=1 228 | stride=1 229 | pad=1 230 | activation=leaky 231 | 232 | [convolutional] 233 | batch_normalize=1 234 | filters=256 235 | size=3 236 | stride=1 237 | pad=1 238 | activation=leaky 239 | 240 | [shortcut] 241 | from=-3 242 | activation=linear 243 | 244 | [convolutional] 245 | batch_normalize=1 246 | filters=128 247 | size=1 248 | stride=1 249 | pad=1 250 | activation=leaky 251 | 252 | [convolutional] 253 | batch_normalize=1 254 | filters=256 255 | size=3 256 | stride=1 257 | pad=1 258 | activation=leaky 259 | 260 | [shortcut] 261 | from=-3 262 | activation=linear 263 | 264 | [convolutional] 265 | batch_normalize=1 266 | filters=128 267 | size=1 268 | stride=1 269 | pad=1 270 | activation=leaky 271 | 272 | [convolutional] 273 | batch_normalize=1 274 | filters=256 275 | size=3 276 | stride=1 277 | pad=1 278 | activation=leaky 279 | 280 | [shortcut] 281 | from=-3 282 | activation=linear 283 | 284 | # Downsample 285 | 286 | [convolutional] 287 | batch_normalize=1 288 | filters=512 289 | size=3 290 | stride=2 291 | pad=1 292 | activation=leaky 293 | 294 | [convolutional] 295 | batch_normalize=1 296 | filters=256 297 | size=1 298 | stride=1 299 | pad=1 300 | activation=leaky 301 | 302 | [convolutional] 303 | batch_normalize=1 304 | filters=512 305 | size=3 306 | stride=1 307 | pad=1 308 | activation=leaky 309 | 310 | [shortcut] 311 | from=-3 312 | activation=linear 313 | 314 | 315 | [convolutional] 316 | batch_normalize=1 317 | filters=256 318 | size=1 319 | stride=1 320 | pad=1 321 | activation=leaky 322 | 323 | [convolutional] 324 | batch_normalize=1 325 | filters=512 326 | size=3 327 | stride=1 328 | pad=1 329 | activation=leaky 330 | 331 | [shortcut] 332 | from=-3 333 | activation=linear 334 | 335 | 336 | [convolutional] 337 | batch_normalize=1 338 | filters=256 339 | size=1 340 | stride=1 341 | pad=1 342 | activation=leaky 343 | 344 | [convolutional] 345 | batch_normalize=1 346 | filters=512 347 | size=3 348 | stride=1 349 | pad=1 350 | activation=leaky 351 | 352 | [shortcut] 353 | from=-3 354 | activation=linear 355 | 356 | 357 | [convolutional] 358 | batch_normalize=1 359 | filters=256 360 | size=1 361 | stride=1 362 | pad=1 363 | activation=leaky 364 | 365 | [convolutional] 366 | batch_normalize=1 367 | filters=512 368 | size=3 369 | stride=1 370 | pad=1 371 | activation=leaky 372 
| 373 | [shortcut] 374 | from=-3 375 | activation=linear 376 | 377 | [convolutional] 378 | batch_normalize=1 379 | filters=256 380 | size=1 381 | stride=1 382 | pad=1 383 | activation=leaky 384 | 385 | [convolutional] 386 | batch_normalize=1 387 | filters=512 388 | size=3 389 | stride=1 390 | pad=1 391 | activation=leaky 392 | 393 | [shortcut] 394 | from=-3 395 | activation=linear 396 | 397 | 398 | [convolutional] 399 | batch_normalize=1 400 | filters=256 401 | size=1 402 | stride=1 403 | pad=1 404 | activation=leaky 405 | 406 | [convolutional] 407 | batch_normalize=1 408 | filters=512 409 | size=3 410 | stride=1 411 | pad=1 412 | activation=leaky 413 | 414 | [shortcut] 415 | from=-3 416 | activation=linear 417 | 418 | 419 | [convolutional] 420 | batch_normalize=1 421 | filters=256 422 | size=1 423 | stride=1 424 | pad=1 425 | activation=leaky 426 | 427 | [convolutional] 428 | batch_normalize=1 429 | filters=512 430 | size=3 431 | stride=1 432 | pad=1 433 | activation=leaky 434 | 435 | [shortcut] 436 | from=-3 437 | activation=linear 438 | 439 | [convolutional] 440 | batch_normalize=1 441 | filters=256 442 | size=1 443 | stride=1 444 | pad=1 445 | activation=leaky 446 | 447 | [convolutional] 448 | batch_normalize=1 449 | filters=512 450 | size=3 451 | stride=1 452 | pad=1 453 | activation=leaky 454 | 455 | [shortcut] 456 | from=-3 457 | activation=linear 458 | 459 | # Downsample 460 | 461 | [convolutional] 462 | batch_normalize=1 463 | filters=1024 464 | size=3 465 | stride=2 466 | pad=1 467 | activation=leaky 468 | 469 | [convolutional] 470 | batch_normalize=1 471 | filters=512 472 | size=1 473 | stride=1 474 | pad=1 475 | activation=leaky 476 | 477 | [convolutional] 478 | batch_normalize=1 479 | filters=1024 480 | size=3 481 | stride=1 482 | pad=1 483 | activation=leaky 484 | 485 | [shortcut] 486 | from=-3 487 | activation=linear 488 | 489 | [convolutional] 490 | batch_normalize=1 491 | filters=512 492 | size=1 493 | stride=1 494 | pad=1 495 | activation=leaky 496 | 497 | [convolutional] 498 | batch_normalize=1 499 | filters=1024 500 | size=3 501 | stride=1 502 | pad=1 503 | activation=leaky 504 | 505 | [shortcut] 506 | from=-3 507 | activation=linear 508 | 509 | [convolutional] 510 | batch_normalize=1 511 | filters=512 512 | size=1 513 | stride=1 514 | pad=1 515 | activation=leaky 516 | 517 | [convolutional] 518 | batch_normalize=1 519 | filters=1024 520 | size=3 521 | stride=1 522 | pad=1 523 | activation=leaky 524 | 525 | [shortcut] 526 | from=-3 527 | activation=linear 528 | 529 | [convolutional] 530 | batch_normalize=1 531 | filters=512 532 | size=1 533 | stride=1 534 | pad=1 535 | activation=leaky 536 | 537 | [convolutional] 538 | batch_normalize=1 539 | filters=1024 540 | size=3 541 | stride=1 542 | pad=1 543 | activation=leaky 544 | 545 | [shortcut] 546 | from=-3 547 | activation=linear 548 | 549 | ###################### 550 | 551 | [convolutional] 552 | batch_normalize=1 553 | filters=512 554 | size=1 555 | stride=1 556 | pad=1 557 | activation=leaky 558 | 559 | [convolutional] 560 | batch_normalize=1 561 | size=3 562 | stride=1 563 | pad=1 564 | filters=1024 565 | activation=leaky 566 | 567 | [convolutional] 568 | batch_normalize=1 569 | filters=512 570 | size=1 571 | stride=1 572 | pad=1 573 | activation=leaky 574 | 575 | [convolutional] 576 | batch_normalize=1 577 | size=3 578 | stride=1 579 | pad=1 580 | filters=1024 581 | activation=leaky 582 | 583 | [yoloconvolutional] 584 | batch_normalize=1 585 | filters=512 586 | size=1 587 | stride=1 588 | pad=1 589 | activation=leaky 
590 | 591 | [convolutional] 592 | batch_normalize=1 593 | size=3 594 | stride=1 595 | pad=1 596 | filters=1024 597 | activation=leaky 598 | 599 | [convolutional] 600 | size=1 601 | stride=1 602 | pad=1 603 | filters=255 604 | activation=linear 605 | 606 | 607 | [yolo] 608 | mask = 6,7,8 609 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 610 | classes=80 611 | num=9 612 | jitter=.3 613 | ignore_thresh = .7 614 | truth_thresh = 1 615 | random=1 616 | 617 | 618 | [route] 619 | layers = -4 620 | 621 | [convolutional] 622 | batch_normalize=1 623 | filters=256 624 | size=1 625 | stride=1 626 | pad=1 627 | activation=leaky 628 | 629 | [upsample] 630 | stride=2 631 | 632 | [route] 633 | layers = -1, 61 634 | 635 | 636 | 637 | [convolutional] 638 | batch_normalize=1 639 | filters=256 640 | size=1 641 | stride=1 642 | pad=1 643 | activation=leaky 644 | 645 | [convolutional] 646 | batch_normalize=1 647 | size=3 648 | stride=1 649 | pad=1 650 | filters=512 651 | activation=leaky 652 | 653 | [convolutional] 654 | batch_normalize=1 655 | filters=256 656 | size=1 657 | stride=1 658 | pad=1 659 | activation=leaky 660 | 661 | [convolutional] 662 | batch_normalize=1 663 | size=3 664 | stride=1 665 | pad=1 666 | filters=512 667 | activation=leaky 668 | 669 | [yoloconvolutional] 670 | batch_normalize=1 671 | filters=256 672 | size=1 673 | stride=1 674 | pad=1 675 | activation=leaky 676 | 677 | [convolutional] 678 | batch_normalize=1 679 | size=3 680 | stride=1 681 | pad=1 682 | filters=512 683 | activation=leaky 684 | 685 | [convolutional] 686 | size=1 687 | stride=1 688 | pad=1 689 | filters=255 690 | activation=linear 691 | 692 | 693 | [yolo] 694 | mask = 3,4,5 695 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 696 | classes=80 697 | num=9 698 | jitter=.3 699 | ignore_thresh = .7 700 | truth_thresh = 1 701 | random=1 702 | 703 | 704 | 705 | [route] 706 | layers = -4 707 | 708 | [convolutional] 709 | batch_normalize=1 710 | filters=128 711 | size=1 712 | stride=1 713 | pad=1 714 | activation=leaky 715 | 716 | [upsample] 717 | stride=2 718 | 719 | [route] 720 | layers = -1, 36 721 | 722 | 723 | 724 | [convolutional] 725 | batch_normalize=1 726 | filters=128 727 | size=1 728 | stride=1 729 | pad=1 730 | activation=leaky 731 | 732 | [convolutional] 733 | batch_normalize=1 734 | size=3 735 | stride=1 736 | pad=1 737 | filters=256 738 | activation=leaky 739 | 740 | [convolutional] 741 | batch_normalize=1 742 | filters=128 743 | size=1 744 | stride=1 745 | pad=1 746 | activation=leaky 747 | 748 | [convolutional] 749 | batch_normalize=1 750 | size=3 751 | stride=1 752 | pad=1 753 | filters=256 754 | activation=leaky 755 | 756 | [yoloconvolutional] 757 | batch_normalize=1 758 | filters=128 759 | size=1 760 | stride=1 761 | pad=1 762 | activation=leaky 763 | 764 | [convolutional] 765 | batch_normalize=1 766 | size=3 767 | stride=1 768 | pad=1 769 | filters=256 770 | activation=leaky 771 | 772 | [convolutional] 773 | size=1 774 | stride=1 775 | pad=1 776 | filters=255 777 | activation=linear 778 | 779 | 780 | [yolo] 781 | mask = 0,1,2 782 | anchors = 10,13, 16,30, 33,23, 30,61, 62,45, 59,119, 116,90, 156,198, 373,326 783 | classes=80 784 | num=9 785 | jitter=.3 786 | ignore_thresh = .7 787 | truth_thresh = 1 788 | random=1 789 | -------------------------------------------------------------------------------- /imgs/demo.jpeg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/svip-lab/LBYLNet/47a0e83b6db60d7efe1f74acdf4cb210ffd9554d/imgs/demo.jpeg -------------------------------------------------------------------------------- /imgs/demo_out.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/svip-lab/LBYLNet/47a0e83b6db60d7efe1f74acdf4cb210ffd9554d/imgs/demo_out.jpg -------------------------------------------------------------------------------- /imgs/landmarks.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/svip-lab/LBYLNet/47a0e83b6db60d7efe1f74acdf4cb210ffd9554d/imgs/landmarks.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # This file may be used to create an environment using: 2 | # $ conda create --name --file 3 | # platform: linux-64 4 | cython=0.28.5=py37hf484d3e_0 5 | opencv-python=4.2.0.34=pypi_0 6 | packaging=20.3=pypi_0 7 | pillow=5.2.0=py37heded4f4_0 8 | pytorch-pretrained-bert=0.6.2=pypi_0 9 | tqdm=4.45.0=pypi_0 10 | # python=3.7.1=h0371630_3 11 | # torch=1.6.0+cu92=pypi_0 12 | # torchvision=0.7.0+cu92=pypi_0 13 | -------------------------------------------------------------------------------- /results/lbyl_bert_gref_batch64/gref/30/val/metrics.json: -------------------------------------------------------------------------------- 1 | {"0.5": 0.6320260067114094, "0.55": 0.6162961409395973, "0.6": 0.5978397651006712, "0.65": 0.574244966442953, "0.7": 0.5450922818791947, "0.75": 0.49580536912751677, "0.8": 0.42554530201342283, "0.8500000000000001": 0.30379614093959734, "0.9": 0.1368498322147651} -------------------------------------------------------------------------------- /results/lbyl_bert_referit_batch64/referit/100/test/metrics.json: -------------------------------------------------------------------------------- 1 | {"0.5": 0.684574813237276, "0.55": 0.6560737400795301, "0.6": 0.6213999301199608, "0.65": 0.580054240220954, "0.7": 0.529241468811873, "0.75": 0.4636041462156631, "0.8": 0.38087949020847545, "0.8500000000000001": 0.283330283014159, "0.9": 0.17929221503086368} -------------------------------------------------------------------------------- /results/lbyl_bert_unc+_batch64/unc+/100/testA/metrics.json: -------------------------------------------------------------------------------- 1 | {"0.5": 0.737338456164862, "0.55": 0.7259867272092211, "0.6": 0.7155082081732449, "0.65": 0.6938526021655606, "0.7": 0.66765630457562, "0.75": 0.6281872162067761, "0.8": 0.5618232623122599, "0.8500000000000001": 0.4414949353824659, "0.9": 0.24869018512050298} -------------------------------------------------------------------------------- /results/lbyl_bert_unc+_batch64/unc+/100/testB/metrics.json: -------------------------------------------------------------------------------- 1 | {"0.5": 0.5962364491716098, "0.55": 0.5808958887298016, "0.6": 0.5606463489466148, "0.65": 0.5397831867457558, "0.7": 0.5082839026385764, "0.75": 0.46573941501329513, "0.8": 0.39946819390468397, "0.8500000000000001": 0.298425035794641, "0.9": 0.1650644303538556} -------------------------------------------------------------------------------- /results/lbyl_bert_unc+_batch64/unc+/100/val/metrics.json: -------------------------------------------------------------------------------- 1 | {"0.5": 0.6877672429819669, "0.55": 0.675404350250976, "0.6": 0.6600669269380925, 
"0.65": 0.640732478155791, "0.7": 0.6134969325153374, "0.75": 0.5737125859825246, "0.8": 0.5062279234058376, "0.8500000000000001": 0.3991448224577059, "0.9": 0.22308979364194087} -------------------------------------------------------------------------------- /results/lbyl_bert_unc_batch64/unc/100/testA/metrics.json: -------------------------------------------------------------------------------- 1 | {"0.5": 0.8281774792292734, "0.55": 0.8106770372989217, "0.6": 0.7986565317305993, "0.65": 0.7836308997701962, "0.7": 0.7542867244122327, "0.75": 0.7088562842496022, "0.8": 0.6308997701962171, "0.8500000000000001": 0.49443167756761536, "0.9": 0.29503270284603145} -------------------------------------------------------------------------------- /results/lbyl_bert_unc_batch64/unc/100/testB/metrics.json: -------------------------------------------------------------------------------- 1 | {"0.5": 0.7281648675171737, "0.55": 0.7099116781157998, "0.6": 0.6885181550539745, "0.65": 0.656918547595682, "0.7": 0.6239450441609421, "0.75": 0.57252208047105, "0.8": 0.4887144259077527, "0.8500000000000001": 0.3746810598626104, "0.9": 0.20372914622178606} -------------------------------------------------------------------------------- /results/lbyl_bert_unc_batch64/unc/100/val/metrics.json: -------------------------------------------------------------------------------- 1 | {"0.5": 0.7972124792320473, "0.55": 0.781613439172974, "0.6": 0.7656451910651653, "0.65": 0.7409082517998893, "0.7": 0.7073103193649621, "0.75": 0.6617131253461326, "0.8": 0.5786413143806535, "0.8500000000000001": 0.45071072549381574, "0.9": 0.2515229832010338} -------------------------------------------------------------------------------- /results/lbyl_lstm_gref_batch64/gref/30/val/metrics.json: -------------------------------------------------------------------------------- 1 | {"0.5": 0.600356543624161, "0.55": 0.5834731543624161, "0.6": 0.5631291946308725, "0.65": 0.5400587248322147, "0.7": 0.5073406040268457, "0.75": 0.4597315436241611, "0.8": 0.37531459731543626, "0.8500000000000001": 0.2595427852348993, "0.9": 0.11461828859060402} -------------------------------------------------------------------------------- /results/lbyl_lstm_referit_batch64/referit/100/test/metrics.json: -------------------------------------------------------------------------------- 1 | {"0.5": 0.6598339517162205, "0.55": 0.6309502021529707, "0.6": 0.5985724506264246, "0.65": 0.557226760727418, "0.7": 0.5064805417366853, "0.75": 0.4431392775734988, "0.8": 0.3635592233332779, "0.8500000000000001": 0.2703026471224398, "0.9": 0.17093988652812672} -------------------------------------------------------------------------------- /results/lbyl_lstm_referit_batch64/referit/100/validation/metrics.json: -------------------------------------------------------------------------------- 1 | {"0.5": 0.6699760356042451, "0.55": 0.6413899349537829, "0.6": 0.609038000684697, "0.65": 0.5703526189661074, "0.7": 0.5178021225607669, "0.75": 0.45583704210886683, "0.8": 0.37709688462855184, "0.8500000000000001": 0.2839780896953098, "0.9": 0.17305717220130093} -------------------------------------------------------------------------------- /results/lbyl_lstm_unc+_batch64/unc+/100/testA/metrics.json: -------------------------------------------------------------------------------- 1 | {"0.5": 0.726336011177087, "0.55": 0.712190010478519, "0.6": 0.6969961578763535, "0.65": 0.6790080335312609, "0.7": 0.6496681802305274, "0.75": 0.6063569682151589, "0.8": 0.5415647921760391, 
"0.8500000000000001": 0.4268250087320992, "0.9": 0.24118057981138666} -------------------------------------------------------------------------------- /results/lbyl_lstm_unc+_batch64/unc+/100/testB/metrics.json: -------------------------------------------------------------------------------- 1 | {"0.5": 0.5588054816935979, "0.55": 0.541419513192882, "0.6": 0.5244426263039477, "0.65": 0.5056248721619964, "0.7": 0.4755573736960524, "0.75": 0.4315811004295357, "0.8": 0.3706279402740847, "0.8500000000000001": 0.28083452648803436, "0.9": 0.1511556555532829} -------------------------------------------------------------------------------- /results/lbyl_lstm_unc+_batch64/unc+/100/val/metrics.json: -------------------------------------------------------------------------------- 1 | {"0.5": 0.6671314370700874, "0.55": 0.6546755902584124, "0.6": 0.6409183863171594, "0.65": 0.6232571109871723, "0.7": 0.5980665551217699, "0.75": 0.5547499535229596, "0.8": 0.48680052054285183, "0.8500000000000001": 0.3747908533184607, "0.9": 0.20133853876185165} -------------------------------------------------------------------------------- /results/lbyl_lstm_unc_batch64/unc/100/testA/metrics.json: -------------------------------------------------------------------------------- 1 | {"0.5": 0.8248188085557716, "0.55": 0.8135053915502917, "0.6": 0.8028990631076542, "0.65": 0.7815096340816687, "0.7": 0.7486300159094926, "0.75": 0.6998409050733604, "0.8": 0.6238288845677921, "0.8500000000000001": 0.49602262683401094, "0.9": 0.2923811207353721} -------------------------------------------------------------------------------- /results/lbyl_lstm_unc_batch64/unc/100/testB/metrics.json: -------------------------------------------------------------------------------- 1 | {"0.5": 0.7177625122669283, "0.55": 0.7004906771344456, "0.6": 0.6824337585868498, "0.65": 0.658684985279686, "0.7": 0.6162904808635917, "0.75": 0.5631010794896958, "0.8": 0.47968596663395485, "0.8500000000000001": 0.37369970559371934, "0.9": 0.20255152109911678} -------------------------------------------------------------------------------- /results/lbyl_lstm_unc_batch64/unc/100/val/metrics.json: -------------------------------------------------------------------------------- 1 | {"0.5": 0.785028613623777, "0.55": 0.7688757614916005, "0.6": 0.7529075133837918, "0.65": 0.7304781244231124, "0.7": 0.6947572457079564, "0.75": 0.6525752261399298, "0.8": 0.5717186634668636, "0.8500000000000001": 0.4464648329333579, "0.9": 0.2544766475909175} -------------------------------------------------------------------------------- /train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | import os 3 | import json 4 | import torch 5 | import numpy as np 6 | import queue 7 | import pprint 8 | import random 9 | import argparse 10 | import importlib 11 | import threading 12 | import traceback 13 | import time 14 | import logging 15 | import torch.distributed as dist 16 | from torch.utils.data.distributed import DistributedSampler 17 | import torch.multiprocessing as mp 18 | from torch.utils.data import DataLoader 19 | 20 | from tqdm import tqdm 21 | from torch.multiprocessing import Process, Queue, Pool 22 | 23 | from core.dbs import datasets 24 | from core.utils import stdout_to_tqdm, AverageMeter, make_anchors, get_root_logger 25 | from core.config import SystemConfig 26 | from core.nnet.nnet_factory import NetworkFactory 27 | from core.sampler.sampler import Referring 28 | from core.sampler.collate_fn import 
collate_fn, collate_fn_bert 29 | from core.optimizer.lr_scheduler import make_scheduler 30 | from core.models.net.lbylnet import LBYLNet 31 | from core.models.net.baseline import Baseline 32 | from core.paths import get_file_path 33 | import pdb 34 | seed = 413 35 | random.seed(seed) 36 | np.random.seed(seed+1) 37 | torch.manual_seed(seed+2) 38 | torch.cuda.manual_seed_all(seed+3) 39 | torch.backends.cudnn.enabled = True 40 | torch.backends.cudnn.benchmark = True 41 | 42 | def parse_args(): 43 | parser = argparse.ArgumentParser(description="Training Script") 44 | parser.add_argument("cfg_file", help="config file", type=str) 45 | parser.add_argument("--start_epoch", dest="start_epoch", 46 | help="train at iteration i", 47 | default=0, type=int) 48 | parser.add_argument("--workers", default=4, type=int) 49 | parser.add_argument("--initialize", action="store_true") 50 | parser.add_argument("--lr_type", default='step', type=str) 51 | parser.add_argument("--distributed", action="store_true") 52 | parser.add_argument("--world_size", default=-1, type=int, 53 | help="number of nodes of distributed training") 54 | parser.add_argument("--rank", default=0, type=int, 55 | help="node rank for distributed training") 56 | parser.add_argument("--dist_url", default=None, type=str, 57 | help="url used to set up distributed training") 58 | parser.add_argument("--dist_backend", default="nccl", type=str) 59 | parser.add_argument("--dataset", default=None, type=str) 60 | args = parser.parse_args() 61 | return args 62 | 63 | def val_epoch(nnet, val_loader, rank, epoch, lr, print_freq, args): 64 | logger = args.logger 65 | 66 | def reduce_tensor(inp, average=False): 67 | """ 68 | Reduce the loss from all the process so 69 | that process with rank 0 has average result. 70 | """ 71 | if args.world_size < 2: 72 | return inp 73 | with torch.no_grad(): 74 | reduced_inp = inp 75 | dist.reduce(reduced_inp, dst=0) 76 | if average: 77 | reduced_inp = reduced_inp / args.world_size 78 | return reduced_inp 79 | 80 | batch_time = AverageMeter() 81 | data_time = AverageMeter() 82 | losses = AverageMeter() 83 | focal_losses = AverageMeter() 84 | off_losses = AverageMeter() 85 | end = time.time() 86 | nnet.eval_mode() 87 | for iter, batch in enumerate(val_loader): 88 | data_time.update(time.time()-end) 89 | loss, focal_loss, off_loss = nnet.validate(**batch) 90 | 91 | if args.distributed: 92 | loss = reduce_tensor(loss, average=True) 93 | focal_loss = reduce_tensor(focal_loss, average=True) 94 | off_loss = reduce_tensor(off_loss, average=True) 95 | 96 | losses.update(loss.item()) 97 | focal_losses.update(focal_loss.item()) 98 | off_losses.update(off_loss.item()) 99 | batch_time.update(time.time() - end) 100 | 101 | 102 | if rank==0 and print_freq and (iter+1) % print_freq == 0: 103 | 104 | message = ('Process {}\t' 105 | 'epoch[{}][{}/{}]\t' \ 106 | 'time {time.val:.3f} ({time.avg:.3f})\t' \ 107 | 'data {data_time.val:.3f} ({data_time.avg:.3f})\t' \ 108 | 'loss {losses.val:.4f} ({losses.avg:.4f})\t'\ 109 | 'rank loss {focal_losses.val:.4f} ({focal_losses.avg:.4f})\t'\ 110 | 'offs loss {off_losses.val:.4f} ({off_losses.avg:4f})\t' \ 111 | 'lr {lr:.8f}'.format( 112 | rank, epoch, iter+1, len(val_loader), 113 | time=batch_time, data_time=data_time, losses=losses, 114 | focal_losses =focal_losses, 115 | off_losses=off_losses, lr=lr 116 | )) 117 | logger.info(message) 118 | print(message) 119 | 120 | end = time.time() 121 | 122 | return losses.avg, focal_losses.avg, off_losses.avg 123 | 124 | def train_epoch(nnet, train_loader, rank, 
epoch, lr, print_freq, args): 125 | logger = args.logger 126 | 127 | def reduce_tensor(inp, average=False): 128 | """ 129 | Reduce the loss from all the process so 130 | that process with rank 0 has average result. 131 | """ 132 | if args.world_size < 2: 133 | return inp 134 | with torch.no_grad(): 135 | reduced_inp = inp 136 | dist.reduce(reduced_inp, dst=0) 137 | if average: 138 | reduced_inp = reduced_inp / args.world_size 139 | return reduced_inp 140 | 141 | batch_time = AverageMeter() 142 | data_time = AverageMeter() 143 | losses = AverageMeter() 144 | focal_losses = AverageMeter() 145 | off_losses = AverageMeter() 146 | end = time.time() 147 | nnet.train_mode() 148 | for iter, batch in enumerate(train_loader): 149 | data_time.update(time.time()-end) 150 | loss, focal_loss, off_loss = nnet.train(**batch) 151 | 152 | if args.distributed: 153 | loss = reduce_tensor(loss, average=True) 154 | focal_loss = reduce_tensor(focal_loss, average=True) 155 | off_loss = reduce_tensor(off_loss, average=True) 156 | 157 | losses.update(loss.item()) 158 | focal_losses.update(focal_loss.item()) 159 | off_losses.update(off_loss.item()) 160 | batch_time.update(time.time() - end) 161 | if rank==0 and print_freq and (iter+1) % print_freq == 0: 162 | 163 | message = ('Process {}\t' 164 | 'epoch[{}][{}/{}]\t' \ 165 | 'time {time.val:.3f} ({time.avg:.3f})\t' \ 166 | 'data {data_time.val:.3f} ({data_time.avg:.3f})\t' \ 167 | 'loss {losses.val:.4f} ({losses.avg:.4f})\t'\ 168 | 'rank loss {focal_losses.val:.4f} ({focal_losses.avg:.4f})\t'\ 169 | 'offs loss {off_losses.val:.4f} ({off_losses.avg:4f})\t' \ 170 | 'lr {lr:.8f}'.format( 171 | rank, epoch, iter+1, len(train_loader), 172 | time=batch_time, data_time=data_time, losses=losses, 173 | focal_losses =focal_losses, 174 | off_losses=off_losses, lr=lr 175 | )) 176 | 177 | logger.info(message) 178 | print(message) 179 | 180 | end = time.time() 181 | 182 | return losses.avg, focal_losses.avg, off_losses.avg 183 | 184 | 185 | def train(model, 186 | train_loader, 187 | val_loader, 188 | train_sampler, 189 | val_sampler, 190 | system_config, 191 | args): 192 | # reading arguments from command 193 | start_epoch = args.start_epoch 194 | distributed = args.distributed 195 | world_size = args.world_size 196 | initialize = args.initialize 197 | rank = args.rank 198 | logger = args.logger 199 | # reading arguments from json file 200 | args.batch_size = system_config.batch_size 201 | learning_rate = system_config.learning_rate * world_size \ 202 | if world_size > 0 else system_config.learning_rate 203 | warm_up = system_config.warm_up 204 | warm_up_lr = system_config.warm_up_lr 205 | base_lr = warm_up_lr if warm_up else learning_rate 206 | pretrained_model = system_config.pretrain 207 | snapshot = system_config.snapshot 208 | val_iter = system_config.val_iter 209 | nb_epoch = system_config.nb_epoch 210 | print_freq = system_config.print_freq 211 | # for automatic test after finishing training 212 | args.test_split = system_config.test_split 213 | args.test_epoch = system_config.nb_epoch 214 | # system_config.learning_rate = base_lr 215 | system_config.lr = base_lr 216 | 217 | print("Process {}: building model...".format(rank)) 218 | nnet = NetworkFactory(system_config, model, distributed=distributed, gpu=rank) 219 | if initialize: 220 | if rank == 0: 221 | nnet.save_params(0) 222 | exit(0) 223 | 224 | if pretrained_model is not None: 225 | if not os.path.exists(pretrained_model): 226 | raise ValueError("pretrained model does not exist") 227 | logger.info("Process {}: 
loading from pretrained model".format(rank)) 228 | nnet.load_pretrained_params(pretrained_model) 229 | 230 | if start_epoch: 231 | nnet.load_params(start_epoch) 232 | logger.info("Process {}: training starts from iteration {} with learning_rate {}".format(rank, start_epoch + 1, base_lr)) 233 | 234 | if rank == 0: 235 | logger.info("training start...") 236 | 237 | nnet.cuda() 238 | nnet.train_mode() 239 | lr_scheduler = make_scheduler(nnet.optimizer, system_config, last_epoch=-1) 240 | # dummpy loop for lr_scheduler 241 | for epoch in range(start_epoch): #BUG HERE 242 | lr_scheduler.step(epoch) 243 | 244 | 245 | lr = nnet.get_lr() 246 | 247 | epoch_lr = [] 248 | for epoch in range(start_epoch, nb_epoch): 249 | 250 | if args.distributed: 251 | train_sampler.set_epoch(epoch) 252 | 253 | train_epoch(nnet, train_loader, rank, epoch, lr, print_freq, args) 254 | 255 | epoch_lr.append(lr) 256 | lr_scheduler.step(epoch) 257 | lr = nnet.get_lr() 258 | 259 | if (epoch+1) % snapshot == 0 and rank == 0: 260 | nnet.save_params(epoch+1) 261 | 262 | if (epoch+1) % val_iter == 0: 263 | if rank == 0: 264 | logger.info('evaluating...') 265 | val_epoch(nnet, val_loader, rank, epoch, lr, print_freq, args) 266 | if rank ==0: 267 | logger.info('train...') 268 | 269 | def main(gpu, ngpus_per_node, args): 270 | args.gpu = gpu 271 | if args.distributed: 272 | args.rank = args.rank * ngpus_per_node + gpu 273 | dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url, 274 | world_size=args.world_size, rank=args.rank) 275 | 276 | rank = args.rank 277 | curvers = os.path.join('./curves', args.cfg_file+'_curves_from_epoch_{}.pth'.format(args.start_epoch)) 278 | args.curves = curvers 279 | logger = get_root_logger(rank, filename=os.path.join("./logs", args.cfg_file+".out")) 280 | args.logger = logger 281 | 282 | logger.info("==================================================================") 283 | logger.info("train start from here ... 
\n") 284 | 285 | cfg_file = os.path.join("./configs", args.cfg_file + ".json") 286 | with open(cfg_file, "r") as f: 287 | config = json.load(f) 288 | 289 | config["system"]["snapshot_name"] = args.cfg_file 290 | if args.dataset is not None: 291 | config["system"]["dataset"] = args.dataset 292 | 293 | system_config = SystemConfig().update_config(config["system"]) 294 | anchors = make_anchors(system_config.dataset) 295 | config["db"]["anchors"] = anchors 296 | config["db"]["corpus_path"] = get_file_path("..", "data", "refer", "data", config["system"]["dataset"], "corpus.pth") 297 | print(config["db"]["corpus_path"]) 298 | # if you want to access our baseline 299 | # model = Baseline(system_config, config["db"]) 300 | model = LBYLNet(system_config, config["db"]) 301 | train_split = system_config.train_split 302 | val_split = system_config.val_split 303 | 304 | workers = args.workers 305 | logger.info("Process {}: loading all datasets...".format(rank)) 306 | logger.info("Process {}: using {} workers".format(rank, workers)) 307 | 308 | train_db = datasets['refer'](config["db"], split=train_split, sys_config=system_config) 309 | valid_db = datasets['refer'](config["db"], split=val_split, sys_config=system_config) 310 | 311 | if rank == 0: 312 | print("system config...") 313 | pprint.pprint(system_config.full) 314 | 315 | logger.info("system config...") 316 | logger.info(system_config.full) 317 | 318 | print("db config...") 319 | pprint.pprint(train_db.configs) 320 | 321 | logger.info("db config...") 322 | logger.info(train_db.configs) 323 | 324 | print("len of training db: {}".format(len(train_db.db_inds))) 325 | print("len of validate db: {}".format(len(valid_db.db_inds))) 326 | print("distributed: {}".format(args.distributed)) 327 | logger.info("len of training db: {}".format(len(train_db.db_inds))) 328 | logger.info("len of validate db: {}".format(len(valid_db.db_inds))) 329 | logger.info("distributed: {}".format(args.distributed)) 330 | 331 | trainset= Referring(train_db, system_config, debug=False) 332 | validset= Referring(valid_db, system_config, debug=False) 333 | 334 | train_sampler = None 335 | val_sampler = None 336 | if args.distributed: 337 | train_sampler = DistributedSampler(trainset, num_replicas=args.world_size, rank=rank) 338 | val_sampler = DistributedSampler(validset, num_replicas=args.world_size, rank=rank) 339 | 340 | collate_func = collate_fn_bert if not system_config.lstm else collate_fn 341 | 342 | batch_size = int(system_config.batch_size / args.world_size) \ 343 | if args.distributed else system_config.batch_size 344 | train_loader = DataLoader(dataset=trainset, 345 | batch_size=batch_size, 346 | shuffle=(train_sampler is None), 347 | num_workers=workers, 348 | collate_fn=collate_func, 349 | pin_memory=True, 350 | sampler=train_sampler) 351 | val_loader = DataLoader(dataset=validset, 352 | batch_size=batch_size, # validate require no grad. 
353 | shuffle=(val_sampler is None), 354 | num_workers=workers, 355 | collate_fn=collate_func, 356 | pin_memory=True, 357 | sampler=val_sampler) 358 | 359 | train(model, train_loader, val_loader, train_sampler, val_sampler, system_config, args) 360 | 361 | 362 | if __name__ == "__main__": 363 | args = parse_args() 364 | 365 | distributed = args.distributed 366 | world_size = args.world_size 367 | 368 | if distributed and world_size < 0: 369 | raise ValueError("world size must be greater than 0 in distributed training") 370 | 371 | ngpus_per_node = torch.cuda.device_count() 372 | print("ngpus_per_node {}".format(ngpus_per_node)) 373 | if distributed: 374 | args.world_size = ngpus_per_node * args.world_size 375 | mp.spawn(main, nprocs=ngpus_per_node, args=(ngpus_per_node, args)) 376 | else: 377 | main(None, ngpus_per_node, args) 378 | 379 | # evaluate 380 | # print("evaluating...") 381 | # os.system("python evaluate.py {} --split {} --testiter {} --batch_size {} >> evaluate.out".format(args.cfg_file, "test", 100, 64)) --------------------------------------------------------------------------------
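Note on the `metrics.json` files shown above: each file maps an IoU threshold (stored as a string key, `"0.5"` through `"0.9"`) to the precision at that threshold, so the `"0.5"` entry is the Pr@0.5 value for the corresponding split. The snippet below is a minimal, illustrative sketch for printing such a file as a small table; it is not part of the repository, and the script name in the usage comment is only a hypothetical example.

```python
#!/usr/bin/env python
# Minimal sketch (not part of the repository): pretty-print a metrics.json
# found under results/<config>/<dataset>/<epoch>/<split>/metrics.json,
# where keys are IoU thresholds and values are precisions (Pr@IoU).
import json
import sys

def print_metrics(path):
    with open(path, "r") as f:
        metrics = json.load(f)
    # Keys are strings such as "0.5" or "0.8500000000000001"; sort them numerically.
    for thresh in sorted(metrics, key=float):
        print("Pr@{:.2f}: {:.2%}".format(float(thresh), metrics[thresh]))

if __name__ == "__main__":
    # Example (hypothetical script name):
    #   python print_metrics.py results/lbyl_bert_unc_batch64/unc/100/val/metrics.json
    print_metrics(sys.argv[1])
```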