├── .gitignore ├── LICENSE ├── README.md ├── configs ├── flickr30k │ ├── RefTR_flickr.sh │ ├── RefTR_flickr_roberta.sh │ ├── Ref_flickr_pt.sh │ └── Ref_flickr_pt_101.sh ├── refcoco+ │ ├── RefTR_SEG_refcoco+.sh │ ├── RefTR_SEG_refcoco+_101.sh │ ├── RefTR_refcoco+.sh │ └── RefTR_refcoco+_101.sh ├── refcoco │ ├── RefTR_refcoco.sh │ └── RefTR_refcoco_101.sh ├── refcocog │ ├── RefTR_SEG_refcocog.sh │ ├── RefTR_SEG_refcocog_101.sh │ ├── RefTR_refcocog.sh │ └── RefTR_refcocog_101.sh └── referit │ ├── RefTR_referit.sh │ ├── RefTR_referit_101.sh │ ├── RefTR_referit_101_PT.sh │ └── RefTR_referit_PT.sh ├── datasets ├── __init__.py ├── data_prefetcher.py ├── grounding_datasets │ ├── __init__.py │ ├── refer_dataset.py │ └── resc_refer_dataset.py ├── lang_utils.py ├── refer_multiphrase.py ├── refer_resc.py ├── refer_segmentation.py ├── samplers.py └── transforms.py ├── engine_vg.py ├── main_vg.py ├── models ├── __init__.py ├── criterion.py ├── modeling │ ├── backbone.py │ ├── matcher.py │ ├── position_encoding.py │ ├── segmentation.py │ └── transformer.py ├── post_process.py ├── reftr.py ├── reftr_segmentation.py └── reftr_transformer.py ├── requirements.txt ├── tools ├── launch.py ├── run_dist_launch.sh ├── run_dist_slurm.sh └── vis_log.py ├── tox.ini └── util ├── __init__.py ├── box_ops.py ├── collate_fn.py ├── lr_scheduler.py ├── misc.py ├── plot_utils.py ├── transforms.py └── word_utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.toptal.com/developers/gitignore/api/python,vscode,jupyternotebooks 3 | # Edit at https://www.toptal.com/developers/gitignore?templates=python,vscode,jupyternotebooks 4 | 5 | ### 6 | exp_backup/* 7 | exps/* 8 | data 9 | configs/VinVL_VQA_base/*.bin 10 | SAVED_MODEL/* 11 | 12 | ### JupyterNotebooks ### 13 | # gitignore template for Jupyter Notebooks 14 | # website: http://jupyter.org/ 15 | 16 | .ipynb_checkpoints 17 | */.ipynb_checkpoints/* 18 | 19 | # IPython 20 | profile_default/ 21 | ipython_config.py 22 | 23 | # Remove previous ipynb_checkpoints 24 | # git rm -r .ipynb_checkpoints/ 25 | 26 | ### Python ### 27 | # Byte-compiled / optimized / DLL files 28 | __pycache__/ 29 | *.py[cod] 30 | *$py.class 31 | 32 | # C extensions 33 | *.so 34 | 35 | # Distribution / packaging 36 | .Python 37 | build/ 38 | develop-eggs/ 39 | dist/ 40 | downloads/ 41 | eggs/ 42 | .eggs/ 43 | parts/ 44 | sdist/ 45 | var/ 46 | wheels/ 47 | pip-wheel-metadata/ 48 | share/python-wheels/ 49 | *.egg-info/ 50 | .installed.cfg 51 | *.egg 52 | MANIFEST 53 | 54 | # PyInstaller 55 | # Usually these files are written by a python script from a template 56 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
57 | *.manifest 58 | *.spec 59 | 60 | # Installer logs 61 | pip-log.txt 62 | pip-delete-this-directory.txt 63 | 64 | # Unit test / coverage reports 65 | htmlcov/ 66 | .tox/ 67 | .nox/ 68 | .coverage 69 | .coverage.* 70 | .cache 71 | nosetests.xml 72 | coverage.xml 73 | *.cover 74 | *.py,cover 75 | .hypothesis/ 76 | .pytest_cache/ 77 | pytestdebug.log 78 | 79 | # Translations 80 | *.mo 81 | *.pot 82 | 83 | # Django stuff: 84 | *.log 85 | local_settings.py 86 | db.sqlite3 87 | db.sqlite3-journal 88 | 89 | # Flask stuff: 90 | instance/ 91 | .webassets-cache 92 | 93 | # Scrapy stuff: 94 | .scrapy 95 | 96 | # Sphinx documentation 97 | docs/_build/ 98 | doc/_build/ 99 | 100 | # PyBuilder 101 | target/ 102 | 103 | # Jupyter Notebook 104 | 105 | # IPython 106 | 107 | # pyenv 108 | .python-version 109 | 110 | # pipenv 111 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 112 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 113 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 114 | # install all needed dependencies. 115 | #Pipfile.lock 116 | 117 | # poetry 118 | #poetry.lock 119 | 120 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 121 | __pypackages__/ 122 | 123 | # Celery stuff 124 | celerybeat-schedule 125 | celerybeat.pid 126 | 127 | # SageMath parsed files 128 | *.sage.py 129 | 130 | # Environments 131 | # .env 132 | .env/ 133 | .venv/ 134 | env/ 135 | venv/ 136 | ENV/ 137 | env.bak/ 138 | venv.bak/ 139 | pythonenv* 140 | 141 | # Spyder project settings 142 | .spyderproject 143 | .spyproject 144 | 145 | # Rope project settings 146 | .ropeproject 147 | 148 | # mkdocs documentation 149 | /site 150 | 151 | # mypy 152 | .mypy_cache/ 153 | .dmypy.json 154 | dmypy.json 155 | 156 | # Pyre type checker 157 | .pyre/ 158 | 159 | # pytype static type analyzer 160 | .pytype/ 161 | 162 | # operating system-related files 163 | *.DS_Store #file properties cache/storage on macOS 164 | Thumbs.db #thumbnail cache on Windows 165 | 166 | # profiling data 167 | .prof 168 | 169 | 170 | ### vscode ### 171 | .vscode/* 172 | # !.vscode/settings.json 173 | !.vscode/tasks.json 174 | !.vscode/launch.json 175 | !.vscode/extensions.json 176 | *.code-workspace 177 | 178 | # End of https://www.toptal.com/developers/gitignore/api/python,vscode,jupyternotebooks -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 UBC Computer Vision Group 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # RefTR
2 |
3 | Code for the paper "Referring Transformer: A One-step Approach to Multi-task Visual Grounding"
4 |
5 | ## Requirements
6 |
7 | To install requirements:
8 |
9 | ```setup
10 | pip install -r requirements.txt
11 | ```
12 |
13 | ```
14 | chmod +x tools/run_dist_slurm.sh
15 | ```
16 |
17 | ## Setting up the datasets
18 | ReSC annotation preparation: https://drive.google.com/file/d/1fVwdDvXNbH8uuq_pHD_o5HI7yqeuz0yS/view?usp=sharing
19 |
20 | Flickr30k Entities: http://bryanplummer.com/Flickr30kEntities/
21 |
22 | MSCOCO: http://mscoco.org/dataset/#overview
23 |
24 | Visual Genome Images: https://visualgenome.org/api/v0/api_home.html
25 |
26 | data/annotations: https://drive.google.com/file/d/19qJ8b5sxijKmtN0XG9leWbt2sPkIVqlc/view?usp=sharing
27 |
28 | refcoco/masks: https://drive.google.com/file/d/1oGUewiDtxjouT8Qp4dRzrPfGkc0LZaIT/view?usp=sharing
29 |
30 | refcoco/anns: https://drive.google.com/file/d/1Prhrgm3t2JeY68Ni_1Ig_a4dfZvGC9vZ/view?usp=sharing
31 |
32 | annotations_resc/vg/vg_all.pth: https://drive.google.com/file/d/1_GbWl0sSB1y26fFM9W7DDkXLRR8Ld3IH/view?usp=sharing
33 |
34 | Extract the datasets into the data/ folder. (Tip: you can use symlinks, e.g. `ln -s /path/to/your/datasets ./data`, to avoid putting data and code in the same directory.)
35 | The data/ folder should look like this:
36 | ```
37 | data
38 | ├── annotations
39 | ├── annotations_resc
40 | │   ├── flickr
41 | │   ├── gref
42 | │   ├── gref_umd
43 | │   ├── referit
44 | │   ├── unc
45 | │   ├── unc+
46 | │   └── vg
47 | ├── flickr30k
48 | │   └── f30k_images
49 | ├── refcoco
50 | │   ├── anns
51 | │   ├── images
52 | │   │   ├── train2014  # images from train 2014
53 | │   ├── masks
54 | ├── referit
55 | │   ├── images
56 | └── visualgenome
57 |     └── VG_100K
58 |
59 | ```
60 |
61 | ## Training
62 |
63 | To train the model, run:
64 | ```train
65 | # using slurm system
66 | MASTER_PORT=${Master Port} GPUS_PER_NODE={GPU per node} ./tools/run_dist_slurm.sh RefTR ${Number Of GPU} ${config file name}
67 | ```
68 |
69 | Example:
70 | ```bash
71 | MASTER_PORT=29501 GPUS_PER_NODE=4 ./tools/run_dist_slurm.sh RefTR 4 configs/flickr30k/RefTR_flickr.sh
72 | ```
73 |
74 | ## Evaluation
75 |
76 | To evaluate the model, run:
77 | ```eval
78 | MASTER_PORT=${Master Port} GPUS_PER_NODE={GPU per node} ./tools/run_dist_slurm.sh RefTR ${Number Of GPU} ${config file name} --eval --resume=${path to checkpoint}
79 | ```
80 |
81 | Example:
82 | ```bash
83 | MASTER_PORT=29501 GPUS_PER_NODE=4 ./tools/run_dist_slurm.sh RefTR 4 configs/flickr30k/RefTR_flickr.sh --eval --resume=./exps/flickr30k/checkpoint.pth
84 | ```
85 |
86 | ## Pretrained checkpoints for refcoco REC/RES
87 | | Checkpoint Name | Dataset/Link | Description |
88 | | ----------- | ----------- | --- |
89 | | refcoco_SEG_PT_res50_6_epochs.pth | [refcoco](https://drive.google.com/file/d/151XGTlGTbwGyQ6HMEn2sTEwEeFY9Csjx/view?usp=sharing) | Pretrained 6 epochs on VG |
90 | | refcoco+_SEG_PT_res50_6_epochs.pth | [refcoco+](https://drive.google.com/file/d/1KKd80NReZJ500G6pnY1iRXoWqhJRDn5T/view?usp=sharing) | Pretrained 6 epochs on VG |
91 | | refcocog_SEG_PT_res50_6_epochs.pth | [refcocog](https://drive.google.com/file/d/1oStrCvyJ2KyumXciMg6n8CdvefS9Qjsi/view?usp=sharing) | Pretrained 6 epochs on VG |
92 |
93 | ## BibTeX
94 |
95 | If you find this code useful for your research, please cite our paper:
96 |
97 | ```
98 | @inproceedings{muchen2021referring,
99 |   title={Referring Transformer: A One-step Approach to Multi-task Visual Grounding},
100 |   author={Muchen, Li and Leonid, Sigal},
101 |   booktitle={Thirty-Fifth Conference on Neural Information Processing Systems},
102 |   year={2021}
103 | }
104 | ```
--------------------------------------------------------------------------------
/configs/flickr30k/RefTR_flickr.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # coco pretrained provided by r50_dconvDETR_C5_pretrained_coco_Q100 setting
3 | set -x
4 |
5 | EXP_DIR=exps/RefTR_flickr
6 | PY_ARGS=${@:1}
7 |
8 | conda activate pytorch
9 | which python
10 |
11 | python3.8 -u main_vg.py \
12 |     --pretrained_model "./data/MODEL_ZOO/detr-r50-e632da11.pth"\
13 |     --num_feature_levels 1\
14 |     --dataset flickr30k\
15 |     --dec_layers 6\
16 |     --img_size 640\
17 |     --max_img_size 640\
18 |     --batch_size 16\
19 |     --epochs 60\
20 |     --warm_up_epoch 5\
21 |     --lr_schedule CosineWarmupLR\
22 |     --aux_loss\
23 |     --output_dir ${EXP_DIR} \
24 |     ${PY_ARGS}
25 |
26 | # --num_queries_per_phrase 1\
27 | # --resume
--------------------------------------------------------------------------------
/configs/flickr30k/RefTR_flickr_roberta.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # coco pretrained provided by r50_dconvDETR_C5_pretrained_coco_Q100 setting
3 | set -x
4 |
5 | EXP_DIR=exps/flickr/RefTR_flickr_roberta
6 | PY_ARGS=${@:1}
7 |
8 | conda activate pytorch
9 | which python
10 |
11 | python3.8 -u main_vg.py \
12 |     --pretrained_model "./data/MODEL_ZOO/detr-r50-e632da11.pth"\
13 |     --num_feature_levels 1\
14 |     --num_queries_per_phrase 1\
15 |     --dataset flickr30k\
16 |     --dec_layers 6\
17 |     --bert_model roberta-base\
18 |     --img_size 640\
19 |     --max_img_size 640\
20 |     --batch_size 16\
21 |     --epochs 60\
22 |     --lr_drop 40\
23 |     --aux_loss\
24 |     --output_dir ${EXP_DIR} \
25 |     ${PY_ARGS}
26 |
27 | # --resume
--------------------------------------------------------------------------------
/configs/flickr30k/Ref_flickr_pt.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # coco pretrained provided by r50_dconvDETR_C5_pretrained_coco_Q100 setting
3 | set -x
4 |
5 | EXP_DIR=exps/flickr/RefTR_pt
6 | PY_ARGS=${@:1}
7 |
8 | conda activate pytorch
9 | which python
10 |
11 | python3.8 -u main_vg.py \
12 |     --resume "./SAVED_MODEL/PT/RefTR_VG_PT_08.pth"\
13 |     --resume_model_only\
14 |     --num_feature_levels 1\
15 |     --num_queries_per_phrase 1\
16 |     --dataset flickr30k\
17 |     --dec_layers 6\
18 |     --img_size 640\
19 |     --max_img_size 640\
20 |     --epochs 40\
21 |     --lr_drop 30\
22 |     --aux_loss\
23 |     --output_dir ${EXP_DIR} \
24 |     ${PY_ARGS}
25 |
26 | # --resume
--------------------------------------------------------------------------------
/configs/flickr30k/Ref_flickr_pt_101.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # coco pretrained provided by r50_dconvDETR_C5_pretrained_coco_Q100 setting
3 | set -x
4 |
5 | EXP_DIR=exps/flickr/RefTR_pt_101
6 | PY_ARGS=${@:1}
7 |
8 | conda activate pytorch
9 | which python
10 |
11 | python3.8 -u main_vg.py \
12 |     --resume
"./SAVED_MODEL/PT/RefTR_VG_101_PT_08.pth"\ 13 | --resume_model_only\ 14 | --num_feature_levels 1\ 15 | --num_queries_per_phrase 1\ 16 | --dataset flickr30k\ 17 | --dec_layers 6\ 18 | --img_size 640\ 19 | --max_img_size 640\ 20 | --epochs 40\ 21 | --lr_drop 30\ 22 | --aux_loss\ 23 | --backbone resnet101\ 24 | --output_dir ${EXP_DIR} \ 25 | ${PY_ARGS} 26 | 27 | # --resume -------------------------------------------------------------------------------- /configs/refcoco+/RefTR_SEG_refcoco+.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # coco pretrained provided by r50_dconvDETR_C5_pretrained_coco_Q100 setting 3 | set -x 4 | 5 | 6 | ############################################################################################### 7 | # EXP_DIR=exps/refcoco+/RefTR_SEG 8 | # PY_ARGS=${@:1} 9 | 10 | # conda activate pytorch 11 | # which python 12 | 13 | # python3.8 -u main_vg.py \ 14 | # --pretrained_model "./SAVED_MODEL/refcoco_50_det/RefTR_refcoco+_l6/checkpoint_best.pth"\ 15 | # --num_feature_levels 1\ 16 | # --num_queries_per_phrase 1\ 17 | # --masks\ 18 | # --lr 1e-5\ 19 | # --lr_mask_branch_proj 10\ 20 | # --dataset refcoco+_unc\ 21 | # --train_split train\ 22 | # --test_split val testA testB\ 23 | # --dec_layers 6\ 24 | # --aux_loss \ 25 | # --img_size 640\ 26 | # --max_img_size 640\ 27 | # --epochs 40\ 28 | # --lr_drop 30\ 29 | # --output_dir ${EXP_DIR} \ 30 | # ${PY_ARGS} 31 | 32 | # --resume 33 | 34 | 35 | ############################################################################################### 36 | EXP_DIR=exps/refcoco+/RefTR_SEG_PT 37 | PY_ARGS=${@:1} 38 | 39 | conda activate pytorch 40 | which python 41 | 42 | python3.8 -u main_vg.py \ 43 | --pretrained_model "./SAVED_MODEL/refcoco_50_det_pretrained/RefTR_refcoco+_pt/checkpoint_best.pth"\ 44 | --num_feature_levels 1\ 45 | --num_queries_per_phrase 1\ 46 | --masks\ 47 | --lr 1e-5\ 48 | --lr_mask_branch_proj 10\ 49 | --dataset refcoco+_unc\ 50 | --train_split train\ 51 | --test_split testA testB\ 52 | --dec_layers 6\ 53 | --aux_loss \ 54 | --img_size 640\ 55 | --max_img_size 640\ 56 | --epochs 40\ 57 | --lr_drop 30\ 58 | --output_dir ${EXP_DIR} \ 59 | ${PY_ARGS} -------------------------------------------------------------------------------- /configs/refcoco+/RefTR_SEG_refcoco+_101.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # coco pretrained provided by r50_dconvDETR_C5_pretrained_coco_Q100 setting 3 | set -x 4 | 5 | # EXP_DIR=exps/refcoco+/RefTR_SEG_101 6 | # PY_ARGS=${@:1} 7 | 8 | # conda activate pytorch 9 | # which python 10 | 11 | # python3.8 -u main_vg.py \ 12 | # --pretrained_model "./SAVED_MODEL/refcoco_101_det/RefTR_refcoco+_101/checkpoint_best.pth"\ 13 | # --num_feature_levels 1\ 14 | # --num_queries_per_phrase 1\ 15 | # --masks\ 16 | # --lr 1e-5\ 17 | # --lr_mask_branch_proj 10\ 18 | # --dataset refcoco+_unc\ 19 | # --train_split train\ 20 | # --test_split val testA testB\ 21 | # --dec_layers 6\ 22 | # --backbone resnet101\ 23 | # --aux_loss \ 24 | # --img_size 640\ 25 | # --max_img_size 640\ 26 | # --epochs 40\ 27 | # --lr_drop 30\ 28 | # --output_dir ${EXP_DIR} \ 29 | # ${PY_ARGS} 30 | 31 | # --resume 32 | 33 | EXP_DIR=exps/refcoco+/RefTR_SEG_101_PT 34 | PY_ARGS=${@:1} 35 | 36 | conda activate pytorch 37 | which python 38 | 39 | python3.8 -u main_vg.py \ 40 | --pretrained_model "./SAVED_MODEL/refcoco_101_det_pretrained/RefTR_refcoco+_pt/checkpoint_best.pth"\ 41 | --num_feature_levels 1\ 
42 | --num_queries_per_phrase 1\ 43 | --masks\ 44 | --lr 1e-5\ 45 | --lr_mask_branch_proj 10\ 46 | --dataset refcoco+_unc\ 47 | --train_split train\ 48 | --test_split val testA testB\ 49 | --dec_layers 6\ 50 | --backbone resnet101\ 51 | --aux_loss \ 52 | --img_size 640\ 53 | --max_img_size 640\ 54 | --epochs 40\ 55 | --lr_drop 30\ 56 | --ckpt_cycle 60\ 57 | --output_dir ${EXP_DIR} \ 58 | ${PY_ARGS} 59 | -------------------------------------------------------------------------------- /configs/refcoco+/RefTR_refcoco+.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # coco pretrained provided by r50_dconvDETR_C5_pretrained_coco_Q100 setting 3 | set -x 4 | 5 | EXP_DIR=exps/RefTR_refcoco+_unc 6 | PY_ARGS=${@:1} 7 | 8 | conda activate pytorch 9 | which python 10 | 11 | python3.8 -u main_vg.py \ 12 | --pretrained_model "./data/MODEL_ZOO/detr-r50-e632da11.pth"\ 13 | --num_feature_levels 1\ 14 | --num_queries_per_phrase 1\ 15 | --dataset refcoco+_unc\ 16 | --train_split train\ 17 | --test_split val testA testB\ 18 | --dec_layers 6\ 19 | --aux_loss \ 20 | --img_size 640\ 21 | --max_img_size 640\ 22 | --epochs 90\ 23 | --lr_drop 60\ 24 | --output_dir ${EXP_DIR} \ 25 | ${PY_ARGS} 26 | 27 | # --resume -------------------------------------------------------------------------------- /configs/refcoco+/RefTR_refcoco+_101.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # coco pretrained provided by r50_dconvDETR_C5_pretrained_coco_Q100 setting 3 | set -x 4 | 5 | EXP_DIR=exps/RefTR_refcoco+_unc_101 6 | PY_ARGS=${@:1} 7 | 8 | conda activate pytorch 9 | which python 10 | 11 | python3.8 -u main_vg.py \ 12 | --pretrained_model "./data/MODEL_ZOO/detr-r101-2c7b67e5.pth"\ 13 | --num_feature_levels 1\ 14 | --num_queries_per_phrase 1\ 15 | --dataset refcoco+_unc\ 16 | --train_split train\ 17 | --test_split val testA testB\ 18 | --dec_layers 6\ 19 | --backbone resnet101\ 20 | --aux_loss \ 21 | --img_size 640\ 22 | --max_img_size 640\ 23 | --epochs 90\ 24 | --lr_drop 60\ 25 | --output_dir ${EXP_DIR} \ 26 | ${PY_ARGS} 27 | 28 | # --resume -------------------------------------------------------------------------------- /configs/refcoco/RefTR_refcoco.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | 4 | PY_ARGS=${@:1} 5 | 6 | conda activate pytorch 7 | which python 8 | 9 | EXP_DIR=exps/refcoco/r50_det 10 | python3.8 -u main_vg.py \ 11 | --pretrained_model "./data/MODEL_ZOO/detr-r50-e632da11.pth"\ 12 | --num_feature_levels 1\ 13 | --dataset refcoco_unc\ 14 | --train_split train\ 15 | --test_split val testA testB\ 16 | --dec_layers 6\ 17 | --aux_loss \ 18 | --img_size 640\ 19 | --max_img_size 640\ 20 | --epochs 90\ 21 | --lr_drop 60\ 22 | --output_dir ${EXP_DIR} \ 23 | ${PY_ARGS} 24 | 25 | EXP_DIR=exps/refcoco/r50 26 | python3.8 -u main_vg.py \ 27 | --pretrained_model "./exps/refcoco/refTR_det/checkpoint_best.pth"\ 28 | --num_feature_levels 1\ 29 | --masks\ 30 | --lr 1e-5\ 31 | --lr_mask_branch_proj 10\ 32 | --dataset refcoco_unc\ 33 | --train_split train\ 34 | --test_split val testA testB\ 35 | --dec_layers 6\ 36 | --aux_loss \ 37 | --img_size 640\ 38 | --max_img_size 640\ 39 | --epochs 40\ 40 | --lr_drop 30\ 41 | --output_dir ${EXP_DIR} \ 42 | ${PY_ARGS} -------------------------------------------------------------------------------- /configs/refcoco/RefTR_refcoco_101.sh: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # coco pretrained provided by r50_dconvDETR_C5_pretrained_coco_Q100 setting 3 | set -x 4 | 5 | PY_ARGS=${@:1} 6 | 7 | conda activate pytorch 8 | which python 9 | 10 | EXP_DIR=exps/refcoco/r101_det 11 | python3.8 -u main_vg.py \ 12 | --pretrained_model "./data/MODEL_ZOO/detr-r101-2c7b67e5.pth"\ 13 | --num_feature_levels 1\ 14 | --dataset refcoco_unc\ 15 | --train_split train\ 16 | --test_split val testA testB\ 17 | --dec_layers 6\ 18 | --backbone resnet101\ 19 | --aux_loss \ 20 | --img_size 640\ 21 | --max_img_size 640\ 22 | --epochs 90\ 23 | --lr_drop 60\ 24 | --output_dir ${EXP_DIR} \ 25 | ${PY_ARGS} 26 | 27 | 28 | EXP_DIR=exps/refcoco/r101 29 | python3.8 -u main_vg.py \ 30 | --pretrained_model "./SAVED_MODEL/refcoco_101_det/RefTR_refcoco_101/checkpoint_best.pth"\ 31 | --num_feature_levels 1\ 32 | --masks\ 33 | --lr 1e-5\ 34 | --lr_mask_branch_proj 10\ 35 | --dataset refcoco_unc\ 36 | --train_split train\ 37 | --test_split val testA testB\ 38 | --dec_layers 6\ 39 | --backbone resnet101\ 40 | --aux_loss \ 41 | --img_size 640\ 42 | --max_img_size 640\ 43 | --epochs 40\ 44 | --lr_drop 30\ 45 | --output_dir ${EXP_DIR} \ -------------------------------------------------------------------------------- /configs/refcocog/RefTR_SEG_refcocog.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # coco pretrained provided by r50_dconvDETR_C5_pretrained_coco_Q100 setting 3 | set -x 4 | 5 | ############################################################################################### 6 | # EXP_DIR=exps/refcocog/RefTR_SEG 7 | # PY_ARGS=${@:1} 8 | 9 | # conda activate pytorch 10 | # which python 11 | 12 | # python3.8 -u main_vg.py \ 13 | # --pretrained_model "./SAVED_MODEL/refcoco_50_det/RefTR_refcocog_l6/checkpoint_best.pth"\ 14 | # --num_feature_levels 1\ 15 | # --num_queries_per_phrase 1\ 16 | # --masks\ 17 | # --lr 1e-5\ 18 | # --lr_mask_branch_proj 10\ 19 | # --dataset refcocog_umd\ 20 | # --train_split train\ 21 | # --test_split val test\ 22 | # --dec_layers 6\ 23 | # --aux_loss \ 24 | # --img_size 640\ 25 | # --max_img_size 640\ 26 | # --epochs 40\ 27 | # --lr_drop 30\ 28 | # --output_dir ${EXP_DIR} \ 29 | # ${PY_ARGS} 30 | 31 | # # --resume 32 | 33 | ############################################################################################### 34 | EXP_DIR=exps/refcocog/RefTR_SEG_PT 35 | PY_ARGS=${@:1} 36 | 37 | conda activate pytorch 38 | which python 39 | 40 | python3.8 -u main_vg.py \ 41 | --pretrained_model "./SAVED_MODEL/refcoco_50_det_pretrained/RefTR_refcocog_pt/checkpoint_best.pth"\ 42 | --num_feature_levels 1\ 43 | --num_queries_per_phrase 1\ 44 | --masks\ 45 | --lr 1e-5\ 46 | --lr_mask_branch_proj 10\ 47 | --dataset refcocog_umd\ 48 | --train_split train\ 49 | --test_split test\ 50 | --dec_layers 6\ 51 | --aux_loss \ 52 | --img_size 640\ 53 | --max_img_size 640\ 54 | --epochs 40\ 55 | --lr_drop 30\ 56 | --output_dir ${EXP_DIR} \ 57 | ${PY_ARGS} 58 | 59 | # --resume -------------------------------------------------------------------------------- /configs/refcocog/RefTR_SEG_refcocog_101.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # coco pretrained provided by r50_dconvDETR_C5_pretrained_coco_Q100 setting 3 | set -x 4 | 5 | # EXP_DIR=exps/refcocog/RefTR_SEG_101 6 | # PY_ARGS=${@:1} 7 | 8 | # conda activate pytorch 9 | # which python 10 | 11 | # python3.8 
-u main_vg.py \ 12 | # --pretrained_model "./SAVED_MODEL/refcoco_101_det/RefTR_refcocog_101/checkpoint_best.pth"\ 13 | # --num_feature_levels 1\ 14 | # --num_queries_per_phrase 1\ 15 | # --masks\ 16 | # --lr 1e-5\ 17 | # --lr_mask_branch_proj 10\ 18 | # --dataset refcocog_umd\ 19 | # --train_split train\ 20 | # --test_split val test\ 21 | # --dec_layers 6\ 22 | # --backbone resnet101\ 23 | # --aux_loss \ 24 | # --img_size 640\ 25 | # --max_img_size 640\ 26 | # --epochs 40\ 27 | # --lr_drop 30\ 28 | # --output_dir ${EXP_DIR} \ 29 | # ${PY_ARGS} 30 | 31 | # --resume 32 | 33 | EXP_DIR=exps/refcocog/RefTR_SEG_101_PT 34 | PY_ARGS=${@:1} 35 | 36 | conda activate pytorch 37 | which python 38 | 39 | python3.8 -u main_vg.py \ 40 | --pretrained_model "./SAVED_MODEL/refcoco_101_det_pretrained/RefTR_refcocog_pt/checkpoint_best.pth"\ 41 | --num_feature_levels 1\ 42 | --num_queries_per_phrase 1\ 43 | --masks\ 44 | --lr 1e-5\ 45 | --lr_mask_branch_proj 10\ 46 | --dataset refcocog_umd\ 47 | --train_split train\ 48 | --test_split val test\ 49 | --dec_layers 6\ 50 | --backbone resnet101\ 51 | --aux_loss \ 52 | --img_size 640\ 53 | --max_img_size 640\ 54 | --epochs 40\ 55 | --lr_drop 30\ 56 | --ckpt_cycle 60\ 57 | --output_dir ${EXP_DIR} \ 58 | ${PY_ARGS} -------------------------------------------------------------------------------- /configs/refcocog/RefTR_refcocog.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # coco pretrained provided by r50_dconvDETR_C5_pretrained_coco_Q100 setting 3 | set -x 4 | 5 | EXP_DIR=exps/RefTR_refcocog_unc 6 | PY_ARGS=${@:1} 7 | 8 | conda activate pytorch 9 | which python 10 | 11 | python3.8 -u main_vg.py \ 12 | --pretrained_model "./data/MODEL_ZOO/detr-r50-e632da11.pth"\ 13 | --num_feature_levels 1\ 14 | --num_queries_per_phrase 1\ 15 | --dataset refcocog_umd\ 16 | --train_split train\ 17 | --test_split val test\ 18 | --dec_layers 6\ 19 | --aux_loss \ 20 | --img_size 640\ 21 | --max_img_size 640\ 22 | --epochs 90\ 23 | --lr_drop 60\ 24 | --output_dir ${EXP_DIR} \ 25 | ${PY_ARGS} 26 | 27 | # --resume -------------------------------------------------------------------------------- /configs/refcocog/RefTR_refcocog_101.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # coco pretrained provided by r50_dconvDETR_C5_pretrained_coco_Q100 setting 3 | set -x 4 | 5 | EXP_DIR=exps/RefTR_refcocog_unc_101 6 | PY_ARGS=${@:1} 7 | 8 | conda activate pytorch 9 | which python 10 | 11 | python3.8 -u main_vg.py \ 12 | --pretrained_model "./data/MODEL_ZOO/detr-r101-2c7b67e5.pth"\ 13 | --num_feature_levels 1\ 14 | --num_queries_per_phrase 1\ 15 | --dataset refcocog_umd\ 16 | --train_split train\ 17 | --test_split val test\ 18 | --dec_layers 6\ 19 | --backbone resnet101\ 20 | --aux_loss \ 21 | --img_size 640\ 22 | --max_img_size 640\ 23 | --epochs 90\ 24 | --lr_drop 60\ 25 | --output_dir ${EXP_DIR} \ 26 | ${PY_ARGS} 27 | 28 | # --resume -------------------------------------------------------------------------------- /configs/referit/RefTR_referit.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # coco pretrained provided by r50_dconvDETR_C5_pretrained_coco_Q100 setting 3 | set -x 4 | 5 | EXP_DIR=exps/referit/RefTR 6 | PY_ARGS=${@:1} 7 | 8 | conda activate pytorch 9 | which python 10 | 11 | python3.8 -u main_vg.py \ 12 | --pretrained_model "./data/MODEL_ZOO/detr-r50-e632da11.pth"\ 13 | --num_feature_levels 1\ 
14 | --num_queries_per_phrase 1\ 15 | --dec_layers 3\ 16 | --aux_loss \ 17 | --dataset referit\ 18 | --img_size 640\ 19 | --max_img_size 640\ 20 | --epochs 90\ 21 | --lr_drop 60\ 22 | --ckpt_cycle 45\ 23 | --output_dir ${EXP_DIR} \ 24 | ${PY_ARGS} 25 | 26 | # # --resume 27 | # python3.8 -u main_vg.py \ 28 | # --resume ${EXP_DIR}/checkpoint0069.pth\ 29 | # --num_feature_levels 1\ 30 | # --num_queries_per_phrase 1\ 31 | # --dec_layers 3\ 32 | # --dataset referit\ 33 | # --img_size 640\ 34 | # --max_img_size 640\ 35 | # --epochs 90\ 36 | # --lr_drop 60\ 37 | # --output_dir ${EXP_DIR} \ 38 | # ${PY_ARGS} -------------------------------------------------------------------------------- /configs/referit/RefTR_referit_101.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # coco pretrained provided by r50_dconvDETR_C5_pretrained_coco_Q100 setting 3 | set -x 4 | 5 | EXP_DIR=exps/referit/RefTR_101 6 | PY_ARGS=${@:1} 7 | 8 | conda activate pytorch 9 | which python 10 | 11 | python3.8 -u main_vg.py \ 12 | --pretrained_model "./data/MODEL_ZOO/detr-r101-2c7b67e5.pth"\ 13 | --num_feature_levels 1\ 14 | --num_queries_per_phrase 1\ 15 | --dec_layers 3\ 16 | --backbone resnet101\ 17 | --aux_loss \ 18 | --dataset referit\ 19 | --img_size 640\ 20 | --max_img_size 640\ 21 | --epochs 90\ 22 | --lr_drop 60\ 23 | --ckpt_cycle 45\ 24 | --output_dir ${EXP_DIR} \ 25 | ${PY_ARGS} 26 | 27 | # # --resume 28 | # python3.8 -u main_vg.py \ 29 | # --resume ${EXP_DIR}/checkpoint0069.pth\ 30 | # --num_feature_levels 1\ 31 | # --num_queries_per_phrase 1\ 32 | # --dec_layers 3\ 33 | # --dataset referit\ 34 | # --img_size 640\ 35 | # --max_img_size 640\ 36 | # --epochs 90\ 37 | # --lr_drop 60\ 38 | # --output_dir ${EXP_DIR} \ 39 | # ${PY_ARGS} -------------------------------------------------------------------------------- /configs/referit/RefTR_referit_101_PT.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # coco pretrained provided by r50_dconvDETR_C5_pretrained_coco_Q100 setting 3 | set -x 4 | 5 | EXP_DIR=exps/referit/RefTR_101_PT 6 | PY_ARGS=${@:1} 7 | 8 | conda activate pytorch 9 | which python 10 | 11 | python3.8 -u main_vg.py \ 12 | --resume './SAVED_MODEL/PT/RefTR_VG_101_PT_08.pth'\ 13 | --resume_model_only\ 14 | --num_feature_levels 1\ 15 | --num_queries_per_phrase 1\ 16 | --dec_layers 6\ 17 | --backbone resnet101\ 18 | --aux_loss \ 19 | --dataset referit\ 20 | --img_size 640\ 21 | --max_img_size 640\ 22 | --ckpt_cycle 90\ 23 | --epochs 90\ 24 | --lr_drop 60\ 25 | --output_dir ${EXP_DIR} \ 26 | ${PY_ARGS} 27 | 28 | # # --resume 29 | # python3.8 -u main_vg.py \ 30 | # --resume ${EXP_DIR}/checkpoint0069.pth\ 31 | # --num_feature_levels 1\ 32 | # --num_queries_per_phrase 1\ 33 | # --dec_layers 3\ 34 | # --dataset referit\ 35 | # --img_size 640\ 36 | # --max_img_size 640\ 37 | # --epochs 90\ 38 | # --lr_drop 60\ 39 | # --output_dir ${EXP_DIR} \ 40 | # ${PY_ARGS} -------------------------------------------------------------------------------- /configs/referit/RefTR_referit_PT.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # coco pretrained provided by r50_dconvDETR_C5_pretrained_coco_Q100 setting 3 | set -x 4 | 5 | EXP_DIR=exps/referit/RefTR_PT 6 | PY_ARGS=${@:1} 7 | 8 | conda activate pytorch 9 | which python 10 | 11 | python3.8 -u main_vg.py \ 12 | --resume './SAVED_MODEL/PT/RefTR_VG_PT_08.pth'\ 13 | --resume_model_only\ 14 | 
--num_feature_levels 1\ 15 | --num_queries_per_phrase 1\ 16 | --dec_layers 6\ 17 | --aux_loss \ 18 | --dataset referit\ 19 | --img_size 640\ 20 | --max_img_size 640\ 21 | --ckpt_cycle 90\ 22 | --epochs 90\ 23 | --lr_drop 60\ 24 | --output_dir ${EXP_DIR} \ 25 | ${PY_ARGS} 26 | 27 | # # --resume 28 | # python3.8 -u main_vg.py \ 29 | # --resume ${EXP_DIR}/checkpoint0069.pth\ 30 | # --num_feature_levels 1\ 31 | # --num_queries_per_phrase 1\ 32 | # --dec_layers 3\ 33 | # --dataset referit\ 34 | # --img_size 640\ 35 | # --max_img_size 640\ 36 | # --epochs 90\ 37 | # --lr_drop 60\ 38 | # --output_dir ${EXP_DIR} \ 39 | # ${PY_ARGS} -------------------------------------------------------------------------------- /datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | import torch.utils.data 11 | 12 | from .refer_multiphrase import build_flickr30k 13 | from .refer_segmentation import build_refcoco_segmentation 14 | from .refer_resc import build_flickr30k_resc, build_refcoco_resc, build_referit_resc, build_visualgenome, GeneralReferDataset 15 | 16 | 17 | def build_refer_dataset(image_set, args): 18 | if args.masks: 19 | return build_refcoco_segmentation( 20 | split=image_set, 21 | version=args.dataset, 22 | img_size=args.img_size, 23 | max_img_size=args.max_img_size, 24 | data_root="./data/refcoco/anns", 25 | im_dir="./data/refcoco/images/train2014", 26 | seg_dir="./data/refcoco/masks", 27 | bert_model=args.bert_model 28 | ) 29 | 30 | if args.dataset == 'flickr30k': 31 | # if args.reftr_type == 'transformer_single_phrase': 32 | # print("Using One stage grounding's flickr30k") 33 | # return build_flickr30k_resc( 34 | # split=image_set, 35 | # img_size=args.img_size, 36 | # max_img_size=args.max_img_size, 37 | # data_root="./data/annotations_resc", 38 | # im_dir="./data/flickr30k/f30k_images" 39 | # ) 40 | # else: 41 | return build_flickr30k( 42 | split=image_set, 43 | img_size=args.img_size, 44 | max_img_size=args.max_img_size, 45 | data_root="./data/annotations", 46 | im_dir="./data/flickr30k/f30k_images", 47 | bert_model=args.bert_model 48 | ) 49 | # print("Flicker Dataset size:", len(dataset_train)) 50 | elif args.dataset == 'referit': 51 | return build_referit_resc( 52 | split=image_set, 53 | data_root="./data/annotations_resc", 54 | max_query_len=40, 55 | img_size=args.img_size, 56 | max_img_size=args.max_img_size, 57 | bert_model=args.bert_model 58 | ) 59 | elif args.dataset.startswith('refcoco'): 60 | if args.dataset == 'refcoco_unc': 61 | version = 'unc' 62 | elif args.dataset == 'refcoco+_unc': 63 | version = 'unc+' 64 | elif args.dataset == 'refcocog_google': 65 | version = 'gref' 66 | elif args.dataset == 'refcocog_umd': 67 | version = 'gref_umd' 68 | return build_refcoco_resc( 69 | split=image_set, 70 | version=version, 71 | data_root="./data/annotations_resc", 72 | im_dir="./data/refcoco/images/train2014", 73 | max_query_len=40, 74 | img_size=args.img_size, 75 | max_img_size=args.max_img_size, 76 | 
bert_model=args.bert_model 77 | ) 78 | elif args.dataset == 'vg': 79 | if image_set != 'all': 80 | return build_referit_resc( 81 | split=image_set, 82 | data_root="./data/annotations_resc", 83 | max_query_len=40, 84 | img_size=args.img_size, 85 | max_img_size=args.max_img_size, 86 | bert_model=args.bert_model 87 | ) 88 | return build_visualgenome( 89 | split='all', 90 | data_root="./data/annotations_resc", 91 | im_dir="./data/visualgenome/VG_100K", 92 | max_query_len=40, 93 | img_size=args.img_size, 94 | max_img_size=args.max_img_size, 95 | bert_model=args.bert_model 96 | ) 97 | elif args.dataset == 'flickr30k_resc': 98 | return build_flickr30k_resc( 99 | split=image_set, 100 | img_size=args.img_size, 101 | max_img_size=args.max_img_size, 102 | max_query_len=40, 103 | data_root="./data/annotations_resc", 104 | im_dir="./data/flickr30k/f30k_images", 105 | bert_model=args.bert_model 106 | ) 107 | elif args.dataset == 'flickr30k_refcoco': 108 | f30k = build_flickr30k_resc( 109 | split=image_set, 110 | img_size=args.img_size, 111 | max_img_size=args.max_img_size, 112 | max_query_len=40, 113 | data_root="./data/annotations_resc", 114 | im_dir="./data/flickr30k/f30k_images", 115 | bert_model=args.bert_model 116 | ) 117 | refcoco = build_refcoco_resc( 118 | split='trainval', 119 | version='unc', 120 | max_query_len=40, 121 | img_size=args.img_size, 122 | max_img_size=args.max_img_size, 123 | data_root="./data/annotations_resc", 124 | im_dir="./data/refcoco/images/train2014", 125 | bert_model=args.bert_model 126 | ) 127 | if image_set.startswith('train'): 128 | return GeneralReferDataset(datasets=[f30k, refcoco]) 129 | else: 130 | return f30k 131 | else: 132 | raise NotImplementedError 133 | 134 | def build_refer_segmentaion_dataset(image_set, args): 135 | return build_refcoco_segmentation( 136 | split=image_set, version=args.dataset 137 | ) 138 | -------------------------------------------------------------------------------- /datasets/data_prefetcher.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | def to_cuda(samples, targets, device): 4 | samples = samples.to(device, non_blocking=True) 5 | targets = [{k: v.to(device, non_blocking=True) for k, v in t.items()} for t in targets] 6 | return samples, targets 7 | 8 | class data_prefetcher(): 9 | def __init__(self, loader, device, prefetch=True): 10 | self.loader = iter(loader) 11 | self.prefetch = prefetch 12 | self.device = device 13 | if prefetch: 14 | self.stream = torch.cuda.Stream() 15 | self.preload() 16 | 17 | def preload(self): 18 | try: 19 | self.next_samples, self.next_targets = next(self.loader) 20 | except StopIteration: 21 | self.next_samples = None 22 | self.next_targets = None 23 | return 24 | # if record_stream() doesn't work, another option is to make sure device inputs are created 25 | # on the main stream. 26 | # self.next_input_gpu = torch.empty_like(self.next_input, device='cuda') 27 | # self.next_target_gpu = torch.empty_like(self.next_target, device='cuda') 28 | # Need to make sure the memory allocated for next_* is not still in use by the main stream 29 | # at the time we start copying to next_*: 30 | # self.stream.wait_stream(torch.cuda.current_stream()) 31 | with torch.cuda.stream(self.stream): 32 | self.next_samples, self.next_targets = to_cuda(self.next_samples, self.next_targets, self.device) 33 | # more code for the alternative if record_stream() doesn't work: 34 | # copy_ will record the use of the pinned source tensor in this side stream. 
35 | # self.next_input_gpu.copy_(self.next_input, non_blocking=True) 36 | # self.next_target_gpu.copy_(self.next_target, non_blocking=True) 37 | # self.next_input = self.next_input_gpu 38 | # self.next_target = self.next_target_gpu 39 | 40 | # With Amp, it isn't necessary to manually convert data to half. 41 | # if args.fp16: 42 | # self.next_input = self.next_input.half() 43 | # else: 44 | 45 | def next(self): 46 | if self.prefetch: 47 | torch.cuda.current_stream().wait_stream(self.stream) 48 | samples = self.next_samples 49 | targets = self.next_targets 50 | if samples is not None: 51 | samples.record_stream(torch.cuda.current_stream()) 52 | if targets is not None: 53 | for t in targets: 54 | for k, v in t.items(): 55 | v.record_stream(torch.cuda.current_stream()) 56 | self.preload() 57 | else: 58 | try: 59 | samples, targets = next(self.loader) 60 | samples, targets = to_cuda(samples, targets, self.device) 61 | except StopIteration: 62 | samples = None 63 | targets = None 64 | return samples, targets 65 | -------------------------------------------------------------------------------- /datasets/grounding_datasets/__init__.py: -------------------------------------------------------------------------------- 1 | import re 2 | from .refer_dataset import FlickrMultiPhraseDataset, ReferSegDataset 3 | 4 | def _truncate_seq_pair(tokens_a, tokens_b, max_length): 5 | """Truncates a sequence pair in place to the maximum length.""" 6 | while True: 7 | total_length = len(tokens_a) + len(tokens_b) 8 | if total_length <= max_length: 9 | break 10 | if len(tokens_a) > len(tokens_b): 11 | tokens_a.pop() 12 | else: 13 | tokens_b.pop() 14 | 15 | def read_examples(input_line, unique_id): 16 | """ 17 | Note from JOJO: this should be copied from bert source code 18 | refer to: 19 | https://daiwk.github.io/posts/nlp-bert-code-annotated-application.html#inputexample 20 | for understanding 21 | Read a list of `InputExample`s from an input file.""" 22 | examples = [] 23 | # unique_id = 0 24 | line = input_line #reader.readline() 25 | # if not line: 26 | # break 27 | line = line.strip() 28 | text_a = None 29 | text_b = None 30 | m = re.match(r"^(.*) \|\|\| (.*)$", line) 31 | if m is None: 32 | text_a = line 33 | else: 34 | text_a = m.group(1) 35 | text_b = m.group(2) 36 | examples.append( 37 | InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b)) 38 | # unique_id += 1 39 | return examples 40 | 41 | ## Bert text encoding 42 | class InputExample(object): 43 | def __init__(self, unique_id, text_a, text_b): 44 | self.unique_id = unique_id 45 | self.text_a = text_a 46 | self.text_b = text_b 47 | 48 | class InputFeatures(object): 49 | """A single set of features of data.""" 50 | def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids): 51 | self.unique_id = unique_id 52 | self.tokens = tokens 53 | self.input_ids = input_ids 54 | self.input_mask = input_mask 55 | self.input_type_ids = input_type_ids 56 | 57 | def convert_examples_to_features(examples, seq_length, tokenizer): 58 | """Loads a data file into a list of `InputBatch`s.""" 59 | features = [] 60 | for (ex_index, example) in enumerate(examples): 61 | tokens_a = tokenizer.tokenize(example.text_a) 62 | 63 | tokens_b = None 64 | if example.text_b: 65 | tokens_b = tokenizer.tokenize(example.text_b) 66 | 67 | if tokens_b: 68 | # Modifies `tokens_a` and `tokens_b` in place so that the total 69 | # length is less than the specified length. 
70 | # Account for [CLS], [SEP], [SEP] with "- 3" 71 | _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3) 72 | else: 73 | # Account for [CLS] and [SEP] with "- 2" 74 | if len(tokens_a) > seq_length - 2: 75 | tokens_a = tokens_a[0:(seq_length - 2)] 76 | tokens = [] 77 | input_type_ids = [] 78 | tokens.append("[CLS]") 79 | input_type_ids.append(0) 80 | for token in tokens_a: 81 | tokens.append(token) 82 | input_type_ids.append(0) 83 | tokens.append("[SEP]") 84 | input_type_ids.append(0) 85 | 86 | if tokens_b: 87 | for token in tokens_b: 88 | tokens.append(token) 89 | input_type_ids.append(1) 90 | tokens.append("[SEP]") 91 | input_type_ids.append(1) 92 | 93 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 94 | 95 | # The mask has 1 for real tokens and 0 for padding tokens. Only real 96 | # tokens are attended to. 97 | input_mask = [1] * len(input_ids) 98 | 99 | # Zero-pad up to the sequence length. 100 | while len(input_ids) < seq_length: 101 | input_ids.append(0) 102 | input_mask.append(0) 103 | input_type_ids.append(0) 104 | 105 | assert len(input_ids) == seq_length 106 | assert len(input_mask) == seq_length 107 | assert len(input_type_ids) == seq_length 108 | features.append( 109 | InputFeatures( 110 | unique_id=example.unique_id, 111 | tokens=tokens, 112 | input_ids=input_ids, 113 | input_mask=input_mask, 114 | input_type_ids=input_type_ids)) 115 | return features -------------------------------------------------------------------------------- /datasets/grounding_datasets/refer_dataset.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Copied from https://github.com/zyang-ur/ReSC/blob/e4022f87bfd11200b67c4509bb9746640834ceae/utils/transforms.py 5 | 6 | ReferIt, UNC, UNC+ and GRef referring image segmentation PyTorch dataset. 7 | Define and group batches of images, segmentations and queries. 
8 | Based on: 9 | https://github.com/chenxi116/TF-phrasecut-public/blob/master/build_batches.py 10 | """ 11 | 12 | import os 13 | import sys 14 | import cv2 15 | import json 16 | import uuid 17 | import tqdm 18 | import math 19 | import torch 20 | import random 21 | import argparse 22 | import collections 23 | import logging 24 | import re 25 | import operator 26 | # import h5py 27 | import numpy as np 28 | import os.path as osp 29 | import scipy.io as sio 30 | import torch.utils.data as data 31 | from collections import OrderedDict 32 | sys.path.append('.') 33 | # import util 34 | from util.word_utils import Corpus 35 | 36 | from transformers import BertTokenizerFast, RobertaTokenizerFast 37 | from util.transforms import letterbox, random_affine 38 | from datasets.lang_utils import convert_examples_to_features, read_examples 39 | # sys.modules['utils'] = utils 40 | 41 | cv2.setNumThreads(0) 42 | 43 | def build_bert_tokenizer(bert_model): 44 | if bert_model.split('-')[0] == 'roberta': 45 | lang_backbone = RobertaTokenizerFast.from_pretrained(bert_model, do_lower_case=True, do_basic_tokenize=False) 46 | else: 47 | lang_backbone = BertTokenizerFast.from_pretrained(bert_model, do_lower_case=True, do_basic_tokenize=False) 48 | return lang_backbone 49 | 50 | class DatasetNotFoundError(Exception): 51 | pass 52 | 53 | class FlickrMultiPhraseDataset(data.Dataset): 54 | SUPPORTED_DATASETS = { 55 | 'flickr': {'splits': ('train', 'val', 'test', 'trainval')} 56 | } 57 | 58 | def __init__( 59 | self, data_root, im_dir, dataset='referit', split='train', max_seq_len=88, 60 | max_num_phrases=16, max_phrase_len=22, bert_model='bert-base-uncased', lstm=False): 61 | self.images = [] 62 | self.data_root = data_root 63 | self.im_dir = im_dir 64 | self.dataset = dataset 65 | self.seq_len = max_seq_len 66 | self.phrase_seq_len = max_phrase_len 67 | self.num_phrases = max_num_phrases 68 | self.split = split 69 | 70 | print("Using tokenizer from:", bert_model) 71 | self.tokenizer = build_bert_tokenizer(bert_model) 72 | # self.tokenizer.add_special_tokens({'cls_phrase': '[CLS_P]', 'sperator_phrase': '[SEP_P]'}) 73 | 74 | annotation_path = osp.join(data_root, self.dataset) 75 | valid_splits = self.SUPPORTED_DATASETS[self.dataset]['splits'] 76 | 77 | if split not in valid_splits: 78 | raise ValueError( 79 | 'Dataset {0} does not have split {1}'.format( 80 | self.dataset, split)) 81 | 82 | splits = ['train', 'val'] if split == 'trainval' else [split] 83 | for split in splits: 84 | imgset_file = '{0}_{1}.pth'.format(self.dataset, split) 85 | imgset_path = osp.join(annotation_path, imgset_file) 86 | self.images += torch.load(imgset_path) 87 | 88 | def exists_dataset(self): 89 | return osp.exists(osp.join(self.data_root, self.dataset)) 90 | 91 | def pull_item(self, idx): 92 | if self.dataset == 'flickr': 93 | img_file, phrase_pos, bbox, phrases, _, sentence = self.images[idx] 94 | else: 95 | img_file, _, bbox, phrase, sentence = self.images[idx] 96 | phrases = [sentence] 97 | phrase_pos = [0] 98 | ## box format: to x1y1x2y2 99 | bbox = np.array(bbox, dtype=int) 100 | 101 | img_path = osp.join(self.im_dir, img_file) 102 | img = cv2.imread(img_path) 103 | ## duplicate channel if gray image 104 | if img.shape[-1] > 1: 105 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 106 | else: 107 | img = np.stack([img] * 3) 108 | return img, phrases, phrase_pos, sentence, bbox, img_file 109 | 110 | def __len__(self): 111 | return len(self.images) 112 | 113 | def __getitem__(self, idx): 114 | def phrase_pos_to_mask(pos_start, 
sentence_token, phrase_token, seq_len): 115 | phrase_len = len(phrase_token) - 2 116 | pos_start = pos_start + 1 117 | assert phrase_len >= 0 118 | assert sentence_token[pos_start:phrase_len+pos_start] == phrase_token[1:-1] 119 | 120 | mask = np.zeros(seq_len, dtype=np.bool) 121 | if phrase_len == 0: 122 | mask[0] = True 123 | else: 124 | mask[pos_start:phrase_len+pos_start] = True 125 | return mask 126 | 127 | img, phrases, phrase_char_pos_l, sentence, bbox, img_file = self.pull_item(idx) 128 | # phrase = phrase.decode("utf-8").encode().lower() 129 | 130 | # encode phrase to bert input 131 | tokenized_sentence = self.tokenizer( 132 | sentence, 133 | padding='max_length', 134 | max_length=self.seq_len, 135 | return_tensors='pt', 136 | ) 137 | word_id = tokenized_sentence['input_ids'][0] 138 | word_mask = tokenized_sentence['attention_mask'][0] 139 | 140 | # examples = read_examples(sentence, idx) 141 | # sentence_features = convert_examples_to_features( 142 | # examples=examples, seq_length=self.seq_len, tokenizer=self.tokenizer) 143 | # word_id = sentence_features[0].input_ids 144 | # word_mask = sentence_features[0].input_mask 145 | 146 | phrase_masks = [] 147 | phrase_context_masks = [] 148 | tokenized_phrases = [] 149 | phrase_pos_l = [] 150 | phrase_pos_r = [] 151 | for p, char_pos_l in zip(phrases, phrase_char_pos_l): 152 | tokenized_phrase = self.tokenizer( 153 | p, 154 | padding='max_length', 155 | max_length=self.phrase_seq_len, 156 | return_tensors='np', 157 | ) 158 | tokenized_phrases.append(tokenized_phrase['input_ids'][0]) 159 | phrase_masks.append(tokenized_phrase['attention_mask'][0]) 160 | 161 | # set up phrase_pos 162 | phrase_char_len = p.__len__() 163 | pos_l = tokenized_sentence.char_to_token(char_pos_l) 164 | pos_r = tokenized_sentence.char_to_token(char_pos_l + phrase_char_len - 1) 165 | assert pos_l is not None and pos_r is not None 166 | # Tips for roberta: Ġ means the end of a new token 167 | # So assert from the second character 168 | # assert tokenized_sentence.tokens()[pos_l+1:pos_r] == tokenized_phrase.tokens()[2:1+pos_r-pos_l],\ 169 | # (tokenized_sentence.tokens()[pos_l:pos_r], tokenized_phrase.tokens(), pos_l, pos_r) 170 | phrase_pos_l.append(pos_l) 171 | phrase_pos_r.append(pos_r+1) 172 | 173 | for i in range(len(phrases), self.num_phrases): 174 | tokenized_phrase = self.tokenizer( 175 | "", 176 | padding='max_length', 177 | max_length=self.phrase_seq_len, 178 | return_tensors='np', 179 | ) 180 | tokenized_phrases.append(tokenized_phrase['input_ids'][0]) 181 | phrase_masks.append(tokenized_phrase['attention_mask'][0]) 182 | phrase_pos_l.append(0) 183 | phrase_pos_r.append(1) 184 | 185 | h, w, c = img.shape 186 | samples = { 187 | "img": img, 188 | "sentence": np.array(word_id, dtype=int), 189 | "sentence_mask": np.array(word_mask, dtype=bool), 190 | "phrase": np.array(tokenized_phrases, dtype=int), 191 | "phrase_mask": np.array(phrase_masks, dtype=bool), 192 | "phrase_pos_l": np.array(phrase_pos_l, dtype=int), 193 | "phrase_pos_r": np.array(phrase_pos_r, dtype=int) 194 | } 195 | 196 | image_id = int(img_file.split('.')[0].split('_')[-1]) 197 | target = { 198 | "image_id": image_id, 199 | "boxes": np.array(bbox, dtype=np.float32), 200 | "labels": [0], 201 | 'dataset_id': idx, 202 | "orig_size": np.array([h, w], dtype=np.int) 203 | } 204 | return samples, target 205 | # if self.testmode: 206 | # return img, np.array(word_id, dtype=int), np.array(word_mask, dtype=int), \ 207 | # np.array(bbox, dtype=np.float32), np.array(ratio, dtype=np.float32), \ 208 | # 
np.array(dw, dtype=np.float32), np.array(dh, dtype=np.float32), self.images[idx][0] 209 | # else: 210 | # return img, np.array(word_id, dtype=int), np.array(word_mask, dtype=int), \ 211 | # np.array(bbox, dtype=np.float32) 212 | 213 | class ReferSegDataset(data.Dataset): 214 | SUPPORTED_DATASETS = { 215 | 'refcoco_unc': { 216 | 'splits': ('train', 'val', 'trainval', 'testA', 'testB'), 217 | 'params': {'dataset': 'refcoco', 'split_by': 'unc'} 218 | }, 219 | 'refcoco+_unc': { 220 | 'splits': ('train', 'val', 'trainval', 'testA', 'testB'), 221 | 'params': {'dataset': 'refcoco+', 'split_by': 'unc'} 222 | }, 223 | 'refcocog_google': { 224 | 'splits': ('train', 'val'), 225 | 'params': {'dataset': 'refcocog', 'split_by': 'google'} 226 | }, 227 | 'refcocog_umd': { 228 | 'splits': ('train', 'val', 'test'), 229 | 'params': {'dataset': 'refcocog', 'split_by': 'umd'} 230 | } 231 | } 232 | 233 | def __init__(self, data_root, im_dir, seg_dir, dataset='refcoco_unc', 234 | split='train', max_query_len=40, bert_model='bert-base-uncased'): 235 | self.images = [] 236 | self.data_root = data_root 237 | self.im_dir = im_dir 238 | self.dataset = dataset 239 | self.query_len = max_query_len 240 | self.split = split 241 | self.tokenizer = build_bert_tokenizer(bert_model) 242 | 243 | dataset_dir = self.dataset.split('_')[0] 244 | annotation_path = osp.join(data_root, dataset_dir) 245 | self.seg_dir = osp.join(seg_dir, dataset_dir) 246 | 247 | valid_splits = self.SUPPORTED_DATASETS[self.dataset]['splits'] 248 | if split not in valid_splits: 249 | raise ValueError( 250 | 'Dataset {0} does not have split {1}'.format( 251 | self.dataset, split)) 252 | 253 | splits = [split] 254 | splits = ['train', 'val'] if split == 'trainval' else [split] 255 | for split in splits: 256 | imgset_file = '{0}_{1}.pth'.format(self.dataset, split) 257 | imgset_path = osp.join(annotation_path, imgset_file) 258 | self.images += torch.load(imgset_path) 259 | 260 | def pull_item(self, idx): 261 | img_file, seg_file, bbox, phrase = self.images[idx] 262 | ## box format: x1y1x2y2 263 | bbox = np.array(bbox, dtype=int) 264 | img = cv2.imread(osp.join(self.im_dir, img_file)) 265 | mask = np.load(osp.join(self.seg_dir, seg_file)) 266 | assert img.shape[:2] == mask.shape[:2] 267 | ## duplicate channel if gray image 268 | if img.shape[-1] > 1: 269 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 270 | else: 271 | img = np.stack([img] * 3) 272 | return img, mask, phrase, bbox, img_file 273 | 274 | def tokenize_phrase(self, phrase): 275 | return self.corpus.tokenize(phrase, self.query_len) 276 | 277 | def untokenize_word_vector(self, words): 278 | return self.corpus.dictionary[words] 279 | 280 | def __len__(self): 281 | return len(self.images) 282 | 283 | def __getitem__(self, idx): 284 | img, mask, phrase, bbox, img_file = self.pull_item(idx) 285 | # phrase = phrase.decode("utf-8").encode().lower() 286 | phrase = phrase.lower() 287 | 288 | # encode phrase to bert input 289 | tokenized_sentence = self.tokenizer( 290 | phrase, 291 | padding='max_length', 292 | max_length=self.query_len, 293 | truncation=True, 294 | return_tensors='pt', 295 | ) 296 | word_id = tokenized_sentence['input_ids'][0] 297 | word_mask = tokenized_sentence['attention_mask'][0] 298 | 299 | h, w, c = img.shape 300 | 301 | samples = { 302 | "img": img, 303 | "sentence": np.array(word_id, dtype=int), 304 | "sentence_mask": np.array(word_mask, dtype=int) 305 | } 306 | 307 | mask = mask[None, :, :] 308 | image_id = int(img_file.split('.')[0].split('_')[-1]) 309 | target = { 310 | 
"image_id": image_id, 311 | 'dataset_id': idx, 312 | "boxes": np.array([bbox], dtype=np.float32), 313 | "labels": [0], 314 | "masks": mask, 315 | "orig_size": np.array([h, w], dtype=np.int) 316 | } 317 | return samples, target 318 | 319 | -------------------------------------------------------------------------------- /datasets/grounding_datasets/resc_refer_dataset.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Copied from https://github.com/zyang-ur/ReSC/blob/e4022f87bfd11200b67c4509bb9746640834ceae/utils/transforms.py 5 | 6 | ReferIt, UNC, UNC+ and GRef referring image segmentation PyTorch dataset. 7 | Define and group batches of images, segmentations and queries. 8 | Based on: 9 | https://github.com/chenxi116/TF-phrasecut-public/blob/master/build_batches.py 10 | """ 11 | 12 | import os 13 | import sys 14 | import cv2 15 | import json 16 | import uuid 17 | import tqdm 18 | import math 19 | import torch 20 | import random 21 | # import h5py 22 | import numpy as np 23 | import os.path as osp 24 | import scipy.io as sio 25 | import torch.utils.data as data 26 | from collections import OrderedDict 27 | sys.path.append('.') 28 | import operator 29 | # import util 30 | from util.word_utils import Corpus 31 | 32 | import argparse 33 | import logging 34 | import re 35 | 36 | 37 | from transformers import BertTokenizerFast, RobertaTokenizerFast 38 | # from transformers import BertTokenizer,BertModel 39 | from util.transforms import letterbox, random_affine 40 | from datasets.lang_utils import convert_examples_to_features, read_examples 41 | # sys.modules['utils'] = utils 42 | 43 | def build_bert_tokenizer(bert_model): 44 | if bert_model.split('-')[0] == 'roberta': 45 | lang_backbone = RobertaTokenizerFast.from_pretrained(bert_model, do_lower_case=True, do_basic_tokenize=False) 46 | else: 47 | lang_backbone = BertTokenizerFast.from_pretrained(bert_model, do_lower_case=True, do_basic_tokenize=False) 48 | return lang_backbone 49 | 50 | cv2.setNumThreads(0) 51 | 52 | 53 | class DatasetNotFoundError(Exception): 54 | pass 55 | 56 | 57 | class ReferDataset(data.Dataset): 58 | SUPPORTED_DATASETS = { 59 | 'referit': {'splits': ('train', 'val', 'trainval', 'test')}, 60 | 'unc': { 61 | 'splits': ('train', 'val', 'trainval', 'testA', 'testB'), 62 | 'params': {'dataset': 'refcoco', 'split_by': 'unc'} 63 | }, 64 | 'unc+': { 65 | 'splits': ('train', 'val', 'trainval', 'testA', 'testB'), 66 | 'params': {'dataset': 'refcoco+', 'split_by': 'unc'} 67 | }, 68 | 'gref': { 69 | 'splits': ('train', 'val'), 70 | 'params': {'dataset': 'refcocog', 'split_by': 'google'} 71 | }, 72 | 'gref_umd': { 73 | 'splits': ('train', 'val', 'test'), 74 | 'params': {'dataset': 'refcocog', 'split_by': 'umd'} 75 | }, 76 | 'flickr': {'splits': ('train', 'val', 'trainval', 'test')}, 77 | 'vg': {'splits': ('all')} 78 | } 79 | 80 | def __init__(self, data_root, im_dir, dataset='referit', 81 | split='train', max_query_len=128, lstm=False, bert_model='bert-base-uncased'): 82 | self.images = [] 83 | self.data_root = data_root 84 | self.im_dir = im_dir 85 | self.dataset = dataset 86 | self.query_len = max_query_len 87 | self.lstm = lstm 88 | self.split = split 89 | self.tokenizer = build_bert_tokenizer(bert_model) 90 | 91 | if not self.exists_dataset(): 92 | # self.process_dataset() 93 | print('Please download index cache to data folder: \n \ 94 | https://drive.google.com/open?id=1cZI562MABLtAzM6YU4WmKPFFguuVr0lZ') 95 | exit(0) 96 | 97 | annotation_path = 
osp.join(data_root, self.dataset) 98 | valid_splits = self.SUPPORTED_DATASETS[self.dataset]['splits'] 99 | 100 | if self.lstm: 101 | self.corpus = Corpus() 102 | corpus_path = osp.join(annotation_path, 'corpus.pth') 103 | self.corpus = torch.load(corpus_path) 104 | 105 | if split not in valid_splits: 106 | raise ValueError( 107 | 'Dataset {0} does not have split {1}'.format( 108 | self.dataset, split)) 109 | 110 | splits = [split] 111 | if self.dataset != 'referit': 112 | splits = ['train', 'val'] if split == 'trainval' else [split] 113 | for split in splits: 114 | imgset_file = '{0}_{1}.pth'.format(self.dataset, split) 115 | imgset_path = osp.join(annotation_path, imgset_file) 116 | self.images += torch.load(imgset_path) 117 | 118 | def exists_dataset(self): 119 | return osp.exists(osp.join(self.data_root, self.dataset)) 120 | 121 | def pull_item(self, idx): 122 | if self.dataset in ['flickr', 'vg']: 123 | img_file, bbox, phrase = self.images[idx] 124 | else: 125 | img_file, _, bbox, phrase, attri = self.images[idx] 126 | ## box format: to x1y1x2y2 127 | if not (self.dataset == 'referit' or self.dataset == 'flickr'): 128 | bbox = np.array(bbox, dtype=int) 129 | bbox[2], bbox[3] = bbox[0]+bbox[2], bbox[1]+bbox[3] 130 | else: 131 | bbox = np.array(bbox, dtype=int) 132 | 133 | img_path = osp.join(self.im_dir, img_file) 134 | img = cv2.imread(img_path) 135 | ## duplicate channel if gray image 136 | if img.shape[-1] > 1: 137 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 138 | else: 139 | img = np.stack([img] * 3) 140 | return img, phrase, bbox, img_file 141 | 142 | def tokenize_phrase(self, phrase): 143 | return self.corpus.tokenize(phrase, self.query_len) 144 | 145 | def untokenize_word_vector(self, words): 146 | return self.corpus.dictionary[words] 147 | 148 | def __len__(self): 149 | return len(self.images) 150 | 151 | def __getitem__(self, idx): 152 | img, phrase, bbox, img_file = self.pull_item(idx) 153 | # phrase = phrase.decode("utf-8").encode().lower() 154 | phrase = phrase.lower() 155 | 156 | # encode phrase to bert input 157 | # Enable truncation in this case 158 | tokenized_sentence = self.tokenizer( 159 | phrase, 160 | padding='max_length', 161 | max_length=self.query_len, 162 | truncation=True, 163 | return_tensors='pt', 164 | ) 165 | word_id = tokenized_sentence['input_ids'][0] 166 | word_mask = tokenized_sentence['attention_mask'][0] 167 | 168 | h, w, c = img.shape 169 | 170 | samples = { 171 | "img": img, 172 | "sentence": np.array(word_id, dtype=int), 173 | "sentence_mask": np.array(word_mask, dtype=int) 174 | } 175 | 176 | image_id = int(img_file.split('.')[0].split('_')[-1]) 177 | target = { 178 | "image_id": image_id, 179 | "boxes": np.array([bbox], dtype=np.float32), 180 | "labels": [0], 181 | 'dataset_id': idx, 182 | "orig_size": np.array([h, w], dtype=np.int) 183 | } 184 | return samples, target -------------------------------------------------------------------------------- /datasets/lang_utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | def _truncate_seq_pair(tokens_a, tokens_b, max_length): 4 | """Truncates a sequence pair in place to the maximum length.""" 5 | while True: 6 | total_length = len(tokens_a) + len(tokens_b) 7 | if total_length <= max_length: 8 | break 9 | if len(tokens_a) > len(tokens_b): 10 | tokens_a.pop() 11 | else: 12 | tokens_b.pop() 13 | 14 | def read_examples(input_line, unique_id): 15 | """ 16 | Note from JOJO: this is copied from bert source code 17 | refer to: 18 | 
https://daiwk.github.io/posts/nlp-bert-code-annotated-application.html#inputexample 19 | for understanding 20 | Read a list of `InputExample`s from an input file.""" 21 | examples = [] 22 | # unique_id = 0 23 | line = input_line #reader.readline() 24 | # if not line: 25 | # break 26 | line = line.strip() 27 | text_a = None 28 | text_b = None 29 | m = re.match(r"^(.*) \|\|\| (.*)$", line) 30 | if m is None: 31 | text_a = line 32 | else: 33 | text_a = m.group(1) 34 | text_b = m.group(2) 35 | examples.append( 36 | InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b)) 37 | # unique_id += 1 38 | return examples 39 | 40 | ## Bert text encoding 41 | class InputExample(object): 42 | def __init__(self, unique_id, text_a, text_b): 43 | self.unique_id = unique_id 44 | self.text_a = text_a 45 | self.text_b = text_b 46 | 47 | class InputFeatures(object): 48 | """A single set of features of data.""" 49 | def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids): 50 | self.unique_id = unique_id 51 | self.tokens = tokens 52 | self.input_ids = input_ids 53 | self.input_mask = input_mask 54 | self.input_type_ids = input_type_ids 55 | 56 | def convert_examples_to_features(examples, seq_length, tokenizer): 57 | """Loads a data file into a list of `InputBatch`s.""" 58 | features = [] 59 | for (ex_index, example) in enumerate(examples): 60 | tokens_a = tokenizer.tokenize(example.text_a) 61 | 62 | tokens_b = None 63 | if example.text_b: 64 | tokens_b = tokenizer.tokenize(example.text_b) 65 | 66 | if tokens_b: 67 | # Modifies `tokens_a` and `tokens_b` in place so that the total 68 | # length is less than the specified length. 69 | # Account for [CLS], [SEP], [SEP] with "- 3" 70 | _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3) 71 | else: 72 | # Account for [CLS] and [SEP] with "- 2" 73 | if len(tokens_a) > seq_length - 2: 74 | tokens_a = tokens_a[0:(seq_length - 2)] 75 | tokens = [] 76 | input_type_ids = [] 77 | tokens.append("[CLS]") 78 | input_type_ids.append(0) 79 | for token in tokens_a: 80 | tokens.append(token) 81 | input_type_ids.append(0) 82 | tokens.append("[SEP]") 83 | input_type_ids.append(0) 84 | 85 | if tokens_b: 86 | for token in tokens_b: 87 | tokens.append(token) 88 | input_type_ids.append(1) 89 | tokens.append("[SEP]") 90 | input_type_ids.append(1) 91 | 92 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 93 | 94 | # The mask has 1 for real tokens and 0 for padding tokens. Only real 95 | # tokens are attended to. 96 | input_mask = [1] * len(input_ids) 97 | 98 | # Zero-pad up to the sequence length. 
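        # Padding keeps every feature at a fixed length of `seq_length`, with
        # `input_mask` marking which positions hold real tokens. A rough sketch for
        # seq_length = 8 and a single-phrase query (the word-piece ids are
        # illustrative placeholders, except [CLS]=101 and [SEP]=102, which are
        # bert-base-uncased's special-token ids; [PAD] is id 0):
        #   tokens:         [CLS]  left   guy   [SEP]  (pad)  (pad)  (pad)  (pad)
        #   input_ids:      [101,  xxxx,  xxxx, 102,   0,     0,     0,     0]
        #   input_mask:     [1,    1,     1,    1,     0,     0,     0,     0]
        #   input_type_ids: [0,    0,     0,    0,     0,     0,     0,     0]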
99 | while len(input_ids) < seq_length: 100 | input_ids.append(0) 101 | input_mask.append(0) 102 | input_type_ids.append(0) 103 | 104 | assert len(input_ids) == seq_length 105 | assert len(input_mask) == seq_length 106 | assert len(input_type_ids) == seq_length 107 | features.append( 108 | InputFeatures( 109 | unique_id=example.unique_id, 110 | tokens=tokens, 111 | input_ids=input_ids, 112 | input_mask=input_mask, 113 | input_type_ids=input_type_ids)) 114 | return features -------------------------------------------------------------------------------- /datasets/refer_multiphrase.py: -------------------------------------------------------------------------------- 1 | # Builder for visual grouding datasets 2 | from .grounding_datasets import FlickrMultiPhraseDataset 3 | from PIL import Image 4 | import datasets.transforms as T 5 | import torch 6 | 7 | class flickr30k(FlickrMultiPhraseDataset): 8 | def __init__(self, data_root, im_dir, split, transforms, 9 | max_seq_len=90, bert_model='bert-base-uncased', lstm=False): 10 | super(flickr30k, self).__init__( 11 | data_root=data_root, 12 | im_dir=im_dir, 13 | dataset='flickr', 14 | split=split, 15 | max_seq_len=max_seq_len, 16 | lstm=lstm, 17 | bert_model=bert_model 18 | ) 19 | self._transforms = transforms 20 | 21 | def __getitem__(self, idx): 22 | input_sample, target = super(flickr30k, self).__getitem__(idx) 23 | target = {k: torch.as_tensor(v) for k, v in target.items()} 24 | # target['boxes'] = torch.as_tensor(target['boxes']) 25 | img = Image.fromarray(input_sample["img"]) 26 | if self._transforms is not None: 27 | img, target = self._transforms(img, target) 28 | input_sample["img"] = img 29 | return input_sample, target 30 | 31 | 32 | def make_refer_transforms(img_size=224 ,max_img_size=1333 ,test=False): 33 | normalize = T.Compose([ 34 | T.ToTensor(), 35 | T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) 36 | ]) 37 | 38 | if not test: 39 | return T.Compose([ 40 | # T.RandomHorizontalFlip(), 41 | T.RandomIntensitySaturation(), 42 | T.RandomResize([img_size], max_size=max_img_size), 43 | # T.RandomAffineTransform(degrees=(-5,5), translate=(0.1, 0.1), 44 | # scale=(0.9, 1.1)), 45 | normalize 46 | ]) 47 | else: 48 | return T.Compose([ 49 | T.RandomResize([img_size], max_size=max_img_size), 50 | normalize 51 | ]) 52 | 53 | 54 | def build_flickr30k(split='train', 55 | data_root="./data/annotations", 56 | im_dir="./data/flickr30k/f30k_images", 57 | bert_model='bert-base-uncased', 58 | img_size=224, 59 | max_img_size=1333): 60 | istest = split != 'train' 61 | return flickr30k( 62 | data_root=data_root, 63 | im_dir=im_dir, 64 | transforms=make_refer_transforms(img_size, max_img_size, test=istest), 65 | split=split, 66 | bert_model=bert_model 67 | ) 68 | 69 | 70 | if __name__ == "__main__": 71 | # comment out normalize in make_refer_transforms when testing 72 | from PIL import Image, ImageDraw 73 | # flickr 74 | d_train = build_flickr30k(split='train', bert_model="./configs/VinVL_VQA_base") 75 | d_val = build_flickr30k(split='val', bert_model="./configs/VinVL_VQA_base") 76 | d_test = build_flickr30k(split='test', bert_model="./configs/VinVL_VQA_base") 77 | print(f"flickr30k datasets have : {len(d_train)} Training samples") 78 | print(f"flickr30k datasets have : {len(d_test)} Testing samples") 79 | for i in range(0, 200, 50): 80 | samples, target = d_train[i] 81 | img = samples['img'] 82 | img1 = ImageDraw.Draw(img) 83 | print(img) 84 | print(target['boxes']) 85 | img1.rectangle(target['boxes'][0].numpy().tolist(), outline='red') 86 | # if 
target['boxes'].shape[0] > 1: 87 | img1.rectangle(target['boxes'][1].numpy().tolist(), outline='blue') 88 | img.save(f"./exps/flickr_train_sample{i}.jpg") 89 | 90 | # # refcoco 91 | # d_train = build_refcoco(split='trainval', version='refcoco') 92 | # d_testA = build_refcoco(split='testA', version='refcoco') 93 | # d_testB = build_refcoco(split='testB', version='refcoco') 94 | # print(f"Refcoco datasets have : {len(d_train)} Training samples") 95 | # print(f"Refcoco datasets have : {len(d_testA)}/{len(d_testB)} Testing samples") 96 | # for i in range(0, 200, 50): 97 | # samples, target = d_train[i] 98 | # img = samples['img'] 99 | # img1 = ImageDraw.Draw(img) 100 | # img1.rectangle(target['boxes'][0].numpy().tolist(), outline='red') 101 | # img.save(f"./exps/refcoco_train_sample{i}.jpg") 102 | 103 | # # refcoco 104 | # d_train = build_refcoco(split='trainval', version='refcoco+') 105 | # d_testA = build_refcoco(split='testA', version='refcoco+') 106 | # d_testB = build_refcoco(split='testB', version='refcoco+') 107 | # print(f"Refcoco+ datasets have : {len(d_train)} Training samples") 108 | # print(f"Refcoco+ datasets have : {len(d_testA)}/{len(d_testB)} Testing samples") 109 | # for i in range(0, 200, 50): 110 | # samples, target = d_train[i] 111 | # img = samples['img'] 112 | # img1 = ImageDraw.Draw(img) 113 | # img1.rectangle(target['boxes'][0].numpy().tolist(), outline='red') 114 | # img.save(f"./exps/refcoco+_train_sample{i}.jpg") 115 | 116 | # # referit 117 | # d_train = build_referit(split='trainval') 118 | # d_test = build_referit(split='test') 119 | # print(f"ReferIt datasets have : {len(d_train)} Training samples") 120 | # print(f"ReferIt datasets have : {len(d_test)} Testing samples") 121 | # for i in range(0, 200, 50): 122 | # samples, target = d_train[i] 123 | # img = samples['img'] 124 | # img1 = ImageDraw.Draw(img) 125 | # img1.rectangle(target['boxes'][0].numpy().tolist(), outline='red') 126 | # img.save(f"./exps/referit_train_sample{i}.jpg") 127 | 128 | # class GeneralReferDataset(torch.utils.data.Dataset): 129 | # """ 130 | # A collection of datasets. 
131 | # """ 132 | # def __init__(self, datasets): 133 | # super(GeneralReferDataset, self).__init__() 134 | # self.datasets = datasets 135 | # self.dataset_len = [len(x) for x in datasets] 136 | 137 | # def __getitem__(self, idx): 138 | # for i, dataset in enumerate(self.datasets): 139 | # if idx >= self.dataset_len[i]: 140 | # idx = idx - self.dataset_len[i] 141 | # else: 142 | # return dataset.__getitem__(idx) 143 | 144 | # def __len__(self): 145 | # return sum(self.dataset_len) 146 | 147 | # class RefCOCO(ReferDataset): 148 | # def __init__(self, data_root, im_dir, split, transforms, version="unc", 149 | # max_query_len=128, lstm=False, bert_model='bert-base-uncased'): 150 | # super(RefCOCO, self).__init__( 151 | # data_root=data_root, 152 | # im_dir=im_dir, 153 | # dataset=version, 154 | # split=split, 155 | # max_query_len=max_query_len, 156 | # lstm=lstm, 157 | # bert_model=bert_model 158 | # ) 159 | # self._transforms = transforms 160 | 161 | # def __getitem__(self, idx): 162 | # input_sample, target = super(RefCOCO, self).__getitem__(idx) 163 | # target = {k: torch.as_tensor(v) for k, v in target.items()} 164 | # # target['boxes'] = torch.as_tensor(target['boxes']) 165 | # img = Image.fromarray(input_sample["img"]) 166 | # if self._transforms is not None: 167 | # img, target = self._transforms(img, target) 168 | # input_sample["img"] = img 169 | # return input_sample, target 170 | 171 | 172 | # class ReferIt(ReferDataset): 173 | # def __init__(self, data_root, im_dir, split, transforms, 174 | # max_query_len=128, lstm=False, bert_model='bert-base-uncased'): 175 | # super(ReferIt, self).__init__( 176 | # data_root=data_root, 177 | # im_dir=im_dir, 178 | # dataset='referit', 179 | # split=split, 180 | # max_query_len=max_query_len, 181 | # lstm=lstm, 182 | # bert_model=bert_model 183 | # ) 184 | # self._transforms = transforms 185 | 186 | # def __getitem__(self, idx): 187 | # input_sample, target = super(ReferIt, self).__getitem__(idx) 188 | # target = {k: torch.as_tensor(v) for k, v in target.items()} 189 | # # target['boxes'] = torch.as_tensor(target['boxes']) 190 | # img = Image.fromarray(input_sample["img"]) 191 | # if self._transforms is not None: 192 | # img, target = self._transforms(img, target) 193 | # input_sample["img"] = img 194 | # return input_sample, target 195 | # def build_referit(split='train', 196 | # data_root="./data/annotations", 197 | # im_dir="./data/referit/images"): 198 | # istest = split != 'train' 199 | # return ReferIt( 200 | # data_root=data_root, 201 | # im_dir=im_dir, 202 | # transforms=make_refer_transforms(test=istest), 203 | # split=split, 204 | # lstm=False 205 | # ) 206 | 207 | 208 | # def build_refcoco(split='train', 209 | # version='refcoco', 210 | # data_root="./data/annotations", 211 | # im_dir="./data/refcoco/train2014"): 212 | # istest = split != 'train' 213 | # if version == 'refcoco': 214 | # version = 'unc' 215 | # elif version == 'refcoco+': 216 | # version = 'unc+' 217 | # elif version == 'refcocog': 218 | # version = 'gref' 219 | # else: 220 | # raise NotImplementedError 221 | 222 | # return RefCOCO( 223 | # data_root=data_root, 224 | # im_dir=im_dir, 225 | # version=version, 226 | # transforms=make_refer_transforms(test=istest), 227 | # split=split, 228 | # lstm=False 229 | # ) 230 | 231 | 232 | # def build_refer_collections(): 233 | # flickr30k_d = build_flickr30k(split='train') 234 | # refcoco_d = build_refcoco(split='trainval', version='refcoco') 235 | # refcocop_d = build_refcoco(split='trainval', version='refcoco+') 236 | # 
referit = build_referit(split='trainval') 237 | # return GeneralReferDataset(datasets=[flickr30k_d, refcoco_d, refcocop_d, referit]) -------------------------------------------------------------------------------- /datasets/refer_resc.py: -------------------------------------------------------------------------------- 1 | # Builder for visual grouding datasets 2 | from datasets.grounding_datasets.resc_refer_dataset import ReferDataset 3 | from PIL import Image 4 | import datasets.transforms as T 5 | import torch 6 | 7 | class GeneralReferDataset(torch.utils.data.Dataset): 8 | """ 9 | A collection of datasets. 10 | """ 11 | def __init__(self, datasets): 12 | super(GeneralReferDataset, self).__init__() 13 | self.datasets = datasets 14 | self.dataset_len = [len(x) for x in datasets] 15 | 16 | def __getitem__(self, idx): 17 | for i, dataset in enumerate(self.datasets): 18 | if idx >= self.dataset_len[i]: 19 | idx = idx - self.dataset_len[i] 20 | else: 21 | return dataset.__getitem__(idx) 22 | 23 | def __len__(self): 24 | return sum(self.dataset_len) 25 | 26 | class flickr30k(ReferDataset): 27 | def __init__(self, data_root, im_dir, split, transforms, 28 | max_query_len=40, lstm=False, bert_model='bert-base-uncased'): 29 | super(flickr30k, self).__init__( 30 | data_root=data_root, 31 | im_dir=im_dir, 32 | dataset='flickr', 33 | split=split, 34 | max_query_len=max_query_len, 35 | lstm=lstm, 36 | bert_model=bert_model 37 | ) 38 | self._transforms = transforms 39 | 40 | def __getitem__(self, idx): 41 | input_sample, target = super(flickr30k, self).__getitem__(idx) 42 | target = {k: torch.as_tensor(v) for k, v in target.items()} 43 | # target['boxes'] = torch.as_tensor(target['boxes']) 44 | img = Image.fromarray(input_sample["img"]) 45 | if self._transforms is not None: 46 | img, target = self._transforms(img, target) 47 | input_sample["img"] = img 48 | return input_sample, target 49 | 50 | class RefCOCO(ReferDataset): 51 | def __init__(self, data_root, im_dir, split, transforms, version="unc", 52 | max_query_len=128, lstm=False, bert_model='bert-base-uncased'): 53 | super(RefCOCO, self).__init__( 54 | data_root=data_root, 55 | im_dir=im_dir, 56 | dataset=version, 57 | split=split, 58 | max_query_len=max_query_len, 59 | lstm=lstm, 60 | bert_model=bert_model 61 | ) 62 | self._transforms = transforms 63 | 64 | def __getitem__(self, idx): 65 | input_sample, target = super(RefCOCO, self).__getitem__(idx) 66 | target = {k: torch.as_tensor(v) for k, v in target.items()} 67 | # target['boxes'] = torch.as_tensor(target['boxes']) 68 | img = Image.fromarray(input_sample["img"]) 69 | if self._transforms is not None: 70 | img, target = self._transforms(img, target) 71 | input_sample["img"] = img 72 | return input_sample, target 73 | 74 | 75 | class ReferIt(ReferDataset): 76 | def __init__(self, data_root, im_dir, split, transforms, 77 | max_query_len=128, lstm=False, bert_model='bert-base-uncased'): 78 | super(ReferIt, self).__init__( 79 | data_root=data_root, 80 | im_dir=im_dir, 81 | dataset='referit', 82 | split=split, 83 | max_query_len=max_query_len, 84 | lstm=lstm, 85 | bert_model=bert_model 86 | ) 87 | self._transforms = transforms 88 | 89 | def __getitem__(self, idx): 90 | input_sample, target = super(ReferIt, self).__getitem__(idx) 91 | target = {k: torch.as_tensor(v) for k, v in target.items()} 92 | # target['boxes'] = torch.as_tensor(target['boxes']) 93 | img = Image.fromarray(input_sample["img"]) 94 | if self._transforms is not None: 95 | img, target = self._transforms(img, target) 96 | 
input_sample["img"] = img 97 | return input_sample, target 98 | 99 | 100 | def make_refer_transforms(img_size=224 ,max_img_size=1333 ,test=False): 101 | normalize = T.Compose([ 102 | T.ToTensor(), 103 | T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) 104 | ]) 105 | 106 | if not test: 107 | return T.Compose([ 108 | # T.RandomHorizontalFlip(), 109 | T.RandomIntensitySaturation(), 110 | T.RandomResize([img_size], max_size=max_img_size), 111 | # T.RandomAffineTransform(degrees=(-5,5), translate=(0.1, 0.1), 112 | # scale=(0.9, 1.1)), 113 | normalize 114 | ]) 115 | else: 116 | return T.Compose([ 117 | T.RandomResize([img_size], max_size=max_img_size), 118 | normalize 119 | ]) 120 | 121 | 122 | def build_flickr30k_resc( 123 | split='train', 124 | data_root="./data/annotations_resc", 125 | im_dir="./data/flickr30k/f30k_images", 126 | img_size=224, 127 | max_img_size=1333, 128 | max_query_len=40, 129 | bert_model='bert-base-uncased'): 130 | istest = not split in ['train', 'trainval'] 131 | return flickr30k( 132 | data_root=data_root, 133 | im_dir=im_dir, 134 | transforms=make_refer_transforms(img_size, max_img_size, test=istest), 135 | split=split, 136 | max_query_len=max_query_len, 137 | bert_model=bert_model 138 | ) 139 | 140 | def build_referit_resc( 141 | split='train', 142 | data_root="./data/annotations_resc", 143 | im_dir="./data/referit/images", 144 | max_query_len=40, 145 | img_size=224, 146 | max_img_size=1333, 147 | bert_model='bert-base-uncased'): 148 | istest = not split in ['train', 'trainval'] 149 | return ReferIt( 150 | data_root=data_root, 151 | im_dir=im_dir, 152 | transforms=make_refer_transforms(img_size, max_img_size, test=istest), 153 | split=split, 154 | max_query_len=max_query_len, 155 | lstm=False, 156 | bert_model=bert_model 157 | ) 158 | 159 | 160 | def build_refcoco_resc( 161 | split='train', 162 | version='unc', 163 | data_root="./data/annotations_resc", 164 | im_dir="./data/refcoco/train2014", 165 | max_query_len=40, 166 | img_size=224, 167 | max_img_size=1333, 168 | bert_model='bert-base-uncased'): 169 | istest = not split in ['train', 'trainval'] 170 | return RefCOCO( 171 | data_root=data_root, 172 | im_dir=im_dir, 173 | version=version, 174 | transforms=make_refer_transforms(img_size, max_img_size, test=istest), 175 | split=split, 176 | max_query_len=max_query_len, 177 | lstm=False, 178 | bert_model=bert_model 179 | ) 180 | 181 | 182 | def build_visualgenome( 183 | split='all', 184 | data_root="./data/annotations_resc", 185 | im_dir="./data/visualgenome/VG_100K", 186 | max_query_len=40, 187 | img_size=224, 188 | max_img_size=1333, 189 | bert_model='bert-base-uncased'): 190 | istest = False 191 | return RefCOCO( 192 | data_root=data_root, 193 | im_dir=im_dir, 194 | version='vg', 195 | transforms=make_refer_transforms(img_size, max_img_size, test=istest), 196 | split=split, 197 | max_query_len=max_query_len, 198 | lstm=False, 199 | bert_model=bert_model 200 | ) 201 | 202 | # def build_refer_collections(): 203 | # flickr30k_d = build_flickr30k(split='train') 204 | # refcoco_d = build_refcoco(split='trainval', version='refcoco') 205 | # refcocop_d = build_refcoco(split='trainval', version='refcoco+') 206 | # referit = build_referit(split='trainval') 207 | # return GeneralReferDataset(datasets=[flickr30k_d, refcoco_d, refcocop_d, referit]) 208 | 209 | if __name__ == "__main__": 210 | # comment out normalize in make_refer_transforms when testing 211 | from PIL import Image, ImageDraw 212 | # flickr 213 | d_train = build_flickr30k(split='train') 214 | d_val = 
build_flickr30k(split='val') 215 | d_test = build_flickr30k(split='test') 216 | print(f"flickr30k datasets have : {len(d_train)} Training samples") 217 | print(f"flickr30k datasets have : {len(d_test)} Testing samples") 218 | for i in range(0, 200, 50): 219 | samples, target = d_train[i] 220 | img = samples['img'] 221 | img1 = ImageDraw.Draw(img) 222 | img1.rectangle(target['boxes'][0].numpy().tolist(), outline='red') 223 | img.save(f"./exps/flickr_train_sample{i}.jpg") 224 | 225 | # refcoco 226 | d_train = build_refcoco(split='trainval', version='refcoco') 227 | d_testA = build_refcoco(split='testA', version='refcoco') 228 | d_testB = build_refcoco(split='testB', version='refcoco') 229 | print(f"Refcoco datasets have : {len(d_train)} Training samples") 230 | print(f"Refcoco datasets have : {len(d_testA)}/{len(d_testB)} Testing samples") 231 | for i in range(0, 200, 50): 232 | samples, target = d_train[i] 233 | img = samples['img'] 234 | img1 = ImageDraw.Draw(img) 235 | img1.rectangle(target['boxes'][0].numpy().tolist(), outline='red') 236 | img.save(f"./exps/refcoco_train_sample{i}.jpg") 237 | 238 | # refcoco 239 | d_train = build_refcoco(split='trainval', version='refcoco+') 240 | d_testA = build_refcoco(split='testA', version='refcoco+') 241 | d_testB = build_refcoco(split='testB', version='refcoco+') 242 | print(f"Refcoco+ datasets have : {len(d_train)} Training samples") 243 | print(f"Refcoco+ datasets have : {len(d_testA)}/{len(d_testB)} Testing samples") 244 | for i in range(0, 200, 50): 245 | samples, target = d_train[i] 246 | img = samples['img'] 247 | img1 = ImageDraw.Draw(img) 248 | img1.rectangle(target['boxes'][0].numpy().tolist(), outline='red') 249 | img.save(f"./exps/refcoco+_train_sample{i}.jpg") 250 | 251 | # referit 252 | d_train = build_referit(split='trainval') 253 | d_test = build_referit(split='test') 254 | print(f"ReferIt datasets have : {len(d_train)} Training samples") 255 | print(f"ReferIt datasets have : {len(d_test)} Testing samples") 256 | for i in range(0, 200, 50): 257 | samples, target = d_train[i] 258 | img = samples['img'] 259 | img1 = ImageDraw.Draw(img) 260 | img1.rectangle(target['boxes'][0].numpy().tolist(), outline='red') 261 | img.save(f"./exps/referit_train_sample{i}.jpg") -------------------------------------------------------------------------------- /datasets/refer_segmentation.py: -------------------------------------------------------------------------------- 1 | # Builder for visual grouding datasets 2 | from .grounding_datasets import ReferSegDataset 3 | from PIL import Image 4 | import datasets.transforms as T 5 | import torch 6 | 7 | class RefCOCO(ReferSegDataset): 8 | def __init__(self, data_root, im_dir, seg_dir, split, transforms, version="refcoco_unc", 9 | max_query_len=40, bert_model='bert-base-uncased'): 10 | super(RefCOCO, self).__init__( 11 | data_root=data_root, 12 | im_dir=im_dir, 13 | seg_dir=seg_dir, 14 | dataset=version, 15 | split=split, 16 | max_query_len=max_query_len, 17 | bert_model=bert_model 18 | ) 19 | self._transforms = transforms 20 | 21 | def __getitem__(self, idx): 22 | input_sample, target = super(RefCOCO, self).__getitem__(idx) 23 | target = {k: torch.as_tensor(v) for k, v in target.items()} 24 | # target['boxes'] = torch.as_tensor(target['boxes']) 25 | img = Image.fromarray(input_sample["img"]) 26 | if self._transforms is not None: 27 | img, target = self._transforms(img, target) 28 | input_sample["img"] = img 29 | return input_sample, target 30 | 31 | 32 | def make_refer_seg_transforms(img_size=224 
,max_img_size=1333 ,test=False): 33 | normalize = T.Compose([ 34 | T.ToTensor(), 35 | T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) 36 | ]) 37 | 38 | if not test: 39 | return T.Compose([ 40 | # T.RandomHorizontalFlip(), 41 | T.RandomIntensitySaturation(), 42 | T.RandomResize([img_size], max_size=max_img_size), 43 | # T.RandomAffineTransform(degrees=(-5,5), translate=(0.1, 0.1), 44 | # scale=(0.9, 1.1)), 45 | normalize 46 | ]) 47 | else: 48 | return T.Compose([ 49 | T.RandomResize([img_size], max_size=max_img_size), 50 | normalize 51 | ]) 52 | 53 | 54 | def build_refcoco_segmentation( 55 | split='train', 56 | version='refcoco_unc', 57 | data_root="./data/refcoco/anns", 58 | im_dir="./data/refcoco/images/train2014", 59 | seg_dir="./data/refcoco/masks", 60 | img_size=224, 61 | max_img_size=1333, 62 | bert_model='bert-base-uncased' 63 | ): 64 | ''' 65 | 'refcoco_unc' 66 | 'refcoco+_unc' 67 | 'refcocog_google' 68 | 'refcocog_umd' 69 | ''' 70 | istest = split != 'train' 71 | 72 | return RefCOCO( 73 | data_root=data_root, 74 | im_dir=im_dir, 75 | seg_dir=seg_dir, 76 | version=version, 77 | transforms=make_refer_seg_transforms(img_size, max_img_size, test=istest), 78 | split=split, 79 | bert_model=bert_model 80 | ) 81 | 82 | if __name__ == "__main__": 83 | # comment out normalize in make_refer_transforms when testing 84 | from PIL import Image, ImageDraw 85 | import numpy as np 86 | # flickr 87 | d_train = build_refcoco_segmentation(split='train') 88 | d_val = build_refcoco_segmentation(split='val') 89 | d_test = build_refcoco_segmentation(split='testA') 90 | print(f"flickr30k datasets have : {len(d_train)} Training samples") 91 | print(f"flickr30k datasets have : {len(d_val)} Val samples") 92 | print(f"flickr30k datasets have : {len(d_test)} Testing samples") 93 | for i in range(0, 200, 50): 94 | samples, target = d_train[i] 95 | img = samples['img'] 96 | mask = target['masks'] 97 | img1 = ImageDraw.Draw(img) 98 | img1.rectangle(target['boxes'][0].numpy().tolist(), outline='red') 99 | img.save(f"./exps/refcoco_train_sample{i}.jpg") 100 | 101 | print(mask.shape, mask.dtype) 102 | mask = mask.numpy().astype(np.uint8)[0] * 255 103 | print(mask) 104 | mask = Image.fromarray(mask) 105 | mask.save(f"./exps/refcoco_mask_sample{i}.jpg") -------------------------------------------------------------------------------- /datasets/samplers.py: -------------------------------------------------------------------------------- 1 | import os 2 | import math 3 | import torch 4 | import torch.distributed as dist 5 | from torch.utils.data.sampler import Sampler 6 | 7 | 8 | class DistributedSampler(Sampler): 9 | """Sampler that restricts data loading to a subset of the dataset. 10 | It is especially useful in conjunction with 11 | :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each 12 | process can pass a DistributedSampler instance as a DataLoader sampler, 13 | and load a subset of the original dataset that is exclusive to it. 14 | .. note:: 15 | Dataset is assumed to be of constant size. 16 | Arguments: 17 | dataset: Dataset used for sampling. 18 | num_replicas (optional): Number of processes participating in 19 | distributed training. 20 | rank (optional): Rank of the current process within num_replicas. 
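        shuffle (optional): Whether to reshuffle the indices every epoch
            (default: True); call set_epoch() at the start of each epoch so
            the shuffle order changes.
        local_rank / local_size (optional): accepted for interface
            compatibility with NodeDistributedSampler below; not used by
            this class.
    A minimal usage sketch (assumes torch.distributed is already initialised;
    num_epochs, the batch size and collate_fn are placeholders):
        sampler = DistributedSampler(dataset)
        loader = DataLoader(dataset, batch_size=2, sampler=sampler, collate_fn=collate_fn)
        for epoch in range(num_epochs):
            sampler.set_epoch(epoch)   # re-seed the per-epoch shuffle
            for samples, targets in loader:
                ...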
21 | """ 22 | 23 | def __init__(self, dataset, num_replicas=None, rank=None, local_rank=None, local_size=None, shuffle=True): 24 | if num_replicas is None: 25 | if not dist.is_available(): 26 | raise RuntimeError("Requires distributed package to be available") 27 | num_replicas = dist.get_world_size() 28 | if rank is None: 29 | if not dist.is_available(): 30 | raise RuntimeError("Requires distributed package to be available") 31 | rank = dist.get_rank() 32 | self.dataset = dataset 33 | self.num_replicas = num_replicas 34 | self.rank = rank 35 | self.epoch = 0 36 | self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) 37 | self.total_size = self.num_samples * self.num_replicas 38 | self.shuffle = shuffle 39 | 40 | def __iter__(self): 41 | if self.shuffle: 42 | # deterministically shuffle based on epoch 43 | g = torch.Generator() 44 | g.manual_seed(self.epoch) 45 | indices = torch.randperm(len(self.dataset), generator=g).tolist() 46 | else: 47 | indices = torch.arange(len(self.dataset)).tolist() 48 | 49 | # add extra samples to make it evenly divisible 50 | indices += indices[: (self.total_size - len(indices))] 51 | assert len(indices) == self.total_size 52 | 53 | # subsample 54 | offset = self.num_samples * self.rank 55 | indices = indices[offset : offset + self.num_samples] 56 | assert len(indices) == self.num_samples 57 | 58 | return iter(indices) 59 | 60 | def __len__(self): 61 | return self.num_samples 62 | 63 | def set_epoch(self, epoch): 64 | self.epoch = epoch 65 | 66 | 67 | class NodeDistributedSampler(Sampler): 68 | """Sampler that restricts data loading to a subset of the dataset. 69 | It is especially useful in conjunction with 70 | :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each 71 | process can pass a DistributedSampler instance as a DataLoader sampler, 72 | and load a subset of the original dataset that is exclusive to it. 73 | .. note:: 74 | Dataset is assumed to be of constant size. 75 | Arguments: 76 | dataset: Dataset used for sampling. 77 | num_replicas (optional): Number of processes participating in 78 | distributed training. 79 | rank (optional): Rank of the current process within num_replicas. 
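        local_rank (optional): Rank of this process within its node; defaults
            to the LOCAL_RANK environment variable.
        local_size (optional): Number of processes per node; defaults to the
            LOCAL_SIZE environment variable. Indices are first partitioned by
            local rank and only then split across nodes.
        shuffle (optional): Whether to reshuffle the indices every epoch
            (default: True).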
80 | """ 81 | 82 | def __init__(self, dataset, num_replicas=None, rank=None, local_rank=None, local_size=None, shuffle=True): 83 | if num_replicas is None: 84 | if not dist.is_available(): 85 | raise RuntimeError("Requires distributed package to be available") 86 | num_replicas = dist.get_world_size() 87 | if rank is None: 88 | if not dist.is_available(): 89 | raise RuntimeError("Requires distributed package to be available") 90 | rank = dist.get_rank() 91 | if local_rank is None: 92 | local_rank = int(os.environ.get('LOCAL_RANK', 0)) 93 | if local_size is None: 94 | local_size = int(os.environ.get('LOCAL_SIZE', 1)) 95 | self.dataset = dataset 96 | self.shuffle = shuffle 97 | self.num_replicas = num_replicas 98 | self.num_parts = local_size 99 | self.rank = rank 100 | self.local_rank = local_rank 101 | self.epoch = 0 102 | self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) 103 | self.total_size = self.num_samples * self.num_replicas 104 | 105 | self.total_size_parts = self.num_samples * self.num_replicas // self.num_parts 106 | 107 | def __iter__(self): 108 | if self.shuffle: 109 | # deterministically shuffle based on epoch 110 | g = torch.Generator() 111 | g.manual_seed(self.epoch) 112 | indices = torch.randperm(len(self.dataset), generator=g).tolist() 113 | else: 114 | indices = torch.arange(len(self.dataset)).tolist() 115 | indices = [i for i in indices if i % self.num_parts == self.local_rank] 116 | 117 | # add extra samples to make it evenly divisible 118 | indices += indices[:(self.total_size_parts - len(indices))] 119 | assert len(indices) == self.total_size_parts 120 | 121 | # subsample 122 | indices = indices[self.rank // self.num_parts:self.total_size_parts:self.num_replicas // self.num_parts] 123 | assert len(indices) == self.num_samples 124 | 125 | return iter(indices) 126 | 127 | def __len__(self): 128 | return self.num_samples 129 | 130 | def set_epoch(self, epoch): 131 | self.epoch = epoch 132 | -------------------------------------------------------------------------------- /datasets/transforms.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Modified from DETR (https://github.com/facebookresearch/detr) 3 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 4 | # ------------------------------------------------------------------------ 5 | 6 | """ 7 | Transforms and data augmentation for both image + bbox. 8 | """ 9 | import random 10 | 11 | import PIL 12 | import torch 13 | import torchvision.transforms as T 14 | import torchvision.transforms.functional as F 15 | import numpy as np 16 | 17 | from util.box_ops import box_xyxy_to_cxcywh 18 | from util.misc import interpolate 19 | 20 | 21 | def crop(image, target, region): 22 | cropped_image = F.crop(image, *region) 23 | 24 | target = target.copy() 25 | i, j, h, w = region 26 | 27 | # should we do something wrt the original size? 
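    # `region` is (top, left, height, width), as returned by
    # torchvision.transforms.RandomCrop.get_params, so the cropped image size is (h, w).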
28 | target["size"] = torch.tensor([h, w]) 29 | 30 | fields = ["labels", "area", "iscrowd"] 31 | 32 | if "boxes" in target: 33 | boxes = target["boxes"] 34 | max_size = torch.as_tensor([w, h], dtype=torch.float32) 35 | cropped_boxes = boxes - torch.as_tensor([j, i, j, i]) 36 | cropped_boxes = torch.min(cropped_boxes.reshape(-1, 2, 2), max_size) 37 | cropped_boxes = cropped_boxes.clamp(min=0) 38 | area = (cropped_boxes[:, 1, :] - cropped_boxes[:, 0, :]).prod(dim=1) 39 | target["boxes"] = cropped_boxes.reshape(-1, 4) 40 | target["area"] = area 41 | fields.append("boxes") 42 | 43 | if "masks" in target: 44 | # FIXME should we update the area here if there are no boxes? 45 | target['masks'] = target['masks'][:, i:i + h, j:j + w] 46 | fields.append("masks") 47 | 48 | # remove elements for which the boxes or masks that have zero area 49 | if "boxes" in target or "masks" in target: 50 | # favor boxes selection when defining which elements to keep 51 | # this is compatible with previous implementation 52 | if "boxes" in target: 53 | cropped_boxes = target['boxes'].reshape(-1, 2, 2) 54 | keep = torch.all(cropped_boxes[:, 1, :] > cropped_boxes[:, 0, :], dim=1) 55 | else: 56 | keep = target['masks'].flatten(1).any(1) 57 | 58 | for field in fields: 59 | target[field] = target[field][keep] 60 | 61 | return cropped_image, target 62 | 63 | 64 | def hflip(image, target): 65 | flipped_image = F.hflip(image) 66 | 67 | w, h = image.size 68 | 69 | target = target.copy() 70 | if "boxes" in target: 71 | boxes = target["boxes"] 72 | boxes = boxes[:, [2, 1, 0, 3]] * torch.as_tensor([-1, 1, -1, 1]) + torch.as_tensor([w, 0, w, 0]) 73 | target["boxes"] = boxes 74 | 75 | if "masks" in target: 76 | target['masks'] = target['masks'].flip(-1) 77 | 78 | return flipped_image, target 79 | 80 | 81 | def resize(image, target, size, max_size=None): 82 | # size can be min_size (scalar) or (w, h) tuple 83 | 84 | def get_size_with_aspect_ratio(image_size, size, max_size=None): 85 | w, h = image_size 86 | if max_size is not None: 87 | min_original_size = float(min((w, h))) 88 | max_original_size = float(max((w, h))) 89 | if max_original_size / min_original_size * size > max_size: 90 | size = int(round(max_size * min_original_size / max_original_size)) 91 | 92 | if (w <= h and w == size) or (h <= w and h == size): 93 | return (h, w) 94 | 95 | if w < h: 96 | ow = size 97 | oh = int(size * h / w) 98 | else: 99 | oh = size 100 | ow = int(size * w / h) 101 | 102 | return (oh, ow) 103 | 104 | def get_size(image_size, size, max_size=None): 105 | if isinstance(size, (list, tuple)): 106 | return size[::-1] 107 | else: 108 | return get_size_with_aspect_ratio(image_size, size, max_size) 109 | 110 | size = get_size(image.size, size, max_size) 111 | rescaled_image = F.resize(image, size) 112 | 113 | if target is None: 114 | return rescaled_image, None 115 | 116 | ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(rescaled_image.size, image.size)) 117 | ratio_width, ratio_height = ratios 118 | 119 | target = target.copy() 120 | if "boxes" in target: 121 | boxes = target["boxes"] 122 | scaled_boxes = boxes * torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height]) 123 | target["boxes"] = scaled_boxes 124 | 125 | if "area" in target: 126 | area = target["area"] 127 | scaled_area = area * (ratio_width * ratio_height) 128 | target["area"] = scaled_area 129 | 130 | h, w = size 131 | target["size"] = torch.tensor([h, w]) 132 | 133 | if "masks" in target: 134 | target['masks'] = interpolate( 135 | target['masks'][:, 
None].float(), size, mode="nearest")[:, 0] > 0.5 136 | 137 | return rescaled_image, target 138 | 139 | 140 | def pad(image, target, padding): 141 | # assumes that we only pad on the bottom right corners 142 | padded_image = F.pad(image, (0, 0, padding[0], padding[1])) 143 | if target is None: 144 | return padded_image, None 145 | target = target.copy() 146 | # should we do something wrt the original size? 147 | target["size"] = torch.tensor(padded_image[::-1]) 148 | if "masks" in target: 149 | target['masks'] = torch.nn.functional.pad(target['masks'], (0, padding[0], 0, padding[1])) 150 | return padded_image, target 151 | 152 | 153 | class RandomCrop(object): 154 | def __init__(self, size): 155 | self.size = size 156 | 157 | def __call__(self, img, target): 158 | region = T.RandomCrop.get_params(img, self.size) 159 | return crop(img, target, region) 160 | 161 | 162 | class RandomSizeCrop(object): 163 | def __init__(self, min_size: int, max_size: int): 164 | self.min_size = min_size 165 | self.max_size = max_size 166 | 167 | def __call__(self, img: PIL.Image.Image, target: dict): 168 | w = random.randint(self.min_size, min(img.width, self.max_size)) 169 | h = random.randint(self.min_size, min(img.height, self.max_size)) 170 | region = T.RandomCrop.get_params(img, [h, w]) 171 | return crop(img, target, region) 172 | 173 | 174 | class CenterCrop(object): 175 | def __init__(self, size): 176 | self.size = size 177 | 178 | def __call__(self, img, target): 179 | image_width, image_height = img.size 180 | crop_height, crop_width = self.size 181 | crop_top = int(round((image_height - crop_height) / 2.)) 182 | crop_left = int(round((image_width - crop_width) / 2.)) 183 | return crop(img, target, (crop_top, crop_left, crop_height, crop_width)) 184 | 185 | 186 | class RandomHorizontalFlip(object): 187 | def __init__(self, p=0.5): 188 | self.p = p 189 | 190 | def __call__(self, img, target): 191 | if random.random() < self.p: 192 | return hflip(img, target) 193 | return img, target 194 | 195 | 196 | class RandomResize(object): 197 | def __init__(self, sizes, max_size=None): 198 | assert isinstance(sizes, (list, tuple)) 199 | self.sizes = sizes 200 | self.max_size = max_size 201 | 202 | def __call__(self, img, target=None): 203 | size = random.choice(self.sizes) 204 | return resize(img, target, size, self.max_size) 205 | 206 | 207 | class RandomPad(object): 208 | def __init__(self, max_pad): 209 | self.max_pad = max_pad 210 | 211 | def __call__(self, img, target): 212 | pad_x = random.randint(0, self.max_pad) 213 | pad_y = random.randint(0, self.max_pad) 214 | return pad(img, target, (pad_x, pad_y)) 215 | 216 | 217 | class RandomSelect(object): 218 | """ 219 | Randomly selects between transforms1 and transforms2, 220 | with probability p for transforms1 and (1 - p) for transforms2 221 | """ 222 | def __init__(self, transforms1, transforms2, p=0.5): 223 | self.transforms1 = transforms1 224 | self.transforms2 = transforms2 225 | self.p = p 226 | 227 | def __call__(self, img, target): 228 | if random.random() < self.p: 229 | return self.transforms1(img, target) 230 | return self.transforms2(img, target) 231 | 232 | 233 | class ToTensor(object): 234 | def __call__(self, img, target): 235 | return F.to_tensor(img), target 236 | 237 | 238 | class RandomErasing(object): 239 | 240 | def __init__(self, *args, **kwargs): 241 | self.eraser = T.RandomErasing(*args, **kwargs) 242 | 243 | def __call__(self, img, target): 244 | return self.eraser(img), target 245 | 246 | 247 | class Normalize(object): 248 | def 
__init__(self, mean, std): 249 | self.mean = mean 250 | self.std = std 251 | 252 | def __call__(self, image, target=None): 253 | image = F.normalize(image, mean=self.mean, std=self.std) 254 | if target is None: 255 | return image, None 256 | target = target.copy() 257 | h, w = image.shape[-2:] 258 | if "boxes" in target: 259 | boxes = target["boxes"] 260 | boxes = box_xyxy_to_cxcywh(boxes) 261 | boxes = boxes / torch.tensor([w, h, w, h], dtype=torch.float32) 262 | target["boxes"] = boxes 263 | return image, target 264 | 265 | import cv2 266 | class RandomIntensitySaturation(object): 267 | def __call__(self, image, target=None): 268 | fraction = 0.50 269 | image = np.asarray(image) 270 | img_hsv = cv2.cvtColor(cv2.cvtColor(image, cv2.COLOR_RGB2BGR), cv2.COLOR_BGR2HSV) 271 | S = img_hsv[:, :, 1].astype(np.float32) 272 | V = img_hsv[:, :, 2].astype(np.float32) 273 | a = (random.random() * 2 - 1) * fraction + 1 274 | if a > 1: 275 | np.clip(S, a_min=0, a_max=255, out=S) 276 | a = (random.random() * 2 - 1) * fraction + 1 277 | V *= a 278 | if a > 1: 279 | np.clip(V, a_min=0, a_max=255, out=V) 280 | 281 | img_hsv[:, :, 1] = S.astype(np.uint8) 282 | img_hsv[:, :, 2] = V.astype(np.uint8) 283 | img = cv2.cvtColor(cv2.cvtColor(img_hsv, cv2.COLOR_HSV2BGR), cv2.COLOR_BGR2RGB) 284 | img = PIL.Image.fromarray(img) 285 | return img, target 286 | 287 | from util.transforms import random_affine 288 | class RandomAffineTransform(object): 289 | def __init__(self, degrees=(-10, 10), translate=(.1, .1), scale=(0.90, 1.10)): 290 | self.degrees = degrees 291 | self.translate = translate 292 | self.scale = scale 293 | 294 | def __call__(self, image, targets): 295 | bboxes = targets["boxes"] 296 | image = np.asarray(image) 297 | image, _, bboxes = random_affine(image, mask=None, targets=bboxes.numpy(), 298 | degrees=self.degrees, translate=self.translate, 299 | scale=self.scale) 300 | # TODO: A hack here for later transform operations 301 | img = PIL.Image.fromarray(image) 302 | targets["boxes"] = torch.as_tensor(bboxes) 303 | return img, targets 304 | 305 | class Compose(object): 306 | def __init__(self, transforms): 307 | self.transforms = transforms 308 | 309 | def __call__(self, image, target): 310 | for t in self.transforms: 311 | image, target = t(image, target) 312 | return image, target 313 | 314 | def __repr__(self): 315 | format_string = self.__class__.__name__ + "(" 316 | for t in self.transforms: 317 | format_string += "\n" 318 | format_string += " {0}".format(t) 319 | format_string += "\n)" 320 | return format_string 321 | -------------------------------------------------------------------------------- /engine_vg.py: -------------------------------------------------------------------------------- 1 | """ 2 | Train and eval functions used in main.py 3 | """ 4 | import math 5 | import os 6 | import sys 7 | import json 8 | from typing import Iterable 9 | 10 | import torch 11 | import util.misc as utils 12 | from PIL import Image, ImageDraw 13 | import matplotlib.pyplot as plt 14 | import numpy as np 15 | import torch.nn.functional as F 16 | 17 | # from datasets.data_prefetcher import data_prefetcher 18 | 19 | # # Reuse Deformable DETR's train function 20 | # from engine import train_one_epoch 21 | 22 | def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module, 23 | data_loader: Iterable, optimizer: torch.optim.Optimizer, 24 | lr_scheduler: torch.optim.lr_scheduler._LRScheduler, 25 | device: torch.device, epoch: int, max_norm: float = 0): 26 | model.train() 27 | criterion.train() 28 | 
metric_logger = utils.MetricLogger(delimiter=" ") 29 | metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}')) 30 | # metric_logger.add_meter('class_error', utils.SmoothedValue(window_size=1, fmt='{value:.2f}')) 31 | metric_logger.add_meter('grad_norm', utils.SmoothedValue(window_size=1, fmt='{value:.2f}')) 32 | header = 'Epoch: [{}]'.format(epoch) 33 | print_freq = 50 34 | 35 | prefetcher = data_prefetcher(data_loader, device, prefetch=True) 36 | samples, targets = prefetcher.next() 37 | 38 | # for samples, targets in metric_logger.log_every(data_loader, print_freq, header): 39 | for _ in metric_logger.log_every(range(len(data_loader)), print_freq, header): 40 | outputs = model(samples) 41 | loss_dict = criterion(outputs, targets) 42 | weight_dict = criterion.weight_dict 43 | losses = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict) 44 | 45 | # reduce losses over all GPUs for logging purposes 46 | loss_dict_reduced = utils.reduce_dict(loss_dict) 47 | loss_dict_reduced_unscaled = {f'{k}_unscaled': v 48 | for k, v in loss_dict_reduced.items()} 49 | loss_dict_reduced_scaled = {k: v * weight_dict[k] 50 | for k, v in loss_dict_reduced.items() if k in weight_dict} 51 | losses_reduced_scaled = sum(loss_dict_reduced_scaled.values()) 52 | 53 | loss_value = losses_reduced_scaled.item() 54 | 55 | if not math.isfinite(loss_value): 56 | print("Loss is {}, stopping training".format(loss_value)) 57 | print(loss_dict_reduced) 58 | sys.exit(1) 59 | 60 | optimizer.zero_grad() 61 | losses.backward() 62 | if max_norm > 0: 63 | grad_total_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm) 64 | else: 65 | grad_total_norm = utils.get_total_grad_norm(model.parameters(), max_norm) 66 | optimizer.step() 67 | lr_scheduler.step() 68 | 69 | metric_logger.update(loss=loss_value, **loss_dict_reduced_scaled, **loss_dict_reduced_unscaled) 70 | # metric_logger.update(class_error=loss_dict_reduced['class_error']) 71 | metric_logger.update(lr=optimizer.param_groups[0]["lr"]) 72 | metric_logger.update(grad_norm=grad_total_norm) 73 | 74 | samples, targets = prefetcher.next() 75 | # gather the stats from all processes 76 | metric_logger.synchronize_between_processes() 77 | print("Averaged stats:", metric_logger) 78 | return {k: meter.global_avg for k, meter in metric_logger.meters.items()} 79 | 80 | from util.box_ops import box_iou, box_cxcywh_to_xyxy, mask_iou 81 | @torch.no_grad() 82 | def evaluate(model, criterion, postprocessors, data_loader, device, output_dir, visualize=False): 83 | model.eval() 84 | criterion.eval() 85 | # visualize=False 86 | if visualize: 87 | split_name = data_loader.dataset.split 88 | output_dir = output_dir / 'vis' / split_name 89 | output_dir.mkdir(parents=True, exist_ok=True) 90 | (output_dir / 'mask').mkdir(parents=True, exist_ok=True) 91 | (output_dir / 'bbox').mkdir(parents=True, exist_ok=True) 92 | (output_dir / 'att').mkdir(parents=True, exist_ok=True) 93 | (output_dir / 'gt').mkdir(parents=True, exist_ok=True) 94 | purple = np.array([[[128, 0, 128]]], dtype=np.uint8) 95 | yellow = np.array([[[255, 255, 0]]], dtype=np.uint8) 96 | metric_logger = utils.MetricLogger(delimiter=" ") 97 | # metric_logger.add_meter('class_error', utils.SmoothedValue(window_size=1, fmt='{value:.2f}')) 98 | header = data_loader.dataset.split + ':' 99 | 100 | results_dict = {} 101 | results_iou = {'det':{}, 'seg':{}} 102 | sum_accu = 0. 103 | sum_iou = 0. 104 | cnt_test = 0. 105 | seg_iou = 0. 106 | cnt_seg = 0. 
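    # Running totals accumulated over the whole split (and all-reduced across GPUs
    # after the loop when running distributed): sum_accu counts boxes with IoU > 0.5,
    # sum_iou / cnt_test give the mean box IoU, and seg_iou / cnt_seg give the mean
    # mask IoU whenever a 'segm' post-processor is present.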
107 | for samples, targets in metric_logger.log_every(data_loader, 50, header): 108 | samples = {k: v.to(device, non_blocking=True) for k, v in samples.items()} 109 | targets = [{k: v.to(device) for k, v in t.items()} for t in targets] 110 | 111 | outputs = model(samples) 112 | loss_dict = criterion(outputs, targets) 113 | weight_dict = criterion.weight_dict 114 | 115 | # reduce losses over all GPUs for logging purposes 116 | loss_dict_reduced = utils.reduce_dict(loss_dict) 117 | loss_dict_reduced_scaled = {k: v * weight_dict[k] 118 | for k, v in loss_dict_reduced.items() if k in weight_dict} 119 | loss_dict_reduced_unscaled = {f'{k}_unscaled': v 120 | for k, v in loss_dict_reduced.items()} 121 | metric_logger.update(loss=sum(loss_dict_reduced_scaled.values()), 122 | **loss_dict_reduced_scaled, 123 | **loss_dict_reduced_unscaled) 124 | # metric_logger.update(class_error=loss_dict_reduced['class_error']) 125 | 126 | # TODO: some issues with data loaders here 127 | orig_target_sizes = torch.stack([t["orig_size"] for t in targets], dim=0) 128 | results = postprocessors['bbox'](outputs, orig_target_sizes) 129 | # res = {target['image_id'].item(): output for target, output in zip(targets, results)} 130 | 131 | for i, res in enumerate(results): 132 | t = box_cxcywh_to_xyxy(targets[i]['boxes']) 133 | assert t.size(0) == res['boxes'].size(0), (res, t) 134 | iou, union = box_iou(t, res['boxes']) 135 | iou = torch.diag(iou) 136 | # print(t, res['boxes'], iou, union) 137 | sum_accu = sum_accu + torch.sum((iou > 0.5).type(torch.float))#.item() 138 | sum_iou = sum_iou + torch.sum(iou)#.item() 139 | cnt_test = cnt_test + torch.tensor(len(targets[i]['boxes']), device=sum_iou.device) 140 | results_iou['det'][targets[i]['dataset_id'].item()] = torch.sum(iou).item() 141 | results_scaled = postprocessors['bbox'](outputs, orig_target_sizes, scale_to_original_shape=True) 142 | 143 | # TODO support multi-phrase in the future 144 | if 'segm' in postprocessors.keys(): 145 | target_sizes = torch.stack([t["size"] for t in targets], dim=0) 146 | results = postprocessors['segm'](results, outputs, orig_target_sizes, target_sizes) 147 | for i, res in enumerate(results): 148 | t = targets[i] 149 | t_mask = t['masks'] 150 | pred_mask = res['masks'][0] 151 | # print(pred_mask.shape, t_mask.shape) 152 | iou = mask_iou(pred_mask[0], t_mask) 153 | seg_iou = seg_iou + iou 154 | cnt_seg = cnt_seg + 1 155 | results_iou['seg'][targets[i]['dataset_id'].item()] = iou.item() 156 | if visualize: 157 | dataset_id = t['dataset_id'].item() 158 | pred_mask = res['masks_origin'][0, 0].cpu().unsqueeze(-1).numpy().astype(np.uint8) 159 | img, mask, phrase, tgt_box, img_file = data_loader.dataset.pull_item(dataset_id) 160 | assert pred_mask.shape[:2] == mask.shape, (pred_mask.shape, mask.shape) 161 | # print(pred_mask.shape, yellow.shape) 162 | img_name = img_file.split('/')[-1].split('.')[0] 163 | pred_mask = pred_mask * yellow + (1-pred_mask)*purple 164 | # print(pred_mask.shape, yellow.shape) 165 | pred_mask = Image.fromarray(pred_mask) 166 | pred_mask.save(output_dir / 'mask'/ f"{img_name}_{dataset_id:05d}.jpg") 167 | 168 | 169 | mask = np.expand_dims(mask, -1) 170 | gt = mask * yellow + (1-mask)*purple 171 | # print(pred_mask.shape, yellow.shape) 172 | gt_mask = Image.fromarray(gt) 173 | gt_mask.save(output_dir / 'gt'/ f"{img_name}_{dataset_id:05d}.jpg") 174 | 175 | pred_box = results_scaled[i]['boxes'][0].cpu().numpy().tolist() 176 | # print(pred_box, tgt_box) 177 | img_bbox = Image.fromarray(img) 178 | draw = ImageDraw.Draw(img_bbox) 
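                        # predicted box in blue, ground-truth box in red, both in original image coordinates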
179 | draw.rectangle(pred_box, outline='blue', width=5) 180 | draw.rectangle(tgt_box.tolist(), outline='red', width=5) 181 | img_bbox.save(output_dir / 'bbox'/ f"{img_name}_{dataset_id:05d}.jpg") 182 | 183 | att_mask = outputs['mask_att'][i:i+1, :].cpu() 184 | h, w, _ = mask.shape 185 | att_mask = F.interpolate(att_mask, size=(320, 320), mode="bilinear").numpy() 186 | # print(att_mask.shape) 187 | plt.imsave(output_dir / 'att' /f"{img_name}_{dataset_id:05d}_0.jpg", att_mask[0, 0, :h//2, :w//2], cmap='viridis') 188 | plt.imsave(output_dir / 'att' /f"{img_name}_{dataset_id:05d}_1.jpg", att_mask[0, 1, :h//2, :w//2], cmap='viridis') 189 | plt.imsave(output_dir / 'att' /f"{img_name}_{dataset_id:05d}_2.jpg", att_mask[0, 2, :h//2, :w//2], cmap='viridis') 190 | plt.imsave(output_dir / 'att' /f"{img_name}_{dataset_id:05d}_7.jpg", att_mask[0, 7, :h//2, :w//2], cmap='viridis') 191 | # att_mask = att_mask[0, 0, :h, :w, None] 192 | # att_mask_rescaled = (att_mask - att_mask.min()) / (att_mask.max()-att_mask.min()) 193 | # att_mask_rescaled = np.clip(1.5 * att_mask_rescaled - 0.5, 0., 1.0) 194 | # att_img = (img * att_mask_rescaled).astype(np.uint8) 195 | # att_img = Image.fromarray(att_img) 196 | # att_img.save(output_dir / 'att' / f"{img_name}_{dataset_id:05d}_0.jpg") 197 | # plt.imsave(output_dir / 'att' /f"0_{img_name}_{dataset_id:05d}.jpg", att_mask[:,:,0], cmap='viridis') 198 | 199 | 200 | results_dict.update({target['image_id'].item(): output['boxes'].cpu().numpy().tolist() for target, output in zip(targets, results_scaled)}) 201 | 202 | # gather the stats from all processes 203 | metric_logger.synchronize_between_processes() 204 | print("Averaged stats:", metric_logger) 205 | 206 | stats = {k: meter.global_avg for k, meter in metric_logger.meters.items()} 207 | if utils.is_dist_avail_and_initialized(): 208 | torch.distributed.all_reduce(sum_accu) 209 | torch.distributed.all_reduce(cnt_test) 210 | torch.distributed.all_reduce(sum_iou) 211 | stats["accuracy_iou0.5"] = (sum_accu / cnt_test).cpu().item() 212 | stats["miou"] = (sum_iou / cnt_test).cpu().item() 213 | 214 | if 'segm' in postprocessors.keys(): 215 | if utils.is_dist_avail_and_initialized(): 216 | torch.distributed.all_reduce(seg_iou) 217 | cnt_seg = utils.get_world_size() * cnt_seg 218 | print(cnt_seg) 219 | stats["seg_miou"] = (seg_iou / cnt_seg).cpu().item() 220 | 221 | # do not print aux test loss 222 | stats = {k:v for k,v in stats.items() if k.split('_')[-1] not in ['unscaled', '0', '1', '2']} 223 | # with (output_dir / f"{data_loader.dataset.split}_iou.json").open("w") as f: 224 | # f.write(json.dumps(results_iou) + "\n") 225 | return stats, results_dict 226 | 227 | 228 | def to_cuda(samples, targets, device): 229 | # samples = samples.to(device, non_blocking=True) 230 | samples = {k: v.to(device, non_blocking=True) for k, v in samples.items()} 231 | targets = [{k: v.to(device, non_blocking=True) for k, v in t.items()} for t in targets] 232 | return samples, targets 233 | 234 | class data_prefetcher(): 235 | def __init__(self, loader, device, prefetch=True): 236 | self.loader = iter(loader) 237 | self.prefetch = prefetch 238 | self.device = device 239 | if prefetch: 240 | self.stream = torch.cuda.Stream() 241 | self.preload() 242 | 243 | def preload(self): 244 | try: 245 | self.next_samples, self.next_targets = next(self.loader) 246 | except StopIteration: 247 | self.next_samples = None 248 | self.next_targets = None 249 | return 250 | # if record_stream() doesn't work, another option is to make sure device inputs are created 251 
| # on the main stream. 252 | # self.next_input_gpu = torch.empty_like(self.next_input, device='cuda') 253 | # self.next_target_gpu = torch.empty_like(self.next_target, device='cuda') 254 | # Need to make sure the memory allocated for next_* is not still in use by the main stream 255 | # at the time we start copying to next_*: 256 | # self.stream.wait_stream(torch.cuda.current_stream()) 257 | with torch.cuda.stream(self.stream): 258 | self.next_samples, self.next_targets = to_cuda(self.next_samples, self.next_targets, self.device) 259 | # more code for the alternative if record_stream() doesn't work: 260 | # copy_ will record the use of the pinned source tensor in this side stream. 261 | # self.next_input_gpu.copy_(self.next_input, non_blocking=True) 262 | # self.next_target_gpu.copy_(self.next_target, non_blocking=True) 263 | # self.next_input = self.next_input_gpu 264 | # self.next_target = self.next_target_gpu 265 | 266 | # With Amp, it isn't necessary to manually convert data to half. 267 | # if args.fp16: 268 | # self.next_input = self.next_input.half() 269 | # else: 270 | 271 | def next(self): 272 | if self.prefetch: 273 | torch.cuda.current_stream().wait_stream(self.stream) 274 | samples = self.next_samples 275 | targets = self.next_targets 276 | if samples is not None: 277 | for k, v in samples.items(): 278 | v.record_stream(torch.cuda.current_stream()) 279 | if targets is not None: 280 | for t in targets: 281 | for k, v in t.items(): 282 | v.record_stream(torch.cuda.current_stream()) 283 | self.preload() 284 | else: 285 | try: 286 | samples, targets = next(self.loader) 287 | samples, targets = to_cuda(samples, targets, self.device) 288 | except StopIteration: 289 | samples = None 290 | targets = None 291 | return samples, targets 292 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | from .reftr_transformer import build_reftr as build_transformer_based_reftr 2 | from .reftr_segmentation import build_reftr_seg 3 | 4 | def build_reftr(args): 5 | if args.reftr_type.startswith('transformer'): 6 | if args.masks: 7 | return build_reftr_seg(args) 8 | else: 9 | return build_transformer_based_reftr(args) 10 | else: 11 | raise NotImplementedError -------------------------------------------------------------------------------- /models/modeling/backbone.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | """ 11 | Backbone modules. 
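ResNet backbones are wrapped with FrozenBatchNorm2d and exposed through
torchvision's IntermediateLayerGetter, returning single- or multi-scale
feature maps as NestedTensors.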
12 | """ 13 | from collections import OrderedDict 14 | 15 | import torch 16 | import torch.nn.functional as F 17 | import torchvision 18 | from torch import nn 19 | from torchvision.models._utils import IntermediateLayerGetter 20 | from typing import Dict, List 21 | 22 | from util.misc import NestedTensor, is_main_process 23 | 24 | from .position_encoding import build_position_encoding 25 | 26 | class MLP(nn.Module): 27 | """ Very simple multi-layer perceptron (also called FFN)""" 28 | 29 | def __init__(self, input_dim, hidden_dim, output_dim, num_layers): 30 | super().__init__() 31 | self.num_layers = num_layers 32 | h = [hidden_dim] * (num_layers - 1) 33 | self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) 34 | 35 | def forward(self, x): 36 | for i, layer in enumerate(self.layers): 37 | x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) 38 | return x 39 | 40 | def _get_clones(module, N): 41 | return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) 42 | 43 | class FrozenBatchNorm2d(torch.nn.Module): 44 | """ 45 | BatchNorm2d where the batch statistics and the affine parameters are fixed. 46 | 47 | Copy-paste from torchvision.misc.ops with added eps before rqsrt, 48 | without which any other models than torchvision.models.resnet[18,34,50,101] 49 | produce nans. 50 | """ 51 | 52 | def __init__(self, n, eps=1e-5): 53 | super(FrozenBatchNorm2d, self).__init__() 54 | self.register_buffer("weight", torch.ones(n)) 55 | self.register_buffer("bias", torch.zeros(n)) 56 | self.register_buffer("running_mean", torch.zeros(n)) 57 | self.register_buffer("running_var", torch.ones(n)) 58 | self.eps = eps 59 | 60 | def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, 61 | missing_keys, unexpected_keys, error_msgs): 62 | num_batches_tracked_key = prefix + 'num_batches_tracked' 63 | if num_batches_tracked_key in state_dict: 64 | del state_dict[num_batches_tracked_key] 65 | 66 | super(FrozenBatchNorm2d, self)._load_from_state_dict( 67 | state_dict, prefix, local_metadata, strict, 68 | missing_keys, unexpected_keys, error_msgs) 69 | 70 | def forward(self, x): 71 | # move reshapes to the beginning 72 | # to make it fuser-friendly 73 | w = self.weight.reshape(1, -1, 1, 1) 74 | b = self.bias.reshape(1, -1, 1, 1) 75 | rv = self.running_var.reshape(1, -1, 1, 1) 76 | rm = self.running_mean.reshape(1, -1, 1, 1) 77 | eps = self.eps 78 | scale = w * (rv + eps).rsqrt() 79 | bias = b - rm * scale 80 | return x * scale + bias 81 | 82 | 83 | class BackboneBase(nn.Module): 84 | 85 | def __init__(self, backbone: nn.Module, train_backbone: bool, return_interm_layers: bool): 86 | super().__init__() 87 | for name, parameter in backbone.named_parameters(): 88 | if not train_backbone or 'layer2' not in name and 'layer3' not in name and 'layer4' not in name: 89 | parameter.requires_grad_(False) 90 | if return_interm_layers: 91 | return_layers = {"layer1": "0", "layer2": "1", "layer3": "2", "layer4": "3"} 92 | # return_layers = {"layer2": "0", "layer3": "1", "layer4": "2"} 93 | self.strides = [4, 8, 16, 32] 94 | self.num_channels = [256, 512, 1024, 2048] 95 | else: 96 | return_layers = {'layer4': "0"} 97 | self.strides = [32] 98 | self.num_channels = [2048] 99 | self.body = IntermediateLayerGetter(backbone, return_layers=return_layers) 100 | 101 | def forward(self, tensor_list: NestedTensor): 102 | xs = self.body(tensor_list.tensors) 103 | out: Dict[str, NestedTensor] = {} 104 | for name, x in xs.items(): 105 | m = tensor_list.mask 106 | assert m 
is not None 107 | mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0] 108 | out[name] = NestedTensor(x, mask) 109 | return out 110 | 111 | 112 | class Backbone(BackboneBase): 113 | """ResNet backbone with frozen BatchNorm.""" 114 | def __init__(self, name: str, 115 | train_backbone: bool, 116 | return_interm_layers: bool, 117 | dilation: bool): 118 | norm_layer = FrozenBatchNorm2d 119 | backbone = getattr(torchvision.models, name)( 120 | replace_stride_with_dilation=[False, False, dilation], 121 | pretrained=is_main_process(), norm_layer=norm_layer) 122 | assert name not in ('resnet18', 'resnet34'), "number of channels are hard coded" 123 | super().__init__(backbone, train_backbone, return_interm_layers) 124 | if dilation: 125 | self.strides[-1] = self.strides[-1] // 2 126 | 127 | 128 | class Joiner(nn.Sequential): 129 | def __init__(self, backbone, position_embedding): 130 | super().__init__(backbone, position_embedding) 131 | self.strides = backbone.strides 132 | self.num_channels = backbone.num_channels 133 | 134 | def forward(self, tensor_list: NestedTensor): 135 | xs = self[0](tensor_list) 136 | out: List[NestedTensor] = [] 137 | pos = [] 138 | for name, x in sorted(xs.items()): 139 | out.append(x) 140 | 141 | # position encoding 142 | for x in out: 143 | pos.append(self[1](x).to(x.tensors.dtype)) 144 | 145 | return out, pos 146 | 147 | 148 | def build_backbone(args): 149 | position_embedding = build_position_encoding(args) 150 | train_backbone = args.lr_backbone > 0 151 | return_interm_layers = args.masks or (args.num_feature_levels > 1) 152 | backbone = Backbone(args.backbone, train_backbone, return_interm_layers, args.dilation) 153 | model = Joiner(backbone, position_embedding) 154 | return model 155 | -------------------------------------------------------------------------------- /models/modeling/matcher.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | """ 11 | Modules to compute the matching cost and solve the corresponding LSAP. 
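"LSAP" is the linear sum assignment problem: given a cost matrix of shape
[num_queries, num_targets], find the one-to-one assignment with minimum total cost.
The heavy lifting is done by scipy.optimize.linear_sum_assignment, e.g. (illustrative
sketch only):

    import numpy as np
    from scipy.optimize import linear_sum_assignment

    C = np.array([[0.2, 0.9],
                  [0.8, 0.1],
                  [0.5, 0.6]])   # 3 predictions x 2 targets
    row_ind, col_ind = linear_sum_assignment(C)
    # row_ind = [0, 1], col_ind = [0, 1]:
    # prediction 0 is matched to target 0, prediction 1 to target 1, prediction 2 is unmatched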
12 | """ 13 | import torch 14 | from scipy.optimize import linear_sum_assignment 15 | from torch import nn 16 | 17 | from util.box_ops import box_cxcywh_to_xyxy, generalized_box_iou 18 | 19 | class OnetoAllMatcher(nn.Module): 20 | """ 21 | Assume only one gt per match batch 22 | """ 23 | def __init__(self, 24 | cost_class: float = 1, 25 | cost_bbox: float = 1, 26 | cost_giou: float = 1): 27 | """Creates the matcher 28 | 29 | Params: 30 | cost_class: This is the relative weight of the classification error in the matching cost 31 | cost_bbox: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost 32 | cost_giou: This is the relative weight of the giou loss of the bounding box in the matching cost 33 | """ 34 | super().__init__() 35 | self.cost_class = cost_class 36 | self.cost_bbox = cost_bbox 37 | self.cost_giou = cost_giou 38 | assert cost_class != 0 or cost_bbox != 0 or cost_giou != 0, "all costs cant be 0" 39 | 40 | def forward(self, outputs, targets, topk=1, use_softmax_match=False): 41 | with torch.no_grad(): 42 | bs, num_queries = outputs["pred_logits"].shape[:2] 43 | 44 | out_prob = outputs["pred_logits"] 45 | out_bbox = outputs["pred_boxes"].flatten(0, 1) # [batch_size * num_queries, 4] 46 | tgt_ids = torch.cat([v["labels"] for v in targets]) 47 | tgt_bbox = torch.cat([v["boxes"] for v in targets]) 48 | 49 | if use_softmax_match: 50 | out_prob = nn.functional.softmax(out_prob.view(bs, num_queries), dim=-1) 51 | out_prob = out_prob.view(bs * num_queries, -1) 52 | cost_class = -(out_prob + 1e-8).log() 53 | cost_class = cost_class[:, tgt_ids] 54 | else: 55 | alpha = 0.25 56 | gamma = 2.0 57 | out_prob = out_prob.flatten(0, 1).sigmoid() 58 | neg_cost_class = (1 - alpha) * (out_prob ** gamma) * (-(1 - out_prob + 1e-8).log()) 59 | pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log()) 60 | # Compute the costs. 61 | cost_class = pos_cost_class[:, tgt_ids] - neg_cost_class[:, tgt_ids] 62 | 63 | cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1) 64 | cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox), 65 | box_cxcywh_to_xyxy(tgt_bbox)) 66 | 67 | # Final cost matrix 68 | C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou 69 | C = C.view(bs, num_queries, -1).cpu() 70 | 71 | # Muchen: Here is what different from HungarianMacher 72 | sizes = [len(v["boxes"]) for v in targets] 73 | 74 | indices = [] 75 | for batch_i, c in enumerate(C.split(sizes, -1)): 76 | cost_matrix = c[batch_i] 77 | q, tgt = cost_matrix.shape 78 | 79 | assert tgt == 1, f"cost_matrix have a size of: {cost_matrix.shape}" 80 | # take the 81 | topv, topi = torch.topk(-1*cost_matrix.flatten(), topk) 82 | indices.append((topi, [0]*topk)) 83 | 84 | return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices] 85 | 86 | 87 | class HungarianMatcher(nn.Module): 88 | """This class computes an assignment between the targets and the predictions of the network 89 | 90 | For efficiency reasons, the targets don't include the no_object. Because of this, in general, 91 | there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, 92 | while the others are un-matched (and thus treated as non-objects). 
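    Shape expectations follow the forward() docstring below. A minimal call sketch
    (illustrative values only; the cost weights and class count are just example numbers):

        import torch
        matcher = HungarianMatcher(cost_class=1.0, cost_bbox=5.0, cost_giou=2.0)
        outputs = {'pred_logits': torch.rand(2, 100, 92),
                   'pred_boxes':  torch.rand(2, 100, 4)}   # boxes in normalized cxcywh
        targets = [{'labels': torch.tensor([3]),    'boxes': torch.rand(1, 4)},
                   {'labels': torch.tensor([5, 7]), 'boxes': torch.rand(2, 4)}]
        indices = matcher(outputs, targets)
        # indices[b] is a (pred_idx, tgt_idx) pair of equally sized index tensors for image b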
93 | """ 94 | 95 | def __init__(self, 96 | cost_class: float = 1, 97 | cost_bbox: float = 1, 98 | cost_giou: float = 1): 99 | """Creates the matcher 100 | 101 | Params: 102 | cost_class: This is the relative weight of the classification error in the matching cost 103 | cost_bbox: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost 104 | cost_giou: This is the relative weight of the giou loss of the bounding box in the matching cost 105 | """ 106 | super().__init__() 107 | self.cost_class = cost_class 108 | self.cost_bbox = cost_bbox 109 | self.cost_giou = cost_giou 110 | assert cost_class != 0 or cost_bbox != 0 or cost_giou != 0, "all costs cant be 0" 111 | 112 | # TODO: work around here for parameter passing 113 | def forward(self, outputs, targets, topk=1, use_softmax_match=False): 114 | """ Performs the matching 115 | 116 | Params: 117 | outputs: This is a dict that contains at least these entries: 118 | "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits 119 | "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates 120 | 121 | targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing: 122 | "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth 123 | objects in the target) containing the class labels 124 | "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates 125 | 126 | Returns: 127 | A list of size batch_size, containing tuples of (index_i, index_j) where: 128 | - index_i is the indices of the selected predictions (in order) 129 | - index_j is the indices of the corresponding selected targets (in order) 130 | For each batch element, it holds: 131 | len(index_i) = len(index_j) = min(num_queries, num_target_boxes) 132 | """ 133 | with torch.no_grad(): 134 | bs, num_queries = outputs["pred_logits"].shape[:2] 135 | 136 | # We flatten to compute the cost matrices in a batch 137 | out_prob = outputs["pred_logits"].flatten(0, 1).sigmoid() 138 | out_bbox = outputs["pred_boxes"].flatten(0, 1) # [batch_size * num_queries, 4] 139 | 140 | # Also concat the target labels and boxes 141 | tgt_ids = torch.cat([v["labels"] for v in targets]) 142 | tgt_bbox = torch.cat([v["boxes"] for v in targets]) 143 | 144 | # Compute the classification cost. 
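            # The block below is the focal-loss-style classification cost used in Deformable DETR:
            #     pos_cost = alpha * (1 - p)^gamma * (-log(p))
            #     neg_cost = (1 - alpha) * p^gamma * (-log(1 - p))
            # and the cost of matching a query to a target is pos_cost - neg_cost evaluated
            # at the target's class id; the 1e-8 terms guard against log(0).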
145 | alpha = 0.25 146 | gamma = 2.0 147 | neg_cost_class = (1 - alpha) * (out_prob ** gamma) * (-(1 - out_prob + 1e-8).log()) 148 | pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log()) 149 | cost_class = pos_cost_class[:, tgt_ids] - neg_cost_class[:, tgt_ids] 150 | 151 | # Compute the L1 cost between boxes 152 | cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1) 153 | 154 | # Compute the giou cost betwen boxes 155 | cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox), 156 | box_cxcywh_to_xyxy(tgt_bbox)) 157 | 158 | # Final cost matrix 159 | C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou 160 | C = C.view(bs, num_queries, -1).cpu() 161 | 162 | sizes = [len(v["boxes"]) for v in targets] 163 | indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))] 164 | return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices] 165 | 166 | 167 | def build_matcher(args, mode='hungarian'): 168 | if mode == 'hungarian': 169 | print("Building Hungarian Matcher") 170 | return HungarianMatcher(cost_class=args.set_cost_class, 171 | cost_bbox=args.set_cost_bbox, 172 | cost_giou=args.set_cost_giou) 173 | elif mode == "one_to_all": 174 | print("Building One to all Matcher") 175 | return OnetoAllMatcher(cost_class=args.set_cost_class, 176 | cost_bbox=args.set_cost_bbox, 177 | cost_giou=args.set_cost_giou) 178 | else: 179 | raise NotImplementedError 180 | 181 | -------------------------------------------------------------------------------- /models/modeling/position_encoding.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | """ 11 | Various positional encodings for the transformer. 12 | """ 13 | import math 14 | import torch 15 | from torch import nn 16 | 17 | from util.misc import NestedTensor 18 | 19 | 20 | class PositionEmbeddingSine(nn.Module): 21 | """ 22 | This is a more standard version of the position embedding, very similar to the one 23 | used by the Attention is all you need paper, generalized to work on images. 
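    Concretely (matching forward() below): the x/y coordinates are cumulative sums over
    the unmasked region, optionally normalized to [0, 2*pi], and each coordinate c is
    expanded into num_pos_feats channels as

        PE[2i]   = sin(c / temperature**(2i / num_pos_feats))
        PE[2i+1] = cos(c / temperature**(2i / num_pos_feats))

    with the y-encoding and x-encoding concatenated along the channel dimension, giving
    2 * num_pos_feats channels in total.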
24 | """ 25 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 26 | super().__init__() 27 | self.num_pos_feats = num_pos_feats 28 | self.temperature = temperature 29 | self.normalize = normalize 30 | if scale is not None and normalize is False: 31 | raise ValueError("normalize should be True if scale is passed") 32 | if scale is None: 33 | scale = 2 * math.pi 34 | self.scale = scale 35 | 36 | def forward(self, tensor_list: NestedTensor): 37 | x = tensor_list.tensors 38 | mask = tensor_list.mask 39 | assert mask is not None 40 | not_mask = ~mask 41 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 42 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 43 | if self.normalize: 44 | eps = 1e-6 45 | y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + eps) * self.scale 46 | x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale 47 | 48 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 49 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 50 | 51 | pos_x = x_embed[:, :, :, None] / dim_t 52 | pos_y = y_embed[:, :, :, None] / dim_t 53 | pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) 54 | pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) 55 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 56 | return pos 57 | 58 | 59 | class PositionEmbeddingLearned(nn.Module): 60 | """ 61 | Absolute pos embedding, learned. 62 | """ 63 | def __init__(self, num_pos_feats=256): 64 | super().__init__() 65 | self.row_embed = nn.Embedding(50, num_pos_feats) 66 | self.col_embed = nn.Embedding(50, num_pos_feats) 67 | self.reset_parameters() 68 | 69 | def reset_parameters(self): 70 | nn.init.uniform_(self.row_embed.weight) 71 | nn.init.uniform_(self.col_embed.weight) 72 | 73 | def forward(self, tensor_list: NestedTensor): 74 | x = tensor_list.tensors 75 | h, w = x.shape[-2:] 76 | i = torch.arange(w, device=x.device) 77 | j = torch.arange(h, device=x.device) 78 | x_emb = self.col_embed(i) 79 | y_emb = self.row_embed(j) 80 | pos = torch.cat([ 81 | x_emb.unsqueeze(0).repeat(h, 1, 1), 82 | y_emb.unsqueeze(1).repeat(1, w, 1), 83 | ], dim=-1).permute(2, 0, 1).unsqueeze(0).repeat(x.shape[0], 1, 1, 1) 84 | return pos 85 | 86 | 87 | def build_position_encoding(args): 88 | N_steps = args.hidden_dim // 2 89 | if args.position_embedding in ('v2', 'sine'): 90 | # TODO find a better way of exposing other arguments 91 | position_embedding = PositionEmbeddingSine(N_steps, normalize=True) 92 | elif args.position_embedding in ('v3', 'learned'): 93 | position_embedding = PositionEmbeddingLearned(N_steps) 94 | else: 95 | raise ValueError(f"not supported {args.position_embedding}") 96 | 97 | return position_embedding 98 | -------------------------------------------------------------------------------- /models/modeling/transformer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | """ 3 | DETR Transformer class. 
4 | Copy-paste from torch.nn.Transformer with modifications: 5 | * positional encodings are passed in MHattention 6 | * extra LN at the end of encoder is removed 7 | * decoder returns a stack of activations from all decoding layers 8 | """ 9 | import copy 10 | from typing import Optional, List 11 | 12 | import torch 13 | import torch.nn.functional as F 14 | from torch.nn.init import xavier_uniform_, constant_, uniform_, normal_ 15 | from torch import nn, Tensor 16 | 17 | 18 | class Transformer(nn.Module): 19 | """ 20 | Modified based on deformable transformer to enable multi-scale. 21 | """ 22 | def __init__(self, d_model=512, nhead=8, num_encoder_layers=6, 23 | num_decoder_layers=6, dim_feedforward=2048, dropout=0.1, 24 | activation="relu", normalize_before=False, num_feature_levels=1, 25 | return_intermediate_dec=False): 26 | super().__init__() 27 | 28 | encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward, 29 | dropout, activation, normalize_before) 30 | encoder_norm = nn.LayerNorm(d_model) if normalize_before else None 31 | self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm) 32 | 33 | decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward, 34 | dropout, activation, normalize_before) 35 | decoder_norm = nn.LayerNorm(d_model) 36 | self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm, 37 | return_intermediate=return_intermediate_dec) 38 | 39 | self.level_embed = nn.Parameter(torch.Tensor(num_feature_levels, d_model)) 40 | self._reset_parameters() 41 | 42 | self.d_model = d_model 43 | self.nhead = nhead 44 | 45 | def _reset_parameters(self): 46 | for p in self.parameters(): 47 | if p.dim() > 1: 48 | nn.init.xavier_uniform_(p) 49 | normal_(self.level_embed) 50 | 51 | def forward(self, src, mask, pos_embed, query_embed=None, lang_feat=None): 52 | src_flatten = [] 53 | mask_flatten = [] 54 | lvl_pos_embed_flatten = [] 55 | for lvl, (src, mask, pos_embed) in enumerate(zip(srcs, masks, pos_embeds)): 56 | bs, c, h, w = src.shape 57 | src = src.flatten(2).transpose(1, 2) 58 | mask = mask.flatten(1) 59 | pos_embed = pos_embed.flatten(2).transpose(1, 2) 60 | lvl_pos_embed = pos_embed + self.level_embed[lvl].view(1, 1, -1) 61 | lvl_pos_embed_flatten.append(lvl_pos_embed) 62 | src_flatten.append(src) 63 | mask_flatten.append(mask) 64 | src_flatten = torch.cat(src_flatten, 1).transpose(0, 1) 65 | mask_flatten = torch.cat(mask_flatten, 1).transpose(0, 1) 66 | lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1).transpose(0, 1) 67 | 68 | query_embed, tgt = torch.split(query_embed, c, dim=1) 69 | query_embed = query_embed.unsqueeze(1).expand(-1, bs, -1) 70 | tgt = tgt.unsqueeze(1).expand(-1, bs, -1) 71 | lang_feat = lang_feat.transpose(0, 1) 72 | 73 | query_embed = query_embed + lang_feat 74 | 75 | memory = self.encoder(src_flatten, src_key_padding_mask=mask_flatten, pos=lvl_pos_embed_flatten) 76 | hs = self.decoder(tgt, memory, memory_key_padding_mask=mask_flatten, 77 | pos=lvl_pos_embed_flatten, query_pos=query_embed) 78 | return hs.transpose(1, 2), #memory.permute(1, 2, 0).view(bs, c, h, w) 79 | 80 | 81 | class TransformerEncoder(nn.Module): 82 | 83 | def __init__(self, encoder_layer, num_layers, norm=None): 84 | super().__init__() 85 | self.layers = _get_clones(encoder_layer, num_layers) 86 | self.num_layers = num_layers 87 | self.norm = norm 88 | 89 | def forward(self, src, 90 | mask: Optional[Tensor] = None, 91 | src_key_padding_mask: Optional[Tensor] = None, 92 | pos: Optional[Tensor] = None): 93 
| output = src 94 | 95 | for layer in self.layers: 96 | output = layer(output, src_mask=mask, 97 | src_key_padding_mask=src_key_padding_mask, pos=pos) 98 | 99 | if self.norm is not None: 100 | output = self.norm(output) 101 | 102 | return output 103 | 104 | 105 | class TransformerDecoder(nn.Module): 106 | 107 | def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False): 108 | super().__init__() 109 | self.layers = _get_clones(decoder_layer, num_layers) 110 | self.num_layers = num_layers 111 | self.norm = norm 112 | self.return_intermediate = return_intermediate 113 | 114 | def forward(self, tgt, memory, 115 | tgt_mask: Optional[Tensor] = None, 116 | memory_mask: Optional[Tensor] = None, 117 | tgt_key_padding_mask: Optional[Tensor] = None, 118 | memory_key_padding_mask: Optional[Tensor] = None, 119 | pos: Optional[Tensor] = None, 120 | query_pos: Optional[Tensor] = None): 121 | output = tgt 122 | 123 | intermediate = [] 124 | 125 | for layer in self.layers: 126 | output = layer(output, memory, tgt_mask=tgt_mask, 127 | memory_mask=memory_mask, 128 | tgt_key_padding_mask=tgt_key_padding_mask, 129 | memory_key_padding_mask=memory_key_padding_mask, 130 | pos=pos, query_pos=query_pos) 131 | if self.return_intermediate: 132 | intermediate.append(self.norm(output)) 133 | 134 | if self.norm is not None: 135 | output = self.norm(output) 136 | if self.return_intermediate: 137 | intermediate.pop() 138 | intermediate.append(output) 139 | 140 | if self.return_intermediate: 141 | return torch.stack(intermediate) 142 | 143 | return output.unsqueeze(0) 144 | 145 | 146 | class TransformerEncoderLayer(nn.Module): 147 | 148 | def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, 149 | activation="relu", normalize_before=False): 150 | super().__init__() 151 | self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) 152 | # Implementation of Feedforward model 153 | self.linear1 = nn.Linear(d_model, dim_feedforward) 154 | self.dropout = nn.Dropout(dropout) 155 | self.linear2 = nn.Linear(dim_feedforward, d_model) 156 | 157 | self.norm1 = nn.LayerNorm(d_model) 158 | self.norm2 = nn.LayerNorm(d_model) 159 | self.dropout1 = nn.Dropout(dropout) 160 | self.dropout2 = nn.Dropout(dropout) 161 | 162 | self.activation = _get_activation_fn(activation) 163 | self.normalize_before = normalize_before 164 | 165 | def with_pos_embed(self, tensor, pos: Optional[Tensor]): 166 | return tensor if pos is None else tensor + pos 167 | 168 | def forward_post(self, 169 | src, 170 | src_mask: Optional[Tensor] = None, 171 | src_key_padding_mask: Optional[Tensor] = None, 172 | pos: Optional[Tensor] = None): 173 | q = k = self.with_pos_embed(src, pos) 174 | src2 = self.self_attn(q, k, value=src, attn_mask=src_mask, 175 | key_padding_mask=src_key_padding_mask)[0] 176 | src = src + self.dropout1(src2) 177 | src = self.norm1(src) 178 | src2 = self.linear2(self.dropout(self.activation(self.linear1(src)))) 179 | src = src + self.dropout2(src2) 180 | src = self.norm2(src) 181 | return src 182 | 183 | def forward_pre(self, src, 184 | src_mask: Optional[Tensor] = None, 185 | src_key_padding_mask: Optional[Tensor] = None, 186 | pos: Optional[Tensor] = None): 187 | src2 = self.norm1(src) 188 | q = k = self.with_pos_embed(src2, pos) 189 | src2 = self.self_attn(q, k, value=src2, attn_mask=src_mask, 190 | key_padding_mask=src_key_padding_mask)[0] 191 | src = src + self.dropout1(src2) 192 | src2 = self.norm2(src) 193 | src2 = self.linear2(self.dropout(self.activation(self.linear1(src2)))) 194 | src 
= src + self.dropout2(src2) 195 | return src 196 | 197 | def forward(self, src, 198 | src_mask: Optional[Tensor] = None, 199 | src_key_padding_mask: Optional[Tensor] = None, 200 | pos: Optional[Tensor] = None): 201 | if self.normalize_before: 202 | return self.forward_pre(src, src_mask, src_key_padding_mask, pos) 203 | return self.forward_post(src, src_mask, src_key_padding_mask, pos) 204 | 205 | 206 | class TransformerDecoderLayer(nn.Module): 207 | 208 | def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, 209 | activation="relu", normalize_before=False): 210 | super().__init__() 211 | self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) 212 | self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) 213 | # Implementation of Feedforward model 214 | self.linear1 = nn.Linear(d_model, dim_feedforward) 215 | self.dropout = nn.Dropout(dropout) 216 | self.linear2 = nn.Linear(dim_feedforward, d_model) 217 | 218 | self.norm1 = nn.LayerNorm(d_model) 219 | self.norm2 = nn.LayerNorm(d_model) 220 | self.norm3 = nn.LayerNorm(d_model) 221 | self.dropout1 = nn.Dropout(dropout) 222 | self.dropout2 = nn.Dropout(dropout) 223 | self.dropout3 = nn.Dropout(dropout) 224 | 225 | self.activation = _get_activation_fn(activation) 226 | self.normalize_before = normalize_before 227 | 228 | def with_pos_embed(self, tensor, pos: Optional[Tensor]): 229 | return tensor if pos is None else tensor + pos 230 | 231 | def forward_post(self, tgt, memory, 232 | tgt_mask: Optional[Tensor] = None, 233 | memory_mask: Optional[Tensor] = None, 234 | tgt_key_padding_mask: Optional[Tensor] = None, 235 | memory_key_padding_mask: Optional[Tensor] = None, 236 | pos: Optional[Tensor] = None, 237 | query_pos: Optional[Tensor] = None): 238 | q = k = self.with_pos_embed(tgt, query_pos) 239 | tgt2 = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask, 240 | key_padding_mask=tgt_key_padding_mask)[0] 241 | tgt = tgt + self.dropout1(tgt2) 242 | tgt = self.norm1(tgt) 243 | tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt, query_pos), 244 | key=self.with_pos_embed(memory, pos), 245 | value=memory, attn_mask=memory_mask, 246 | key_padding_mask=memory_key_padding_mask)[0] 247 | tgt = tgt + self.dropout2(tgt2) 248 | tgt = self.norm2(tgt) 249 | tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) 250 | tgt = tgt + self.dropout3(tgt2) 251 | tgt = self.norm3(tgt) 252 | return tgt 253 | 254 | def forward_pre(self, tgt, memory, 255 | tgt_mask: Optional[Tensor] = None, 256 | memory_mask: Optional[Tensor] = None, 257 | tgt_key_padding_mask: Optional[Tensor] = None, 258 | memory_key_padding_mask: Optional[Tensor] = None, 259 | pos: Optional[Tensor] = None, 260 | query_pos: Optional[Tensor] = None): 261 | tgt2 = self.norm1(tgt) 262 | q = k = self.with_pos_embed(tgt2, query_pos) 263 | tgt2 = self.self_attn(q, k, value=tgt2, attn_mask=tgt_mask, 264 | key_padding_mask=tgt_key_padding_mask)[0] 265 | tgt = tgt + self.dropout1(tgt2) 266 | tgt2 = self.norm2(tgt) 267 | tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt2, query_pos), 268 | key=self.with_pos_embed(memory, pos), 269 | value=memory, attn_mask=memory_mask, 270 | key_padding_mask=memory_key_padding_mask)[0] 271 | tgt = tgt + self.dropout2(tgt2) 272 | tgt2 = self.norm3(tgt) 273 | tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2)))) 274 | tgt = tgt + self.dropout3(tgt2) 275 | return tgt 276 | 277 | def forward(self, tgt, memory, 278 | tgt_mask: Optional[Tensor] = None, 279 | memory_mask: Optional[Tensor] 
= None, 280 | tgt_key_padding_mask: Optional[Tensor] = None, 281 | memory_key_padding_mask: Optional[Tensor] = None, 282 | pos: Optional[Tensor] = None, 283 | query_pos: Optional[Tensor] = None): 284 | if self.normalize_before: 285 | return self.forward_pre(tgt, memory, tgt_mask, memory_mask, 286 | tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos) 287 | return self.forward_post(tgt, memory, tgt_mask, memory_mask, 288 | tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos) 289 | 290 | 291 | def _get_clones(module, N): 292 | return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) 293 | 294 | def build_transformer(args): 295 | return Transformer( 296 | d_model=args.hidden_dim, 297 | nhead=args.nheads, 298 | num_encoder_layers=args.enc_layers, 299 | num_decoder_layers=args.dec_layers, 300 | dim_feedforward=args.dim_feedforward, 301 | dropout=args.dropout, 302 | activation="relu", 303 | return_intermediate_dec=True, 304 | num_feature_levels=args.num_feature_levels) 305 | 306 | def _get_activation_fn(activation): 307 | """Return an activation function given a string""" 308 | if activation == "relu": 309 | return F.relu 310 | if activation == "gelu": 311 | return F.gelu 312 | if activation == "glu": 313 | return F.glu 314 | raise RuntimeError(F"activation should be relu/gelu, not {activation}.") -------------------------------------------------------------------------------- /models/post_process.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | import math 5 | from util import box_ops 6 | 7 | class PostProcessVGOnePhrase(nn.Module): 8 | """ This module converts the model's output into the format expected by the coco api""" 9 | 10 | @torch.no_grad() 11 | def forward(self, outputs, target_sizes, scale_to_original_shape=False): 12 | """ Perform the computation 13 | Parameters: 14 | outputs: raw outputs of the model 15 | target_sizes: tensor of dimension [batch_size x 2] containing the size of each images of the batch 16 | For evaluation, this must be the original image size (before any data augmentation) 17 | For visualization, this should be the image size after data augment, but before padding 18 | """ 19 | out_bbox = outputs['pred_boxes'] 20 | bs, k, _ = out_bbox.shape 21 | 22 | assert len(out_bbox) == len(target_sizes) 23 | assert target_sizes.shape[1] == 2 24 | 25 | # TODO for multiple predictions 26 | # print("out_bbox.shape:", out_bbox.shape) 27 | out_bbox = out_bbox[:, 0, :] 28 | boxes = box_ops.box_cxcywh_to_xyxy(out_bbox) 29 | 30 | # and from relative [0, 1] to absolute [0, height] coordinates 31 | if scale_to_original_shape: 32 | img_h, img_w = target_sizes.unbind(1) 33 | scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1) 34 | boxes = boxes * scale_fct 35 | 36 | # print("boxes.shape:", boxes.shape) 37 | # return boxes 38 | results = [{'boxes': boxes[i:i+1, :]} for i in range(bs)] 39 | return results 40 | 41 | class PostProcessVGMultiPhrase(nn.Module): 42 | """ This module converts the model's output into the format expected by the coco api""" 43 | 44 | @torch.no_grad() 45 | def forward(self, outputs, target_sizes, scale_to_original_shape=False): 46 | """ Perform the computation 47 | Parameters: 48 | outputs: raw outputs of the model 49 | target_sizes: tensor of dimension [batch_size x 2] containing the size of each images of the batch 50 | For evaluation, this must be the original image size (before any data augmentation) 51 | For 
visualization, this should be the image size after data augment, but before padding 52 | """ 53 | out_bbox = outputs['pred_boxes'] 54 | bsz, num_phrase, k, _ = out_bbox.shape 55 | mask = outputs['phrase_mask'].view(bsz, num_phrase, k, -1) 56 | # print(out_bbox.shape, mask.shape) 57 | 58 | target_boxes = [] 59 | assert bsz == len(target_sizes) 60 | for i in range(bsz): 61 | mask_i = mask[i] 62 | pred_i = torch.masked_select(out_bbox[i], mask_i).view(-1, k, 4) 63 | 64 | assert target_sizes.shape[1] == 2 65 | 66 | # TODO for multiple predictions 67 | # print("out_bbox.shape:", out_bbox.shape) 68 | out_bbox_i = pred_i[:, 0, :] 69 | boxes = box_ops.box_cxcywh_to_xyxy(out_bbox_i) 70 | 71 | # and from relative [0, 1] to absolute [0, height] coordinates 72 | if scale_to_original_shape: 73 | img_h, img_w = target_sizes[i:i+1].unbind(1) 74 | scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1) 75 | # print(boxes, scale_fct) 76 | boxes = boxes * scale_fct 77 | 78 | target_boxes.append(boxes) 79 | 80 | # print("boxes.shape:", boxes.shape) 81 | # return boxes 82 | results = [{'boxes': target_boxes[i]} for i in range(bsz)] 83 | return results -------------------------------------------------------------------------------- /models/reftr.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import torch 3 | import torch.nn.functional as F 4 | 5 | from typing import Optional, List 6 | from torch.nn.init import xavier_uniform_, constant_, uniform_, normal_ 7 | from torch import nn, Tensor 8 | from models.modeling.transformer import TransformerDecoder, TransformerDecoderLayer ,TransformerEncoder, TransformerEncoderLayer 9 | 10 | class VLTransformer(nn.Module): 11 | def __init__(self, d_model=512, nhead=8, num_encoder_layers=6, 12 | num_decoder_layers=6, dim_feedforward=2048, dropout=0.1, 13 | activation="relu", normalize_before=False, num_feature_levels=1, 14 | num_queries=1 ,return_intermediate_dec=False, max_lang_seq=128): 15 | super().__init__() 16 | # Positional embedding and feat type embedding 17 | # token type embedding to indicate image feature vs language feature 18 | self.max_lang_seq = max_lang_seq 19 | self.num_queries = num_queries 20 | self.d_model = d_model 21 | self.nhead = nhead 22 | self.lang_pos_embeddings = nn.Embedding(max_lang_seq, d_model) 23 | self.token_type_embeddings = nn.Embedding(2, d_model) 24 | 25 | # Transformer Encoder as encoder 26 | self.level_embed = nn.Parameter(torch.Tensor(num_feature_levels, d_model)) 27 | encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward, 28 | dropout, activation, normalize_before) 29 | encoder_norm = nn.LayerNorm(d_model) if normalize_before else None 30 | self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm) 31 | 32 | # if num_decoder_layers < 0, no decoder is used 33 | self.use_decoder = num_decoder_layers > 0 34 | if self.use_decoder: 35 | decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward, 36 | dropout, activation, normalize_before) 37 | decoder_norm = nn.LayerNorm(d_model) 38 | self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm, 39 | return_intermediate=return_intermediate_dec) 40 | else: 41 | print("No decoder is used!") 42 | 43 | self._reset_parameters() 44 | 45 | def _reset_parameters(self): 46 | for p in self.parameters(): 47 | if p.dim() > 1: 48 | nn.init.xavier_uniform_(p) 49 | normal_(self.level_embed) 50 | 51 | def process_img_feat(self, img_srcs, img_masks, img_pos_embeds): 52 
| src_flatten = [] 53 | mask_flatten = [] 54 | lvl_pos_embed_flatten = [] 55 | for lvl, (src, mask, pos_embed) in enumerate(zip(img_srcs, img_masks, img_pos_embeds)): 56 | bs, c, h, w = src.shape 57 | src = src.flatten(2).transpose(1, 2) 58 | mask = mask.flatten(1) 59 | pos_embed = pos_embed.flatten(2).transpose(1, 2) 60 | lvl_pos_embed = pos_embed + self.level_embed[lvl].view(1, 1, -1) 61 | lvl_pos_embed_flatten.append(lvl_pos_embed) 62 | src_flatten.append(src) 63 | mask_flatten.append(mask) 64 | img_src_flatten = torch.cat(src_flatten, 1)#.transpose(0, 1) 65 | img_mask_flatten = torch.cat(mask_flatten, 1)#.transpose(0, 1) 66 | img_lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1)#.transpose(0, 1) 67 | 68 | # Add token type embedding if available 69 | bsz, seq_length, dim = img_src_flatten.shape 70 | if self.token_type_embeddings is not None: 71 | token_type_ids = torch.ones((bsz, seq_length), dtype=torch.long, device=img_src_flatten.device) 72 | token_type_embeddings = self.token_type_embeddings(token_type_ids) 73 | img_lvl_pos_embed_flatten = img_lvl_pos_embed_flatten + token_type_embeddings 74 | 75 | return img_mask_flatten,\ 76 | img_src_flatten.transpose(0, 1),\ 77 | img_lvl_pos_embed_flatten.transpose(0, 1) 78 | 79 | def process_lang_feat(self, lang_srcs, lang_masks): 80 | bsz, seq_length, dim = lang_srcs.shape 81 | assert seq_length <= self.max_lang_seq 82 | position_ids = torch.arange(seq_length, dtype=torch.long, device=lang_srcs.device) 83 | position_ids = position_ids.unsqueeze(0).expand(bsz, -1) 84 | position_embeddings = self.lang_pos_embeddings(position_ids) 85 | 86 | if self.token_type_embeddings is not None: 87 | token_type_ids = torch.zeros((bsz, seq_length), dtype=torch.long, device=lang_srcs.device) 88 | token_type_embeddings = self.token_type_embeddings(token_type_ids) 89 | position_embeddings = position_embeddings + token_type_embeddings 90 | 91 | # Non-zero area is ignored 92 | lang_masks = lang_masks.logical_not() 93 | assert (lang_masks[:, 0] == False).all() 94 | 95 | return lang_masks,\ 96 | lang_srcs.transpose(0, 1),\ 97 | position_embeddings.transpose(0, 1) 98 | 99 | def encode(self, img_srcs, img_masks, img_pos_embeds, 100 | lang_srcs, lang_masks): 101 | # create image feature and mask & pos info 102 | 103 | # print(f"img_srcs/img_masks/img_pos_embeds: {img_srcs.shape} {img_masks.shape} {img_pos_embeds.shape}") 104 | img_masks, img_srcs, img_pos_embeds =\ 105 | self.process_img_feat(img_srcs, img_masks, img_pos_embeds) 106 | # print(f"img_srcs/img_masks/img_pos_embeds: {img_srcs.shape} {img_masks.shape} {img_pos_embeds.shape}") 107 | # print(img_masks) 108 | 109 | # print(f"lang_srcs/lang_masks: {lang_srcs.shape} {lang_masks.shape}") 110 | lang_masks, lang_srcs, lang_pos_embeds =\ 111 | self.process_lang_feat(lang_srcs, lang_masks) 112 | # print(f"lang_srcs/lang_masks/lang_pos_embeds: {lang_srcs.shape} {lang_masks.shape} {lang_pos_embeds.shape}") 113 | # print(lang_masks) 114 | 115 | masks = torch.cat([lang_masks, img_masks], dim=1) 116 | srcs = torch.cat([lang_srcs, img_srcs], dim=0) 117 | pos_embeds = torch.cat([lang_pos_embeds, img_pos_embeds], dim=0) 118 | 119 | memory = self.encoder(srcs, src_key_padding_mask=masks, pos=pos_embeds) 120 | return memory, masks, pos_embeds 121 | 122 | def forward(self, img_srcs, img_masks, img_pos_embeds, 123 | lang_srcs, lang_masks, 124 | query=None, query_mask=None, query_pos=None): 125 | 126 | memory, masks, pos_embeds =\ 127 | self.encode(img_srcs, img_masks, img_pos_embeds, lang_srcs, lang_masks) 128 | 129 | 
if self.use_decoder: 130 | # TODO here 131 | hs = self.decoder(query, memory, 132 | memory_key_padding_mask=masks, 133 | tgt_key_padding_mask=query_mask, 134 | pos=pos_embeds, query_pos=query_pos) 135 | else: 136 | hs = memory.unsqueeze(0) 137 | return hs.transpose(1, 2) 138 | 139 | 140 | def build_vl_transformer(args): 141 | return VLTransformer( 142 | d_model=args.hidden_dim, 143 | nhead=args.nheads, 144 | num_encoder_layers=args.enc_layers, 145 | num_decoder_layers=args.dec_layers, 146 | dim_feedforward=args.dim_feedforward, 147 | dropout=args.dropout, 148 | activation="relu", 149 | num_feature_levels=args.num_feature_levels, 150 | return_intermediate_dec=True, 151 | max_lang_seq=args.max_lang_seq 152 | ) 153 | -------------------------------------------------------------------------------- /models/reftr_transformer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from util.misc import (NestedTensor, nested_tensor_from_tensor_list) 6 | from models.modeling.backbone import build_backbone, MLP 7 | 8 | from transformers import RobertaModel, BertModel 9 | from models.reftr import build_vl_transformer 10 | from models.criterion import CriterionVGOnePhrase, CriterionVGMultiPhrase 11 | from models.post_process import PostProcessVGOnePhrase, PostProcessVGMultiPhrase 12 | 13 | 14 | def mlp_mapping(input_dim, output_dim): 15 | return torch.nn.Sequential( 16 | nn.Linear(input_dim, output_dim), 17 | nn.LayerNorm(output_dim), 18 | nn.ReLU(), 19 | nn.Dropout(0.1), 20 | nn.Linear(output_dim, output_dim), 21 | nn.LayerNorm(output_dim), 22 | nn.ReLU(), 23 | ) 24 | 25 | 26 | class QueryEncoder(nn.Module): 27 | def __init__(self, num_queries_per_phrase, hidden_dim, ablation): 28 | super(QueryEncoder, self).__init__() 29 | self.ablation = ablation 30 | self.hidden_dim = hidden_dim 31 | self.query_embed = nn.Embedding(num_queries_per_phrase, hidden_dim*2) 32 | self.linear1 = nn.Linear(hidden_dim, hidden_dim) 33 | self.linear2 = nn.Linear(hidden_dim, hidden_dim) 34 | self.linear3 = nn.Linear(hidden_dim, hidden_dim) 35 | self.fuse_encoder_query = mlp_mapping(hidden_dim*2, hidden_dim) 36 | self.context_out = nn.Sequential( 37 | nn.Linear(hidden_dim, hidden_dim), 38 | nn.LayerNorm(hidden_dim) 39 | ) 40 | 41 | def forward(self, lang_context_feat, lang_query_feat, mask_query_context): 42 | learnable_querys = self.query_embed.weight 43 | bs, n_ph, _ = lang_query_feat.shape 44 | n_q = learnable_querys.size(0) 45 | # n_context = lang_context_feat.size(1) 46 | 47 | # attended reduce 48 | k = self.linear1(lang_context_feat[:, 0:1, :]) 49 | q = self.linear2(lang_context_feat).transpose(1, 2) 50 | v = self.linear3(lang_context_feat).unsqueeze(1) # b, 1, n_context, -1 51 | att_weight = torch.bmm(k, q) 52 | att_weight = att_weight.expand(-1, n_ph, -1) 53 | att_weight = att_weight.masked_fill(mask_query_context, float('-inf')) 54 | att_weight_normalized = F.softmax(att_weight, dim=-1).unsqueeze(-1) # b, n_ph, n_context, -1 55 | context_feats = self.context_out((v * att_weight_normalized).sum(dim=-2)) # b, n_ph, -1 56 | 57 | # residual connection 58 | context_feats = lang_context_feat[:, None, 0, :] + context_feats 59 | 60 | lang_query_feat = torch.cat([context_feats, lang_query_feat], dim=-1) 61 | lang_query_feat = self.fuse_encoder_query(lang_query_feat) 62 | phrase_queries = lang_query_feat.view(bs, n_ph, 1, -1).repeat(1, 1, 1, 2) +\ 63 | learnable_querys.view(1, 1, n_q, -1) 64 | phrase_queries = 
phrase_queries.view(bs, n_ph*n_q, -1).transpose(0, 1) 65 | 66 | return torch.split(phrase_queries, self.hidden_dim, dim=-1) 67 | 68 | 69 | class RefTR(nn.Module): 70 | def __init__(self, img_backbone, lang_backbone, vl_transformer, 71 | num_feature_levels=1, num_queries_per_phrase=1, 72 | freeze_lang_backbone=False, aux_loss=False, ablation='none'): 73 | super(RefTR, self).__init__() 74 | # print("ABLATION !!!", ablation) 75 | self.img_backbone = img_backbone 76 | self.lang_backbone = lang_backbone 77 | self.vl_transformer = vl_transformer 78 | self.num_feature_levels = num_feature_levels 79 | self.num_queries_per_phrase = num_queries_per_phrase 80 | self.hidden_dim = hidden_dim = vl_transformer.d_model 81 | print("Model dim:", hidden_dim) 82 | self.bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3) 83 | 84 | self.lang_hidden_dim = lang_backbone.config.hidden_size 85 | print("Language model dim:", self.lang_hidden_dim) 86 | self.map_sentence = mlp_mapping(self.lang_hidden_dim, hidden_dim) 87 | 88 | # TODO here 89 | self.use_decoder = self.vl_transformer.use_decoder 90 | if self.use_decoder: 91 | self.map_phrase = mlp_mapping(self.lang_hidden_dim, hidden_dim) 92 | self.query_encoder = QueryEncoder( 93 | num_queries_per_phrase=num_queries_per_phrase, 94 | hidden_dim=hidden_dim, 95 | ablation='none' 96 | ) 97 | 98 | # Set up for Feature Payramid 99 | if num_feature_levels > 1: 100 | num_backbone_outs = len(self.img_backbone.strides)-1 101 | input_proj_list = [] 102 | for l_ in range(num_backbone_outs): 103 | l_ = l_ + 1 104 | in_channels = self.img_backbone.num_channels[l_] 105 | print(f"layer {l_}: {self.img_backbone.num_channels[l_]}") 106 | input_proj_list.append(nn.Sequential( 107 | nn.Conv2d(in_channels, hidden_dim, kernel_size=1), 108 | nn.GroupNorm(32, hidden_dim), 109 | )) 110 | for l_ in range(num_feature_levels - num_backbone_outs): 111 | print(f"layer {l_}: {in_channels}") 112 | input_proj_list.append(nn.Sequential( 113 | nn.Conv2d(in_channels, hidden_dim, kernel_size=3, stride=2, padding=1), 114 | nn.GroupNorm(32, hidden_dim), 115 | )) 116 | in_channels = hidden_dim 117 | self.input_proj = nn.ModuleList(input_proj_list) 118 | else: 119 | # TODO fix this for other network 120 | assert self.img_backbone.num_channels[-1] == 2048 121 | self.input_proj = nn.ModuleList([ 122 | nn.Sequential( 123 | nn.Conv2d(self.img_backbone.num_channels[-1], hidden_dim, kernel_size=1), 124 | nn.GroupNorm(32, hidden_dim), 125 | )]) 126 | 127 | self.aux_loss = aux_loss 128 | self.freeze_lang_backbone = freeze_lang_backbone 129 | 130 | # initialization 131 | nn.init.constant_(self.bbox_embed.layers[-1].weight.data, 0) 132 | nn.init.constant_(self.bbox_embed.layers[-1].bias.data, 0) 133 | for proj in self.input_proj: 134 | nn.init.xavier_uniform_(proj[0].weight, gain=1) 135 | nn.init.constant_(proj[0].bias, 0) 136 | 137 | def init_from_pretrained_detr(self, state_dict): 138 | """ 139 | Initialize from pretrained DETR. 
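        Only two groups of checkpoint weights are consumed (see below): keys prefixed with
        'backbone.' are re-rooted into self.img_backbone, and keys containing
        'transformer.encoder' are re-rooted into self.vl_transformer.encoder, e.g.
        (illustrative key names only):

            'backbone.0.body.layer1.0.conv1.weight'       -> '0.body.layer1.0.conv1.weight'
            'transformer.encoder.layers.0.linear1.weight' -> 'layers.0.linear1.weight'

        Everything else in the checkpoint is ignored.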
140 | """ 141 | # print(state_dict.keys()) 142 | state_dict_backbone = {k.split('.', 1)[1]: v for k, v in state_dict.items() if k.split('.', 1)[0] == 'backbone'} 143 | state_dict_transformer_encoder = {k.split('.', 2)[2]: v for k, v in state_dict.items() if 'transformer.encoder' in k} 144 | self.img_backbone.load_state_dict(state_dict_backbone) 145 | self.vl_transformer.encoder.load_state_dict(state_dict_transformer_encoder) 146 | return 147 | 148 | def freeze_img_backbone(self): 149 | for param in self.backbone.parameters(): 150 | param.requires_grad = False 151 | 152 | def freeze_bert(self): 153 | """ 154 | Freeze for distributed training 155 | """ 156 | for param in self.textmodel.parameters(): 157 | param.requires_grad = False 158 | 159 | def forward(self, samples): 160 | # TODO 161 | img = samples["img"] 162 | 163 | # Visual Module 164 | srcs = [] 165 | masks = [] 166 | if not isinstance(img, NestedTensor): 167 | img = nested_tensor_from_tensor_list(img) 168 | img_features, pos = self.img_backbone(img) 169 | 170 | # FPN features & masks 171 | pos = pos[-2:] 172 | for l_, feat in enumerate(img_features[-2:]): 173 | src, mask = feat.decompose() 174 | srcs.append(self.input_proj[l_](src)) 175 | masks.append(mask) 176 | # print(f"l: {l} src/mask/pos: {srcs[-1].shape} / {mask.shape} / {pos[l].shape}") 177 | assert mask is not None 178 | if self.num_feature_levels > len(srcs): 179 | _len_srcs = len(srcs) 180 | for l_ in range(_len_srcs, self.num_feature_levels): 181 | if l_ == _len_srcs: 182 | src = self.input_proj[l_](img_features[-1].tensors) 183 | else: 184 | src = self.input_proj[l_](srcs[-1]) 185 | m = img.mask 186 | mask = F.interpolate(m[None].float(), size=src.shape[-2:]).to(torch.bool)[0] 187 | pos_l = self.img_backbone[1](NestedTensor(src, mask)).to(src.dtype) 188 | srcs.append(src) 189 | masks.append(mask) 190 | pos.append(pos_l) 191 | # print(f"l: {l} src/mask/pos: {src.shape} / {mask.shape} / {pos_l.shape}") 192 | 193 | # Language model 194 | sentence = samples["sentence"] 195 | sentence_mask = samples["sentence_mask"] 196 | # ---------------------------------------------- # 197 | # Ablation on context encoder 198 | # sentence_feat = self.lang_backbone.embeddings(sentence) 199 | # ---------------------------------------------- # 200 | sentence_feat, sentence_feat_pooled = self.lang_backbone(sentence, token_type_ids=None, attention_mask=sentence_mask)[0:2] 201 | sentence_feat = self.map_sentence(sentence_feat) 202 | 203 | # Process phrase queries 204 | n_q = self.num_queries_per_phrase 205 | bsz = sentence.size(0) 206 | if 'phrase' in samples.keys(): 207 | phrases = samples["phrase"] 208 | phrase_masks = samples["phrase_mask"] 209 | p_pos_l = samples['phrase_pos_l'] 210 | p_pos_r = samples['phrase_pos_r'] 211 | n_ph = phrases.size(1) 212 | assert n_ph == p_pos_l.size(1) 213 | 214 | # Get Phrase Representation 215 | phrases = phrases.view(bsz * n_ph, -1) 216 | phrase_masks = phrase_masks.view(bsz * n_ph, -1) 217 | phrase_pooled_feat = self.lang_backbone(phrases, token_type_ids=None, attention_mask=phrase_masks)[1] 218 | 219 | # p_len = p_pos_r - p_pos_l 220 | # TODO language len set to 90 in flickr Multiphrase setting 221 | # assert 90 == n_context 222 | 223 | # Set up phrase-specific mask on context 224 | mask_context = [] 225 | for i in range(bsz): 226 | for j in range(n_ph): 227 | mask = torch.ones_like(sentence_mask[0, :], device=sentence_mask.device) 228 | mask[p_pos_l[i, j]:p_pos_r[i, j]] = 0 229 | mask_context.append(mask) 230 | mask_context = 
torch.stack(mask_context).view(bsz, n_ph, -1).to(torch.bool) 231 | 232 | # Mask for Query Decoder input 233 | # TODO Hack here: Take the third mask of each phrase, 234 | # if 0, the phrase only contains "[CLS] [SEP]", ignore 235 | query_mask = phrase_masks.view(bsz, n_ph, -1)[:, :, 2:3] 236 | query_mask = query_mask.logical_not() 237 | query_mask = query_mask.expand(-1, -1, n_q) 238 | query_mask = query_mask.view(bsz, n_ph*n_q) 239 | else: 240 | n_ph = 1 241 | phrase_pooled_feat = sentence_feat_pooled 242 | sentence_len = sentence_mask.to(torch.int32).sum(-1) 243 | mask_context = sentence_mask.view(bsz, n_ph, -1).logical_not().to(torch.bool) 244 | # Mask out [CLS] and [SEP] 245 | mask_context[:, :, 0] = True 246 | for i in range(bsz): 247 | mask_context[i, :, sentence_len[i]-1] = True 248 | query_mask = torch.zeros((bsz, 1), device=sentence_mask.device).to(torch.bool) 249 | 250 | phrase_pooled_feat = self.map_phrase(phrase_pooled_feat).view(bsz, n_ph, -1) 251 | 252 | # print(f"phrase_queries {phrase_queries.shape} phrase_masks {phrase_masks.shape}") 253 | memory, memory_mask, memory_pos =\ 254 | self.vl_transformer.encode( 255 | img_srcs=srcs, 256 | img_masks=masks, 257 | img_pos_embeds=pos, 258 | lang_srcs=sentence_feat, 259 | lang_masks=sentence_mask 260 | ) 261 | memory_lang = memory[:sentence_feat.size(1)] 262 | query, query_pos =\ 263 | self.query_encoder( 264 | lang_context_feat=memory_lang.transpose(0, 1), 265 | lang_query_feat=phrase_pooled_feat, 266 | mask_query_context=mask_context 267 | ) 268 | 269 | hs = self.vl_transformer.decoder( 270 | tgt=query, 271 | memory=memory, 272 | tgt_key_padding_mask=query_mask, 273 | memory_key_padding_mask=memory_mask, 274 | query_pos=query_pos, 275 | pos=memory_pos, 276 | ).transpose(1, 2) 277 | 278 | # print(f"hs: {hs.shape}") 279 | num_l = hs.size(0) 280 | hs = hs.view(num_l, bsz, n_ph, n_q, -1) 281 | # ----------------------------------------------# 282 | # Ablation on no decoder 283 | # hs = (query + query_pos).transpose(1, 2) 284 | # hs = hs.reshape(1, bsz, n_ph, n_q, -1) 285 | # ----------------------------------------------# 286 | # TODO this 287 | outputs_coord = self.bbox_embed(hs).sigmoid() 288 | if torch.isnan(outputs_coord).any(): 289 | print(outputs_coord) 290 | print(hs) 291 | print(query) 292 | 293 | out = {'pred_boxes': outputs_coord[-1], 'phrase_mask': query_mask.logical_not()} 294 | if self.aux_loss: 295 | out['aux_outputs'] = self._set_aux_loss(outputs_coord, query_mask.logical_not()) 296 | 297 | return out 298 | 299 | @torch.jit.unused 300 | def _set_aux_loss(self, outputs_coord, phrase_mask): 301 | # this is a workaround to make torchscript happy, as torchscript 302 | # doesn't support dictionary with non-homogeneous values, such 303 | # as a dict having both a Tensor and a list. 
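        # One dict is emitted per intermediate decoder layer (the final layer's prediction
        # is returned directly as 'pred_boxes' in forward()); the corresponding aux losses
        # are re-weighted through the '_{i}'-suffixed entries of weight_dict in build_reftr below.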
304 | return [{'pred_boxes': b, 'phrase_mask': phrase_mask} for b in outputs_coord[:-1]] 305 | 306 | 307 | def build_reftr(args): 308 | # num_classes = 1 # if args.dataset_file != 'coco' else 91 309 | device = torch.device(args.device) 310 | if args.no_decoder: 311 | args.dec_layers = 0 312 | 313 | img_backbone = build_backbone(args) 314 | vl_transformer = build_vl_transformer(args) 315 | if args.bert_model.split('-')[0] == 'roberta': 316 | lang_backbone = RobertaModel.from_pretrained(args.bert_model) 317 | else: 318 | lang_backbone = BertModel.from_pretrained(args.bert_model) 319 | 320 | weight_dict = {'loss_giou': args.giou_loss_coef, 'loss_bbox': args.bbox_loss_coef} 321 | # weight_dict['loss_giou'] = args.giou_loss_coef 322 | 323 | # TODO this is a hack 324 | if args.aux_loss: 325 | aux_weight_dict = {} 326 | for i in range(args.dec_layers - 1): 327 | aux_weight_dict.update({k + f'_{i}': v for k, v in weight_dict.items()}) 328 | aux_weight_dict.update({k + '_enc': v for k, v in weight_dict.items()}) 329 | weight_dict.update(aux_weight_dict) 330 | 331 | print("ABLATION !!!", args.ablation) 332 | 333 | model = RefTR( 334 | img_backbone=img_backbone, 335 | lang_backbone=lang_backbone, 336 | vl_transformer=vl_transformer, 337 | num_feature_levels=args.num_feature_levels, 338 | num_queries_per_phrase=args.num_queries_per_phrase, 339 | freeze_lang_backbone=args.freeze_bert, 340 | aux_loss=args.aux_loss, 341 | ablation=args.ablation 342 | ) 343 | criterion = CriterionVGMultiPhrase(weight_dict, losses=['boxes']) 344 | postprocessors = {'bbox': PostProcessVGMultiPhrase()} 345 | 346 | criterion.to(device) 347 | return model, criterion, postprocessors 348 | 349 | # if __name__ == "__main__": 350 | # import sys, argparse 351 | # sys.path.append(path_to_parent) 352 | # from main_vg import get_args_parser 353 | # parser = argparse.ArgumentParser('Deformable DETR training and evaluation script', parents=[get_args_parser()]) 354 | # args = parser.parse_args() 355 | # model, ce, postprocessors = build_model(args) 356 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pycocotools 2 | tqdm 3 | cython 4 | scipy 5 | -------------------------------------------------------------------------------- /tools/launch.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------------------------------------------- 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # -------------------------------------------------------------------------------------------------------------------------- 6 | # Modified from https://github.com/pytorch/pytorch/blob/173f224570017b4b1a3a1a13d0bff280a54d9cd9/torch/distributed/launch.py 7 | # -------------------------------------------------------------------------------------------------------------------------- 8 | 9 | r""" 10 | `torch.distributed.launch` is a module that spawns up multiple distributed 11 | training processes on each of the training nodes. 12 | The utility can be used for single-node distributed training, in which one or 13 | more processes per node will be spawned. The utility can be used for either 14 | CPU training or GPU training. 
If the utility is used for GPU training, 15 | each distributed process will be operating on a single GPU. This can achieve 16 | well-improved single-node training performance. It can also be used in 17 | multi-node distributed training, by spawning up multiple processes on each node 18 | for well-improved multi-node distributed training performance as well. 19 | This will especially be benefitial for systems with multiple Infiniband 20 | interfaces that have direct-GPU support, since all of them can be utilized for 21 | aggregated communication bandwidth. 22 | In both cases of single-node distributed training or multi-node distributed 23 | training, this utility will launch the given number of processes per node 24 | (``--nproc_per_node``). If used for GPU training, this number needs to be less 25 | or euqal to the number of GPUs on the current system (``nproc_per_node``), 26 | and each process will be operating on a single GPU from *GPU 0 to 27 | GPU (nproc_per_node - 1)*. 28 | **How to use this module:** 29 | 1. Single-Node multi-process distributed training 30 | :: 31 | >>> python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_YOU_HAVE 32 | YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3 and all other 33 | arguments of your training script) 34 | 2. Multi-Node multi-process distributed training: (e.g. two nodes) 35 | Node 1: *(IP: 192.168.1.1, and has a free port: 1234)* 36 | :: 37 | >>> python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_YOU_HAVE 38 | --nnodes=2 --node_rank=0 --master_addr="192.168.1.1" 39 | --master_port=1234 YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3 40 | and all other arguments of your training script) 41 | Node 2: 42 | :: 43 | >>> python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_YOU_HAVE 44 | --nnodes=2 --node_rank=1 --master_addr="192.168.1.1" 45 | --master_port=1234 YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3 46 | and all other arguments of your training script) 47 | 3. To look up what optional arguments this module offers: 48 | :: 49 | >>> python -m torch.distributed.launch --help 50 | **Important Notices:** 51 | 1. This utilty and multi-process distributed (single-node or 52 | multi-node) GPU training currently only achieves the best performance using 53 | the NCCL distributed backend. Thus NCCL backend is the recommended backend to 54 | use for GPU training. 55 | 2. In your training program, you must parse the command-line argument: 56 | ``--local_rank=LOCAL_PROCESS_RANK``, which will be provided by this module. 57 | If your training program uses GPUs, you should ensure that your code only 58 | runs on the GPU device of LOCAL_PROCESS_RANK. This can be done by: 59 | Parsing the local_rank argument 60 | :: 61 | >>> import argparse 62 | >>> parser = argparse.ArgumentParser() 63 | >>> parser.add_argument("--local_rank", type=int) 64 | >>> args = parser.parse_args() 65 | Set your device to local rank using either 66 | :: 67 | >>> torch.cuda.set_device(arg.local_rank) # before your code runs 68 | or 69 | :: 70 | >>> with torch.cuda.device(arg.local_rank): 71 | >>> # your code to run 72 | 3. In your training program, you are supposed to call the following function 73 | at the beginning to start the distributed backend. You need to make sure that 74 | the init_method uses ``env://``, which is the only supported ``init_method`` 75 | by this module. 76 | :: 77 | torch.distributed.init_process_group(backend='YOUR BACKEND', 78 | init_method='env://') 79 | 4. 
In your training program, you can either use regular distributed functions 80 | or use the :func:`torch.nn.parallel.DistributedDataParallel` module. If your 81 | training program uses GPUs for training and you would like to use 82 | the :func:`torch.nn.parallel.DistributedDataParallel` module, 83 | here is how to configure it. 84 | :: 85 | model = torch.nn.parallel.DistributedDataParallel(model, 86 | device_ids=[args.local_rank], 87 | output_device=args.local_rank) 88 | Please ensure that the ``device_ids`` argument is set to be the only GPU device id 89 | that your code will be operating on. This is generally the local rank of the 90 | process. In other words, the ``device_ids`` needs to be ``[args.local_rank]``, 91 | and ``output_device`` needs to be ``args.local_rank`` in order to use this 92 | utility. 93 | 5. Another way to pass ``local_rank`` to the subprocesses is via the environment variable 94 | ``LOCAL_RANK``. This behavior is enabled when you launch the script with 95 | ``--use_env=True``. You must adjust the subprocess example above to replace 96 | ``args.local_rank`` with ``os.environ['LOCAL_RANK']``; the launcher 97 | will not pass ``--local_rank`` when you specify this flag. 98 | .. warning:: 99 | ``local_rank`` is NOT globally unique: it is only unique per process 100 | on a machine. Thus, don't use it to decide if you should, e.g., 101 | write to a networked filesystem. See 102 | https://github.com/pytorch/pytorch/issues/12042 for an example of 103 | how things can go wrong if you don't do this correctly. 104 | """ 105 | 106 | 107 | import sys 108 | import subprocess 109 | import os 110 | import socket 111 | from argparse import ArgumentParser, REMAINDER 112 | 113 | import torch 114 | 115 | 116 | def parse_args(): 117 | """ 118 | Helper function parsing the command line options 119 | @retval ArgumentParser 120 | """ 121 | parser = ArgumentParser(description="PyTorch distributed training launch " 122 | "helper utility that will spawn up " 123 | "multiple distributed processes") 124 | 125 | # Optional arguments for the launch helper 126 | parser.add_argument("--nnodes", type=int, default=1, 127 | help="The number of nodes to use for distributed " 128 | "training") 129 | parser.add_argument("--node_rank", type=int, default=0, 130 | help="The rank of the node for multi-node distributed " 131 | "training") 132 | parser.add_argument("--nproc_per_node", type=int, default=1, 133 | help="The number of processes to launch on each node, " 134 | "for GPU training, this is recommended to be set " 135 | "to the number of GPUs in your system so that " 136 | "each process can be bound to a single GPU.") 137 | parser.add_argument("--master_addr", default="127.0.0.1", type=str, 138 | help="Master node (rank 0)'s address, should be either " 139 | "the IP address or the hostname of node 0, for " 140 | "single node multi-proc training, the " 141 | "--master_addr can simply be 127.0.0.1") 142 | parser.add_argument("--master_port", default=29500, type=int, 143 | help="Master node (rank 0)'s free port that needs to " 144 | "be used for communication during distributed " 145 | "training") 146 | 147 | # positional 148 | parser.add_argument("training_script", type=str, 149 | help="The full path to the single GPU training " 150 | "program/script to be launched in parallel, " 151 | "followed by all the arguments for the " 152 | "training script") 153 | 154 | # rest from the training program 155 | parser.add_argument('training_script_args', nargs=REMAINDER) 156 | return parser.parse_args() 157 | 158 | 159 | def main(): 160 |
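    # Spawn one training process per local rank: compute
    # WORLD_SIZE = nproc_per_node * nnodes, export MASTER_ADDR / MASTER_PORT /
    # WORLD_SIZE once, then set RANK and LOCAL_RANK for each child before
    # launching it with subprocess.Popen. Note that, unlike the stock
    # torch.distributed.launch, no --local_rank argument is appended to the
    # command; ranks reach the training script only through these environment
    # variables. Finally, wait on every child and raise CalledProcessError if
    # any of them exits with a non-zero return code.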
args = parse_args() 161 | 162 | # world size in terms of number of processes 163 | dist_world_size = args.nproc_per_node * args.nnodes 164 | 165 | # set PyTorch distributed related environmental variables 166 | current_env = os.environ.copy() 167 | current_env["MASTER_ADDR"] = args.master_addr 168 | current_env["MASTER_PORT"] = str(args.master_port) 169 | current_env["WORLD_SIZE"] = str(dist_world_size) 170 | 171 | processes = [] 172 | 173 | for local_rank in range(0, args.nproc_per_node): 174 | # each process's rank 175 | dist_rank = args.nproc_per_node * args.node_rank + local_rank 176 | current_env["RANK"] = str(dist_rank) 177 | current_env["LOCAL_RANK"] = str(local_rank) 178 | 179 | cmd = [args.training_script] + args.training_script_args 180 | 181 | process = subprocess.Popen(cmd, env=current_env) 182 | processes.append(process) 183 | 184 | for process in processes: 185 | process.wait() 186 | if process.returncode != 0: 187 | raise subprocess.CalledProcessError(returncode=process.returncode, 188 | cmd=process.args) 189 | 190 | 191 | if __name__ == "__main__": 192 | main() -------------------------------------------------------------------------------- /tools/run_dist_launch.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------ 7 | 8 | set -x 9 | 10 | GPUS=$1 11 | RUN_COMMAND=${@:2} 12 | if [ $GPUS -lt 8 ]; then 13 | GPUS_PER_NODE=${GPUS_PER_NODE:-$GPUS} 14 | else 15 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 16 | fi 17 | MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} 18 | MASTER_PORT=${MASTER_PORT:-"29500"} 19 | NODE_RANK=${NODE_RANK:-0} 20 | 21 | let "NNODES=GPUS/GPUS_PER_NODE" 22 | 23 | conda activate pytorch 24 | which python 25 | python ./tools/launch.py \ 26 | --nnodes ${NNODES} \ 27 | --node_rank ${NODE_RANK} \ 28 | --master_addr ${MASTER_ADDR} \ 29 | --master_port ${MASTER_PORT} \ 30 | --nproc_per_node ${GPUS_PER_NODE} \ 31 | ${RUN_COMMAND} -------------------------------------------------------------------------------- /tools/run_dist_slurm.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # -------------------------------------------------------------------------------------------------------------------------- 3 | # Modified from https://github.com/open-mmlab/mmdetection/blob/3b53fe15d87860c6941f3dda63c0f27422da6266/tools/slurm_train.sh 4 | # -------------------------------------------------------------------------------------------------------------------------- 5 | 6 | set -x 7 | PARTITION=edith 8 | JOB_NAME=$1 9 | GPUS=$2 10 | RUN_COMMAND=${@:3} 11 | RUN_TIME=${RUN_TIME:-"240:00:00"} 12 | dt=$(date '+%Y_%m%d_%H') 13 | if [ $GPUS -lt 8 ]; then 14 | GPUS_PER_NODE=${GPUS_PER_NODE:-$GPUS} 15 | else 16 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 17 | fi 18 | CPUS_PER_TASK=${CPUS_PER_TASK:-2} 19 | # CPUS_PER_TASK=2 20 | SRUN_ARGS=${SRUN_ARGS:-""} 21 | mkdir logs 22 | 23 | srun -p ${PARTITION} \ 24 | --job-name=${JOB_NAME} \ 25 | --gres=gpu:${GPUS_PER_NODE} \ 26 | --ntasks=${GPUS} \ 27 | --ntasks-per-node=${GPUS_PER_NODE} \ 28 | --cpus-per-task=${CPUS_PER_TASK} \ 29 | --time=${RUN_TIME}\ 30 | --kill-on-bad-exit=1 \ 31 | ${SRUN_ARGS} \ 32 | ${RUN_COMMAND}\ 33 | > 
./logs/${JOB_NAME}_${dt}.log 2>&1 & 34 | 35 | # Removing this args because the 36 | # SRUN_ARGS="--nodelist=edith1" MASTER_PORT=29501 GPUS_PER_NODE=4 ./tools/run_dist_slurm.sh edith RefTR 4 configs/r50_deformable_vg_detr_single_scale_pretrained.sh -------------------------------------------------------------------------------- /tools/vis_log.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import shutil 4 | from tqdm import tqdm 5 | from torch.utils.tensorboard import SummaryWriter 6 | 7 | 8 | def convert_from_log(log_dir): 9 | if os.path.exists(f'{log_dir}/tb'): 10 | shutil.rmtree(f'{log_dir}/tb') 11 | tb_writter = SummaryWriter(log_dir=f'{log_dir}/tb') 12 | with open(f"{log_dir}/log.txt", 'r') as f: 13 | lines = f.readlines() 14 | for epoch, line in tqdm(enumerate(lines)): 15 | line = line.strip() 16 | if line == '': 17 | break 18 | info = json.loads(line) 19 | 20 | tb_writter.add_scalar('Loss/train', info['train_loss'], epoch) 21 | tb_writter.add_scalar('Loss_bbox/train', info['train_loss_bbox_unscaled'], epoch) 22 | tb_writter.add_scalar('Loss_ce/train', info['train_loss_ce_unscaled'], epoch) 23 | 24 | tb_writter.add_scalar('Loss/test', info['test_loss'], epoch) 25 | tb_writter.add_scalar('Loss_bbox/test', info['test_loss_bbox_unscaled'], epoch) 26 | tb_writter.add_scalar('Loss_ce/test', info['test_loss_ce_unscaled'], epoch) 27 | tb_writter.add_scalar('Accuracy/test', info['test_accuracy_iou0.5'], epoch) 28 | tb_writter.add_scalar('Miou/test', info['test_miou'], epoch) 29 | tb_writter.close() 30 | 31 | 32 | if __name__ == '__main__': 33 | exp_path = './exps' 34 | for x in os.listdir(exp_path): 35 | if os.path.isdir(f'{exp_path}/{x}') and os.path.exists(f'{exp_path}/{x}/log.txt'): 36 | convert_from_log(f'{exp_path}/{x}') 37 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E226,E302,E41,F401 3 | max-line-length = 200 4 | exclude = tests/* 5 | max-complexity = 10 6 | -------------------------------------------------------------------------------- /util/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | -------------------------------------------------------------------------------- /util/box_ops.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | """ 11 | Utilities for bounding box manipulation and GIoU. 12 | """ 13 | import torch 14 | from torchvision.ops.boxes import box_area 15 | 16 | 17 | def box_cxcywh_to_xyxy(x): 18 | x_c, y_c, w, h = x.unbind(-1) 19 | b = [(x_c - 0.5 * w), (y_c - 0.5 * h), 20 | (x_c + 0.5 * w), (y_c + 0.5 * h)] 21 | return torch.stack(b, dim=-1) 22 | 23 | 24 | def box_xyxy_to_cxcywh(x): 25 | x0, y0, x1, y1 = x.unbind(-1) 26 | b = [(x0 + x1) / 2, (y0 + y1) / 2, 27 | (x1 - x0), (y1 - y0)] 28 | return torch.stack(b, dim=-1) 29 | 30 | 31 | # modified from torchvision to also return the union 32 | def box_iou(boxes1, boxes2): 33 | area1 = box_area(boxes1) 34 | area2 = box_area(boxes2) 35 | 36 | lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] 37 | rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] 38 | 39 | wh = (rb - lt).clamp(min=0) # [N,M,2] 40 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] 41 | 42 | union = area1[:, None] + area2 - inter 43 | 44 | iou = inter / union 45 | return iou, union 46 | 47 | 48 | def generalized_box_iou(boxes1, boxes2): 49 | """ 50 | Generalized IoU from https://giou.stanford.edu/ 51 | 52 | The boxes should be in [x0, y0, x1, y1] format 53 | 54 | Returns a [N, M] pairwise matrix, where N = len(boxes1) 55 | and M = len(boxes2) 56 | """ 57 | # degenerate boxes gives inf / nan results 58 | # so do an early check 59 | assert (boxes1[:, 2:] >= boxes1[:, :2]).all() 60 | assert (boxes2[:, 2:] >= boxes2[:, :2]).all() 61 | iou, union = box_iou(boxes1, boxes2) 62 | 63 | lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) 64 | rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) 65 | 66 | wh = (rb - lt).clamp(min=0) # [N,M,2] 67 | area = wh[:, :, 0] * wh[:, :, 1] 68 | 69 | return iou - (area - union) / area 70 | 71 | 72 | def masks_to_boxes(masks): 73 | """Compute the bounding boxes around the provided masks 74 | 75 | The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions. 
76 | 77 | Returns a [N, 4] tensors, with the boxes in xyxy format 78 | """ 79 | if masks.numel() == 0: 80 | return torch.zeros((0, 4), device=masks.device) 81 | 82 | h, w = masks.shape[-2:] 83 | 84 | y = torch.arange(0, h, dtype=torch.float) 85 | x = torch.arange(0, w, dtype=torch.float) 86 | y, x = torch.meshgrid(y, x) 87 | 88 | x_mask = (masks * x.unsqueeze(0)) 89 | x_max = x_mask.flatten(1).max(-1)[0] 90 | x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 91 | 92 | y_mask = (masks * y.unsqueeze(0)) 93 | y_max = y_mask.flatten(1).max(-1)[0] 94 | y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 95 | 96 | return torch.stack([x_min, y_min, x_max, y_max], 1) 97 | 98 | 99 | def mask_iou(masks, target): 100 | assert(target.shape[-2:] == masks.shape[-2:]) 101 | I = torch.sum(torch.logical_and(masks, target)) 102 | U = torch.sum(torch.logical_or(masks, target)) 103 | return I.float() / U.float() 104 | -------------------------------------------------------------------------------- /util/collate_fn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import re 3 | import collections 4 | from .misc import NestedTensor 5 | from typing import List 6 | from torch._six import string_classes 7 | 8 | def collate_fn_vg(batch): 9 | batch = list(zip(*batch)) 10 | 11 | batch[0] = default_collate(batch[0]) 12 | return tuple(batch) 13 | 14 | 15 | def _max_by_axis(the_list): 16 | # type: (List[List[int]]) -> List[int] 17 | maxes = the_list[0] 18 | for sublist in the_list[1:]: 19 | for index, item in enumerate(sublist): 20 | maxes[index] = max(maxes[index], item) 21 | return maxes 22 | 23 | 24 | def nested_tensor_from_tensor_list(tensor_list: List[torch.Tensor]): 25 | # TODO make this more general 26 | if tensor_list[0].ndim == 3: 27 | # TODO make it support different-sized images 28 | max_size = _max_by_axis([list(img.shape) for img in tensor_list]) 29 | # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) 30 | batch_shape = [len(tensor_list)] + max_size 31 | b, c, h, w = batch_shape 32 | dtype = tensor_list[0].dtype 33 | device = tensor_list[0].device 34 | tensor = torch.zeros(batch_shape, dtype=dtype, device=device) 35 | mask = torch.ones((b, h, w), dtype=torch.bool, device=device) 36 | for img, pad_img, m in zip(tensor_list, tensor, mask): 37 | pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 38 | m[: img.shape[1], :img.shape[2]] = False 39 | else: 40 | raise ValueError('not supported') 41 | return NestedTensor(tensor, mask) 42 | 43 | 44 | # Following is modified from Modified from 45 | # https://github.com/pytorch/pytorch/blob/master/torch/utils/data/_utils/collate.py 46 | ############################################################################# 47 | default_collate_err_msg_format = ( 48 | "default_collate: batch must contain tensors, numpy arrays, numbers, " 49 | "dicts or lists; found {}") 50 | 51 | np_str_obj_array_pattern = re.compile(r'[SaUO]') 52 | 53 | def default_collate(batch): 54 | r"""Puts each data field into a tensor with outer dimension batch size 55 | """ 56 | elem = batch[0] 57 | elem_type = type(elem) 58 | if isinstance(elem, torch.Tensor): 59 | out = None 60 | if torch.utils.data.get_worker_info() is not None: 61 | # If we're in a background process, concatenate directly into a 62 | # shared memory tensor to avoid an extra copy 63 | numel = sum([x.numel() for x in batch]) 64 | storage = elem.storage()._new_shared(numel) 65 | out = elem.new(storage) 
66 | return torch.stack(batch, 0, out=out) 67 | elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' \ 68 | and elem_type.__name__ != 'string_': 69 | if elem_type.__name__ == 'ndarray' or elem_type.__name__ == 'memmap': 70 | # array of string classes and object 71 | if np_str_obj_array_pattern.search(elem.dtype.str) is not None: 72 | raise TypeError(default_collate_err_msg_format.format(elem.dtype)) 73 | 74 | return default_collate([torch.as_tensor(b) for b in batch]) 75 | elif elem.shape == (): # scalars 76 | return torch.as_tensor(batch) 77 | elif isinstance(elem, float): 78 | return torch.tensor(batch, dtype=torch.float64) 79 | elif isinstance(elem, int): 80 | return torch.tensor(batch) 81 | elif isinstance(elem, string_classes): 82 | return batch 83 | elif isinstance(elem, collections.abc.Mapping): 84 | d = {} 85 | for key in elem: 86 | if key == 'img': 87 | d['img'] = nested_tensor_from_tensor_list([d['img'] for d in batch]) 88 | else: 89 | d[key] = default_collate([d[key] for d in batch]) 90 | return d 91 | elif isinstance(elem, tuple) and hasattr(elem, '_fields'): # namedtuple 92 | return elem_type(*(default_collate(samples) for samples in zip(*batch))) 93 | elif isinstance(elem, collections.abc.Sequence): 94 | # check to make sure that the elements in batch have consistent size 95 | it = iter(batch) 96 | elem_size = len(next(it)) 97 | if not all(len(elem) == elem_size for elem in it): 98 | raise RuntimeError('each element in list of batch should be of equal size') 99 | transposed = zip(*batch) 100 | return [default_collate(samples) for samples in transposed] 101 | 102 | raise TypeError(default_collate_err_msg_format.format(elem_type)) -------------------------------------------------------------------------------- /util/lr_scheduler.py: -------------------------------------------------------------------------------- 1 | import math 2 | # config["max_num_epochs"] = 100 3 | # warm_up_epochs = 5 4 | # lr_milestones = [20,40] 5 | # # MultiStepLR without warm up 6 | # multistep_lr = lambda epoch: 0.1**len([m for m in lr_milestones if m <= epoch]) 7 | # # warm_up_with_multistep_lr 8 | # warm_up_with_multistep_lr = lambda epoch: (epoch+1) / warm_up_epochs if epoch < warm_up_epochs else 0.1**len([m for m in lr_milestones if m <= epoch]) 9 | # # warm_up_with_step_lr 10 | # gamma = 0.9; stepsize = 1 11 | # warm_up_with_step_lr = lambda epoch: (epoch+1) / warm_up_epochs if epoch < warm_up_epochs \ 12 | # else gamma**( ((epoch - warm_up_epochs) /(config["max_num_epochs"] - warm_up_epochs))//stepsize*stepsize) 13 | # # warm_up_with_cosine_lr 14 | # warm_up_with_cosine_lr = lambda epoch: (epoch+1) / warm_up_epochs if epoch < warm_up_epochs \ 15 | # else 0.5 * ( math.cos((epoch - warm_up_epochs) /(config["max_num_epochs"] - warm_up_epochs) * math.pi) + 1) 16 | 17 | # scheduler = torch.optim.lr_scheduler.LambdaLR( optimizer, lr_lambda=warm_up_with_cosine_lr) 18 | 19 | class MultiStepWarmupLR: 20 | def __init__(self, decay_rate=0.1, lr_milestones=[20000, 40000], warm_up_steps=5000, min_decay_rate=0.01) -> None: 21 | self.deacy_rate = decay_rate 22 | self.lr_milestones = lr_milestones 23 | self.warm_up_steps = warm_up_steps 24 | self.min_decay_rate = min_decay_rate 25 | 26 | def __call__(self, steps): 27 | if steps < self.warm_up_steps: 28 | rate = (steps+1)/self.warm_up_steps 29 | else: 30 | rate = self.deacy_rate ** len([m for m in self.lr_milestones if m <= steps]) 31 | # make sure lr is not too small 32 | if rate <= self.min_decay_rate: 33 | return self.min_decay_rate 34 | else: 
35 | return rate 36 | 37 | class CosineWarmupLR: 38 | def __init__(self, max_T=100, warm_up_steps=5, min_decay_rate=0.01) -> None: 39 | self.max_T = max_T 40 | self.warm_up_steps = warm_up_steps 41 | self.min_decay_rate = min_decay_rate 42 | 43 | def __call__(self, steps): 44 | if steps < self.warm_up_steps: 45 | rate = (steps+1)/self.warm_up_steps 46 | else: 47 | rate = 0.5 * (math.cos((steps - self.warm_up_steps) / (self.max_T - self.warm_up_steps) * math.pi) + 1) 48 | # make sure lr is not too small 49 | if rate <= self.min_decay_rate: 50 | return self.min_decay_rate 51 | else: 52 | return rate 53 | -------------------------------------------------------------------------------- /util/plot_utils.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | """ 11 | Plotting utilities to visualize training logs. 12 | """ 13 | import torch 14 | import pandas as pd 15 | import seaborn as sns 16 | import matplotlib.pyplot as plt 17 | 18 | from pathlib import Path, PurePath 19 | 20 | 21 | def plot_logs(logs, fields=('class_error', 'loss_bbox_unscaled', 'mAP'), ewm_col=0, log_name='log.txt'): 22 | ''' 23 | Function to plot specific fields from training log(s). Plots both training and test results. 24 | 25 | :: Inputs - logs = list containing Path objects, each pointing to individual dir with a log file 26 | - fields = which results to plot from each log file - plots both training and test for each field. 27 | - ewm_col = optional, which column to use as the exponential weighted smoothing of the plots 28 | - log_name = optional, name of log file if different than default 'log.txt'. 29 | 30 | :: Outputs - matplotlib plots of results in fields, color coded for each log file. 31 | - solid lines are training results, dashed lines are test results. 
32 | 33 | ''' 34 | func_name = "plot_utils.py::plot_logs" 35 | 36 | # verify logs is a list of Paths (list[Paths]) or single Pathlib object Path, 37 | # convert single Path to list to avoid 'not iterable' error 38 | 39 | if not isinstance(logs, list): 40 | if isinstance(logs, PurePath): 41 | logs = [logs] 42 | print(f"{func_name} info: logs param expects a list argument, converted to list[Path].") 43 | else: 44 | raise ValueError(f"{func_name} - invalid argument for logs parameter.\n \ 45 | Expect list[Path] or single Path obj, received {type(logs)}") 46 | 47 | # verify valid dir(s) and that every item in list is Path object 48 | for i, dir in enumerate(logs): 49 | if not isinstance(dir, PurePath): 50 | raise ValueError(f"{func_name} - non-Path object in logs argument of {type(dir)}: \n{dir}") 51 | if dir.exists(): 52 | continue 53 | raise ValueError(f"{func_name} - invalid directory in logs argument:\n{dir}") 54 | 55 | # load log file(s) and plot 56 | dfs = [pd.read_json(Path(p) / log_name, lines=True) for p in logs] 57 | 58 | fig, axs = plt.subplots(ncols=len(fields), figsize=(16, 5)) 59 | 60 | for df, color in zip(dfs, sns.color_palette(n_colors=len(logs))): 61 | for j, field in enumerate(fields): 62 | if field == 'mAP': 63 | coco_eval = pd.DataFrame(pd.np.stack(df.test_coco_eval.dropna().values)[:, 1]).ewm(com=ewm_col).mean() 64 | axs[j].plot(coco_eval, c=color) 65 | else: 66 | df.interpolate().ewm(com=ewm_col).mean().plot( 67 | y=[f'train_{field}', f'test_{field}'], 68 | ax=axs[j], 69 | color=[color] * 2, 70 | style=['-', '--'] 71 | ) 72 | for ax, field in zip(axs, fields): 73 | ax.legend([Path(p).name for p in logs]) 74 | ax.set_title(field) 75 | 76 | 77 | def plot_precision_recall(files, naming_scheme='iter'): 78 | if naming_scheme == 'exp_id': 79 | # name becomes exp_id 80 | names = [f.parts[-3] for f in files] 81 | elif naming_scheme == 'iter': 82 | names = [f.stem for f in files] 83 | else: 84 | raise ValueError(f'not supported {naming_scheme}') 85 | fig, axs = plt.subplots(ncols=2, figsize=(16, 5)) 86 | for f, color, name in zip(files, sns.color_palette("Blues", n_colors=len(files)), names): 87 | data = torch.load(f) 88 | # precision is n_iou, n_points, n_cat, n_area, max_det 89 | precision = data['precision'] 90 | recall = data['params'].recThrs 91 | scores = data['scores'] 92 | # take precision for all classes, all areas and 100 detections 93 | precision = precision[0, :, :, 0, -1].mean(1) 94 | scores = scores[0, :, :, 0, -1].mean(1) 95 | prec = precision.mean() 96 | rec = data['recall'][0, :, 0, -1].mean() 97 | print(f'{naming_scheme} {name}: mAP@50={prec * 100: 05.1f}, ' + 98 | f'score={scores.mean():0.3f}, ' + 99 | f'f1={2 * prec * rec / (prec + rec + 1e-8):0.3f}' 100 | ) 101 | axs[0].plot(recall, precision, c=color) 102 | axs[1].plot(recall, scores, c=color) 103 | 104 | axs[0].set_title('Precision / Recall') 105 | axs[0].legend(names) 106 | axs[1].set_title('Scores / Recall') 107 | axs[1].legend(names) 108 | return fig, axs 109 | 110 | 111 | 112 | -------------------------------------------------------------------------------- /util/transforms.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Generic Image Transform utillities. 
5 | """ 6 | 7 | import cv2 8 | import random, math 9 | import numpy as np 10 | from collections import Iterable 11 | 12 | import torch.nn.functional as F 13 | from torch.autograd import Variable 14 | 15 | 16 | def letterbox(img, mask, height, color=(123.7, 116.3, 103.5)): # resize a rectangular image to a padded square 17 | shape = img.shape[:2] # shape = [height, width] 18 | ratio = float(height) / max(shape) # ratio = old / new 19 | new_shape = (round(shape[1] * ratio), round(shape[0] * ratio)) 20 | dw = (height - new_shape[0]) / 2 # width padding 21 | dh = (height - new_shape[1]) / 2 # height padding 22 | top, bottom = round(dh - 0.1), round(dh + 0.1) 23 | left, right = round(dw - 0.1), round(dw + 0.1) 24 | img = cv2.resize(img, new_shape, interpolation=cv2.INTER_AREA) # resized, no border 25 | img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # padded square 26 | if mask is not None: 27 | mask = cv2.resize(mask, new_shape, interpolation=cv2.INTER_NEAREST) # resized, no border 28 | mask = cv2.copyMakeBorder(mask, top, bottom, left, right, cv2.BORDER_CONSTANT, value=255) # padded square 29 | return img, mask, ratio, dw, dh 30 | 31 | def random_affine(img, mask, targets, degrees=(-10, 10), translate=(.1, .1), scale=(.9, 1.1), shear=(-2, 2), 32 | borderValue=(123.7, 116.3, 103.5), all_bbox=None): 33 | border = 0 # width of added border (optional) 34 | height = max(img.shape[0], img.shape[1]) + border * 2 35 | 36 | # Rotation and Scale 37 | R = np.eye(3) 38 | a = random.random() * (degrees[1] - degrees[0]) + degrees[0] 39 | # a += random.choice([-180, -90, 0, 90]) # 90deg rotations added to small rotations 40 | s = random.random() * (scale[1] - scale[0]) + scale[0] 41 | R[:2] = cv2.getRotationMatrix2D(angle=a, center=(img.shape[1] / 2, img.shape[0] / 2), scale=s) 42 | 43 | # Translation 44 | T = np.eye(3) 45 | T[0, 2] = (random.random() * 2 - 1) * translate[0] * img.shape[0] + border # x translation (pixels) 46 | T[1, 2] = (random.random() * 2 - 1) * translate[1] * img.shape[1] + border # y translation (pixels) 47 | 48 | # Shear 49 | S = np.eye(3) 50 | S[0, 1] = math.tan((random.random() * (shear[1] - shear[0]) + shear[0]) * math.pi / 180) # x shear (deg) 51 | S[1, 0] = math.tan((random.random() * (shear[1] - shear[0]) + shear[0]) * math.pi / 180) # y shear (deg) 52 | 53 | M = S @ T @ R # Combined rotation matrix. ORDER IS IMPORTANT HERE!! 
54 | imw = cv2.warpPerspective(img, M, dsize=(height, height), flags=cv2.INTER_LINEAR, 55 | borderValue=borderValue) # BGR order borderValue 56 | if mask is not None: 57 | maskw = cv2.warpPerspective(mask, M, dsize=(height, height), flags=cv2.INTER_NEAREST, 58 | borderValue=255) # BGR order borderValue 59 | else: 60 | maskw = None 61 | 62 | # Return warped points also 63 | if isinstance(targets, list): 64 | targetlist=[] 65 | for bbox in targets: 66 | targetlist.append(wrap_points(bbox, M, height, a)) 67 | return imw, maskw, targetlist 68 | elif targets.ndim == 1: ## previous main 69 | targets = wrap_points(targets, M, height, a) 70 | return imw, maskw, targets 71 | elif targets.ndim == 2: 72 | for i in range(targets.shape[0]): 73 | targets[i] = wrap_points(targets[i], M, height, a) 74 | return imw, maskw, targets 75 | else: 76 | return imw 77 | 78 | def wrap_points(targets, M, height, a): 79 | # n = targets.shape[0] 80 | # points = targets[:, 1:5].copy() 81 | points = targets.copy() 82 | # area0 = (points[:, 2] - points[:, 0]) * (points[:, 3] - points[:, 1]) 83 | area0 = (points[2] - points[0]) * (points[3] - points[1]) 84 | 85 | # warp points 86 | xy = np.ones((4, 3)) 87 | xy[:, :2] = points[[0, 1, 2, 3, 0, 3, 2, 1]].reshape(4, 2) # x1y1, x2y2, x1y2, x2y1 88 | xy = (xy @ M.T)[:, :2].reshape(1, 8) 89 | 90 | # create new boxes 91 | x = xy[:, [0, 2, 4, 6]] 92 | y = xy[:, [1, 3, 5, 7]] 93 | xy = np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, 1).T 94 | 95 | # apply angle-based reduction 96 | radians = a * math.pi / 180 97 | reduction = max(abs(math.sin(radians)), abs(math.cos(radians))) ** 0.5 98 | x = (xy[:, 2] + xy[:, 0]) / 2 99 | y = (xy[:, 3] + xy[:, 1]) / 2 100 | w = (xy[:, 2] - xy[:, 0]) * reduction 101 | h = (xy[:, 3] - xy[:, 1]) * reduction 102 | xy = np.concatenate((x - w / 2, y - h / 2, x + w / 2, y + h / 2)).reshape(4, 1).T 103 | 104 | # reject warped points outside of image 105 | np.clip(xy, 0, height, out=xy) 106 | w = xy[:, 2] - xy[:, 0] 107 | h = xy[:, 3] - xy[:, 1] 108 | area = w * h 109 | ar = np.maximum(w / (h + 1e-16), h / (w + 1e-16)) 110 | i = (w > 4) & (h > 4) & (area / (area0 + 1e-16) > 0.1) & (ar < 10) 111 | 112 | ## print(targets, xy) 113 | ## [ 56 36 108 210] [[ 47.80464857 15.6096533 106.30993434 196.71267693]] 114 | # targets = targets[i] 115 | # targets[:, 1:5] = xy[i] 116 | targets = xy[0] 117 | return targets -------------------------------------------------------------------------------- /util/word_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Language-related data loading helper functions and class wrappers. 
5 | Copied from https://github.com/zyang-ur/ReSC/blob/e4022f87bfd11200b67c4509bb9746640834ceae/utils/word_utils.py#L45 6 | """ 7 | 8 | import re 9 | import torch 10 | import codecs 11 | 12 | UNK_TOKEN = '<unk>' 13 | PAD_TOKEN = '<pad>' 14 | END_TOKEN = '<eos>' 15 | SENTENCE_SPLIT_REGEX = re.compile(r'(\W+)') 16 | 17 | 18 | class Dictionary(object): 19 | def __init__(self): 20 | self.word2idx = {} 21 | self.idx2word = [] 22 | 23 | def add_word(self, word): 24 | if word not in self.word2idx: 25 | self.idx2word.append(word) 26 | self.word2idx[word] = len(self.idx2word) - 1 27 | return self.word2idx[word] 28 | 29 | def __len__(self): 30 | return len(self.idx2word) 31 | 32 | def __getitem__(self, a): 33 | if isinstance(a, int): 34 | return self.idx2word[a] 35 | elif isinstance(a, list): 36 | return [self.idx2word[x] for x in a] 37 | elif isinstance(a, str): 38 | return self.word2idx[a] 39 | else: 40 | raise TypeError("Query word/index argument must be int or str") 41 | 42 | def __contains__(self, word): 43 | return word in self.word2idx 44 | 45 | 46 | class Corpus(object): 47 | def __init__(self): 48 | self.dictionary = Dictionary() 49 | 50 | def set_max_len(self, value): 51 | self.max_len = value 52 | 53 | def load_file(self, filename): 54 | with codecs.open(filename, 'r', 'utf-8') as f: 55 | for line in f: 56 | line = line.strip() 57 | self.add_to_corpus(line) 58 | self.dictionary.add_word(UNK_TOKEN) 59 | self.dictionary.add_word(PAD_TOKEN) 60 | 61 | def add_to_corpus(self, line): 62 | """Tokenizes a text line.""" 63 | # Add words to the dictionary 64 | words = line.split() 65 | # tokens = len(words) 66 | for word in words: 67 | word = word.lower() 68 | self.dictionary.add_word(word) 69 | 70 | def tokenize(self, line, max_len=20): 71 | # Tokenize line contents 72 | words = SENTENCE_SPLIT_REGEX.split(line.strip()) 73 | # words = [w.lower() for w in words if len(w) > 0] 74 | words = [w.lower() for w in words if (len(w) > 0 and w!=' ')] ## do not include space as a token 75 | 76 | if words[-1] == '.': 77 | words = words[:-1] 78 | 79 | if max_len > 0: 80 | if len(words) > max_len: 81 | words = words[:max_len] 82 | elif len(words) < max_len: 83 | # words = [PAD_TOKEN] * (max_len - len(words)) + words 84 | words = words + [END_TOKEN] + [PAD_TOKEN] * (max_len - len(words) - 1) 85 | 86 | tokens = len(words) ## for end token 87 | ids = torch.LongTensor(tokens) 88 | token = 0 89 | for word in words: 90 | if word not in self.dictionary: 91 | word = UNK_TOKEN 92 | # print(word, type(word), word.encode('ascii','ignore').decode('ascii'), type(word.encode('ascii','ignore').decode('ascii'))) 93 | if type(word)!=type('a'): 94 | print(word, type(word), word.encode('ascii','ignore').decode('ascii'), type(word.encode('ascii','ignore').decode('ascii'))) 95 | word = word.encode('ascii','ignore').decode('ascii') 96 | ids[token] = self.dictionary[word] 97 | token += 1 98 | # ids[token] = self.dictionary[END_TOKEN] 99 | return ids 100 | 101 | def __len__(self): 102 | return len(self.dictionary) --------------------------------------------------------------------------------
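Usage note: a minimal sketch (not taken from the repository) of how the Corpus tokenizer in util/word_utils.py can be driven; the vocabulary file name 'vocab.txt' and the query sentence are illustrative assumptions only.

# Hypothetical usage of util.word_utils.Corpus; file name and query are made up.
from util.word_utils import Corpus

corpus = Corpus()
corpus.load_file('vocab.txt')               # UTF-8 text file; its whitespace-separated words become the vocabulary, then <unk>/<pad> are appended
ids = corpus.tokenize('the man in a red shirt on the left', max_len=20)
print(ids.shape)                            # torch.Size([20]); shorter queries are padded out with the end/pad tokens
print(corpus.dictionary[ids.tolist()])      # map ids back to tokens; out-of-vocabulary words come back as <unk>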