├── .gitignore ├── LICENSE ├── README.md ├── configs ├── flickr30k │ ├── RefTR_flickr.sh │ ├── RefTR_flickr_roberta.sh │ ├── Ref_flickr_pt.sh │ └── Ref_flickr_pt_101.sh ├── refcoco+ │ ├── RefTR_SEG_refcoco+.sh │ ├── RefTR_SEG_refcoco+_101.sh │ ├── RefTR_refcoco+.sh │ └── RefTR_refcoco+_101.sh ├── refcoco │ ├── RefTR_refcoco.sh │ └── RefTR_refcoco_101.sh ├── refcocog │ ├── RefTR_SEG_refcocog.sh │ ├── RefTR_SEG_refcocog_101.sh │ ├── RefTR_refcocog.sh │ └── RefTR_refcocog_101.sh └── referit │ ├── RefTR_referit.sh │ ├── RefTR_referit_101.sh │ ├── RefTR_referit_101_PT.sh │ └── RefTR_referit_PT.sh ├── datasets ├── __init__.py ├── data_prefetcher.py ├── grounding_datasets │ ├── __init__.py │ ├── refer_dataset.py │ └── resc_refer_dataset.py ├── lang_utils.py ├── refer_multiphrase.py ├── refer_resc.py ├── refer_segmentation.py ├── samplers.py └── transforms.py ├── engine_vg.py ├── main_vg.py ├── models ├── __init__.py ├── criterion.py ├── modeling │ ├── backbone.py │ ├── matcher.py │ ├── position_encoding.py │ ├── segmentation.py │ └── transformer.py ├── post_process.py ├── reftr.py ├── reftr_segmentation.py └── reftr_transformer.py ├── requirements.txt ├── tools ├── launch.py ├── run_dist_launch.sh ├── run_dist_slurm.sh └── vis_log.py ├── tox.ini └── util ├── __init__.py ├── box_ops.py ├── collate_fn.py ├── lr_scheduler.py ├── misc.py ├── plot_utils.py ├── transforms.py └── word_utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.toptal.com/developers/gitignore/api/python,vscode,jupyternotebooks 3 | # Edit at https://www.toptal.com/developers/gitignore?templates=python,vscode,jupyternotebooks 4 | 5 | ### 6 | exp_backup/* 7 | exps/* 8 | data 9 | configs/VinVL_VQA_base/*.bin 10 | SAVED_MODEL/* 11 | 12 | ### JupyterNotebooks ### 13 | # gitignore template for Jupyter Notebooks 14 | # website: http://jupyter.org/ 15 | 16 | .ipynb_checkpoints 17 | */.ipynb_checkpoints/* 18 | 19 | # IPython 20 | profile_default/ 21 | ipython_config.py 22 | 23 | # Remove previous ipynb_checkpoints 24 | # git rm -r .ipynb_checkpoints/ 25 | 26 | ### Python ### 27 | # Byte-compiled / optimized / DLL files 28 | __pycache__/ 29 | *.py[cod] 30 | *$py.class 31 | 32 | # C extensions 33 | *.so 34 | 35 | # Distribution / packaging 36 | .Python 37 | build/ 38 | develop-eggs/ 39 | dist/ 40 | downloads/ 41 | eggs/ 42 | .eggs/ 43 | parts/ 44 | sdist/ 45 | var/ 46 | wheels/ 47 | pip-wheel-metadata/ 48 | share/python-wheels/ 49 | *.egg-info/ 50 | .installed.cfg 51 | *.egg 52 | MANIFEST 53 | 54 | # PyInstaller 55 | # Usually these files are written by a python script from a template 56 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
57 | *.manifest 58 | *.spec 59 | 60 | # Installer logs 61 | pip-log.txt 62 | pip-delete-this-directory.txt 63 | 64 | # Unit test / coverage reports 65 | htmlcov/ 66 | .tox/ 67 | .nox/ 68 | .coverage 69 | .coverage.* 70 | .cache 71 | nosetests.xml 72 | coverage.xml 73 | *.cover 74 | *.py,cover 75 | .hypothesis/ 76 | .pytest_cache/ 77 | pytestdebug.log 78 | 79 | # Translations 80 | *.mo 81 | *.pot 82 | 83 | # Django stuff: 84 | *.log 85 | local_settings.py 86 | db.sqlite3 87 | db.sqlite3-journal 88 | 89 | # Flask stuff: 90 | instance/ 91 | .webassets-cache 92 | 93 | # Scrapy stuff: 94 | .scrapy 95 | 96 | # Sphinx documentation 97 | docs/_build/ 98 | doc/_build/ 99 | 100 | # PyBuilder 101 | target/ 102 | 103 | # Jupyter Notebook 104 | 105 | # IPython 106 | 107 | # pyenv 108 | .python-version 109 | 110 | # pipenv 111 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 112 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 113 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 114 | # install all needed dependencies. 115 | #Pipfile.lock 116 | 117 | # poetry 118 | #poetry.lock 119 | 120 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 121 | __pypackages__/ 122 | 123 | # Celery stuff 124 | celerybeat-schedule 125 | celerybeat.pid 126 | 127 | # SageMath parsed files 128 | *.sage.py 129 | 130 | # Environments 131 | # .env 132 | .env/ 133 | .venv/ 134 | env/ 135 | venv/ 136 | ENV/ 137 | env.bak/ 138 | venv.bak/ 139 | pythonenv* 140 | 141 | # Spyder project settings 142 | .spyderproject 143 | .spyproject 144 | 145 | # Rope project settings 146 | .ropeproject 147 | 148 | # mkdocs documentation 149 | /site 150 | 151 | # mypy 152 | .mypy_cache/ 153 | .dmypy.json 154 | dmypy.json 155 | 156 | # Pyre type checker 157 | .pyre/ 158 | 159 | # pytype static type analyzer 160 | .pytype/ 161 | 162 | # operating system-related files 163 | *.DS_Store #file properties cache/storage on macOS 164 | Thumbs.db #thumbnail cache on Windows 165 | 166 | # profiling data 167 | .prof 168 | 169 | 170 | ### vscode ### 171 | .vscode/* 172 | # !.vscode/settings.json 173 | !.vscode/tasks.json 174 | !.vscode/launch.json 175 | !.vscode/extensions.json 176 | *.code-workspace 177 | 178 | # End of https://www.toptal.com/developers/gitignore/api/python,vscode,jupyternotebooks -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 UBC Computer Vision Group 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # RefTR
2 |
3 | Code for the paper "Referring Transformer: A One-step Approach to Multi-task Visual Grounding"
4 |
5 | ## Requirements
6 |
7 | To install requirements:
8 |
9 | ```setup
10 | pip install -r requirements.txt
11 | ```
12 |
13 | ```
14 | chmod +x tools/run_dist_slurm.sh
15 | ```
16 |
17 | ## Setting up the datasets
18 | ReSC annotation preparation: https://drive.google.com/file/d/1fVwdDvXNbH8uuq_pHD_o5HI7yqeuz0yS/view?usp=sharing
19 |
20 | Flickr30k Entities: http://bryanplummer.com/Flickr30kEntities/
21 |
22 | MSCOCO: http://mscoco.org/dataset/#overview
23 |
24 | Visual Genome Images: https://visualgenome.org/api/v0/api_home.html
25 |
26 | data/annotations: https://drive.google.com/file/d/19qJ8b5sxijKmtN0XG9leWbt2sPkIVqlc/view?usp=sharing
27 |
28 | refcoco/masks: https://drive.google.com/file/d/1oGUewiDtxjouT8Qp4dRzrPfGkc0LZaIT/view?usp=sharing
29 |
30 | refcoco/anns: https://drive.google.com/file/d/1Prhrgm3t2JeY68Ni_1Ig_a4dfZvGC9vZ/view?usp=sharing
31 |
32 | annotations_resc/vg/vg_all.pth: https://drive.google.com/file/d/1_GbWl0sSB1y26fFM9W7DDkXLRR8Ld3IH/view?usp=sharing
33 |
34 | Extract the datasets into the data/ folder. (Tip: you can use symlinks, e.g. `ln -s /path/to/your/datasets ./data`, to avoid putting data and code in the same directory.)
35 | The data/ folder should look like this:
36 | ```
37 | data
38 | ├── annotations
39 | ├── annotations_resc
40 | │   ├── flickr
41 | │   ├── gref
42 | │   ├── gref_umd
43 | │   ├── referit
44 | │   ├── unc
45 | │   ├── unc+
46 | │   └── vg
47 | ├── flickr30k
48 | │   └── f30k_images
49 | ├── refcoco
50 | │   ├── anns
51 | │   ├── images
52 | │   │   ├── train2014  # images from train 2014
53 | │   ├── masks
54 | ├── referit
55 | │   ├── images
56 | └── visualgenome
57 |     └── VG_100K
58 |
59 | ```
60 |
61 | ## Training
62 |
63 | To train the model, run:
64 | ```train
65 | # using slurm system
66 | MASTER_PORT=${Master Port} GPUS_PER_NODE={GPU per node} ./tools/run_dist_slurm.sh RefTR ${Number Of GPU} ${config file name}
67 | ```
68 |
69 | Example:
70 | ```bash
71 | MASTER_PORT=29501 GPUS_PER_NODE=4 ./tools/run_dist_slurm.sh RefTR 4 configs/flickr30k/RefTR_flickr.sh
72 | ```
73 |
74 | ## Evaluation
75 |
76 | To evaluate the model, run:
77 | ```eval
78 | MASTER_PORT=${Master Port} GPUS_PER_NODE={GPU per node} ./tools/run_dist_slurm.sh RefTR ${Number Of GPU} ${config file name} --eval --resume=${path to checkpoint}
79 | ```
80 |
81 | Example:
82 | ```bash
83 | MASTER_PORT=29501 GPUS_PER_NODE=4 ./tools/run_dist_slurm.sh RefTR 4 configs/flickr30k/RefTR_flickr.sh --eval --resume=./exps/flickr30k/checkpoint.pth
84 | ```
85 |
86 | ## Pretrained checkpoints for refcoco REC/RES
87 | | Checkpoint Name | Dataset/Link | Description |
88 | | ----------- | ----------- | --- |
89 | | refcoco_SEG_PT_res50_6_epochs.pth | [refcoco](https://drive.google.com/file/d/151XGTlGTbwGyQ6HMEn2sTEwEeFY9Csjx/view?usp=sharing) | Pretrained 6 epochs on VG |
90 | | refcoco+_SEG_PT_res50_6_epochs.pth | [refcoco+](https://drive.google.com/file/d/1KKd80NReZJ500G6pnY1iRXoWqhJRDn5T/view?usp=sharing) | Pretrained 6 epochs on VG |
91 | | refcocog_SEG_PT_res50_6_epochs.pth | [refcocog](https://drive.google.com/file/d/1oStrCvyJ2KyumXciMg6n8CdvefS9Qjsi/view?usp=sharing) | Pretrained 6 epochs on VG |
92 |
93 | ## BibTeX
94 |
95 | If you find this code useful for your research, please cite our paper:
96 |
97 | ```
98 | @inproceedings{muchen2021referring,
99 |   title={Referring Transformer: A One-step Approach to Multi-task Visual Grounding},
100 |   author={Muchen, Li and Leonid, Sigal},
101 |   booktitle={Thirty-Fifth Conference on Neural Information Processing Systems},
102 |   year={2021}
103 | }
104 | ```
--------------------------------------------------------------------------------
/configs/flickr30k/RefTR_flickr.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # coco pretrained provided by r50_dconvDETR_C5_pretrained_coco_Q100 setting
3 | set -x
4 |
5 | EXP_DIR=exps/RefTR_flickr
6 | PY_ARGS=${@:1}
7 |
8 | conda activate pytorch
9 | which python
10 |
11 | python3.8 -u main_vg.py \
12 |     --pretrained_model "./data/MODEL_ZOO/detr-r50-e632da11.pth"\
13 |     --num_feature_levels 1\
14 |     --dataset flickr30k\
15 |     --dec_layers 6\
16 |     --img_size 640\
17 |     --max_img_size 640\
18 |     --batch_size 16\
19 |     --epochs 60\
20 |     --warm_up_epoch 5\
21 |     --lr_schedule CosineWarmupLR\
22 |     --aux_loss\
23 |     --output_dir ${EXP_DIR} \
24 |     ${PY_ARGS}
25 |
26 | # --num_queries_per_phrase 1\
27 | # --resume
--------------------------------------------------------------------------------
/configs/flickr30k/RefTR_flickr_roberta.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # coco pretrained provided by r50_dconvDETR_C5_pretrained_coco_Q100 setting
3 | set -x
4 |
5 | EXP_DIR=exps/flickr/RefTR_flickr_roberta
6 | PY_ARGS=${@:1}
7 |
8 | conda activate pytorch
9 | which python
10 |
11 | python3.8 -u main_vg.py \
12 |     --pretrained_model "./data/MODEL_ZOO/detr-r50-e632da11.pth"\
13 |     --num_feature_levels 1\
14 |     --num_queries_per_phrase 1\
15 |     --dataset flickr30k\
16 |     --dec_layers 6\
17 |     --bert_model roberta-base\
18 |     --img_size 640\
19 |     --max_img_size 640\
20 |     --batch_size 16\
21 |     --epochs 60\
22 |     --lr_drop 40\
23 |     --aux_loss\
24 |     --output_dir ${EXP_DIR} \
25 |     ${PY_ARGS}
26 |
27 | # --resume
--------------------------------------------------------------------------------
/configs/flickr30k/Ref_flickr_pt.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # coco pretrained provided by r50_dconvDETR_C5_pretrained_coco_Q100 setting
3 | set -x
4 |
5 | EXP_DIR=exps/flickr/RefTR_pt
6 | PY_ARGS=${@:1}
7 |
8 | conda activate pytorch
9 | which python
10 |
11 | python3.8 -u main_vg.py \
12 |     --resume "./SAVED_MODEL/PT/RefTR_VG_PT_08.pth"\
13 |     --resume_model_only\
14 |     --num_feature_levels 1\
15 |     --num_queries_per_phrase 1\
16 |     --dataset flickr30k\
17 |     --dec_layers 6\
18 |     --img_size 640\
19 |     --max_img_size 640\
20 |     --epochs 40\
21 |     --lr_drop 30\
22 |     --aux_loss\
23 |     --output_dir ${EXP_DIR} \
24 |     ${PY_ARGS}
25 |
26 | # --resume
--------------------------------------------------------------------------------
/configs/flickr30k/Ref_flickr_pt_101.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # coco pretrained provided by r50_dconvDETR_C5_pretrained_coco_Q100 setting
3 | set -x
4 |
5 | EXP_DIR=exps/flickr/RefTR_pt_101
6 | PY_ARGS=${@:1}
7 |
8 | conda activate pytorch
9 | which python
10 |
11 | python3.8 -u main_vg.py \
12 |     --resume
"./SAVED_MODEL/PT/RefTR_VG_101_PT_08.pth"\ 13 | --resume_model_only\ 14 | --num_feature_levels 1\ 15 | --num_queries_per_phrase 1\ 16 | --dataset flickr30k\ 17 | --dec_layers 6\ 18 | --img_size 640\ 19 | --max_img_size 640\ 20 | --epochs 40\ 21 | --lr_drop 30\ 22 | --aux_loss\ 23 | --backbone resnet101\ 24 | --output_dir ${EXP_DIR} \ 25 | ${PY_ARGS} 26 | 27 | # --resume -------------------------------------------------------------------------------- /configs/refcoco+/RefTR_SEG_refcoco+.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # coco pretrained provided by r50_dconvDETR_C5_pretrained_coco_Q100 setting 3 | set -x 4 | 5 | 6 | ############################################################################################### 7 | # EXP_DIR=exps/refcoco+/RefTR_SEG 8 | # PY_ARGS=${@:1} 9 | 10 | # conda activate pytorch 11 | # which python 12 | 13 | # python3.8 -u main_vg.py \ 14 | # --pretrained_model "./SAVED_MODEL/refcoco_50_det/RefTR_refcoco+_l6/checkpoint_best.pth"\ 15 | # --num_feature_levels 1\ 16 | # --num_queries_per_phrase 1\ 17 | # --masks\ 18 | # --lr 1e-5\ 19 | # --lr_mask_branch_proj 10\ 20 | # --dataset refcoco+_unc\ 21 | # --train_split train\ 22 | # --test_split val testA testB\ 23 | # --dec_layers 6\ 24 | # --aux_loss \ 25 | # --img_size 640\ 26 | # --max_img_size 640\ 27 | # --epochs 40\ 28 | # --lr_drop 30\ 29 | # --output_dir ${EXP_DIR} \ 30 | # ${PY_ARGS} 31 | 32 | # --resume 33 | 34 | 35 | ############################################################################################### 36 | EXP_DIR=exps/refcoco+/RefTR_SEG_PT 37 | PY_ARGS=${@:1} 38 | 39 | conda activate pytorch 40 | which python 41 | 42 | python3.8 -u main_vg.py \ 43 | --pretrained_model "./SAVED_MODEL/refcoco_50_det_pretrained/RefTR_refcoco+_pt/checkpoint_best.pth"\ 44 | --num_feature_levels 1\ 45 | --num_queries_per_phrase 1\ 46 | --masks\ 47 | --lr 1e-5\ 48 | --lr_mask_branch_proj 10\ 49 | --dataset refcoco+_unc\ 50 | --train_split train\ 51 | --test_split testA testB\ 52 | --dec_layers 6\ 53 | --aux_loss \ 54 | --img_size 640\ 55 | --max_img_size 640\ 56 | --epochs 40\ 57 | --lr_drop 30\ 58 | --output_dir ${EXP_DIR} \ 59 | ${PY_ARGS} -------------------------------------------------------------------------------- /configs/refcoco+/RefTR_SEG_refcoco+_101.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # coco pretrained provided by r50_dconvDETR_C5_pretrained_coco_Q100 setting 3 | set -x 4 | 5 | # EXP_DIR=exps/refcoco+/RefTR_SEG_101 6 | # PY_ARGS=${@:1} 7 | 8 | # conda activate pytorch 9 | # which python 10 | 11 | # python3.8 -u main_vg.py \ 12 | # --pretrained_model "./SAVED_MODEL/refcoco_101_det/RefTR_refcoco+_101/checkpoint_best.pth"\ 13 | # --num_feature_levels 1\ 14 | # --num_queries_per_phrase 1\ 15 | # --masks\ 16 | # --lr 1e-5\ 17 | # --lr_mask_branch_proj 10\ 18 | # --dataset refcoco+_unc\ 19 | # --train_split train\ 20 | # --test_split val testA testB\ 21 | # --dec_layers 6\ 22 | # --backbone resnet101\ 23 | # --aux_loss \ 24 | # --img_size 640\ 25 | # --max_img_size 640\ 26 | # --epochs 40\ 27 | # --lr_drop 30\ 28 | # --output_dir ${EXP_DIR} \ 29 | # ${PY_ARGS} 30 | 31 | # --resume 32 | 33 | EXP_DIR=exps/refcoco+/RefTR_SEG_101_PT 34 | PY_ARGS=${@:1} 35 | 36 | conda activate pytorch 37 | which python 38 | 39 | python3.8 -u main_vg.py \ 40 | --pretrained_model "./SAVED_MODEL/refcoco_101_det_pretrained/RefTR_refcoco+_pt/checkpoint_best.pth"\ 41 | --num_feature_levels 1\ 
42 | --num_queries_per_phrase 1\ 43 | --masks\ 44 | --lr 1e-5\ 45 | --lr_mask_branch_proj 10\ 46 | --dataset refcoco+_unc\ 47 | --train_split train\ 48 | --test_split val testA testB\ 49 | --dec_layers 6\ 50 | --backbone resnet101\ 51 | --aux_loss \ 52 | --img_size 640\ 53 | --max_img_size 640\ 54 | --epochs 40\ 55 | --lr_drop 30\ 56 | --ckpt_cycle 60\ 57 | --output_dir ${EXP_DIR} \ 58 | ${PY_ARGS} 59 | -------------------------------------------------------------------------------- /configs/refcoco+/RefTR_refcoco+.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # coco pretrained provided by r50_dconvDETR_C5_pretrained_coco_Q100 setting 3 | set -x 4 | 5 | EXP_DIR=exps/RefTR_refcoco+_unc 6 | PY_ARGS=${@:1} 7 | 8 | conda activate pytorch 9 | which python 10 | 11 | python3.8 -u main_vg.py \ 12 | --pretrained_model "./data/MODEL_ZOO/detr-r50-e632da11.pth"\ 13 | --num_feature_levels 1\ 14 | --num_queries_per_phrase 1\ 15 | --dataset refcoco+_unc\ 16 | --train_split train\ 17 | --test_split val testA testB\ 18 | --dec_layers 6\ 19 | --aux_loss \ 20 | --img_size 640\ 21 | --max_img_size 640\ 22 | --epochs 90\ 23 | --lr_drop 60\ 24 | --output_dir ${EXP_DIR} \ 25 | ${PY_ARGS} 26 | 27 | # --resume -------------------------------------------------------------------------------- /configs/refcoco+/RefTR_refcoco+_101.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # coco pretrained provided by r50_dconvDETR_C5_pretrained_coco_Q100 setting 3 | set -x 4 | 5 | EXP_DIR=exps/RefTR_refcoco+_unc_101 6 | PY_ARGS=${@:1} 7 | 8 | conda activate pytorch 9 | which python 10 | 11 | python3.8 -u main_vg.py \ 12 | --pretrained_model "./data/MODEL_ZOO/detr-r101-2c7b67e5.pth"\ 13 | --num_feature_levels 1\ 14 | --num_queries_per_phrase 1\ 15 | --dataset refcoco+_unc\ 16 | --train_split train\ 17 | --test_split val testA testB\ 18 | --dec_layers 6\ 19 | --backbone resnet101\ 20 | --aux_loss \ 21 | --img_size 640\ 22 | --max_img_size 640\ 23 | --epochs 90\ 24 | --lr_drop 60\ 25 | --output_dir ${EXP_DIR} \ 26 | ${PY_ARGS} 27 | 28 | # --resume -------------------------------------------------------------------------------- /configs/refcoco/RefTR_refcoco.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | set -x 3 | 4 | PY_ARGS=${@:1} 5 | 6 | conda activate pytorch 7 | which python 8 | 9 | EXP_DIR=exps/refcoco/r50_det 10 | python3.8 -u main_vg.py \ 11 | --pretrained_model "./data/MODEL_ZOO/detr-r50-e632da11.pth"\ 12 | --num_feature_levels 1\ 13 | --dataset refcoco_unc\ 14 | --train_split train\ 15 | --test_split val testA testB\ 16 | --dec_layers 6\ 17 | --aux_loss \ 18 | --img_size 640\ 19 | --max_img_size 640\ 20 | --epochs 90\ 21 | --lr_drop 60\ 22 | --output_dir ${EXP_DIR} \ 23 | ${PY_ARGS} 24 | 25 | EXP_DIR=exps/refcoco/r50 26 | python3.8 -u main_vg.py \ 27 | --pretrained_model "./exps/refcoco/refTR_det/checkpoint_best.pth"\ 28 | --num_feature_levels 1\ 29 | --masks\ 30 | --lr 1e-5\ 31 | --lr_mask_branch_proj 10\ 32 | --dataset refcoco_unc\ 33 | --train_split train\ 34 | --test_split val testA testB\ 35 | --dec_layers 6\ 36 | --aux_loss \ 37 | --img_size 640\ 38 | --max_img_size 640\ 39 | --epochs 40\ 40 | --lr_drop 30\ 41 | --output_dir ${EXP_DIR} \ 42 | ${PY_ARGS} -------------------------------------------------------------------------------- /configs/refcoco/RefTR_refcoco_101.sh: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # coco pretrained provided by r50_dconvDETR_C5_pretrained_coco_Q100 setting 3 | set -x 4 | 5 | PY_ARGS=${@:1} 6 | 7 | conda activate pytorch 8 | which python 9 | 10 | EXP_DIR=exps/refcoco/r101_det 11 | python3.8 -u main_vg.py \ 12 | --pretrained_model "./data/MODEL_ZOO/detr-r101-2c7b67e5.pth"\ 13 | --num_feature_levels 1\ 14 | --dataset refcoco_unc\ 15 | --train_split train\ 16 | --test_split val testA testB\ 17 | --dec_layers 6\ 18 | --backbone resnet101\ 19 | --aux_loss \ 20 | --img_size 640\ 21 | --max_img_size 640\ 22 | --epochs 90\ 23 | --lr_drop 60\ 24 | --output_dir ${EXP_DIR} \ 25 | ${PY_ARGS} 26 | 27 | 28 | EXP_DIR=exps/refcoco/r101 29 | python3.8 -u main_vg.py \ 30 | --pretrained_model "./SAVED_MODEL/refcoco_101_det/RefTR_refcoco_101/checkpoint_best.pth"\ 31 | --num_feature_levels 1\ 32 | --masks\ 33 | --lr 1e-5\ 34 | --lr_mask_branch_proj 10\ 35 | --dataset refcoco_unc\ 36 | --train_split train\ 37 | --test_split val testA testB\ 38 | --dec_layers 6\ 39 | --backbone resnet101\ 40 | --aux_loss \ 41 | --img_size 640\ 42 | --max_img_size 640\ 43 | --epochs 40\ 44 | --lr_drop 30\ 45 | --output_dir ${EXP_DIR} \ -------------------------------------------------------------------------------- /configs/refcocog/RefTR_SEG_refcocog.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # coco pretrained provided by r50_dconvDETR_C5_pretrained_coco_Q100 setting 3 | set -x 4 | 5 | ############################################################################################### 6 | # EXP_DIR=exps/refcocog/RefTR_SEG 7 | # PY_ARGS=${@:1} 8 | 9 | # conda activate pytorch 10 | # which python 11 | 12 | # python3.8 -u main_vg.py \ 13 | # --pretrained_model "./SAVED_MODEL/refcoco_50_det/RefTR_refcocog_l6/checkpoint_best.pth"\ 14 | # --num_feature_levels 1\ 15 | # --num_queries_per_phrase 1\ 16 | # --masks\ 17 | # --lr 1e-5\ 18 | # --lr_mask_branch_proj 10\ 19 | # --dataset refcocog_umd\ 20 | # --train_split train\ 21 | # --test_split val test\ 22 | # --dec_layers 6\ 23 | # --aux_loss \ 24 | # --img_size 640\ 25 | # --max_img_size 640\ 26 | # --epochs 40\ 27 | # --lr_drop 30\ 28 | # --output_dir ${EXP_DIR} \ 29 | # ${PY_ARGS} 30 | 31 | # # --resume 32 | 33 | ############################################################################################### 34 | EXP_DIR=exps/refcocog/RefTR_SEG_PT 35 | PY_ARGS=${@:1} 36 | 37 | conda activate pytorch 38 | which python 39 | 40 | python3.8 -u main_vg.py \ 41 | --pretrained_model "./SAVED_MODEL/refcoco_50_det_pretrained/RefTR_refcocog_pt/checkpoint_best.pth"\ 42 | --num_feature_levels 1\ 43 | --num_queries_per_phrase 1\ 44 | --masks\ 45 | --lr 1e-5\ 46 | --lr_mask_branch_proj 10\ 47 | --dataset refcocog_umd\ 48 | --train_split train\ 49 | --test_split test\ 50 | --dec_layers 6\ 51 | --aux_loss \ 52 | --img_size 640\ 53 | --max_img_size 640\ 54 | --epochs 40\ 55 | --lr_drop 30\ 56 | --output_dir ${EXP_DIR} \ 57 | ${PY_ARGS} 58 | 59 | # --resume -------------------------------------------------------------------------------- /configs/refcocog/RefTR_SEG_refcocog_101.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # coco pretrained provided by r50_dconvDETR_C5_pretrained_coco_Q100 setting 3 | set -x 4 | 5 | # EXP_DIR=exps/refcocog/RefTR_SEG_101 6 | # PY_ARGS=${@:1} 7 | 8 | # conda activate pytorch 9 | # which python 10 | 11 | # python3.8 
-u main_vg.py \ 12 | # --pretrained_model "./SAVED_MODEL/refcoco_101_det/RefTR_refcocog_101/checkpoint_best.pth"\ 13 | # --num_feature_levels 1\ 14 | # --num_queries_per_phrase 1\ 15 | # --masks\ 16 | # --lr 1e-5\ 17 | # --lr_mask_branch_proj 10\ 18 | # --dataset refcocog_umd\ 19 | # --train_split train\ 20 | # --test_split val test\ 21 | # --dec_layers 6\ 22 | # --backbone resnet101\ 23 | # --aux_loss \ 24 | # --img_size 640\ 25 | # --max_img_size 640\ 26 | # --epochs 40\ 27 | # --lr_drop 30\ 28 | # --output_dir ${EXP_DIR} \ 29 | # ${PY_ARGS} 30 | 31 | # --resume 32 | 33 | EXP_DIR=exps/refcocog/RefTR_SEG_101_PT 34 | PY_ARGS=${@:1} 35 | 36 | conda activate pytorch 37 | which python 38 | 39 | python3.8 -u main_vg.py \ 40 | --pretrained_model "./SAVED_MODEL/refcoco_101_det_pretrained/RefTR_refcocog_pt/checkpoint_best.pth"\ 41 | --num_feature_levels 1\ 42 | --num_queries_per_phrase 1\ 43 | --masks\ 44 | --lr 1e-5\ 45 | --lr_mask_branch_proj 10\ 46 | --dataset refcocog_umd\ 47 | --train_split train\ 48 | --test_split val test\ 49 | --dec_layers 6\ 50 | --backbone resnet101\ 51 | --aux_loss \ 52 | --img_size 640\ 53 | --max_img_size 640\ 54 | --epochs 40\ 55 | --lr_drop 30\ 56 | --ckpt_cycle 60\ 57 | --output_dir ${EXP_DIR} \ 58 | ${PY_ARGS} -------------------------------------------------------------------------------- /configs/refcocog/RefTR_refcocog.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # coco pretrained provided by r50_dconvDETR_C5_pretrained_coco_Q100 setting 3 | set -x 4 | 5 | EXP_DIR=exps/RefTR_refcocog_unc 6 | PY_ARGS=${@:1} 7 | 8 | conda activate pytorch 9 | which python 10 | 11 | python3.8 -u main_vg.py \ 12 | --pretrained_model "./data/MODEL_ZOO/detr-r50-e632da11.pth"\ 13 | --num_feature_levels 1\ 14 | --num_queries_per_phrase 1\ 15 | --dataset refcocog_umd\ 16 | --train_split train\ 17 | --test_split val test\ 18 | --dec_layers 6\ 19 | --aux_loss \ 20 | --img_size 640\ 21 | --max_img_size 640\ 22 | --epochs 90\ 23 | --lr_drop 60\ 24 | --output_dir ${EXP_DIR} \ 25 | ${PY_ARGS} 26 | 27 | # --resume -------------------------------------------------------------------------------- /configs/refcocog/RefTR_refcocog_101.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # coco pretrained provided by r50_dconvDETR_C5_pretrained_coco_Q100 setting 3 | set -x 4 | 5 | EXP_DIR=exps/RefTR_refcocog_unc_101 6 | PY_ARGS=${@:1} 7 | 8 | conda activate pytorch 9 | which python 10 | 11 | python3.8 -u main_vg.py \ 12 | --pretrained_model "./data/MODEL_ZOO/detr-r101-2c7b67e5.pth"\ 13 | --num_feature_levels 1\ 14 | --num_queries_per_phrase 1\ 15 | --dataset refcocog_umd\ 16 | --train_split train\ 17 | --test_split val test\ 18 | --dec_layers 6\ 19 | --backbone resnet101\ 20 | --aux_loss \ 21 | --img_size 640\ 22 | --max_img_size 640\ 23 | --epochs 90\ 24 | --lr_drop 60\ 25 | --output_dir ${EXP_DIR} \ 26 | ${PY_ARGS} 27 | 28 | # --resume -------------------------------------------------------------------------------- /configs/referit/RefTR_referit.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # coco pretrained provided by r50_dconvDETR_C5_pretrained_coco_Q100 setting 3 | set -x 4 | 5 | EXP_DIR=exps/referit/RefTR 6 | PY_ARGS=${@:1} 7 | 8 | conda activate pytorch 9 | which python 10 | 11 | python3.8 -u main_vg.py \ 12 | --pretrained_model "./data/MODEL_ZOO/detr-r50-e632da11.pth"\ 13 | --num_feature_levels 1\ 
14 | --num_queries_per_phrase 1\ 15 | --dec_layers 3\ 16 | --aux_loss \ 17 | --dataset referit\ 18 | --img_size 640\ 19 | --max_img_size 640\ 20 | --epochs 90\ 21 | --lr_drop 60\ 22 | --ckpt_cycle 45\ 23 | --output_dir ${EXP_DIR} \ 24 | ${PY_ARGS} 25 | 26 | # # --resume 27 | # python3.8 -u main_vg.py \ 28 | # --resume ${EXP_DIR}/checkpoint0069.pth\ 29 | # --num_feature_levels 1\ 30 | # --num_queries_per_phrase 1\ 31 | # --dec_layers 3\ 32 | # --dataset referit\ 33 | # --img_size 640\ 34 | # --max_img_size 640\ 35 | # --epochs 90\ 36 | # --lr_drop 60\ 37 | # --output_dir ${EXP_DIR} \ 38 | # ${PY_ARGS} -------------------------------------------------------------------------------- /configs/referit/RefTR_referit_101.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # coco pretrained provided by r50_dconvDETR_C5_pretrained_coco_Q100 setting 3 | set -x 4 | 5 | EXP_DIR=exps/referit/RefTR_101 6 | PY_ARGS=${@:1} 7 | 8 | conda activate pytorch 9 | which python 10 | 11 | python3.8 -u main_vg.py \ 12 | --pretrained_model "./data/MODEL_ZOO/detr-r101-2c7b67e5.pth"\ 13 | --num_feature_levels 1\ 14 | --num_queries_per_phrase 1\ 15 | --dec_layers 3\ 16 | --backbone resnet101\ 17 | --aux_loss \ 18 | --dataset referit\ 19 | --img_size 640\ 20 | --max_img_size 640\ 21 | --epochs 90\ 22 | --lr_drop 60\ 23 | --ckpt_cycle 45\ 24 | --output_dir ${EXP_DIR} \ 25 | ${PY_ARGS} 26 | 27 | # # --resume 28 | # python3.8 -u main_vg.py \ 29 | # --resume ${EXP_DIR}/checkpoint0069.pth\ 30 | # --num_feature_levels 1\ 31 | # --num_queries_per_phrase 1\ 32 | # --dec_layers 3\ 33 | # --dataset referit\ 34 | # --img_size 640\ 35 | # --max_img_size 640\ 36 | # --epochs 90\ 37 | # --lr_drop 60\ 38 | # --output_dir ${EXP_DIR} \ 39 | # ${PY_ARGS} -------------------------------------------------------------------------------- /configs/referit/RefTR_referit_101_PT.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # coco pretrained provided by r50_dconvDETR_C5_pretrained_coco_Q100 setting 3 | set -x 4 | 5 | EXP_DIR=exps/referit/RefTR_101_PT 6 | PY_ARGS=${@:1} 7 | 8 | conda activate pytorch 9 | which python 10 | 11 | python3.8 -u main_vg.py \ 12 | --resume './SAVED_MODEL/PT/RefTR_VG_101_PT_08.pth'\ 13 | --resume_model_only\ 14 | --num_feature_levels 1\ 15 | --num_queries_per_phrase 1\ 16 | --dec_layers 6\ 17 | --backbone resnet101\ 18 | --aux_loss \ 19 | --dataset referit\ 20 | --img_size 640\ 21 | --max_img_size 640\ 22 | --ckpt_cycle 90\ 23 | --epochs 90\ 24 | --lr_drop 60\ 25 | --output_dir ${EXP_DIR} \ 26 | ${PY_ARGS} 27 | 28 | # # --resume 29 | # python3.8 -u main_vg.py \ 30 | # --resume ${EXP_DIR}/checkpoint0069.pth\ 31 | # --num_feature_levels 1\ 32 | # --num_queries_per_phrase 1\ 33 | # --dec_layers 3\ 34 | # --dataset referit\ 35 | # --img_size 640\ 36 | # --max_img_size 640\ 37 | # --epochs 90\ 38 | # --lr_drop 60\ 39 | # --output_dir ${EXP_DIR} \ 40 | # ${PY_ARGS} -------------------------------------------------------------------------------- /configs/referit/RefTR_referit_PT.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # coco pretrained provided by r50_dconvDETR_C5_pretrained_coco_Q100 setting 3 | set -x 4 | 5 | EXP_DIR=exps/referit/RefTR_PT 6 | PY_ARGS=${@:1} 7 | 8 | conda activate pytorch 9 | which python 10 | 11 | python3.8 -u main_vg.py \ 12 | --resume './SAVED_MODEL/PT/RefTR_VG_PT_08.pth'\ 13 | --resume_model_only\ 14 | 
--num_feature_levels 1\ 15 | --num_queries_per_phrase 1\ 16 | --dec_layers 6\ 17 | --aux_loss \ 18 | --dataset referit\ 19 | --img_size 640\ 20 | --max_img_size 640\ 21 | --ckpt_cycle 90\ 22 | --epochs 90\ 23 | --lr_drop 60\ 24 | --output_dir ${EXP_DIR} \ 25 | ${PY_ARGS} 26 | 27 | # # --resume 28 | # python3.8 -u main_vg.py \ 29 | # --resume ${EXP_DIR}/checkpoint0069.pth\ 30 | # --num_feature_levels 1\ 31 | # --num_queries_per_phrase 1\ 32 | # --dec_layers 3\ 33 | # --dataset referit\ 34 | # --img_size 640\ 35 | # --max_img_size 640\ 36 | # --epochs 90\ 37 | # --lr_drop 60\ 38 | # --output_dir ${EXP_DIR} \ 39 | # ${PY_ARGS} -------------------------------------------------------------------------------- /datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | import torch.utils.data 11 | 12 | from .refer_multiphrase import build_flickr30k 13 | from .refer_segmentation import build_refcoco_segmentation 14 | from .refer_resc import build_flickr30k_resc, build_refcoco_resc, build_referit_resc, build_visualgenome, GeneralReferDataset 15 | 16 | 17 | def build_refer_dataset(image_set, args): 18 | if args.masks: 19 | return build_refcoco_segmentation( 20 | split=image_set, 21 | version=args.dataset, 22 | img_size=args.img_size, 23 | max_img_size=args.max_img_size, 24 | data_root="./data/refcoco/anns", 25 | im_dir="./data/refcoco/images/train2014", 26 | seg_dir="./data/refcoco/masks", 27 | bert_model=args.bert_model 28 | ) 29 | 30 | if args.dataset == 'flickr30k': 31 | # if args.reftr_type == 'transformer_single_phrase': 32 | # print("Using One stage grounding's flickr30k") 33 | # return build_flickr30k_resc( 34 | # split=image_set, 35 | # img_size=args.img_size, 36 | # max_img_size=args.max_img_size, 37 | # data_root="./data/annotations_resc", 38 | # im_dir="./data/flickr30k/f30k_images" 39 | # ) 40 | # else: 41 | return build_flickr30k( 42 | split=image_set, 43 | img_size=args.img_size, 44 | max_img_size=args.max_img_size, 45 | data_root="./data/annotations", 46 | im_dir="./data/flickr30k/f30k_images", 47 | bert_model=args.bert_model 48 | ) 49 | # print("Flicker Dataset size:", len(dataset_train)) 50 | elif args.dataset == 'referit': 51 | return build_referit_resc( 52 | split=image_set, 53 | data_root="./data/annotations_resc", 54 | max_query_len=40, 55 | img_size=args.img_size, 56 | max_img_size=args.max_img_size, 57 | bert_model=args.bert_model 58 | ) 59 | elif args.dataset.startswith('refcoco'): 60 | if args.dataset == 'refcoco_unc': 61 | version = 'unc' 62 | elif args.dataset == 'refcoco+_unc': 63 | version = 'unc+' 64 | elif args.dataset == 'refcocog_google': 65 | version = 'gref' 66 | elif args.dataset == 'refcocog_umd': 67 | version = 'gref_umd' 68 | return build_refcoco_resc( 69 | split=image_set, 70 | version=version, 71 | data_root="./data/annotations_resc", 72 | im_dir="./data/refcoco/images/train2014", 73 | max_query_len=40, 74 | img_size=args.img_size, 75 | max_img_size=args.max_img_size, 76 | 
bert_model=args.bert_model 77 | ) 78 | elif args.dataset == 'vg': 79 | if image_set != 'all': 80 | return build_referit_resc( 81 | split=image_set, 82 | data_root="./data/annotations_resc", 83 | max_query_len=40, 84 | img_size=args.img_size, 85 | max_img_size=args.max_img_size, 86 | bert_model=args.bert_model 87 | ) 88 | return build_visualgenome( 89 | split='all', 90 | data_root="./data/annotations_resc", 91 | im_dir="./data/visualgenome/VG_100K", 92 | max_query_len=40, 93 | img_size=args.img_size, 94 | max_img_size=args.max_img_size, 95 | bert_model=args.bert_model 96 | ) 97 | elif args.dataset == 'flickr30k_resc': 98 | return build_flickr30k_resc( 99 | split=image_set, 100 | img_size=args.img_size, 101 | max_img_size=args.max_img_size, 102 | max_query_len=40, 103 | data_root="./data/annotations_resc", 104 | im_dir="./data/flickr30k/f30k_images", 105 | bert_model=args.bert_model 106 | ) 107 | elif args.dataset == 'flickr30k_refcoco': 108 | f30k = build_flickr30k_resc( 109 | split=image_set, 110 | img_size=args.img_size, 111 | max_img_size=args.max_img_size, 112 | max_query_len=40, 113 | data_root="./data/annotations_resc", 114 | im_dir="./data/flickr30k/f30k_images", 115 | bert_model=args.bert_model 116 | ) 117 | refcoco = build_refcoco_resc( 118 | split='trainval', 119 | version='unc', 120 | max_query_len=40, 121 | img_size=args.img_size, 122 | max_img_size=args.max_img_size, 123 | data_root="./data/annotations_resc", 124 | im_dir="./data/refcoco/images/train2014", 125 | bert_model=args.bert_model 126 | ) 127 | if image_set.startswith('train'): 128 | return GeneralReferDataset(datasets=[f30k, refcoco]) 129 | else: 130 | return f30k 131 | else: 132 | raise NotImplementedError 133 | 134 | def build_refer_segmentaion_dataset(image_set, args): 135 | return build_refcoco_segmentation( 136 | split=image_set, version=args.dataset 137 | ) 138 | -------------------------------------------------------------------------------- /datasets/data_prefetcher.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | def to_cuda(samples, targets, device): 4 | samples = samples.to(device, non_blocking=True) 5 | targets = [{k: v.to(device, non_blocking=True) for k, v in t.items()} for t in targets] 6 | return samples, targets 7 | 8 | class data_prefetcher(): 9 | def __init__(self, loader, device, prefetch=True): 10 | self.loader = iter(loader) 11 | self.prefetch = prefetch 12 | self.device = device 13 | if prefetch: 14 | self.stream = torch.cuda.Stream() 15 | self.preload() 16 | 17 | def preload(self): 18 | try: 19 | self.next_samples, self.next_targets = next(self.loader) 20 | except StopIteration: 21 | self.next_samples = None 22 | self.next_targets = None 23 | return 24 | # if record_stream() doesn't work, another option is to make sure device inputs are created 25 | # on the main stream. 26 | # self.next_input_gpu = torch.empty_like(self.next_input, device='cuda') 27 | # self.next_target_gpu = torch.empty_like(self.next_target, device='cuda') 28 | # Need to make sure the memory allocated for next_* is not still in use by the main stream 29 | # at the time we start copying to next_*: 30 | # self.stream.wait_stream(torch.cuda.current_stream()) 31 | with torch.cuda.stream(self.stream): 32 | self.next_samples, self.next_targets = to_cuda(self.next_samples, self.next_targets, self.device) 33 | # more code for the alternative if record_stream() doesn't work: 34 | # copy_ will record the use of the pinned source tensor in this side stream. 
35 | # self.next_input_gpu.copy_(self.next_input, non_blocking=True) 36 | # self.next_target_gpu.copy_(self.next_target, non_blocking=True) 37 | # self.next_input = self.next_input_gpu 38 | # self.next_target = self.next_target_gpu 39 | 40 | # With Amp, it isn't necessary to manually convert data to half. 41 | # if args.fp16: 42 | # self.next_input = self.next_input.half() 43 | # else: 44 | 45 | def next(self): 46 | if self.prefetch: 47 | torch.cuda.current_stream().wait_stream(self.stream) 48 | samples = self.next_samples 49 | targets = self.next_targets 50 | if samples is not None: 51 | samples.record_stream(torch.cuda.current_stream()) 52 | if targets is not None: 53 | for t in targets: 54 | for k, v in t.items(): 55 | v.record_stream(torch.cuda.current_stream()) 56 | self.preload() 57 | else: 58 | try: 59 | samples, targets = next(self.loader) 60 | samples, targets = to_cuda(samples, targets, self.device) 61 | except StopIteration: 62 | samples = None 63 | targets = None 64 | return samples, targets 65 | -------------------------------------------------------------------------------- /datasets/grounding_datasets/__init__.py: -------------------------------------------------------------------------------- 1 | import re 2 | from .refer_dataset import FlickrMultiPhraseDataset, ReferSegDataset 3 | 4 | def _truncate_seq_pair(tokens_a, tokens_b, max_length): 5 | """Truncates a sequence pair in place to the maximum length.""" 6 | while True: 7 | total_length = len(tokens_a) + len(tokens_b) 8 | if total_length <= max_length: 9 | break 10 | if len(tokens_a) > len(tokens_b): 11 | tokens_a.pop() 12 | else: 13 | tokens_b.pop() 14 | 15 | def read_examples(input_line, unique_id): 16 | """ 17 | Note from JOJO: this should be copied from bert source code 18 | refer to: 19 | https://daiwk.github.io/posts/nlp-bert-code-annotated-application.html#inputexample 20 | for understanding 21 | Read a list of `InputExample`s from an input file.""" 22 | examples = [] 23 | # unique_id = 0 24 | line = input_line #reader.readline() 25 | # if not line: 26 | # break 27 | line = line.strip() 28 | text_a = None 29 | text_b = None 30 | m = re.match(r"^(.*) \|\|\| (.*)$", line) 31 | if m is None: 32 | text_a = line 33 | else: 34 | text_a = m.group(1) 35 | text_b = m.group(2) 36 | examples.append( 37 | InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b)) 38 | # unique_id += 1 39 | return examples 40 | 41 | ## Bert text encoding 42 | class InputExample(object): 43 | def __init__(self, unique_id, text_a, text_b): 44 | self.unique_id = unique_id 45 | self.text_a = text_a 46 | self.text_b = text_b 47 | 48 | class InputFeatures(object): 49 | """A single set of features of data.""" 50 | def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids): 51 | self.unique_id = unique_id 52 | self.tokens = tokens 53 | self.input_ids = input_ids 54 | self.input_mask = input_mask 55 | self.input_type_ids = input_type_ids 56 | 57 | def convert_examples_to_features(examples, seq_length, tokenizer): 58 | """Loads a data file into a list of `InputBatch`s.""" 59 | features = [] 60 | for (ex_index, example) in enumerate(examples): 61 | tokens_a = tokenizer.tokenize(example.text_a) 62 | 63 | tokens_b = None 64 | if example.text_b: 65 | tokens_b = tokenizer.tokenize(example.text_b) 66 | 67 | if tokens_b: 68 | # Modifies `tokens_a` and `tokens_b` in place so that the total 69 | # length is less than the specified length. 
70 | # Account for [CLS], [SEP], [SEP] with "- 3" 71 | _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3) 72 | else: 73 | # Account for [CLS] and [SEP] with "- 2" 74 | if len(tokens_a) > seq_length - 2: 75 | tokens_a = tokens_a[0:(seq_length - 2)] 76 | tokens = [] 77 | input_type_ids = [] 78 | tokens.append("[CLS]") 79 | input_type_ids.append(0) 80 | for token in tokens_a: 81 | tokens.append(token) 82 | input_type_ids.append(0) 83 | tokens.append("[SEP]") 84 | input_type_ids.append(0) 85 | 86 | if tokens_b: 87 | for token in tokens_b: 88 | tokens.append(token) 89 | input_type_ids.append(1) 90 | tokens.append("[SEP]") 91 | input_type_ids.append(1) 92 | 93 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 94 | 95 | # The mask has 1 for real tokens and 0 for padding tokens. Only real 96 | # tokens are attended to. 97 | input_mask = [1] * len(input_ids) 98 | 99 | # Zero-pad up to the sequence length. 100 | while len(input_ids) < seq_length: 101 | input_ids.append(0) 102 | input_mask.append(0) 103 | input_type_ids.append(0) 104 | 105 | assert len(input_ids) == seq_length 106 | assert len(input_mask) == seq_length 107 | assert len(input_type_ids) == seq_length 108 | features.append( 109 | InputFeatures( 110 | unique_id=example.unique_id, 111 | tokens=tokens, 112 | input_ids=input_ids, 113 | input_mask=input_mask, 114 | input_type_ids=input_type_ids)) 115 | return features -------------------------------------------------------------------------------- /datasets/grounding_datasets/refer_dataset.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Copied from https://github.com/zyang-ur/ReSC/blob/e4022f87bfd11200b67c4509bb9746640834ceae/utils/transforms.py 5 | 6 | ReferIt, UNC, UNC+ and GRef referring image segmentation PyTorch dataset. 7 | Define and group batches of images, segmentations and queries. 
8 | Based on: 9 | https://github.com/chenxi116/TF-phrasecut-public/blob/master/build_batches.py 10 | """ 11 | 12 | import os 13 | import sys 14 | import cv2 15 | import json 16 | import uuid 17 | import tqdm 18 | import math 19 | import torch 20 | import random 21 | import argparse 22 | import collections 23 | import logging 24 | import re 25 | import operator 26 | # import h5py 27 | import numpy as np 28 | import os.path as osp 29 | import scipy.io as sio 30 | import torch.utils.data as data 31 | from collections import OrderedDict 32 | sys.path.append('.') 33 | # import util 34 | from util.word_utils import Corpus 35 | 36 | from transformers import BertTokenizerFast, RobertaTokenizerFast 37 | from util.transforms import letterbox, random_affine 38 | from datasets.lang_utils import convert_examples_to_features, read_examples 39 | # sys.modules['utils'] = utils 40 | 41 | cv2.setNumThreads(0) 42 | 43 | def build_bert_tokenizer(bert_model): 44 | if bert_model.split('-')[0] == 'roberta': 45 | lang_backbone = RobertaTokenizerFast.from_pretrained(bert_model, do_lower_case=True, do_basic_tokenize=False) 46 | else: 47 | lang_backbone = BertTokenizerFast.from_pretrained(bert_model, do_lower_case=True, do_basic_tokenize=False) 48 | return lang_backbone 49 | 50 | class DatasetNotFoundError(Exception): 51 | pass 52 | 53 | class FlickrMultiPhraseDataset(data.Dataset): 54 | SUPPORTED_DATASETS = { 55 | 'flickr': {'splits': ('train', 'val', 'test', 'trainval')} 56 | } 57 | 58 | def __init__( 59 | self, data_root, im_dir, dataset='referit', split='train', max_seq_len=88, 60 | max_num_phrases=16, max_phrase_len=22, bert_model='bert-base-uncased', lstm=False): 61 | self.images = [] 62 | self.data_root = data_root 63 | self.im_dir = im_dir 64 | self.dataset = dataset 65 | self.seq_len = max_seq_len 66 | self.phrase_seq_len = max_phrase_len 67 | self.num_phrases = max_num_phrases 68 | self.split = split 69 | 70 | print("Using tokenizer from:", bert_model) 71 | self.tokenizer = build_bert_tokenizer(bert_model) 72 | # self.tokenizer.add_special_tokens({'cls_phrase': '[CLS_P]', 'sperator_phrase': '[SEP_P]'}) 73 | 74 | annotation_path = osp.join(data_root, self.dataset) 75 | valid_splits = self.SUPPORTED_DATASETS[self.dataset]['splits'] 76 | 77 | if split not in valid_splits: 78 | raise ValueError( 79 | 'Dataset {0} does not have split {1}'.format( 80 | self.dataset, split)) 81 | 82 | splits = ['train', 'val'] if split == 'trainval' else [split] 83 | for split in splits: 84 | imgset_file = '{0}_{1}.pth'.format(self.dataset, split) 85 | imgset_path = osp.join(annotation_path, imgset_file) 86 | self.images += torch.load(imgset_path) 87 | 88 | def exists_dataset(self): 89 | return osp.exists(osp.join(self.data_root, self.dataset)) 90 | 91 | def pull_item(self, idx): 92 | if self.dataset == 'flickr': 93 | img_file, phrase_pos, bbox, phrases, _, sentence = self.images[idx] 94 | else: 95 | img_file, _, bbox, phrase, sentence = self.images[idx] 96 | phrases = [sentence] 97 | phrase_pos = [0] 98 | ## box format: to x1y1x2y2 99 | bbox = np.array(bbox, dtype=int) 100 | 101 | img_path = osp.join(self.im_dir, img_file) 102 | img = cv2.imread(img_path) 103 | ## duplicate channel if gray image 104 | if img.shape[-1] > 1: 105 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 106 | else: 107 | img = np.stack([img] * 3) 108 | return img, phrases, phrase_pos, sentence, bbox, img_file 109 | 110 | def __len__(self): 111 | return len(self.images) 112 | 113 | def __getitem__(self, idx): 114 | def phrase_pos_to_mask(pos_start, 
sentence_token, phrase_token, seq_len): 115 | phrase_len = len(phrase_token) - 2 116 | pos_start = pos_start + 1 117 | assert phrase_len >= 0 118 | assert sentence_token[pos_start:phrase_len+pos_start] == phrase_token[1:-1] 119 | 120 | mask = np.zeros(seq_len, dtype=np.bool) 121 | if phrase_len == 0: 122 | mask[0] = True 123 | else: 124 | mask[pos_start:phrase_len+pos_start] = True 125 | return mask 126 | 127 | img, phrases, phrase_char_pos_l, sentence, bbox, img_file = self.pull_item(idx) 128 | # phrase = phrase.decode("utf-8").encode().lower() 129 | 130 | # encode phrase to bert input 131 | tokenized_sentence = self.tokenizer( 132 | sentence, 133 | padding='max_length', 134 | max_length=self.seq_len, 135 | return_tensors='pt', 136 | ) 137 | word_id = tokenized_sentence['input_ids'][0] 138 | word_mask = tokenized_sentence['attention_mask'][0] 139 | 140 | # examples = read_examples(sentence, idx) 141 | # sentence_features = convert_examples_to_features( 142 | # examples=examples, seq_length=self.seq_len, tokenizer=self.tokenizer) 143 | # word_id = sentence_features[0].input_ids 144 | # word_mask = sentence_features[0].input_mask 145 | 146 | phrase_masks = [] 147 | phrase_context_masks = [] 148 | tokenized_phrases = [] 149 | phrase_pos_l = [] 150 | phrase_pos_r = [] 151 | for p, char_pos_l in zip(phrases, phrase_char_pos_l): 152 | tokenized_phrase = self.tokenizer( 153 | p, 154 | padding='max_length', 155 | max_length=self.phrase_seq_len, 156 | return_tensors='np', 157 | ) 158 | tokenized_phrases.append(tokenized_phrase['input_ids'][0]) 159 | phrase_masks.append(tokenized_phrase['attention_mask'][0]) 160 | 161 | # set up phrase_pos 162 | phrase_char_len = p.__len__() 163 | pos_l = tokenized_sentence.char_to_token(char_pos_l) 164 | pos_r = tokenized_sentence.char_to_token(char_pos_l + phrase_char_len - 1) 165 | assert pos_l is not None and pos_r is not None 166 | # Tips for roberta: Ġ means the end of a new token 167 | # So assert from the second character 168 | # assert tokenized_sentence.tokens()[pos_l+1:pos_r] == tokenized_phrase.tokens()[2:1+pos_r-pos_l],\ 169 | # (tokenized_sentence.tokens()[pos_l:pos_r], tokenized_phrase.tokens(), pos_l, pos_r) 170 | phrase_pos_l.append(pos_l) 171 | phrase_pos_r.append(pos_r+1) 172 | 173 | for i in range(len(phrases), self.num_phrases): 174 | tokenized_phrase = self.tokenizer( 175 | "", 176 | padding='max_length', 177 | max_length=self.phrase_seq_len, 178 | return_tensors='np', 179 | ) 180 | tokenized_phrases.append(tokenized_phrase['input_ids'][0]) 181 | phrase_masks.append(tokenized_phrase['attention_mask'][0]) 182 | phrase_pos_l.append(0) 183 | phrase_pos_r.append(1) 184 | 185 | h, w, c = img.shape 186 | samples = { 187 | "img": img, 188 | "sentence": np.array(word_id, dtype=int), 189 | "sentence_mask": np.array(word_mask, dtype=bool), 190 | "phrase": np.array(tokenized_phrases, dtype=int), 191 | "phrase_mask": np.array(phrase_masks, dtype=bool), 192 | "phrase_pos_l": np.array(phrase_pos_l, dtype=int), 193 | "phrase_pos_r": np.array(phrase_pos_r, dtype=int) 194 | } 195 | 196 | image_id = int(img_file.split('.')[0].split('_')[-1]) 197 | target = { 198 | "image_id": image_id, 199 | "boxes": np.array(bbox, dtype=np.float32), 200 | "labels": [0], 201 | 'dataset_id': idx, 202 | "orig_size": np.array([h, w], dtype=np.int) 203 | } 204 | return samples, target 205 | # if self.testmode: 206 | # return img, np.array(word_id, dtype=int), np.array(word_mask, dtype=int), \ 207 | # np.array(bbox, dtype=np.float32), np.array(ratio, dtype=np.float32), \ 208 | # 
np.array(dw, dtype=np.float32), np.array(dh, dtype=np.float32), self.images[idx][0] 209 | # else: 210 | # return img, np.array(word_id, dtype=int), np.array(word_mask, dtype=int), \ 211 | # np.array(bbox, dtype=np.float32) 212 | 213 | class ReferSegDataset(data.Dataset): 214 | SUPPORTED_DATASETS = { 215 | 'refcoco_unc': { 216 | 'splits': ('train', 'val', 'trainval', 'testA', 'testB'), 217 | 'params': {'dataset': 'refcoco', 'split_by': 'unc'} 218 | }, 219 | 'refcoco+_unc': { 220 | 'splits': ('train', 'val', 'trainval', 'testA', 'testB'), 221 | 'params': {'dataset': 'refcoco+', 'split_by': 'unc'} 222 | }, 223 | 'refcocog_google': { 224 | 'splits': ('train', 'val'), 225 | 'params': {'dataset': 'refcocog', 'split_by': 'google'} 226 | }, 227 | 'refcocog_umd': { 228 | 'splits': ('train', 'val', 'test'), 229 | 'params': {'dataset': 'refcocog', 'split_by': 'umd'} 230 | } 231 | } 232 | 233 | def __init__(self, data_root, im_dir, seg_dir, dataset='refcoco_unc', 234 | split='train', max_query_len=40, bert_model='bert-base-uncased'): 235 | self.images = [] 236 | self.data_root = data_root 237 | self.im_dir = im_dir 238 | self.dataset = dataset 239 | self.query_len = max_query_len 240 | self.split = split 241 | self.tokenizer = build_bert_tokenizer(bert_model) 242 | 243 | dataset_dir = self.dataset.split('_')[0] 244 | annotation_path = osp.join(data_root, dataset_dir) 245 | self.seg_dir = osp.join(seg_dir, dataset_dir) 246 | 247 | valid_splits = self.SUPPORTED_DATASETS[self.dataset]['splits'] 248 | if split not in valid_splits: 249 | raise ValueError( 250 | 'Dataset {0} does not have split {1}'.format( 251 | self.dataset, split)) 252 | 253 | splits = [split] 254 | splits = ['train', 'val'] if split == 'trainval' else [split] 255 | for split in splits: 256 | imgset_file = '{0}_{1}.pth'.format(self.dataset, split) 257 | imgset_path = osp.join(annotation_path, imgset_file) 258 | self.images += torch.load(imgset_path) 259 | 260 | def pull_item(self, idx): 261 | img_file, seg_file, bbox, phrase = self.images[idx] 262 | ## box format: x1y1x2y2 263 | bbox = np.array(bbox, dtype=int) 264 | img = cv2.imread(osp.join(self.im_dir, img_file)) 265 | mask = np.load(osp.join(self.seg_dir, seg_file)) 266 | assert img.shape[:2] == mask.shape[:2] 267 | ## duplicate channel if gray image 268 | if img.shape[-1] > 1: 269 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 270 | else: 271 | img = np.stack([img] * 3) 272 | return img, mask, phrase, bbox, img_file 273 | 274 | def tokenize_phrase(self, phrase): 275 | return self.corpus.tokenize(phrase, self.query_len) 276 | 277 | def untokenize_word_vector(self, words): 278 | return self.corpus.dictionary[words] 279 | 280 | def __len__(self): 281 | return len(self.images) 282 | 283 | def __getitem__(self, idx): 284 | img, mask, phrase, bbox, img_file = self.pull_item(idx) 285 | # phrase = phrase.decode("utf-8").encode().lower() 286 | phrase = phrase.lower() 287 | 288 | # encode phrase to bert input 289 | tokenized_sentence = self.tokenizer( 290 | phrase, 291 | padding='max_length', 292 | max_length=self.query_len, 293 | truncation=True, 294 | return_tensors='pt', 295 | ) 296 | word_id = tokenized_sentence['input_ids'][0] 297 | word_mask = tokenized_sentence['attention_mask'][0] 298 | 299 | h, w, c = img.shape 300 | 301 | samples = { 302 | "img": img, 303 | "sentence": np.array(word_id, dtype=int), 304 | "sentence_mask": np.array(word_mask, dtype=int) 305 | } 306 | 307 | mask = mask[None, :, :] 308 | image_id = int(img_file.split('.')[0].split('_')[-1]) 309 | target = { 310 | 
"image_id": image_id, 311 | 'dataset_id': idx, 312 | "boxes": np.array([bbox], dtype=np.float32), 313 | "labels": [0], 314 | "masks": mask, 315 | "orig_size": np.array([h, w], dtype=np.int) 316 | } 317 | return samples, target 318 | 319 | -------------------------------------------------------------------------------- /datasets/grounding_datasets/resc_refer_dataset.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Copied from https://github.com/zyang-ur/ReSC/blob/e4022f87bfd11200b67c4509bb9746640834ceae/utils/transforms.py 5 | 6 | ReferIt, UNC, UNC+ and GRef referring image segmentation PyTorch dataset. 7 | Define and group batches of images, segmentations and queries. 8 | Based on: 9 | https://github.com/chenxi116/TF-phrasecut-public/blob/master/build_batches.py 10 | """ 11 | 12 | import os 13 | import sys 14 | import cv2 15 | import json 16 | import uuid 17 | import tqdm 18 | import math 19 | import torch 20 | import random 21 | # import h5py 22 | import numpy as np 23 | import os.path as osp 24 | import scipy.io as sio 25 | import torch.utils.data as data 26 | from collections import OrderedDict 27 | sys.path.append('.') 28 | import operator 29 | # import util 30 | from util.word_utils import Corpus 31 | 32 | import argparse 33 | import logging 34 | import re 35 | 36 | 37 | from transformers import BertTokenizerFast, RobertaTokenizerFast 38 | # from transformers import BertTokenizer,BertModel 39 | from util.transforms import letterbox, random_affine 40 | from datasets.lang_utils import convert_examples_to_features, read_examples 41 | # sys.modules['utils'] = utils 42 | 43 | def build_bert_tokenizer(bert_model): 44 | if bert_model.split('-')[0] == 'roberta': 45 | lang_backbone = RobertaTokenizerFast.from_pretrained(bert_model, do_lower_case=True, do_basic_tokenize=False) 46 | else: 47 | lang_backbone = BertTokenizerFast.from_pretrained(bert_model, do_lower_case=True, do_basic_tokenize=False) 48 | return lang_backbone 49 | 50 | cv2.setNumThreads(0) 51 | 52 | 53 | class DatasetNotFoundError(Exception): 54 | pass 55 | 56 | 57 | class ReferDataset(data.Dataset): 58 | SUPPORTED_DATASETS = { 59 | 'referit': {'splits': ('train', 'val', 'trainval', 'test')}, 60 | 'unc': { 61 | 'splits': ('train', 'val', 'trainval', 'testA', 'testB'), 62 | 'params': {'dataset': 'refcoco', 'split_by': 'unc'} 63 | }, 64 | 'unc+': { 65 | 'splits': ('train', 'val', 'trainval', 'testA', 'testB'), 66 | 'params': {'dataset': 'refcoco+', 'split_by': 'unc'} 67 | }, 68 | 'gref': { 69 | 'splits': ('train', 'val'), 70 | 'params': {'dataset': 'refcocog', 'split_by': 'google'} 71 | }, 72 | 'gref_umd': { 73 | 'splits': ('train', 'val', 'test'), 74 | 'params': {'dataset': 'refcocog', 'split_by': 'umd'} 75 | }, 76 | 'flickr': {'splits': ('train', 'val', 'trainval', 'test')}, 77 | 'vg': {'splits': ('all')} 78 | } 79 | 80 | def __init__(self, data_root, im_dir, dataset='referit', 81 | split='train', max_query_len=128, lstm=False, bert_model='bert-base-uncased'): 82 | self.images = [] 83 | self.data_root = data_root 84 | self.im_dir = im_dir 85 | self.dataset = dataset 86 | self.query_len = max_query_len 87 | self.lstm = lstm 88 | self.split = split 89 | self.tokenizer = build_bert_tokenizer(bert_model) 90 | 91 | if not self.exists_dataset(): 92 | # self.process_dataset() 93 | print('Please download index cache to data folder: \n \ 94 | https://drive.google.com/open?id=1cZI562MABLtAzM6YU4WmKPFFguuVr0lZ') 95 | exit(0) 96 | 97 | annotation_path = 
osp.join(data_root, self.dataset) 98 | valid_splits = self.SUPPORTED_DATASETS[self.dataset]['splits'] 99 | 100 | if self.lstm: 101 | self.corpus = Corpus() 102 | corpus_path = osp.join(annotation_path, 'corpus.pth') 103 | self.corpus = torch.load(corpus_path) 104 | 105 | if split not in valid_splits: 106 | raise ValueError( 107 | 'Dataset {0} does not have split {1}'.format( 108 | self.dataset, split)) 109 | 110 | splits = [split] 111 | if self.dataset != 'referit': 112 | splits = ['train', 'val'] if split == 'trainval' else [split] 113 | for split in splits: 114 | imgset_file = '{0}_{1}.pth'.format(self.dataset, split) 115 | imgset_path = osp.join(annotation_path, imgset_file) 116 | self.images += torch.load(imgset_path) 117 | 118 | def exists_dataset(self): 119 | return osp.exists(osp.join(self.data_root, self.dataset)) 120 | 121 | def pull_item(self, idx): 122 | if self.dataset in ['flickr', 'vg']: 123 | img_file, bbox, phrase = self.images[idx] 124 | else: 125 | img_file, _, bbox, phrase, attri = self.images[idx] 126 | ## box format: to x1y1x2y2 127 | if not (self.dataset == 'referit' or self.dataset == 'flickr'): 128 | bbox = np.array(bbox, dtype=int) 129 | bbox[2], bbox[3] = bbox[0]+bbox[2], bbox[1]+bbox[3] 130 | else: 131 | bbox = np.array(bbox, dtype=int) 132 | 133 | img_path = osp.join(self.im_dir, img_file) 134 | img = cv2.imread(img_path) 135 | ## duplicate channel if gray image 136 | if img.shape[-1] > 1: 137 | img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) 138 | else: 139 | img = np.stack([img] * 3) 140 | return img, phrase, bbox, img_file 141 | 142 | def tokenize_phrase(self, phrase): 143 | return self.corpus.tokenize(phrase, self.query_len) 144 | 145 | def untokenize_word_vector(self, words): 146 | return self.corpus.dictionary[words] 147 | 148 | def __len__(self): 149 | return len(self.images) 150 | 151 | def __getitem__(self, idx): 152 | img, phrase, bbox, img_file = self.pull_item(idx) 153 | # phrase = phrase.decode("utf-8").encode().lower() 154 | phrase = phrase.lower() 155 | 156 | # encode phrase to bert input 157 | # Enable truncation in this case 158 | tokenized_sentence = self.tokenizer( 159 | phrase, 160 | padding='max_length', 161 | max_length=self.query_len, 162 | truncation=True, 163 | return_tensors='pt', 164 | ) 165 | word_id = tokenized_sentence['input_ids'][0] 166 | word_mask = tokenized_sentence['attention_mask'][0] 167 | 168 | h, w, c = img.shape 169 | 170 | samples = { 171 | "img": img, 172 | "sentence": np.array(word_id, dtype=int), 173 | "sentence_mask": np.array(word_mask, dtype=int) 174 | } 175 | 176 | image_id = int(img_file.split('.')[0].split('_')[-1]) 177 | target = { 178 | "image_id": image_id, 179 | "boxes": np.array([bbox], dtype=np.float32), 180 | "labels": [0], 181 | 'dataset_id': idx, 182 | "orig_size": np.array([h, w], dtype=np.int) 183 | } 184 | return samples, target -------------------------------------------------------------------------------- /datasets/lang_utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | def _truncate_seq_pair(tokens_a, tokens_b, max_length): 4 | """Truncates a sequence pair in place to the maximum length.""" 5 | while True: 6 | total_length = len(tokens_a) + len(tokens_b) 7 | if total_length <= max_length: 8 | break 9 | if len(tokens_a) > len(tokens_b): 10 | tokens_a.pop() 11 | else: 12 | tokens_b.pop() 13 | 14 | def read_examples(input_line, unique_id): 15 | """ 16 | Note from JOJO: this is copied from bert source code 17 | refer to: 18 | 
https://daiwk.github.io/posts/nlp-bert-code-annotated-application.html#inputexample 19 | for understanding 20 | Read a list of `InputExample`s from an input file.""" 21 | examples = [] 22 | # unique_id = 0 23 | line = input_line #reader.readline() 24 | # if not line: 25 | # break 26 | line = line.strip() 27 | text_a = None 28 | text_b = None 29 | m = re.match(r"^(.*) \|\|\| (.*)$", line) 30 | if m is None: 31 | text_a = line 32 | else: 33 | text_a = m.group(1) 34 | text_b = m.group(2) 35 | examples.append( 36 | InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b)) 37 | # unique_id += 1 38 | return examples 39 | 40 | ## Bert text encoding 41 | class InputExample(object): 42 | def __init__(self, unique_id, text_a, text_b): 43 | self.unique_id = unique_id 44 | self.text_a = text_a 45 | self.text_b = text_b 46 | 47 | class InputFeatures(object): 48 | """A single set of features of data.""" 49 | def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids): 50 | self.unique_id = unique_id 51 | self.tokens = tokens 52 | self.input_ids = input_ids 53 | self.input_mask = input_mask 54 | self.input_type_ids = input_type_ids 55 | 56 | def convert_examples_to_features(examples, seq_length, tokenizer): 57 | """Loads a data file into a list of `InputBatch`s.""" 58 | features = [] 59 | for (ex_index, example) in enumerate(examples): 60 | tokens_a = tokenizer.tokenize(example.text_a) 61 | 62 | tokens_b = None 63 | if example.text_b: 64 | tokens_b = tokenizer.tokenize(example.text_b) 65 | 66 | if tokens_b: 67 | # Modifies `tokens_a` and `tokens_b` in place so that the total 68 | # length is less than the specified length. 69 | # Account for [CLS], [SEP], [SEP] with "- 3" 70 | _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3) 71 | else: 72 | # Account for [CLS] and [SEP] with "- 2" 73 | if len(tokens_a) > seq_length - 2: 74 | tokens_a = tokens_a[0:(seq_length - 2)] 75 | tokens = [] 76 | input_type_ids = [] 77 | tokens.append("[CLS]") 78 | input_type_ids.append(0) 79 | for token in tokens_a: 80 | tokens.append(token) 81 | input_type_ids.append(0) 82 | tokens.append("[SEP]") 83 | input_type_ids.append(0) 84 | 85 | if tokens_b: 86 | for token in tokens_b: 87 | tokens.append(token) 88 | input_type_ids.append(1) 89 | tokens.append("[SEP]") 90 | input_type_ids.append(1) 91 | 92 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 93 | 94 | # The mask has 1 for real tokens and 0 for padding tokens. Only real 95 | # tokens are attended to. 96 | input_mask = [1] * len(input_ids) 97 | 98 | # Zero-pad up to the sequence length. 
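        # Padding keeps every feature at a fixed length of `seq_length`, with
        # `input_mask` marking which positions hold real tokens. A rough sketch for
        # seq_length = 8 and a single-phrase query (the word-piece ids are
        # illustrative placeholders, except [CLS]=101 and [SEP]=102, which are
        # bert-base-uncased's special-token ids; [PAD] is id 0):
        #   tokens:         [CLS]  left   guy   [SEP]  (pad)  (pad)  (pad)  (pad)
        #   input_ids:      [101,  xxxx,  xxxx, 102,   0,     0,     0,     0]
        #   input_mask:     [1,    1,     1,    1,     0,     0,     0,     0]
        #   input_type_ids: [0,    0,     0,    0,     0,     0,     0,     0]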
99 | while len(input_ids) < seq_length: 100 | input_ids.append(0) 101 | input_mask.append(0) 102 | input_type_ids.append(0) 103 | 104 | assert len(input_ids) == seq_length 105 | assert len(input_mask) == seq_length 106 | assert len(input_type_ids) == seq_length 107 | features.append( 108 | InputFeatures( 109 | unique_id=example.unique_id, 110 | tokens=tokens, 111 | input_ids=input_ids, 112 | input_mask=input_mask, 113 | input_type_ids=input_type_ids)) 114 | return features -------------------------------------------------------------------------------- /datasets/refer_multiphrase.py: -------------------------------------------------------------------------------- 1 | # Builder for visual grouding datasets 2 | from .grounding_datasets import FlickrMultiPhraseDataset 3 | from PIL import Image 4 | import datasets.transforms as T 5 | import torch 6 | 7 | class flickr30k(FlickrMultiPhraseDataset): 8 | def __init__(self, data_root, im_dir, split, transforms, 9 | max_seq_len=90, bert_model='bert-base-uncased', lstm=False): 10 | super(flickr30k, self).__init__( 11 | data_root=data_root, 12 | im_dir=im_dir, 13 | dataset='flickr', 14 | split=split, 15 | max_seq_len=max_seq_len, 16 | lstm=lstm, 17 | bert_model=bert_model 18 | ) 19 | self._transforms = transforms 20 | 21 | def __getitem__(self, idx): 22 | input_sample, target = super(flickr30k, self).__getitem__(idx) 23 | target = {k: torch.as_tensor(v) for k, v in target.items()} 24 | # target['boxes'] = torch.as_tensor(target['boxes']) 25 | img = Image.fromarray(input_sample["img"]) 26 | if self._transforms is not None: 27 | img, target = self._transforms(img, target) 28 | input_sample["img"] = img 29 | return input_sample, target 30 | 31 | 32 | def make_refer_transforms(img_size=224 ,max_img_size=1333 ,test=False): 33 | normalize = T.Compose([ 34 | T.ToTensor(), 35 | T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) 36 | ]) 37 | 38 | if not test: 39 | return T.Compose([ 40 | # T.RandomHorizontalFlip(), 41 | T.RandomIntensitySaturation(), 42 | T.RandomResize([img_size], max_size=max_img_size), 43 | # T.RandomAffineTransform(degrees=(-5,5), translate=(0.1, 0.1), 44 | # scale=(0.9, 1.1)), 45 | normalize 46 | ]) 47 | else: 48 | return T.Compose([ 49 | T.RandomResize([img_size], max_size=max_img_size), 50 | normalize 51 | ]) 52 | 53 | 54 | def build_flickr30k(split='train', 55 | data_root="./data/annotations", 56 | im_dir="./data/flickr30k/f30k_images", 57 | bert_model='bert-base-uncased', 58 | img_size=224, 59 | max_img_size=1333): 60 | istest = split != 'train' 61 | return flickr30k( 62 | data_root=data_root, 63 | im_dir=im_dir, 64 | transforms=make_refer_transforms(img_size, max_img_size, test=istest), 65 | split=split, 66 | bert_model=bert_model 67 | ) 68 | 69 | 70 | if __name__ == "__main__": 71 | # comment out normalize in make_refer_transforms when testing 72 | from PIL import Image, ImageDraw 73 | # flickr 74 | d_train = build_flickr30k(split='train', bert_model="./configs/VinVL_VQA_base") 75 | d_val = build_flickr30k(split='val', bert_model="./configs/VinVL_VQA_base") 76 | d_test = build_flickr30k(split='test', bert_model="./configs/VinVL_VQA_base") 77 | print(f"flickr30k datasets have : {len(d_train)} Training samples") 78 | print(f"flickr30k datasets have : {len(d_test)} Testing samples") 79 | for i in range(0, 200, 50): 80 | samples, target = d_train[i] 81 | img = samples['img'] 82 | img1 = ImageDraw.Draw(img) 83 | print(img) 84 | print(target['boxes']) 85 | img1.rectangle(target['boxes'][0].numpy().tolist(), outline='red') 86 | # if 
target['boxes'].shape[0] > 1: 87 | img1.rectangle(target['boxes'][1].numpy().tolist(), outline='blue') 88 | img.save(f"./exps/flickr_train_sample{i}.jpg") 89 | 90 | # # refcoco 91 | # d_train = build_refcoco(split='trainval', version='refcoco') 92 | # d_testA = build_refcoco(split='testA', version='refcoco') 93 | # d_testB = build_refcoco(split='testB', version='refcoco') 94 | # print(f"Refcoco datasets have : {len(d_train)} Training samples") 95 | # print(f"Refcoco datasets have : {len(d_testA)}/{len(d_testB)} Testing samples") 96 | # for i in range(0, 200, 50): 97 | # samples, target = d_train[i] 98 | # img = samples['img'] 99 | # img1 = ImageDraw.Draw(img) 100 | # img1.rectangle(target['boxes'][0].numpy().tolist(), outline='red') 101 | # img.save(f"./exps/refcoco_train_sample{i}.jpg") 102 | 103 | # # refcoco 104 | # d_train = build_refcoco(split='trainval', version='refcoco+') 105 | # d_testA = build_refcoco(split='testA', version='refcoco+') 106 | # d_testB = build_refcoco(split='testB', version='refcoco+') 107 | # print(f"Refcoco+ datasets have : {len(d_train)} Training samples") 108 | # print(f"Refcoco+ datasets have : {len(d_testA)}/{len(d_testB)} Testing samples") 109 | # for i in range(0, 200, 50): 110 | # samples, target = d_train[i] 111 | # img = samples['img'] 112 | # img1 = ImageDraw.Draw(img) 113 | # img1.rectangle(target['boxes'][0].numpy().tolist(), outline='red') 114 | # img.save(f"./exps/refcoco+_train_sample{i}.jpg") 115 | 116 | # # referit 117 | # d_train = build_referit(split='trainval') 118 | # d_test = build_referit(split='test') 119 | # print(f"ReferIt datasets have : {len(d_train)} Training samples") 120 | # print(f"ReferIt datasets have : {len(d_test)} Testing samples") 121 | # for i in range(0, 200, 50): 122 | # samples, target = d_train[i] 123 | # img = samples['img'] 124 | # img1 = ImageDraw.Draw(img) 125 | # img1.rectangle(target['boxes'][0].numpy().tolist(), outline='red') 126 | # img.save(f"./exps/referit_train_sample{i}.jpg") 127 | 128 | # class GeneralReferDataset(torch.utils.data.Dataset): 129 | # """ 130 | # A collection of datasets. 
131 | # """ 132 | # def __init__(self, datasets): 133 | # super(GeneralReferDataset, self).__init__() 134 | # self.datasets = datasets 135 | # self.dataset_len = [len(x) for x in datasets] 136 | 137 | # def __getitem__(self, idx): 138 | # for i, dataset in enumerate(self.datasets): 139 | # if idx >= self.dataset_len[i]: 140 | # idx = idx - self.dataset_len[i] 141 | # else: 142 | # return dataset.__getitem__(idx) 143 | 144 | # def __len__(self): 145 | # return sum(self.dataset_len) 146 | 147 | # class RefCOCO(ReferDataset): 148 | # def __init__(self, data_root, im_dir, split, transforms, version="unc", 149 | # max_query_len=128, lstm=False, bert_model='bert-base-uncased'): 150 | # super(RefCOCO, self).__init__( 151 | # data_root=data_root, 152 | # im_dir=im_dir, 153 | # dataset=version, 154 | # split=split, 155 | # max_query_len=max_query_len, 156 | # lstm=lstm, 157 | # bert_model=bert_model 158 | # ) 159 | # self._transforms = transforms 160 | 161 | # def __getitem__(self, idx): 162 | # input_sample, target = super(RefCOCO, self).__getitem__(idx) 163 | # target = {k: torch.as_tensor(v) for k, v in target.items()} 164 | # # target['boxes'] = torch.as_tensor(target['boxes']) 165 | # img = Image.fromarray(input_sample["img"]) 166 | # if self._transforms is not None: 167 | # img, target = self._transforms(img, target) 168 | # input_sample["img"] = img 169 | # return input_sample, target 170 | 171 | 172 | # class ReferIt(ReferDataset): 173 | # def __init__(self, data_root, im_dir, split, transforms, 174 | # max_query_len=128, lstm=False, bert_model='bert-base-uncased'): 175 | # super(ReferIt, self).__init__( 176 | # data_root=data_root, 177 | # im_dir=im_dir, 178 | # dataset='referit', 179 | # split=split, 180 | # max_query_len=max_query_len, 181 | # lstm=lstm, 182 | # bert_model=bert_model 183 | # ) 184 | # self._transforms = transforms 185 | 186 | # def __getitem__(self, idx): 187 | # input_sample, target = super(ReferIt, self).__getitem__(idx) 188 | # target = {k: torch.as_tensor(v) for k, v in target.items()} 189 | # # target['boxes'] = torch.as_tensor(target['boxes']) 190 | # img = Image.fromarray(input_sample["img"]) 191 | # if self._transforms is not None: 192 | # img, target = self._transforms(img, target) 193 | # input_sample["img"] = img 194 | # return input_sample, target 195 | # def build_referit(split='train', 196 | # data_root="./data/annotations", 197 | # im_dir="./data/referit/images"): 198 | # istest = split != 'train' 199 | # return ReferIt( 200 | # data_root=data_root, 201 | # im_dir=im_dir, 202 | # transforms=make_refer_transforms(test=istest), 203 | # split=split, 204 | # lstm=False 205 | # ) 206 | 207 | 208 | # def build_refcoco(split='train', 209 | # version='refcoco', 210 | # data_root="./data/annotations", 211 | # im_dir="./data/refcoco/train2014"): 212 | # istest = split != 'train' 213 | # if version == 'refcoco': 214 | # version = 'unc' 215 | # elif version == 'refcoco+': 216 | # version = 'unc+' 217 | # elif version == 'refcocog': 218 | # version = 'gref' 219 | # else: 220 | # raise NotImplementedError 221 | 222 | # return RefCOCO( 223 | # data_root=data_root, 224 | # im_dir=im_dir, 225 | # version=version, 226 | # transforms=make_refer_transforms(test=istest), 227 | # split=split, 228 | # lstm=False 229 | # ) 230 | 231 | 232 | # def build_refer_collections(): 233 | # flickr30k_d = build_flickr30k(split='train') 234 | # refcoco_d = build_refcoco(split='trainval', version='refcoco') 235 | # refcocop_d = build_refcoco(split='trainval', version='refcoco+') 236 | # 
referit = build_referit(split='trainval') 237 | # return GeneralReferDataset(datasets=[flickr30k_d, refcoco_d, refcocop_d, referit]) -------------------------------------------------------------------------------- /datasets/refer_resc.py: -------------------------------------------------------------------------------- 1 | # Builder for visual grouding datasets 2 | from datasets.grounding_datasets.resc_refer_dataset import ReferDataset 3 | from PIL import Image 4 | import datasets.transforms as T 5 | import torch 6 | 7 | class GeneralReferDataset(torch.utils.data.Dataset): 8 | """ 9 | A collection of datasets. 10 | """ 11 | def __init__(self, datasets): 12 | super(GeneralReferDataset, self).__init__() 13 | self.datasets = datasets 14 | self.dataset_len = [len(x) for x in datasets] 15 | 16 | def __getitem__(self, idx): 17 | for i, dataset in enumerate(self.datasets): 18 | if idx >= self.dataset_len[i]: 19 | idx = idx - self.dataset_len[i] 20 | else: 21 | return dataset.__getitem__(idx) 22 | 23 | def __len__(self): 24 | return sum(self.dataset_len) 25 | 26 | class flickr30k(ReferDataset): 27 | def __init__(self, data_root, im_dir, split, transforms, 28 | max_query_len=40, lstm=False, bert_model='bert-base-uncased'): 29 | super(flickr30k, self).__init__( 30 | data_root=data_root, 31 | im_dir=im_dir, 32 | dataset='flickr', 33 | split=split, 34 | max_query_len=max_query_len, 35 | lstm=lstm, 36 | bert_model=bert_model 37 | ) 38 | self._transforms = transforms 39 | 40 | def __getitem__(self, idx): 41 | input_sample, target = super(flickr30k, self).__getitem__(idx) 42 | target = {k: torch.as_tensor(v) for k, v in target.items()} 43 | # target['boxes'] = torch.as_tensor(target['boxes']) 44 | img = Image.fromarray(input_sample["img"]) 45 | if self._transforms is not None: 46 | img, target = self._transforms(img, target) 47 | input_sample["img"] = img 48 | return input_sample, target 49 | 50 | class RefCOCO(ReferDataset): 51 | def __init__(self, data_root, im_dir, split, transforms, version="unc", 52 | max_query_len=128, lstm=False, bert_model='bert-base-uncased'): 53 | super(RefCOCO, self).__init__( 54 | data_root=data_root, 55 | im_dir=im_dir, 56 | dataset=version, 57 | split=split, 58 | max_query_len=max_query_len, 59 | lstm=lstm, 60 | bert_model=bert_model 61 | ) 62 | self._transforms = transforms 63 | 64 | def __getitem__(self, idx): 65 | input_sample, target = super(RefCOCO, self).__getitem__(idx) 66 | target = {k: torch.as_tensor(v) for k, v in target.items()} 67 | # target['boxes'] = torch.as_tensor(target['boxes']) 68 | img = Image.fromarray(input_sample["img"]) 69 | if self._transforms is not None: 70 | img, target = self._transforms(img, target) 71 | input_sample["img"] = img 72 | return input_sample, target 73 | 74 | 75 | class ReferIt(ReferDataset): 76 | def __init__(self, data_root, im_dir, split, transforms, 77 | max_query_len=128, lstm=False, bert_model='bert-base-uncased'): 78 | super(ReferIt, self).__init__( 79 | data_root=data_root, 80 | im_dir=im_dir, 81 | dataset='referit', 82 | split=split, 83 | max_query_len=max_query_len, 84 | lstm=lstm, 85 | bert_model=bert_model 86 | ) 87 | self._transforms = transforms 88 | 89 | def __getitem__(self, idx): 90 | input_sample, target = super(ReferIt, self).__getitem__(idx) 91 | target = {k: torch.as_tensor(v) for k, v in target.items()} 92 | # target['boxes'] = torch.as_tensor(target['boxes']) 93 | img = Image.fromarray(input_sample["img"]) 94 | if self._transforms is not None: 95 | img, target = self._transforms(img, target) 96 | 
input_sample["img"] = img 97 | return input_sample, target 98 | 99 | 100 | def make_refer_transforms(img_size=224 ,max_img_size=1333 ,test=False): 101 | normalize = T.Compose([ 102 | T.ToTensor(), 103 | T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) 104 | ]) 105 | 106 | if not test: 107 | return T.Compose([ 108 | # T.RandomHorizontalFlip(), 109 | T.RandomIntensitySaturation(), 110 | T.RandomResize([img_size], max_size=max_img_size), 111 | # T.RandomAffineTransform(degrees=(-5,5), translate=(0.1, 0.1), 112 | # scale=(0.9, 1.1)), 113 | normalize 114 | ]) 115 | else: 116 | return T.Compose([ 117 | T.RandomResize([img_size], max_size=max_img_size), 118 | normalize 119 | ]) 120 | 121 | 122 | def build_flickr30k_resc( 123 | split='train', 124 | data_root="./data/annotations_resc", 125 | im_dir="./data/flickr30k/f30k_images", 126 | img_size=224, 127 | max_img_size=1333, 128 | max_query_len=40, 129 | bert_model='bert-base-uncased'): 130 | istest = not split in ['train', 'trainval'] 131 | return flickr30k( 132 | data_root=data_root, 133 | im_dir=im_dir, 134 | transforms=make_refer_transforms(img_size, max_img_size, test=istest), 135 | split=split, 136 | max_query_len=max_query_len, 137 | bert_model=bert_model 138 | ) 139 | 140 | def build_referit_resc( 141 | split='train', 142 | data_root="./data/annotations_resc", 143 | im_dir="./data/referit/images", 144 | max_query_len=40, 145 | img_size=224, 146 | max_img_size=1333, 147 | bert_model='bert-base-uncased'): 148 | istest = not split in ['train', 'trainval'] 149 | return ReferIt( 150 | data_root=data_root, 151 | im_dir=im_dir, 152 | transforms=make_refer_transforms(img_size, max_img_size, test=istest), 153 | split=split, 154 | max_query_len=max_query_len, 155 | lstm=False, 156 | bert_model=bert_model 157 | ) 158 | 159 | 160 | def build_refcoco_resc( 161 | split='train', 162 | version='unc', 163 | data_root="./data/annotations_resc", 164 | im_dir="./data/refcoco/train2014", 165 | max_query_len=40, 166 | img_size=224, 167 | max_img_size=1333, 168 | bert_model='bert-base-uncased'): 169 | istest = not split in ['train', 'trainval'] 170 | return RefCOCO( 171 | data_root=data_root, 172 | im_dir=im_dir, 173 | version=version, 174 | transforms=make_refer_transforms(img_size, max_img_size, test=istest), 175 | split=split, 176 | max_query_len=max_query_len, 177 | lstm=False, 178 | bert_model=bert_model 179 | ) 180 | 181 | 182 | def build_visualgenome( 183 | split='all', 184 | data_root="./data/annotations_resc", 185 | im_dir="./data/visualgenome/VG_100K", 186 | max_query_len=40, 187 | img_size=224, 188 | max_img_size=1333, 189 | bert_model='bert-base-uncased'): 190 | istest = False 191 | return RefCOCO( 192 | data_root=data_root, 193 | im_dir=im_dir, 194 | version='vg', 195 | transforms=make_refer_transforms(img_size, max_img_size, test=istest), 196 | split=split, 197 | max_query_len=max_query_len, 198 | lstm=False, 199 | bert_model=bert_model 200 | ) 201 | 202 | # def build_refer_collections(): 203 | # flickr30k_d = build_flickr30k(split='train') 204 | # refcoco_d = build_refcoco(split='trainval', version='refcoco') 205 | # refcocop_d = build_refcoco(split='trainval', version='refcoco+') 206 | # referit = build_referit(split='trainval') 207 | # return GeneralReferDataset(datasets=[flickr30k_d, refcoco_d, refcocop_d, referit]) 208 | 209 | if __name__ == "__main__": 210 | # comment out normalize in make_refer_transforms when testing 211 | from PIL import Image, ImageDraw 212 | # flickr 213 | d_train = build_flickr30k(split='train') 214 | d_val = 
build_flickr30k(split='val') 215 | d_test = build_flickr30k(split='test') 216 | print(f"flickr30k datasets have : {len(d_train)} Training samples") 217 | print(f"flickr30k datasets have : {len(d_test)} Testing samples") 218 | for i in range(0, 200, 50): 219 | samples, target = d_train[i] 220 | img = samples['img'] 221 | img1 = ImageDraw.Draw(img) 222 | img1.rectangle(target['boxes'][0].numpy().tolist(), outline='red') 223 | img.save(f"./exps/flickr_train_sample{i}.jpg") 224 | 225 | # refcoco 226 | d_train = build_refcoco(split='trainval', version='refcoco') 227 | d_testA = build_refcoco(split='testA', version='refcoco') 228 | d_testB = build_refcoco(split='testB', version='refcoco') 229 | print(f"Refcoco datasets have : {len(d_train)} Training samples") 230 | print(f"Refcoco datasets have : {len(d_testA)}/{len(d_testB)} Testing samples") 231 | for i in range(0, 200, 50): 232 | samples, target = d_train[i] 233 | img = samples['img'] 234 | img1 = ImageDraw.Draw(img) 235 | img1.rectangle(target['boxes'][0].numpy().tolist(), outline='red') 236 | img.save(f"./exps/refcoco_train_sample{i}.jpg") 237 | 238 | # refcoco 239 | d_train = build_refcoco(split='trainval', version='refcoco+') 240 | d_testA = build_refcoco(split='testA', version='refcoco+') 241 | d_testB = build_refcoco(split='testB', version='refcoco+') 242 | print(f"Refcoco+ datasets have : {len(d_train)} Training samples") 243 | print(f"Refcoco+ datasets have : {len(d_testA)}/{len(d_testB)} Testing samples") 244 | for i in range(0, 200, 50): 245 | samples, target = d_train[i] 246 | img = samples['img'] 247 | img1 = ImageDraw.Draw(img) 248 | img1.rectangle(target['boxes'][0].numpy().tolist(), outline='red') 249 | img.save(f"./exps/refcoco+_train_sample{i}.jpg") 250 | 251 | # referit 252 | d_train = build_referit(split='trainval') 253 | d_test = build_referit(split='test') 254 | print(f"ReferIt datasets have : {len(d_train)} Training samples") 255 | print(f"ReferIt datasets have : {len(d_test)} Testing samples") 256 | for i in range(0, 200, 50): 257 | samples, target = d_train[i] 258 | img = samples['img'] 259 | img1 = ImageDraw.Draw(img) 260 | img1.rectangle(target['boxes'][0].numpy().tolist(), outline='red') 261 | img.save(f"./exps/referit_train_sample{i}.jpg") -------------------------------------------------------------------------------- /datasets/refer_segmentation.py: -------------------------------------------------------------------------------- 1 | # Builder for visual grouding datasets 2 | from .grounding_datasets import ReferSegDataset 3 | from PIL import Image 4 | import datasets.transforms as T 5 | import torch 6 | 7 | class RefCOCO(ReferSegDataset): 8 | def __init__(self, data_root, im_dir, seg_dir, split, transforms, version="refcoco_unc", 9 | max_query_len=40, bert_model='bert-base-uncased'): 10 | super(RefCOCO, self).__init__( 11 | data_root=data_root, 12 | im_dir=im_dir, 13 | seg_dir=seg_dir, 14 | dataset=version, 15 | split=split, 16 | max_query_len=max_query_len, 17 | bert_model=bert_model 18 | ) 19 | self._transforms = transforms 20 | 21 | def __getitem__(self, idx): 22 | input_sample, target = super(RefCOCO, self).__getitem__(idx) 23 | target = {k: torch.as_tensor(v) for k, v in target.items()} 24 | # target['boxes'] = torch.as_tensor(target['boxes']) 25 | img = Image.fromarray(input_sample["img"]) 26 | if self._transforms is not None: 27 | img, target = self._transforms(img, target) 28 | input_sample["img"] = img 29 | return input_sample, target 30 | 31 | 32 | def make_refer_seg_transforms(img_size=224 
,max_img_size=1333 ,test=False): 33 | normalize = T.Compose([ 34 | T.ToTensor(), 35 | T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]) 36 | ]) 37 | 38 | if not test: 39 | return T.Compose([ 40 | # T.RandomHorizontalFlip(), 41 | T.RandomIntensitySaturation(), 42 | T.RandomResize([img_size], max_size=max_img_size), 43 | # T.RandomAffineTransform(degrees=(-5,5), translate=(0.1, 0.1), 44 | # scale=(0.9, 1.1)), 45 | normalize 46 | ]) 47 | else: 48 | return T.Compose([ 49 | T.RandomResize([img_size], max_size=max_img_size), 50 | normalize 51 | ]) 52 | 53 | 54 | def build_refcoco_segmentation( 55 | split='train', 56 | version='refcoco_unc', 57 | data_root="./data/refcoco/anns", 58 | im_dir="./data/refcoco/images/train2014", 59 | seg_dir="./data/refcoco/masks", 60 | img_size=224, 61 | max_img_size=1333, 62 | bert_model='bert-base-uncased' 63 | ): 64 | ''' 65 | 'refcoco_unc' 66 | 'refcoco+_unc' 67 | 'refcocog_google' 68 | 'refcocog_umd' 69 | ''' 70 | istest = split != 'train' 71 | 72 | return RefCOCO( 73 | data_root=data_root, 74 | im_dir=im_dir, 75 | seg_dir=seg_dir, 76 | version=version, 77 | transforms=make_refer_seg_transforms(img_size, max_img_size, test=istest), 78 | split=split, 79 | bert_model=bert_model 80 | ) 81 | 82 | if __name__ == "__main__": 83 | # comment out normalize in make_refer_transforms when testing 84 | from PIL import Image, ImageDraw 85 | import numpy as np 86 | # flickr 87 | d_train = build_refcoco_segmentation(split='train') 88 | d_val = build_refcoco_segmentation(split='val') 89 | d_test = build_refcoco_segmentation(split='testA') 90 | print(f"flickr30k datasets have : {len(d_train)} Training samples") 91 | print(f"flickr30k datasets have : {len(d_val)} Val samples") 92 | print(f"flickr30k datasets have : {len(d_test)} Testing samples") 93 | for i in range(0, 200, 50): 94 | samples, target = d_train[i] 95 | img = samples['img'] 96 | mask = target['masks'] 97 | img1 = ImageDraw.Draw(img) 98 | img1.rectangle(target['boxes'][0].numpy().tolist(), outline='red') 99 | img.save(f"./exps/refcoco_train_sample{i}.jpg") 100 | 101 | print(mask.shape, mask.dtype) 102 | mask = mask.numpy().astype(np.uint8)[0] * 255 103 | print(mask) 104 | mask = Image.fromarray(mask) 105 | mask.save(f"./exps/refcoco_mask_sample{i}.jpg") -------------------------------------------------------------------------------- /datasets/samplers.py: -------------------------------------------------------------------------------- 1 | import os 2 | import math 3 | import torch 4 | import torch.distributed as dist 5 | from torch.utils.data.sampler import Sampler 6 | 7 | 8 | class DistributedSampler(Sampler): 9 | """Sampler that restricts data loading to a subset of the dataset. 10 | It is especially useful in conjunction with 11 | :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each 12 | process can pass a DistributedSampler instance as a DataLoader sampler, 13 | and load a subset of the original dataset that is exclusive to it. 14 | .. note:: 15 | Dataset is assumed to be of constant size. 16 | Arguments: 17 | dataset: Dataset used for sampling. 18 | num_replicas (optional): Number of processes participating in 19 | distributed training. 20 | rank (optional): Rank of the current process within num_replicas. 
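        shuffle (optional): Whether to reshuffle the indices every epoch
            (default: True); call set_epoch() at the start of each epoch so
            the shuffle order changes.
        local_rank / local_size (optional): accepted for interface
            compatibility with NodeDistributedSampler below; not used by
            this class.
    A minimal usage sketch (assumes torch.distributed is already initialised;
    num_epochs, the batch size and collate_fn are placeholders):
        sampler = DistributedSampler(dataset)
        loader = DataLoader(dataset, batch_size=2, sampler=sampler, collate_fn=collate_fn)
        for epoch in range(num_epochs):
            sampler.set_epoch(epoch)   # re-seed the per-epoch shuffle
            for samples, targets in loader:
                ...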
21 | """ 22 | 23 | def __init__(self, dataset, num_replicas=None, rank=None, local_rank=None, local_size=None, shuffle=True): 24 | if num_replicas is None: 25 | if not dist.is_available(): 26 | raise RuntimeError("Requires distributed package to be available") 27 | num_replicas = dist.get_world_size() 28 | if rank is None: 29 | if not dist.is_available(): 30 | raise RuntimeError("Requires distributed package to be available") 31 | rank = dist.get_rank() 32 | self.dataset = dataset 33 | self.num_replicas = num_replicas 34 | self.rank = rank 35 | self.epoch = 0 36 | self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) 37 | self.total_size = self.num_samples * self.num_replicas 38 | self.shuffle = shuffle 39 | 40 | def __iter__(self): 41 | if self.shuffle: 42 | # deterministically shuffle based on epoch 43 | g = torch.Generator() 44 | g.manual_seed(self.epoch) 45 | indices = torch.randperm(len(self.dataset), generator=g).tolist() 46 | else: 47 | indices = torch.arange(len(self.dataset)).tolist() 48 | 49 | # add extra samples to make it evenly divisible 50 | indices += indices[: (self.total_size - len(indices))] 51 | assert len(indices) == self.total_size 52 | 53 | # subsample 54 | offset = self.num_samples * self.rank 55 | indices = indices[offset : offset + self.num_samples] 56 | assert len(indices) == self.num_samples 57 | 58 | return iter(indices) 59 | 60 | def __len__(self): 61 | return self.num_samples 62 | 63 | def set_epoch(self, epoch): 64 | self.epoch = epoch 65 | 66 | 67 | class NodeDistributedSampler(Sampler): 68 | """Sampler that restricts data loading to a subset of the dataset. 69 | It is especially useful in conjunction with 70 | :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each 71 | process can pass a DistributedSampler instance as a DataLoader sampler, 72 | and load a subset of the original dataset that is exclusive to it. 73 | .. note:: 74 | Dataset is assumed to be of constant size. 75 | Arguments: 76 | dataset: Dataset used for sampling. 77 | num_replicas (optional): Number of processes participating in 78 | distributed training. 79 | rank (optional): Rank of the current process within num_replicas. 
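        local_rank (optional): Rank of this process within its node; defaults
            to the LOCAL_RANK environment variable.
        local_size (optional): Number of processes per node; defaults to the
            LOCAL_SIZE environment variable. Indices are first partitioned by
            local rank and only then split across nodes.
        shuffle (optional): Whether to reshuffle the indices every epoch
            (default: True).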
80 | """ 81 | 82 | def __init__(self, dataset, num_replicas=None, rank=None, local_rank=None, local_size=None, shuffle=True): 83 | if num_replicas is None: 84 | if not dist.is_available(): 85 | raise RuntimeError("Requires distributed package to be available") 86 | num_replicas = dist.get_world_size() 87 | if rank is None: 88 | if not dist.is_available(): 89 | raise RuntimeError("Requires distributed package to be available") 90 | rank = dist.get_rank() 91 | if local_rank is None: 92 | local_rank = int(os.environ.get('LOCAL_RANK', 0)) 93 | if local_size is None: 94 | local_size = int(os.environ.get('LOCAL_SIZE', 1)) 95 | self.dataset = dataset 96 | self.shuffle = shuffle 97 | self.num_replicas = num_replicas 98 | self.num_parts = local_size 99 | self.rank = rank 100 | self.local_rank = local_rank 101 | self.epoch = 0 102 | self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) 103 | self.total_size = self.num_samples * self.num_replicas 104 | 105 | self.total_size_parts = self.num_samples * self.num_replicas // self.num_parts 106 | 107 | def __iter__(self): 108 | if self.shuffle: 109 | # deterministically shuffle based on epoch 110 | g = torch.Generator() 111 | g.manual_seed(self.epoch) 112 | indices = torch.randperm(len(self.dataset), generator=g).tolist() 113 | else: 114 | indices = torch.arange(len(self.dataset)).tolist() 115 | indices = [i for i in indices if i % self.num_parts == self.local_rank] 116 | 117 | # add extra samples to make it evenly divisible 118 | indices += indices[:(self.total_size_parts - len(indices))] 119 | assert len(indices) == self.total_size_parts 120 | 121 | # subsample 122 | indices = indices[self.rank // self.num_parts:self.total_size_parts:self.num_replicas // self.num_parts] 123 | assert len(indices) == self.num_samples 124 | 125 | return iter(indices) 126 | 127 | def __len__(self): 128 | return self.num_samples 129 | 130 | def set_epoch(self, epoch): 131 | self.epoch = epoch 132 | -------------------------------------------------------------------------------- /datasets/transforms.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Modified from DETR (https://github.com/facebookresearch/detr) 3 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 4 | # ------------------------------------------------------------------------ 5 | 6 | """ 7 | Transforms and data augmentation for both image + bbox. 8 | """ 9 | import random 10 | 11 | import PIL 12 | import torch 13 | import torchvision.transforms as T 14 | import torchvision.transforms.functional as F 15 | import numpy as np 16 | 17 | from util.box_ops import box_xyxy_to_cxcywh 18 | from util.misc import interpolate 19 | 20 | 21 | def crop(image, target, region): 22 | cropped_image = F.crop(image, *region) 23 | 24 | target = target.copy() 25 | i, j, h, w = region 26 | 27 | # should we do something wrt the original size? 
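    # `region` is (top, left, height, width), as returned by
    # torchvision.transforms.RandomCrop.get_params, so the cropped image size is (h, w).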
28 | target["size"] = torch.tensor([h, w]) 29 | 30 | fields = ["labels", "area", "iscrowd"] 31 | 32 | if "boxes" in target: 33 | boxes = target["boxes"] 34 | max_size = torch.as_tensor([w, h], dtype=torch.float32) 35 | cropped_boxes = boxes - torch.as_tensor([j, i, j, i]) 36 | cropped_boxes = torch.min(cropped_boxes.reshape(-1, 2, 2), max_size) 37 | cropped_boxes = cropped_boxes.clamp(min=0) 38 | area = (cropped_boxes[:, 1, :] - cropped_boxes[:, 0, :]).prod(dim=1) 39 | target["boxes"] = cropped_boxes.reshape(-1, 4) 40 | target["area"] = area 41 | fields.append("boxes") 42 | 43 | if "masks" in target: 44 | # FIXME should we update the area here if there are no boxes? 45 | target['masks'] = target['masks'][:, i:i + h, j:j + w] 46 | fields.append("masks") 47 | 48 | # remove elements for which the boxes or masks that have zero area 49 | if "boxes" in target or "masks" in target: 50 | # favor boxes selection when defining which elements to keep 51 | # this is compatible with previous implementation 52 | if "boxes" in target: 53 | cropped_boxes = target['boxes'].reshape(-1, 2, 2) 54 | keep = torch.all(cropped_boxes[:, 1, :] > cropped_boxes[:, 0, :], dim=1) 55 | else: 56 | keep = target['masks'].flatten(1).any(1) 57 | 58 | for field in fields: 59 | target[field] = target[field][keep] 60 | 61 | return cropped_image, target 62 | 63 | 64 | def hflip(image, target): 65 | flipped_image = F.hflip(image) 66 | 67 | w, h = image.size 68 | 69 | target = target.copy() 70 | if "boxes" in target: 71 | boxes = target["boxes"] 72 | boxes = boxes[:, [2, 1, 0, 3]] * torch.as_tensor([-1, 1, -1, 1]) + torch.as_tensor([w, 0, w, 0]) 73 | target["boxes"] = boxes 74 | 75 | if "masks" in target: 76 | target['masks'] = target['masks'].flip(-1) 77 | 78 | return flipped_image, target 79 | 80 | 81 | def resize(image, target, size, max_size=None): 82 | # size can be min_size (scalar) or (w, h) tuple 83 | 84 | def get_size_with_aspect_ratio(image_size, size, max_size=None): 85 | w, h = image_size 86 | if max_size is not None: 87 | min_original_size = float(min((w, h))) 88 | max_original_size = float(max((w, h))) 89 | if max_original_size / min_original_size * size > max_size: 90 | size = int(round(max_size * min_original_size / max_original_size)) 91 | 92 | if (w <= h and w == size) or (h <= w and h == size): 93 | return (h, w) 94 | 95 | if w < h: 96 | ow = size 97 | oh = int(size * h / w) 98 | else: 99 | oh = size 100 | ow = int(size * w / h) 101 | 102 | return (oh, ow) 103 | 104 | def get_size(image_size, size, max_size=None): 105 | if isinstance(size, (list, tuple)): 106 | return size[::-1] 107 | else: 108 | return get_size_with_aspect_ratio(image_size, size, max_size) 109 | 110 | size = get_size(image.size, size, max_size) 111 | rescaled_image = F.resize(image, size) 112 | 113 | if target is None: 114 | return rescaled_image, None 115 | 116 | ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(rescaled_image.size, image.size)) 117 | ratio_width, ratio_height = ratios 118 | 119 | target = target.copy() 120 | if "boxes" in target: 121 | boxes = target["boxes"] 122 | scaled_boxes = boxes * torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height]) 123 | target["boxes"] = scaled_boxes 124 | 125 | if "area" in target: 126 | area = target["area"] 127 | scaled_area = area * (ratio_width * ratio_height) 128 | target["area"] = scaled_area 129 | 130 | h, w = size 131 | target["size"] = torch.tensor([h, w]) 132 | 133 | if "masks" in target: 134 | target['masks'] = interpolate( 135 | target['masks'][:, 
None].float(), size, mode="nearest")[:, 0] > 0.5 136 | 137 | return rescaled_image, target 138 | 139 | 140 | def pad(image, target, padding): 141 | # assumes that we only pad on the bottom right corners 142 | padded_image = F.pad(image, (0, 0, padding[0], padding[1])) 143 | if target is None: 144 | return padded_image, None 145 | target = target.copy() 146 | # should we do something wrt the original size? 147 | target["size"] = torch.tensor(padded_image[::-1]) 148 | if "masks" in target: 149 | target['masks'] = torch.nn.functional.pad(target['masks'], (0, padding[0], 0, padding[1])) 150 | return padded_image, target 151 | 152 | 153 | class RandomCrop(object): 154 | def __init__(self, size): 155 | self.size = size 156 | 157 | def __call__(self, img, target): 158 | region = T.RandomCrop.get_params(img, self.size) 159 | return crop(img, target, region) 160 | 161 | 162 | class RandomSizeCrop(object): 163 | def __init__(self, min_size: int, max_size: int): 164 | self.min_size = min_size 165 | self.max_size = max_size 166 | 167 | def __call__(self, img: PIL.Image.Image, target: dict): 168 | w = random.randint(self.min_size, min(img.width, self.max_size)) 169 | h = random.randint(self.min_size, min(img.height, self.max_size)) 170 | region = T.RandomCrop.get_params(img, [h, w]) 171 | return crop(img, target, region) 172 | 173 | 174 | class CenterCrop(object): 175 | def __init__(self, size): 176 | self.size = size 177 | 178 | def __call__(self, img, target): 179 | image_width, image_height = img.size 180 | crop_height, crop_width = self.size 181 | crop_top = int(round((image_height - crop_height) / 2.)) 182 | crop_left = int(round((image_width - crop_width) / 2.)) 183 | return crop(img, target, (crop_top, crop_left, crop_height, crop_width)) 184 | 185 | 186 | class RandomHorizontalFlip(object): 187 | def __init__(self, p=0.5): 188 | self.p = p 189 | 190 | def __call__(self, img, target): 191 | if random.random() < self.p: 192 | return hflip(img, target) 193 | return img, target 194 | 195 | 196 | class RandomResize(object): 197 | def __init__(self, sizes, max_size=None): 198 | assert isinstance(sizes, (list, tuple)) 199 | self.sizes = sizes 200 | self.max_size = max_size 201 | 202 | def __call__(self, img, target=None): 203 | size = random.choice(self.sizes) 204 | return resize(img, target, size, self.max_size) 205 | 206 | 207 | class RandomPad(object): 208 | def __init__(self, max_pad): 209 | self.max_pad = max_pad 210 | 211 | def __call__(self, img, target): 212 | pad_x = random.randint(0, self.max_pad) 213 | pad_y = random.randint(0, self.max_pad) 214 | return pad(img, target, (pad_x, pad_y)) 215 | 216 | 217 | class RandomSelect(object): 218 | """ 219 | Randomly selects between transforms1 and transforms2, 220 | with probability p for transforms1 and (1 - p) for transforms2 221 | """ 222 | def __init__(self, transforms1, transforms2, p=0.5): 223 | self.transforms1 = transforms1 224 | self.transforms2 = transforms2 225 | self.p = p 226 | 227 | def __call__(self, img, target): 228 | if random.random() < self.p: 229 | return self.transforms1(img, target) 230 | return self.transforms2(img, target) 231 | 232 | 233 | class ToTensor(object): 234 | def __call__(self, img, target): 235 | return F.to_tensor(img), target 236 | 237 | 238 | class RandomErasing(object): 239 | 240 | def __init__(self, *args, **kwargs): 241 | self.eraser = T.RandomErasing(*args, **kwargs) 242 | 243 | def __call__(self, img, target): 244 | return self.eraser(img), target 245 | 246 | 247 | class Normalize(object): 248 | def 
__init__(self, mean, std): 249 | self.mean = mean 250 | self.std = std 251 | 252 | def __call__(self, image, target=None): 253 | image = F.normalize(image, mean=self.mean, std=self.std) 254 | if target is None: 255 | return image, None 256 | target = target.copy() 257 | h, w = image.shape[-2:] 258 | if "boxes" in target: 259 | boxes = target["boxes"] 260 | boxes = box_xyxy_to_cxcywh(boxes) 261 | boxes = boxes / torch.tensor([w, h, w, h], dtype=torch.float32) 262 | target["boxes"] = boxes 263 | return image, target 264 | 265 | import cv2 266 | class RandomIntensitySaturation(object): 267 | def __call__(self, image, target=None): 268 | fraction = 0.50 269 | image = np.asarray(image) 270 | img_hsv = cv2.cvtColor(cv2.cvtColor(image, cv2.COLOR_RGB2BGR), cv2.COLOR_BGR2HSV) 271 | S = img_hsv[:, :, 1].astype(np.float32) 272 | V = img_hsv[:, :, 2].astype(np.float32) 273 | a = (random.random() * 2 - 1) * fraction + 1 274 | if a > 1: 275 | np.clip(S, a_min=0, a_max=255, out=S) 276 | a = (random.random() * 2 - 1) * fraction + 1 277 | V *= a 278 | if a > 1: 279 | np.clip(V, a_min=0, a_max=255, out=V) 280 | 281 | img_hsv[:, :, 1] = S.astype(np.uint8) 282 | img_hsv[:, :, 2] = V.astype(np.uint8) 283 | img = cv2.cvtColor(cv2.cvtColor(img_hsv, cv2.COLOR_HSV2BGR), cv2.COLOR_BGR2RGB) 284 | img = PIL.Image.fromarray(img) 285 | return img, target 286 | 287 | from util.transforms import random_affine 288 | class RandomAffineTransform(object): 289 | def __init__(self, degrees=(-10, 10), translate=(.1, .1), scale=(0.90, 1.10)): 290 | self.degrees = degrees 291 | self.translate = translate 292 | self.scale = scale 293 | 294 | def __call__(self, image, targets): 295 | bboxes = targets["boxes"] 296 | image = np.asarray(image) 297 | image, _, bboxes = random_affine(image, mask=None, targets=bboxes.numpy(), 298 | degrees=self.degrees, translate=self.translate, 299 | scale=self.scale) 300 | # TODO: A hack here for later transform operations 301 | img = PIL.Image.fromarray(image) 302 | targets["boxes"] = torch.as_tensor(bboxes) 303 | return img, targets 304 | 305 | class Compose(object): 306 | def __init__(self, transforms): 307 | self.transforms = transforms 308 | 309 | def __call__(self, image, target): 310 | for t in self.transforms: 311 | image, target = t(image, target) 312 | return image, target 313 | 314 | def __repr__(self): 315 | format_string = self.__class__.__name__ + "(" 316 | for t in self.transforms: 317 | format_string += "\n" 318 | format_string += " {0}".format(t) 319 | format_string += "\n)" 320 | return format_string 321 | -------------------------------------------------------------------------------- /engine_vg.py: -------------------------------------------------------------------------------- 1 | """ 2 | Train and eval functions used in main.py 3 | """ 4 | import math 5 | import os 6 | import sys 7 | import json 8 | from typing import Iterable 9 | 10 | import torch 11 | import util.misc as utils 12 | from PIL import Image, ImageDraw 13 | import matplotlib.pyplot as plt 14 | import numpy as np 15 | import torch.nn.functional as F 16 | 17 | # from datasets.data_prefetcher import data_prefetcher 18 | 19 | # # Reuse Deformable DETR's train function 20 | # from engine import train_one_epoch 21 | 22 | def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module, 23 | data_loader: Iterable, optimizer: torch.optim.Optimizer, 24 | lr_scheduler: torch.optim.lr_scheduler._LRScheduler, 25 | device: torch.device, epoch: int, max_norm: float = 0): 26 | model.train() 27 | criterion.train() 28 | 
metric_logger = utils.MetricLogger(delimiter=" ") 29 | metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}')) 30 | # metric_logger.add_meter('class_error', utils.SmoothedValue(window_size=1, fmt='{value:.2f}')) 31 | metric_logger.add_meter('grad_norm', utils.SmoothedValue(window_size=1, fmt='{value:.2f}')) 32 | header = 'Epoch: [{}]'.format(epoch) 33 | print_freq = 50 34 | 35 | prefetcher = data_prefetcher(data_loader, device, prefetch=True) 36 | samples, targets = prefetcher.next() 37 | 38 | # for samples, targets in metric_logger.log_every(data_loader, print_freq, header): 39 | for _ in metric_logger.log_every(range(len(data_loader)), print_freq, header): 40 | outputs = model(samples) 41 | loss_dict = criterion(outputs, targets) 42 | weight_dict = criterion.weight_dict 43 | losses = sum(loss_dict[k] * weight_dict[k] for k in loss_dict.keys() if k in weight_dict) 44 | 45 | # reduce losses over all GPUs for logging purposes 46 | loss_dict_reduced = utils.reduce_dict(loss_dict) 47 | loss_dict_reduced_unscaled = {f'{k}_unscaled': v 48 | for k, v in loss_dict_reduced.items()} 49 | loss_dict_reduced_scaled = {k: v * weight_dict[k] 50 | for k, v in loss_dict_reduced.items() if k in weight_dict} 51 | losses_reduced_scaled = sum(loss_dict_reduced_scaled.values()) 52 | 53 | loss_value = losses_reduced_scaled.item() 54 | 55 | if not math.isfinite(loss_value): 56 | print("Loss is {}, stopping training".format(loss_value)) 57 | print(loss_dict_reduced) 58 | sys.exit(1) 59 | 60 | optimizer.zero_grad() 61 | losses.backward() 62 | if max_norm > 0: 63 | grad_total_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm) 64 | else: 65 | grad_total_norm = utils.get_total_grad_norm(model.parameters(), max_norm) 66 | optimizer.step() 67 | lr_scheduler.step() 68 | 69 | metric_logger.update(loss=loss_value, **loss_dict_reduced_scaled, **loss_dict_reduced_unscaled) 70 | # metric_logger.update(class_error=loss_dict_reduced['class_error']) 71 | metric_logger.update(lr=optimizer.param_groups[0]["lr"]) 72 | metric_logger.update(grad_norm=grad_total_norm) 73 | 74 | samples, targets = prefetcher.next() 75 | # gather the stats from all processes 76 | metric_logger.synchronize_between_processes() 77 | print("Averaged stats:", metric_logger) 78 | return {k: meter.global_avg for k, meter in metric_logger.meters.items()} 79 | 80 | from util.box_ops import box_iou, box_cxcywh_to_xyxy, mask_iou 81 | @torch.no_grad() 82 | def evaluate(model, criterion, postprocessors, data_loader, device, output_dir, visualize=False): 83 | model.eval() 84 | criterion.eval() 85 | # visualize=False 86 | if visualize: 87 | split_name = data_loader.dataset.split 88 | output_dir = output_dir / 'vis' / split_name 89 | output_dir.mkdir(parents=True, exist_ok=True) 90 | (output_dir / 'mask').mkdir(parents=True, exist_ok=True) 91 | (output_dir / 'bbox').mkdir(parents=True, exist_ok=True) 92 | (output_dir / 'att').mkdir(parents=True, exist_ok=True) 93 | (output_dir / 'gt').mkdir(parents=True, exist_ok=True) 94 | purple = np.array([[[128, 0, 128]]], dtype=np.uint8) 95 | yellow = np.array([[[255, 255, 0]]], dtype=np.uint8) 96 | metric_logger = utils.MetricLogger(delimiter=" ") 97 | # metric_logger.add_meter('class_error', utils.SmoothedValue(window_size=1, fmt='{value:.2f}')) 98 | header = data_loader.dataset.split + ':' 99 | 100 | results_dict = {} 101 | results_iou = {'det':{}, 'seg':{}} 102 | sum_accu = 0. 103 | sum_iou = 0. 104 | cnt_test = 0. 105 | seg_iou = 0. 106 | cnt_seg = 0. 
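    # Running totals accumulated over the whole split (and all-reduced across GPUs
    # after the loop when running distributed): sum_accu counts boxes with IoU > 0.5,
    # sum_iou / cnt_test give the mean box IoU, and seg_iou / cnt_seg give the mean
    # mask IoU whenever a 'segm' post-processor is present.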
107 | for samples, targets in metric_logger.log_every(data_loader, 50, header): 108 | samples = {k: v.to(device, non_blocking=True) for k, v in samples.items()} 109 | targets = [{k: v.to(device) for k, v in t.items()} for t in targets] 110 | 111 | outputs = model(samples) 112 | loss_dict = criterion(outputs, targets) 113 | weight_dict = criterion.weight_dict 114 | 115 | # reduce losses over all GPUs for logging purposes 116 | loss_dict_reduced = utils.reduce_dict(loss_dict) 117 | loss_dict_reduced_scaled = {k: v * weight_dict[k] 118 | for k, v in loss_dict_reduced.items() if k in weight_dict} 119 | loss_dict_reduced_unscaled = {f'{k}_unscaled': v 120 | for k, v in loss_dict_reduced.items()} 121 | metric_logger.update(loss=sum(loss_dict_reduced_scaled.values()), 122 | **loss_dict_reduced_scaled, 123 | **loss_dict_reduced_unscaled) 124 | # metric_logger.update(class_error=loss_dict_reduced['class_error']) 125 | 126 | # TODO: some issues with data loaders here 127 | orig_target_sizes = torch.stack([t["orig_size"] for t in targets], dim=0) 128 | results = postprocessors['bbox'](outputs, orig_target_sizes) 129 | # res = {target['image_id'].item(): output for target, output in zip(targets, results)} 130 | 131 | for i, res in enumerate(results): 132 | t = box_cxcywh_to_xyxy(targets[i]['boxes']) 133 | assert t.size(0) == res['boxes'].size(0), (res, t) 134 | iou, union = box_iou(t, res['boxes']) 135 | iou = torch.diag(iou) 136 | # print(t, res['boxes'], iou, union) 137 | sum_accu = sum_accu + torch.sum((iou > 0.5).type(torch.float))#.item() 138 | sum_iou = sum_iou + torch.sum(iou)#.item() 139 | cnt_test = cnt_test + torch.tensor(len(targets[i]['boxes']), device=sum_iou.device) 140 | results_iou['det'][targets[i]['dataset_id'].item()] = torch.sum(iou).item() 141 | results_scaled = postprocessors['bbox'](outputs, orig_target_sizes, scale_to_original_shape=True) 142 | 143 | # TODO support multi-phrase in the future 144 | if 'segm' in postprocessors.keys(): 145 | target_sizes = torch.stack([t["size"] for t in targets], dim=0) 146 | results = postprocessors['segm'](results, outputs, orig_target_sizes, target_sizes) 147 | for i, res in enumerate(results): 148 | t = targets[i] 149 | t_mask = t['masks'] 150 | pred_mask = res['masks'][0] 151 | # print(pred_mask.shape, t_mask.shape) 152 | iou = mask_iou(pred_mask[0], t_mask) 153 | seg_iou = seg_iou + iou 154 | cnt_seg = cnt_seg + 1 155 | results_iou['seg'][targets[i]['dataset_id'].item()] = iou.item() 156 | if visualize: 157 | dataset_id = t['dataset_id'].item() 158 | pred_mask = res['masks_origin'][0, 0].cpu().unsqueeze(-1).numpy().astype(np.uint8) 159 | img, mask, phrase, tgt_box, img_file = data_loader.dataset.pull_item(dataset_id) 160 | assert pred_mask.shape[:2] == mask.shape, (pred_mask.shape, mask.shape) 161 | # print(pred_mask.shape, yellow.shape) 162 | img_name = img_file.split('/')[-1].split('.')[0] 163 | pred_mask = pred_mask * yellow + (1-pred_mask)*purple 164 | # print(pred_mask.shape, yellow.shape) 165 | pred_mask = Image.fromarray(pred_mask) 166 | pred_mask.save(output_dir / 'mask'/ f"{img_name}_{dataset_id:05d}.jpg") 167 | 168 | 169 | mask = np.expand_dims(mask, -1) 170 | gt = mask * yellow + (1-mask)*purple 171 | # print(pred_mask.shape, yellow.shape) 172 | gt_mask = Image.fromarray(gt) 173 | gt_mask.save(output_dir / 'gt'/ f"{img_name}_{dataset_id:05d}.jpg") 174 | 175 | pred_box = results_scaled[i]['boxes'][0].cpu().numpy().tolist() 176 | # print(pred_box, tgt_box) 177 | img_bbox = Image.fromarray(img) 178 | draw = ImageDraw.Draw(img_bbox) 
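                        # predicted box in blue, ground-truth box in red, both in original image coordinates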
179 | draw.rectangle(pred_box, outline='blue', width=5) 180 | draw.rectangle(tgt_box.tolist(), outline='red', width=5) 181 | img_bbox.save(output_dir / 'bbox'/ f"{img_name}_{dataset_id:05d}.jpg") 182 | 183 | att_mask = outputs['mask_att'][i:i+1, :].cpu() 184 | h, w, _ = mask.shape 185 | att_mask = F.interpolate(att_mask, size=(320, 320), mode="bilinear").numpy() 186 | # print(att_mask.shape) 187 | plt.imsave(output_dir / 'att' /f"{img_name}_{dataset_id:05d}_0.jpg", att_mask[0, 0, :h//2, :w//2], cmap='viridis') 188 | plt.imsave(output_dir / 'att' /f"{img_name}_{dataset_id:05d}_1.jpg", att_mask[0, 1, :h//2, :w//2], cmap='viridis') 189 | plt.imsave(output_dir / 'att' /f"{img_name}_{dataset_id:05d}_2.jpg", att_mask[0, 2, :h//2, :w//2], cmap='viridis') 190 | plt.imsave(output_dir / 'att' /f"{img_name}_{dataset_id:05d}_7.jpg", att_mask[0, 7, :h//2, :w//2], cmap='viridis') 191 | # att_mask = att_mask[0, 0, :h, :w, None] 192 | # att_mask_rescaled = (att_mask - att_mask.min()) / (att_mask.max()-att_mask.min()) 193 | # att_mask_rescaled = np.clip(1.5 * att_mask_rescaled - 0.5, 0., 1.0) 194 | # att_img = (img * att_mask_rescaled).astype(np.uint8) 195 | # att_img = Image.fromarray(att_img) 196 | # att_img.save(output_dir / 'att' / f"{img_name}_{dataset_id:05d}_0.jpg") 197 | # plt.imsave(output_dir / 'att' /f"0_{img_name}_{dataset_id:05d}.jpg", att_mask[:,:,0], cmap='viridis') 198 | 199 | 200 | results_dict.update({target['image_id'].item(): output['boxes'].cpu().numpy().tolist() for target, output in zip(targets, results_scaled)}) 201 | 202 | # gather the stats from all processes 203 | metric_logger.synchronize_between_processes() 204 | print("Averaged stats:", metric_logger) 205 | 206 | stats = {k: meter.global_avg for k, meter in metric_logger.meters.items()} 207 | if utils.is_dist_avail_and_initialized(): 208 | torch.distributed.all_reduce(sum_accu) 209 | torch.distributed.all_reduce(cnt_test) 210 | torch.distributed.all_reduce(sum_iou) 211 | stats["accuracy_iou0.5"] = (sum_accu / cnt_test).cpu().item() 212 | stats["miou"] = (sum_iou / cnt_test).cpu().item() 213 | 214 | if 'segm' in postprocessors.keys(): 215 | if utils.is_dist_avail_and_initialized(): 216 | torch.distributed.all_reduce(seg_iou) 217 | cnt_seg = utils.get_world_size() * cnt_seg 218 | print(cnt_seg) 219 | stats["seg_miou"] = (seg_iou / cnt_seg).cpu().item() 220 | 221 | # do not print aux test loss 222 | stats = {k:v for k,v in stats.items() if k.split('_')[-1] not in ['unscaled', '0', '1', '2']} 223 | # with (output_dir / f"{data_loader.dataset.split}_iou.json").open("w") as f: 224 | # f.write(json.dumps(results_iou) + "\n") 225 | return stats, results_dict 226 | 227 | 228 | def to_cuda(samples, targets, device): 229 | # samples = samples.to(device, non_blocking=True) 230 | samples = {k: v.to(device, non_blocking=True) for k, v in samples.items()} 231 | targets = [{k: v.to(device, non_blocking=True) for k, v in t.items()} for t in targets] 232 | return samples, targets 233 | 234 | class data_prefetcher(): 235 | def __init__(self, loader, device, prefetch=True): 236 | self.loader = iter(loader) 237 | self.prefetch = prefetch 238 | self.device = device 239 | if prefetch: 240 | self.stream = torch.cuda.Stream() 241 | self.preload() 242 | 243 | def preload(self): 244 | try: 245 | self.next_samples, self.next_targets = next(self.loader) 246 | except StopIteration: 247 | self.next_samples = None 248 | self.next_targets = None 249 | return 250 | # if record_stream() doesn't work, another option is to make sure device inputs are created 251 
| # on the main stream. 252 | # self.next_input_gpu = torch.empty_like(self.next_input, device='cuda') 253 | # self.next_target_gpu = torch.empty_like(self.next_target, device='cuda') 254 | # Need to make sure the memory allocated for next_* is not still in use by the main stream 255 | # at the time we start copying to next_*: 256 | # self.stream.wait_stream(torch.cuda.current_stream()) 257 | with torch.cuda.stream(self.stream): 258 | self.next_samples, self.next_targets = to_cuda(self.next_samples, self.next_targets, self.device) 259 | # more code for the alternative if record_stream() doesn't work: 260 | # copy_ will record the use of the pinned source tensor in this side stream. 261 | # self.next_input_gpu.copy_(self.next_input, non_blocking=True) 262 | # self.next_target_gpu.copy_(self.next_target, non_blocking=True) 263 | # self.next_input = self.next_input_gpu 264 | # self.next_target = self.next_target_gpu 265 | 266 | # With Amp, it isn't necessary to manually convert data to half. 267 | # if args.fp16: 268 | # self.next_input = self.next_input.half() 269 | # else: 270 | 271 | def next(self): 272 | if self.prefetch: 273 | torch.cuda.current_stream().wait_stream(self.stream) 274 | samples = self.next_samples 275 | targets = self.next_targets 276 | if samples is not None: 277 | for k, v in samples.items(): 278 | v.record_stream(torch.cuda.current_stream()) 279 | if targets is not None: 280 | for t in targets: 281 | for k, v in t.items(): 282 | v.record_stream(torch.cuda.current_stream()) 283 | self.preload() 284 | else: 285 | try: 286 | samples, targets = next(self.loader) 287 | samples, targets = to_cuda(samples, targets, self.device) 288 | except StopIteration: 289 | samples = None 290 | targets = None 291 | return samples, targets 292 | -------------------------------------------------------------------------------- /models/__init__.py: -------------------------------------------------------------------------------- 1 | from .reftr_transformer import build_reftr as build_transformer_based_reftr 2 | from .reftr_segmentation import build_reftr_seg 3 | 4 | def build_reftr(args): 5 | if args.reftr_type.startswith('transformer'): 6 | if args.masks: 7 | return build_reftr_seg(args) 8 | else: 9 | return build_transformer_based_reftr(args) 10 | else: 11 | raise NotImplementedError -------------------------------------------------------------------------------- /models/modeling/backbone.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | """ 11 | Backbone modules. 
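ResNet backbones are wrapped with FrozenBatchNorm2d and exposed through
torchvision's IntermediateLayerGetter, returning single- or multi-scale
feature maps as NestedTensors.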
12 | """ 13 | from collections import OrderedDict 14 | 15 | import torch 16 | import torch.nn.functional as F 17 | import torchvision 18 | from torch import nn 19 | from torchvision.models._utils import IntermediateLayerGetter 20 | from typing import Dict, List 21 | 22 | from util.misc import NestedTensor, is_main_process 23 | 24 | from .position_encoding import build_position_encoding 25 | 26 | class MLP(nn.Module): 27 | """ Very simple multi-layer perceptron (also called FFN)""" 28 | 29 | def __init__(self, input_dim, hidden_dim, output_dim, num_layers): 30 | super().__init__() 31 | self.num_layers = num_layers 32 | h = [hidden_dim] * (num_layers - 1) 33 | self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) 34 | 35 | def forward(self, x): 36 | for i, layer in enumerate(self.layers): 37 | x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) 38 | return x 39 | 40 | def _get_clones(module, N): 41 | return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) 42 | 43 | class FrozenBatchNorm2d(torch.nn.Module): 44 | """ 45 | BatchNorm2d where the batch statistics and the affine parameters are fixed. 46 | 47 | Copy-paste from torchvision.misc.ops with added eps before rqsrt, 48 | without which any other models than torchvision.models.resnet[18,34,50,101] 49 | produce nans. 50 | """ 51 | 52 | def __init__(self, n, eps=1e-5): 53 | super(FrozenBatchNorm2d, self).__init__() 54 | self.register_buffer("weight", torch.ones(n)) 55 | self.register_buffer("bias", torch.zeros(n)) 56 | self.register_buffer("running_mean", torch.zeros(n)) 57 | self.register_buffer("running_var", torch.ones(n)) 58 | self.eps = eps 59 | 60 | def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, 61 | missing_keys, unexpected_keys, error_msgs): 62 | num_batches_tracked_key = prefix + 'num_batches_tracked' 63 | if num_batches_tracked_key in state_dict: 64 | del state_dict[num_batches_tracked_key] 65 | 66 | super(FrozenBatchNorm2d, self)._load_from_state_dict( 67 | state_dict, prefix, local_metadata, strict, 68 | missing_keys, unexpected_keys, error_msgs) 69 | 70 | def forward(self, x): 71 | # move reshapes to the beginning 72 | # to make it fuser-friendly 73 | w = self.weight.reshape(1, -1, 1, 1) 74 | b = self.bias.reshape(1, -1, 1, 1) 75 | rv = self.running_var.reshape(1, -1, 1, 1) 76 | rm = self.running_mean.reshape(1, -1, 1, 1) 77 | eps = self.eps 78 | scale = w * (rv + eps).rsqrt() 79 | bias = b - rm * scale 80 | return x * scale + bias 81 | 82 | 83 | class BackboneBase(nn.Module): 84 | 85 | def __init__(self, backbone: nn.Module, train_backbone: bool, return_interm_layers: bool): 86 | super().__init__() 87 | for name, parameter in backbone.named_parameters(): 88 | if not train_backbone or 'layer2' not in name and 'layer3' not in name and 'layer4' not in name: 89 | parameter.requires_grad_(False) 90 | if return_interm_layers: 91 | return_layers = {"layer1": "0", "layer2": "1", "layer3": "2", "layer4": "3"} 92 | # return_layers = {"layer2": "0", "layer3": "1", "layer4": "2"} 93 | self.strides = [4, 8, 16, 32] 94 | self.num_channels = [256, 512, 1024, 2048] 95 | else: 96 | return_layers = {'layer4': "0"} 97 | self.strides = [32] 98 | self.num_channels = [2048] 99 | self.body = IntermediateLayerGetter(backbone, return_layers=return_layers) 100 | 101 | def forward(self, tensor_list: NestedTensor): 102 | xs = self.body(tensor_list.tensors) 103 | out: Dict[str, NestedTensor] = {} 104 | for name, x in xs.items(): 105 | m = tensor_list.mask 106 | assert m 
is not None 107 | mask = F.interpolate(m[None].float(), size=x.shape[-2:]).to(torch.bool)[0] 108 | out[name] = NestedTensor(x, mask) 109 | return out 110 | 111 | 112 | class Backbone(BackboneBase): 113 | """ResNet backbone with frozen BatchNorm.""" 114 | def __init__(self, name: str, 115 | train_backbone: bool, 116 | return_interm_layers: bool, 117 | dilation: bool): 118 | norm_layer = FrozenBatchNorm2d 119 | backbone = getattr(torchvision.models, name)( 120 | replace_stride_with_dilation=[False, False, dilation], 121 | pretrained=is_main_process(), norm_layer=norm_layer) 122 | assert name not in ('resnet18', 'resnet34'), "number of channels are hard coded" 123 | super().__init__(backbone, train_backbone, return_interm_layers) 124 | if dilation: 125 | self.strides[-1] = self.strides[-1] // 2 126 | 127 | 128 | class Joiner(nn.Sequential): 129 | def __init__(self, backbone, position_embedding): 130 | super().__init__(backbone, position_embedding) 131 | self.strides = backbone.strides 132 | self.num_channels = backbone.num_channels 133 | 134 | def forward(self, tensor_list: NestedTensor): 135 | xs = self[0](tensor_list) 136 | out: List[NestedTensor] = [] 137 | pos = [] 138 | for name, x in sorted(xs.items()): 139 | out.append(x) 140 | 141 | # position encoding 142 | for x in out: 143 | pos.append(self[1](x).to(x.tensors.dtype)) 144 | 145 | return out, pos 146 | 147 | 148 | def build_backbone(args): 149 | position_embedding = build_position_encoding(args) 150 | train_backbone = args.lr_backbone > 0 151 | return_interm_layers = args.masks or (args.num_feature_levels > 1) 152 | backbone = Backbone(args.backbone, train_backbone, return_interm_layers, args.dilation) 153 | model = Joiner(backbone, position_embedding) 154 | return model 155 | -------------------------------------------------------------------------------- /models/modeling/matcher.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | """ 11 | Modules to compute the matching cost and solve the corresponding LSAP. 
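"LSAP" is the linear sum assignment problem: given a cost matrix of shape
[num_queries, num_targets], find the one-to-one assignment with minimum total cost.
The heavy lifting is done by scipy.optimize.linear_sum_assignment, e.g. (illustrative
sketch only):

    import numpy as np
    from scipy.optimize import linear_sum_assignment

    C = np.array([[0.2, 0.9],
                  [0.8, 0.1],
                  [0.5, 0.6]])   # 3 predictions x 2 targets
    row_ind, col_ind = linear_sum_assignment(C)
    # row_ind = [0, 1], col_ind = [0, 1]:
    # prediction 0 is matched to target 0, prediction 1 to target 1, prediction 2 is unmatched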
12 | """ 13 | import torch 14 | from scipy.optimize import linear_sum_assignment 15 | from torch import nn 16 | 17 | from util.box_ops import box_cxcywh_to_xyxy, generalized_box_iou 18 | 19 | class OnetoAllMatcher(nn.Module): 20 | """ 21 | Assume only one gt per match batch 22 | """ 23 | def __init__(self, 24 | cost_class: float = 1, 25 | cost_bbox: float = 1, 26 | cost_giou: float = 1): 27 | """Creates the matcher 28 | 29 | Params: 30 | cost_class: This is the relative weight of the classification error in the matching cost 31 | cost_bbox: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost 32 | cost_giou: This is the relative weight of the giou loss of the bounding box in the matching cost 33 | """ 34 | super().__init__() 35 | self.cost_class = cost_class 36 | self.cost_bbox = cost_bbox 37 | self.cost_giou = cost_giou 38 | assert cost_class != 0 or cost_bbox != 0 or cost_giou != 0, "all costs cant be 0" 39 | 40 | def forward(self, outputs, targets, topk=1, use_softmax_match=False): 41 | with torch.no_grad(): 42 | bs, num_queries = outputs["pred_logits"].shape[:2] 43 | 44 | out_prob = outputs["pred_logits"] 45 | out_bbox = outputs["pred_boxes"].flatten(0, 1) # [batch_size * num_queries, 4] 46 | tgt_ids = torch.cat([v["labels"] for v in targets]) 47 | tgt_bbox = torch.cat([v["boxes"] for v in targets]) 48 | 49 | if use_softmax_match: 50 | out_prob = nn.functional.softmax(out_prob.view(bs, num_queries), dim=-1) 51 | out_prob = out_prob.view(bs * num_queries, -1) 52 | cost_class = -(out_prob + 1e-8).log() 53 | cost_class = cost_class[:, tgt_ids] 54 | else: 55 | alpha = 0.25 56 | gamma = 2.0 57 | out_prob = out_prob.flatten(0, 1).sigmoid() 58 | neg_cost_class = (1 - alpha) * (out_prob ** gamma) * (-(1 - out_prob + 1e-8).log()) 59 | pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log()) 60 | # Compute the costs. 61 | cost_class = pos_cost_class[:, tgt_ids] - neg_cost_class[:, tgt_ids] 62 | 63 | cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1) 64 | cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox), 65 | box_cxcywh_to_xyxy(tgt_bbox)) 66 | 67 | # Final cost matrix 68 | C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou 69 | C = C.view(bs, num_queries, -1).cpu() 70 | 71 | # Muchen: Here is what different from HungarianMacher 72 | sizes = [len(v["boxes"]) for v in targets] 73 | 74 | indices = [] 75 | for batch_i, c in enumerate(C.split(sizes, -1)): 76 | cost_matrix = c[batch_i] 77 | q, tgt = cost_matrix.shape 78 | 79 | assert tgt == 1, f"cost_matrix have a size of: {cost_matrix.shape}" 80 | # take the 81 | topv, topi = torch.topk(-1*cost_matrix.flatten(), topk) 82 | indices.append((topi, [0]*topk)) 83 | 84 | return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices] 85 | 86 | 87 | class HungarianMatcher(nn.Module): 88 | """This class computes an assignment between the targets and the predictions of the network 89 | 90 | For efficiency reasons, the targets don't include the no_object. Because of this, in general, 91 | there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, 92 | while the others are un-matched (and thus treated as non-objects). 
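    Shape expectations follow the forward() docstring below. A minimal call sketch
    (illustrative values only; the cost weights and class count are just example numbers):

        import torch
        matcher = HungarianMatcher(cost_class=1.0, cost_bbox=5.0, cost_giou=2.0)
        outputs = {'pred_logits': torch.rand(2, 100, 92),
                   'pred_boxes':  torch.rand(2, 100, 4)}   # boxes in normalized cxcywh
        targets = [{'labels': torch.tensor([3]),    'boxes': torch.rand(1, 4)},
                   {'labels': torch.tensor([5, 7]), 'boxes': torch.rand(2, 4)}]
        indices = matcher(outputs, targets)
        # indices[b] is a (pred_idx, tgt_idx) pair of equally sized index tensors for image b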
93 | """ 94 | 95 | def __init__(self, 96 | cost_class: float = 1, 97 | cost_bbox: float = 1, 98 | cost_giou: float = 1): 99 | """Creates the matcher 100 | 101 | Params: 102 | cost_class: This is the relative weight of the classification error in the matching cost 103 | cost_bbox: This is the relative weight of the L1 error of the bounding box coordinates in the matching cost 104 | cost_giou: This is the relative weight of the giou loss of the bounding box in the matching cost 105 | """ 106 | super().__init__() 107 | self.cost_class = cost_class 108 | self.cost_bbox = cost_bbox 109 | self.cost_giou = cost_giou 110 | assert cost_class != 0 or cost_bbox != 0 or cost_giou != 0, "all costs cant be 0" 111 | 112 | # TODO: work around here for parameter passing 113 | def forward(self, outputs, targets, topk=1, use_softmax_match=False): 114 | """ Performs the matching 115 | 116 | Params: 117 | outputs: This is a dict that contains at least these entries: 118 | "pred_logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits 119 | "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates 120 | 121 | targets: This is a list of targets (len(targets) = batch_size), where each target is a dict containing: 122 | "labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of ground-truth 123 | objects in the target) containing the class labels 124 | "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates 125 | 126 | Returns: 127 | A list of size batch_size, containing tuples of (index_i, index_j) where: 128 | - index_i is the indices of the selected predictions (in order) 129 | - index_j is the indices of the corresponding selected targets (in order) 130 | For each batch element, it holds: 131 | len(index_i) = len(index_j) = min(num_queries, num_target_boxes) 132 | """ 133 | with torch.no_grad(): 134 | bs, num_queries = outputs["pred_logits"].shape[:2] 135 | 136 | # We flatten to compute the cost matrices in a batch 137 | out_prob = outputs["pred_logits"].flatten(0, 1).sigmoid() 138 | out_bbox = outputs["pred_boxes"].flatten(0, 1) # [batch_size * num_queries, 4] 139 | 140 | # Also concat the target labels and boxes 141 | tgt_ids = torch.cat([v["labels"] for v in targets]) 142 | tgt_bbox = torch.cat([v["boxes"] for v in targets]) 143 | 144 | # Compute the classification cost. 
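            # The block below is the focal-loss-style classification cost used in Deformable DETR:
            #     pos_cost = alpha * (1 - p)^gamma * (-log(p))
            #     neg_cost = (1 - alpha) * p^gamma * (-log(1 - p))
            # and the cost of matching a query to a target is pos_cost - neg_cost evaluated
            # at the target's class id; the 1e-8 terms guard against log(0).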
145 | alpha = 0.25 146 | gamma = 2.0 147 | neg_cost_class = (1 - alpha) * (out_prob ** gamma) * (-(1 - out_prob + 1e-8).log()) 148 | pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log()) 149 | cost_class = pos_cost_class[:, tgt_ids] - neg_cost_class[:, tgt_ids] 150 | 151 | # Compute the L1 cost between boxes 152 | cost_bbox = torch.cdist(out_bbox, tgt_bbox, p=1) 153 | 154 | # Compute the giou cost betwen boxes 155 | cost_giou = -generalized_box_iou(box_cxcywh_to_xyxy(out_bbox), 156 | box_cxcywh_to_xyxy(tgt_bbox)) 157 | 158 | # Final cost matrix 159 | C = self.cost_bbox * cost_bbox + self.cost_class * cost_class + self.cost_giou * cost_giou 160 | C = C.view(bs, num_queries, -1).cpu() 161 | 162 | sizes = [len(v["boxes"]) for v in targets] 163 | indices = [linear_sum_assignment(c[i]) for i, c in enumerate(C.split(sizes, -1))] 164 | return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices] 165 | 166 | 167 | def build_matcher(args, mode='hungarian'): 168 | if mode == 'hungarian': 169 | print("Building Hungarian Matcher") 170 | return HungarianMatcher(cost_class=args.set_cost_class, 171 | cost_bbox=args.set_cost_bbox, 172 | cost_giou=args.set_cost_giou) 173 | elif mode == "one_to_all": 174 | print("Building One to all Matcher") 175 | return OnetoAllMatcher(cost_class=args.set_cost_class, 176 | cost_bbox=args.set_cost_bbox, 177 | cost_giou=args.set_cost_giou) 178 | else: 179 | raise NotImplementedError 180 | 181 | -------------------------------------------------------------------------------- /models/modeling/position_encoding.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | """ 11 | Various positional encodings for the transformer. 12 | """ 13 | import math 14 | import torch 15 | from torch import nn 16 | 17 | from util.misc import NestedTensor 18 | 19 | 20 | class PositionEmbeddingSine(nn.Module): 21 | """ 22 | This is a more standard version of the position embedding, very similar to the one 23 | used by the Attention is all you need paper, generalized to work on images. 
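    Concretely (matching forward() below): the x/y coordinates are cumulative sums over
    the unmasked region, optionally normalized to [0, 2*pi], and each coordinate c is
    expanded into num_pos_feats channels as

        PE[2i]   = sin(c / temperature**(2i / num_pos_feats))
        PE[2i+1] = cos(c / temperature**(2i / num_pos_feats))

    with the y-encoding and x-encoding concatenated along the channel dimension, giving
    2 * num_pos_feats channels in total.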
24 | """ 25 | def __init__(self, num_pos_feats=64, temperature=10000, normalize=False, scale=None): 26 | super().__init__() 27 | self.num_pos_feats = num_pos_feats 28 | self.temperature = temperature 29 | self.normalize = normalize 30 | if scale is not None and normalize is False: 31 | raise ValueError("normalize should be True if scale is passed") 32 | if scale is None: 33 | scale = 2 * math.pi 34 | self.scale = scale 35 | 36 | def forward(self, tensor_list: NestedTensor): 37 | x = tensor_list.tensors 38 | mask = tensor_list.mask 39 | assert mask is not None 40 | not_mask = ~mask 41 | y_embed = not_mask.cumsum(1, dtype=torch.float32) 42 | x_embed = not_mask.cumsum(2, dtype=torch.float32) 43 | if self.normalize: 44 | eps = 1e-6 45 | y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + eps) * self.scale 46 | x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale 47 | 48 | dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) 49 | dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) 50 | 51 | pos_x = x_embed[:, :, :, None] / dim_t 52 | pos_y = y_embed[:, :, :, None] / dim_t 53 | pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3) 54 | pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3) 55 | pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) 56 | return pos 57 | 58 | 59 | class PositionEmbeddingLearned(nn.Module): 60 | """ 61 | Absolute pos embedding, learned. 62 | """ 63 | def __init__(self, num_pos_feats=256): 64 | super().__init__() 65 | self.row_embed = nn.Embedding(50, num_pos_feats) 66 | self.col_embed = nn.Embedding(50, num_pos_feats) 67 | self.reset_parameters() 68 | 69 | def reset_parameters(self): 70 | nn.init.uniform_(self.row_embed.weight) 71 | nn.init.uniform_(self.col_embed.weight) 72 | 73 | def forward(self, tensor_list: NestedTensor): 74 | x = tensor_list.tensors 75 | h, w = x.shape[-2:] 76 | i = torch.arange(w, device=x.device) 77 | j = torch.arange(h, device=x.device) 78 | x_emb = self.col_embed(i) 79 | y_emb = self.row_embed(j) 80 | pos = torch.cat([ 81 | x_emb.unsqueeze(0).repeat(h, 1, 1), 82 | y_emb.unsqueeze(1).repeat(1, w, 1), 83 | ], dim=-1).permute(2, 0, 1).unsqueeze(0).repeat(x.shape[0], 1, 1, 1) 84 | return pos 85 | 86 | 87 | def build_position_encoding(args): 88 | N_steps = args.hidden_dim // 2 89 | if args.position_embedding in ('v2', 'sine'): 90 | # TODO find a better way of exposing other arguments 91 | position_embedding = PositionEmbeddingSine(N_steps, normalize=True) 92 | elif args.position_embedding in ('v3', 'learned'): 93 | position_embedding = PositionEmbeddingLearned(N_steps) 94 | else: 95 | raise ValueError(f"not supported {args.position_embedding}") 96 | 97 | return position_embedding 98 | -------------------------------------------------------------------------------- /models/modeling/transformer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 2 | """ 3 | DETR Transformer class. 
4 | Copy-paste from torch.nn.Transformer with modifications: 5 | * positional encodings are passed in MHattention 6 | * extra LN at the end of encoder is removed 7 | * decoder returns a stack of activations from all decoding layers 8 | """ 9 | import copy 10 | from typing import Optional, List 11 | 12 | import torch 13 | import torch.nn.functional as F 14 | from torch.nn.init import xavier_uniform_, constant_, uniform_, normal_ 15 | from torch import nn, Tensor 16 | 17 | 18 | class Transformer(nn.Module): 19 | """ 20 | Modified based on deformable transformer to enable multi-scale. 21 | """ 22 | def __init__(self, d_model=512, nhead=8, num_encoder_layers=6, 23 | num_decoder_layers=6, dim_feedforward=2048, dropout=0.1, 24 | activation="relu", normalize_before=False, num_feature_levels=1, 25 | return_intermediate_dec=False): 26 | super().__init__() 27 | 28 | encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward, 29 | dropout, activation, normalize_before) 30 | encoder_norm = nn.LayerNorm(d_model) if normalize_before else None 31 | self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm) 32 | 33 | decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward, 34 | dropout, activation, normalize_before) 35 | decoder_norm = nn.LayerNorm(d_model) 36 | self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm, 37 | return_intermediate=return_intermediate_dec) 38 | 39 | self.level_embed = nn.Parameter(torch.Tensor(num_feature_levels, d_model)) 40 | self._reset_parameters() 41 | 42 | self.d_model = d_model 43 | self.nhead = nhead 44 | 45 | def _reset_parameters(self): 46 | for p in self.parameters(): 47 | if p.dim() > 1: 48 | nn.init.xavier_uniform_(p) 49 | normal_(self.level_embed) 50 | 51 | def forward(self, src, mask, pos_embed, query_embed=None, lang_feat=None): 52 | src_flatten = [] 53 | mask_flatten = [] 54 | lvl_pos_embed_flatten = [] 55 | for lvl, (src, mask, pos_embed) in enumerate(zip(srcs, masks, pos_embeds)): 56 | bs, c, h, w = src.shape 57 | src = src.flatten(2).transpose(1, 2) 58 | mask = mask.flatten(1) 59 | pos_embed = pos_embed.flatten(2).transpose(1, 2) 60 | lvl_pos_embed = pos_embed + self.level_embed[lvl].view(1, 1, -1) 61 | lvl_pos_embed_flatten.append(lvl_pos_embed) 62 | src_flatten.append(src) 63 | mask_flatten.append(mask) 64 | src_flatten = torch.cat(src_flatten, 1).transpose(0, 1) 65 | mask_flatten = torch.cat(mask_flatten, 1).transpose(0, 1) 66 | lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1).transpose(0, 1) 67 | 68 | query_embed, tgt = torch.split(query_embed, c, dim=1) 69 | query_embed = query_embed.unsqueeze(1).expand(-1, bs, -1) 70 | tgt = tgt.unsqueeze(1).expand(-1, bs, -1) 71 | lang_feat = lang_feat.transpose(0, 1) 72 | 73 | query_embed = query_embed + lang_feat 74 | 75 | memory = self.encoder(src_flatten, src_key_padding_mask=mask_flatten, pos=lvl_pos_embed_flatten) 76 | hs = self.decoder(tgt, memory, memory_key_padding_mask=mask_flatten, 77 | pos=lvl_pos_embed_flatten, query_pos=query_embed) 78 | return hs.transpose(1, 2), #memory.permute(1, 2, 0).view(bs, c, h, w) 79 | 80 | 81 | class TransformerEncoder(nn.Module): 82 | 83 | def __init__(self, encoder_layer, num_layers, norm=None): 84 | super().__init__() 85 | self.layers = _get_clones(encoder_layer, num_layers) 86 | self.num_layers = num_layers 87 | self.norm = norm 88 | 89 | def forward(self, src, 90 | mask: Optional[Tensor] = None, 91 | src_key_padding_mask: Optional[Tensor] = None, 92 | pos: Optional[Tensor] = None): 93 
| output = src 94 | 95 | for layer in self.layers: 96 | output = layer(output, src_mask=mask, 97 | src_key_padding_mask=src_key_padding_mask, pos=pos) 98 | 99 | if self.norm is not None: 100 | output = self.norm(output) 101 | 102 | return output 103 | 104 | 105 | class TransformerDecoder(nn.Module): 106 | 107 | def __init__(self, decoder_layer, num_layers, norm=None, return_intermediate=False): 108 | super().__init__() 109 | self.layers = _get_clones(decoder_layer, num_layers) 110 | self.num_layers = num_layers 111 | self.norm = norm 112 | self.return_intermediate = return_intermediate 113 | 114 | def forward(self, tgt, memory, 115 | tgt_mask: Optional[Tensor] = None, 116 | memory_mask: Optional[Tensor] = None, 117 | tgt_key_padding_mask: Optional[Tensor] = None, 118 | memory_key_padding_mask: Optional[Tensor] = None, 119 | pos: Optional[Tensor] = None, 120 | query_pos: Optional[Tensor] = None): 121 | output = tgt 122 | 123 | intermediate = [] 124 | 125 | for layer in self.layers: 126 | output = layer(output, memory, tgt_mask=tgt_mask, 127 | memory_mask=memory_mask, 128 | tgt_key_padding_mask=tgt_key_padding_mask, 129 | memory_key_padding_mask=memory_key_padding_mask, 130 | pos=pos, query_pos=query_pos) 131 | if self.return_intermediate: 132 | intermediate.append(self.norm(output)) 133 | 134 | if self.norm is not None: 135 | output = self.norm(output) 136 | if self.return_intermediate: 137 | intermediate.pop() 138 | intermediate.append(output) 139 | 140 | if self.return_intermediate: 141 | return torch.stack(intermediate) 142 | 143 | return output.unsqueeze(0) 144 | 145 | 146 | class TransformerEncoderLayer(nn.Module): 147 | 148 | def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, 149 | activation="relu", normalize_before=False): 150 | super().__init__() 151 | self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) 152 | # Implementation of Feedforward model 153 | self.linear1 = nn.Linear(d_model, dim_feedforward) 154 | self.dropout = nn.Dropout(dropout) 155 | self.linear2 = nn.Linear(dim_feedforward, d_model) 156 | 157 | self.norm1 = nn.LayerNorm(d_model) 158 | self.norm2 = nn.LayerNorm(d_model) 159 | self.dropout1 = nn.Dropout(dropout) 160 | self.dropout2 = nn.Dropout(dropout) 161 | 162 | self.activation = _get_activation_fn(activation) 163 | self.normalize_before = normalize_before 164 | 165 | def with_pos_embed(self, tensor, pos: Optional[Tensor]): 166 | return tensor if pos is None else tensor + pos 167 | 168 | def forward_post(self, 169 | src, 170 | src_mask: Optional[Tensor] = None, 171 | src_key_padding_mask: Optional[Tensor] = None, 172 | pos: Optional[Tensor] = None): 173 | q = k = self.with_pos_embed(src, pos) 174 | src2 = self.self_attn(q, k, value=src, attn_mask=src_mask, 175 | key_padding_mask=src_key_padding_mask)[0] 176 | src = src + self.dropout1(src2) 177 | src = self.norm1(src) 178 | src2 = self.linear2(self.dropout(self.activation(self.linear1(src)))) 179 | src = src + self.dropout2(src2) 180 | src = self.norm2(src) 181 | return src 182 | 183 | def forward_pre(self, src, 184 | src_mask: Optional[Tensor] = None, 185 | src_key_padding_mask: Optional[Tensor] = None, 186 | pos: Optional[Tensor] = None): 187 | src2 = self.norm1(src) 188 | q = k = self.with_pos_embed(src2, pos) 189 | src2 = self.self_attn(q, k, value=src2, attn_mask=src_mask, 190 | key_padding_mask=src_key_padding_mask)[0] 191 | src = src + self.dropout1(src2) 192 | src2 = self.norm2(src) 193 | src2 = self.linear2(self.dropout(self.activation(self.linear1(src2)))) 194 | src 
= src + self.dropout2(src2) 195 | return src 196 | 197 | def forward(self, src, 198 | src_mask: Optional[Tensor] = None, 199 | src_key_padding_mask: Optional[Tensor] = None, 200 | pos: Optional[Tensor] = None): 201 | if self.normalize_before: 202 | return self.forward_pre(src, src_mask, src_key_padding_mask, pos) 203 | return self.forward_post(src, src_mask, src_key_padding_mask, pos) 204 | 205 | 206 | class TransformerDecoderLayer(nn.Module): 207 | 208 | def __init__(self, d_model, nhead, dim_feedforward=2048, dropout=0.1, 209 | activation="relu", normalize_before=False): 210 | super().__init__() 211 | self.self_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) 212 | self.multihead_attn = nn.MultiheadAttention(d_model, nhead, dropout=dropout) 213 | # Implementation of Feedforward model 214 | self.linear1 = nn.Linear(d_model, dim_feedforward) 215 | self.dropout = nn.Dropout(dropout) 216 | self.linear2 = nn.Linear(dim_feedforward, d_model) 217 | 218 | self.norm1 = nn.LayerNorm(d_model) 219 | self.norm2 = nn.LayerNorm(d_model) 220 | self.norm3 = nn.LayerNorm(d_model) 221 | self.dropout1 = nn.Dropout(dropout) 222 | self.dropout2 = nn.Dropout(dropout) 223 | self.dropout3 = nn.Dropout(dropout) 224 | 225 | self.activation = _get_activation_fn(activation) 226 | self.normalize_before = normalize_before 227 | 228 | def with_pos_embed(self, tensor, pos: Optional[Tensor]): 229 | return tensor if pos is None else tensor + pos 230 | 231 | def forward_post(self, tgt, memory, 232 | tgt_mask: Optional[Tensor] = None, 233 | memory_mask: Optional[Tensor] = None, 234 | tgt_key_padding_mask: Optional[Tensor] = None, 235 | memory_key_padding_mask: Optional[Tensor] = None, 236 | pos: Optional[Tensor] = None, 237 | query_pos: Optional[Tensor] = None): 238 | q = k = self.with_pos_embed(tgt, query_pos) 239 | tgt2 = self.self_attn(q, k, value=tgt, attn_mask=tgt_mask, 240 | key_padding_mask=tgt_key_padding_mask)[0] 241 | tgt = tgt + self.dropout1(tgt2) 242 | tgt = self.norm1(tgt) 243 | tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt, query_pos), 244 | key=self.with_pos_embed(memory, pos), 245 | value=memory, attn_mask=memory_mask, 246 | key_padding_mask=memory_key_padding_mask)[0] 247 | tgt = tgt + self.dropout2(tgt2) 248 | tgt = self.norm2(tgt) 249 | tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt)))) 250 | tgt = tgt + self.dropout3(tgt2) 251 | tgt = self.norm3(tgt) 252 | return tgt 253 | 254 | def forward_pre(self, tgt, memory, 255 | tgt_mask: Optional[Tensor] = None, 256 | memory_mask: Optional[Tensor] = None, 257 | tgt_key_padding_mask: Optional[Tensor] = None, 258 | memory_key_padding_mask: Optional[Tensor] = None, 259 | pos: Optional[Tensor] = None, 260 | query_pos: Optional[Tensor] = None): 261 | tgt2 = self.norm1(tgt) 262 | q = k = self.with_pos_embed(tgt2, query_pos) 263 | tgt2 = self.self_attn(q, k, value=tgt2, attn_mask=tgt_mask, 264 | key_padding_mask=tgt_key_padding_mask)[0] 265 | tgt = tgt + self.dropout1(tgt2) 266 | tgt2 = self.norm2(tgt) 267 | tgt2 = self.multihead_attn(query=self.with_pos_embed(tgt2, query_pos), 268 | key=self.with_pos_embed(memory, pos), 269 | value=memory, attn_mask=memory_mask, 270 | key_padding_mask=memory_key_padding_mask)[0] 271 | tgt = tgt + self.dropout2(tgt2) 272 | tgt2 = self.norm3(tgt) 273 | tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2)))) 274 | tgt = tgt + self.dropout3(tgt2) 275 | return tgt 276 | 277 | def forward(self, tgt, memory, 278 | tgt_mask: Optional[Tensor] = None, 279 | memory_mask: Optional[Tensor] 
= None, 280 | tgt_key_padding_mask: Optional[Tensor] = None, 281 | memory_key_padding_mask: Optional[Tensor] = None, 282 | pos: Optional[Tensor] = None, 283 | query_pos: Optional[Tensor] = None): 284 | if self.normalize_before: 285 | return self.forward_pre(tgt, memory, tgt_mask, memory_mask, 286 | tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos) 287 | return self.forward_post(tgt, memory, tgt_mask, memory_mask, 288 | tgt_key_padding_mask, memory_key_padding_mask, pos, query_pos) 289 | 290 | 291 | def _get_clones(module, N): 292 | return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) 293 | 294 | def build_transformer(args): 295 | return Transformer( 296 | d_model=args.hidden_dim, 297 | nhead=args.nheads, 298 | num_encoder_layers=args.enc_layers, 299 | num_decoder_layers=args.dec_layers, 300 | dim_feedforward=args.dim_feedforward, 301 | dropout=args.dropout, 302 | activation="relu", 303 | return_intermediate_dec=True, 304 | num_feature_levels=args.num_feature_levels) 305 | 306 | def _get_activation_fn(activation): 307 | """Return an activation function given a string""" 308 | if activation == "relu": 309 | return F.relu 310 | if activation == "gelu": 311 | return F.gelu 312 | if activation == "glu": 313 | return F.glu 314 | raise RuntimeError(F"activation should be relu/gelu, not {activation}.") -------------------------------------------------------------------------------- /models/post_process.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | import math 5 | from util import box_ops 6 | 7 | class PostProcessVGOnePhrase(nn.Module): 8 | """ This module converts the model's output into the format expected by the coco api""" 9 | 10 | @torch.no_grad() 11 | def forward(self, outputs, target_sizes, scale_to_original_shape=False): 12 | """ Perform the computation 13 | Parameters: 14 | outputs: raw outputs of the model 15 | target_sizes: tensor of dimension [batch_size x 2] containing the size of each images of the batch 16 | For evaluation, this must be the original image size (before any data augmentation) 17 | For visualization, this should be the image size after data augment, but before padding 18 | """ 19 | out_bbox = outputs['pred_boxes'] 20 | bs, k, _ = out_bbox.shape 21 | 22 | assert len(out_bbox) == len(target_sizes) 23 | assert target_sizes.shape[1] == 2 24 | 25 | # TODO for multiple predictions 26 | # print("out_bbox.shape:", out_bbox.shape) 27 | out_bbox = out_bbox[:, 0, :] 28 | boxes = box_ops.box_cxcywh_to_xyxy(out_bbox) 29 | 30 | # and from relative [0, 1] to absolute [0, height] coordinates 31 | if scale_to_original_shape: 32 | img_h, img_w = target_sizes.unbind(1) 33 | scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1) 34 | boxes = boxes * scale_fct 35 | 36 | # print("boxes.shape:", boxes.shape) 37 | # return boxes 38 | results = [{'boxes': boxes[i:i+1, :]} for i in range(bs)] 39 | return results 40 | 41 | class PostProcessVGMultiPhrase(nn.Module): 42 | """ This module converts the model's output into the format expected by the coco api""" 43 | 44 | @torch.no_grad() 45 | def forward(self, outputs, target_sizes, scale_to_original_shape=False): 46 | """ Perform the computation 47 | Parameters: 48 | outputs: raw outputs of the model 49 | target_sizes: tensor of dimension [batch_size x 2] containing the size of each images of the batch 50 | For evaluation, this must be the original image size (before any data augmentation) 51 | For 
visualization, this should be the image size after data augment, but before padding 52 | """ 53 | out_bbox = outputs['pred_boxes'] 54 | bsz, num_phrase, k, _ = out_bbox.shape 55 | mask = outputs['phrase_mask'].view(bsz, num_phrase, k, -1) 56 | # print(out_bbox.shape, mask.shape) 57 | 58 | target_boxes = [] 59 | assert bsz == len(target_sizes) 60 | for i in range(bsz): 61 | mask_i = mask[i] 62 | pred_i = torch.masked_select(out_bbox[i], mask_i).view(-1, k, 4) 63 | 64 | assert target_sizes.shape[1] == 2 65 | 66 | # TODO for multiple predictions 67 | # print("out_bbox.shape:", out_bbox.shape) 68 | out_bbox_i = pred_i[:, 0, :] 69 | boxes = box_ops.box_cxcywh_to_xyxy(out_bbox_i) 70 | 71 | # and from relative [0, 1] to absolute [0, height] coordinates 72 | if scale_to_original_shape: 73 | img_h, img_w = target_sizes[i:i+1].unbind(1) 74 | scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1) 75 | # print(boxes, scale_fct) 76 | boxes = boxes * scale_fct 77 | 78 | target_boxes.append(boxes) 79 | 80 | # print("boxes.shape:", boxes.shape) 81 | # return boxes 82 | results = [{'boxes': target_boxes[i]} for i in range(bsz)] 83 | return results -------------------------------------------------------------------------------- /models/reftr.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import torch 3 | import torch.nn.functional as F 4 | 5 | from typing import Optional, List 6 | from torch.nn.init import xavier_uniform_, constant_, uniform_, normal_ 7 | from torch import nn, Tensor 8 | from models.modeling.transformer import TransformerDecoder, TransformerDecoderLayer ,TransformerEncoder, TransformerEncoderLayer 9 | 10 | class VLTransformer(nn.Module): 11 | def __init__(self, d_model=512, nhead=8, num_encoder_layers=6, 12 | num_decoder_layers=6, dim_feedforward=2048, dropout=0.1, 13 | activation="relu", normalize_before=False, num_feature_levels=1, 14 | num_queries=1 ,return_intermediate_dec=False, max_lang_seq=128): 15 | super().__init__() 16 | # Positional embedding and feat type embedding 17 | # token type embedding to indicate image feature vs language feature 18 | self.max_lang_seq = max_lang_seq 19 | self.num_queries = num_queries 20 | self.d_model = d_model 21 | self.nhead = nhead 22 | self.lang_pos_embeddings = nn.Embedding(max_lang_seq, d_model) 23 | self.token_type_embeddings = nn.Embedding(2, d_model) 24 | 25 | # Transformer Encoder as encoder 26 | self.level_embed = nn.Parameter(torch.Tensor(num_feature_levels, d_model)) 27 | encoder_layer = TransformerEncoderLayer(d_model, nhead, dim_feedforward, 28 | dropout, activation, normalize_before) 29 | encoder_norm = nn.LayerNorm(d_model) if normalize_before else None 30 | self.encoder = TransformerEncoder(encoder_layer, num_encoder_layers, encoder_norm) 31 | 32 | # if num_decoder_layers < 0, no decoder is used 33 | self.use_decoder = num_decoder_layers > 0 34 | if self.use_decoder: 35 | decoder_layer = TransformerDecoderLayer(d_model, nhead, dim_feedforward, 36 | dropout, activation, normalize_before) 37 | decoder_norm = nn.LayerNorm(d_model) 38 | self.decoder = TransformerDecoder(decoder_layer, num_decoder_layers, decoder_norm, 39 | return_intermediate=return_intermediate_dec) 40 | else: 41 | print("No decoder is used!") 42 | 43 | self._reset_parameters() 44 | 45 | def _reset_parameters(self): 46 | for p in self.parameters(): 47 | if p.dim() > 1: 48 | nn.init.xavier_uniform_(p) 49 | normal_(self.level_embed) 50 | 51 | def process_img_feat(self, img_srcs, img_masks, img_pos_embeds): 52 
| src_flatten = [] 53 | mask_flatten = [] 54 | lvl_pos_embed_flatten = [] 55 | for lvl, (src, mask, pos_embed) in enumerate(zip(img_srcs, img_masks, img_pos_embeds)): 56 | bs, c, h, w = src.shape 57 | src = src.flatten(2).transpose(1, 2) 58 | mask = mask.flatten(1) 59 | pos_embed = pos_embed.flatten(2).transpose(1, 2) 60 | lvl_pos_embed = pos_embed + self.level_embed[lvl].view(1, 1, -1) 61 | lvl_pos_embed_flatten.append(lvl_pos_embed) 62 | src_flatten.append(src) 63 | mask_flatten.append(mask) 64 | img_src_flatten = torch.cat(src_flatten, 1)#.transpose(0, 1) 65 | img_mask_flatten = torch.cat(mask_flatten, 1)#.transpose(0, 1) 66 | img_lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1)#.transpose(0, 1) 67 | 68 | # Add token type embedding if available 69 | bsz, seq_length, dim = img_src_flatten.shape 70 | if self.token_type_embeddings is not None: 71 | token_type_ids = torch.ones((bsz, seq_length), dtype=torch.long, device=img_src_flatten.device) 72 | token_type_embeddings = self.token_type_embeddings(token_type_ids) 73 | img_lvl_pos_embed_flatten = img_lvl_pos_embed_flatten + token_type_embeddings 74 | 75 | return img_mask_flatten,\ 76 | img_src_flatten.transpose(0, 1),\ 77 | img_lvl_pos_embed_flatten.transpose(0, 1) 78 | 79 | def process_lang_feat(self, lang_srcs, lang_masks): 80 | bsz, seq_length, dim = lang_srcs.shape 81 | assert seq_length <= self.max_lang_seq 82 | position_ids = torch.arange(seq_length, dtype=torch.long, device=lang_srcs.device) 83 | position_ids = position_ids.unsqueeze(0).expand(bsz, -1) 84 | position_embeddings = self.lang_pos_embeddings(position_ids) 85 | 86 | if self.token_type_embeddings is not None: 87 | token_type_ids = torch.zeros((bsz, seq_length), dtype=torch.long, device=lang_srcs.device) 88 | token_type_embeddings = self.token_type_embeddings(token_type_ids) 89 | position_embeddings = position_embeddings + token_type_embeddings 90 | 91 | # Non-zero area is ignored 92 | lang_masks = lang_masks.logical_not() 93 | assert (lang_masks[:, 0] == False).all() 94 | 95 | return lang_masks,\ 96 | lang_srcs.transpose(0, 1),\ 97 | position_embeddings.transpose(0, 1) 98 | 99 | def encode(self, img_srcs, img_masks, img_pos_embeds, 100 | lang_srcs, lang_masks): 101 | # create image feature and mask & pos info 102 | 103 | # print(f"img_srcs/img_masks/img_pos_embeds: {img_srcs.shape} {img_masks.shape} {img_pos_embeds.shape}") 104 | img_masks, img_srcs, img_pos_embeds =\ 105 | self.process_img_feat(img_srcs, img_masks, img_pos_embeds) 106 | # print(f"img_srcs/img_masks/img_pos_embeds: {img_srcs.shape} {img_masks.shape} {img_pos_embeds.shape}") 107 | # print(img_masks) 108 | 109 | # print(f"lang_srcs/lang_masks: {lang_srcs.shape} {lang_masks.shape}") 110 | lang_masks, lang_srcs, lang_pos_embeds =\ 111 | self.process_lang_feat(lang_srcs, lang_masks) 112 | # print(f"lang_srcs/lang_masks/lang_pos_embeds: {lang_srcs.shape} {lang_masks.shape} {lang_pos_embeds.shape}") 113 | # print(lang_masks) 114 | 115 | masks = torch.cat([lang_masks, img_masks], dim=1) 116 | srcs = torch.cat([lang_srcs, img_srcs], dim=0) 117 | pos_embeds = torch.cat([lang_pos_embeds, img_pos_embeds], dim=0) 118 | 119 | memory = self.encoder(srcs, src_key_padding_mask=masks, pos=pos_embeds) 120 | return memory, masks, pos_embeds 121 | 122 | def forward(self, img_srcs, img_masks, img_pos_embeds, 123 | lang_srcs, lang_masks, 124 | query=None, query_mask=None, query_pos=None): 125 | 126 | memory, masks, pos_embeds =\ 127 | self.encode(img_srcs, img_masks, img_pos_embeds, lang_srcs, lang_masks) 128 | 129 | 
if self.use_decoder: 130 | # TODO here 131 | hs = self.decoder(query, memory, 132 | memory_key_padding_mask=masks, 133 | tgt_key_padding_mask=query_mask, 134 | pos=pos_embeds, query_pos=query_pos) 135 | else: 136 | hs = memory.unsqueeze(0) 137 | return hs.transpose(1, 2) 138 | 139 | 140 | def build_vl_transformer(args): 141 | return VLTransformer( 142 | d_model=args.hidden_dim, 143 | nhead=args.nheads, 144 | num_encoder_layers=args.enc_layers, 145 | num_decoder_layers=args.dec_layers, 146 | dim_feedforward=args.dim_feedforward, 147 | dropout=args.dropout, 148 | activation="relu", 149 | num_feature_levels=args.num_feature_levels, 150 | return_intermediate_dec=True, 151 | max_lang_seq=args.max_lang_seq 152 | ) 153 | -------------------------------------------------------------------------------- /models/reftr_transformer.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn.functional as F 3 | from torch import nn 4 | 5 | from util.misc import (NestedTensor, nested_tensor_from_tensor_list) 6 | from models.modeling.backbone import build_backbone, MLP 7 | 8 | from transformers import RobertaModel, BertModel 9 | from models.reftr import build_vl_transformer 10 | from models.criterion import CriterionVGOnePhrase, CriterionVGMultiPhrase 11 | from models.post_process import PostProcessVGOnePhrase, PostProcessVGMultiPhrase 12 | 13 | 14 | def mlp_mapping(input_dim, output_dim): 15 | return torch.nn.Sequential( 16 | nn.Linear(input_dim, output_dim), 17 | nn.LayerNorm(output_dim), 18 | nn.ReLU(), 19 | nn.Dropout(0.1), 20 | nn.Linear(output_dim, output_dim), 21 | nn.LayerNorm(output_dim), 22 | nn.ReLU(), 23 | ) 24 | 25 | 26 | class QueryEncoder(nn.Module): 27 | def __init__(self, num_queries_per_phrase, hidden_dim, ablation): 28 | super(QueryEncoder, self).__init__() 29 | self.ablation = ablation 30 | self.hidden_dim = hidden_dim 31 | self.query_embed = nn.Embedding(num_queries_per_phrase, hidden_dim*2) 32 | self.linear1 = nn.Linear(hidden_dim, hidden_dim) 33 | self.linear2 = nn.Linear(hidden_dim, hidden_dim) 34 | self.linear3 = nn.Linear(hidden_dim, hidden_dim) 35 | self.fuse_encoder_query = mlp_mapping(hidden_dim*2, hidden_dim) 36 | self.context_out = nn.Sequential( 37 | nn.Linear(hidden_dim, hidden_dim), 38 | nn.LayerNorm(hidden_dim) 39 | ) 40 | 41 | def forward(self, lang_context_feat, lang_query_feat, mask_query_context): 42 | learnable_querys = self.query_embed.weight 43 | bs, n_ph, _ = lang_query_feat.shape 44 | n_q = learnable_querys.size(0) 45 | # n_context = lang_context_feat.size(1) 46 | 47 | # attended reduce 48 | k = self.linear1(lang_context_feat[:, 0:1, :]) 49 | q = self.linear2(lang_context_feat).transpose(1, 2) 50 | v = self.linear3(lang_context_feat).unsqueeze(1) # b, 1, n_context, -1 51 | att_weight = torch.bmm(k, q) 52 | att_weight = att_weight.expand(-1, n_ph, -1) 53 | att_weight = att_weight.masked_fill(mask_query_context, float('-inf')) 54 | att_weight_normalized = F.softmax(att_weight, dim=-1).unsqueeze(-1) # b, n_ph, n_context, -1 55 | context_feats = self.context_out((v * att_weight_normalized).sum(dim=-2)) # b, n_ph, -1 56 | 57 | # residual connection 58 | context_feats = lang_context_feat[:, None, 0, :] + context_feats 59 | 60 | lang_query_feat = torch.cat([context_feats, lang_query_feat], dim=-1) 61 | lang_query_feat = self.fuse_encoder_query(lang_query_feat) 62 | phrase_queries = lang_query_feat.view(bs, n_ph, 1, -1).repeat(1, 1, 1, 2) +\ 63 | learnable_querys.view(1, 1, n_q, -1) 64 | phrase_queries = 
phrase_queries.view(bs, n_ph*n_q, -1).transpose(0, 1) 65 | 66 | return torch.split(phrase_queries, self.hidden_dim, dim=-1) 67 | 68 | 69 | class RefTR(nn.Module): 70 | def __init__(self, img_backbone, lang_backbone, vl_transformer, 71 | num_feature_levels=1, num_queries_per_phrase=1, 72 | freeze_lang_backbone=False, aux_loss=False, ablation='none'): 73 | super(RefTR, self).__init__() 74 | # print("ABLATION !!!", ablation) 75 | self.img_backbone = img_backbone 76 | self.lang_backbone = lang_backbone 77 | self.vl_transformer = vl_transformer 78 | self.num_feature_levels = num_feature_levels 79 | self.num_queries_per_phrase = num_queries_per_phrase 80 | self.hidden_dim = hidden_dim = vl_transformer.d_model 81 | print("Model dim:", hidden_dim) 82 | self.bbox_embed = MLP(hidden_dim, hidden_dim, 4, 3) 83 | 84 | self.lang_hidden_dim = lang_backbone.config.hidden_size 85 | print("Language model dim:", self.lang_hidden_dim) 86 | self.map_sentence = mlp_mapping(self.lang_hidden_dim, hidden_dim) 87 | 88 | # TODO here 89 | self.use_decoder = self.vl_transformer.use_decoder 90 | if self.use_decoder: 91 | self.map_phrase = mlp_mapping(self.lang_hidden_dim, hidden_dim) 92 | self.query_encoder = QueryEncoder( 93 | num_queries_per_phrase=num_queries_per_phrase, 94 | hidden_dim=hidden_dim, 95 | ablation='none' 96 | ) 97 | 98 | # Set up for Feature Payramid 99 | if num_feature_levels > 1: 100 | num_backbone_outs = len(self.img_backbone.strides)-1 101 | input_proj_list = [] 102 | for l_ in range(num_backbone_outs): 103 | l_ = l_ + 1 104 | in_channels = self.img_backbone.num_channels[l_] 105 | print(f"layer {l_}: {self.img_backbone.num_channels[l_]}") 106 | input_proj_list.append(nn.Sequential( 107 | nn.Conv2d(in_channels, hidden_dim, kernel_size=1), 108 | nn.GroupNorm(32, hidden_dim), 109 | )) 110 | for l_ in range(num_feature_levels - num_backbone_outs): 111 | print(f"layer {l_}: {in_channels}") 112 | input_proj_list.append(nn.Sequential( 113 | nn.Conv2d(in_channels, hidden_dim, kernel_size=3, stride=2, padding=1), 114 | nn.GroupNorm(32, hidden_dim), 115 | )) 116 | in_channels = hidden_dim 117 | self.input_proj = nn.ModuleList(input_proj_list) 118 | else: 119 | # TODO fix this for other network 120 | assert self.img_backbone.num_channels[-1] == 2048 121 | self.input_proj = nn.ModuleList([ 122 | nn.Sequential( 123 | nn.Conv2d(self.img_backbone.num_channels[-1], hidden_dim, kernel_size=1), 124 | nn.GroupNorm(32, hidden_dim), 125 | )]) 126 | 127 | self.aux_loss = aux_loss 128 | self.freeze_lang_backbone = freeze_lang_backbone 129 | 130 | # initialization 131 | nn.init.constant_(self.bbox_embed.layers[-1].weight.data, 0) 132 | nn.init.constant_(self.bbox_embed.layers[-1].bias.data, 0) 133 | for proj in self.input_proj: 134 | nn.init.xavier_uniform_(proj[0].weight, gain=1) 135 | nn.init.constant_(proj[0].bias, 0) 136 | 137 | def init_from_pretrained_detr(self, state_dict): 138 | """ 139 | Initialize from pretrained DETR. 
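        Only two groups of checkpoint weights are consumed (see below): keys prefixed with
        'backbone.' are re-rooted into self.img_backbone, and keys containing
        'transformer.encoder' are re-rooted into self.vl_transformer.encoder, e.g.
        (illustrative key names only):

            'backbone.0.body.layer1.0.conv1.weight'       -> '0.body.layer1.0.conv1.weight'
            'transformer.encoder.layers.0.linear1.weight' -> 'layers.0.linear1.weight'

        Everything else in the checkpoint is ignored.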
140 | """ 141 | # print(state_dict.keys()) 142 | state_dict_backbone = {k.split('.', 1)[1]: v for k, v in state_dict.items() if k.split('.', 1)[0] == 'backbone'} 143 | state_dict_transformer_encoder = {k.split('.', 2)[2]: v for k, v in state_dict.items() if 'transformer.encoder' in k} 144 | self.img_backbone.load_state_dict(state_dict_backbone) 145 | self.vl_transformer.encoder.load_state_dict(state_dict_transformer_encoder) 146 | return 147 | 148 | def freeze_img_backbone(self): 149 | for param in self.backbone.parameters(): 150 | param.requires_grad = False 151 | 152 | def freeze_bert(self): 153 | """ 154 | Freeze for distributed training 155 | """ 156 | for param in self.textmodel.parameters(): 157 | param.requires_grad = False 158 | 159 | def forward(self, samples): 160 | # TODO 161 | img = samples["img"] 162 | 163 | # Visual Module 164 | srcs = [] 165 | masks = [] 166 | if not isinstance(img, NestedTensor): 167 | img = nested_tensor_from_tensor_list(img) 168 | img_features, pos = self.img_backbone(img) 169 | 170 | # FPN features & masks 171 | pos = pos[-2:] 172 | for l_, feat in enumerate(img_features[-2:]): 173 | src, mask = feat.decompose() 174 | srcs.append(self.input_proj[l_](src)) 175 | masks.append(mask) 176 | # print(f"l: {l} src/mask/pos: {srcs[-1].shape} / {mask.shape} / {pos[l].shape}") 177 | assert mask is not None 178 | if self.num_feature_levels > len(srcs): 179 | _len_srcs = len(srcs) 180 | for l_ in range(_len_srcs, self.num_feature_levels): 181 | if l_ == _len_srcs: 182 | src = self.input_proj[l_](img_features[-1].tensors) 183 | else: 184 | src = self.input_proj[l_](srcs[-1]) 185 | m = img.mask 186 | mask = F.interpolate(m[None].float(), size=src.shape[-2:]).to(torch.bool)[0] 187 | pos_l = self.img_backbone[1](NestedTensor(src, mask)).to(src.dtype) 188 | srcs.append(src) 189 | masks.append(mask) 190 | pos.append(pos_l) 191 | # print(f"l: {l} src/mask/pos: {src.shape} / {mask.shape} / {pos_l.shape}") 192 | 193 | # Language model 194 | sentence = samples["sentence"] 195 | sentence_mask = samples["sentence_mask"] 196 | # ---------------------------------------------- # 197 | # Ablation on context encoder 198 | # sentence_feat = self.lang_backbone.embeddings(sentence) 199 | # ---------------------------------------------- # 200 | sentence_feat, sentence_feat_pooled = self.lang_backbone(sentence, token_type_ids=None, attention_mask=sentence_mask)[0:2] 201 | sentence_feat = self.map_sentence(sentence_feat) 202 | 203 | # Process phrase queries 204 | n_q = self.num_queries_per_phrase 205 | bsz = sentence.size(0) 206 | if 'phrase' in samples.keys(): 207 | phrases = samples["phrase"] 208 | phrase_masks = samples["phrase_mask"] 209 | p_pos_l = samples['phrase_pos_l'] 210 | p_pos_r = samples['phrase_pos_r'] 211 | n_ph = phrases.size(1) 212 | assert n_ph == p_pos_l.size(1) 213 | 214 | # Get Phrase Representation 215 | phrases = phrases.view(bsz * n_ph, -1) 216 | phrase_masks = phrase_masks.view(bsz * n_ph, -1) 217 | phrase_pooled_feat = self.lang_backbone(phrases, token_type_ids=None, attention_mask=phrase_masks)[1] 218 | 219 | # p_len = p_pos_r - p_pos_l 220 | # TODO language len set to 90 in flickr Multiphrase setting 221 | # assert 90 == n_context 222 | 223 | # Set up phrase-specific mask on context 224 | mask_context = [] 225 | for i in range(bsz): 226 | for j in range(n_ph): 227 | mask = torch.ones_like(sentence_mask[0, :], device=sentence_mask.device) 228 | mask[p_pos_l[i, j]:p_pos_r[i, j]] = 0 229 | mask_context.append(mask) 230 | mask_context = 
torch.stack(mask_context).view(bsz, n_ph, -1).to(torch.bool) 231 | 232 | # Mask for Query Decoder input 233 | # TODO Hack here: Take the third mask of each phrase, 234 | # if 0, the phrase only contains "[CLS] [SEP]", ignore 235 | query_mask = phrase_masks.view(bsz, n_ph, -1)[:, :, 2:3] 236 | query_mask = query_mask.logical_not() 237 | query_mask = query_mask.expand(-1, -1, n_q) 238 | query_mask = query_mask.view(bsz, n_ph*n_q) 239 | else: 240 | n_ph = 1 241 | phrase_pooled_feat = sentence_feat_pooled 242 | sentence_len = sentence_mask.to(torch.int32).sum(-1) 243 | mask_context = sentence_mask.view(bsz, n_ph, -1).logical_not().to(torch.bool) 244 | # Mask out [CLS] and [SEP] 245 | mask_context[:, :, 0] = True 246 | for i in range(bsz): 247 | mask_context[i, :, sentence_len[i]-1] = True 248 | query_mask = torch.zeros((bsz, 1), device=sentence_mask.device).to(torch.bool) 249 | 250 | phrase_pooled_feat = self.map_phrase(phrase_pooled_feat).view(bsz, n_ph, -1) 251 | 252 | # print(f"phrase_queries {phrase_queries.shape} phrase_masks {phrase_masks.shape}") 253 | memory, memory_mask, memory_pos =\ 254 | self.vl_transformer.encode( 255 | img_srcs=srcs, 256 | img_masks=masks, 257 | img_pos_embeds=pos, 258 | lang_srcs=sentence_feat, 259 | lang_masks=sentence_mask 260 | ) 261 | memory_lang = memory[:sentence_feat.size(1)] 262 | query, query_pos =\ 263 | self.query_encoder( 264 | lang_context_feat=memory_lang.transpose(0, 1), 265 | lang_query_feat=phrase_pooled_feat, 266 | mask_query_context=mask_context 267 | ) 268 | 269 | hs = self.vl_transformer.decoder( 270 | tgt=query, 271 | memory=memory, 272 | tgt_key_padding_mask=query_mask, 273 | memory_key_padding_mask=memory_mask, 274 | query_pos=query_pos, 275 | pos=memory_pos, 276 | ).transpose(1, 2) 277 | 278 | # print(f"hs: {hs.shape}") 279 | num_l = hs.size(0) 280 | hs = hs.view(num_l, bsz, n_ph, n_q, -1) 281 | # ----------------------------------------------# 282 | # Ablation on no decoder 283 | # hs = (query + query_pos).transpose(1, 2) 284 | # hs = hs.reshape(1, bsz, n_ph, n_q, -1) 285 | # ----------------------------------------------# 286 | # TODO this 287 | outputs_coord = self.bbox_embed(hs).sigmoid() 288 | if torch.isnan(outputs_coord).any(): 289 | print(outputs_coord) 290 | print(hs) 291 | print(query) 292 | 293 | out = {'pred_boxes': outputs_coord[-1], 'phrase_mask': query_mask.logical_not()} 294 | if self.aux_loss: 295 | out['aux_outputs'] = self._set_aux_loss(outputs_coord, query_mask.logical_not()) 296 | 297 | return out 298 | 299 | @torch.jit.unused 300 | def _set_aux_loss(self, outputs_coord, phrase_mask): 301 | # this is a workaround to make torchscript happy, as torchscript 302 | # doesn't support dictionary with non-homogeneous values, such 303 | # as a dict having both a Tensor and a list. 
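        # One dict is emitted per intermediate decoder layer (the final layer's prediction
        # is returned directly as 'pred_boxes' in forward()); the corresponding aux losses
        # are re-weighted through the '_{i}'-suffixed entries of weight_dict in build_reftr below.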
304 | return [{'pred_boxes': b, 'phrase_mask': phrase_mask} for b in outputs_coord[:-1]] 305 | 306 | 307 | def build_reftr(args): 308 | # num_classes = 1 # if args.dataset_file != 'coco' else 91 309 | device = torch.device(args.device) 310 | if args.no_decoder: 311 | args.dec_layers = 0 312 | 313 | img_backbone = build_backbone(args) 314 | vl_transformer = build_vl_transformer(args) 315 | if args.bert_model.split('-')[0] == 'roberta': 316 | lang_backbone = RobertaModel.from_pretrained(args.bert_model) 317 | else: 318 | lang_backbone = BertModel.from_pretrained(args.bert_model) 319 | 320 | weight_dict = {'loss_giou': args.giou_loss_coef, 'loss_bbox': args.bbox_loss_coef} 321 | # weight_dict['loss_giou'] = args.giou_loss_coef 322 | 323 | # TODO this is a hack 324 | if args.aux_loss: 325 | aux_weight_dict = {} 326 | for i in range(args.dec_layers - 1): 327 | aux_weight_dict.update({k + f'_{i}': v for k, v in weight_dict.items()}) 328 | aux_weight_dict.update({k + '_enc': v for k, v in weight_dict.items()}) 329 | weight_dict.update(aux_weight_dict) 330 | 331 | print("ABLATION !!!", args.ablation) 332 | 333 | model = RefTR( 334 | img_backbone=img_backbone, 335 | lang_backbone=lang_backbone, 336 | vl_transformer=vl_transformer, 337 | num_feature_levels=args.num_feature_levels, 338 | num_queries_per_phrase=args.num_queries_per_phrase, 339 | freeze_lang_backbone=args.freeze_bert, 340 | aux_loss=args.aux_loss, 341 | ablation=args.ablation 342 | ) 343 | criterion = CriterionVGMultiPhrase(weight_dict, losses=['boxes']) 344 | postprocessors = {'bbox': PostProcessVGMultiPhrase()} 345 | 346 | criterion.to(device) 347 | return model, criterion, postprocessors 348 | 349 | # if __name__ == "__main__": 350 | # import sys, argparse 351 | # sys.path.append(path_to_parent) 352 | # from main_vg import get_args_parser 353 | # parser = argparse.ArgumentParser('Deformable DETR training and evaluation script', parents=[get_args_parser()]) 354 | # args = parser.parse_args() 355 | # model, ce, postprocessors = build_model(args) 356 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pycocotools 2 | tqdm 3 | cython 4 | scipy 5 | -------------------------------------------------------------------------------- /tools/launch.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------------------------------------------------- 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # -------------------------------------------------------------------------------------------------------------------------- 6 | # Modified from https://github.com/pytorch/pytorch/blob/173f224570017b4b1a3a1a13d0bff280a54d9cd9/torch/distributed/launch.py 7 | # -------------------------------------------------------------------------------------------------------------------------- 8 | 9 | r""" 10 | `torch.distributed.launch` is a module that spawns up multiple distributed 11 | training processes on each of the training nodes. 12 | The utility can be used for single-node distributed training, in which one or 13 | more processes per node will be spawned. The utility can be used for either 14 | CPU training or GPU training. 
If the utility is used for GPU training, 15 | each distributed process will be operating on a single GPU. This can achieve 16 | well-improved single-node training performance. It can also be used in 17 | multi-node distributed training, by spawning up multiple processes on each node 18 | for well-improved multi-node distributed training performance as well. 19 | This will especially be benefitial for systems with multiple Infiniband 20 | interfaces that have direct-GPU support, since all of them can be utilized for 21 | aggregated communication bandwidth. 22 | In both cases of single-node distributed training or multi-node distributed 23 | training, this utility will launch the given number of processes per node 24 | (``--nproc_per_node``). If used for GPU training, this number needs to be less 25 | or euqal to the number of GPUs on the current system (``nproc_per_node``), 26 | and each process will be operating on a single GPU from *GPU 0 to 27 | GPU (nproc_per_node - 1)*. 28 | **How to use this module:** 29 | 1. Single-Node multi-process distributed training 30 | :: 31 | >>> python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_YOU_HAVE 32 | YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3 and all other 33 | arguments of your training script) 34 | 2. Multi-Node multi-process distributed training: (e.g. two nodes) 35 | Node 1: *(IP: 192.168.1.1, and has a free port: 1234)* 36 | :: 37 | >>> python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_YOU_HAVE 38 | --nnodes=2 --node_rank=0 --master_addr="192.168.1.1" 39 | --master_port=1234 YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3 40 | and all other arguments of your training script) 41 | Node 2: 42 | :: 43 | >>> python -m torch.distributed.launch --nproc_per_node=NUM_GPUS_YOU_HAVE 44 | --nnodes=2 --node_rank=1 --master_addr="192.168.1.1" 45 | --master_port=1234 YOUR_TRAINING_SCRIPT.py (--arg1 --arg2 --arg3 46 | and all other arguments of your training script) 47 | 3. To look up what optional arguments this module offers: 48 | :: 49 | >>> python -m torch.distributed.launch --help 50 | **Important Notices:** 51 | 1. This utilty and multi-process distributed (single-node or 52 | multi-node) GPU training currently only achieves the best performance using 53 | the NCCL distributed backend. Thus NCCL backend is the recommended backend to 54 | use for GPU training. 55 | 2. In your training program, you must parse the command-line argument: 56 | ``--local_rank=LOCAL_PROCESS_RANK``, which will be provided by this module. 57 | If your training program uses GPUs, you should ensure that your code only 58 | runs on the GPU device of LOCAL_PROCESS_RANK. This can be done by: 59 | Parsing the local_rank argument 60 | :: 61 | >>> import argparse 62 | >>> parser = argparse.ArgumentParser() 63 | >>> parser.add_argument("--local_rank", type=int) 64 | >>> args = parser.parse_args() 65 | Set your device to local rank using either 66 | :: 67 | >>> torch.cuda.set_device(arg.local_rank) # before your code runs 68 | or 69 | :: 70 | >>> with torch.cuda.device(arg.local_rank): 71 | >>> # your code to run 72 | 3. In your training program, you are supposed to call the following function 73 | at the beginning to start the distributed backend. You need to make sure that 74 | the init_method uses ``env://``, which is the only supported ``init_method`` 75 | by this module. 76 | :: 77 | torch.distributed.init_process_group(backend='YOUR BACKEND', 78 | init_method='env://') 79 | 4. 
In your training program, you can either use regular distributed functions 80 | or use the :func:`torch.nn.parallel.DistributedDataParallel` module. If your 81 | training program uses GPUs for training and you would like to use 82 | the :func:`torch.nn.parallel.DistributedDataParallel` module, 83 | here is how to configure it. 84 | :: 85 | model = torch.nn.parallel.DistributedDataParallel(model, 86 | device_ids=[args.local_rank], 87 | output_device=args.local_rank) 88 | Please ensure that the ``device_ids`` argument is set to be the only GPU device id 89 | that your code will be operating on. This is generally the local rank of the 90 | process. In other words, the ``device_ids`` needs to be ``[args.local_rank]``, 91 | and ``output_device`` needs to be ``args.local_rank`` in order to use this 92 | utility. 93 | 5. Another way to pass ``local_rank`` to the subprocesses is via the environment variable 94 | ``LOCAL_RANK``. This behavior is enabled when you launch the script with 95 | ``--use_env=True``. You must adjust the subprocess example above to replace 96 | ``args.local_rank`` with ``os.environ['LOCAL_RANK']``; the launcher 97 | will not pass ``--local_rank`` when you specify this flag. 98 | .. warning:: 99 | ``local_rank`` is NOT globally unique: it is only unique per process 100 | on a machine. Thus, don't use it to decide if you should, e.g., 101 | write to a networked filesystem. See 102 | https://github.com/pytorch/pytorch/issues/12042 for an example of 103 | how things can go wrong if you don't do this correctly. 104 | """ 105 | 106 | 107 | import sys 108 | import subprocess 109 | import os 110 | import socket 111 | from argparse import ArgumentParser, REMAINDER 112 | 113 | import torch 114 | 115 | 116 | def parse_args(): 117 | """ 118 | Helper function parsing the command line options 119 | @retval ArgumentParser 120 | """ 121 | parser = ArgumentParser(description="PyTorch distributed training launch " 122 | "helper utility that will spawn up " 123 | "multiple distributed processes") 124 | 125 | # Optional arguments for the launch helper 126 | parser.add_argument("--nnodes", type=int, default=1, 127 | help="The number of nodes to use for distributed " 128 | "training") 129 | parser.add_argument("--node_rank", type=int, default=0, 130 | help="The rank of the node for multi-node distributed " 131 | "training") 132 | parser.add_argument("--nproc_per_node", type=int, default=1, 133 | help="The number of processes to launch on each node, " 134 | "for GPU training, this is recommended to be set " 135 | "to the number of GPUs in your system so that " 136 | "each process can be bound to a single GPU.") 137 | parser.add_argument("--master_addr", default="127.0.0.1", type=str, 138 | help="Master node (rank 0)'s address, should be either " 139 | "the IP address or the hostname of node 0, for " 140 | "single node multi-proc training, the " 141 | "--master_addr can simply be 127.0.0.1") 142 | parser.add_argument("--master_port", default=29500, type=int, 143 | help="Master node (rank 0)'s free port that needs to " 144 | "be used for communication during distributed " 145 | "training") 146 | 147 | # positional 148 | parser.add_argument("training_script", type=str, 149 | help="The full path to the single GPU training " 150 | "program/script to be launched in parallel, " 151 | "followed by all the arguments for the " 152 | "training script") 153 | 154 | # rest from the training program 155 | parser.add_argument('training_script_args', nargs=REMAINDER) 156 | return parser.parse_args() 157 | 158 | 159 | def main(): 160 |
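    # Spawn one training process per local rank: compute
    # WORLD_SIZE = nproc_per_node * nnodes, export MASTER_ADDR / MASTER_PORT /
    # WORLD_SIZE once, then set RANK and LOCAL_RANK for each child before
    # launching it with subprocess.Popen. Note that, unlike the stock
    # torch.distributed.launch, no --local_rank argument is appended to the
    # command; ranks reach the training script only through these environment
    # variables. Finally, wait on every child and raise CalledProcessError if
    # any of them exits with a non-zero return code.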
args = parse_args() 161 | 162 | # world size in terms of number of processes 163 | dist_world_size = args.nproc_per_node * args.nnodes 164 | 165 | # set PyTorch distributed related environmental variables 166 | current_env = os.environ.copy() 167 | current_env["MASTER_ADDR"] = args.master_addr 168 | current_env["MASTER_PORT"] = str(args.master_port) 169 | current_env["WORLD_SIZE"] = str(dist_world_size) 170 | 171 | processes = [] 172 | 173 | for local_rank in range(0, args.nproc_per_node): 174 | # each process's rank 175 | dist_rank = args.nproc_per_node * args.node_rank + local_rank 176 | current_env["RANK"] = str(dist_rank) 177 | current_env["LOCAL_RANK"] = str(local_rank) 178 | 179 | cmd = [args.training_script] + args.training_script_args 180 | 181 | process = subprocess.Popen(cmd, env=current_env) 182 | processes.append(process) 183 | 184 | for process in processes: 185 | process.wait() 186 | if process.returncode != 0: 187 | raise subprocess.CalledProcessError(returncode=process.returncode, 188 | cmd=process.args) 189 | 190 | 191 | if __name__ == "__main__": 192 | main() -------------------------------------------------------------------------------- /tools/run_dist_launch.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # ------------------------------------------------------------------------ 3 | # Deformable DETR 4 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 5 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 6 | # ------------------------------------------------------------------------ 7 | 8 | set -x 9 | 10 | GPUS=$1 11 | RUN_COMMAND=${@:2} 12 | if [ $GPUS -lt 8 ]; then 13 | GPUS_PER_NODE=${GPUS_PER_NODE:-$GPUS} 14 | else 15 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 16 | fi 17 | MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"} 18 | MASTER_PORT=${MASTER_PORT:-"29500"} 19 | NODE_RANK=${NODE_RANK:-0} 20 | 21 | let "NNODES=GPUS/GPUS_PER_NODE" 22 | 23 | conda activate pytorch 24 | which python 25 | python ./tools/launch.py \ 26 | --nnodes ${NNODES} \ 27 | --node_rank ${NODE_RANK} \ 28 | --master_addr ${MASTER_ADDR} \ 29 | --master_port ${MASTER_PORT} \ 30 | --nproc_per_node ${GPUS_PER_NODE} \ 31 | ${RUN_COMMAND} -------------------------------------------------------------------------------- /tools/run_dist_slurm.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | # -------------------------------------------------------------------------------------------------------------------------- 3 | # Modified from https://github.com/open-mmlab/mmdetection/blob/3b53fe15d87860c6941f3dda63c0f27422da6266/tools/slurm_train.sh 4 | # -------------------------------------------------------------------------------------------------------------------------- 5 | 6 | set -x 7 | PARTITION=edith 8 | JOB_NAME=$1 9 | GPUS=$2 10 | RUN_COMMAND=${@:3} 11 | RUN_TIME=${RUN_TIME:-"240:00:00"} 12 | dt=$(date '+%Y_%m%d_%H') 13 | if [ $GPUS -lt 8 ]; then 14 | GPUS_PER_NODE=${GPUS_PER_NODE:-$GPUS} 15 | else 16 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 17 | fi 18 | CPUS_PER_TASK=${CPUS_PER_TASK:-2} 19 | # CPUS_PER_TASK=2 20 | SRUN_ARGS=${SRUN_ARGS:-""} 21 | mkdir logs 22 | 23 | srun -p ${PARTITION} \ 24 | --job-name=${JOB_NAME} \ 25 | --gres=gpu:${GPUS_PER_NODE} \ 26 | --ntasks=${GPUS} \ 27 | --ntasks-per-node=${GPUS_PER_NODE} \ 28 | --cpus-per-task=${CPUS_PER_TASK} \ 29 | --time=${RUN_TIME}\ 30 | --kill-on-bad-exit=1 \ 31 | ${SRUN_ARGS} \ 32 | ${RUN_COMMAND}\ 33 | > 
./logs/${JOB_NAME}_${dt}.log 2>&1 & 34 | 35 | # Removing this args because the 36 | # SRUN_ARGS="--nodelist=edith1" MASTER_PORT=29501 GPUS_PER_NODE=4 ./tools/run_dist_slurm.sh edith RefTR 4 configs/r50_deformable_vg_detr_single_scale_pretrained.sh -------------------------------------------------------------------------------- /tools/vis_log.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import shutil 4 | from tqdm import tqdm 5 | from torch.utils.tensorboard import SummaryWriter 6 | 7 | 8 | def convert_from_log(log_dir): 9 | if os.path.exists(f'{log_dir}/tb'): 10 | shutil.rmtree(f'{log_dir}/tb') 11 | tb_writter = SummaryWriter(log_dir=f'{log_dir}/tb') 12 | with open(f"{log_dir}/log.txt", 'r') as f: 13 | lines = f.readlines() 14 | for epoch, line in tqdm(enumerate(lines)): 15 | line = line.strip() 16 | if line == '': 17 | break 18 | info = json.loads(line) 19 | 20 | tb_writter.add_scalar('Loss/train', info['train_loss'], epoch) 21 | tb_writter.add_scalar('Loss_bbox/train', info['train_loss_bbox_unscaled'], epoch) 22 | tb_writter.add_scalar('Loss_ce/train', info['train_loss_ce_unscaled'], epoch) 23 | 24 | tb_writter.add_scalar('Loss/test', info['test_loss'], epoch) 25 | tb_writter.add_scalar('Loss_bbox/test', info['test_loss_bbox_unscaled'], epoch) 26 | tb_writter.add_scalar('Loss_ce/test', info['test_loss_ce_unscaled'], epoch) 27 | tb_writter.add_scalar('Accuracy/test', info['test_accuracy_iou0.5'], epoch) 28 | tb_writter.add_scalar('Miou/test', info['test_miou'], epoch) 29 | tb_writter.close() 30 | 31 | 32 | if __name__ == '__main__': 33 | exp_path = './exps' 34 | for x in os.listdir(exp_path): 35 | if os.path.isdir(f'{exp_path}/{x}') and os.path.exists(f'{exp_path}/{x}/log.txt'): 36 | convert_from_log(f'{exp_path}/{x}') 37 | -------------------------------------------------------------------------------- /tox.ini: -------------------------------------------------------------------------------- 1 | [flake8] 2 | ignore = E226,E302,E41,F401 3 | max-line-length = 200 4 | exclude = tests/* 5 | max-complexity = 10 6 | -------------------------------------------------------------------------------- /util/__init__.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | -------------------------------------------------------------------------------- /util/box_ops.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. 
All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | """ 11 | Utilities for bounding box manipulation and GIoU. 12 | """ 13 | import torch 14 | from torchvision.ops.boxes import box_area 15 | 16 | 17 | def box_cxcywh_to_xyxy(x): 18 | x_c, y_c, w, h = x.unbind(-1) 19 | b = [(x_c - 0.5 * w), (y_c - 0.5 * h), 20 | (x_c + 0.5 * w), (y_c + 0.5 * h)] 21 | return torch.stack(b, dim=-1) 22 | 23 | 24 | def box_xyxy_to_cxcywh(x): 25 | x0, y0, x1, y1 = x.unbind(-1) 26 | b = [(x0 + x1) / 2, (y0 + y1) / 2, 27 | (x1 - x0), (y1 - y0)] 28 | return torch.stack(b, dim=-1) 29 | 30 | 31 | # modified from torchvision to also return the union 32 | def box_iou(boxes1, boxes2): 33 | area1 = box_area(boxes1) 34 | area2 = box_area(boxes2) 35 | 36 | lt = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] 37 | rb = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] 38 | 39 | wh = (rb - lt).clamp(min=0) # [N,M,2] 40 | inter = wh[:, :, 0] * wh[:, :, 1] # [N,M] 41 | 42 | union = area1[:, None] + area2 - inter 43 | 44 | iou = inter / union 45 | return iou, union 46 | 47 | 48 | def generalized_box_iou(boxes1, boxes2): 49 | """ 50 | Generalized IoU from https://giou.stanford.edu/ 51 | 52 | The boxes should be in [x0, y0, x1, y1] format 53 | 54 | Returns a [N, M] pairwise matrix, where N = len(boxes1) 55 | and M = len(boxes2) 56 | """ 57 | # degenerate boxes gives inf / nan results 58 | # so do an early check 59 | assert (boxes1[:, 2:] >= boxes1[:, :2]).all() 60 | assert (boxes2[:, 2:] >= boxes2[:, :2]).all() 61 | iou, union = box_iou(boxes1, boxes2) 62 | 63 | lt = torch.min(boxes1[:, None, :2], boxes2[:, :2]) 64 | rb = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) 65 | 66 | wh = (rb - lt).clamp(min=0) # [N,M,2] 67 | area = wh[:, :, 0] * wh[:, :, 1] 68 | 69 | return iou - (area - union) / area 70 | 71 | 72 | def masks_to_boxes(masks): 73 | """Compute the bounding boxes around the provided masks 74 | 75 | The masks should be in format [N, H, W] where N is the number of masks, (H, W) are the spatial dimensions. 
76 | 77 | Returns a [N, 4] tensors, with the boxes in xyxy format 78 | """ 79 | if masks.numel() == 0: 80 | return torch.zeros((0, 4), device=masks.device) 81 | 82 | h, w = masks.shape[-2:] 83 | 84 | y = torch.arange(0, h, dtype=torch.float) 85 | x = torch.arange(0, w, dtype=torch.float) 86 | y, x = torch.meshgrid(y, x) 87 | 88 | x_mask = (masks * x.unsqueeze(0)) 89 | x_max = x_mask.flatten(1).max(-1)[0] 90 | x_min = x_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 91 | 92 | y_mask = (masks * y.unsqueeze(0)) 93 | y_max = y_mask.flatten(1).max(-1)[0] 94 | y_min = y_mask.masked_fill(~(masks.bool()), 1e8).flatten(1).min(-1)[0] 95 | 96 | return torch.stack([x_min, y_min, x_max, y_max], 1) 97 | 98 | 99 | def mask_iou(masks, target): 100 | assert(target.shape[-2:] == masks.shape[-2:]) 101 | I = torch.sum(torch.logical_and(masks, target)) 102 | U = torch.sum(torch.logical_or(masks, target)) 103 | return I.float() / U.float() 104 | -------------------------------------------------------------------------------- /util/collate_fn.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import re 3 | import collections 4 | from .misc import NestedTensor 5 | from typing import List 6 | from torch._six import string_classes 7 | 8 | def collate_fn_vg(batch): 9 | batch = list(zip(*batch)) 10 | 11 | batch[0] = default_collate(batch[0]) 12 | return tuple(batch) 13 | 14 | 15 | def _max_by_axis(the_list): 16 | # type: (List[List[int]]) -> List[int] 17 | maxes = the_list[0] 18 | for sublist in the_list[1:]: 19 | for index, item in enumerate(sublist): 20 | maxes[index] = max(maxes[index], item) 21 | return maxes 22 | 23 | 24 | def nested_tensor_from_tensor_list(tensor_list: List[torch.Tensor]): 25 | # TODO make this more general 26 | if tensor_list[0].ndim == 3: 27 | # TODO make it support different-sized images 28 | max_size = _max_by_axis([list(img.shape) for img in tensor_list]) 29 | # min_size = tuple(min(s) for s in zip(*[img.shape for img in tensor_list])) 30 | batch_shape = [len(tensor_list)] + max_size 31 | b, c, h, w = batch_shape 32 | dtype = tensor_list[0].dtype 33 | device = tensor_list[0].device 34 | tensor = torch.zeros(batch_shape, dtype=dtype, device=device) 35 | mask = torch.ones((b, h, w), dtype=torch.bool, device=device) 36 | for img, pad_img, m in zip(tensor_list, tensor, mask): 37 | pad_img[: img.shape[0], : img.shape[1], : img.shape[2]].copy_(img) 38 | m[: img.shape[1], :img.shape[2]] = False 39 | else: 40 | raise ValueError('not supported') 41 | return NestedTensor(tensor, mask) 42 | 43 | 44 | # Following is modified from Modified from 45 | # https://github.com/pytorch/pytorch/blob/master/torch/utils/data/_utils/collate.py 46 | ############################################################################# 47 | default_collate_err_msg_format = ( 48 | "default_collate: batch must contain tensors, numpy arrays, numbers, " 49 | "dicts or lists; found {}") 50 | 51 | np_str_obj_array_pattern = re.compile(r'[SaUO]') 52 | 53 | def default_collate(batch): 54 | r"""Puts each data field into a tensor with outer dimension batch size 55 | """ 56 | elem = batch[0] 57 | elem_type = type(elem) 58 | if isinstance(elem, torch.Tensor): 59 | out = None 60 | if torch.utils.data.get_worker_info() is not None: 61 | # If we're in a background process, concatenate directly into a 62 | # shared memory tensor to avoid an extra copy 63 | numel = sum([x.numel() for x in batch]) 64 | storage = elem.storage()._new_shared(numel) 65 | out = elem.new(storage) 
66 | return torch.stack(batch, 0, out=out) 67 | elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' \ 68 | and elem_type.__name__ != 'string_': 69 | if elem_type.__name__ == 'ndarray' or elem_type.__name__ == 'memmap': 70 | # array of string classes and object 71 | if np_str_obj_array_pattern.search(elem.dtype.str) is not None: 72 | raise TypeError(default_collate_err_msg_format.format(elem.dtype)) 73 | 74 | return default_collate([torch.as_tensor(b) for b in batch]) 75 | elif elem.shape == (): # scalars 76 | return torch.as_tensor(batch) 77 | elif isinstance(elem, float): 78 | return torch.tensor(batch, dtype=torch.float64) 79 | elif isinstance(elem, int): 80 | return torch.tensor(batch) 81 | elif isinstance(elem, string_classes): 82 | return batch 83 | elif isinstance(elem, collections.abc.Mapping): 84 | d = {} 85 | for key in elem: 86 | if key == 'img': 87 | d['img'] = nested_tensor_from_tensor_list([d['img'] for d in batch]) 88 | else: 89 | d[key] = default_collate([d[key] for d in batch]) 90 | return d 91 | elif isinstance(elem, tuple) and hasattr(elem, '_fields'): # namedtuple 92 | return elem_type(*(default_collate(samples) for samples in zip(*batch))) 93 | elif isinstance(elem, collections.abc.Sequence): 94 | # check to make sure that the elements in batch have consistent size 95 | it = iter(batch) 96 | elem_size = len(next(it)) 97 | if not all(len(elem) == elem_size for elem in it): 98 | raise RuntimeError('each element in list of batch should be of equal size') 99 | transposed = zip(*batch) 100 | return [default_collate(samples) for samples in transposed] 101 | 102 | raise TypeError(default_collate_err_msg_format.format(elem_type)) -------------------------------------------------------------------------------- /util/lr_scheduler.py: -------------------------------------------------------------------------------- 1 | import math 2 | # config["max_num_epochs"] = 100 3 | # warm_up_epochs = 5 4 | # lr_milestones = [20,40] 5 | # # MultiStepLR without warm up 6 | # multistep_lr = lambda epoch: 0.1**len([m for m in lr_milestones if m <= epoch]) 7 | # # warm_up_with_multistep_lr 8 | # warm_up_with_multistep_lr = lambda epoch: (epoch+1) / warm_up_epochs if epoch < warm_up_epochs else 0.1**len([m for m in lr_milestones if m <= epoch]) 9 | # # warm_up_with_step_lr 10 | # gamma = 0.9; stepsize = 1 11 | # warm_up_with_step_lr = lambda epoch: (epoch+1) / warm_up_epochs if epoch < warm_up_epochs \ 12 | # else gamma**( ((epoch - warm_up_epochs) /(config["max_num_epochs"] - warm_up_epochs))//stepsize*stepsize) 13 | # # warm_up_with_cosine_lr 14 | # warm_up_with_cosine_lr = lambda epoch: (epoch+1) / warm_up_epochs if epoch < warm_up_epochs \ 15 | # else 0.5 * ( math.cos((epoch - warm_up_epochs) /(config["max_num_epochs"] - warm_up_epochs) * math.pi) + 1) 16 | 17 | # scheduler = torch.optim.lr_scheduler.LambdaLR( optimizer, lr_lambda=warm_up_with_cosine_lr) 18 | 19 | class MultiStepWarmupLR: 20 | def __init__(self, decay_rate=0.1, lr_milestones=[20000, 40000], warm_up_steps=5000, min_decay_rate=0.01) -> None: 21 | self.deacy_rate = decay_rate 22 | self.lr_milestones = lr_milestones 23 | self.warm_up_steps = warm_up_steps 24 | self.min_decay_rate = min_decay_rate 25 | 26 | def __call__(self, steps): 27 | if steps < self.warm_up_steps: 28 | rate = (steps+1)/self.warm_up_steps 29 | else: 30 | rate = self.deacy_rate ** len([m for m in self.lr_milestones if m <= steps]) 31 | # make sure lr is not too small 32 | if rate <= self.min_decay_rate: 33 | return self.min_decay_rate 34 | else: 
35 | return rate 36 | 37 | class CosineWarmupLR: 38 | def __init__(self, max_T=100, warm_up_steps=5, min_decay_rate=0.01) -> None: 39 | self.max_T = max_T 40 | self.warm_up_steps = warm_up_steps 41 | self.min_decay_rate = min_decay_rate 42 | 43 | def __call__(self, steps): 44 | if steps < self.warm_up_steps: 45 | rate = (steps+1)/self.warm_up_steps 46 | else: 47 | rate = 0.5 * (math.cos((steps - self.warm_up_steps) / (self.max_T - self.warm_up_steps) * math.pi) + 1) 48 | # make sure lr is not too small 49 | if rate <= self.min_decay_rate: 50 | return self.min_decay_rate 51 | else: 52 | return rate 53 | -------------------------------------------------------------------------------- /util/plot_utils.py: -------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------ 2 | # Deformable DETR 3 | # Copyright (c) 2020 SenseTime. All Rights Reserved. 4 | # Licensed under the Apache License, Version 2.0 [see LICENSE for details] 5 | # ------------------------------------------------------------------------ 6 | # Modified from DETR (https://github.com/facebookresearch/detr) 7 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved 8 | # ------------------------------------------------------------------------ 9 | 10 | """ 11 | Plotting utilities to visualize training logs. 12 | """ 13 | import torch 14 | import pandas as pd 15 | import seaborn as sns 16 | import matplotlib.pyplot as plt 17 | 18 | from pathlib import Path, PurePath 19 | 20 | 21 | def plot_logs(logs, fields=('class_error', 'loss_bbox_unscaled', 'mAP'), ewm_col=0, log_name='log.txt'): 22 | ''' 23 | Function to plot specific fields from training log(s). Plots both training and test results. 24 | 25 | :: Inputs - logs = list containing Path objects, each pointing to individual dir with a log file 26 | - fields = which results to plot from each log file - plots both training and test for each field. 27 | - ewm_col = optional, which column to use as the exponential weighted smoothing of the plots 28 | - log_name = optional, name of log file if different than default 'log.txt'. 29 | 30 | :: Outputs - matplotlib plots of results in fields, color coded for each log file. 31 | - solid lines are training results, dashed lines are test results. 
32 | 33 | ''' 34 | func_name = "plot_utils.py::plot_logs" 35 | 36 | # verify logs is a list of Paths (list[Paths]) or single Pathlib object Path, 37 | # convert single Path to list to avoid 'not iterable' error 38 | 39 | if not isinstance(logs, list): 40 | if isinstance(logs, PurePath): 41 | logs = [logs] 42 | print(f"{func_name} info: logs param expects a list argument, converted to list[Path].") 43 | else: 44 | raise ValueError(f"{func_name} - invalid argument for logs parameter.\n \ 45 | Expect list[Path] or single Path obj, received {type(logs)}") 46 | 47 | # verify valid dir(s) and that every item in list is Path object 48 | for i, dir in enumerate(logs): 49 | if not isinstance(dir, PurePath): 50 | raise ValueError(f"{func_name} - non-Path object in logs argument of {type(dir)}: \n{dir}") 51 | if dir.exists(): 52 | continue 53 | raise ValueError(f"{func_name} - invalid directory in logs argument:\n{dir}") 54 | 55 | # load log file(s) and plot 56 | dfs = [pd.read_json(Path(p) / log_name, lines=True) for p in logs] 57 | 58 | fig, axs = plt.subplots(ncols=len(fields), figsize=(16, 5)) 59 | 60 | for df, color in zip(dfs, sns.color_palette(n_colors=len(logs))): 61 | for j, field in enumerate(fields): 62 | if field == 'mAP': 63 | coco_eval = pd.DataFrame(pd.np.stack(df.test_coco_eval.dropna().values)[:, 1]).ewm(com=ewm_col).mean() 64 | axs[j].plot(coco_eval, c=color) 65 | else: 66 | df.interpolate().ewm(com=ewm_col).mean().plot( 67 | y=[f'train_{field}', f'test_{field}'], 68 | ax=axs[j], 69 | color=[color] * 2, 70 | style=['-', '--'] 71 | ) 72 | for ax, field in zip(axs, fields): 73 | ax.legend([Path(p).name for p in logs]) 74 | ax.set_title(field) 75 | 76 | 77 | def plot_precision_recall(files, naming_scheme='iter'): 78 | if naming_scheme == 'exp_id': 79 | # name becomes exp_id 80 | names = [f.parts[-3] for f in files] 81 | elif naming_scheme == 'iter': 82 | names = [f.stem for f in files] 83 | else: 84 | raise ValueError(f'not supported {naming_scheme}') 85 | fig, axs = plt.subplots(ncols=2, figsize=(16, 5)) 86 | for f, color, name in zip(files, sns.color_palette("Blues", n_colors=len(files)), names): 87 | data = torch.load(f) 88 | # precision is n_iou, n_points, n_cat, n_area, max_det 89 | precision = data['precision'] 90 | recall = data['params'].recThrs 91 | scores = data['scores'] 92 | # take precision for all classes, all areas and 100 detections 93 | precision = precision[0, :, :, 0, -1].mean(1) 94 | scores = scores[0, :, :, 0, -1].mean(1) 95 | prec = precision.mean() 96 | rec = data['recall'][0, :, 0, -1].mean() 97 | print(f'{naming_scheme} {name}: mAP@50={prec * 100: 05.1f}, ' + 98 | f'score={scores.mean():0.3f}, ' + 99 | f'f1={2 * prec * rec / (prec + rec + 1e-8):0.3f}' 100 | ) 101 | axs[0].plot(recall, precision, c=color) 102 | axs[1].plot(recall, scores, c=color) 103 | 104 | axs[0].set_title('Precision / Recall') 105 | axs[0].legend(names) 106 | axs[1].set_title('Scores / Recall') 107 | axs[1].legend(names) 108 | return fig, axs 109 | 110 | 111 | 112 | -------------------------------------------------------------------------------- /util/transforms.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Generic Image Transform utillities. 
5 | """ 6 | 7 | import cv2 8 | import random, math 9 | import numpy as np 10 | from collections import Iterable 11 | 12 | import torch.nn.functional as F 13 | from torch.autograd import Variable 14 | 15 | 16 | def letterbox(img, mask, height, color=(123.7, 116.3, 103.5)): # resize a rectangular image to a padded square 17 | shape = img.shape[:2] # shape = [height, width] 18 | ratio = float(height) / max(shape) # ratio = old / new 19 | new_shape = (round(shape[1] * ratio), round(shape[0] * ratio)) 20 | dw = (height - new_shape[0]) / 2 # width padding 21 | dh = (height - new_shape[1]) / 2 # height padding 22 | top, bottom = round(dh - 0.1), round(dh + 0.1) 23 | left, right = round(dw - 0.1), round(dw + 0.1) 24 | img = cv2.resize(img, new_shape, interpolation=cv2.INTER_AREA) # resized, no border 25 | img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color) # padded square 26 | if mask is not None: 27 | mask = cv2.resize(mask, new_shape, interpolation=cv2.INTER_NEAREST) # resized, no border 28 | mask = cv2.copyMakeBorder(mask, top, bottom, left, right, cv2.BORDER_CONSTANT, value=255) # padded square 29 | return img, mask, ratio, dw, dh 30 | 31 | def random_affine(img, mask, targets, degrees=(-10, 10), translate=(.1, .1), scale=(.9, 1.1), shear=(-2, 2), 32 | borderValue=(123.7, 116.3, 103.5), all_bbox=None): 33 | border = 0 # width of added border (optional) 34 | height = max(img.shape[0], img.shape[1]) + border * 2 35 | 36 | # Rotation and Scale 37 | R = np.eye(3) 38 | a = random.random() * (degrees[1] - degrees[0]) + degrees[0] 39 | # a += random.choice([-180, -90, 0, 90]) # 90deg rotations added to small rotations 40 | s = random.random() * (scale[1] - scale[0]) + scale[0] 41 | R[:2] = cv2.getRotationMatrix2D(angle=a, center=(img.shape[1] / 2, img.shape[0] / 2), scale=s) 42 | 43 | # Translation 44 | T = np.eye(3) 45 | T[0, 2] = (random.random() * 2 - 1) * translate[0] * img.shape[0] + border # x translation (pixels) 46 | T[1, 2] = (random.random() * 2 - 1) * translate[1] * img.shape[1] + border # y translation (pixels) 47 | 48 | # Shear 49 | S = np.eye(3) 50 | S[0, 1] = math.tan((random.random() * (shear[1] - shear[0]) + shear[0]) * math.pi / 180) # x shear (deg) 51 | S[1, 0] = math.tan((random.random() * (shear[1] - shear[0]) + shear[0]) * math.pi / 180) # y shear (deg) 52 | 53 | M = S @ T @ R # Combined rotation matrix. ORDER IS IMPORTANT HERE!! 
54 | imw = cv2.warpPerspective(img, M, dsize=(height, height), flags=cv2.INTER_LINEAR, 55 | borderValue=borderValue) # BGR order borderValue 56 | if mask is not None: 57 | maskw = cv2.warpPerspective(mask, M, dsize=(height, height), flags=cv2.INTER_NEAREST, 58 | borderValue=255) # BGR order borderValue 59 | else: 60 | maskw = None 61 | 62 | # Return warped points also 63 | if isinstance(targets, list): 64 | targetlist=[] 65 | for bbox in targets: 66 | targetlist.append(wrap_points(bbox, M, height, a)) 67 | return imw, maskw, targetlist 68 | elif targets.ndim == 1: ## previous main 69 | targets = wrap_points(targets, M, height, a) 70 | return imw, maskw, targets 71 | elif targets.ndim == 2: 72 | for i in range(targets.shape[0]): 73 | targets[i] = wrap_points(targets[i], M, height, a) 74 | return imw, maskw, targets 75 | else: 76 | return imw 77 | 78 | def wrap_points(targets, M, height, a): 79 | # n = targets.shape[0] 80 | # points = targets[:, 1:5].copy() 81 | points = targets.copy() 82 | # area0 = (points[:, 2] - points[:, 0]) * (points[:, 3] - points[:, 1]) 83 | area0 = (points[2] - points[0]) * (points[3] - points[1]) 84 | 85 | # warp points 86 | xy = np.ones((4, 3)) 87 | xy[:, :2] = points[[0, 1, 2, 3, 0, 3, 2, 1]].reshape(4, 2) # x1y1, x2y2, x1y2, x2y1 88 | xy = (xy @ M.T)[:, :2].reshape(1, 8) 89 | 90 | # create new boxes 91 | x = xy[:, [0, 2, 4, 6]] 92 | y = xy[:, [1, 3, 5, 7]] 93 | xy = np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, 1).T 94 | 95 | # apply angle-based reduction 96 | radians = a * math.pi / 180 97 | reduction = max(abs(math.sin(radians)), abs(math.cos(radians))) ** 0.5 98 | x = (xy[:, 2] + xy[:, 0]) / 2 99 | y = (xy[:, 3] + xy[:, 1]) / 2 100 | w = (xy[:, 2] - xy[:, 0]) * reduction 101 | h = (xy[:, 3] - xy[:, 1]) * reduction 102 | xy = np.concatenate((x - w / 2, y - h / 2, x + w / 2, y + h / 2)).reshape(4, 1).T 103 | 104 | # reject warped points outside of image 105 | np.clip(xy, 0, height, out=xy) 106 | w = xy[:, 2] - xy[:, 0] 107 | h = xy[:, 3] - xy[:, 1] 108 | area = w * h 109 | ar = np.maximum(w / (h + 1e-16), h / (w + 1e-16)) 110 | i = (w > 4) & (h > 4) & (area / (area0 + 1e-16) > 0.1) & (ar < 10) 111 | 112 | ## print(targets, xy) 113 | ## [ 56 36 108 210] [[ 47.80464857 15.6096533 106.30993434 196.71267693]] 114 | # targets = targets[i] 115 | # targets[:, 1:5] = xy[i] 116 | targets = xy[0] 117 | return targets -------------------------------------------------------------------------------- /util/word_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | """ 4 | Language-related data loading helper functions and class wrappers. 
5 | Copied from https://github.com/zyang-ur/ReSC/blob/e4022f87bfd11200b67c4509bb9746640834ceae/utils/word_utils.py#L45 6 | """ 7 | 8 | import re 9 | import torch 10 | import codecs 11 | 12 | UNK_TOKEN = '<unk>' 13 | PAD_TOKEN = '<pad>' 14 | END_TOKEN = '<eos>' 15 | SENTENCE_SPLIT_REGEX = re.compile(r'(\W+)') 16 | 17 | 18 | class Dictionary(object): 19 | def __init__(self): 20 | self.word2idx = {} 21 | self.idx2word = [] 22 | 23 | def add_word(self, word): 24 | if word not in self.word2idx: 25 | self.idx2word.append(word) 26 | self.word2idx[word] = len(self.idx2word) - 1 27 | return self.word2idx[word] 28 | 29 | def __len__(self): 30 | return len(self.idx2word) 31 | 32 | def __getitem__(self, a): 33 | if isinstance(a, int): 34 | return self.idx2word[a] 35 | elif isinstance(a, list): 36 | return [self.idx2word[x] for x in a] 37 | elif isinstance(a, str): 38 | return self.word2idx[a] 39 | else: 40 | raise TypeError("Query word/index argument must be int or str") 41 | 42 | def __contains__(self, word): 43 | return word in self.word2idx 44 | 45 | 46 | class Corpus(object): 47 | def __init__(self): 48 | self.dictionary = Dictionary() 49 | 50 | def set_max_len(self, value): 51 | self.max_len = value 52 | 53 | def load_file(self, filename): 54 | with codecs.open(filename, 'r', 'utf-8') as f: 55 | for line in f: 56 | line = line.strip() 57 | self.add_to_corpus(line) 58 | self.dictionary.add_word(UNK_TOKEN) 59 | self.dictionary.add_word(PAD_TOKEN) 60 | 61 | def add_to_corpus(self, line): 62 | """Tokenizes a text line.""" 63 | # Add words to the dictionary 64 | words = line.split() 65 | # tokens = len(words) 66 | for word in words: 67 | word = word.lower() 68 | self.dictionary.add_word(word) 69 | 70 | def tokenize(self, line, max_len=20): 71 | # Tokenize line contents 72 | words = SENTENCE_SPLIT_REGEX.split(line.strip()) 73 | # words = [w.lower() for w in words if len(w) > 0] 74 | words = [w.lower() for w in words if (len(w) > 0 and w!=' ')] ## do not include space as a token 75 | 76 | if words[-1] == '.': 77 | words = words[:-1] 78 | 79 | if max_len > 0: 80 | if len(words) > max_len: 81 | words = words[:max_len] 82 | elif len(words) < max_len: 83 | # words = [PAD_TOKEN] * (max_len - len(words)) + words 84 | words = words + [END_TOKEN] + [PAD_TOKEN] * (max_len - len(words) - 1) 85 | 86 | tokens = len(words) ## for end token 87 | ids = torch.LongTensor(tokens) 88 | token = 0 89 | for word in words: 90 | if word not in self.dictionary: 91 | word = UNK_TOKEN 92 | # print(word, type(word), word.encode('ascii','ignore').decode('ascii'), type(word.encode('ascii','ignore').decode('ascii'))) 93 | if type(word)!=type('a'): 94 | print(word, type(word), word.encode('ascii','ignore').decode('ascii'), type(word.encode('ascii','ignore').decode('ascii'))) 95 | word = word.encode('ascii','ignore').decode('ascii') 96 | ids[token] = self.dictionary[word] 97 | token += 1 98 | # ids[token] = self.dictionary[END_TOKEN] 99 | return ids 100 | 101 | def __len__(self): 102 | return len(self.dictionary) --------------------------------------------------------------------------------
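Usage note: a minimal sketch (not taken from the repository) of how the Corpus tokenizer in util/word_utils.py can be driven; the vocabulary file name 'vocab.txt' and the query sentence are illustrative assumptions only.

# Hypothetical usage of util.word_utils.Corpus; file name and query are made up.
from util.word_utils import Corpus

corpus = Corpus()
corpus.load_file('vocab.txt')               # UTF-8 text file; its whitespace-separated words become the vocabulary, then <unk>/<pad> are appended
ids = corpus.tokenize('the man in a red shirt on the left', max_len=20)
print(ids.shape)                            # torch.Size([20]); shorter queries are padded out with the end/pad tokens
print(corpus.dictionary[ids.tolist()])      # map ids back to tokens; out-of-vocabulary words come back as <unk>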