├── .gitignore ├── .gitmodules ├── CODE_OF_CONDUCT.md ├── DOWNLOAD.md ├── INSTALL.md ├── LICENSE ├── MODEL_ZOO.md ├── README.md ├── SECURITY.md ├── VinVL_DOWNLOAD.md ├── VinVL_MODEL_ZOO.md ├── docs ├── oscar.PNG ├── oscar_logo.png └── pretrain_corpus.PNG ├── oscar ├── __init__.py ├── datasets │ ├── __init__.py │ ├── build.py │ └── oscar_tsv.py ├── modeling │ ├── __init__.py │ ├── modeling_bert.py │ └── modeling_utils.py ├── run_captioning.py ├── run_gqa.py ├── run_nlvr.py ├── run_oscarplus_pretrain.py ├── run_retrieval.py ├── run_vqa.py └── utils │ ├── __init__.py │ ├── caption_evaluate.py │ ├── cbs.py │ ├── cider │ └── pyciderevalcap │ │ ├── __init__.py │ │ ├── cider │ │ ├── __init__.py │ │ ├── cider.py │ │ └── cider_scorer.py │ │ └── ciderD │ │ ├── __init__.py │ │ ├── ciderD.py │ │ └── ciderD_scorer.py │ ├── logger.py │ ├── metric_logger.py │ ├── misc.py │ ├── task_utils.py │ ├── tsv_file.py │ └── tsv_file_ops.py ├── requirements.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Initially taken from Github's Python gitignore file 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # IPython 79 | profile_default/ 80 | ipython_config.py 81 | 82 | # pyenv 83 | .python-version 84 | 85 | # celery beat schedule file 86 | celerybeat-schedule 87 | 88 | # SageMath parsed files 89 | *.sage.py 90 | 91 | # Environments 92 | .env 93 | .venv 94 | env/ 95 | venv/ 96 | ENV/ 97 | env.bak/ 98 | venv.bak/ 99 | 100 | # Spyder project settings 101 | .spyderproject 102 | .spyproject 103 | 104 | # Rope project settings 105 | .ropeproject 106 | 107 | # mkdocs documentation 108 | /site 109 | 110 | # mypy 111 | .mypy_cache/ 112 | .dmypy.json 113 | dmypy.json 114 | 115 | # Pyre type checker 116 | .pyre/ 117 | 118 | # vscode 119 | .vscode 120 | 121 | # TF code 122 | tensorflow_code 123 | 124 | # Models 125 | models 126 | proc_data 127 | 128 | # examples 129 | runs 130 | examples/runs 131 | 132 | # pyCharm 133 | .idea/ 134 | 135 | # local folders 136 | data 137 | models 138 | output 139 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "transformers"] 2 | path = transformers 3 | url = git@github.com:huggingface/transformers.git 
4 | [submodule "coco_caption"] 5 | path = coco_caption 6 | url = git@github.com:LuoweiZhou/coco-caption.git 7 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /DOWNLOAD.md: -------------------------------------------------------------------------------- 1 | # Download 2 | 3 | Note: The data is on Azure Storage Blob, a SAS with Read permission is provided. Please append the following SAS at the end of each link to download: 4 | ```bash 5 | ?sp=r&st=2023-08-28T01:12:41Z&se=3023-08-28T09:12:41Z&sv=2022-11-02&sr=c&sig=6R1YmWluiXmPLsdVn1rDUpeBp2SYBMxDjc6KoKNlY8Q%3D 6 | ``` 7 | 8 | ## Datasets 9 | We provide the extracted image region features, object tags, and the original text annotations for each downstream tasks. 10 | ```bash 11 | wget https://biglmdiag.blob.core.windows.net/oscar/datasets/$TASK_NAME.zip 12 | unzip $TASK_NAME.zip -d $DATA_DIR 13 | ``` 14 | `TASK_NAME` could be `coco_caption`, `coco_ir`, `vqa`, `GQA`, `nlvr2`. 15 | 16 | ## Pre-trained Models 17 | We provide pre-trained *Oscar* models of Bert-base and Bert-large structures, with the name starting with `base` and `large`, respectively. 18 | ```bash 19 | wget https://biglmdiag.blob.core.windows.net/oscar/pretrained_models/$MODEL_NAME.zip 20 | unzip $MODEL_NAME.zip -d $MODEL_DIR 21 | ``` 22 | `MODEL_NAME` could be `base-vg-labels`, `large-vg-labels`, `base-oid-labels`, `base-no-labels`. 23 | 24 | The models are trained with both image region features and object tags. The image region features are extracted by the Faster R-CNN with 25 | ResNet-101, using object and attribute annotations from [Visual Genome](http://visualgenome.org/). 26 | The object tags are from: 27 | 1) the same VisualGenome model, named as `-vg-labels`. Or, 28 | 2) the model trained on object annotations from [Open Images V5](https://storage.googleapis.com/openimages/web/index.html). named as `-oid-labels`. Or, 29 | 3) no object tags provied, serving as baseline, named as `-no-labels`. 30 | 31 | 32 | ### Note 33 | It is recommended to download large files with **AzCopy** for faster speed. 34 | AzCopy executable tools can be downloaded [here](https://docs.microsoft.com/en-us/azure/storage/common/storage-use-azcopy-v10#download-azcopy). 35 | Decompress the tar file and put the executable in any path. 
To download from 36 | any URL above, the command is: 37 | ```bash 38 | path/to/azcopy copy 39 | 40 | # for example, downloading coco_caption.zip 41 | path/to/azcopy copy https://biglmdiag.blob.core.windows.net/oscar/datasets/coco_caption.zip 42 | ``` 43 | 44 | -------------------------------------------------------------------------------- /INSTALL.md: -------------------------------------------------------------------------------- 1 | ## Installation 2 | ### Requirements 3 | - Python 3.7 4 | - Pytorch 1.2 5 | - torchvision 0.4.0 6 | - cuda 10.0 7 | 8 | ### Setup with Conda 9 | ```bash 10 | # create a new environment 11 | conda create --name oscar python=3.7 12 | conda activate oscar 13 | 14 | # install pytorch1.2 15 | conda install pytorch==1.2.0 torchvision==0.4.0 cudatoolkit=10.0 -c pytorch 16 | 17 | export INSTALL_DIR=$PWD 18 | 19 | # install apex 20 | cd $INSTALL_DIR 21 | git clone https://github.com/NVIDIA/apex.git 22 | cd apex 23 | python setup.py install --cuda_ext --cpp_ext 24 | 25 | # install oscar 26 | cd $INSTALL_DIR 27 | git clone --recursive git@github.com:microsoft/Oscar.git 28 | cd Oscar/coco_caption 29 | ./get_stanford_models.sh 30 | cd .. 31 | python setup.py build develop 32 | 33 | # install requirements 34 | pip install -r requirements.txt 35 | 36 | unset INSTALL_DIR 37 | ``` 38 | 39 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /MODEL_ZOO.md: -------------------------------------------------------------------------------- 1 | Note: The data is on Azure Storage Blob, a SAS with Read permission is provided. 
Please append the following SAS at the end of each link to download: 2 | ```bash 3 | ?sp=r&st=2023-08-28T01:12:41Z&se=3023-08-28T09:12:41Z&sv=2022-11-02&sr=c&sig=6R1YmWluiXmPLsdVn1rDUpeBp2SYBMxDjc6KoKNlY8Q%3D 4 | ``` 5 | 6 | ## Table of Contents 7 | - VQA 8 | - GQA 9 | - NLVR2 10 | - Image/Text Retrieval 11 | - Image Captioning on COCO 12 | 13 | 14 | ## Performance 15 | Task | t2i | t2i | i2t | i2t | IC | IC | IC | IC | NoCaps | NoCaps | VQA | NLVR2 | 16 | --------|-----|-----|-----|-----|-----|-----|------|------|--------|--------|----------|---------| 17 | Metric | R@1 | R@5 | R@1 | R@5 | B@4 | M | C | S | C | S | test-std | test-P | 18 | SoTA_S |39.2 | 68.0|56.6 | 84.5|38.9 |29.2 |129.8 | 22.4 | 61.5 | 9.2 | 70.90 | 53.50 | 19 | SoTA_B |48.4 | 76.7|63.3 | 87.0|39.5 |29.3 |129.3 | 23.2 | 73.1 | 11.2 | 72.54 | 78.87 | 20 | SoTA_L |51.7 | 78.4|66.6 | 89.4| - | - | - | - | - | - | 73.40 | 79.50 | 21 | ----- |--- |--- |--- |--- |--- |--- |--- |--- |--- |--- |--- |--- | 22 | Oscar_B |54.0 | 80.8|70.0 | 91.1|40.5 |29.7 |137.6 | 22.8 | 78.8 | 11.7 | 73.44 | 78.44 | 23 | Oscar_L |57.5 | 82.8|73.5 | 92.2|41.7 |30.6 |140.0 | 24.5 | 80.9 | 11.3 | 73.82 | 80.37 | 24 | gain | 5.8 | 4.4| 6.9 | 2.8| 2.2 | 1.3 | 10.7 | 1.3 | 7.8 | 0.5 | 0.42 | 0.87 | 25 | 26 | t2i: text-to-image retrieval; i2t: image-to-text retrieval; IC: image captioning on COCO. 27 | 28 | For reference, we also release the training logs and output. 29 | 30 | 31 | ## VQA 32 | Script to finetune for Oscar base model. 33 | Base model is trained on train split and evaluated on the val split. Good for later comparison. 34 | 35 | Training logs: [eval_logs.json](https://biglmdiag.blob.core.windows.net/oscar/exp/vqa/base/base_9m_ep107_1192k_eu1/application_1575931286052_40649/results/eval_logs.json), [output.txt](https://biglmdiag.blob.core.windows.net/oscar/exp/vqa/base/base_9m_ep107_1192k_eu1/application_1575931286052_40649/results/stdout.txt).
36 | Final server results: [results.txt](https://biglmdiag.blob.core.windows.net/oscar/exp/vqa/base/results.txt).
37 | Model checkpoint: [.zip](https://biglmdiag.blob.core.windows.net/oscar/exp/vqa/base/vqa_base_best.zip). 38 | ```bash 39 | python oscar/run_vqa.py -j 4 --img_feature_dim 2054 --max_img_seq_length 40 | 50 --data_label_type mask --img_feature_type faster_r-cnn --data_dir datasets/vqa/2k 41 | --model_type bert --model_name_or_path pretrained_models/base-vg-labels/ep_107_1192087 42 | --task_name vqa_text --do_train --do_lower_case --max_seq_length 128 --per_gpu_eval_batch_size 43 | 256 --per_gpu_train_batch_size 32 --learning_rate 5e-05 --num_train_epochs 25 44 | --output_dir results --label_file datasets/vqa/cache/trainval_ans2label.pkl 45 | --save_epoch 1 --seed 88 --evaluate_during_training --logging_steps 4000 --drop_out 46 | 0.3 --weight_decay 0.05 --warmup_steps 0 --loss_type bce --img_feat_format pt 47 | --classifier linear --cls_hidden_scale 3 --txt_data_dir datasets/vqa/2k 48 | ``` 49 | 50 | Script to finetune for Oscar large model. 51 | Large model is trained on train+val split and evaluated on the val split, for reproduce the paper's best result. 52 | 53 | Training logs: [eval_logs.json](https://biglmdiag.blob.core.windows.net/oscar/exp/vqa/large/ab128_img_large_rr1_ep20_590k_tv_done_good/exp_ab128_img_large_rr1_ep20_590k_tv_0.00003_128_50_dp_0.3_wd_0.05_bce_3linear_s88_abcd/results/eval_logs.json), [output.txt](https://biglmdiag.blob.core.windows.net/oscar/exp/vqa/large/ab128_img_large_rr1_ep20_590k_tv_done_good/exp_ab128_img_large_rr1_ep20_590k_tv_0.00003_128_50_dp_0.3_wd_0.05_bce_3linear_s88_abcd/stdout.txt).
54 | Final server results: [results.txt](https://biglmdiag.blob.core.windows.net/oscar/exp/vqa/large/results.txt).
55 | Model checkpoint: [.zip](https://biglmdiag.blob.core.windows.net/oscar/exp/vqa/large/vqa_large_best.zip). 56 | ```bash 57 | python oscar/run_vqa.py -j 4 --img_feature_dim 2054 --max_img_seq_length 58 | 50 --data_label_type mask --img_feature_type faster_r-cnn --data_dir datasets/vqa/2k 59 | --model_type bert --model_name_or_path pretrained_models/large-vg-labels/ep_20_590000 60 | --task_name vqa_text --do_train_val --do_lower_case --max_seq_length 128 --per_gpu_eval_batch_size 61 | 256 --per_gpu_train_batch_size 24 --learning_rate 3e-05 --num_train_epochs 25 62 | --label_file datasets/vqa/cache/trainval_ans2label.pkl --save_epoch 30 63 | --seed 88 --evaluate_during_training --logging_steps 4000 --drop_out 0.3 --weight_decay 64 | 0.05 --warmup_steps 0 --loss_type bce --save_after_epoch 15 --output_dir results --img_feat_format pt --classifier linear --cls_hidden_scale 3 --txt_data_dir datasets/vqa/2k 65 | ``` 66 | 67 | 68 | ## GQA 69 | Script to finetune for Oscar base model. 70 | 71 | Training logs: [eval_logs.json](https://biglmdiag.blob.core.windows.net/oscar/exp/gqa/base/ab175_base_ep107_1192k_0.4true_taeb_done_25eps_good/exp_ab175_base_ep107_1192k_0.4true_taeb_b_48_0.00005_165_45_dp_0.3_abce/results/eval_logs.json), [output.txt](https://biglmdiag.blob.core.windows.net/oscar/exp/gqa/base/ab175_base_ep107_1192k_0.4true_taeb_done_25eps_good/exp_ab175_base_ep107_1192k_0.4true_taeb_b_48_0.00005_165_45_dp_0.3_abce/stdout.txt).
72 | Final server results: [results.txt](https://biglmdiag.blob.core.windows.net/oscar/exp/gqa/base/ab165_img45_1568928610179_62515_test_done_good/results.txt).
73 | Model checkpoint: [.zip](https://biglmdiag.blob.core.windows.net/oscar/exp/gqa/base/gqa_base_best.zip). 74 | ```bash 75 | python oscar/run_gqa.py -j 4 --img_feature_dim 2054 --max_img_seq_length 76 | 45 --data_dir datasets/GQA/0.4true --model_type bert --model_name_or_path pretrained_models/base-vg-labels/ep_107_1192087 77 | --task_name gqa --do_lower_case --max_seq_length 165 --per_gpu_eval_batch_size 78 | 256 --per_gpu_train_batch_size 48 --learning_rate 5e-05 --num_train_epochs 5 --output_dir 79 | results --label_file datasets/GQA/questions1.2/trainval_testdev_all_ans2label.pkl 80 | --img_feature_type faster_r-cnn --data_label_type all --train_data_type all --eval_data_type 81 | bal --label2ans_file datasets/GQA/questions1.2/trainval_testdev_all_label2ans.pkl 82 | --loss_type xe --save_epoch 2 --seed 88 --evaluate_during_training --logging_steps 83 | 4000 --drop_out 0.3 --do_train --weight_decay 0.05 --warmup_steps 0 84 | ``` 85 | 86 | ## NLVR2 87 | Script to finetune for Oscar base model. 88 | 89 | Training logs: [eval_logs.json](https://biglmdiag.blob.core.windows.net/oscar/exp/nlvr2/base/exp_rvln_base_ep107_1192k_wm1w_b72_0.00003_55_40_dp0.3_3mlp_wm10000_abcf_best/results/eval_logs.json), [output.txt](https://biglmdiag.blob.core.windows.net/oscar/exp/nlvr2/base/exp_rvln_base_ep107_1192k_wm1w_b72_0.00003_55_40_dp0.3_3mlp_wm10000_abcf_best/stdout.txt).
90 | Final server results: [results.txt](https://biglmdiag.blob.core.windows.net/oscar/exp/nlvr2/base/exp_nlvr_base_11123_testall_b24_0.00003_55_43_dp_0.3_mlp_abcj_best/stdout.txt). 91 | ```bash 92 | python oscar/run_nlvr.py -j 4 --img_feature_dim 2054 --max_img_seq_length 93 | 40 --data_dir datasets/nlvr2/ft_corpus --model_type bert --model_name_or_path pretrained_models/base-vg-labels/ep_107_1192087 94 | --task_name nlvr --do_lower_case --max_seq_length 55 --per_gpu_eval_batch_size 95 | 64 --per_gpu_train_batch_size 72 --learning_rate 3e-05 --num_train_epochs 20 --output_dir 96 | results --img_feature_type faster_r-cnn --data_label_type all --train_data_type 97 | all --eval_data_type all --loss_type xe --save_epoch -1 --seed 88 --evaluate_during_training 98 | --logging_steps -1 --drop_out 0.3 --do_train --weight_decay 0.05 --warmup_steps 99 | 10000 --classifier mlp --cls_hidden_scale 3 --num_choice 2 --use_pair 100 | ``` 101 | 102 | Script to finetune for Oscar large model. 103 | 104 | Training logs: [eval_logs.json](https://biglmdiag.blob.core.windows.net/oscar/exp/nlvr2/large/large_1583307153868_14140/exp_rvln_large_ep55_1618k_b24_0.00002_seq55_img40_dp0.3_2mlp_wm5000_abcj/results/eval_logs.json), [output.txt](https://biglmdiag.blob.core.windows.net/oscar/exp/nlvr2/large/large_1583307153868_14140/exp_rvln_large_ep55_1618k_b24_0.00002_seq55_img40_dp0.3_2mlp_wm5000_abcj/stdout.txt).
105 | Final server results: [results.txt](https://biglmdiag.blob.core.windows.net/oscar/exp/nlvr2/large/large_1583307153868_14140/exp_nlvr_large_1583307153868_14140_testall_b24_0.00003_55_43_dp_0.3_mlp_abck/stdout.txt). 106 | ```bash 107 | python oscar/run_nlvr.py -j 4 --img_feature_dim 2054 --max_img_seq_length 108 | 40 --data_dir datasets/nlvr2/ft_corpus --model_type bert --model_name_or_path pretrained_models/large-vg-labels/ep_55_1617000 109 | --task_name nlvr --do_lower_case --max_seq_length 55 --per_gpu_eval_batch_size 110 | 64 --per_gpu_train_batch_size 24 --learning_rate 3e-05 --num_train_epochs 20 --output_dir 111 | results --img_feature_type faster_r-cnn --data_label_type all --train_data_type 112 | all --eval_data_type all --loss_type xe --save_epoch -1 --seed 88 --evaluate_during_training 113 | --logging_steps -1 --drop_out 0.3 --do_train --weight_decay 0.05 --warmup_steps 114 | 5000 --classifier mlp --cls_hidden_scale 2 --num_choice 2 --use_pair 115 | ``` 116 | 117 | 131 | 132 | ## Image Text Retrieval 133 | Script to finetune for Oscar base model (4 V100 with 16G mem): 134 | 135 | Training logs: [eval_logs.json](https://biglmdiag.blob.core.windows.net/oscar/exp/retrieval/base/eval_logs.json), [log.txt](https://biglmdiag.blob.core.windows.net/oscar/exp/retrieval/base/log.txt). 136 | Model checkpoint: [checkpoint.zip](https://biglmdiag.blob.core.windows.net/oscar/exp/retrieval/base/checkpoint.zip). 137 | 138 | ```bash 139 | python oscar/run_retrieval.py \ 140 | --model_name_or_path pretrained_models/base-vg-labels/ep_67_588997 \ 141 | --do_train \ 142 | --do_lower_case \ 143 | --evaluate_during_training \ 144 | --num_captions_per_img_val 20 \ 145 | --eval_caption_index_file minival_caption_indexs_top20.pt \ 146 | --per_gpu_train_batch_size 32 \ 147 | --learning_rate 0.00002 \ 148 | --num_train_epochs 30 \ 149 | --weight_decay 0.05 \ 150 | --save_steps 5000 \ 151 | --add_od_labels \ 152 | --od_label_type vg \ 153 | --max_seq_length 70 \ 154 | --output_dir output/ 155 | ``` 156 | 157 | Script to finetune for Oscar large model (8 V100 with 32G mem): 158 | 159 | Training logs: [eval_logs.json](https://biglmdiag.blob.core.windows.net/oscar/exp/retrieval/large/eval_logs.json), [log.txt](https://biglmdiag.blob.core.windows.net/oscar/exp/retrieval/large/log.txt). 160 | Model checkpoint: [checkpoint.zip](https://biglmdiag.blob.core.windows.net/oscar/exp/retrieval/large/checkpoint.zip). 161 | 162 | ```bash 163 | python oscar/run_retrieval.py \ 164 | --model_name_or_path pretrained_models/large-vg-labels/ep_7_816000 \ 165 | --do_train \ 166 | --do_lower_case \ 167 | --evaluate_during_training \ 168 | --num_captions_per_img_val 20 \ 169 | --eval_caption_index_file minival_caption_indexs_top20.pt \ 170 | --per_gpu_train_batch_size 16 \ 171 | --learning_rate 0.00001 \ 172 | --num_train_epochs 30 \ 173 | --save_steps 5000 \ 174 | --add_od_labels \ 175 | --od_label_type vg \ 176 | --max_seq_length 70 \ 177 | --output_dir output/ 178 | ``` 179 | 180 | Script to inference on COCO 1K test set: 181 | ```bash 182 | python oscar/run_retrieval.py \ 183 | --do_test \ 184 | --do_eval \ 185 | --test_split test \ 186 | --num_captions_per_img_val 5 \ 187 | --eval_img_keys_file test_img_keys_1k.tsv \ 188 | --cross_image_eval \ 189 | --per_gpu_eval_batch_size 64 \ 190 | --eval_model_dir your_model_for_evaluation # could be base/large models. 
191 | ``` 192 | 193 | Script to inference on COCO 5K test set: 194 | ```bash 195 | python oscar/run_retrieval.py \ 196 | --do_test \ 197 | --do_eval \ 198 | --test_split test \ 199 | --num_captions_per_img_val 5 \ 200 | --eval_img_keys_file test_img_keys.tsv \ 201 | --cross_image_eval \ 202 | --per_gpu_eval_batch_size 64 \ 203 | --eval_model_dir your_model_for_evaluation # could be base/large models. 204 | ``` 205 | 206 | 207 | ## Image Captioning on COCO 208 | Script to finetune for Oscar base model (4 V100 with 16G mem): 209 | 210 | Training logs: [log.txt](https://biglmdiag.blob.core.windows.net/oscar/exp/coco_caption/base/log.txt). 211 | Model checkpoint: [checkpoint.zip](https://biglmdiag.blob.core.windows.net/oscar/exp/coco_caption/base/checkpoint.zip). 212 | 213 | 1) First train with cross-entropy loss: 214 | ```bash 215 | python oscar/run_captioning.py \ 216 | --model_name_or_path pretrained_models/base-vg-labels/ep_67_588997 \ 217 | --do_train \ 218 | --do_lower_case \ 219 | --evaluate_during_training \ 220 | --add_od_labels \ 221 | --learning_rate 0.00003 \ 222 | --per_gpu_train_batch_size 64 \ 223 | --num_train_epochs 30 \ 224 | --save_steps 5000 \ 225 | --output_dir output/ 226 | ``` 227 | 2) Finetune with CIDEr optimization: 228 | ```bash 229 | python oscar/run_captioning.py \ 230 | --model_name_or_path your_checkpoint_from_cross_entropy \ 231 | --do_train \ 232 | --do_lower_case \ 233 | --evaluate_during_training \ 234 | --add_od_labels \ 235 | --learning_rate 0.000005 \ 236 | --per_gpu_train_batch_size 16 \ 237 | --num_train_epochs 5 \ 238 | --scst \ 239 | --save_steps 2000 \ 240 | --output_dir output/ 241 | ``` 242 | 243 | Script to finetune for Oscar large model (8 V100 with 32G mem): 244 | 1) First train with cross-entropy loss: 245 | ```bash 246 | python oscar/run_captioning.py \ 247 | --model_name_or_path pretrained_models/large-vg-labels/ep_7_816000 \ 248 | --do_train \ 249 | --do_lower_case \ 250 | --evaluate_during_training \ 251 | --add_od_labels \ 252 | --learning_rate 0.00001 \ 253 | --per_gpu_train_batch_size 32 \ 254 | --num_train_epochs 30 \ 255 | --save_steps 5000 \ 256 | --output_dir output/ 257 | ``` 258 | 2) Finetune with CIDEr optimization: 259 | ```bash 260 | python oscar/run_captioning.py \ 261 | --model_name_or_path your_checkpoint_from_cross_entropy \ 262 | --do_train \ 263 | --do_lower_case \ 264 | --evaluate_during_training \ 265 | --add_od_labels \ 266 | --learning_rate 0.000005 \ 267 | --per_gpu_train_batch_size 8 \ 268 | --num_train_epochs 5 \ 269 | --scst \ 270 | --save_steps 2000 \ 271 | --output_dir output/ 272 | ``` 273 | 274 | Script to inference on COCO test set: 275 | ```bash 276 | python oscar/run_captioning.py \ 277 | --do_test \ 278 | --do_eval \ 279 | --test_yaml test.yaml \ 280 | --per_gpu_eval_batch_size 64 \ 281 | --num_beams 5 \ 282 | --max_gen_length 20 \ 283 | --eval_model_dir your_model_for_evaluation # could be bert base/large. 284 | ``` 285 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Oscar: Object-Semantics Aligned Pre-training for Vision-and-Language Tasks 2 | # VinVL: Revisiting Visual Representations in Vision-Language Models 3 | ## Updates 4 | 5 | 04/17/2023: Visual instruction tuning with GPT-4 is released! 
Please check out the multimodal model LLaVA: [[Project Page](https://llava-vl.github.io/)] [[Paper](https://arxiv.org/abs/2304.08485)] [[Demo](https://llava.hliu.cc/)] [[Data](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K)] [[Model](https://huggingface.co/liuhaotian/LLaVA-13b-delta-v0)] 6 | 7 | 05/28/2020: Released finetuned models on downstream tasks; please check [MODEL_ZOO.md](MODEL_ZOO.md).
8 | 05/15/2020: Released pretrained models, datasets, and code for downstream task finetuning.
9 | 01/13/2021: Our new work [VinVL](https://arxiv.org/abs/2101.00529) proposed OSCAR+, an improved version of OSCAR, and provided a better object-attribute detection model to extract features for V+L tasks. VinVL achieved SOTA performance on all seven V+L tasks covered here. Please stay tuned for the model and code release.
10 | 03/08/2021: Oscar+ pretraining code is released; please check the last section in [VinVL_MODEL_ZOO.md](VinVL_MODEL_ZOO.md). All image features and model checkpoints used in VinVL are also released; please check [VinVL](https://github.com/pzzhang/VinVL) for details.
11 | 04/13/2021: Our [Scene Graph Benchmark Repo](https://github.com/microsoft/scene_graph_benchmark) has been released. You are welcome to use the code there to extract image features with VinVL pretrained models.
12 | 13 | 14 | ## Introduction 15 | This repository contains source code necessary to reproduce the results presented in the paper [Oscar: Object-Semantics Aligned Pre-training for Vision-Language Tasks](https://arxiv.org/abs/2004.06165). 16 | We propose a new cross-modal pre-training method **Oscar** (Object-Semantics Aligned Pre-training). It leverages **object tags** detected in images as anchor points to significantly ease the learning of image-text alignments. We pre-train Oscar on the public corpus of 6.5 million text-image pairs, and fine-tune it on downstream tasks, creating new state-of-the-arts on six well-established vision-language understanding and generation tasks. For more on this project, see the [Microsoft Research Blog post](https://www.microsoft.com/en-us/research/blog/objects-are-the-secret-key-to-revealing-the-world-between-vision-and-language/). 17 | 18 | 19 | 20 | 21 | ## Performance 22 | Task | t2i | t2i | i2t | i2t | IC | IC | IC | IC | NoCaps | NoCaps | VQA | NLVR2 | GQA | 23 | --------|-----|-----|-----|-----|-----|-----|------|------|--------|--------|----------|---------|---------| 24 | Metric | R@1 | R@5 | R@1 | R@5 | B@4 | M | C | S | C | S | test-std | test-P | test-std| 25 | SoTA_S |39.2 | 68.0|56.6 | 84.5|38.9 |29.2 |129.8 | 22.4 | 61.5 | 9.2 | 70.92 | 58.80 | 63.17 | 26 | SoTA_B |54.0 | 80.8|70.0 | 91.1|40.5 |29.7 |137.6 | 22.8 | 86.58| 12.38 | 73.67 | 79.30 | - | 27 | SoTA_L |57.5 | 82.8|73.5 | 92.2|41.7 |30.6 |140.0 | 24.5 | - | - | 74.93 | 81.47 | - | 28 | ----- |--- |--- |--- |--- |--- |--- |--- |--- |--- |--- |--- |--- |--- | 29 | Oscar_B |54.0 | 80.8|70.0 | 91.1|40.5 |29.7 |137.6 | 22.8 | 78.8 | 11.7 | 73.44 | 78.36 | 61.62 | 30 | Oscar_L |57.5 | 82.8|73.5 | 92.2|41.7 |30.6 |140.0 | 24.5 | 80.9 | 11.3 | 73.82 | 80.05 | - | 31 | ----- |--- |--- |--- |--- |--- |--- |--- |--- |--- |--- |--- |--- |--- | 32 | VinVL_B |58.1 | 83.2|74.6 | 92.6|40.9 |30.9 |140.6 | 25.1 | 92.46| 13.07 | 76.12 | 83.08 | 64.65 | 33 | VinVL_L |58.8 | 83.5|75.4 | 92.9|41.0 |31.1 |140.9 | 25.2 | - | - | 76.62 | 83.98 | - | 34 | gain | 1.3 | 0.7| 1.9 | 0.6| -0.7| 0.5 | 0.9 | 0.7 | 5.9 | 0.7 | 1.69 | 2.51 | 1.48 | 35 | 36 | t2i: text-to-image retrieval; i2t: image-to-text retrieval; IC: image captioning on COCO. 37 | 38 | 39 | ## Download 40 | We released pre-trained models, datasets, VinVL image features, and Oscar+ pretraining corpus for downstream tasks. 41 | Please check [VinVL_DOWNLOAD.md](VinVL_DOWNLOAD.md) for details. 42 | 43 | To download checkpoints for the Vanilla OSCAR, please check [DOWNLOAD.md](DOWNLOAD.md) for details. 44 | 45 | ## Installation 46 | Check [INSTALL.md](INSTALL.md) for installation instructions. 47 | 48 | ## Model Zoo 49 | Check [MODEL_ZOO.md](MODEL_ZOO.md) for scripts to run oscar downstream finetuning. 50 | 51 | Check [VinVL_MODEL_ZOO.md](VinVL_MODEL_ZOO.md) for scripts to run oscar+ pretraining and downstream finetuning. 
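The finetuning scripts in [MODEL_ZOO.md](MODEL_ZOO.md) and [VinVL_MODEL_ZOO.md](VinVL_MODEL_ZOO.md) all point `--model_name_or_path` at an unzipped checkpoint directory. As a quick sanity check that a downloaded checkpoint is usable, the minimal sketch below loads its config and tokenizer; it assumes the pinned `transformers` submodule from [INSTALL.md](INSTALL.md) is importable and that the directory contains the usual `config.json` and vocab files (the path shown is illustrative).

```python
# Minimal sanity check for a downloaded Oscar checkpoint directory (illustrative sketch).
from transformers.pytorch_transformers import BertConfig, BertTokenizer

ckpt_dir = "pretrained_models/base-vg-labels/ep_67_588997"  # adjust to your unzip location

config = BertConfig.from_pretrained(ckpt_dir)  # reads config.json
tokenizer = BertTokenizer.from_pretrained(ckpt_dir, do_lower_case=True)  # reads the vocab file

print("hidden size:", config.hidden_size)
print(tokenizer.tokenize("a dog is sitting on a couch"))
```

If this loads without errors, the same directory can be passed to `--model_name_or_path` in the task scripts of the model zoo files.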
52 | 53 | ## Citations 54 | Please consider citing this paper if you use the code: 55 | ``` 56 | @article{li2020oscar, 57 | title={Oscar: Object-Semantics Aligned Pre-training for Vision-Language Tasks}, 58 | author={Li, Xiujun and Yin, Xi and Li, Chunyuan and Hu, Xiaowei and Zhang, Pengchuan and Zhang, Lei and Wang, Lijuan and Hu, Houdong and Dong, Li and Wei, Furu and Choi, Yejin and Gao, Jianfeng}, 59 | journal={ECCV 2020}, 60 | year={2020} 61 | } 62 | 63 | @article{zhang2021vinvl, 64 | title={VinVL: Making Visual Representations Matter in Vision-Language Models}, 65 | author={Zhang, Pengchuan and Li, Xiujun and Hu, Xiaowei and Yang, Jianwei and Zhang, Lei and Wang, Lijuan and Choi, Yejin and Gao, Jianfeng}, 66 | journal={CVPR 2021}, 67 | year={2021} 68 | } 69 | ``` 70 | 71 | ## License 72 | Oscar is released under the MIT license. See [LICENSE](LICENSE) for details. 73 | 74 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. 
Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). 40 | 41 | -------------------------------------------------------------------------------- /VinVL_DOWNLOAD.md: -------------------------------------------------------------------------------- 1 | # Download 2 | 3 | ## Datasets 4 | We provide the extracted image region features, object tags, and the original text annotations for each downstream tasks. 5 | ```bash 6 | path/to/azcopy copy 'https://biglmdiag.blob.core.windows.net/vinvl/datasets/TASK_NAME' --recursive 7 | ``` 8 | `TASK_NAME` could be `coco_caption`, `nocaps`, `coco_ir`, `vqa`, `gqa`, `nlvr2`. 9 | 10 | ## Pre-trained Models 11 | We provide pre-trained *Oscar+* models of Bert-base and Bert-large structures, with the name starting with `base` and `large`, respectively. 12 | ```bash 13 | path/to/azcopy copy 'https://biglmdiag.blob.core.windows.net/vinvl/model_ckpts/TASK_NAME' --recursive 14 | ``` 15 | `TASK_NAME` could be `image_captioning` (including `nocaps`), `coco_ir`, `vqa`, `gqa`, `nlvr2`, `od_models`. 16 | 17 | The models are trained with both image region features and object tags. The image region features are extracted by the Faster R-CNN with 18 | ResNet-101, using object and attribute annotations from [Visual Genome](http://visualgenome.org/). 19 | The object tags are from: 20 | 1) the same VisualGenome model, named as `-vg-labels`. Or, 21 | 2) the model trained on object annotations from [Open Images V5](https://storage.googleapis.com/openimages/web/index.html). named as `-oid-labels`. Or, 22 | 3) no object tags provied, serving as baseline, named as `-no-labels`. 23 | 24 | ## Pre-exacted Image Features 25 | For ease-of-use, we make pretrained features available for all pretraining datasets and downstream tasks. 26 | Features are stored in tsv (tab-separated-values) format that can be used in [pretraining](oscar/datasets/oscar_tsv.py) and dowstream tasks like [COCO Image-Text Retrieval](oscar/run_retrieval.py). 27 | 28 | Notice that all the links below are links to a folder. We recommend using the following AzCopy command to download. 
29 | ``` 30 | path/to/azcopy copy --recursive 31 | ``` 32 | 33 | [COCO 2014 Train/Val Image Features (~50G)](https://biglmdiag.blob.core.windows.net/vinvl/image_features/coco_X152C4_frcnnbig2_exp168model_0060000model.roi_heads.nm_filter_2_model.roi_heads.score_thresh_0.2/model_0060000/) 34 | 35 | [COCO 2014 Test Image Features (~16G)](https://biglmdiag.blob.core.windows.net/vinvl/image_features/coco_X152C4_frcnnbig2_exp168model_0060000model.roi_heads.nm_filter_2_model.roi_heads.score_thresh_0.2/model_0060000/coco2014test/) 36 | 37 | [COCO 2015 Test Image Features (~32G)](https://biglmdiag.blob.core.windows.net/vinvl/image_features/coco_X152C4_frcnnbig2_exp168model_0060000model.roi_heads.nm_filter_2_model.roi_heads.score_thresh_0.2/model_0060000/coco2015test/) 38 | 39 | [GQA All Image Features (~62G)](https://biglmdiag.blob.core.windows.net/vinvl/image_features/gqa_X152C4_frcnnbig2_exp168model_0060000model.roi_heads.nm_filter_2_model.roi_heads.score_thresh_0.2/model_0060000/) 40 | 41 | [NVLR2 Train/Del/Test Image Features (~28G)](https://biglmdiag.blob.core.windows.net/vinvl/image_features/nlvr2_X152C4_frcnnbig2_exp168model_0060000model.roi_heads.nm_filter_2_model.roi_heads.score_thresh_0.2/) 42 | 43 | [Flickr30k All Image Features (~14G)](https://biglmdiag.blob.core.windows.net/vinvl/image_features/flickr30k_X152C4_frcnnbig2_exp168model_0060000model.roi_heads.nm_filter_2_model.roi_heads.score_thresh_0.2/model_0060000/) 44 | 45 | [Google Conceptual Captions Image Features (Huge, ~960G, splitted into 12 chunks)](https://biglmdiag.blob.core.windows.net/vinvl/image_features/googlecc_X152C4_frcnnbig2_exp168model_0060000model.roi_heads.nm_filter_2_model.roi_heads.score_thresh_0.2/) 46 | 47 | [SBU Image Features (Huge, ~280G, splitted into 4 chunks)](https://biglmdiag.blob.core.windows.net/vinvl/image_features/sbu_X152C4_frcnnbig2_exp168model_0060000model.roi_heads.nm_filter_2_model.roi_heads.score_thresh_0.2/model_0060000/) 48 | 49 | [Open Images Detection Image Features (Huge, ~530G, splitted into 8 chunks)](https://biglmdiag.blob.core.windows.net/vinvl/image_features/oi_X152C4_frcnnbig2_exp168model_0060000model.roi_heads.nm_filter_2_model.roi_heads.score_thresh_0.2/model_0060000/) 50 | 51 | 52 | ## Oscar+ pretraining corpus 53 | 54 | 55 | [Small corpus](https://biglmdiag.blob.core.windows.net/vinvl/pretrain_corpus/coco_flickr30k_gqa.tsv) 56 | 57 | [Medium corpus](https://biglmdiag.blob.core.windows.net/vinvl/pretrain_corpus/coco_flickr30k_gqa_oi.tsv) 58 | 59 | [Large corpus](https://biglmdiag.blob.core.windows.net/vinvl/pretrain_corpus/coco_flickr30k_googlecc_gqa_sbu_oi.tsv) 60 | 61 | We have tried our best to make sure that there is no data contamination between pretraining corpus and test sets for downstream tasks. 62 | More specifically, we use two methods to achieve this. 63 | (1) We use the COCO Image ID of Visual Genome and Flickr30k images. 64 | (2) For COCO, Visual Genome and Flickr30k, we calucate the pair-wise l2 norm between two images after resizing them into the same size. 65 | 66 | 67 | ### Note 68 | It is recommended to download large files with **AzCopy** for faster speed. 69 | AzCopy executable tools can be downloaded [here](https://docs.microsoft.com/en-us/azure/storage/common/storage-use-azcopy-v10#download-azcopy). 70 | Decompress the tar file and put the executable in any path. 
To download from 71 | any URL above, the command is: 72 | ```bash 73 | path/to/azcopy copy 74 | 75 | # for example, downloading coco_caption.zip 76 | path/to/azcopy copy https://biglmdiag.blob.core.windows.net/oscar/datasets/coco_caption.zip 77 | ``` 78 | 79 | -------------------------------------------------------------------------------- /VinVL_MODEL_ZOO.md: -------------------------------------------------------------------------------- 1 | ## Table of Contents 2 | - VQA 3 | - GQA 4 | - NLVR2 5 | - Image/Text Retrieval 6 | - Image Captioning on COCO 7 | - Oscarplus pretraining 8 | 9 | 10 | ## Performance 11 | Task | t2i | t2i | i2t | i2t | IC | IC | IC | IC | NoCaps | NoCaps | VQA | NLVR2 | GQA | 12 | --------|-----|-----|-----|-----|-----|-----|------|------|--------|--------|----------|---------|---------| 13 | Metric | R@1 | R@5 | R@1 | R@5 | B@4 | M | C | S | C | S | test-std | test-P | test-std| 14 | SoTA_S |39.2 | 68.0|56.6 | 84.5|38.9 |29.2 |129.8 | 22.4 | 61.5 | 9.2 | 70.92 | 58.80 | 63.17 | 15 | SoTA_B |54.0 | 80.8|70.0 | 91.1|40.5 |29.7 |137.6 | 22.8 | 86.58| 12.38 | 73.67 | 79.30 | 61.62 | 16 | SoTA_L |57.5 | 82.8|73.5 | 92.2|41.7 |30.6 |140.0 | 24.5 | - | - | 74.93 | 81.47 | - | 17 | ----- |--- |--- |--- |--- |--- |--- |--- |--- |--- |--- |--- |--- |--- | 18 | VinVL_B |58.1 | 83.2|74.6 | 92.6|40.9 |30.9 |140.4 | 25.1 | 92.46 (with [VIVO](https://arxiv.org/abs/2009.13682))| 13.07 (with [VIVO](https://arxiv.org/abs/2009.13682)) | 76.12 | 83.08 | 64.65 | 19 | VinVL_L |58.8 | 83.5|75.4 | 92.9|41.0 |31.1 |140.9 | 25.2 | - | - | 76.62 | 83.98 | - | 20 | gain | 1.3 | 0.7| 1.9 | 0.6| -0.7| 0.5 | 0.9 | 0.7 | 5.9 | 0.7 | 1.69 | 2.51 | 1.48 | 21 | 22 | t2i: text-to-image retrieval; i2t: image-to-text retrieval; IC: image captioning on COCO. 23 | 24 | For reference, we also release the training logs and output. 25 | 26 | 27 | ## VQA 28 | Script to finetune for Oscar base model. 29 | Base model is trained on train split and evaluated on the val split. Good for later comparison. 30 | 33 | Final server results: [results.txt](https://biglmdiag.blob.core.windows.net/vinvl/model_ckpts/vqa/base/test/results.txt).
34 | Model checkpoint: [.zip](https://biglmdiag.blob.core.windows.net/vinvl/model_ckpts/vqa/base/best.zip). 35 | ```bash 36 | python oscar/run_vqa.py -j 4 --img_feature_dim 2054 --max_img_seq_length 37 | 50 --data_label_type mask --img_feature_type faster_r-cnn --data_dir vinvl/datasets/vqa 38 | --model_type bert --model_name_or_path vinvl/model_ckpts/vqa/base/checkpoint-2000000 39 | --task_name vqa_text --do_train --do_lower_case --max_seq_length 128 --per_gpu_eval_batch_size 40 | 256 --per_gpu_train_batch_size 32 --learning_rate 5e-05 --num_train_epochs 25 41 | --output_dir results --label_file datasets/vqa/cache/trainval_ans2label.pkl 42 | --save_epoch 1 --seed 88 --evaluate_during_training --logging_steps 4000 --drop_out 43 | 0.3 --weight_decay 0.05 --warmup_steps 0 --loss_type bce --img_feat_format pt 44 | --classifier linear --cls_hidden_scale 3 --txt_data_dir vinvl/datasets/vqa 45 | ``` 46 | 47 | Script to finetune for Oscar large model. 48 | Large model is trained on train+val split and evaluated on the val split, for reproduce the paper's best result. 49 | 50 | 53 | Final server results: [results.txt](https://biglmdiag.blob.core.windows.net/vinvl/model_ckpts/vqa/large/test/results.txt).
54 | Model checkpoint: [.zip](https://biglmdiag.blob.core.windows.net/vinvl/model_ckpts/vqa/large/best.zip). 55 | ```bash 56 | python oscar/run_vqa.py -j 4 --img_feature_dim 2054 --max_img_seq_length 57 | 50 --data_label_type mask --img_feature_type faster_r-cnn --data_dir vinvl/datasets/vqa 58 | --model_type bert --model_name_or_path vinvl/model_ckpts/vqa/large/checkpoint-2000000 59 | --task_name vqa_text --do_train_val --do_lower_case --max_seq_length 128 --per_gpu_eval_batch_size 60 | 256 --per_gpu_train_batch_size 24 --learning_rate 3e-05 --num_train_epochs 25 61 | --label_file datasets/vqa/cache/trainval_ans2label.pkl --save_epoch 30 62 | --seed 88 --evaluate_during_training --logging_steps 4000 --drop_out 0.3 --weight_decay 63 | 0.05 --warmup_steps 0 --loss_type bce --save_after_epoch 15 --output_dir results --img_feat_format pt --classifier linear --cls_hidden_scale 3 --txt_data_dir vinvl/datasets/vqa 64 | ``` 65 | 66 | 67 | ## GQA 68 | Script to finetune for Oscar base model. 69 | 70 | 73 | Final server results: [results.txt](https://biglmdiag.blob.core.windows.net/vinvl/model_ckpts/gqa/base/results.txt).
74 | Model checkpoint: [.zip](https://biglmdiag.blob.core.windows.net/vinvl/model_ckpts/gqa/base/best.zip). 75 | ```bash 76 | python oscar/run_gqa.py -j 4 --img_feature_dim 2054 --max_img_seq_length 77 | 45 --data_dir vinvl/datasets/gqa --model_type bert --model_name_or_path vinvl/model_ckpts/vqa/base/checkpoint-2000000 78 | --task_name gqa --do_lower_case --max_seq_length 165 --per_gpu_eval_batch_size 79 | 256 --per_gpu_train_batch_size 48 --learning_rate 5e-05 --num_train_epochs 5 --output_dir 80 | results --label_file vinvl/datasets/gqa/trainval_testdev_all_ans2label.pkl 81 | --img_feature_type faster_r-cnn --data_label_type all --train_data_type all --eval_data_type 82 | bal --label2ans_file vinvl/datasets/gqa/trainval_testdev_all_label2ans.pkl 83 | --loss_type xe --save_epoch 2 --seed 88 --evaluate_during_training --logging_steps 84 | 4000 --drop_out 0.3 --do_train --weight_decay 0.05 --warmup_steps 0 85 | ``` 86 | 87 | ## NLVR2 88 | Script to finetune for Oscar base model. 89 | 90 | 93 | Final server results: [results.txt](https://biglmdiag.blob.core.windows.net/vinvl/model_ckpts/nlvr2/base/rvln_base_oscar_v2_71.5_86236_test_done_best/exp_rvln_base_oscar_v2_71.5_86236_test_b24_0.00003_55_41_dp_0.3_mlp_abch/stdout.txt).
94 | Model checkpoint: [.zip](https://biglmdiag.blob.core.windows.net/vinvl/model_ckpts/nlvr2/base/best.zip). 95 | ```bash 96 | python oscar/run_nlvr.py -j 4 --img_feature_dim 2054 --max_img_seq_length 97 | 40 --data_dir vinvl/datasets/nlvr2 --model_type bert --model_name_or_path vinvl/model_ckpts/vqa/base/checkpoint-2000000 98 | --task_name nlvr --do_lower_case --max_seq_length 55 --per_gpu_eval_batch_size 99 | 64 --per_gpu_train_batch_size 72 --learning_rate 3e-05 --num_train_epochs 20 --output_dir 100 | results --img_feature_type faster_r-cnn --data_label_type all --train_data_type 101 | all --eval_data_type all --loss_type xe --save_epoch -1 --seed 88 --evaluate_during_training 102 | --logging_steps -1 --drop_out 0.3 --do_train --weight_decay 0.05 --warmup_steps 103 | 10000 --classifier mlp --cls_hidden_scale 3 --num_choice 2 --use_pair 104 | ``` 105 | 106 | Script to finetune for Oscar large model. 107 | 108 | 111 | Final server results: [results.txt](https://biglmdiag.blob.core.windows.net/vinvl/model_ckpts/nlvr2/large/rvln_oscar_v2_large_99617_test_done_best/exp_rvln_oscar_v2_large_99617_test_b24_0.00003_55_50_dp_0.3_mlp_abce/stdout.txt).
112 | Model checkpoint: [.zip](https://biglmdiag.blob.core.windows.net/vinvl/model_ckpts/nlvr2/large/best.zip). 113 | ```bash 114 | python oscar/run_nlvr.py -j 4 --img_feature_dim 2054 --max_img_seq_length 115 | 40 --data_dir vinvl/datasets/nlvr2 --model_type bert --model_name_or_path vinvl/model_ckpts/vqa/large/checkpoint-2000000 116 | --task_name nlvr --do_lower_case --max_seq_length 55 --per_gpu_eval_batch_size 117 | 64 --per_gpu_train_batch_size 24 --learning_rate 3e-05 --num_train_epochs 20 --output_dir 118 | results --img_feature_type faster_r-cnn --data_label_type all --train_data_type 119 | all --eval_data_type all --loss_type xe --save_epoch -1 --seed 88 --evaluate_during_training 120 | --logging_steps -1 --drop_out 0.3 --do_train --weight_decay 0.05 --warmup_steps 121 | 5000 --classifier mlp --cls_hidden_scale 2 --num_choice 2 --use_pair 122 | ``` 123 | 124 | 138 | 139 | ## Image Text Retrieval 140 | Script to finetune for Oscarplus base model (8 V100 with 16G mem): 141 | 142 | Training logs: [train_logs](https://biglmdiag.blob.core.windows.net/vinvl/model_ckpts/coco_ir/base/train_logs/), 143 | 144 | Training logs: [test_logs](https://biglmdiag.blob.core.windows.net/vinvl/model_ckpts/coco_ir/base/test_logs/), 145 | 146 | Command [command](https://biglmdiag.blob.core.windows.net/vinvl/model_ckpts/coco_ir/base/philly.yaml). 147 | 148 | Model checkpoint: [ckeckpoint](https://biglmdiag.blob.core.windows.net/vinvl/model_ckpts/coco_ir/base/checkpoint-0132780/). 149 | 150 | ```bash 151 | python oscar/run_retrieval.py \ 152 | --model_name_or_path vinvl/coco_ir/base/checkpoint-1340000 \ 153 | --do_train \ 154 | --do_lower_case \ 155 | --evaluate_during_training \ 156 | --num_captions_per_img_val 20 \ 157 | --eval_caption_index_file minival_caption_indexs_top20.pt \ 158 | --per_gpu_train_batch_size 16 \ 159 | --learning_rate 0.00002 \ 160 | --num_train_epochs 30 \ 161 | --weight_decay 0.05 \ 162 | --save_steps 5000 \ 163 | --add_od_labels \ 164 | --od_label_type vg \ 165 | --max_seq_length 70 \ 166 | --max_img_seq_length 70 \ 167 | --output_dir output/ 168 | ``` 169 | 170 | Script to finetune for Oscarplus large model (8 V100 with 32G mem): 171 | 172 | Training logs: [train_logs](https://biglmdiag.blob.core.windows.net/vinvl/model_ckpts/coco_ir/large/train_logs/), 173 | 174 | Training logs: [test_logs](https://biglmdiag.blob.core.windows.net/vinvl/model_ckpts/coco_ir/large/test_logs/), 175 | 176 | Command [command](https://biglmdiag.blob.core.windows.net/vinvl/model_ckpts/coco_ir/large/philly.yaml). 177 | 178 | Model checkpoint: [ckeckpoint](https://biglmdiag.blob.core.windows.net/vinvl/model_ckpts/coco_ir/large/checkpoint-0132780/). 
179 | 180 | ```bash 181 | python oscar/run_retrieval.py \ 182 | --model_name_or_path vinvl/coco_ir/base/checkpoint-0660000 \ 183 | --do_train \ 184 | --do_lower_case \ 185 | --evaluate_during_training \ 186 | --num_captions_per_img_val 20 \ 187 | --eval_caption_index_file minival_caption_indexs_top20.pt \ 188 | --per_gpu_train_batch_size 16 \ 189 | --learning_rate 7.5e-06 \ 190 | --num_train_epochs 30 \ 191 | --save_steps 5000 \ 192 | --add_od_labels \ 193 | --od_label_type vg \ 194 | --max_seq_length 70 \ 195 | --max_img_seq_length 70 \ 196 | --output_dir output \ 197 | --img_feat_file vinvl/image_features/coco_X152C4_frcnnbig2_exp168model_0060000model.roi_heads.nm_filter_2_model.roi_heads.score_thresh_0.2/model_0060000/features.tsv 198 | ``` 199 | 200 | Script to inference on COCO 1K test set: 201 | ```bash 202 | python oscar/run_retrieval.py \ 203 | --do_test \ 204 | --do_eval \ 205 | --test_split test \ 206 | --num_captions_per_img_val 5 \ 207 | --eval_img_keys_file test_img_keys_1k.tsv \ 208 | --cross_image_eval \ 209 | --per_gpu_eval_batch_size 64 \ 210 | --img_feat_file vinvl/image_features/coco_X152C4_frcnnbig2_exp168model_0060000model.roi_heads.nm_filter_2_model.roi_heads.score_thresh_0.2/model_0060000/features.tsv \ 211 | --eval_model_dir your_model_for_evaluation # could be base/large models. 212 | ``` 213 | 214 | Script to inference on COCO 5K test set: 215 | ```bash 216 | python oscar/run_retrieval.py \ 217 | --do_test \ 218 | --do_eval \ 219 | --test_split test \ 220 | --num_captions_per_img_val 5 \ 221 | --eval_img_keys_file test_img_keys.tsv \ 222 | --cross_image_eval \ 223 | --per_gpu_eval_batch_size 64 \ 224 | --img_feat_file vinvl/image_features/coco_X152C4_frcnnbig2_exp168model_0060000model.roi_heads.nm_filter_2_model.roi_heads.score_thresh_0.2/model_0060000/features.tsv \ 225 | --eval_model_dir your_model_for_evaluation # could be base/large models. 226 | ``` 227 | 228 | 229 | ## Image Captioning on COCO 230 | Script to finetune for base model: 231 | 232 | Pretrained model checkpoint: [pretrained_base.zip](https://biglmdiag.blob.core.windows.net/vinvl/model_ckpts/image_captioning/pretrained_base.zip). 233 | Finetuned model checkpoint (w/ cross entropy): [coco_captioning_base_xe.zip](https://biglmdiag.blob.core.windows.net/vinvl/model_ckpts/image_captioning/coco_captioning_base_xe.zip). 234 | Finetuned model checkpoint (w/ CIDEr optimization): [coco_captioning_base_scst.zip](https://biglmdiag.blob.core.windows.net/vinvl/model_ckpts/image_captioning/coco_captioning_base_scst.zip). 
235 | 236 | 1) First train with cross-entropy loss (8 V100 with 16G mem): 237 | ```bash 238 | python oscar/run_captioning.py \ 239 | --model_name_or_path pretrained_models/image_captioning/pretrained_base \ 240 | --do_train \ 241 | --do_lower_case \ 242 | --add_od_labels \ 243 | --learning_rate 3e-5 \ 244 | --per_gpu_train_batch_size 64 \ 245 | --num_train_epochs 60 \ 246 | --tie_weights \ 247 | --freeze_embedding \ 248 | --label_smoothing 0.1 \ 249 | --drop_worst_ratio 0.2 \ 250 | --drop_worst_after 20000 \ 251 | --output_dir output/ 252 | ``` 253 | 2) Finetune with CIDEr optimization (8 V100 with 32G mem): 254 | ```bash 255 | python oscar/run_captioning.py \ 256 | --model_name_or_path your_checkpoint_from_cross_entropy \ 257 | --do_train \ 258 | --do_lower_case \ 259 | --add_od_labels \ 260 | --learning_rate 3e-6 \ 261 | --per_gpu_train_batch_size 16 \ 262 | --num_train_epochs 75 \ 263 | --tie_weights \ 264 | --freeze_embedding \ 265 | --scst \ 266 | --output_dir output/ 267 | ``` 268 | 269 | Script to finetune for large model: 270 | 271 | Pretrained model checkpoint: [pretrained_large.zip](https://biglmdiag.blob.core.windows.net/vinvl/model_ckpts/image_captioning/pretrained_large.zip). 272 | Finetuned model checkpoint (w/ cross entropy): [coco_captioning_large_xe.zip](https://biglmdiag.blob.core.windows.net/vinvl/model_ckpts/image_captioning/coco_captioning_large_xe.zip). 273 | Finetuned model checkpoint (w/ CIDEr optimization): [coco_captioning_large_scst.zip](https://biglmdiag.blob.core.windows.net/vinvl/model_ckpts/image_captioning/coco_captioning_large_scst.zip). 274 | 275 | 1) First train with cross-entropy loss (8 V100 with 32G mem): 276 | ```bash 277 | python oscar/run_captioning.py \ 278 | --model_name_or_path pretrained_models/image_captioning/pretrained_large \ 279 | --do_train \ 280 | --do_lower_case \ 281 | --add_od_labels \ 282 | --learning_rate 1e-5 \ 283 | --per_gpu_train_batch_size 64 \ 284 | --num_train_epochs 60 \ 285 | --tie_weights \ 286 | --freeze_embedding \ 287 | --label_smoothing 0.1 \ 288 | --drop_worst_ratio 0.2 \ 289 | --drop_worst_after 20000 \ 290 | --output_dir output/ 291 | ``` 292 | 2) Finetune with CIDEr optimization (8 V100 with 32G mem): 293 | ```bash 294 | python oscar/run_captioning.py \ 295 | --model_name_or_path your_checkpoint_from_cross_entropy \ 296 | --do_train \ 297 | --do_lower_case \ 298 | --add_od_labels \ 299 | --learning_rate 8e-7 \ 300 | --per_gpu_train_batch_size 6 \ 301 | --num_train_epochs 25 \ 302 | --tie_weights \ 303 | --freeze_embedding \ 304 | --scst \ 305 | --output_dir output/ 306 | ``` 307 | 308 | Script to inference on COCO test set: 309 | ```bash 310 | python oscar/run_captioning.py \ 311 | --do_test \ 312 | --do_eval \ 313 | --test_yaml test.yaml \ 314 | --per_gpu_eval_batch_size 64 \ 315 | --num_beams 5 \ 316 | --max_gen_length 20 \ 317 | --eval_model_dir your_model_for_evaluation # could be base or large models 318 | ``` 319 | 320 | ## Image Captioning on NoCaps 321 | Note that [NoCaps] (https://nocaps.org/) does not allow to use extra 322 | image-caption pairs for training except COCO. So the model is directly initialized 323 | from bert-base, and trained on COCO data. 324 | 325 | Script to train base model: 326 | 327 | Finetuned model checkpoint (w/ cross entropy): [nocaps_base_xe.zip](https://biglmdiag.blob.core.windows.net/vinvl/model_ckpts/image_captioning/nocaps_base_xe.zip). 
328 | Finetuned model checkpoint (w/ CIDEr optimization): [nocaps_base_scst.zip](https://biglmdiag.blob.core.windows.net/vinvl/model_ckpts/image_captioning/nocaps_base_scst.zip). 329 | 330 | 1) First train with cross-entropy loss (4 V100 with 16G mem): 331 | ```bash 332 | python oscar/run_captioning.py \ 333 | --model_name_or_path bert-base-uncased \ 334 | --do_train \ 335 | --do_lower_case \ 336 | --add_od_labels \ 337 | --learning_rate 0.0001 \ 338 | --per_gpu_train_batch_size 64 \ 339 | --num_train_epochs 30 \ 340 | --tie_weights \ 341 | --freeze_embedding \ 342 | --output_dir output/ 343 | ``` 344 | 2) Train with CIDEr optimization (8 V100 with 32G mem): 345 | ```bash 346 | python oscar/run_captioning.py \ 347 | --model_name_or_path your_checkpoint_from_cross_entropy \ 348 | --do_train \ 349 | --do_lower_case \ 350 | --add_od_labels \ 351 | --scheduler constant \ 352 | --learning_rate 5e-6 \ 353 | --per_gpu_train_batch_size 14 \ 354 | --num_train_epochs 50 \ 355 | --tie_weights \ 356 | --freeze_embedding \ 357 | --scst \ 358 | --output_dir output/ 359 | ``` 360 | 361 | Script to inference on NoCaps val set with Constrained Beam Search: 362 | ```bash 363 | python oscar/run_captioning.py \ 364 | --do_test \ 365 | --do_eval \ 366 | --data_dir datasets/nocaps \ 367 | --test_yaml val.yaml \ 368 | --per_gpu_eval_batch_size 2 \ 369 | --num_beams 5 \ 370 | --use_cbs \ 371 | --max_gen_length 20 \ 372 | --eval_model_dir your_model_for_evaluation 373 | ``` 374 | 375 | 383 | 384 | ## Oscarplus pretraining 385 | Table 16 below shows the statistics of image and text of the pre-training corpora. 386 | In our ablation study, we have corpora of three different sizes: [Small](https://biglmdiag.blob.core.windows.net/vinvl/pretrain_corpus/coco_flickr30k_gqa_x152c4big2exp168.yaml), [Medium](https://biglmdiag.blob.core.windows.net/vinvl/pretrain_corpus/coco_flickr30k_gqa_oi_x152c4big2exp168.yaml), [Large](https://biglmdiag.blob.core.windows.net/vinvl/pretrain_corpus/coco_flickr30k_googlecc_gqa_sbu_oi_x152c4big2exp168.yaml). 387 | Notice that we make use of image tagging datasets OpenImages, by generating captions using OSCAR's image captioning model to form triplets of ``(generated caption, image tags, image features)'' for the OSCAR+ pre-training. 388 | 389 | 390 | Script to perform oscar+ pretraining with the [large corpus](https://biglmdiag.blob.core.windows.net/vinvl/pretrain_corpus/coco_flickr30k_googlecc_gqa_sbu_oi_x152c4big2exp168.yaml). 391 | ```bash 392 | python -m torch.distributed.launch --nproc_per_node=8 oscar/run_oscarplus_pretrain.py \ 393 | --use_b 1 \ 394 | --max_grad_norm 10.0 --gradient_accumulation_steps 1 \ 395 | --use_img_layernorm 1 \ 396 | --output_dir \ 397 | --bert_model bert --model_name_or_path bert-base-uncased \ 398 | --do_lower_case --learning_rate 5e-05 399 | --warmup_steps 0 --do_train --max_seq_length 35 --on_memory \ 400 | --max_img_seq_length 50 --img_feature_dim 2054 \ 401 | --drop_out 0.1 --train_batch_size 8 \ 402 | --ckpt_period 10000 --max_iters 2000000 --log_period 100 \ 403 | --data_dir --dataset_file coco_flickr30k_googlecc_gqa_sbu_oi_x152c4big2exp168.yaml 404 | --textb_sample_mode 1 --texta_false_prob 0.25 405 | ``` 406 | 407 | 408 | One can perform the vanilla OSCAR pretraining by setting 409 | ```bash 410 | --textb_sample_mode 0 --texta_false_prob 0.0 411 | ``` 412 | 413 | One can also split the large pretraining corpus into two parts, i.e., coco_flickr30k_gqa + googlecc_sbu_oi, and use different textb_sample_modes for them. 
414 | Setting textb_sample_mode=2 for coco_flickr30k_gqa has the potential to emphasize the QA pairs in the small corpus.
415 | ```bash
416 |     --data_dir your_data_folder --dataset_file coco_flickr30k_gqa_x152c4big2exp168.yaml \
417 |     --textb_sample_mode 2 --texta_false_prob 0.25 \
418 |     --extra_dataset_file googlecc_sbu_oi_x152c4big2exp168.yaml \
419 |     --extra_textb_sample_mode 1 --extra_loss_weight 0.5
420 | ```
--------------------------------------------------------------------------------
/docs/oscar.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/Oscar/266075fef2486846bb7110fbb6232074e09e076d/docs/oscar.PNG
--------------------------------------------------------------------------------
/docs/oscar_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/Oscar/266075fef2486846bb7110fbb6232074e09e076d/docs/oscar_logo.png
--------------------------------------------------------------------------------
/docs/pretrain_corpus.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/Oscar/266075fef2486846bb7110fbb6232074e09e076d/docs/pretrain_corpus.PNG
--------------------------------------------------------------------------------
/oscar/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.1.0"
2 | 
--------------------------------------------------------------------------------
/oscar/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.1.0"
2 | 
--------------------------------------------------------------------------------
/oscar/datasets/build.py:
--------------------------------------------------------------------------------
1 | import os
2 | import logging
3 | import torch
4 | from oscar.utils.misc import get_world_size
5 | from .oscar_tsv import OscarTSVDataset
6 | from transformers.pytorch_transformers import BertTokenizer
7 | 
8 | 
9 | class BatchCollator(object):
10 |     """
11 |     From a list of samples from the dataset,
12 |     returns the images and targets.
13 |     """
14 |     def __call__(self, batch):
15 |         return list(zip(*batch))
16 | 
17 | 
18 | def build_dataset(args):
19 |     """
20 |     Arguments:
21 |         args: configuration.
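    Returns:
        a list with one OscarTSVDataset built from args.dataset_file, plus a
        second dataset when args.extra_dataset_file is set.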
22 | """ 23 | full_yaml_file = os.path.join(args.data_dir, args.dataset_file) 24 | assert os.path.isfile(full_yaml_file) 25 | 26 | tokenizer = BertTokenizer.from_pretrained( 27 | args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, 28 | do_lower_case=args.do_lower_case) 29 | 30 | cfg = dict( 31 | yaml_file=full_yaml_file, 32 | args=args, 33 | seq_len=args.max_seq_length, 34 | on_memory=args.on_memory, 35 | tokenizer=tokenizer, 36 | ) 37 | # make dataset from factory 38 | datasets = [OscarTSVDataset(**cfg)] 39 | if args.extra_dataset_file: 40 | full_yaml_file = os.path.join(args.data_dir, args.extra_dataset_file) 41 | assert os.path.isfile(full_yaml_file) 42 | cfg['yaml_file'] = full_yaml_file 43 | cfg['textb_sample_mode'] = args.extra_textb_sample_mode 44 | datasets.append(OscarTSVDataset(**cfg)) 45 | 46 | return datasets 47 | 48 | 49 | def make_data_sampler(dataset, shuffle, distributed): 50 | if distributed: 51 | return torch.utils.data.distributed.DistributedSampler( 52 | dataset, shuffle=shuffle 53 | ) 54 | if shuffle: 55 | sampler = torch.utils.data.sampler.RandomSampler(dataset) 56 | else: 57 | sampler = torch.utils.data.sampler.SequentialSampler(dataset) 58 | return sampler 59 | 60 | 61 | class IterationBasedBatchSampler(torch.utils.data.sampler.BatchSampler): 62 | """ 63 | Wraps a BatchSampler, resampling from it until 64 | a specified number of iterations have been sampled 65 | """ 66 | 67 | def __init__(self, batch_sampler, num_iterations, start_iter=0): 68 | self.batch_sampler = batch_sampler 69 | self.num_iterations = num_iterations 70 | self.start_iter = start_iter 71 | 72 | def __iter__(self): 73 | iteration = self.start_iter 74 | while iteration <= self.num_iterations: 75 | # if the underlying sampler has a set_epoch method, like 76 | # DistributedSampler, used for making each process see 77 | # a different split of the dataset, then set it 78 | if hasattr(self.batch_sampler.sampler, "set_epoch"): 79 | self.batch_sampler.sampler.set_epoch(iteration) 80 | for batch in self.batch_sampler: 81 | iteration += 1 82 | if iteration > self.num_iterations: 83 | break 84 | yield batch 85 | 86 | def __len__(self): 87 | return self.num_iterations 88 | 89 | 90 | def make_batch_data_sampler( 91 | sampler, images_per_batch, num_iters=None, 92 | start_iter=0 93 | ): 94 | batch_sampler = torch.utils.data.sampler.BatchSampler( 95 | sampler, images_per_batch, drop_last=False 96 | ) 97 | if num_iters is not None and num_iters >= 0: 98 | batch_sampler = IterationBasedBatchSampler( 99 | batch_sampler, num_iters, start_iter 100 | ) 101 | return batch_sampler 102 | 103 | 104 | def make_data_loader(args, is_distributed=False, arguments=None): 105 | num_gpus = get_world_size() 106 | # figure out start iteration 107 | if arguments is None: 108 | start_iter = 0 109 | else: 110 | start_iter = arguments['iteration'] 111 | # figure out the batchsize 112 | grad_accumulate_steps = 1 113 | if hasattr(args, 'gradient_accumulation_steps'): 114 | grad_accumulate_steps = args.gradient_accumulation_steps 115 | assert ( 116 | args.train_batch_size % grad_accumulate_steps == 0 117 | ), "train_batch_size ({}) must be divisible by the number " 118 | "of Gradient accumulation ({}) used."\ 119 | .format(args.train_batch_size, grad_accumulate_steps) 120 | images_per_batch = args.train_batch_size//grad_accumulate_steps 121 | assert ( 122 | images_per_batch % num_gpus == 0 123 | ), "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number " 124 | "of GPUs ({}) used.".format(images_per_batch, num_gpus) 
125 | images_per_gpu = images_per_batch // num_gpus 126 | logger = logging.getLogger(__name__) 127 | logger.info("Train with {} images per GPU".format(images_per_gpu)) 128 | shuffle = True 129 | num_iters = args.max_iters * grad_accumulate_steps 130 | 131 | # build dataset 132 | datasets = build_dataset(args) 133 | 134 | data_loaders = [] 135 | for i, dataset in enumerate(datasets): 136 | sampler = make_data_sampler(dataset, shuffle, is_distributed) 137 | 138 | batch_sampler = make_batch_data_sampler( 139 | sampler, images_per_gpu, num_iters, start_iter 140 | ) 141 | num_workers = args.num_workers 142 | data_loader = torch.utils.data.DataLoader( 143 | dataset, 144 | num_workers=num_workers, 145 | batch_sampler=batch_sampler, 146 | collate_fn=BatchCollator(), 147 | pin_memory=True, 148 | ) 149 | data_loaders.append(data_loader) 150 | return data_loaders 151 | -------------------------------------------------------------------------------- /oscar/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.1.0" 2 | -------------------------------------------------------------------------------- /oscar/run_oscarplus_pretrain.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | import argparse 4 | import datetime 5 | import json 6 | import logging 7 | import os 8 | import random 9 | import sys 10 | import time 11 | import math 12 | import shutil 13 | 14 | sys.path.insert(0, '.') 15 | 16 | import numpy as np 17 | import torch 18 | 19 | from oscar.modeling.modeling_bert import BertImgForPreTraining 20 | from transformers.pytorch_transformers import (WEIGHTS_NAME, BertConfig, 21 | BertTokenizer) 22 | 23 | from oscar.datasets.build import make_data_loader 24 | 25 | from transformers.pytorch_transformers import AdamW, WarmupLinearSchedule 26 | from oscar.utils.misc import mkdir, get_rank 27 | from oscar.utils.metric_logger import TensorboardLogger 28 | 29 | logger = logging.getLogger(__name__) 30 | 31 | ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig,)), ()) 32 | 33 | MODEL_CLASSES = { 34 | 'bert': (BertConfig, BertImgForPreTraining, BertTokenizer), 35 | } 36 | 37 | 38 | """ ****** Pretraining ****** """ 39 | 40 | 41 | def main(): 42 | parser = argparse.ArgumentParser() 43 | 44 | ## Required parameters 45 | parser.add_argument("--data_dir", default=None, type=str, required=False, 46 | help="The input data dir. 
" 47 | "Should contain the .yaml files for the task.") 48 | parser.add_argument("--dataset_file", default=None, type=str, required=True, 49 | help="The training dataset yaml file.") 50 | parser.add_argument("--extra_dataset_file", default=None, type=str, required=False, 51 | help="The extra training dataset yaml file.") 52 | parser.add_argument("--bert_model", default=None, type=str, required=True, 53 | help="Bert pre-trained model selected in the list: bert-base-uncased, " 54 | "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.") 55 | parser.add_argument("--output_dir", default=None, type=str, required=True, 56 | help="The output directory where the model checkpoints will be written.") 57 | 58 | # image chunks 59 | parser.add_argument("--chunk_start_id", default=-1, type=int, 60 | help="Image Chunk Start ID") 61 | parser.add_argument("--chunk_end_id", default=-1, type=int, 62 | help="Image Chunk End ID") 63 | 64 | ## Image parameters 65 | parser.add_argument("--max_img_seq_length", default=50, type=int, 66 | help="The maximum total input image sequence length.") 67 | parser.add_argument("--img_feature_dim", default=2054, type=int, 68 | help="The Image Feature Dimension.") 69 | parser.add_argument("--img_feature_type", default='faster_r-cnn', type=str, 70 | help="faster_r-cnn or mask_r-cnn") 71 | parser.add_argument("--use_layernorm", action='store_true', 72 | help="use_layernorm") 73 | 74 | parser.add_argument("--drop_out", default=0.1, type=float, 75 | help="Drop out for BERT.") 76 | 77 | parser.add_argument("--use_b", type=int, default=1, help="use_b") 78 | parser.add_argument("--textb_sample_mode", type=int, default=0, 79 | help="0: sample from both texta&textb, " 80 | "1: sample from textb, " 81 | "2: sample from QA answers") 82 | parser.add_argument("--extra_textb_sample_mode", type=int, default=1) 83 | parser.add_argument("--texta_false_prob", type=float, default=0.0, 84 | help="the probality that we sample wrong texta, should in [0.0, 0.5]") 85 | 86 | parser.add_argument("--model_name_or_path", default=None, type=str, 87 | required=True, 88 | help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join( 89 | ALL_MODELS)) 90 | parser.add_argument("--config_name", default="", type=str, 91 | help="Pretrained config name or path if not the same as model_name") 92 | parser.add_argument("--tokenizer_name", default="", type=str, 93 | help="Pretrained tokenizer name or path if not the same as model_name") 94 | parser.add_argument("--cache_dir", default="", type=str, 95 | help="Where do you want to store the pre-trained models downloaded from s3") 96 | 97 | parser.add_argument("--max_seq_length", default=35, type=int, 98 | help="The maximum total input sequence length after WordPiece tokenization. 
\n" 99 | "Sequences longer than this will be truncated, and sequences shorter than this will be padded.") 100 | parser.add_argument("--do_train", action='store_true', 101 | help="Whether to run training.") 102 | parser.add_argument("--learning_rate", default=5e-5, type=float, 103 | help="The initial learning rate for Adam.") 104 | parser.add_argument("--max_iters", default=2000000, type=int, 105 | help="Maximal number of training iterations.") 106 | parser.add_argument("--train_batch_size", default=1024, type=int, 107 | help="Batch size for training.") 108 | parser.add_argument("--num_workers", default=6, type=int, 109 | help="Number of workers for dataset.") 110 | parser.add_argument("--adam_epsilon", default=1e-8, type=float, 111 | help="Epsilon for Adam optimizer.") 112 | parser.add_argument("--optim", default='adamw', type=str, 113 | help="The optimizer used for Bert, [adamw, lamb], default: adamw") 114 | parser.add_argument("--max_grad_norm", default=-1.0, type=float, help="Max gradient norm.") 115 | parser.add_argument("--warmup_steps", default=0, type=int, 116 | help="Linear warmup over warmup_steps.") 117 | parser.add_argument("--no_cuda", action='store_true', 118 | help="Whether not to use CUDA when available") 119 | parser.add_argument("--on_memory", action='store_true', 120 | help="Whether to load train samples into memory or use disk") 121 | parser.add_argument("--do_lower_case", action='store_true', 122 | help="Whether to lower case the input text. True for uncased models, False for cased models.") 123 | parser.add_argument("--local_rank", type=int, default=-1, 124 | help="local_rank for distributed training on gpus") 125 | parser.add_argument('--seed', type=int, default=42, 126 | help="random seed for initialization") 127 | parser.add_argument('--gradient_accumulation_steps', type=int, default=1, 128 | help="Number of updates steps to accumualte before performing a backward/update pass.") 129 | 130 | parser.add_argument("--from_scratch", action='store_true', 131 | help="train from scratch") 132 | parser.add_argument("--use_img_layernorm", type=int, default=0, 133 | help="Normalize image features with bertlayernorm") 134 | parser.add_argument("--img_layer_norm_eps", default=1e-12, type=float, 135 | help="The eps in image feature laynorm layer") 136 | # distributed 137 | parser.add_argument('--gpu_ids', type=str, default='-1') 138 | parser.add_argument("--mask_loss_for_unmatched", type=int, default=1, 139 | help="masked language model loss for unmatched triplets") 140 | parser.add_argument("--extra_loss_weight", type=float, default=0.0, 141 | help="the loss weight for the extra train data batch (should be in [0,1])") 142 | parser.add_argument( 143 | "--use_gtlabels", 144 | type=int, default=1, 145 | help="use groundtruth labels for text b or not" 146 | ) 147 | # logging 148 | parser.add_argument('--ckpt_period', type=int, default=10000, 149 | help="Period for saving checkpoint") 150 | parser.add_argument('--log_period', type=int, default=100, 151 | help="Period for saving logging info") 152 | args = parser.parse_args() 153 | 154 | if args.gpu_ids != '-1': 155 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_ids 156 | 157 | args.num_gpus = int( 158 | os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 159 | args.distributed = args.num_gpus > 1 160 | 161 | if args.gpu_ids != '-1': 162 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_ids 163 | 164 | if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train: 165 | logger.info("Output Directory 
Exists.") 166 | 167 | # Setup CUDA, GPU & distributed training 168 | if args.local_rank == -1 or args.no_cuda: 169 | device = torch.device( 170 | "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") 171 | args.n_gpu = torch.cuda.device_count() 172 | else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs 173 | torch.cuda.set_device(args.local_rank) 174 | device = torch.device("cuda", args.local_rank) 175 | torch.distributed.init_process_group( 176 | backend='nccl', init_method="env://" 177 | ) 178 | args.n_gpu = 1 179 | args.device = device 180 | 181 | # Setup logging 182 | logging.basicConfig( 183 | format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', 184 | datefmt='%m/%d/%Y %H:%M:%S', 185 | level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) 186 | logger.warning( 187 | "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s", 188 | args.local_rank, device, args.n_gpu, bool(args.local_rank != -1) 189 | ) 190 | 191 | if args.gradient_accumulation_steps < 1: 192 | raise ValueError( 193 | "Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( 194 | args.gradient_accumulation_steps)) 195 | 196 | random.seed(args.seed) 197 | np.random.seed(args.seed) 198 | torch.manual_seed(args.seed) 199 | if args.n_gpu > 0: 200 | torch.cuda.manual_seed_all(args.seed) 201 | 202 | if not args.do_train: 203 | raise ValueError( 204 | "Training is currently the only implemented execution option. Please set `do_train`.") 205 | 206 | if not os.path.exists(args.output_dir): 207 | mkdir(args.output_dir) 208 | 209 | last_checkpoint_dir = None 210 | arguments = {"iteration": 0} 211 | if os.path.exists(args.output_dir): 212 | save_file = os.path.join(args.output_dir, "last_checkpoint") 213 | try: 214 | with open(save_file, "r") as f: 215 | last_saved = f.read() 216 | last_saved = last_saved.strip() 217 | except IOError: 218 | # if file doesn't exist, maybe because it has just been 219 | # deleted by a separate process 220 | last_saved = "" 221 | if last_saved: 222 | folder_name = os.path.splitext(last_saved.split('/')[0])[0] # in the form of checkpoint-00001 or checkpoint-00001/pytorch_model.bin 223 | last_checkpoint_dir = os.path.join(args.output_dir, folder_name) 224 | arguments["iteration"] = int(folder_name.split('-')[-1]) 225 | assert os.path.isfile(os.path.join(last_checkpoint_dir, WEIGHTS_NAME)), "Last_checkpoint detected, but file not found!" 
226 | 227 | # model first 228 | if get_rank() != 0: 229 | torch.distributed.barrier() 230 | config_class, model_class, tokenizer_class = MODEL_CLASSES[args.bert_model] 231 | if last_checkpoint_dir is not None: # recovery 232 | args.model_name_or_path = last_checkpoint_dir 233 | logger.info(" -> Recovering model from {}".format(last_checkpoint_dir)) 234 | 235 | config = config_class.from_pretrained( 236 | args.config_name if args.config_name else args.model_name_or_path, 237 | ) 238 | config.img_layer_norm_eps = args.img_layer_norm_eps 239 | config.use_img_layernorm = args.use_img_layernorm 240 | 241 | # discrete code 242 | config.img_feature_dim = args.img_feature_dim 243 | config.img_feature_type = args.img_feature_type 244 | config.hidden_dropout_prob = args.drop_out 245 | if args.texta_false_prob < 0.5 and (args.texta_false_prob > 0 or not args.use_b): 246 | args.num_contrast_classes = 3 247 | else: 248 | args.num_contrast_classes = 2 249 | config.num_contrast_classes = args.num_contrast_classes 250 | 251 | # Prepare model 252 | # model = BertForPreTraining.from_pretrained(args.bert_model) 253 | load_num = 0 254 | while load_num < 10: 255 | try: 256 | model = BertImgForPreTraining.from_pretrained( 257 | args.model_name_or_path, 258 | from_tf=bool('.ckpt' in args.model_name_or_path), 259 | config=config) 260 | break 261 | except: 262 | load_num += 1 263 | 264 | # train from scratch 265 | if args.from_scratch: 266 | if last_checkpoint_dir is None: 267 | logger.info("Training from scratch ... ") 268 | model.apply(model.init_weights) 269 | total_params = sum(p.numel() for p in model.parameters()) 270 | logger.info( 271 | 'Total Parameters: {}'.format(total_params)) 272 | 273 | for key, val in vars(config).items(): 274 | setattr(args, key, val) 275 | 276 | if get_rank() == 0 and args.local_rank != -1: 277 | torch.distributed.barrier() 278 | 279 | model.to(args.device) 280 | 281 | logger.info("Training/evaluation parameters %s", args) 282 | 283 | tb_log_dir = os.path.join(args.output_dir, 'train_logs') 284 | meters = TensorboardLogger( 285 | log_dir=tb_log_dir, 286 | delimiter=" ", 287 | ) 288 | 289 | # Prepare optimizer 290 | param_optimizer = list(model.named_parameters()) 291 | no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] 292 | optimizer_grouped_parameters = [ 293 | {'params': [p for n, p in param_optimizer if 294 | not any(nd in n for nd in no_decay)], 295 | 'weight_decay': 0.01}, 296 | {'params': [p for n, p in param_optimizer if 297 | any(nd in n for nd in no_decay)], 'weight_decay': 0.0} 298 | ] 299 | 300 | optimizer = AdamW(optimizer_grouped_parameters, 301 | lr=args.learning_rate, eps=args.adam_epsilon) 302 | scheduler = WarmupLinearSchedule(optimizer, 303 | warmup_steps=args.warmup_steps, 304 | t_total=args.max_iters) 305 | 306 | if arguments['iteration'] > 0 and os.path.isfile(os.path.join(last_checkpoint_dir, 'optimizer.pth')): # recovery 307 | logger.info( 308 | "Load BERT optimizer from {}".format(last_checkpoint_dir)) 309 | optimizer_to_load = torch.load( 310 | os.path.join(last_checkpoint_dir, 'optimizer.pth'), 311 | map_location=torch.device("cpu")) 312 | optimizer.load_state_dict(optimizer_to_load.pop("optimizer")) 313 | scheduler.load_state_dict(optimizer_to_load.pop("scheduler")) 314 | 315 | if args.distributed: 316 | model = torch.nn.parallel.DistributedDataParallel( 317 | model, device_ids=[args.local_rank], output_device=args.local_rank, 318 | find_unused_parameters=True) 319 | elif args.n_gpu > 1: 320 | model = torch.nn.DataParallel(model) 321 | 322 | # 
train_examples = None 323 | train_dataloaders = make_data_loader( 324 | args, is_distributed=args.distributed, arguments=arguments 325 | ) 326 | 327 | if isinstance(train_dataloaders, list): 328 | train_dataloader = train_dataloaders[0] 329 | else: 330 | train_dataloader = train_dataloaders 331 | train_dataloader_extra = [None] * len(train_dataloader) 332 | if isinstance(train_dataloaders, list) and len(train_dataloaders) > 1: 333 | logger.info("Having two train dataloaders!") 334 | train_dataloader_extra = train_dataloaders[1] 335 | tokenizer = train_dataloader.dataset.tokenizer 336 | 337 | # torch.backends.cudnn.benchmark = True 338 | 339 | max_iter = len(train_dataloader) 340 | start_iter = arguments["iteration"] 341 | logger.info("***** Running training *****") 342 | logger.info(" Num examples = {}".format(len(train_dataloader.dataset))) 343 | logger.info(" Instantaneous batch size = %d", 344 | args.train_batch_size // args.gradient_accumulation_steps) 345 | logger.info( 346 | " Total train batch size (w. parallel, distributed & accumulation) = %d", 347 | args.train_batch_size) 348 | logger.info(" Gradient Accumulation steps = %d", 349 | args.gradient_accumulation_steps) 350 | logger.info(" Total optimization steps = %d", 351 | max_iter // args.gradient_accumulation_steps) 352 | 353 | log_json = {} 354 | 355 | model.train() 356 | model.zero_grad() 357 | 358 | clock_started = False 359 | # Every args.ckpt_period, report train_score and save model 360 | tr_loss = 0 361 | nb_tr_examples, nb_tr_steps = 0, 0 362 | for step, (batch, batch_extra) in enumerate(zip(train_dataloader, train_dataloader_extra), start_iter): 363 | if not clock_started: 364 | start_training_time = time.time() 365 | end = time.time() 366 | clock_started = True 367 | 368 | def data_process(mini_batch): 369 | images, targets, qa_inds = \ 370 | mini_batch[0], mini_batch[1], mini_batch[2] 371 | targets_transposed = list(zip(*targets)) 372 | input_ids = torch.stack(targets_transposed[0]).to(args.device, non_blocking=True) 373 | input_mask = torch.stack(targets_transposed[1]).to(args.device, non_blocking=True) 374 | segment_ids = torch.stack(targets_transposed[2]).to(args.device, non_blocking=True) 375 | lm_label_ids = torch.stack(targets_transposed[3]).to(args.device, non_blocking=True) 376 | is_next = torch.stack(targets_transposed[4]).to(args.device, non_blocking=True) 377 | is_img_match = torch.stack(targets_transposed[5]).to(args.device, non_blocking=True) 378 | 379 | return images, input_ids, input_mask, segment_ids, lm_label_ids, is_next 380 | 381 | images1, input_ids1, input_mask1, segment_ids1, lm_label_ids1, is_next1 \ 382 | = data_process(batch) 383 | if batch_extra is not None: 384 | images2, input_ids2, input_mask2, segment_ids2, lm_label_ids2, is_next2 \ 385 | = data_process(batch_extra) 386 | 387 | data_time = time.time() - end 388 | 389 | def forward_backward(images, input_ids, input_mask, segment_ids, 390 | lm_label_ids, is_next, loss_weight=1.0): 391 | # feature as input 392 | image_features = torch.stack(images).to(args.device, non_blocking=True) 393 | 394 | outputs = model(input_ids, segment_ids, input_mask, 395 | lm_label_ids, is_next, img_feats=image_features) 396 | 397 | loss = loss_weight * outputs[0] 398 | 399 | if args.n_gpu > 1: 400 | loss = loss.mean() # mean() to average on multi-gpu. 
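# Scale the loss so that gradients accumulated over several micro-batches
# match a single pass over the full batch before the optimizer step.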
401 | 402 | if args.gradient_accumulation_steps > 1: 403 | loss = loss / args.gradient_accumulation_steps 404 | loss.backward() 405 | 406 | return loss.item(), input_ids.size(0) 407 | 408 | start1 = time.time() 409 | loss1, nb_tr_example1 = forward_backward( 410 | images1, input_ids1, input_mask1, 411 | segment_ids1, lm_label_ids1, is_next1, 412 | loss_weight=1.0-args.extra_loss_weight 413 | ) 414 | tr_loss += loss1 415 | nb_tr_examples += nb_tr_example1 416 | compute_time1 = time.time() - start1 417 | 418 | loss2, nb_tr_example2 = 0.0, 0 419 | compute_time2 = 0.0 420 | if batch_extra is not None: 421 | start2 = time.time() 422 | loss2, nb_tr_example2 = forward_backward( 423 | images2, input_ids2, input_mask2, 424 | segment_ids2, lm_label_ids2, is_next2, 425 | loss_weight=args.extra_loss_weight 426 | ) 427 | tr_loss += loss2 428 | nb_tr_examples += nb_tr_example2 429 | compute_time2 = time.time() - start2 430 | 431 | nb_tr_steps += 1 432 | arguments["iteration"] = step + 1 433 | 434 | if (step + 1) % args.gradient_accumulation_steps == 0: 435 | # do gradient clipping 436 | if args.max_grad_norm > 0: 437 | torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) 438 | # do the optimization steps 439 | optimizer.step() 440 | scheduler.step() # Update learning rate schedule 441 | optimizer.zero_grad() 442 | 443 | # measure elapsed time 444 | batch_time = time.time() - end 445 | end = time.time() 446 | metrics_to_log = { 447 | 'time_info': {'compute': batch_time, 'data': data_time, 448 | 'compute1': compute_time1, 449 | 'compute2': compute_time2}, 450 | 'batch_metrics': {'loss': loss1+loss2} 451 | } 452 | params_to_log = {'params': {'bert_lr': optimizer.param_groups[0]["lr"]}} 453 | meters.update_metrics(metrics_to_log) 454 | meters.update_params(params_to_log) 455 | 456 | if args.log_period > 0 and (step + 1) % args.log_period == 0: 457 | avg_time = meters.meters['time_info']['compute'].global_avg 458 | eta_seconds = avg_time * (max_iter - step - 1) 459 | eta_string = str( 460 | datetime.timedelta(seconds=int(eta_seconds))) 461 | logger.info( 462 | meters.delimiter.join( 463 | [ 464 | "eta: {eta}", 465 | "iter: {iter}", 466 | "max mem: {memory:.0f}", 467 | ] 468 | ).format( 469 | eta=eta_string, 470 | iter=step + 1, 471 | memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0, 472 | ) + "\n " + meters.get_logs(step + 1) 473 | ) 474 | 475 | if (step + 1) == max_iter or (step + 1) % args.ckpt_period == 0: # Save a trained model 476 | log_json[step+1] = tr_loss 477 | train_metrics_total = torch.Tensor([tr_loss, nb_tr_examples, nb_tr_steps]).to(args.device) 478 | torch.distributed.all_reduce(train_metrics_total) 479 | # reset metrics 480 | tr_loss = 0 481 | nb_tr_examples, nb_tr_steps = 0, 0 482 | 483 | if get_rank() == 0: 484 | # report metrics 485 | train_score_gathered = train_metrics_total[0] / \ 486 | train_metrics_total[2] 487 | logger.info("PROGRESS: {}%".format( 488 | round(100 * (step + 1) / max_iter, 4))) 489 | logger.info( 490 | "EVALERR: {}%".format(train_score_gathered)) 491 | meters.update_metrics( 492 | { 493 | 'epoch_metrics': {'ex_cnt': train_metrics_total[1], 494 | 'loss': train_score_gathered} 495 | } 496 | ) 497 | with open(os.path.join(args.output_dir, 'loss_logs.json'), 498 | 'w') as fp: 499 | json.dump(log_json, fp) 500 | 501 | # save checkpoint 502 | output_dir = os.path.join(args.output_dir, 503 | 'checkpoint-{:07d}'.format( 504 | step + 1)) 505 | if not os.path.exists(output_dir): 506 | os.makedirs(output_dir) 507 | model_to_save = model.module if 
hasattr( 508 | model, 509 | 'module') else model # Take care of distributed/parallel training 510 | optimizer_to_save = { 511 | "optimizer": optimizer.state_dict(), 512 | "scheduler": scheduler.state_dict()} 513 | 514 | save_num = 0 515 | while save_num < 10: 516 | try: 517 | model_to_save.save_pretrained(output_dir) 518 | torch.save(args, os.path.join(output_dir, 519 | 'training_args.bin')) 520 | tokenizer.save_pretrained(output_dir) 521 | torch.save(optimizer_to_save, 522 | os.path.join(output_dir, 523 | 'optimizer.pth')) 524 | save_file = os.path.join(args.output_dir, "last_checkpoint") 525 | with open(save_file, "w") as f: 526 | f.write('checkpoint-{:07d}/pytorch_model.bin'.format(step + 1)) 527 | break 528 | except: 529 | save_num += 1 530 | logger.info( 531 | "Saving model checkpoint {0} to {1}".format( 532 | step + 1, output_dir)) 533 | 534 | if clock_started: 535 | total_training_time = time.time() - start_training_time 536 | else: 537 | total_training_time = 0.0 538 | total_time_str = str(datetime.timedelta(seconds=total_training_time)) 539 | logger.info( 540 | "Total training time: {} ({:.4f} s / it)".format( 541 | total_time_str, total_training_time / max_iter 542 | ) 543 | ) 544 | # close the tb logger 545 | meters.close() 546 | 547 | 548 | if __name__ == "__main__": 549 | main() 550 | -------------------------------------------------------------------------------- /oscar/run_retrieval.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Microsoft Corporation. Licensed under the MIT license. 2 | 3 | from __future__ import absolute_import, division, print_function 4 | import argparse 5 | import os 6 | import base64 7 | import os.path as op 8 | import random, json 9 | import numpy as np 10 | import torch 11 | import torch.nn as nn 12 | from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler 13 | from tqdm import tqdm 14 | 15 | from oscar.utils.tsv_file import TSVFile 16 | from oscar.utils.logger import setup_logger 17 | from oscar.utils.misc import mkdir, set_seed 18 | from oscar.modeling.modeling_bert import ImageBertForSequenceClassification 19 | from transformers.pytorch_transformers import BertTokenizer, BertConfig 20 | from transformers.pytorch_transformers import AdamW, WarmupLinearSchedule, WarmupConstantSchedule 21 | 22 | 23 | class RetrievalDataset(Dataset): 24 | """ Image/Text Retrieval Dataset""" 25 | def __init__(self, tokenizer, args, split='train', is_train=True): 26 | """ 27 | tokenizer: tokenizer to process caption text. 28 | args: configureation parameters including max_seq_length, etc. 29 | split: used to infer the data used for training or testing. 
30 | All files are in .pt format of a dictionary with image keys and 31 | image features (pytorch tensors), captions (list of str, support multiple 32 | captions per image), labels (list of dictionary or str of all labels), 33 | 34 | """ 35 | super(RetrievalDataset, self).__init__() 36 | self.img_file = args.img_feat_file 37 | caption_file = op.join(args.data_dir, '{}_captions.pt'.format(split)) 38 | self.img_tsv = TSVFile(self.img_file) 39 | self.captions = torch.load(caption_file) 40 | self.img_keys = list(self.captions.keys()) # img_id as int 41 | if not type(self.captions[self.img_keys[0]]) == list: 42 | self.captions = {k: json.loads(self.captions[k]) for k in self.img_keys} 43 | 44 | # get the image image_id to index map 45 | imgid2idx_file = op.join(op.dirname(self.img_file), 'imageid2idx.json') 46 | self.image_id2idx = json.load(open(imgid2idx_file)) # img_id as string 47 | 48 | if args.add_od_labels: 49 | label_data_dir = op.dirname(self.img_file) 50 | label_file = os.path.join(label_data_dir, "predictions.tsv") 51 | self.label_tsv = TSVFile(label_file) 52 | self.labels = {} 53 | for line_no in range(self.label_tsv.num_rows()): 54 | row = self.label_tsv.seek(line_no) 55 | image_id = row[0] 56 | if int(image_id) in self.img_keys: 57 | results = json.loads(row[1]) 58 | objects = results['objects'] if type( 59 | results) == dict else results 60 | self.labels[int(image_id)] = { 61 | "image_h": results["image_h"] if type( 62 | results) == dict else 600, 63 | "image_w": results["image_w"] if type( 64 | results) == dict else 800, 65 | "class": [cur_d['class'] for cur_d in objects], 66 | "boxes": np.array([cur_d['rect'] for cur_d in objects], 67 | dtype=np.float32) 68 | } 69 | self.label_tsv._fp.close() 70 | self.label_tsv._fp = None 71 | 72 | if is_train: 73 | self.num_captions_per_img = args.num_captions_per_img_train 74 | else: 75 | self.num_captions_per_img = args.num_captions_per_img_val 76 | if args.eval_img_keys_file: 77 | # select a subset of image keys for evaluation. eg. COCO 1k and 5k 78 | # eval_img_keys_file is a list of image keys saved in tsv file 79 | with open(op.join(args.data_dir, args.eval_img_keys_file), 'r') as f: 80 | img_keys = f.readlines() 81 | self.img_keys = [int(k.strip()) for k in img_keys] 82 | self.captions = {k: self.captions[k] for k in self.img_keys} 83 | if args.add_od_labels: 84 | self.labels = {k: self.labels[k] for k in self.img_keys} 85 | 86 | if args.eval_caption_index_file: 87 | # hard negative image/caption indexs for retrieval re-rank setting. 88 | # useful for mini val set to monitor the performance during training. 89 | # However, it cannot be used together with cross image evaluation. 
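# caption_indexs maps each image key to a list of [img_key, cap_idx] candidate
# pairs that will be scored and re-ranked for that image.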
90 | self.has_caption_indexs = True 91 | assert not args.cross_image_eval 92 | caption_index_file = op.join(args.data_dir, args.eval_caption_index_file) 93 | self.caption_indexs = torch.load(caption_index_file) 94 | if not type(self.caption_indexs[self.img_keys[0]]) == list: 95 | self.caption_indexs = {k: json.loads(self.caption_indexs[k]) for k in self.img_keys} 96 | else: 97 | self.has_caption_indexs = False 98 | self.is_train = is_train 99 | self.output_mode = args.output_mode 100 | self.tokenizer = tokenizer 101 | self.max_seq_len = args.max_seq_length 102 | self.max_img_seq_len = args.max_img_seq_length 103 | self.args = args 104 | 105 | def get_image_caption_index(self, index): 106 | # return img_idx to access features and [img_key, cap_idx] to access caption 107 | if not self.is_train and self.args.cross_image_eval: 108 | img_idx = index // (self.num_captions_per_img * len(self.img_keys)) 109 | cap_idx = index % (self.num_captions_per_img * len(self.img_keys)) 110 | img_idx1 = cap_idx // self.num_captions_per_img 111 | cap_idx1 = cap_idx % self.num_captions_per_img 112 | return img_idx, [self.img_keys[img_idx1], cap_idx1] 113 | if not self.is_train and self.has_caption_indexs: 114 | img_idx = index // self.num_captions_per_img 115 | cap_idx = index % self.num_captions_per_img 116 | img_key1, cap_idx1 = self.caption_indexs[self.img_keys[img_idx]][cap_idx] 117 | return img_idx, [img_key1, cap_idx1] 118 | img_idx = index // self.num_captions_per_img 119 | cap_idx = index % self.num_captions_per_img 120 | return img_idx, [self.img_keys[img_idx], cap_idx] 121 | 122 | def get_label(self, index): 123 | img_idx, cap_idx = self.get_image_caption_index(index) 124 | return 1 if self.img_keys[img_idx] == cap_idx[0] else 0 125 | 126 | def get_od_labels(self, img_key): 127 | if self.args.add_od_labels: 128 | if type(self.labels[img_key]) == str: 129 | od_labels = self.labels[img_key] 130 | else: 131 | od_labels = ' '.join(self.labels[img_key]['class']) 132 | return od_labels 133 | 134 | def tensorize_example(self, text_a, img_feat, text_b=None, 135 | cls_token_segment_id=0, pad_token_segment_id=0, 136 | sequence_a_segment_id=0, sequence_b_segment_id=1): 137 | tokens_a = self.tokenizer.tokenize(text_a) 138 | if len(tokens_a) > self.args.max_seq_length - 2: 139 | tokens_a = tokens_a[:(self.args.max_seq_length - 2)] 140 | 141 | tokens = [self.tokenizer.cls_token] + tokens_a + [self.tokenizer.sep_token] 142 | segment_ids = [cls_token_segment_id] + [sequence_a_segment_id] * (len(tokens_a) + 1) 143 | seq_a_len = len(tokens) 144 | if text_b: 145 | tokens_b = self.tokenizer.tokenize(text_b) 146 | if len(tokens_b) > self.max_seq_len - len(tokens) - 1: 147 | tokens_b = tokens_b[: (self.max_seq_len - len(tokens) - 1)] 148 | tokens += tokens_b + [self.tokenizer.sep_token] 149 | segment_ids += [sequence_b_segment_id] * (len(tokens_b) + 1) 150 | 151 | seq_len = len(tokens) 152 | seq_padding_len = self.max_seq_len - seq_len 153 | tokens += [self.tokenizer.pad_token] * seq_padding_len 154 | segment_ids += [pad_token_segment_id] * seq_padding_len 155 | input_ids = self.tokenizer.convert_tokens_to_ids(tokens) 156 | 157 | # image features 158 | img_len = img_feat.shape[0] 159 | if img_len > self.max_img_seq_len: 160 | img_feat = img_feat[0 : self.max_img_seq_len, :] 161 | img_len = img_feat.shape[0] 162 | img_padding_len = 0 163 | else: 164 | img_padding_len = self.max_img_seq_len - img_len 165 | padding_matrix = torch.zeros((img_padding_len, img_feat.shape[1])) 166 | img_feat = torch.cat((img_feat, 
padding_matrix), 0) 167 | 168 | # generate attention_mask 169 | att_mask_type = self.args.att_mask_type 170 | if att_mask_type == "CLR": 171 | attention_mask = [1] * seq_len + [0] * seq_padding_len + \ 172 | [1] * img_len + [0] * img_padding_len 173 | else: 174 | # use 2D mask to represent the attention 175 | max_len = self.max_seq_len + self.max_img_seq_len 176 | attention_mask = torch.zeros((max_len, max_len), dtype=torch.long) 177 | # full attention of C-C, L-L, R-R 178 | c_start, c_end = 0, seq_a_len 179 | l_start, l_end = seq_a_len, seq_len 180 | r_start, r_end = self.max_seq_len, self.max_seq_len + img_len 181 | attention_mask[c_start : c_end, c_start : c_end] = 1 182 | attention_mask[l_start : l_end, l_start : l_end] = 1 183 | attention_mask[r_start : r_end, r_start : r_end] = 1 184 | if att_mask_type == 'CL': 185 | attention_mask[c_start : c_end, l_start : l_end] = 1 186 | attention_mask[l_start : l_end, c_start : c_end] = 1 187 | elif att_mask_type == 'CR': 188 | attention_mask[c_start : c_end, r_start : r_end] = 1 189 | attention_mask[r_start : r_end, c_start : c_end] = 1 190 | elif att_mask_type == 'LR': 191 | attention_mask[l_start : l_end, r_start : r_end] = 1 192 | attention_mask[r_start : r_end, l_start : l_end] = 1 193 | else: 194 | raise ValueError("Unsupported attention mask type {}".format(att_mask_type)) 195 | 196 | input_ids = torch.tensor(input_ids, dtype=torch.long) 197 | attention_mask = torch.tensor(attention_mask, dtype=torch.long) 198 | segment_ids = torch.tensor(segment_ids, dtype=torch.long) 199 | return (input_ids, attention_mask, segment_ids, img_feat) 200 | 201 | def __getitem__(self, index): 202 | if self.is_train: 203 | img_idx, cap_idxs = self.get_image_caption_index(index) 204 | img_key = self.img_keys[img_idx] 205 | feature = self.get_image(img_key) 206 | caption = self.captions[cap_idxs[0]][cap_idxs[1]] 207 | od_labels = self.get_od_labels(img_key) 208 | example = self.tensorize_example(caption, feature, text_b=od_labels) 209 | 210 | # select a negative pair 211 | neg_img_indexs = list(range(0, img_idx)) + list(range(img_idx + 1, len(self.img_keys))) 212 | img_idx_neg = random.choice(neg_img_indexs) 213 | if random.random() <= 0.5: 214 | # randomly select a negative caption from a different image. 
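# The positive image features and tags are reused, so only the caption is mismatched.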
215 | cap_idx_neg = random.randint(0, self.num_captions_per_img - 1) 216 | caption_neg = self.captions[self.img_keys[img_idx_neg]][cap_idx_neg] 217 | example_neg = self.tensorize_example(caption_neg, feature, text_b=od_labels) 218 | else: 219 | # randomly select a negative image 220 | feature_neg = self.get_image(self.img_keys[img_idx_neg]) 221 | od_labels_neg = self.get_od_labels(self.img_keys[img_idx_neg]) 222 | example_neg = self.tensorize_example(caption, feature_neg, text_b=od_labels_neg) 223 | 224 | example_pair = tuple(list(example) + [1] + list(example_neg) + [0]) 225 | return index, example_pair 226 | else: 227 | img_idx, cap_idxs = self.get_image_caption_index(index) 228 | img_key = self.img_keys[img_idx] 229 | feature = self.get_image(img_key) 230 | caption = self.captions[cap_idxs[0]][cap_idxs[1]] 231 | od_labels = self.get_od_labels(img_key) 232 | example = self.tensorize_example(caption, feature, text_b=od_labels) 233 | label = 1 if img_key == cap_idxs[0] else 0 234 | return index, tuple(list(example) + [label]) 235 | 236 | def get_image(self, image_id): 237 | image_idx = self.image_id2idx[str(image_id)] 238 | row = self.img_tsv.seek(image_idx) 239 | num_boxes = int(row[1]) 240 | features = np.frombuffer(base64.b64decode(row[-1]), 241 | dtype=np.float32).reshape((num_boxes, -1)) 242 | t_features = torch.from_numpy(features) 243 | return t_features 244 | 245 | def __len__(self): 246 | if not self.is_train and self.args.cross_image_eval: 247 | return len(self.img_keys) ** 2 * self.num_captions_per_img 248 | return len(self.img_keys) * self.num_captions_per_img 249 | 250 | 251 | def compute_score_with_logits(logits, labels): 252 | if logits.shape[1] > 1: 253 | logits = torch.max(logits, 1)[1].data # argmax 254 | scores = logits == labels 255 | else: 256 | scores = torch.zeros_like(labels).cuda() 257 | for i, (logit, label) in enumerate(zip(logits, labels)): 258 | logit_ = torch.sigmoid(logit) 259 | if (logit_ >= 0.5 and label == 1) or (logit_ < 0.5 and label == 0): 260 | scores[i] = 1 261 | return scores 262 | 263 | 264 | def compute_ranks(dataset, results): 265 | labels = np.array([dataset.get_label(i) for i in range(len(dataset))]) 266 | similarities = np.array([results[i] for i in range(len(dataset))]) 267 | if dataset.has_caption_indexs: 268 | num_captions_per_img = dataset.num_captions_per_img 269 | else: 270 | num_captions_per_img = len(dataset.img_keys) * dataset.num_captions_per_img 271 | labels = np.reshape(labels, [-1, num_captions_per_img]) 272 | similarities = np.reshape(similarities, [-1, num_captions_per_img]) 273 | i2t_ranks, t2i_ranks = [], [] 274 | for lab, sim in zip(labels, similarities): 275 | inds = np.argsort(sim)[::-1] 276 | rank = num_captions_per_img 277 | for r, ind in enumerate(inds): 278 | if lab[ind] == 1: 279 | rank = r 280 | break 281 | i2t_ranks.append(rank) 282 | if not dataset.has_caption_indexs: 283 | labels = np.swapaxes(labels, 0, 1) 284 | similarities = np.swapaxes(similarities, 0, 1) 285 | for lab, sim in zip(labels, similarities): 286 | inds = np.argsort(sim)[::-1] 287 | rank = num_captions_per_img 288 | for r, ind in enumerate(inds): 289 | if lab[ind] == 1: 290 | rank = r 291 | break 292 | t2i_ranks.append(rank) 293 | return i2t_ranks, t2i_ranks 294 | 295 | 296 | def save_checkpoint(model, tokenizer, args, epoch, global_step): 297 | checkpoint_dir = op.join(args.output_dir, 'checkpoint-{}-{}'.format( 298 | epoch, global_step)) 299 | mkdir(checkpoint_dir) 300 | model_to_save = model.module if hasattr(model, 'module') else model 301 | 
save_num = 0 302 | while (save_num < 10): 303 | try: 304 | model_to_save.save_pretrained(checkpoint_dir) 305 | torch.save(args, op.join(checkpoint_dir, 'training_args.bin')) 306 | tokenizer.save_pretrained(checkpoint_dir) 307 | logger.info("Save checkpoint to {}".format(checkpoint_dir)) 308 | break 309 | except: 310 | save_num += 1 311 | if save_num == 10: 312 | logger.info("Failed to save checkpoint after 10 trails.") 313 | return 314 | 315 | 316 | def train(args, train_dataset, val_dataset, model, tokenizer): 317 | args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) 318 | train_sampler = RandomSampler(train_dataset) 319 | train_dataloader = DataLoader(train_dataset, sampler=train_sampler, 320 | batch_size=args.train_batch_size, num_workers=args.num_workers) 321 | 322 | if args.max_steps > 0: 323 | t_total = args.max_steps 324 | args.num_train_epochs = args.max_steps // (len(train_dataloader) // \ 325 | args.gradient_accumulation_steps) + 1 326 | else: 327 | t_total = len(train_dataloader) // args.gradient_accumulation_steps \ 328 | * args.num_train_epochs 329 | 330 | # Prepare optimizer and scheduler 331 | no_decay = ['bias', 'LayerNorm.weight'] 332 | grouped_parameters = [ 333 | {'params': [p for n, p in model.named_parameters() if not \ 334 | any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, 335 | {'params': [p for n, p in model.named_parameters() if \ 336 | any(nd in n for nd in no_decay)], 'weight_decay': 0.0} 337 | ] 338 | optimizer = AdamW(grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) 339 | if args.scheduler == "constant": 340 | scheduler = WarmupConstantSchedule( 341 | optimizer, warmup_steps=args.warmup_steps) 342 | elif args.scheduler == "linear": 343 | scheduler = WarmupLinearSchedule( 344 | optimizer, warmup_steps=args.warmup_steps, t_total=t_total) 345 | else: 346 | raise ValueError("Unknown scheduler type: {}".format(args.scheduler)) 347 | 348 | if args.n_gpu > 1: 349 | model = torch.nn.DataParallel(model) 350 | 351 | logger.info("***** Running training *****") 352 | logger.info(" Num examples = %d", len(train_dataset)) 353 | logger.info(" Num Epochs = %d", args.num_train_epochs) 354 | logger.info(" Batch size per GPU = %d", args.per_gpu_train_batch_size) 355 | logger.info(" Total train batch size (w. 
parallel, & accumulation) = %d", 356 | args.train_batch_size * args.gradient_accumulation_steps) 357 | logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) 358 | logger.info(" Total optimization steps = %d", t_total) 359 | 360 | global_step, global_loss, global_acc =0, 0.0, 0.0 361 | model.zero_grad() 362 | log_json = [] 363 | best_score = 0 364 | for epoch in range(int(args.num_train_epochs)): 365 | for step, (_, batch) in enumerate(train_dataloader): 366 | model.train() 367 | batch = tuple(t.to(args.device) for t in batch) 368 | inputs = { 369 | 'input_ids': torch.cat((batch[0], batch[5]), dim=0), 370 | 'attention_mask': torch.cat((batch[1], batch[6]), dim=0), 371 | 'token_type_ids': torch.cat((batch[2], batch[7]), dim=0), 372 | 'img_feats': torch.cat((batch[3], batch[8]), dim=0), 373 | 'labels': torch.cat((batch[4], batch[9]), dim=0) 374 | } 375 | outputs = model(**inputs) 376 | loss, logits = outputs[:2] 377 | if args.n_gpu > 1: 378 | loss = loss.mean() # mean() to average on multi-gpu parallel training 379 | if args.gradient_accumulation_steps > 1: 380 | loss = loss / args.gradient_accumulation_steps 381 | loss.backward() 382 | torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) 383 | batch_score = compute_score_with_logits(logits, inputs['labels']).sum() 384 | batch_acc = batch_score.item() / (args.train_batch_size * 2) 385 | global_loss += loss.item() 386 | global_acc += batch_acc 387 | if (step + 1) % args.gradient_accumulation_steps == 0: 388 | global_step += 1 389 | scheduler.step() 390 | optimizer.step() 391 | model.zero_grad() 392 | if global_step % args.logging_steps == 0: 393 | logger.info("Epoch: {}, global_step: {}, lr: {:.6f}, loss: {:.4f} ({:.4f}), " \ 394 | "score: {:.4f} ({:.4f})".format(epoch, global_step, 395 | optimizer.param_groups[0]["lr"], loss, global_loss / global_step, 396 | batch_acc, global_acc / global_step) 397 | ) 398 | 399 | if (args.save_steps > 0 and global_step % args.save_steps == 0) or \ 400 | global_step == t_total: 401 | save_checkpoint(model, tokenizer, args, epoch, global_step) 402 | # evaluation 403 | if args.evaluate_during_training: 404 | logger.info("Perform evaluation at step: %d" % (global_step)) 405 | test_result = test(args, model, val_dataset) 406 | eval_result = evaluate(val_dataset, test_result) 407 | rank_accs = eval_result['i2t_retrieval'] 408 | if rank_accs['R@1'] > best_score: 409 | best_score = rank_accs['R@1'] 410 | epoch_log = {'epoch': epoch, 'global_step': global_step, 411 | 'R1': rank_accs['R@1'], 'R5': rank_accs['R@5'], 412 | 'R10': rank_accs['R@10'], 'best_R1':best_score} 413 | log_json.append(epoch_log) 414 | with open(args.output_dir + '/eval_logs.json', 'w') as fp: 415 | json.dump(log_json, fp) 416 | return global_step, global_loss / global_step 417 | 418 | 419 | def test(args, model, eval_dataset): 420 | args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) 421 | eval_sampler = SequentialSampler(eval_dataset) 422 | eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, 423 | batch_size=args.eval_batch_size, num_workers=args.num_workers) 424 | 425 | logger.info("Num examples = {}".format(len(eval_dataset))) 426 | logger.info("Evaluation batch size = {}".format(args.eval_batch_size)) 427 | model.eval() 428 | results = {} 429 | softmax = nn.Softmax(dim=1) 430 | for indexs, batch in tqdm(eval_dataloader): 431 | batch = tuple(t.to(args.device) for t in batch) 432 | with torch.no_grad(): 433 | inputs = { 434 | 'input_ids': batch[0], 435 | 
'attention_mask': batch[1], 436 | 'token_type_ids': batch[2], 437 | 'img_feats': batch[3], 438 | 'labels': batch[4] 439 | } 440 | _, logits = model(**inputs)[:2] 441 | if args.num_labels == 2: 442 | probs = softmax(logits) 443 | result = probs[:, 1] # the confidence to be a matched pair 444 | else: 445 | result = logits 446 | result = [_.to(torch.device("cpu")) for _ in result] 447 | results.update({idx.item(): res.item() for idx, res in zip(indexs, result)}) 448 | return results 449 | 450 | 451 | def evaluate(eval_dataset, test_results): 452 | i2t_ranks, t2i_ranks = compute_ranks(eval_dataset, test_results) 453 | rank = [1, 5, 10] 454 | i2t_accs = [sum([_ < r for _ in i2t_ranks]) / len(i2t_ranks) for r in rank] 455 | logger.info("I2T Retrieval: {:.4f} @ R1, {:.4f} @ R5, {:.4f} @ R10".format( 456 | i2t_accs[0], i2t_accs[1], i2t_accs[2])) 457 | eval_result = {"i2t_retrieval": {"R@1": i2t_accs[0], "R@5": i2t_accs[1], "R@10": i2t_accs[2]}} 458 | if t2i_ranks: 459 | t2i_accs = [sum([_ < r for _ in t2i_ranks]) / len(t2i_ranks) for r in rank] 460 | logger.info("T2I Retrieval: {:.4f} @ R1, {:.4f} @ R5, {:.4f} @ R10".format( 461 | t2i_accs[0], t2i_accs[1], t2i_accs[2])) 462 | eval_result["t2i_retrieval"] = {"R@1": t2i_accs[0], "R@5": t2i_accs[1], "R@10": t2i_accs[2]} 463 | return eval_result 464 | 465 | 466 | def get_predict_file(args): 467 | cc = [] 468 | data = op.basename(op.join(args.data_dir, '')[:-1]) 469 | if data != 'coco_ir': 470 | cc.append(data) 471 | cc.append(args.test_split) 472 | if args.add_od_labels: 473 | cc.append('wlabels{}'.format(args.od_label_type)) 474 | return op.join(args.eval_model_dir, '{}.results.pt'.format('.'.join(cc))) 475 | 476 | 477 | def restore_training_settings(args): 478 | assert not args.do_train and (args.do_test or args.do_eval) 479 | train_args = torch.load(op.join(args.eval_model_dir, 'training_args.bin')) 480 | override_params = ['do_lower_case', 'img_feature_type', 'max_seq_length', 481 | 'max_img_seq_length', 'add_od_labels', 'od_label_type', 482 | 'use_img_layernorm', 'img_layer_norm_eps'] 483 | for param in override_params: 484 | if hasattr(train_args, param): 485 | train_v = getattr(train_args, param) 486 | test_v = getattr(args, param) 487 | if train_v != test_v: 488 | logger.warning('Override {} with train args: {} -> {}'.format(param, 489 | test_v, train_v)) 490 | setattr(args, param, train_v) 491 | return args 492 | 493 | 494 | def main(): 495 | parser = argparse.ArgumentParser() 496 | parser.add_argument("--data_dir", default='datasets/coco_ir', type=str, required=False, 497 | help="The input data dir with all required files.") 498 | parser.add_argument("--img_feat_file", default='datasets/coco_ir/features.tsv', type=str, required=False, 499 | help="The absolute address of the image feature file.") 500 | parser.add_argument("--model_name_or_path", default=None, type=str, required=False, 501 | help="Path to pre-trained model or model type. 
required for training.") 502 | parser.add_argument("--output_dir", default='output/', type=str, required=False, 503 | help="The output directory to save checkpoint and test results.") 504 | parser.add_argument("--loss_type", default='sfmx', type=str, 505 | help="Loss function types: support kl, sfmx") 506 | parser.add_argument("--config_name", default="", type=str, 507 | help="Pretrained config name or path if not the same as model_name.") 508 | parser.add_argument("--tokenizer_name", default="", type=str, 509 | help="Pretrained tokenizer name or path if not the same as model_name.") 510 | parser.add_argument("--max_seq_length", default=70, type=int, 511 | help="The maximum total input sequence length after tokenization. " 512 | "Sequences longer than this will be truncated, " 513 | "sequences shorter will be padded." 514 | "This number is calculated on COCO dataset" 515 | "If add object detection labels, the suggested length should be 70.") 516 | parser.add_argument("--do_train", action='store_true', help="Whether to run training.") 517 | parser.add_argument("--do_test", action='store_true', help="Whether to run inference.") 518 | parser.add_argument("--do_eval", action='store_true', help="Whether to run performance valuation." 519 | "do not activate if we want to inference on dataset without gt labels.") 520 | parser.add_argument("--test_split", default='test', type=str, help='data split name.') 521 | parser.add_argument("--eval_img_keys_file", default='', type=str, 522 | help="image key tsv to select a subset of images for evaluation. " 523 | "This is useful in 5-folds evaluation. The topn index file is not " 524 | "needed in this case.") 525 | parser.add_argument("--eval_caption_index_file", default='', type=str, 526 | help="index of a list of (img_key, cap_idx) for each image." 527 | "this is used to perform re-rank using hard negative samples." 528 | "useful for validation set to monitor the performance during training.") 529 | parser.add_argument("--cross_image_eval", action='store_true', 530 | help="perform cross image inference, ie. each image with all texts from other images.") 531 | parser.add_argument("--add_od_labels", default=False, action='store_true', 532 | help="Whether to add object detection labels or not.") 533 | parser.add_argument("--od_label_type", default='vg', type=str, 534 | help="label type, support vg, gt, oid") 535 | parser.add_argument("--att_mask_type", default='CLR', type=str, 536 | help="attention mask type, support ['CL', 'CR', 'LR', 'CLR']" 537 | "C: caption, L: labels, R: image regions; CLR is full attention by default." 538 | "CL means attention between caption and labels." 
539 | "please pay attention to the order CLR, which is the default concat order.") 540 | parser.add_argument("--do_lower_case", action='store_true', 541 | help="Set this flag if you are using an uncased model.") 542 | parser.add_argument("--drop_out", default=0.1, type=float, help="Drop out in BERT.") 543 | parser.add_argument("--max_img_seq_length", default=50, type=int, 544 | help="The maximum total input image sequence length.") 545 | parser.add_argument("--img_feature_dim", default=2054, type=int, 546 | help="The Image Feature Dimension.") 547 | parser.add_argument("--img_feature_type", default='frcnn', type=str, 548 | help="Image feature type.") 549 | parser.add_argument("--use_img_layernorm", type=int, default=1, 550 | help="Normalize image features with bertlayernorm") 551 | parser.add_argument("--img_layer_norm_eps", default=1e-12, type=float, 552 | help="The eps in image feature laynorm layer") 553 | parser.add_argument("--per_gpu_train_batch_size", default=32, type=int, 554 | help="Batch size per GPU/CPU for training.") 555 | parser.add_argument("--per_gpu_eval_batch_size", default=64, type=int, 556 | help="Batch size per GPU/CPU for evaluation.") 557 | parser.add_argument("--output_mode", default='classification', type=str, 558 | help="output mode, support classification or regression.") 559 | parser.add_argument("--num_labels", default=2, type=int, 560 | help="num_labels is 2 for classification and 1 for regression.") 561 | parser.add_argument("--num_captions_per_img_train", default=5, type=int, 562 | help="number of positive matched captions for each training image.") 563 | parser.add_argument("--num_captions_per_img_val", default=5, type=int, 564 | help="number of captions for each testing image.") 565 | parser.add_argument('--gradient_accumulation_steps', type=int, default=1, 566 | help="Number of updates steps to accumulate before backward.") 567 | parser.add_argument("--learning_rate", default=2e-5, type=float, help="The initial lr.") 568 | parser.add_argument("--weight_decay", default=0.05, type=float, help="Weight deay.") 569 | parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam.") 570 | parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") 571 | parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup.") 572 | parser.add_argument("--scheduler", default='linear', type=str, help="constant or linear.") 573 | parser.add_argument("--num_workers", default=4, type=int, help="Workers in dataloader.") 574 | parser.add_argument("--num_train_epochs", default=20, type=int, 575 | help="Total number of training epochs to perform.") 576 | parser.add_argument("--max_steps", default=-1, type=int, 577 | help="Total number of training steps. Override num_train_epochs.") 578 | parser.add_argument('--logging_steps', type=int, default=20, help="Log every X steps.") 579 | parser.add_argument('--save_steps', type=int, default=-1, 580 | help="Save checkpoint every X steps. 
Will also perform evaluation.") 581 | parser.add_argument("--evaluate_during_training", action='store_true', 582 | help="Run evaluation during training at each save_steps.") 583 | parser.add_argument("--eval_model_dir", type=str, default='', 584 | help="Model directory for evaluation.") 585 | parser.add_argument("--no_cuda", action='store_true', help="Avoid using CUDA.") 586 | parser.add_argument('--seed', type=int, default=88, help="random seed for initialization.") 587 | args = parser.parse_args() 588 | 589 | global logger 590 | mkdir(args.output_dir) 591 | logger = setup_logger("vlpretrain", args.output_dir, 0) 592 | 593 | args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") 594 | args.n_gpu = torch.cuda.device_count() 595 | set_seed(args.seed, args.n_gpu) 596 | logger.warning("Device: %s, n_gpu: %s", args.device, args.n_gpu) 597 | logger.info('output_mode: {}, #Labels: {}'.format(args.output_mode, args.num_labels)) 598 | 599 | config_class, tokenizer_class = BertConfig, BertTokenizer 600 | model_class = ImageBertForSequenceClassification 601 | if args.do_train: 602 | config = config_class.from_pretrained(args.config_name if args.config_name else \ 603 | args.model_name_or_path, num_labels=args.num_labels, finetuning_task='ir') 604 | tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name \ 605 | else args.model_name_or_path, do_lower_case=args.do_lower_case) 606 | config.img_feature_dim = args.img_feature_dim 607 | config.img_feature_type = args.img_feature_type 608 | config.hidden_dropout_prob = args.drop_out 609 | config.loss_type = args.loss_type 610 | config.img_layer_norm_eps = args.img_layer_norm_eps 611 | config.use_img_layernorm = args.use_img_layernorm 612 | model = model_class.from_pretrained(args.model_name_or_path, 613 | from_tf=bool('.ckpt' in args.model_name_or_path), config=config) 614 | else: 615 | checkpoint = args.eval_model_dir 616 | assert op.isdir(checkpoint) 617 | config = config_class.from_pretrained(checkpoint) 618 | tokenizer = tokenizer_class.from_pretrained(checkpoint) 619 | logger.info("Evaluate the following checkpoint: %s", checkpoint) 620 | model = model_class.from_pretrained(checkpoint, config=config) 621 | 622 | model.to(args.device) 623 | logger.info("Training/evaluation parameters %s", args) 624 | if args.do_train: 625 | train_dataset = RetrievalDataset(tokenizer, args, 'train', is_train=True) 626 | if args.evaluate_during_training: 627 | val_dataset = RetrievalDataset(tokenizer, args, 'minival', is_train=False) 628 | else: 629 | val_dataset = None 630 | global_step, avg_loss = train(args, train_dataset, val_dataset, model, tokenizer) 631 | logger.info("Training done: total_step = %s, avg loss = %s", global_step, avg_loss) 632 | 633 | # inference and evaluation 634 | if args.do_test or args.do_eval: 635 | args = restore_training_settings(args) 636 | test_dataset = RetrievalDataset(tokenizer, args, args.test_split, is_train=False) 637 | checkpoint = args.eval_model_dir 638 | assert op.isdir(checkpoint) 639 | logger.info("Evaluate the following checkpoint: %s", checkpoint) 640 | model = model_class.from_pretrained(checkpoint, config=config) 641 | model.to(args.device) 642 | if args.n_gpu > 1: 643 | model = torch.nn.DataParallel(model) 644 | 645 | pred_file = get_predict_file(args) 646 | if op.isfile(pred_file): 647 | logger.info("Prediction file exists, skipping inference.") 648 | if args.do_eval: 649 | test_result = torch.load(pred_file) 650 | else: 651 | test_result = test(args,
model, test_dataset) 652 | torch.save(test_result, pred_file) 653 | logger.info("Prediction results saved to {}.".format(pred_file)) 654 | 655 | if args.do_eval: 656 | eval_result = evaluate(test_dataset, test_result) 657 | result_file = op.splitext(pred_file)[0] + '.eval.json' 658 | with open(result_file, 'w') as f: 659 | json.dump(eval_result, f) 660 | logger.info("Evaluation results saved to {}.".format(result_file)) 661 | 662 | 663 | if __name__ == "__main__": 664 | main() 665 | -------------------------------------------------------------------------------- /oscar/utils/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.1.0" 2 | -------------------------------------------------------------------------------- /oscar/utils/caption_evaluate.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Microsoft Corporation. Licensed under the MIT license. 2 | 3 | from collections import OrderedDict, defaultdict 4 | import json 5 | import numpy as np 6 | import os.path as op 7 | from pprint import pprint 8 | import torch 9 | import re 10 | import subprocess 11 | import tempfile 12 | import time 13 | from typing import Dict, Optional 14 | 15 | from coco_caption.pycocotools.coco import COCO 16 | from coco_caption.pycocoevalcap.eval import COCOEvalCap 17 | from .cider.pyciderevalcap.ciderD.ciderD import CiderD 18 | 19 | 20 | def evaluate_on_nocaps(split, predict_file, data_dir='data/nocaps/', evaluate_file=None): 21 | ''' 22 | NOTE: Put the auth file in folder ~/.evalai/ 23 | ''' 24 | if not evaluate_file: 25 | evaluate_file = op.splitext(predict_file)[0] + '.eval.json' 26 | if op.isfile(evaluate_file): 27 | print('{} already exists'.format(evaluate_file)) 28 | with open(evaluate_file, 'r') as fp: 29 | metrics = json.load(fp) 30 | return metrics 31 | 32 | image_info_file = op.join(data_dir, 33 | 'nocaps_{}_image_info.json'.format(split)) 34 | image_info = json.load(open(image_info_file)) 35 | open_image_id2id = {} 36 | for it in image_info['images']: 37 | open_image_id2id[it['open_images_id']] = it['id'] 38 | predictions = [] 39 | cap_id = 0 40 | with open(predict_file, 'r') as fp: 41 | for line in fp: 42 | p = line.strip().split('\t') 43 | predictions.append( 44 | {'image_id': open_image_id2id[p[0]], 45 | 'caption': json.loads(p[1])[0]['caption'], 46 | 'id': cap_id}) 47 | cap_id += 1 48 | if split == 'test': 49 | print('Are you sure to submit test split result at: {}'.format(predict_file)) 50 | import ipdb;ipdb.set_trace() 51 | nocapseval = NocapsEvaluator(phase=split) 52 | metrics = nocapseval.evaluate(predictions) 53 | pprint(metrics) 54 | with open(evaluate_file, 'w') as fp: 55 | json.dump(metrics, fp) 56 | return metrics 57 | 58 | 59 | def evaluate_on_coco_caption(res_file, label_file, outfile=None): 60 | """ 61 | res_tsv: TSV file, each row is [image_key, json format list of captions]. 62 | Each caption is a dict, with fields "caption", "conf". 63 | label_file: JSON file of ground truth captions in COCO format. 
64 | """ 65 | assert label_file.endswith('.json') 66 | if res_file.endswith('.tsv'): 67 | res_file_coco = op.splitext(res_file)[0] + '_coco_format.json' 68 | convert_tsv_to_coco_format(res_file, res_file_coco) 69 | else: 70 | raise ValueError('unknown prediction result file format: {}'.format(res_file)) 71 | 72 | coco = COCO(label_file) 73 | cocoRes = coco.loadRes(res_file_coco) 74 | cocoEval = COCOEvalCap(coco, cocoRes, 'corpus') 75 | 76 | # evaluate on a subset of images by setting 77 | # cocoEval.params['image_id'] = cocoRes.getImgIds() 78 | # please remove this line when evaluating the full validation set 79 | cocoEval.params['image_id'] = cocoRes.getImgIds() 80 | 81 | # evaluate results 82 | # SPICE will take a few minutes the first time, but speeds up due to caching 83 | cocoEval.evaluate() 84 | result = cocoEval.eval 85 | if not outfile: 86 | print(result) 87 | else: 88 | with open(outfile, 'w') as fp: 89 | json.dump(result, fp, indent=4) 90 | return result 91 | 92 | 93 | def convert_tsv_to_coco_format(res_tsv, outfile, 94 | sep='\t', key_col=0, cap_col=1): 95 | results = [] 96 | with open(res_tsv) as fp: 97 | for line in fp: 98 | parts = line.strip().split(sep) 99 | key = parts[key_col] 100 | if cap_col < len(parts): 101 | caps = json.loads(parts[cap_col]) 102 | assert len(caps) == 1, 'cannot evaluate multiple captions per image' 103 | cap = caps[0].get('caption', '') 104 | else: 105 | # empty caption generated 106 | cap = "" 107 | results.append( 108 | {'image_id': key, 109 | 'caption': cap} 110 | ) 111 | with open(outfile, 'w') as fp: 112 | json.dump(results, fp) 113 | 114 | 115 | class ScstRewardCriterion(torch.nn.Module): 116 | CIDER_REWARD_WEIGHT = 1 117 | 118 | def __init__(self, cider_cached_tokens='corpus', baseline_type='greedy'): 119 | self.CiderD_scorer = CiderD(df=cider_cached_tokens) 120 | assert baseline_type in ['greedy', 'sample'] 121 | self.baseline_type = baseline_type 122 | self._cur_score = None 123 | super().__init__() 124 | 125 | def forward(self, gt_res, greedy_res, sample_res, sample_logprobs): 126 | batch_size = len(gt_res) 127 | sample_res_size = len(sample_res) 128 | seq_per_img = sample_res_size // batch_size 129 | 130 | gen_res = [] 131 | gen_res.extend(sample_res) 132 | gt_idx = [i // seq_per_img for i in range(sample_res_size)] 133 | if self.baseline_type == 'greedy': 134 | assert len(greedy_res) == batch_size 135 | gen_res.extend(greedy_res) 136 | gt_idx.extend([i for i in range(batch_size)]) 137 | 138 | scores = self._calculate_eval_scores(gen_res, gt_idx, gt_res) 139 | 140 | if self.baseline_type == 'greedy': 141 | baseline = scores[-batch_size:][:, np.newaxis] 142 | else: 143 | sc_ = scores.reshape(batch_size, seq_per_img) 144 | baseline = (sc_.sum(1, keepdims=True) - sc_) / (sc_.shape[1] - 1) 145 | 146 | # sample - baseline 147 | reward = scores[:sample_res_size].reshape(batch_size, seq_per_img) 148 | self._cur_score = reward.mean() 149 | reward = reward - baseline 150 | reward = reward.reshape(sample_res_size) 151 | 152 | reward = torch.as_tensor(reward, device=sample_logprobs.device, dtype=torch.float) 153 | loss = - sample_logprobs * reward 154 | loss = loss.mean() 155 | return loss 156 | 157 | def get_score(self): 158 | return self._cur_score 159 | 160 | def _calculate_eval_scores(self, gen_res, gt_idx, gt_res): 161 | ''' 162 | gen_res: generated captions, list of str 163 | gt_idx: list of int, of the same length as gen_res 164 | gt_res: ground truth captions, list of list of str. 
165 | gen_res[i] corresponds to gt_res[gt_idx[i]] 166 | Each image can have multiple ground truth captions 167 | ''' 168 | gen_res_size = len(gen_res) 169 | 170 | res = OrderedDict() 171 | for i in range(gen_res_size): 172 | res[i] = [self._wrap_sentence(gen_res[i])] 173 | 174 | gts = OrderedDict() 175 | gt_res_ = [ 176 | [self._wrap_sentence(gt_res[i][j]) for j in range(len(gt_res[i]))] 177 | for i in range(len(gt_res)) 178 | ] 179 | for i in range(gen_res_size): 180 | gts[i] = gt_res_[gt_idx[i]] 181 | 182 | res_ = [{'image_id':i, 'caption': res[i]} for i in range(len(res))] 183 | _, batch_cider_scores = self.CiderD_scorer.compute_score(gts, res_) 184 | scores = self.CIDER_REWARD_WEIGHT * batch_cider_scores 185 | return scores 186 | 187 | @classmethod 188 | def _wrap_sentence(self, s): 189 | # ensure the sentence ends with the <eos> token 190 | # in order to keep consistent with cider_cached_tokens 191 | r = s.strip() 192 | if r.endswith('.'): 193 | r = r[:-1] 194 | r += ' <eos>' 195 | return r 196 | 197 | 198 | class NocapsEvaluator(object): 199 | r""" 200 | Code from https://github.com/nocaps-org/updown-baseline/blob/master/updown/utils/evalai.py 201 | 202 | A utility class to submit model predictions on nocaps splits to EvalAI, and retrieve model 203 | performance based on captioning metrics (such as CIDEr, SPICE). 204 | 205 | Extended Summary 206 | ---------------- 207 | This class and the training script together serve as a working example for "EvalAI in the 208 | loop", showing how evaluation can be done remotely on privately held splits. Annotations 209 | (captions) and evaluation-specific tools (e.g. the ``coco-caption`` toolkit) 210 | are not required locally. This enables users to select the best checkpoint, perform early 211 | stopping, schedule the learning rate based on a metric, etc. without running the evaluation locally. 212 | 213 | Parameters 214 | ---------- 215 | phase: str, optional (default = "val") 216 | Which phase to evaluate on. One of "val" or "test". 217 | 218 | Notes 219 | ----- 220 | This class can be used for retrieving metrics on both the val and test splits. However, we 221 | recommend avoiding the test split (at least during training). The number of allowed 222 | submissions to the test split on EvalAI is small and can be exhausted in a few iterations, whereas 223 | the number of submissions to the val split is practically unlimited. 224 | """ 225 | 226 | def __init__(self, phase: str = "val"): 227 | 228 | # Constants specific to EvalAI. 229 | self._challenge_id = 355 230 | self._phase_id = 742 if phase == "val" else 743 231 | 232 | def evaluate( 233 | self, predictions, iteration: Optional[int] = None 234 | ) -> Dict[str, Dict[str, float]]: 235 | r""" 236 | Take the model predictions (in COCO format), submit them to EvalAI, and retrieve model 237 | performance based on captioning metrics. 238 | 239 | Parameters 240 | ---------- 241 | predictions: List[Prediction] 242 | Model predictions in COCO format. They are a list of dicts with keys 243 | ``{"image_id": int, "caption": str}``. 244 | iteration: int, optional (default = None) 245 | Training iteration where the checkpoint was evaluated. 246 | 247 | Returns 248 | ------- 249 | Dict[str, Dict[str, float]] 250 | Model performance based on all captioning metrics.
Nested dict structure:: 251 | 252 | { 253 | "B1": {"in-domain", "near-domain", "out-domain", "entire"}, # BLEU-1 254 | "B2": {"in-domain", "near-domain", "out-domain", "entire"}, # BLEU-2 255 | "B3": {"in-domain", "near-domain", "out-domain", "entire"}, # BLEU-3 256 | "B4": {"in-domain", "near-domain", "out-domain", "entire"}, # BLEU-4 257 | "METEOR": {"in-domain", "near-domain", "out-domain", "entire"}, 258 | "ROUGE-L": {"in-domain", "near-domain", "out-domain", "entire"}, 259 | "CIDEr": {"in-domain", "near-domain", "out-domain", "entire"}, 260 | "SPICE": {"in-domain", "near-domain", "out-domain", "entire"}, 261 | } 262 | 263 | """ 264 | # Save predictions as a json file first. 265 | _, predictions_filename = tempfile.mkstemp(suffix=".json", text=True) 266 | with open(predictions_filename, "w") as f: 267 | json.dump(predictions, f) 268 | 269 | submission_command = ( 270 | f"evalai challenge {self._challenge_id} phase {self._phase_id} " 271 | f"submit --file {predictions_filename}" 272 | ) 273 | 274 | submission_command_subprocess = subprocess.Popen( 275 | submission_command.split(), 276 | stdout=subprocess.PIPE, 277 | stdin=subprocess.PIPE, 278 | stderr=subprocess.STDOUT, 279 | ) 280 | 281 | # This terminal output will have submission ID we need to check. 282 | submission_command_stdout = submission_command_subprocess.communicate(input=b"N\n")[ 283 | 0 284 | ].decode("utf-8") 285 | 286 | submission_id_regex = re.search("evalai submission ([0-9]+)", submission_command_stdout) 287 | try: 288 | # Get an integer submission ID (as a string). 289 | submission_id = submission_id_regex.group(0).split()[-1] # type: ignore 290 | except: 291 | # Very unlikely, but submission may fail because of some glitch. Retry for that. 292 | return self.evaluate(predictions) 293 | 294 | if iteration is not None: 295 | print(f"Submitted predictions for iteration {iteration}, submission id: {submission_id}.") 296 | else: 297 | print(f"Submitted predictions, submission_id: {submission_id}") 298 | 299 | # Placeholder stdout for a pending submission. 300 | result_stdout: str = "The Submission is yet to be evaluated." 301 | num_tries: int = 0 302 | 303 | # Query every 10 seconds for result until it appears. 304 | while "CIDEr" not in result_stdout: 305 | 306 | time.sleep(10) 307 | result_stdout = subprocess.check_output( 308 | ["evalai", "submission", submission_id, "result"] 309 | ).decode("utf-8") 310 | num_tries += 1 311 | 312 | # Raise error if it takes more than 5 minutes. 313 | if num_tries == 30: 314 | raise ConnectionError("Unable to get results from EvalAI within 5 minutes!") 315 | 316 | # Convert result to json. 317 | metrics = json.loads(result_stdout, encoding="utf-8") 318 | 319 | # keys: {"in-domain", "near-domain", "out-domain", "entire"} 320 | # In each of these, keys: {"B1", "B2", "B3", "B4", "METEOR", "ROUGE-L", "CIDEr", "SPICE"} 321 | metrics = { 322 | "in-domain": metrics[0]["in-domain"], 323 | "near-domain": metrics[1]["near-domain"], 324 | "out-domain": metrics[2]["out-domain"], 325 | "entire": metrics[3]["entire"], 326 | } 327 | 328 | # Restructure the metrics dict for better tensorboard logging. 
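        # e.g. metrics["in-domain"]["CIDEr"] becomes flipped_metrics["CIDEr"]["in-domain"].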
329 | # keys: {"B1", "B2", "B3", "B4", "METEOR", "ROUGE-L", "CIDEr", "SPICE"} 330 | # In each of these, keys: keys: {"in-domain", "near-domain", "out-domain", "entire"} 331 | flipped_metrics: Dict[str, Dict[str, float]] = defaultdict(dict) 332 | for key, val in metrics.items(): 333 | for subkey, subval in val.items(): 334 | flipped_metrics[subkey][key] = subval 335 | 336 | return flipped_metrics 337 | 338 | -------------------------------------------------------------------------------- /oscar/utils/cider/pyciderevalcap/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /oscar/utils/cider/pyciderevalcap/cider/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /oscar/utils/cider/pyciderevalcap/cider/cider.py: -------------------------------------------------------------------------------- 1 | # Filename: cider.py 2 | # 3 | # 4 | # Description: Describes the class to compute the CIDEr 5 | # (Consensus-Based Image Description Evaluation) Metric 6 | # by Vedantam, Zitnick, and Parikh (http://arxiv.org/abs/1411.5726) 7 | # 8 | # Creation Date: Sun Feb 8 14:16:54 2015 9 | # 10 | # Authors: Ramakrishna Vedantam and 11 | # Tsung-Yi Lin 12 | from __future__ import absolute_import 13 | from __future__ import division 14 | from __future__ import print_function 15 | 16 | from .cider_scorer import CiderScorer 17 | 18 | 19 | class Cider: 20 | """ 21 | Main Class to compute the CIDEr metric 22 | 23 | """ 24 | def __init__(self, n=4, df="corpus"): 25 | """ 26 | Initialize the CIDEr scoring function 27 | : param n (int): n-gram size 28 | : param df (string): specifies where to get the IDF values from 29 | takes values 'corpus', 'coco-train' 30 | : return: None 31 | """ 32 | # set cider to sum over 1 to 4-grams 33 | self._n = n 34 | self._df = df 35 | self.cider_scorer = CiderScorer(n=self._n, df_mode=self._df) 36 | 37 | def compute_score(self, gts, res): 38 | """ 39 | Main function to compute CIDEr score 40 | : param gts (dict) : {image:tokenized reference sentence} 41 | : param res (dict) : {image:tokenized candidate sentence} 42 | : return: cider (float) : computed CIDEr score for the corpus 43 | """ 44 | 45 | # clear all the previous hypos and refs 46 | self.cider_scorer.clear() 47 | 48 | for res_id in res: 49 | 50 | hypo = res_id['caption'] 51 | ref = gts[res_id['image_id']] 52 | 53 | # Sanity check. 
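            # A minimal sketch of the inputs this loop expects (illustrative ids and
            # captions, not taken from this repo):
            #   res = [{'image_id': 0, 'caption': ['a dog runs in the park']}]
            #   gts = {0: ['a dog is running in a park', 'a brown dog in a park']}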
54 | assert(type(hypo) is list) 55 | assert(len(hypo) == 1) 56 | assert(type(ref) is list) 57 | assert(len(ref) > 0) 58 | self.cider_scorer += (hypo[0], ref) 59 | 60 | (score, scores) = self.cider_scorer.compute_score() 61 | 62 | return score, scores 63 | 64 | def method(self): 65 | return "CIDEr" 66 | -------------------------------------------------------------------------------- /oscar/utils/cider/pyciderevalcap/cider/cider_scorer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Tsung-Yi Lin 3 | # Ramakrishna Vedantam 4 | from __future__ import absolute_import 5 | from __future__ import division 6 | from __future__ import print_function 7 | 8 | import copy 9 | import six 10 | from six.moves import cPickle 11 | from collections import defaultdict 12 | import numpy as np 13 | import math 14 | import os 15 | 16 | def precook(s, n=4, out=False): 17 | """ 18 | Takes a string as input and returns an object that can be given to 19 | either cook_refs or cook_test. This is optional: cook_refs and cook_test 20 | can take string arguments as well. 21 | :param s: string : sentence to be converted into ngrams 22 | :param n: int : number of ngrams for which representation is calculated 23 | :return: term frequency vector for occuring ngrams 24 | """ 25 | words = s.split() 26 | counts = defaultdict(int) 27 | for k in range(1,n+1): 28 | for i in range(len(words)-k+1): 29 | ngram = tuple(words[i:i+k]) 30 | counts[ngram] += 1 31 | return counts 32 | 33 | def cook_refs(refs, n=4): ## lhuang: oracle will call with "average" 34 | '''Takes a list of reference sentences for a single segment 35 | and returns an object that encapsulates everything that BLEU 36 | needs to know about them. 37 | :param refs: list of string : reference sentences for some image 38 | :param n: int : number of ngrams for which (ngram) representation is calculated 39 | :return: result (list of dict) 40 | ''' 41 | return [precook(ref, n) for ref in refs] 42 | 43 | def cook_test(test, n=4): 44 | '''Takes a test sentence and returns an object that 45 | encapsulates everything that BLEU needs to know about it. 46 | :param test: list of string : hypothesis sentence for some image 47 | :param n: int : number of ngrams for which (ngram) representation is calculated 48 | :return: result (dict) 49 | ''' 50 | return precook(test, n, True) 51 | 52 | class CiderScorer(object): 53 | """CIDEr scorer. 
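    Accumulates (hypothesis, references) pairs via ``+=`` and computes the corpus-level
    CIDEr score with ``compute_score()``. A minimal usage sketch (illustrative captions,
    not taken from this repo):

        scorer = CiderScorer(df_mode="corpus", n=4)
        scorer += ("a dog runs in the park", ["a dog is running in a park"])
        scorer += ("a cat on a mat", ["a cat sits on a mat", "there is a cat on the mat"])
        mean_score, per_image_scores = scorer.compute_score()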
54 | """ 55 | 56 | def copy(self): 57 | ''' copy the refs.''' 58 | new = CiderScorer(n=self.n) 59 | new.ctest = copy.copy(self.ctest) 60 | new.crefs = copy.copy(self.crefs) 61 | return new 62 | 63 | def __init__(self, df_mode="corpus", test=None, refs=None, n=4, sigma=6.0): 64 | ''' singular instance ''' 65 | self.n = n 66 | self.sigma = sigma 67 | self.crefs = [] 68 | self.ctest = [] 69 | self.df_mode = df_mode 70 | self.ref_len = None 71 | if self.df_mode != "corpus": 72 | pkl_file = cPickle.load(open(os.path.join('data', df_mode + '.p'),'rb'), **(dict(encoding='latin1') if six.PY3 else {})) 73 | self.ref_len = np.log(float(pkl_file['ref_len'])) 74 | self.document_frequency = pkl_file['document_frequency'] 75 | self.cook_append(test, refs) 76 | 77 | def clear(self): 78 | self.crefs = [] 79 | self.ctest = [] 80 | 81 | def cook_append(self, test, refs): 82 | '''called by constructor and __iadd__ to avoid creating new instances.''' 83 | 84 | if refs is not None: 85 | self.crefs.append(cook_refs(refs)) 86 | if test is not None: 87 | self.ctest.append(cook_test(test)) ## N.B.: -1 88 | else: 89 | self.ctest.append(None) # lens of crefs and ctest have to match 90 | 91 | def size(self): 92 | assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest)) 93 | return len(self.crefs) 94 | 95 | def __iadd__(self, other): 96 | '''add an instance (e.g., from another sentence).''' 97 | 98 | if type(other) is tuple: 99 | ## avoid creating new CiderScorer instances 100 | self.cook_append(other[0], other[1]) 101 | else: 102 | self.ctest.extend(other.ctest) 103 | self.crefs.extend(other.crefs) 104 | 105 | return self 106 | def compute_doc_freq(self): 107 | ''' 108 | Compute term frequency for reference data. 109 | This will be used to compute idf (inverse document frequency later) 110 | The term frequency is stored in the object 111 | :return: None 112 | ''' 113 | for refs in self.crefs: 114 | # refs, k ref captions of one image 115 | for ngram in set([ngram for ref in refs for (ngram,count) in ref.items()]): 116 | self.document_frequency[ngram] += 1 117 | # maxcounts[ngram] = max(maxcounts.get(ngram,0), count) 118 | 119 | def compute_cider(self): 120 | def counts2vec(cnts): 121 | """ 122 | Function maps counts of ngram to vector of tfidf weights. 123 | The function returns vec, an array of dictionary that store mapping of n-gram and tf-idf weights. 124 | The n-th entry of array denotes length of n-grams. 125 | :param cnts: 126 | :return: vec (array of dict), norm (array of float), length (int) 127 | """ 128 | vec = [defaultdict(float) for _ in range(self.n)] 129 | length = 0 130 | norm = [0.0 for _ in range(self.n)] 131 | for (ngram,term_freq) in cnts.items(): 132 | # give word count 1 if it doesn't appear in reference corpus 133 | df = np.log(max(1.0, self.document_frequency[ngram])) 134 | # ngram index 135 | n = len(ngram)-1 136 | # tf (term_freq) * idf (precomputed idf) for n-grams 137 | vec[n][ngram] = float(term_freq)*(self.ref_len - df) 138 | # compute norm for the vector. the norm will be used for 139 | # computing similarity 140 | norm[n] += pow(vec[n][ngram], 2) 141 | 142 | if n == 1: 143 | length += term_freq 144 | norm = [np.sqrt(n) for n in norm] 145 | return vec, norm, length 146 | 147 | def sim(vec_hyp, vec_ref, norm_hyp, norm_ref, length_hyp, length_ref): 148 | ''' 149 | Compute the cosine similarity of two vectors. 
150 | :param vec_hyp: array of dictionary for vector corresponding to hypothesis 151 | :param vec_ref: array of dictionary for vector corresponding to reference 152 | :param norm_hyp: array of float for vector corresponding to hypothesis 153 | :param norm_ref: array of float for vector corresponding to reference 154 | :param length_hyp: int containing length of hypothesis 155 | :param length_ref: int containing length of reference 156 | :return: array of score for each n-grams cosine similarity 157 | ''' 158 | delta = float(length_hyp - length_ref) 159 | # measure consine similarity 160 | val = np.array([0.0 for _ in range(self.n)]) 161 | for n in range(self.n): 162 | # ngram 163 | for (ngram,count) in vec_hyp[n].items(): 164 | val[n] += vec_hyp[n][ngram] * vec_ref[n][ngram] 165 | 166 | if (norm_hyp[n] != 0) and (norm_ref[n] != 0): 167 | val[n] /= (norm_hyp[n]*norm_ref[n]) 168 | 169 | assert(not math.isnan(val[n])) 170 | return val 171 | 172 | # compute log reference length 173 | if self.df_mode == "corpus": 174 | self.ref_len = np.log(float(len(self.crefs))) 175 | 176 | scores = [] 177 | for test, refs in zip(self.ctest, self.crefs): 178 | # compute vector for test captions 179 | vec, norm, length = counts2vec(test) 180 | # compute vector for ref captions 181 | score = np.array([0.0 for _ in range(self.n)]) 182 | for ref in refs: 183 | vec_ref, norm_ref, length_ref = counts2vec(ref) 184 | score += sim(vec, vec_ref, norm, norm_ref, length, length_ref) 185 | # change by vrama91 - mean of ngram scores, instead of sum 186 | score_avg = np.mean(score) 187 | # divide by number of references 188 | score_avg /= len(refs) 189 | # multiply score by 10 190 | score_avg *= 10.0 191 | # append score of an image to the score list 192 | scores.append(score_avg) 193 | return scores 194 | 195 | def compute_score(self, option=None, verbose=0): 196 | # compute idf 197 | if self.df_mode == "corpus": 198 | self.document_frequency = defaultdict(float) 199 | self.compute_doc_freq() 200 | # assert to check document frequency 201 | assert(len(self.ctest) >= max(self.document_frequency.values())) 202 | # import json for now and write the corresponding files 203 | # compute cider score 204 | score = self.compute_cider() 205 | # debug 206 | # print score 207 | return np.mean(np.array(score)), np.array(score) 208 | -------------------------------------------------------------------------------- /oscar/utils/cider/pyciderevalcap/ciderD/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /oscar/utils/cider/pyciderevalcap/ciderD/ciderD.py: -------------------------------------------------------------------------------- 1 | # Filename: ciderD.py 2 | # 3 | # Description: Describes the class to compute the CIDEr-D (Consensus-Based Image Description Evaluation) Metric 4 | # by Vedantam, Zitnick, and Parikh (http://arxiv.org/abs/1411.5726) 5 | # 6 | # Creation Date: Sun Feb 8 14:16:54 2015 7 | # 8 | # Authors: Ramakrishna Vedantam and Tsung-Yi Lin 9 | from __future__ import absolute_import 10 | from __future__ import division 11 | from __future__ import print_function 12 | 13 | from .ciderD_scorer import CiderScorer 14 | import pdb 15 | 16 | class CiderD: 17 | """ 18 | Main Class to compute the CIDEr metric 19 | 20 | """ 21 | def __init__(self, n=4, sigma=6.0, df="corpus"): 22 | # set cider to sum over 1 to 4-grams 23 | self._n = n 24 | # set the standard deviation parameter 
for gaussian penalty 25 | self._sigma = sigma 26 | # set where to compute document frequencies from 27 | self._df = df 28 | self.cider_scorer = CiderScorer(n=self._n, df_mode=self._df) 29 | 30 | def compute_score(self, gts, res): 31 | """ 32 | Main function to compute CIDEr score 33 | :param gts (dict) : ground truth captions, mapping image_id to a list of tokenized reference sentences 34 | res (list of dict) : candidate captions, each a dict with keys 'image_id' and 'caption' (a singleton list) 35 | :return: cider (float) : computed CIDEr score for the corpus 36 | """ 37 | 38 | # clear all the previous hypos and refs 39 | tmp_cider_scorer = self.cider_scorer.copy_empty() 40 | tmp_cider_scorer.clear() 41 | for res_id in res: 42 | 43 | hypo = res_id['caption'] 44 | ref = gts[res_id['image_id']] 45 | 46 | # Sanity check. 47 | assert(type(hypo) is list) 48 | assert(len(hypo) == 1) 49 | assert(type(ref) is list) 50 | assert(len(ref) > 0) 51 | tmp_cider_scorer += (hypo[0], ref) 52 | 53 | (score, scores) = tmp_cider_scorer.compute_score() 54 | 55 | return score, scores 56 | 57 | def method(self): 58 | return "CIDEr-D" 59 | -------------------------------------------------------------------------------- /oscar/utils/cider/pyciderevalcap/ciderD/ciderD_scorer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Tsung-Yi Lin 3 | # Ramakrishna Vedantam 4 | from __future__ import absolute_import 5 | from __future__ import division 6 | from __future__ import print_function 7 | 8 | import copy 9 | from collections import defaultdict 10 | import numpy as np 11 | import pdb 12 | import math 13 | import six 14 | from six.moves import cPickle 15 | import os 16 | 17 | def precook(s, n=4, out=False): 18 | """ 19 | Takes a string as input and returns an object that can be given to 20 | either cook_refs or cook_test. This is optional: cook_refs and cook_test 21 | can take string arguments as well. 22 | :param s: string : sentence to be converted into ngrams 23 | :param n: int : number of ngrams for which representation is calculated 24 | :return: term frequency vector for occurring ngrams 25 | """ 26 | words = s.split() 27 | counts = defaultdict(int) 28 | for k in range(1,n+1): 29 | for i in range(len(words)-k+1): 30 | ngram = tuple(words[i:i+k]) 31 | counts[ngram] += 1 32 | return counts 33 | 34 | def cook_refs(refs, n=4): ## lhuang: oracle will call with "average" 35 | '''Takes a list of reference sentences for a single segment 36 | and returns an object that encapsulates everything that BLEU 37 | needs to know about them. 38 | :param refs: list of string : reference sentences for some image 39 | :param n: int : number of ngrams for which (ngram) representation is calculated 40 | :return: result (list of dict) 41 | ''' 42 | return [precook(ref, n) for ref in refs] 43 | 44 | def cook_test(test, n=4): 45 | '''Takes a test sentence and returns an object that 46 | encapsulates everything that BLEU needs to know about it. 47 | :param test: string : hypothesis sentence for some image 48 | :param n: int : number of ngrams for which (ngram) representation is calculated 49 | :return: result (dict) 50 | ''' 51 | return precook(test, n, True) 52 | 53 | class CiderScorer(object): 54 | """CIDEr scorer.
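    CIDEr-D variant of the scorer: in ``sim`` below, hypothesis n-gram counts are clipped
    against the reference counts and a length-based Gaussian penalty controlled by
    ``sigma`` is applied. ``copy_empty()`` creates a fresh scorer that shares any
    precomputed document frequencies and reference length (when ``df_mode`` points to a
    cached file), which lets ``CiderD.compute_score`` reuse those corpus statistics
    across calls.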
55 | """ 56 | 57 | def copy(self): 58 | ''' copy the refs.''' 59 | new = CiderScorer(n=self.n) 60 | new.ctest = copy.copy(self.ctest) 61 | new.crefs = copy.copy(self.crefs) 62 | return new 63 | 64 | def copy_empty(self): 65 | new = CiderScorer(df_mode="corpus", n=self.n, sigma=self.sigma) 66 | new.df_mode = self.df_mode 67 | new.ref_len = self.ref_len 68 | new.document_frequency = self.document_frequency 69 | return new 70 | 71 | def __init__(self, df_mode="corpus", test=None, refs=None, n=4, sigma=6.0): 72 | ''' singular instance ''' 73 | self.n = n 74 | self.sigma = sigma 75 | self.crefs = [] 76 | self.ctest = [] 77 | self.df_mode = df_mode 78 | self.ref_len = None 79 | if self.df_mode != "corpus": 80 | pkl_file = cPickle.load(open(df_mode,'rb'), **(dict(encoding='latin1') if six.PY3 else {})) 81 | self.ref_len = np.log(float(pkl_file['ref_len'])) 82 | self.document_frequency = pkl_file['document_frequency'] 83 | else: 84 | self.document_frequency = None 85 | self.cook_append(test, refs) 86 | 87 | def clear(self): 88 | self.crefs = [] 89 | self.ctest = [] 90 | 91 | def cook_append(self, test, refs): 92 | '''called by constructor and __iadd__ to avoid creating new instances.''' 93 | 94 | if refs is not None: 95 | self.crefs.append(cook_refs(refs)) 96 | if test is not None: 97 | self.ctest.append(cook_test(test)) ## N.B.: -1 98 | else: 99 | self.ctest.append(None) # lens of crefs and ctest have to match 100 | 101 | def size(self): 102 | assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest)) 103 | return len(self.crefs) 104 | 105 | def __iadd__(self, other): 106 | '''add an instance (e.g., from another sentence).''' 107 | 108 | if type(other) is tuple: 109 | ## avoid creating new CiderScorer instances 110 | self.cook_append(other[0], other[1]) 111 | else: 112 | self.ctest.extend(other.ctest) 113 | self.crefs.extend(other.crefs) 114 | 115 | return self 116 | def compute_doc_freq(self): 117 | ''' 118 | Compute term frequency for reference data. 119 | This will be used to compute idf (inverse document frequency later) 120 | The term frequency is stored in the object 121 | :return: None 122 | ''' 123 | for refs in self.crefs: 124 | # refs, k ref captions of one image 125 | for ngram in set([ngram for ref in refs for (ngram,count) in ref.items()]): 126 | self.document_frequency[ngram] += 1 127 | # maxcounts[ngram] = max(maxcounts.get(ngram,0), count) 128 | 129 | def compute_cider(self): 130 | def counts2vec(cnts): 131 | """ 132 | Function maps counts of ngram to vector of tfidf weights. 133 | The function returns vec, an array of dictionary that store mapping of n-gram and tf-idf weights. 134 | The n-th entry of array denotes length of n-grams. 135 | :param cnts: 136 | :return: vec (array of dict), norm (array of float), length (int) 137 | """ 138 | vec = [defaultdict(float) for _ in range(self.n)] 139 | length = 0 140 | norm = [0.0 for _ in range(self.n)] 141 | for (ngram,term_freq) in cnts.items(): 142 | # give word count 1 if it doesn't appear in reference corpus 143 | df = np.log(max(1.0, self.document_frequency[ngram])) 144 | # ngram index 145 | n = len(ngram)-1 146 | # tf (term_freq) * idf (precomputed idf) for n-grams 147 | vec[n][ngram] = float(term_freq)*(self.ref_len - df) 148 | # compute norm for the vector. 
the norm will be used for computing similarity 149 | norm[n] += pow(vec[n][ngram], 2) 150 | 151 | if n == 1: 152 | length += term_freq 153 | norm = [np.sqrt(n) for n in norm] 154 | return vec, norm, length 155 | 156 | def sim(vec_hyp, vec_ref, norm_hyp, norm_ref, length_hyp, length_ref): 157 | ''' 158 | Compute the cosine similarity of two vectors. 159 | :param vec_hyp: array of dictionary for vector corresponding to hypothesis 160 | :param vec_ref: array of dictionary for vector corresponding to reference 161 | :param norm_hyp: array of float for vector corresponding to hypothesis 162 | :param norm_ref: array of float for vector corresponding to reference 163 | :param length_hyp: int containing length of hypothesis 164 | :param length_ref: int containing length of reference 165 | :return: array of score for each n-grams cosine similarity 166 | ''' 167 | delta = float(length_hyp - length_ref) 168 | # measure consine similarity 169 | val = np.array([0.0 for _ in range(self.n)]) 170 | for n in range(self.n): 171 | # ngram 172 | for (ngram,count) in vec_hyp[n].items(): 173 | # vrama91 : added clipping 174 | val[n] += min(vec_hyp[n][ngram], vec_ref[n][ngram]) * vec_ref[n][ngram] 175 | 176 | if (norm_hyp[n] != 0) and (norm_ref[n] != 0): 177 | val[n] /= (norm_hyp[n]*norm_ref[n]) 178 | 179 | assert(not math.isnan(val[n])) 180 | # vrama91: added a length based gaussian penalty 181 | val[n] *= np.e**(-(delta**2)/(2*self.sigma**2)) 182 | return val 183 | 184 | # compute log reference length 185 | if self.df_mode == "corpus": 186 | self.ref_len = np.log(float(len(self.crefs))) 187 | #elif self.df_mode == "coco-val-df": 188 | # if coco option selected, use length of coco-val set 189 | # self.ref_len = np.log(float(40504)) 190 | 191 | scores = [] 192 | for test, refs in zip(self.ctest, self.crefs): 193 | # compute vector for test captions 194 | vec, norm, length = counts2vec(test) 195 | # compute vector for ref captions 196 | score = np.array([0.0 for _ in range(self.n)]) 197 | for ref in refs: 198 | vec_ref, norm_ref, length_ref = counts2vec(ref) 199 | score += sim(vec, vec_ref, norm, norm_ref, length, length_ref) 200 | # change by vrama91 - mean of ngram scores, instead of sum 201 | score_avg = np.mean(score) 202 | # divide by number of references 203 | score_avg /= len(refs) 204 | # multiply score by 10 205 | score_avg *= 10.0 206 | # append score of an image to the score list 207 | scores.append(score_avg) 208 | return scores 209 | 210 | def compute_score(self, option=None, verbose=0): 211 | # compute idf 212 | if self.df_mode == "corpus": 213 | self.document_frequency = defaultdict(float) 214 | self.compute_doc_freq() 215 | # assert to check document frequency 216 | assert(len(self.ctest) >= max(self.document_frequency.values())) 217 | # import json for now and write the corresponding files 218 | # compute cider score 219 | score = self.compute_cider() 220 | # debug 221 | # print score 222 | return np.mean(np.array(score)), np.array(score) 223 | -------------------------------------------------------------------------------- /oscar/utils/logger.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Microsoft Corporation. Licensed under the MIT license. 2 | 3 | import logging 4 | from logging import StreamHandler, Handler, getLevelName 5 | import os 6 | import sys 7 | 8 | 9 | # this class is a copy of logging.FileHandler except we end self.close() 10 | # at the end of each emit. 
While closing file and reopening file after each 11 | # write is not efficient, it allows us to see partial logs when writing to 12 | # fused Azure blobs, which is very convenient 13 | class FileHandler(StreamHandler): 14 | """ 15 | A handler class which writes formatted logging records to disk files. 16 | """ 17 | def __init__(self, filename, mode='a', encoding=None, delay=False): 18 | """ 19 | Open the specified file and use it as the stream for logging. 20 | """ 21 | # Issue #27493: add support for Path objects to be passed in 22 | filename = os.fspath(filename) 23 | #keep the absolute path, otherwise derived classes which use this 24 | #may come a cropper when the current directory changes 25 | self.baseFilename = os.path.abspath(filename) 26 | self.mode = mode 27 | self.encoding = encoding 28 | self.delay = delay 29 | if delay: 30 | #We don't open the stream, but we still need to call the 31 | #Handler constructor to set level, formatter, lock etc. 32 | Handler.__init__(self) 33 | self.stream = None 34 | else: 35 | StreamHandler.__init__(self, self._open()) 36 | 37 | def close(self): 38 | """ 39 | Closes the stream. 40 | """ 41 | self.acquire() 42 | try: 43 | try: 44 | if self.stream: 45 | try: 46 | self.flush() 47 | finally: 48 | stream = self.stream 49 | self.stream = None 50 | if hasattr(stream, "close"): 51 | stream.close() 52 | finally: 53 | # Issue #19523: call unconditionally to 54 | # prevent a handler leak when delay is set 55 | StreamHandler.close(self) 56 | finally: 57 | self.release() 58 | 59 | def _open(self): 60 | """ 61 | Open the current base file with the (original) mode and encoding. 62 | Return the resulting stream. 63 | """ 64 | return open(self.baseFilename, self.mode, encoding=self.encoding) 65 | 66 | def emit(self, record): 67 | """ 68 | Emit a record. 69 | 70 | If the stream was not opened because 'delay' was specified in the 71 | constructor, open it before calling the superclass's emit. 72 | """ 73 | if self.stream is None: 74 | self.stream = self._open() 75 | StreamHandler.emit(self, record) 76 | self.close() 77 | 78 | def __repr__(self): 79 | level = getLevelName(self.level) 80 | return '<%s %s (%s)>' % (self.__class__.__name__, self.baseFilename, level) 81 | 82 | 83 | def setup_logger(name, save_dir, distributed_rank, filename="log.txt"): 84 | logger = logging.getLogger(name) 85 | logger.setLevel(logging.DEBUG) 86 | # don't log results for the non-master process 87 | if distributed_rank > 0: 88 | return logger 89 | ch = logging.StreamHandler(stream=sys.stdout) 90 | ch.setLevel(logging.DEBUG) 91 | formatter = logging.Formatter("%(asctime)s %(name)s %(levelname)s: %(message)s") 92 | ch.setFormatter(formatter) 93 | logger.addHandler(ch) 94 | 95 | if save_dir: 96 | fh = FileHandler(os.path.join(save_dir, filename)) 97 | fh.setLevel(logging.DEBUG) 98 | fh.setFormatter(formatter) 99 | logger.addHandler(fh) 100 | 101 | return logger 102 | 103 | -------------------------------------------------------------------------------- /oscar/utils/metric_logger.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from collections import defaultdict 3 | from collections import deque 4 | import os 5 | 6 | import torch 7 | 8 | from .misc import is_main_process 9 | 10 | 11 | class SmoothedValue(object): 12 | """Track a series of values and provide access to smoothed values over a 13 | window or the global series average. 
14 | """ 15 | 16 | def __init__(self, window_size=20): 17 | self.deque = deque(maxlen=window_size) 18 | # self.series = [] 19 | self.total = 0.0 20 | self.count = 0 21 | 22 | def update(self, value): 23 | self.deque.append(value) 24 | # self.series.append(value) 25 | self.count += 1 26 | self.total += value 27 | 28 | @property 29 | def median(self): 30 | d = torch.tensor(list(self.deque)) 31 | return d.median().item() 32 | 33 | @property 34 | def avg(self): 35 | d = torch.tensor(list(self.deque)) 36 | return d.mean().item() 37 | 38 | @property 39 | def global_avg(self): 40 | return self.total / self.count 41 | 42 | @property 43 | def last_value(self): 44 | return self.deque[-1] 45 | 46 | 47 | class MetricLogger(object): 48 | def __init__(self, delimiter="\t"): 49 | self.meters = {} 50 | self.params = {} 51 | self.delimiter = delimiter 52 | 53 | def update_params(self, update_dict): 54 | for param_group, group_dict in update_dict.items(): 55 | if param_group not in self.params: 56 | self.params[param_group] = {} 57 | for param_name, param_value in group_dict.items(): 58 | # skipping parameters if they start with '_' 59 | if param_name.startswith('_'): 60 | continue 61 | if isinstance(param_value, torch.Tensor): 62 | param_value = param_value.item() 63 | assert isinstance(param_value, (float, int)) 64 | self.params[param_group][param_name] = param_value 65 | 66 | def update_metrics(self, update_dict): 67 | for metric_group, group_dict in update_dict.items(): 68 | if metric_group not in self.meters: 69 | self.meters[metric_group] = defaultdict(SmoothedValue) 70 | for metric_name, metric_value in group_dict.items(): 71 | # skipping metrics if they start with '_' 72 | if metric_name.startswith('_'): 73 | continue 74 | if isinstance(metric_value, torch.Tensor): 75 | metric_value = metric_value.item() 76 | assert isinstance(metric_value, (float, int)) 77 | self.meters[metric_group][metric_name].update(metric_value) 78 | 79 | def get_logs(self, iteration): 80 | return_str = [] 81 | if len(self.meters) > 0: 82 | offset_m = max([len(group_name) for group_name in self.meters.keys()]) 83 | else: 84 | offset_m = 0 85 | if len(self.params) > 0: 86 | offset_p = max([len(group_name) for group_name in self.params.keys()]) 87 | else: 88 | offset_p = 0 89 | offset = max(offset_m, offset_p) 90 | 91 | for group_name, values in sorted(self.meters.items(), 92 | key=lambda x: x[0]): 93 | loss_str = [] 94 | for name, meter in values.items(): 95 | loss_str.append("{}: {:.4f} ({:.4f})".format( 96 | name, meter.median, meter.global_avg, 97 | )) 98 | return_str.append( 99 | "{:{offset}s} - {}".format( 100 | group_name, self.delimiter.join(loss_str), offset=offset, 101 | ), 102 | ) 103 | for group_name, values in self.params.items(): 104 | loss_str = [] 105 | for name, param in values.items(): 106 | loss_str.append("{}: {:.6f}".format(name, param)) 107 | return_str.append( 108 | "{:{offset}s} - {}".format( 109 | group_name, self.delimiter.join(loss_str), offset=offset, 110 | ), 111 | ) 112 | return "\n ".join(return_str) 113 | 114 | 115 | class TensorboardLogger(MetricLogger): 116 | def __init__(self, 117 | log_dir, 118 | delimiter='\t'): 119 | super(TensorboardLogger, self).__init__(delimiter) 120 | try: 121 | from tensorboardX import SummaryWriter 122 | except ImportError: 123 | raise ImportError( 124 | 'To use tensorboard please install tensorboardX ' 125 | '[ pip install tensorboardx ].' 
126 | ) 127 | self.philly_tb_logger = None 128 | self.philly_tb_logger_avg = None 129 | self.philly_tb_logger_med = None 130 | if is_main_process(): 131 | self.tb_logger = SummaryWriter(log_dir) 132 | self.tb_logger_avg = SummaryWriter(os.path.join(log_dir, 'avg')) 133 | self.tb_logger_med = SummaryWriter(os.path.join(log_dir, 'med')) 134 | else: 135 | self.tb_logger = None 136 | self.tb_logger_avg = None 137 | self.tb_logger_med = None 138 | 139 | def get_logs(self, iteration): 140 | if self.tb_logger: 141 | for group_name, values in self.meters.items(): 142 | for name, meter in values.items(): 143 | self.tb_logger.add_scalar( 144 | '{}/{}'.format(group_name, name), 145 | meter.last_value, iteration, 146 | ) 147 | self.tb_logger_avg.add_scalar( 148 | '{}/{}'.format(group_name, name), 149 | meter.avg, iteration, 150 | ) 151 | self.tb_logger_med.add_scalar( 152 | '{}/{}'.format(group_name, name), 153 | meter.median, iteration, 154 | ) 155 | if self.philly_tb_logger: 156 | self.philly_tb_logger.add_scalar( 157 | '{}/{}'.format(group_name, name), 158 | meter.last_value, iteration, 159 | ) 160 | self.philly_tb_logger_avg.add_scalar( 161 | '{}/{}'.format(group_name, name), 162 | meter.avg, iteration, 163 | ) 164 | self.philly_tb_logger_med.add_scalar( 165 | '{}/{}'.format(group_name, name), 166 | meter.median, iteration, 167 | ) 168 | for group_name, values in self.params.items(): 169 | for name, param in values.items(): 170 | self.tb_logger.add_scalar( 171 | '{}/{}'.format(group_name, name), 172 | param, iteration, 173 | ) 174 | if self.philly_tb_logger: 175 | self.philly_tb_logger.add_scalar( 176 | '{}/{}'.format(group_name, name), 177 | param, iteration, 178 | ) 179 | return super(TensorboardLogger, self).get_logs(iteration) 180 | 181 | def close(self): 182 | if is_main_process(): 183 | self.tb_logger.close() 184 | self.tb_logger_avg.close() 185 | self.tb_logger_med.close() 186 | -------------------------------------------------------------------------------- /oscar/utils/misc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Microsoft Corporation. Licensed under the MIT license. 2 | 3 | import errno 4 | import os 5 | import os.path as op 6 | import yaml 7 | import random 8 | import torch 9 | import numpy as np 10 | import torch.distributed as dist 11 | 12 | 13 | def mkdir(path): 14 | # if it is the current folder, skip. 
15 | if path == '': 16 | return 17 | try: 18 | os.makedirs(path) 19 | except OSError as e: 20 | if e.errno != errno.EEXIST: 21 | raise 22 | 23 | 24 | def set_seed(seed, n_gpu): 25 | random.seed(seed) 26 | np.random.seed(seed) 27 | torch.manual_seed(seed) 28 | if n_gpu > 0: 29 | torch.cuda.manual_seed_all(seed) 30 | 31 | 32 | def load_from_yaml_file(yaml_file): 33 | with open(yaml_file, 'r') as fp: 34 | return yaml.load(fp) 35 | 36 | 37 | def find_file_path_in_yaml(fname, root): 38 | if fname is not None: 39 | if op.isfile(fname): 40 | return fname 41 | elif op.isfile(op.join(root, fname)): 42 | return op.join(root, fname) 43 | else: 44 | raise FileNotFoundError( 45 | errno.ENOENT, os.strerror(errno.ENOENT), op.join(root, fname) 46 | ) 47 | 48 | 49 | def get_rank(): 50 | if not dist.is_available(): 51 | return 0 52 | if not dist.is_initialized(): 53 | return 0 54 | return dist.get_rank() 55 | 56 | 57 | def is_main_process(): 58 | return get_rank() == 0 59 | 60 | 61 | def get_world_size(): 62 | if not dist.is_available(): 63 | return 1 64 | if not dist.is_initialized(): 65 | return 1 66 | return dist.get_world_size() 67 | -------------------------------------------------------------------------------- /oscar/utils/task_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Microsoft Corporation. Licensed under the MIT license. 2 | 3 | from __future__ import absolute_import, division, print_function 4 | 5 | import csv, json 6 | import logging 7 | import os 8 | import sys 9 | from io import open 10 | import _pickle as cPickle 11 | import torch 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | class InputInstance(object): 17 | """A single training/test example for simple sequence classification.""" 18 | 19 | def __init__(self, guid, text_a, text_b=None, label=None, score=None, img_key=None, q_id=None): 20 | """Constructs a InputExample. 21 | 22 | Args: 23 | guid: Unique id for the example. 24 | text_a: string. The untokenized text of the first sequence. For single 25 | sequence tasks, only this sequence must be specified. 26 | text_b: (Optional) string. The untokenized text of the second sequence. 27 | Only must be specified for sequence pair tasks. 28 | label: (Optional) string. The label of the example. This should be 29 | specified for train and dev examples, but not for test examples. 
30 | """ 31 | 32 | self.guid = guid 33 | self.text_a = text_a 34 | self.text_b = text_b 35 | self.label = label 36 | self.score = score 37 | self.img_key = img_key 38 | self.q_id = q_id 39 | 40 | 41 | class InputFeat(object): 42 | """A single set of features of data.""" 43 | 44 | def __init__(self, input_ids, input_mask, segment_ids, label_id, score, img_feat): 45 | self.input_ids = input_ids 46 | self.input_mask = input_mask 47 | self.segment_ids = segment_ids 48 | self.label_id = label_id 49 | self.score = score 50 | self.img_feat = img_feat 51 | 52 | 53 | class DataProcessor(object): 54 | """Base class for data converters for sequence classification data sets.""" 55 | 56 | def get_train_examples(self, data_dir): 57 | """Gets a collection of `InputExample`s for the train set.""" 58 | raise NotImplementedError() 59 | 60 | def get_dev_examples(self, data_dir): 61 | """Gets a collection of `InputExample`s for the dev set.""" 62 | raise NotImplementedError() 63 | 64 | def get_labels(self): 65 | """Gets the list of labels for this data set.""" 66 | raise NotImplementedError() 67 | 68 | @classmethod 69 | def _read_tsv(cls, input_file, quotechar=None): 70 | """Reads a tab separated value file.""" 71 | with open(input_file, "r", encoding="utf-8-sig") as f: 72 | reader = csv.reader(f, delimiter="\t", quotechar=quotechar) 73 | lines = [] 74 | for line in reader: 75 | if sys.version_info[0] == 2: 76 | line = list(unicode(cell, 'utf-8') for cell in line) 77 | lines.append(line) 78 | return lines 79 | 80 | 81 | class VQATextProcessor(DataProcessor): 82 | """ Processor for the VQA Text data set. """ 83 | 84 | def get_train_examples(self, data_dir, file_name='train2014_qla.json'): 85 | """ See base class.""" 86 | 87 | lines = json.load(open(os.path.join(data_dir, file_name))) 88 | return self._create_examples(lines, "train") 89 | 90 | #return self._create_examples(self._read_tsv(os.path.join(data_dir, "train2014_qla.tsv")), "train") 91 | 92 | def get_dev_examples(self, data_dir, file_name='val2014_qla.json'): 93 | """ See base class.""" 94 | 95 | lines = json.load(open(os.path.join(data_dir, file_name))) 96 | return self._create_examples(lines, "dev") 97 | 98 | #return self._create_examples(self._read_tsv(os.path.join(data_dir, "val2014_qla.tsv")), "dev") 99 | 100 | def get_test_examples(self, data_dir, file_name='test2015_qla.json'): 101 | """ See base class.""" 102 | 103 | lines = json.load(open(os.path.join(data_dir, file_name))) 104 | return self._create_examples(lines, "test") 105 | 106 | def get_labels(self, label_file): 107 | """ See base class.""" 108 | 109 | ans2label = cPickle.load(open(label_file, 'rb')) 110 | return list(ans2label.values()) 111 | #return ["entailment", "not_entailment"] 112 | 113 | def _create_examples(self, lines, set_type): 114 | """Creates examples for the training and dev sets.""" 115 | 116 | examples = [] 117 | for (i, line) in enumerate(lines): 118 | if set_type!='test' and len(line['an']) == 0: continue 119 | 120 | guid = "%s-%s" % (set_type, str(i)) 121 | text_a = line['q'] 122 | text_b = line['o'].replace(';', ' ').strip() #line['o'] 123 | label = None if set_type.startswith('test') else line['an'] 124 | score = None if set_type.startswith('test') else line['s'] 125 | img_key = line['img_id'] 126 | q_id = int(line['q_id']) if set_type.startswith('test') else 0 127 | examples.append(InputInstance(guid=guid, text_a=text_a, text_b=text_b, label=label, score=score, img_key=img_key, q_id=q_id)) 128 | return examples 129 | 130 | class VQATextAProcessor(DataProcessor): 
131 | """ Processor for the VQA Text data set. """ 132 | 133 | def get_train_examples(self, data_dir, file_name='train2014_qla.json'): 134 | """ See base class.""" 135 | 136 | lines = json.load(open(os.path.join(data_dir, file_name))) 137 | return self._create_examples(lines, "train") 138 | 139 | #return self._create_examples(self._read_tsv(os.path.join(data_dir, "train2014_qla.tsv")), "train") 140 | 141 | def get_dev_examples(self, data_dir, file_name='val2014_qla.json'): 142 | """ See base class.""" 143 | 144 | lines = json.load(open(os.path.join(data_dir, file_name))) 145 | return self._create_examples(lines, "dev") 146 | 147 | #return self._create_examples(self._read_tsv(os.path.join(data_dir, "val2014_qla.tsv")), "dev") 148 | 149 | def get_test_examples(self, data_dir, file_name='test2015_qla.json'): 150 | """ See base class.""" 151 | 152 | lines = json.load(open(os.path.join(data_dir, file_name))) 153 | return self._create_examples(lines, "test") 154 | 155 | def get_labels(self, label_file): 156 | """ See base class.""" 157 | 158 | ans2label = cPickle.load(open(label_file, 'rb')) 159 | return list(ans2label.values()) 160 | 161 | def _create_examples(self, lines, set_type): 162 | """Creates examples for the training and dev sets.""" 163 | 164 | examples = [] 165 | for (i, line) in enumerate(lines): 166 | if set_type!='test' and len(line['an']) == 0: continue 167 | 168 | guid = "%s-%s" % (set_type, str(i)) 169 | text_a = line['q'] 170 | text_b = None # line['o'] # or None 171 | label = None if set_type.startswith('test') else line['an'] 172 | score = None if set_type.startswith('test') else line['s'] 173 | img_key = line['img_id'] 174 | q_id = int(line['q_id']) if set_type.startswith('test') else 0 175 | examples.append(InputInstance(guid=guid, text_a=text_a, text_b=text_b, label=label, score=score, img_key=img_key, q_id=q_id)) 176 | return examples 177 | 178 | class GQAProcessor(DataProcessor): 179 | """ Processor for the GQA data set. 
""" 180 | 181 | def get_train_examples(self, data_dir, file_name='train2014_qla.json'): 182 | """ See base class.""" 183 | 184 | lines = json.load(open(os.path.join(data_dir, file_name))) 185 | return self._create_examples(lines, "train") 186 | 187 | #return self._create_examples(self._read_tsv(os.path.join(data_dir, "train2014_qla.tsv")), "train") 188 | 189 | def get_dev_examples(self, data_dir, file_name='val2014_qla.json'): 190 | """ See base class.""" 191 | 192 | lines = json.load(open(os.path.join(data_dir, file_name))) 193 | return self._create_examples(lines, "dev") 194 | 195 | #return self._create_examples(self._read_tsv(os.path.join(data_dir, "val2014_qla.tsv")), "dev") 196 | 197 | def get_test_examples(self, data_dir, file_name='test2015_qla.json'): 198 | """ See base class.""" 199 | 200 | lines = json.load(open(os.path.join(data_dir, file_name))) 201 | return self._create_examples(lines, "test") 202 | 203 | def get_labels(self, label_file='trainval_testdev_all_ans2label.pkl'): 204 | """ See base class.""" 205 | 206 | ans2label = cPickle.load(open(label_file, 'rb')) 207 | return list(ans2label.values()) 208 | 209 | def _create_examples(self, lines, set_type): 210 | """Creates examples for the training and dev sets.""" 211 | 212 | examples = [] 213 | for (i, line) in enumerate(lines): 214 | if set_type!='test' and len(line['an']) == 0: continue 215 | 216 | guid = "%s-%s" % (set_type, str(i)) 217 | text_a = line['q'] 218 | text_b = line['o'] # or None 219 | label = None if set_type.startswith('test') else line['an'] 220 | score = 0 221 | img_key = line['img_id'] 222 | q_id = int(line['q_id']) if set_type.startswith('test') else 0 223 | examples.append(InputInstance(guid=guid, text_a=text_a, text_b=text_b, label=label, score=score, img_key=img_key, q_id=q_id)) 224 | return examples 225 | 226 | class NLVRProcessor(DataProcessor): 227 | """ Processor for the NLVR data set. """ 228 | 229 | def get_train_examples(self, data_dir, use_label_seq=True, file_name='nlvr2_train.json'): 230 | """ See base class.""" 231 | 232 | lines = json.load(open(os.path.join(data_dir, file_name))) 233 | return self._create_examples(lines, "train", use_label_seq) 234 | 235 | #return self._create_examples(self._read_tsv(os.path.join(data_dir, "train2014_qla.tsv")), "train") 236 | 237 | def get_dev_examples(self, data_dir, use_label_seq=True, file_name='nlvr2_dev.json'): 238 | """ See base class.""" 239 | 240 | lines = json.load(open(os.path.join(data_dir, file_name))) 241 | return self._create_examples(lines, "dev", use_label_seq) 242 | 243 | #return self._create_examples(self._read_tsv(os.path.join(data_dir, "val2014_qla.tsv")), "dev") 244 | 245 | def get_test_examples(self, data_dir, use_label_seq=True, file_name='nlvr2_test1.json'): 246 | """ See base class.""" 247 | 248 | lines = json.load(open(os.path.join(data_dir, file_name))) 249 | return self._create_examples(lines, "test", use_label_seq) 250 | 251 | def get_labels(self, label_file=None): 252 | """ See base class.""" 253 | 254 | #ans2label = cPickle.load(open(label_file, 'rb')) 255 | #return list(ans2label.values()) 256 | return [0, 1] 257 | 258 | def _create_examples(self, lines, set_type, use_label_seq=True): 259 | """ Creates examples for the training and dev sets. 
""" 260 | 261 | examples = [] 262 | for (i, line) in enumerate(lines): 263 | guid = "%s-%s" % (set_type, str(i)) 264 | text_a = line['q'] 265 | text_b = line['o'] if use_label_seq else None 266 | label = line['label'] #None if set_type.startswith('test') else line['label'] 267 | score = 0 268 | img_key = line['img_id'] #[line['img_left'], line['img_left']] 269 | q_id = 0 #int(line['q_id']) if set_type.startswith('test') else 0 270 | examples.append(InputInstance(guid=guid, text_a=text_a, text_b=text_b, label=label, score=score, img_key=img_key, q_id=q_id)) 271 | return examples 272 | 273 | class VCR_Q_A_Processor(DataProcessor): 274 | """ Processor for the VCR (q -> a) (Det) data set. """ 275 | 276 | def get_train_examples(self, data_dir, file_name='vcr_train.json'): 277 | """ See base class.""" 278 | 279 | lines = json.load(open(os.path.join(data_dir, file_name))) 280 | return self._create_examples(lines, "train") 281 | 282 | def get_dev_examples(self, data_dir, file_name='vcr_val.json'): 283 | """ See base class.""" 284 | 285 | lines = json.load(open(os.path.join(data_dir, file_name))) 286 | return self._create_examples(lines, "dev") 287 | 288 | def get_test_examples(self, data_dir, file_name='vcr_test.json'): 289 | """ See base class.""" 290 | 291 | lines = json.load(open(os.path.join(data_dir, file_name))) 292 | return self._create_examples(lines, "test") 293 | 294 | def get_labels(self, label_file=None): 295 | """ See base class.""" 296 | 297 | #ans2label = cPickle.load(open(label_file, 'rb')) 298 | #return list(ans2label.values()) 299 | return [0, 1] 300 | 301 | def _create_examples(self, lines, set_type): 302 | """ Creates examples for the training and dev sets. """ 303 | 304 | examples = [] 305 | for (i, line) in enumerate(lines): 306 | #if set_type!='test': continue 307 | 308 | guid = "%s-%s" % (set_type, str(i)) 309 | text_a = line['q'] # question 310 | choices = line['choices'] 311 | label = None if set_type.startswith('test') else line['label'] 312 | img_key = line['img_id'] 313 | q_id = int(line['annot_id'].split('-')[-1]) #int(line['q_id']) if set_type.startswith('test') else 0 314 | score = line['objects'] if 'objects' in line else None 315 | examples.append(InputInstance(guid=guid, text_a=text_a, text_b=choices, label=label, score=score, img_key=img_key, q_id=q_id)) 316 | return examples 317 | 318 | class VCR_QA_R_Processor(DataProcessor): 319 | """ Processor for the VCR (qa -> r) QA_R data set. """ 320 | 321 | def get_train_examples(self, data_dir, file_name='vcr_train.json'): 322 | """ See base class.""" 323 | 324 | lines = json.load(open(os.path.join(data_dir, file_name))) 325 | return self._create_examples(lines, "train") 326 | 327 | def get_dev_examples(self, data_dir, file_name='vcr_val.json'): 328 | """ See base class.""" 329 | 330 | lines = json.load(open(os.path.join(data_dir, file_name))) 331 | return self._create_examples(lines, "dev") 332 | 333 | def get_test_examples(self, data_dir, file_name='vcr_test.json'): 334 | """ See base class.""" 335 | 336 | lines = json.load(open(os.path.join(data_dir, file_name))) 337 | return self._create_examples(lines, "test") 338 | 339 | def get_labels(self, label_file=None): 340 | """ See base class.""" 341 | 342 | #ans2label = cPickle.load(open(label_file, 'rb')) 343 | #return list(ans2label.values()) 344 | return [0, 1] 345 | 346 | def _create_examples(self, lines, set_type): 347 | """ Creates examples for the training and dev sets. 
""" 348 | 349 | examples = [] 350 | for (i, line) in enumerate(lines): 351 | #if set_type!='test': continue 352 | 353 | guid = "%s-%s" % (set_type, str(i)) 354 | text_a = line['q'] + ' ' + line['choices'][line['label']] # question_choice 355 | choices = line['rational_choices'] # rational_choice 356 | label = None if set_type.startswith('test') else line['rational_label'] # rational_label 357 | img_key = line['img_id'] 358 | q_id = int(line['annot_id'].split('-')[-1]) #int(line['q_id']) if set_type.startswith('test') else 0 359 | examples.append(InputInstance(guid=guid, text_a=text_a, text_b=choices, label=label, score=None, img_key=img_key, q_id=q_id)) 360 | return examples 361 | 362 | class VCR_QAR_Processor(DataProcessor): 363 | """ Processor for the VCR (q->a, qa->r) data set. """ 364 | 365 | def get_train_examples(self, data_dir, file_name='vcr_train.json'): 366 | """ See base class.""" 367 | 368 | lines = json.load(open(os.path.join(data_dir, file_name))) 369 | return self._create_examples(lines, "train") 370 | 371 | def get_dev_examples(self, data_dir, file_name='vcr_val.json'): 372 | """ See base class.""" 373 | 374 | lines = json.load(open(os.path.join(data_dir, file_name))) 375 | return self._create_examples(lines, "dev") 376 | 377 | def get_test_examples(self, data_dir, file_name='vcr_test.json'): 378 | """ See base class.""" 379 | 380 | lines = json.load(open(os.path.join(data_dir, file_name))) 381 | return self._create_examples(lines, "test") 382 | 383 | def get_labels(self, label_file=None): 384 | """ See base class.""" 385 | 386 | #ans2label = cPickle.load(open(label_file, 'rb')) 387 | #return list(ans2label.values()) 388 | return [0, 1] 389 | 390 | def _create_examples(self, lines, set_type): 391 | """ Creates examples for the training and dev sets. 
""" 392 | 393 | examples = [] 394 | for (i, line) in enumerate(lines): 395 | #if set_type!='test': continue 396 | 397 | guid = "%s-%s-q-a" % (set_type, str(i)) 398 | text_a = line['q'] # question 399 | choices = line['choices'] 400 | label = None if set_type.startswith('test') else line['label'] 401 | img_key = line['img_id'] 402 | q_id = int(line['annot_id'].split('-')[-1]) #int(line['q_id']) if set_type.startswith('test') else 0 403 | score = line['objects'] if 'objects' in line else None 404 | examples.append(InputInstance(guid=guid, text_a=text_a, text_b=choices, label=label, score=score, img_key=img_key, q_id=q_id)) 405 | 406 | if set_type == 'train': # qa -> r 407 | guid = "%s-%s-qa-r" % (set_type, str(i)) 408 | text_a = line['q'] + ' ' + line['choices'][line['label']] # question_choice 409 | choices = line['rational_choices'] # rational_choice 410 | label = None if set_type.startswith('test') else line['rational_label'] # rational_label 411 | img_key = line['img_id'] 412 | q_id = int(line['annot_id'].split('-')[-1]) # int(line['q_id']) if set_type.startswith('test') else 0 413 | score = line['objects'] if 'objects' in line else None 414 | examples.append(InputInstance(guid=guid, text_a=text_a, text_b=choices, label=label, score=score, img_key=img_key, q_id=q_id)) 415 | return examples 416 | 417 | 418 | def convert_examples_to_features_vqa(examples, img_feats, label_list, max_img_seq_length, max_seq_length, 419 | tokenizer, output_mode, 420 | cls_token_at_end=False, pad_on_left=False, 421 | cls_token='[CLS]', sep_token='[SEP]', pad_token=0, 422 | sequence_a_segment_id=0, sequence_b_segment_id=1, 423 | cls_token_segment_id=1, pad_token_segment_id=0, 424 | mask_padding_with_zero=True): 425 | """ Loads a data file into a list of `InputBatch`s 426 | `cls_token_at_end` define the location of the CLS token: 427 | - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP] 428 | - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS] 429 | `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet) 430 | """ 431 | 432 | label_map = {label:i for i, label in enumerate(label_list)} 433 | 434 | features = [] 435 | #debug: 436 | debug_size = 500 437 | 438 | for (ex_index, example) in enumerate(examples[0: ]): 439 | if len(example.label) == 0: continue 440 | if ex_index % 10000 == 0: 441 | logger.info("Writing example %d of %d" % (ex_index, len(examples))) 442 | 443 | tokens_a = tokenizer.tokenize(example.text_a) 444 | 445 | tokens_b = None 446 | if example.text_b: 447 | tokens_b = tokenizer.tokenize(example.text_b) 448 | # Modifies `tokens_a` and `tokens_b` in place so that the total 449 | # length is less than the specified length. 450 | # Account for [CLS], [SEP], [SEP] with "- 3" 451 | _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) 452 | else: 453 | # Account for [CLS] and [SEP] with "- 2" 454 | if len(tokens_a) > max_seq_length - 2: 455 | tokens_a = tokens_a[:(max_seq_length - 2)] 456 | 457 | # The convention in BERT is: 458 | # (a) For sequence pairs: 459 | # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] 460 | # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 461 | # (b) For single sequences: 462 | # tokens: [CLS] the dog is hairy . [SEP] 463 | # type_ids: 0 0 0 0 0 0 0 464 | # 465 | # Where "type_ids" are used to indicate whether this is the first 466 | # sequence or the second sequence. 
The embedding vectors for `type=0` and 467 | # `type=1` were learned during pre-training and are added to the wordpiece 468 | # embedding vector (and position vector). This is not *strictly* necessary 469 | # since the [SEP] token unambiguously separates the sequences, but it makes 470 | # it easier for the model to learn the concept of sequences. 471 | # 472 | # For classification tasks, the first vector (corresponding to [CLS]) is 473 | # used as as the "sentence vector". Note that this only makes sense because 474 | # the entire model is fine-tuned. 475 | tokens = tokens_a + [sep_token] 476 | segment_ids = [sequence_a_segment_id] * len(tokens) 477 | 478 | if tokens_b: 479 | tokens += tokens_b + [sep_token] 480 | segment_ids += [sequence_b_segment_id] * (len(tokens_b) + 1) 481 | 482 | if cls_token_at_end: 483 | tokens = tokens + [cls_token] 484 | segment_ids = segment_ids + [cls_token_segment_id] 485 | else: 486 | tokens = [cls_token] + tokens 487 | segment_ids = [cls_token_segment_id] + segment_ids 488 | 489 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 490 | 491 | # The mask has 1 for real tokens and 0 for padding tokens. Only real 492 | # tokens are attended to. 493 | input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) 494 | 495 | # Zero-pad up to the sequence length. 496 | padding_length = max_seq_length - len(input_ids) 497 | if pad_on_left: 498 | input_ids = ([pad_token] * padding_length) + input_ids 499 | input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask 500 | segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids 501 | else: 502 | input_ids = input_ids + ([pad_token] * padding_length) 503 | input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length) 504 | segment_ids = segment_ids + ([pad_token_segment_id] * padding_length) 505 | 506 | assert len(input_ids) == max_seq_length 507 | assert len(input_mask) == max_seq_length 508 | assert len(segment_ids) == max_seq_length 509 | 510 | # image features 511 | #img_feat = img_feats[example.img_key] # torch 512 | img_feat = img_feats.item().get(example.img_key) # numpy 513 | if img_feat.shape[0] > max_img_seq_length: 514 | img_feat = img_feat[0:max_img_seq_length, ] 515 | if max_img_seq_length > 0: 516 | input_mask = input_mask + [1 if mask_padding_with_zero else 0] * img_feat.shape[0] 517 | #segment_ids += [sequence_b_segment_id] * img_feat.shape[0] 518 | else: 519 | if max_img_seq_length > 0: 520 | input_mask = input_mask + [1 if mask_padding_with_zero else 0] * img_feat.shape[0] 521 | #segment_ids = segment_ids + [sequence_b_segment_id] * img_feat.shape[0] 522 | padding_matrix = torch.zeros((max_img_seq_length - img_feat.shape[0], img_feat.shape[1])) 523 | img_feat = torch.cat((img_feat, padding_matrix), 0) 524 | if max_img_seq_length > 0: 525 | input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_matrix.shape[0]) 526 | #segment_ids = segment_ids + [pad_token_segment_id] * padding_matrix.shape[0] 527 | 528 | if output_mode == "classification": 529 | label_id = [label_map[l] for l in example.label] 530 | score = example.score 531 | elif output_mode == "regression": 532 | label_id = float(example.label) 533 | else: 534 | raise KeyError(output_mode) 535 | 536 | if ex_index < 5: 537 | logger.info("*** Example ***") 538 | logger.info("guid: %s" % (example.guid)) 539 | logger.info("tokens: %s" % " ".join([str(x) for x in tokens])) 540 | logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) 541 | 
logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) 542 | logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) 543 | logger.info("label: %s (id = %s)" % (example.label, label_id)) 544 | logger.info("score: %s (score = %s)" % (example.score, score)) 545 | 546 | features.append(InputFeat(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, label_id=label_id, score=score, img_feat=img_feat)) 547 | return features 548 | 549 | 550 | def _truncate_seq_pair(tokens_a, tokens_b, max_length): 551 | """Truncates a sequence pair in place to the maximum length.""" 552 | 553 | # This is a simple heuristic which will always truncate the longer sequence 554 | # one token at a time. This makes more sense than truncating an equal percent 555 | # of tokens from each, since if one sequence is very short then each token 556 | # that's truncated likely contains more information than a longer sequence. 557 | while True: 558 | total_length = len(tokens_a) + len(tokens_b) 559 | if total_length <= max_length: 560 | break 561 | if len(tokens_a) > len(tokens_b): 562 | tokens_a.pop() 563 | else: 564 | tokens_b.pop() 565 | 566 | 567 | processors = { 568 | "vqa_text": VQATextProcessor, 569 | "vqa_text_a": VQATextAProcessor, 570 | "gqa": GQAProcessor, 571 | "nlvr": NLVRProcessor, 572 | "vcr_q_a": VCR_Q_A_Processor, 573 | "vcr_qa_r": VCR_QA_R_Processor, 574 | "vcr_qar": VCR_QAR_Processor, 575 | } 576 | 577 | output_modes = { 578 | "vqa_text": "classification", 579 | "vqa_text_a": "classification", 580 | "gqa": "classification", 581 | "nlvr": "classification", 582 | "vcr_q_a": "classification", 583 | "vcr_qa_r": "classification", 584 | "vcr_qar": "classification", 585 | } 586 | 587 | GLUE_TASKS_NUM_LABELS = { 588 | "vqa_text": 3129, 589 | "vqa_text_a": 3129, 590 | "gqa": 1853, 591 | "nlvr": 2, 592 | "vcr_q_a": 2, 593 | "vcr_qa_r": 2, 594 | "vcr_qar": 2, 595 | } -------------------------------------------------------------------------------- /oscar/utils/tsv_file.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Microsoft Corporation. Licensed under the MIT license. 2 | 3 | import logging 4 | import os 5 | import os.path as op 6 | 7 | 8 | def generate_lineidx_file(filein, idxout): 9 | idxout_tmp = idxout + '.tmp' 10 | with open(filein, 'r') as tsvin, open(idxout_tmp,'w') as tsvout: 11 | fsize = os.fstat(tsvin.fileno()).st_size 12 | fpos = 0 13 | while fpos!=fsize: 14 | tsvout.write(str(fpos)+"\n") 15 | tsvin.readline() 16 | fpos = tsvin.tell() 17 | os.rename(idxout_tmp, idxout) 18 | 19 | 20 | class TSVFile(object): 21 | def __init__(self, tsv_file, generate_lineidx=False): 22 | self.tsv_file = tsv_file 23 | self.lineidx = op.splitext(tsv_file)[0] + '.lineidx' 24 | self._fp = None 25 | self._lineidx = None 26 | # the process always keeps the process which opens the file. 27 | # If the pid is not equal to the currrent pid, we will re-open the file. 
28 | self.pid = None 29 | # generate lineidx if not exist 30 | if not op.isfile(self.lineidx) and generate_lineidx: 31 | generate_lineidx_file(self.tsv_file, self.lineidx) 32 | 33 | def __del__(self): 34 | if self._fp: 35 | self._fp.close() 36 | 37 | def __str__(self): 38 | return "TSVFile(tsv_file='{}')".format(self.tsv_file) 39 | 40 | def __repr__(self): 41 | return str(self) 42 | 43 | def num_rows(self): 44 | self._ensure_lineidx_loaded() 45 | return len(self._lineidx) 46 | 47 | def seek(self, idx): 48 | self._ensure_tsv_opened() 49 | self._ensure_lineidx_loaded() 50 | try: 51 | pos = self._lineidx[idx] 52 | except: 53 | logging.info('{}-{}'.format(self.tsv_file, idx)) 54 | raise 55 | self._fp.seek(pos) 56 | return [s.strip() for s in self._fp.readline().split('\t')] 57 | 58 | def seek_first_column(self, idx): 59 | self._ensure_tsv_opened() 60 | self._ensure_lineidx_loaded() 61 | pos = self._lineidx[idx] 62 | self._fp.seek(pos) 63 | return read_to_character(self._fp, '\t') 64 | 65 | def __getitem__(self, index): 66 | return self.seek(index) 67 | 68 | def __len__(self): 69 | return self.num_rows() 70 | 71 | def _ensure_lineidx_loaded(self): 72 | if self._lineidx is None: 73 | logging.info('loading lineidx: {}'.format(self.lineidx)) 74 | with open(self.lineidx, 'r') as fp: 75 | self._lineidx = [int(i.strip()) for i in fp.readlines()] 76 | 77 | def _ensure_tsv_opened(self): 78 | if self._fp is None: 79 | self._fp = open(self.tsv_file, 'r') 80 | self.pid = os.getpid() 81 | 82 | if self.pid != os.getpid(): 83 | logging.info('re-open {} because the process id changed'.format(self.tsv_file)) 84 | self._fp = open(self.tsv_file, 'r') 85 | self.pid = os.getpid() 86 | -------------------------------------------------------------------------------- /oscar/utils/tsv_file_ops.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Microsoft Corporation. Licensed under the MIT license. 
2 | 3 | import logging 4 | import numpy as np 5 | import os 6 | import os.path as op 7 | import shutil 8 | from .misc import mkdir 9 | from .tsv_file import TSVFile 10 | 11 | 12 | def tsv_writer(values, tsv_file_name, sep='\t'): 13 | mkdir(os.path.dirname(tsv_file_name)) 14 | tsv_file_name_tmp = tsv_file_name + '.tmp' 15 | with open(tsv_file_name_tmp, 'wb') as fp: 16 | assert values is not None 17 | for value in values: 18 | assert value is not None 19 | v = sep.join(map(lambda v: v.decode() if type(v) == bytes else str(v), value)) + '\n' 20 | v = v.encode() 21 | fp.write(v) 22 | os.rename(tsv_file_name_tmp, tsv_file_name) 23 | 24 | 25 | def concat_files(ins, out): 26 | out_tmp = out + '.tmp' 27 | with open(out_tmp, 'wb') as fp_out: 28 | for i, f in enumerate(ins): 29 | with open(f, 'rb') as fp_in: 30 | shutil.copyfileobj(fp_in, fp_out, 1024*1024*10) 31 | os.rename(out_tmp, out) 32 | 33 | 34 | def concat_tsv_files(tsvs, out_tsv, generate_lineidx=False): 35 | concat_files(tsvs, out_tsv) 36 | if generate_lineidx: 37 | sizes = [os.stat(t).st_size for t in tsvs] 38 | sizes = np.cumsum(sizes) 39 | all_idx = [] 40 | for i, t in enumerate(tsvs): 41 | for idx in load_list_file(op.splitext(t)[0] + '.lineidx'): 42 | if i == 0: 43 | all_idx.append(idx) 44 | else: 45 | all_idx.append(str(int(idx) + sizes[i - 1])) 46 | with open(op.splitext(out_tsv)[0] + '.lineidx', 'w') as f: 47 | f.write('\n'.join(all_idx)) 48 | 49 | 50 | def load_list_file(fname): 51 | with open(fname, 'r') as fp: 52 | lines = fp.readlines() 53 | result = [line.strip() for line in lines] 54 | if len(result) > 0 and result[-1] == '': 55 | result = result[:-1] 56 | return result 57 | 58 | 59 | def reorder_tsv_keys(in_tsv_file, ordered_keys, out_tsv_file): 60 | tsv = TSVFile(in_tsv_file, generate_lineidx=True) 61 | keys = [tsv.seek(i)[0] for i in range(len(tsv))] 62 | key_to_idx = {key: i for i, key in enumerate(keys)} 63 | def gen_rows(): 64 | for key in ordered_keys: 65 | idx = key_to_idx[key] 66 | yield tsv.seek(idx) 67 | tsv_writer(gen_rows(), out_tsv_file) 68 | 69 | 70 | def delete_tsv_files(tsvs): 71 | for t in tsvs: 72 | if op.isfile(t): 73 | try_delete(t) 74 | line = op.splitext(t)[0] + '.lineidx' 75 | if op.isfile(line): 76 | try_delete(line) 77 | 78 | 79 | def try_once(func): 80 | def func_wrapper(*args, **kwargs): 81 | try: 82 | return func(*args, **kwargs) 83 | except Exception as e: 84 | logging.info('ignore error \n{}'.format(str(e))) 85 | return func_wrapper 86 | 87 | 88 | @try_once 89 | def try_delete(f): 90 | os.remove(f) 91 | 92 | 93 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tqdm 2 | pyyaml 3 | matplotlib 4 | requests 5 | scikit-image 6 | anytree 7 | regex 8 | boto3 9 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | from __future__ import print_function 4 | import os 5 | import sys 6 | import re 7 | import os.path as op 8 | from setuptools import find_packages, setup 9 | 10 | # change directory to this module path 11 | try: 12 | this_file = __file__ 13 | except NameError: 14 | this_file = sys.argv[0] 15 | this_file = os.path.abspath(this_file) 16 | if op.dirname(this_file): 17 | os.chdir(op.dirname(this_file)) 18 | script_dir = os.getcwd() 19 | 20 | def readme(fname): 21 | """Read text out of a file in the same directory 
as setup.py. 22 | """ 23 | return open(op.join(script_dir, fname)).read() 24 | 25 | 26 | def find_version(fname): 27 | version_file = readme(fname) 28 | version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", 29 | version_file, re.M) 30 | if version_match: 31 | return version_match.group(1) 32 | raise RuntimeError("Unable to find version string.") 33 | 34 | 35 | setup( 36 | name="oscar", 37 | version=find_version("oscar/__init__.py"), 38 | url='https://github.com/xjli/Oscar', 39 | description="Oscar for vision and language tasks", 40 | long_description=readme('README.md'), 41 | packages=find_packages(), 42 | classifiers=[ 43 | 'Intended Audience :: Developers', 44 | "Programming Language :: Python", 45 | 'Topic :: Software Development', 46 | ] 47 | ) 48 | --------------------------------------------------------------------------------
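
The processors, output_modes, and GLUE_TASKS_NUM_LABELS dictionaries defined alongside the processor classes act as a small task registry keyed by task name. Below is a minimal sketch of how a fine-tuning script might consume it; the import path oscar.utils.task_utils is inferred from the repository layout, and the label-file and data-dir paths are placeholders, not files shipped with the code.

    from oscar.utils.task_utils import processors, output_modes, GLUE_TASKS_NUM_LABELS

    task_name = "vqa_text"
    processor = processors[task_name]()              # VQATextProcessor
    output_mode = output_modes[task_name]            # "classification" for every task registered here
    num_labels = GLUE_TASKS_NUM_LABELS[task_name]    # 3129 answer classes for VQA

    # get_labels() turns an answer->index pickle into the label list used by the
    # feature converter; get_train_examples() reads <data_dir>/train2014_qla.json.
    label_list = processor.get_labels("data/vqa/trainval_ans2label.pkl")   # placeholder path
    train_examples = processor.get_train_examples("data/vqa")              # placeholder path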
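
Each VQA example carries a list of answer labels (line['an']) together with per-answer scores (line['s']), and convert_examples_to_features_vqa keeps them as the parallel lists label_id and score. A training loop would typically scatter these into a dense target vector before a BCE-style loss; the helper below is a hypothetical illustration of that step, not code from the repository.

    import torch

    def to_soft_target(label_id, score, num_labels):
        """Scatter per-example answer indices and soft scores into a dense vector."""
        target = torch.zeros(num_labels)
        target[torch.tensor(label_id, dtype=torch.long)] = torch.tensor(score, dtype=torch.float)
        return target

    # Two acceptable answers with VQA accuracy-style soft scores 1.0 and 0.3.
    target = to_soft_target([12, 345], [1.0, 0.3], num_labels=3129)
    print(target[12], target[345])   # tensor(1.) and tensor(0.3000)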
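
convert_examples_to_features_vqa builds one attention mask covering both modalities: the word pieces are padded to max_seq_length, then one mask entry is appended per image region, with zeros (and zero-padded feature rows) when fewer than max_img_seq_length regions are available. A toy illustration of the resulting layout, with made-up lengths:

    max_seq_length, max_img_seq_length = 8, 4
    tokens = ['[CLS]', 'is', 'this', 'a', 'dog', '?', '[SEP]']
    num_regions = 3                            # region features actually present for this image

    input_mask = [1] * len(tokens) + [0] * (max_seq_length - len(tokens))         # text + text padding
    input_mask += [1] * num_regions + [0] * (max_img_seq_length - num_regions)    # regions + region padding

    print(len(input_mask))   # 12, i.e. max_seq_length + max_img_seq_length
    print(input_mask)        # [1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0]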
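
_truncate_seq_pair always trims whichever of the two token lists is currently longer, one token at a time, so a short question survives intact while a long answer or object-tag sequence is shortened. A small example (the import path is assumed from the repository layout):

    from oscar.utils.task_utils import _truncate_seq_pair

    tokens_a = ['what', 'color', 'is', 'the', 'bus']
    tokens_b = ['bus', 'person', 'tree', 'road', 'sky', 'building', 'window']
    _truncate_seq_pair(tokens_a, tokens_b, max_length=9)
    print(tokens_a)   # ['what', 'color', 'is', 'the', 'bus']   (left untouched)
    print(tokens_b)   # ['bus', 'person', 'tree', 'road']       (popped from the end until 5 + 4 <= 9)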
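
TSVFile gives random access into a large TSV by pairing it with a .lineidx sidecar that stores the byte offset of every row; generate_lineidx_file writes that sidecar, and seek() jumps straight to the requested row. A small end-to-end sketch with throwaway file names:

    from oscar.utils.tsv_file import TSVFile, generate_lineidx_file

    with open('toy.tsv', 'w') as f:
        f.write('img_001\tlabel_a\n')
        f.write('img_002\tlabel_b\n')
    generate_lineidx_file('toy.tsv', 'toy.lineidx')   # one byte offset per row

    tsv = TSVFile('toy.tsv')   # TSVFile('toy.tsv', generate_lineidx=True) would build the sidecar itself
    print(len(tsv))            # 2
    print(tsv[1])              # ['img_002', 'label_b'], read by seeking to the stored offset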
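
tsv_writer streams rows into a .tmp file and renames it only once the generator is exhausted, so an interrupted job never leaves a truncated TSV behind; concat_tsv_files can then stitch shards together and rebuild a combined .lineidx by shifting each shard's offsets by the cumulative byte size of the shards before it. A usage sketch with placeholder paths; it assumes mkdir from oscar.utils.misc tolerates an existing output directory.

    from oscar.utils.tsv_file import TSVFile
    from oscar.utils.tsv_file_ops import tsv_writer, concat_tsv_files

    tsv_writer([('img_001', 'label_a'), ('img_002', 'label_b')], 'out/part_a.tsv')
    tsv_writer([('img_003', 'label_c')], 'out/part_b.tsv')

    # concat_tsv_files(..., generate_lineidx=True) reads each shard's .lineidx,
    # so build the sidecars first; constructing TSVFile with generate_lineidx=True does that.
    TSVFile('out/part_a.tsv', generate_lineidx=True)
    TSVFile('out/part_b.tsv', generate_lineidx=True)

    concat_tsv_files(['out/part_a.tsv', 'out/part_b.tsv'], 'out/all.tsv', generate_lineidx=True)
    print(TSVFile('out/all.tsv')[2])   # ['img_003', 'label_c'], its offset shifted into the merged file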