├── .gitignore ├── .gitmodules ├── CODE_OF_CONDUCT.md ├── DOWNLOAD.md ├── INSTALL.md ├── LICENSE ├── MODEL_ZOO.md ├── README.md ├── SECURITY.md ├── VinVL_DOWNLOAD.md ├── VinVL_MODEL_ZOO.md ├── docs ├── oscar.PNG ├── oscar_logo.png └── pretrain_corpus.PNG ├── oscar ├── __init__.py ├── datasets │ ├── __init__.py │ ├── build.py │ └── oscar_tsv.py ├── modeling │ ├── __init__.py │ ├── modeling_bert.py │ └── modeling_utils.py ├── run_captioning.py ├── run_gqa.py ├── run_nlvr.py ├── run_oscarplus_pretrain.py ├── run_retrieval.py ├── run_vqa.py └── utils │ ├── __init__.py │ ├── caption_evaluate.py │ ├── cbs.py │ ├── cider │ └── pyciderevalcap │ │ ├── __init__.py │ │ ├── cider │ │ ├── __init__.py │ │ ├── cider.py │ │ └── cider_scorer.py │ │ └── ciderD │ │ ├── __init__.py │ │ ├── ciderD.py │ │ └── ciderD_scorer.py │ ├── logger.py │ ├── metric_logger.py │ ├── misc.py │ ├── task_utils.py │ ├── tsv_file.py │ └── tsv_file_ops.py ├── requirements.txt └── setup.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Initially taken from Github's Python gitignore file 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # Distribution / packaging 12 | .Python 13 | build/ 14 | develop-eggs/ 15 | dist/ 16 | downloads/ 17 | eggs/ 18 | .eggs/ 19 | lib/ 20 | lib64/ 21 | parts/ 22 | sdist/ 23 | var/ 24 | wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | 53 | # Translations 54 | *.mo 55 | *.pot 56 | 57 | # Django stuff: 58 | *.log 59 | local_settings.py 60 | db.sqlite3 61 | 62 | # Flask stuff: 63 | instance/ 64 | .webassets-cache 65 | 66 | # Scrapy stuff: 67 | .scrapy 68 | 69 | # Sphinx documentation 70 | docs/_build/ 71 | 72 | # PyBuilder 73 | target/ 74 | 75 | # Jupyter Notebook 76 | .ipynb_checkpoints 77 | 78 | # IPython 79 | profile_default/ 80 | ipython_config.py 81 | 82 | # pyenv 83 | .python-version 84 | 85 | # celery beat schedule file 86 | celerybeat-schedule 87 | 88 | # SageMath parsed files 89 | *.sage.py 90 | 91 | # Environments 92 | .env 93 | .venv 94 | env/ 95 | venv/ 96 | ENV/ 97 | env.bak/ 98 | venv.bak/ 99 | 100 | # Spyder project settings 101 | .spyderproject 102 | .spyproject 103 | 104 | # Rope project settings 105 | .ropeproject 106 | 107 | # mkdocs documentation 108 | /site 109 | 110 | # mypy 111 | .mypy_cache/ 112 | .dmypy.json 113 | dmypy.json 114 | 115 | # Pyre type checker 116 | .pyre/ 117 | 118 | # vscode 119 | .vscode 120 | 121 | # TF code 122 | tensorflow_code 123 | 124 | # Models 125 | models 126 | proc_data 127 | 128 | # examples 129 | runs 130 | examples/runs 131 | 132 | # pyCharm 133 | .idea/ 134 | 135 | # local folders 136 | data 137 | models 138 | output 139 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- 1 | [submodule "transformers"] 2 | path = transformers 3 | url = git@github.com:huggingface/transformers.git 
4 | [submodule "coco_caption"] 5 | path = coco_caption 6 | url = git@github.com:LuoweiZhou/coco-caption.git 7 | -------------------------------------------------------------------------------- /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Microsoft Open Source Code of Conduct 2 | 3 | This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). 4 | 5 | Resources: 6 | 7 | - [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/) 8 | - [Microsoft Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) 9 | - Contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with questions or concerns 10 | -------------------------------------------------------------------------------- /DOWNLOAD.md: -------------------------------------------------------------------------------- 1 | # Download 2 | 3 | Note: The data is on Azure Storage Blob, a SAS with Read permission is provided. Please append the following SAS at the end of each link to download: 4 | ```bash 5 | ?sp=r&st=2023-08-28T01:12:41Z&se=3023-08-28T09:12:41Z&sv=2022-11-02&sr=c&sig=6R1YmWluiXmPLsdVn1rDUpeBp2SYBMxDjc6KoKNlY8Q%3D 6 | ``` 7 | 8 | ## Datasets 9 | We provide the extracted image region features, object tags, and the original text annotations for each downstream tasks. 10 | ```bash 11 | wget https://biglmdiag.blob.core.windows.net/oscar/datasets/$TASK_NAME.zip 12 | unzip $TASK_NAME.zip -d $DATA_DIR 13 | ``` 14 | `TASK_NAME` could be `coco_caption`, `coco_ir`, `vqa`, `GQA`, `nlvr2`. 15 | 16 | ## Pre-trained Models 17 | We provide pre-trained *Oscar* models of Bert-base and Bert-large structures, with the name starting with `base` and `large`, respectively. 18 | ```bash 19 | wget https://biglmdiag.blob.core.windows.net/oscar/pretrained_models/$MODEL_NAME.zip 20 | unzip $MODEL_NAME.zip -d $MODEL_DIR 21 | ``` 22 | `MODEL_NAME` could be `base-vg-labels`, `large-vg-labels`, `base-oid-labels`, `base-no-labels`. 23 | 24 | The models are trained with both image region features and object tags. The image region features are extracted by the Faster R-CNN with 25 | ResNet-101, using object and attribute annotations from [Visual Genome](http://visualgenome.org/). 26 | The object tags are from: 27 | 1) the same VisualGenome model, named as `-vg-labels`. Or, 28 | 2) the model trained on object annotations from [Open Images V5](https://storage.googleapis.com/openimages/web/index.html). named as `-oid-labels`. Or, 29 | 3) no object tags provied, serving as baseline, named as `-no-labels`. 30 | 31 | 32 | ### Note 33 | It is recommended to download large files with **AzCopy** for faster speed. 34 | AzCopy executable tools can be downloaded [here](https://docs.microsoft.com/en-us/azure/storage/common/storage-use-azcopy-v10#download-azcopy). 35 | Decompress the tar file and put the executable in any path. 
To download from 36 | any URL above, the command is: 37 | ```bash 38 | path/to/azcopy copy 39 | 40 | # for example, downloading coco_caption.zip 41 | path/to/azcopy copy https://biglmdiag.blob.core.windows.net/oscar/datasets/coco_caption.zip 42 | ``` 43 | 44 | -------------------------------------------------------------------------------- /INSTALL.md: -------------------------------------------------------------------------------- 1 | ## Installation 2 | ### Requirements 3 | - Python 3.7 4 | - Pytorch 1.2 5 | - torchvision 0.4.0 6 | - cuda 10.0 7 | 8 | ### Setup with Conda 9 | ```bash 10 | # create a new environment 11 | conda create --name oscar python=3.7 12 | conda activate oscar 13 | 14 | # install pytorch1.2 15 | conda install pytorch==1.2.0 torchvision==0.4.0 cudatoolkit=10.0 -c pytorch 16 | 17 | export INSTALL_DIR=$PWD 18 | 19 | # install apex 20 | cd $INSTALL_DIR 21 | git clone https://github.com/NVIDIA/apex.git 22 | cd apex 23 | python setup.py install --cuda_ext --cpp_ext 24 | 25 | # install oscar 26 | cd $INSTALL_DIR 27 | git clone --recursive git@github.com:microsoft/Oscar.git 28 | cd Oscar/coco_caption 29 | ./get_stanford_models.sh 30 | cd .. 31 | python setup.py build develop 32 | 33 | # install requirements 34 | pip install -r requirements.txt 35 | 36 | unset INSTALL_DIR 37 | ``` 38 | 39 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) Microsoft Corporation. 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE 22 | -------------------------------------------------------------------------------- /MODEL_ZOO.md: -------------------------------------------------------------------------------- 1 | Note: The data is on Azure Storage Blob, a SAS with Read permission is provided. 
Please append the following SAS at the end of each link to download: 2 | ```bash 3 | ?sp=r&st=2023-08-28T01:12:41Z&se=3023-08-28T09:12:41Z&sv=2022-11-02&sr=c&sig=6R1YmWluiXmPLsdVn1rDUpeBp2SYBMxDjc6KoKNlY8Q%3D 4 | ``` 5 | 6 | ## Table of Contents 7 | - VQA 8 | - GQA 9 | - NLVR2 10 | - Image/Text Retrieval 11 | - Image Captioning on COCO 12 | 13 | 14 | ## Performance 15 | Task | t2i | t2i | i2t | i2t | IC | IC | IC | IC | NoCaps | NoCaps | VQA | NLVR2 | 16 | --------|-----|-----|-----|-----|-----|-----|------|------|--------|--------|----------|---------| 17 | Metric | R@1 | R@5 | R@1 | R@5 | B@4 | M | C | S | C | S | test-std | test-P | 18 | SoTA_S |39.2 | 68.0|56.6 | 84.5|38.9 |29.2 |129.8 | 22.4 | 61.5 | 9.2 | 70.90 | 53.50 | 19 | SoTA_B |48.4 | 76.7|63.3 | 87.0|39.5 |29.3 |129.3 | 23.2 | 73.1 | 11.2 | 72.54 | 78.87 | 20 | SoTA_L |51.7 | 78.4|66.6 | 89.4| - | - | - | - | - | - | 73.40 | 79.50 | 21 | ----- |--- |--- |--- |--- |--- |--- |--- |--- |--- |--- |--- |--- | 22 | Oscar_B |54.0 | 80.8|70.0 | 91.1|40.5 |29.7 |137.6 | 22.8 | 78.8 | 11.7 | 73.44 | 78.44 | 23 | Oscar_L |57.5 | 82.8|73.5 | 92.2|41.7 |30.6 |140.0 | 24.5 | 80.9 | 11.3 | 73.82 | 80.37 | 24 | gain | 5.8 | 4.4| 6.9 | 2.8| 2.2 | 1.3 | 10.7 | 1.3 | 7.8 | 0.5 | 0.42 | 0.87 | 25 | 26 | t2i: text-to-image retrieval; i2t: image-to-text retrieval; IC: image captioning on COCO. 27 | 28 | For reference, we also release the training logs and output. 29 | 30 | 31 | ## VQA 32 | Script to finetune for Oscar base model. 33 | Base model is trained on train split and evaluated on the val split. Good for later comparison. 34 | 35 | Training logs: [eval_logs.json](https://biglmdiag.blob.core.windows.net/oscar/exp/vqa/base/base_9m_ep107_1192k_eu1/application_1575931286052_40649/results/eval_logs.json), [output.txt](https://biglmdiag.blob.core.windows.net/oscar/exp/vqa/base/base_9m_ep107_1192k_eu1/application_1575931286052_40649/results/stdout.txt).
36 | Final server results: [results.txt](https://biglmdiag.blob.core.windows.net/oscar/exp/vqa/base/results.txt).
37 | Model checkpoint: [.zip](https://biglmdiag.blob.core.windows.net/oscar/exp/vqa/base/vqa_base_best.zip). 38 | ```bash 39 | python oscar/run_vqa.py -j 4 --img_feature_dim 2054 --max_img_seq_length 40 | 50 --data_label_type mask --img_feature_type faster_r-cnn --data_dir datasets/vqa/2k 41 | --model_type bert --model_name_or_path pretrained_models/base-vg-labels/ep_107_1192087 42 | --task_name vqa_text --do_train --do_lower_case --max_seq_length 128 --per_gpu_eval_batch_size 43 | 256 --per_gpu_train_batch_size 32 --learning_rate 5e-05 --num_train_epochs 25 44 | --output_dir results --label_file datasets/vqa/cache/trainval_ans2label.pkl 45 | --save_epoch 1 --seed 88 --evaluate_during_training --logging_steps 4000 --drop_out 46 | 0.3 --weight_decay 0.05 --warmup_steps 0 --loss_type bce --img_feat_format pt 47 | --classifier linear --cls_hidden_scale 3 --txt_data_dir datasets/vqa/2k 48 | ``` 49 | 50 | Script to finetune for Oscar large model. 51 | Large model is trained on train+val split and evaluated on the val split, for reproduce the paper's best result. 52 | 53 | Training logs: [eval_logs.json](https://biglmdiag.blob.core.windows.net/oscar/exp/vqa/large/ab128_img_large_rr1_ep20_590k_tv_done_good/exp_ab128_img_large_rr1_ep20_590k_tv_0.00003_128_50_dp_0.3_wd_0.05_bce_3linear_s88_abcd/results/eval_logs.json), [output.txt](https://biglmdiag.blob.core.windows.net/oscar/exp/vqa/large/ab128_img_large_rr1_ep20_590k_tv_done_good/exp_ab128_img_large_rr1_ep20_590k_tv_0.00003_128_50_dp_0.3_wd_0.05_bce_3linear_s88_abcd/stdout.txt).
54 | Final server results: [results.txt](https://biglmdiag.blob.core.windows.net/oscar/exp/vqa/large/results.txt).
55 | Model checkpoint: [.zip](https://biglmdiag.blob.core.windows.net/oscar/exp/vqa/large/vqa_large_best.zip). 56 | ```bash 57 | python oscar/run_vqa.py -j 4 --img_feature_dim 2054 --max_img_seq_length 58 | 50 --data_label_type mask --img_feature_type faster_r-cnn --data_dir datasets/vqa/2k 59 | --model_type bert --model_name_or_path pretrained_models/large-vg-labels/ep_20_590000 60 | --task_name vqa_text --do_train_val --do_lower_case --max_seq_length 128 --per_gpu_eval_batch_size 61 | 256 --per_gpu_train_batch_size 24 --learning_rate 3e-05 --num_train_epochs 25 62 | --label_file datasets/vqa/cache/trainval_ans2label.pkl --save_epoch 30 63 | --seed 88 --evaluate_during_training --logging_steps 4000 --drop_out 0.3 --weight_decay 64 | 0.05 --warmup_steps 0 --loss_type bce --save_after_epoch 15 --output_dir results --img_feat_format pt --classifier linear --cls_hidden_scale 3 --txt_data_dir datasets/vqa/2k 65 | ``` 66 | 67 | 68 | ## GQA 69 | Script to finetune for Oscar base model. 70 | 71 | Training logs: [eval_logs.json](https://biglmdiag.blob.core.windows.net/oscar/exp/gqa/base/ab175_base_ep107_1192k_0.4true_taeb_done_25eps_good/exp_ab175_base_ep107_1192k_0.4true_taeb_b_48_0.00005_165_45_dp_0.3_abce/results/eval_logs.json), [output.txt](https://biglmdiag.blob.core.windows.net/oscar/exp/gqa/base/ab175_base_ep107_1192k_0.4true_taeb_done_25eps_good/exp_ab175_base_ep107_1192k_0.4true_taeb_b_48_0.00005_165_45_dp_0.3_abce/stdout.txt).
72 | Final server results: [results.txt](https://biglmdiag.blob.core.windows.net/oscar/exp/gqa/base/ab165_img45_1568928610179_62515_test_done_good/results.txt).
73 | Model checkpoint: [.zip](https://biglmdiag.blob.core.windows.net/oscar/exp/gqa/base/gqa_base_best.zip). 74 | ```bash 75 | python oscar/run_gqa.py -j 4 --img_feature_dim 2054 --max_img_seq_length 76 | 45 --data_dir datasets/GQA/0.4true --model_type bert --model_name_or_path pretrained_models/base-vg-labels/ep_107_1192087 77 | --task_name gqa --do_lower_case --max_seq_length 165 --per_gpu_eval_batch_size 78 | 256 --per_gpu_train_batch_size 48 --learning_rate 5e-05 --num_train_epochs 5 --output_dir 79 | results --label_file datasets/GQA/questions1.2/trainval_testdev_all_ans2label.pkl 80 | --img_feature_type faster_r-cnn --data_label_type all --train_data_type all --eval_data_type 81 | bal --label2ans_file datasets/GQA/questions1.2/trainval_testdev_all_label2ans.pkl 82 | --loss_type xe --save_epoch 2 --seed 88 --evaluate_during_training --logging_steps 83 | 4000 --drop_out 0.3 --do_train --weight_decay 0.05 --warmup_steps 0 84 | ``` 85 | 86 | ## NLVR2 87 | Script to finetune for Oscar base model. 88 | 89 | Training logs: [eval_logs.json](https://biglmdiag.blob.core.windows.net/oscar/exp/nlvr2/base/exp_rvln_base_ep107_1192k_wm1w_b72_0.00003_55_40_dp0.3_3mlp_wm10000_abcf_best/results/eval_logs.json), [output.txt](https://biglmdiag.blob.core.windows.net/oscar/exp/nlvr2/base/exp_rvln_base_ep107_1192k_wm1w_b72_0.00003_55_40_dp0.3_3mlp_wm10000_abcf_best/stdout.txt).
90 | Final server results: [results.txt](https://biglmdiag.blob.core.windows.net/oscar/exp/nlvr2/base/exp_nlvr_base_11123_testall_b24_0.00003_55_43_dp_0.3_mlp_abcj_best/stdout.txt). 91 | ```bash 92 | python oscar/run_nlvr.py -j 4 --img_feature_dim 2054 --max_img_seq_length 93 | 40 --data_dir datasets/nlvr2/ft_corpus --model_type bert --model_name_or_path pretrained_models/base-vg-labels/ep_107_1192087 94 | --task_name nlvr --do_lower_case --max_seq_length 55 --per_gpu_eval_batch_size 95 | 64 --per_gpu_train_batch_size 72 --learning_rate 3e-05 --num_train_epochs 20 --output_dir 96 | results --img_feature_type faster_r-cnn --data_label_type all --train_data_type 97 | all --eval_data_type all --loss_type xe --save_epoch -1 --seed 88 --evaluate_during_training 98 | --logging_steps -1 --drop_out 0.3 --do_train --weight_decay 0.05 --warmup_steps 99 | 10000 --classifier mlp --cls_hidden_scale 3 --num_choice 2 --use_pair 100 | ``` 101 | 102 | Script to finetune for Oscar large model. 103 | 104 | Training logs: [eval_logs.json](https://biglmdiag.blob.core.windows.net/oscar/exp/nlvr2/large/large_1583307153868_14140/exp_rvln_large_ep55_1618k_b24_0.00002_seq55_img40_dp0.3_2mlp_wm5000_abcj/results/eval_logs.json), [output.txt](https://biglmdiag.blob.core.windows.net/oscar/exp/nlvr2/large/large_1583307153868_14140/exp_rvln_large_ep55_1618k_b24_0.00002_seq55_img40_dp0.3_2mlp_wm5000_abcj/stdout.txt).
105 | Final server results: [results.txt](https://biglmdiag.blob.core.windows.net/oscar/exp/nlvr2/large/large_1583307153868_14140/exp_nlvr_large_1583307153868_14140_testall_b24_0.00003_55_43_dp_0.3_mlp_abck/stdout.txt). 106 | ```bash 107 | python oscar/run_nlvr.py -j 4 --img_feature_dim 2054 --max_img_seq_length 108 | 40 --data_dir datasets/nlvr2/ft_corpus --model_type bert --model_name_or_path pretrained_models/large-vg-labels/ep_55_1617000 109 | --task_name nlvr --do_lower_case --max_seq_length 55 --per_gpu_eval_batch_size 110 | 64 --per_gpu_train_batch_size 24 --learning_rate 3e-05 --num_train_epochs 20 --output_dir 111 | results --img_feature_type faster_r-cnn --data_label_type all --train_data_type 112 | all --eval_data_type all --loss_type xe --save_epoch -1 --seed 88 --evaluate_during_training 113 | --logging_steps -1 --drop_out 0.3 --do_train --weight_decay 0.05 --warmup_steps 114 | 5000 --classifier mlp --cls_hidden_scale 2 --num_choice 2 --use_pair 115 | ``` 116 | 117 | 131 | 132 | ## Image Text Retrieval 133 | Script to finetune for Oscar base model (4 V100 with 16G mem): 134 | 135 | Training logs: [eval_logs.json](https://biglmdiag.blob.core.windows.net/oscar/exp/retrieval/base/eval_logs.json), [log.txt](https://biglmdiag.blob.core.windows.net/oscar/exp/retrieval/base/log.txt). 136 | Model checkpoint: [checkpoint.zip](https://biglmdiag.blob.core.windows.net/oscar/exp/retrieval/base/checkpoint.zip). 137 | 138 | ```bash 139 | python oscar/run_retrieval.py \ 140 | --model_name_or_path pretrained_models/base-vg-labels/ep_67_588997 \ 141 | --do_train \ 142 | --do_lower_case \ 143 | --evaluate_during_training \ 144 | --num_captions_per_img_val 20 \ 145 | --eval_caption_index_file minival_caption_indexs_top20.pt \ 146 | --per_gpu_train_batch_size 32 \ 147 | --learning_rate 0.00002 \ 148 | --num_train_epochs 30 \ 149 | --weight_decay 0.05 \ 150 | --save_steps 5000 \ 151 | --add_od_labels \ 152 | --od_label_type vg \ 153 | --max_seq_length 70 \ 154 | --output_dir output/ 155 | ``` 156 | 157 | Script to finetune for Oscar large model (8 V100 with 32G mem): 158 | 159 | Training logs: [eval_logs.json](https://biglmdiag.blob.core.windows.net/oscar/exp/retrieval/large/eval_logs.json), [log.txt](https://biglmdiag.blob.core.windows.net/oscar/exp/retrieval/large/log.txt). 160 | Model checkpoint: [checkpoint.zip](https://biglmdiag.blob.core.windows.net/oscar/exp/retrieval/large/checkpoint.zip). 161 | 162 | ```bash 163 | python oscar/run_retrieval.py \ 164 | --model_name_or_path pretrained_models/large-vg-labels/ep_7_816000 \ 165 | --do_train \ 166 | --do_lower_case \ 167 | --evaluate_during_training \ 168 | --num_captions_per_img_val 20 \ 169 | --eval_caption_index_file minival_caption_indexs_top20.pt \ 170 | --per_gpu_train_batch_size 16 \ 171 | --learning_rate 0.00001 \ 172 | --num_train_epochs 30 \ 173 | --save_steps 5000 \ 174 | --add_od_labels \ 175 | --od_label_type vg \ 176 | --max_seq_length 70 \ 177 | --output_dir output/ 178 | ``` 179 | 180 | Script to inference on COCO 1K test set: 181 | ```bash 182 | python oscar/run_retrieval.py \ 183 | --do_test \ 184 | --do_eval \ 185 | --test_split test \ 186 | --num_captions_per_img_val 5 \ 187 | --eval_img_keys_file test_img_keys_1k.tsv \ 188 | --cross_image_eval \ 189 | --per_gpu_eval_batch_size 64 \ 190 | --eval_model_dir your_model_for_evaluation # could be base/large models. 
191 | ``` 192 | 193 | Script to inference on COCO 5K test set: 194 | ```bash 195 | python oscar/run_retrieval.py \ 196 | --do_test \ 197 | --do_eval \ 198 | --test_split test \ 199 | --num_captions_per_img_val 5 \ 200 | --eval_img_keys_file test_img_keys.tsv \ 201 | --cross_image_eval \ 202 | --per_gpu_eval_batch_size 64 \ 203 | --eval_model_dir your_model_for_evaluation # could be base/large models. 204 | ``` 205 | 206 | 207 | ## Image Captioning on COCO 208 | Script to finetune for Oscar base model (4 V100 with 16G mem): 209 | 210 | Training logs: [log.txt](https://biglmdiag.blob.core.windows.net/oscar/exp/coco_caption/base/log.txt). 211 | Model checkpoint: [checkpoint.zip](https://biglmdiag.blob.core.windows.net/oscar/exp/coco_caption/base/checkpoint.zip). 212 | 213 | 1) First train with cross-entropy loss: 214 | ```bash 215 | python oscar/run_captioning.py \ 216 | --model_name_or_path pretrained_models/base-vg-labels/ep_67_588997 \ 217 | --do_train \ 218 | --do_lower_case \ 219 | --evaluate_during_training \ 220 | --add_od_labels \ 221 | --learning_rate 0.00003 \ 222 | --per_gpu_train_batch_size 64 \ 223 | --num_train_epochs 30 \ 224 | --save_steps 5000 \ 225 | --output_dir output/ 226 | ``` 227 | 2) Finetune with CIDEr optimization: 228 | ```bash 229 | python oscar/run_captioning.py \ 230 | --model_name_or_path your_checkpoint_from_cross_entropy \ 231 | --do_train \ 232 | --do_lower_case \ 233 | --evaluate_during_training \ 234 | --add_od_labels \ 235 | --learning_rate 0.000005 \ 236 | --per_gpu_train_batch_size 16 \ 237 | --num_train_epochs 5 \ 238 | --scst \ 239 | --save_steps 2000 \ 240 | --output_dir output/ 241 | ``` 242 | 243 | Script to finetune for Oscar large model (8 V100 with 32G mem): 244 | 1) First train with cross-entropy loss: 245 | ```bash 246 | python oscar/run_captioning.py \ 247 | --model_name_or_path pretrained_models/large-vg-labels/ep_7_816000 \ 248 | --do_train \ 249 | --do_lower_case \ 250 | --evaluate_during_training \ 251 | --add_od_labels \ 252 | --learning_rate 0.00001 \ 253 | --per_gpu_train_batch_size 32 \ 254 | --num_train_epochs 30 \ 255 | --save_steps 5000 \ 256 | --output_dir output/ 257 | ``` 258 | 2) Finetune with CIDEr optimization: 259 | ```bash 260 | python oscar/run_captioning.py \ 261 | --model_name_or_path your_checkpoint_from_cross_entropy \ 262 | --do_train \ 263 | --do_lower_case \ 264 | --evaluate_during_training \ 265 | --add_od_labels \ 266 | --learning_rate 0.000005 \ 267 | --per_gpu_train_batch_size 8 \ 268 | --num_train_epochs 5 \ 269 | --scst \ 270 | --save_steps 2000 \ 271 | --output_dir output/ 272 | ``` 273 | 274 | Script to inference on COCO test set: 275 | ```bash 276 | python oscar/run_captioning.py \ 277 | --do_test \ 278 | --do_eval \ 279 | --test_yaml test.yaml \ 280 | --per_gpu_eval_batch_size 64 \ 281 | --num_beams 5 \ 282 | --max_gen_length 20 \ 283 | --eval_model_dir your_model_for_evaluation # could be bert base/large. 284 | ``` 285 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Oscar: Object-Semantics Aligned Pre-training for Vision-and-Language Tasks 2 | # VinVL: Revisiting Visual Representations in Vision-Language Models 3 | ## Updates 4 | 5 | 04/17/2023: Visual instruction tuning with GPT-4 is released! 
Please check out the multimodal model LLaVA: [[Project Page](https://llava-vl.github.io/)] [[Paper](https://arxiv.org/abs/2304.08485)] [[Demo](https://llava.hliu.cc/)] [[Data](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K)] [[Model](https://huggingface.co/liuhaotian/LLaVA-13b-delta-v0)] 6 | 7 | 05/28/2020: Released finetuned models on downstream tasks; please check [MODEL_ZOO.md](MODEL_ZOO.md).
8 | 05/15/2020: Released pretrained models, datasets, and code for downstream task finetuning.
9 | 01/13/2021: Our new work [VinVL](https://arxiv.org/abs/2101.00529) proposed OSCAR+, an improved version of OSCAR, and provided a better object-attribute detection model to extract features for V+L tasks. VinVL achieved SOTA performance on all seven V+L tasks covered here. Please stay tuned for the model and code release.
10 | 03/08/2021: Oscar+ pretraining code is released; please check the last section in [VinVL_MODEL_ZOO.md](VinVL_MODEL_ZOO.md). All image features and model checkpoints used in VinVL are also released; please check [VinVL](https://github.com/pzzhang/VinVL) for details.
11 | 04/13/2021: Our [Scene Graph Benchmark Repo](https://github.com/microsoft/scene_graph_benchmark) has been released. You are welcome to use the code there to extract image features with VinVL pretrained models.
12 | 13 | 14 | ## Introduction 15 | This repository contains source code necessary to reproduce the results presented in the paper [Oscar: Object-Semantics Aligned Pre-training for Vision-Language Tasks](https://arxiv.org/abs/2004.06165). 16 | We propose a new cross-modal pre-training method **Oscar** (Object-Semantics Aligned Pre-training). It leverages **object tags** detected in images as anchor points to significantly ease the learning of image-text alignments. We pre-train Oscar on the public corpus of 6.5 million text-image pairs, and fine-tune it on downstream tasks, creating new state-of-the-arts on six well-established vision-language understanding and generation tasks. For more on this project, see the [Microsoft Research Blog post](https://www.microsoft.com/en-us/research/blog/objects-are-the-secret-key-to-revealing-the-world-between-vision-and-language/). 17 | 18 | 19 | 20 | 21 | ## Performance 22 | Task | t2i | t2i | i2t | i2t | IC | IC | IC | IC | NoCaps | NoCaps | VQA | NLVR2 | GQA | 23 | --------|-----|-----|-----|-----|-----|-----|------|------|--------|--------|----------|---------|---------| 24 | Metric | R@1 | R@5 | R@1 | R@5 | B@4 | M | C | S | C | S | test-std | test-P | test-std| 25 | SoTA_S |39.2 | 68.0|56.6 | 84.5|38.9 |29.2 |129.8 | 22.4 | 61.5 | 9.2 | 70.92 | 58.80 | 63.17 | 26 | SoTA_B |54.0 | 80.8|70.0 | 91.1|40.5 |29.7 |137.6 | 22.8 | 86.58| 12.38 | 73.67 | 79.30 | - | 27 | SoTA_L |57.5 | 82.8|73.5 | 92.2|41.7 |30.6 |140.0 | 24.5 | - | - | 74.93 | 81.47 | - | 28 | ----- |--- |--- |--- |--- |--- |--- |--- |--- |--- |--- |--- |--- |--- | 29 | Oscar_B |54.0 | 80.8|70.0 | 91.1|40.5 |29.7 |137.6 | 22.8 | 78.8 | 11.7 | 73.44 | 78.36 | 61.62 | 30 | Oscar_L |57.5 | 82.8|73.5 | 92.2|41.7 |30.6 |140.0 | 24.5 | 80.9 | 11.3 | 73.82 | 80.05 | - | 31 | ----- |--- |--- |--- |--- |--- |--- |--- |--- |--- |--- |--- |--- |--- | 32 | VinVL_B |58.1 | 83.2|74.6 | 92.6|40.9 |30.9 |140.6 | 25.1 | 92.46| 13.07 | 76.12 | 83.08 | 64.65 | 33 | VinVL_L |58.8 | 83.5|75.4 | 92.9|41.0 |31.1 |140.9 | 25.2 | - | - | 76.62 | 83.98 | - | 34 | gain | 1.3 | 0.7| 1.9 | 0.6| -0.7| 0.5 | 0.9 | 0.7 | 5.9 | 0.7 | 1.69 | 2.51 | 1.48 | 35 | 36 | t2i: text-to-image retrieval; i2t: image-to-text retrieval; IC: image captioning on COCO. 37 | 38 | 39 | ## Download 40 | We released pre-trained models, datasets, VinVL image features, and Oscar+ pretraining corpus for downstream tasks. 41 | Please check [VinVL_DOWNLOAD.md](VinVL_DOWNLOAD.md) for details. 42 | 43 | To download checkpoints for the Vanilla OSCAR, please check [DOWNLOAD.md](DOWNLOAD.md) for details. 44 | 45 | ## Installation 46 | Check [INSTALL.md](INSTALL.md) for installation instructions. 47 | 48 | ## Model Zoo 49 | Check [MODEL_ZOO.md](MODEL_ZOO.md) for scripts to run oscar downstream finetuning. 50 | 51 | Check [VinVL_MODEL_ZOO.md](VinVL_MODEL_ZOO.md) for scripts to run oscar+ pretraining and downstream finetuning. 
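The finetuning scripts in [MODEL_ZOO.md](MODEL_ZOO.md) and [VinVL_MODEL_ZOO.md](VinVL_MODEL_ZOO.md) all point `--model_name_or_path` at an unzipped checkpoint directory. As a quick sanity check that a downloaded checkpoint is usable, the minimal sketch below loads its config and tokenizer; it assumes the pinned `transformers` submodule from [INSTALL.md](INSTALL.md) is importable and that the directory contains the usual `config.json` and vocab files (the path shown is illustrative).

```python
# Minimal sanity check for a downloaded Oscar checkpoint directory (illustrative sketch).
from transformers.pytorch_transformers import BertConfig, BertTokenizer

ckpt_dir = "pretrained_models/base-vg-labels/ep_67_588997"  # adjust to your unzip location

config = BertConfig.from_pretrained(ckpt_dir)  # reads config.json
tokenizer = BertTokenizer.from_pretrained(ckpt_dir, do_lower_case=True)  # reads the vocab file

print("hidden size:", config.hidden_size)
print(tokenizer.tokenize("a dog is sitting on a couch"))
```

If this loads without errors, the same directory can be passed to `--model_name_or_path` in the task scripts of the model zoo files.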
52 | 53 | ## Citations 54 | Please consider citing this paper if you use the code: 55 | ``` 56 | @article{li2020oscar, 57 | title={Oscar: Object-Semantics Aligned Pre-training for Vision-Language Tasks}, 58 | author={Li, Xiujun and Yin, Xi and Li, Chunyuan and Hu, Xiaowei and Zhang, Pengchuan and Zhang, Lei and Wang, Lijuan and Hu, Houdong and Dong, Li and Wei, Furu and Choi, Yejin and Gao, Jianfeng}, 59 | journal={ECCV 2020}, 60 | year={2020} 61 | } 62 | 63 | @article{zhang2021vinvl, 64 | title={VinVL: Making Visual Representations Matter in Vision-Language Models}, 65 | author={Zhang, Pengchuan and Li, Xiujun and Hu, Xiaowei and Yang, Jianwei and Zhang, Lei and Wang, Lijuan and Choi, Yejin and Gao, Jianfeng}, 66 | journal={CVPR 2021}, 67 | year={2021} 68 | } 69 | ``` 70 | 71 | ## License 72 | Oscar is released under the MIT license. See [LICENSE](LICENSE) for details. 73 | 74 | -------------------------------------------------------------------------------- /SECURITY.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | ## Security 4 | 5 | Microsoft takes the security of our software products and services seriously, which includes all source code repositories managed through our GitHub organizations, which include [Microsoft](https://github.com/Microsoft), [Azure](https://github.com/Azure), [DotNet](https://github.com/dotnet), [AspNet](https://github.com/aspnet), [Xamarin](https://github.com/xamarin), and [our GitHub organizations](https://opensource.microsoft.com/). 6 | 7 | If you believe you have found a security vulnerability in any Microsoft-owned repository that meets [Microsoft's definition of a security vulnerability](https://docs.microsoft.com/en-us/previous-versions/tn-archive/cc751383(v=technet.10)), please report it to us as described below. 8 | 9 | ## Reporting Security Issues 10 | 11 | **Please do not report security vulnerabilities through public GitHub issues.** 12 | 13 | Instead, please report them to the Microsoft Security Response Center (MSRC) at [https://msrc.microsoft.com/create-report](https://msrc.microsoft.com/create-report). 14 | 15 | If you prefer to submit without logging in, send email to [secure@microsoft.com](mailto:secure@microsoft.com). If possible, encrypt your message with our PGP key; please download it from the [Microsoft Security Response Center PGP Key page](https://www.microsoft.com/en-us/msrc/pgp-key-msrc). 16 | 17 | You should receive a response within 24 hours. If for some reason you do not, please follow up via email to ensure we received your original message. Additional information can be found at [microsoft.com/msrc](https://www.microsoft.com/msrc). 18 | 19 | Please include the requested information listed below (as much as you can provide) to help us better understand the nature and scope of the possible issue: 20 | 21 | * Type of issue (e.g. buffer overflow, SQL injection, cross-site scripting, etc.) 22 | * Full paths of source file(s) related to the manifestation of the issue 23 | * The location of the affected source code (tag/branch/commit or direct URL) 24 | * Any special configuration required to reproduce the issue 25 | * Step-by-step instructions to reproduce the issue 26 | * Proof-of-concept or exploit code (if possible) 27 | * Impact of the issue, including how an attacker might exploit the issue 28 | 29 | This information will help us triage your report more quickly. 30 | 31 | If you are reporting for a bug bounty, more complete reports can contribute to a higher bounty award. 
Please visit our [Microsoft Bug Bounty Program](https://microsoft.com/msrc/bounty) page for more details about our active programs. 32 | 33 | ## Preferred Languages 34 | 35 | We prefer all communications to be in English. 36 | 37 | ## Policy 38 | 39 | Microsoft follows the principle of [Coordinated Vulnerability Disclosure](https://www.microsoft.com/en-us/msrc/cvd). 40 | 41 | -------------------------------------------------------------------------------- /VinVL_DOWNLOAD.md: -------------------------------------------------------------------------------- 1 | # Download 2 | 3 | ## Datasets 4 | We provide the extracted image region features, object tags, and the original text annotations for each downstream tasks. 5 | ```bash 6 | path/to/azcopy copy 'https://biglmdiag.blob.core.windows.net/vinvl/datasets/TASK_NAME' --recursive 7 | ``` 8 | `TASK_NAME` could be `coco_caption`, `nocaps`, `coco_ir`, `vqa`, `gqa`, `nlvr2`. 9 | 10 | ## Pre-trained Models 11 | We provide pre-trained *Oscar+* models of Bert-base and Bert-large structures, with the name starting with `base` and `large`, respectively. 12 | ```bash 13 | path/to/azcopy copy 'https://biglmdiag.blob.core.windows.net/vinvl/model_ckpts/TASK_NAME' --recursive 14 | ``` 15 | `TASK_NAME` could be `image_captioning` (including `nocaps`), `coco_ir`, `vqa`, `gqa`, `nlvr2`, `od_models`. 16 | 17 | The models are trained with both image region features and object tags. The image region features are extracted by the Faster R-CNN with 18 | ResNet-101, using object and attribute annotations from [Visual Genome](http://visualgenome.org/). 19 | The object tags are from: 20 | 1) the same VisualGenome model, named as `-vg-labels`. Or, 21 | 2) the model trained on object annotations from [Open Images V5](https://storage.googleapis.com/openimages/web/index.html). named as `-oid-labels`. Or, 22 | 3) no object tags provied, serving as baseline, named as `-no-labels`. 23 | 24 | ## Pre-exacted Image Features 25 | For ease-of-use, we make pretrained features available for all pretraining datasets and downstream tasks. 26 | Features are stored in tsv (tab-separated-values) format that can be used in [pretraining](oscar/datasets/oscar_tsv.py) and dowstream tasks like [COCO Image-Text Retrieval](oscar/run_retrieval.py). 27 | 28 | Notice that all the links below are links to a folder. We recommend using the following AzCopy command to download. 
29 | ``` 30 | path/to/azcopy copy --recursive 31 | ``` 32 | 33 | [COCO 2014 Train/Val Image Features (~50G)](https://biglmdiag.blob.core.windows.net/vinvl/image_features/coco_X152C4_frcnnbig2_exp168model_0060000model.roi_heads.nm_filter_2_model.roi_heads.score_thresh_0.2/model_0060000/) 34 | 35 | [COCO 2014 Test Image Features (~16G)](https://biglmdiag.blob.core.windows.net/vinvl/image_features/coco_X152C4_frcnnbig2_exp168model_0060000model.roi_heads.nm_filter_2_model.roi_heads.score_thresh_0.2/model_0060000/coco2014test/) 36 | 37 | [COCO 2015 Test Image Features (~32G)](https://biglmdiag.blob.core.windows.net/vinvl/image_features/coco_X152C4_frcnnbig2_exp168model_0060000model.roi_heads.nm_filter_2_model.roi_heads.score_thresh_0.2/model_0060000/coco2015test/) 38 | 39 | [GQA All Image Features (~62G)](https://biglmdiag.blob.core.windows.net/vinvl/image_features/gqa_X152C4_frcnnbig2_exp168model_0060000model.roi_heads.nm_filter_2_model.roi_heads.score_thresh_0.2/model_0060000/) 40 | 41 | [NVLR2 Train/Del/Test Image Features (~28G)](https://biglmdiag.blob.core.windows.net/vinvl/image_features/nlvr2_X152C4_frcnnbig2_exp168model_0060000model.roi_heads.nm_filter_2_model.roi_heads.score_thresh_0.2/) 42 | 43 | [Flickr30k All Image Features (~14G)](https://biglmdiag.blob.core.windows.net/vinvl/image_features/flickr30k_X152C4_frcnnbig2_exp168model_0060000model.roi_heads.nm_filter_2_model.roi_heads.score_thresh_0.2/model_0060000/) 44 | 45 | [Google Conceptual Captions Image Features (Huge, ~960G, splitted into 12 chunks)](https://biglmdiag.blob.core.windows.net/vinvl/image_features/googlecc_X152C4_frcnnbig2_exp168model_0060000model.roi_heads.nm_filter_2_model.roi_heads.score_thresh_0.2/) 46 | 47 | [SBU Image Features (Huge, ~280G, splitted into 4 chunks)](https://biglmdiag.blob.core.windows.net/vinvl/image_features/sbu_X152C4_frcnnbig2_exp168model_0060000model.roi_heads.nm_filter_2_model.roi_heads.score_thresh_0.2/model_0060000/) 48 | 49 | [Open Images Detection Image Features (Huge, ~530G, splitted into 8 chunks)](https://biglmdiag.blob.core.windows.net/vinvl/image_features/oi_X152C4_frcnnbig2_exp168model_0060000model.roi_heads.nm_filter_2_model.roi_heads.score_thresh_0.2/model_0060000/) 50 | 51 | 52 | ## Oscar+ pretraining corpus 53 | 54 | 55 | [Small corpus](https://biglmdiag.blob.core.windows.net/vinvl/pretrain_corpus/coco_flickr30k_gqa.tsv) 56 | 57 | [Medium corpus](https://biglmdiag.blob.core.windows.net/vinvl/pretrain_corpus/coco_flickr30k_gqa_oi.tsv) 58 | 59 | [Large corpus](https://biglmdiag.blob.core.windows.net/vinvl/pretrain_corpus/coco_flickr30k_googlecc_gqa_sbu_oi.tsv) 60 | 61 | We have tried our best to make sure that there is no data contamination between pretraining corpus and test sets for downstream tasks. 62 | More specifically, we use two methods to achieve this. 63 | (1) We use the COCO Image ID of Visual Genome and Flickr30k images. 64 | (2) For COCO, Visual Genome and Flickr30k, we calucate the pair-wise l2 norm between two images after resizing them into the same size. 65 | 66 | 67 | ### Note 68 | It is recommended to download large files with **AzCopy** for faster speed. 69 | AzCopy executable tools can be downloaded [here](https://docs.microsoft.com/en-us/azure/storage/common/storage-use-azcopy-v10#download-azcopy). 70 | Decompress the tar file and put the executable in any path. 
To download from 71 | any URL above, the command is: 72 | ```bash 73 | path/to/azcopy copy 74 | 75 | # for example, downloading coco_caption.zip 76 | path/to/azcopy copy https://biglmdiag.blob.core.windows.net/oscar/datasets/coco_caption.zip 77 | ``` 78 | 79 | -------------------------------------------------------------------------------- /VinVL_MODEL_ZOO.md: -------------------------------------------------------------------------------- 1 | ## Table of Contents 2 | - VQA 3 | - GQA 4 | - NLVR2 5 | - Image/Text Retrieval 6 | - Image Captioning on COCO 7 | - Oscarplus pretraining 8 | 9 | 10 | ## Performance 11 | Task | t2i | t2i | i2t | i2t | IC | IC | IC | IC | NoCaps | NoCaps | VQA | NLVR2 | GQA | 12 | --------|-----|-----|-----|-----|-----|-----|------|------|--------|--------|----------|---------|---------| 13 | Metric | R@1 | R@5 | R@1 | R@5 | B@4 | M | C | S | C | S | test-std | test-P | test-std| 14 | SoTA_S |39.2 | 68.0|56.6 | 84.5|38.9 |29.2 |129.8 | 22.4 | 61.5 | 9.2 | 70.92 | 58.80 | 63.17 | 15 | SoTA_B |54.0 | 80.8|70.0 | 91.1|40.5 |29.7 |137.6 | 22.8 | 86.58| 12.38 | 73.67 | 79.30 | 61.62 | 16 | SoTA_L |57.5 | 82.8|73.5 | 92.2|41.7 |30.6 |140.0 | 24.5 | - | - | 74.93 | 81.47 | - | 17 | ----- |--- |--- |--- |--- |--- |--- |--- |--- |--- |--- |--- |--- |--- | 18 | VinVL_B |58.1 | 83.2|74.6 | 92.6|40.9 |30.9 |140.4 | 25.1 | 92.46 (with [VIVO](https://arxiv.org/abs/2009.13682))| 13.07 (with [VIVO](https://arxiv.org/abs/2009.13682)) | 76.12 | 83.08 | 64.65 | 19 | VinVL_L |58.8 | 83.5|75.4 | 92.9|41.0 |31.1 |140.9 | 25.2 | - | - | 76.62 | 83.98 | - | 20 | gain | 1.3 | 0.7| 1.9 | 0.6| -0.7| 0.5 | 0.9 | 0.7 | 5.9 | 0.7 | 1.69 | 2.51 | 1.48 | 21 | 22 | t2i: text-to-image retrieval; i2t: image-to-text retrieval; IC: image captioning on COCO. 23 | 24 | For reference, we also release the training logs and output. 25 | 26 | 27 | ## VQA 28 | Script to finetune for Oscar base model. 29 | Base model is trained on train split and evaluated on the val split. Good for later comparison. 30 | 33 | Final server results: [results.txt](https://biglmdiag.blob.core.windows.net/vinvl/model_ckpts/vqa/base/test/results.txt).
34 | Model checkpoint: [.zip](https://biglmdiag.blob.core.windows.net/vinvl/model_ckpts/vqa/base/best.zip). 35 | ```bash 36 | python oscar/run_vqa.py -j 4 --img_feature_dim 2054 --max_img_seq_length 37 | 50 --data_label_type mask --img_feature_type faster_r-cnn --data_dir vinvl/datasets/vqa 38 | --model_type bert --model_name_or_path vinvl/model_ckpts/vqa/base/checkpoint-2000000 39 | --task_name vqa_text --do_train --do_lower_case --max_seq_length 128 --per_gpu_eval_batch_size 40 | 256 --per_gpu_train_batch_size 32 --learning_rate 5e-05 --num_train_epochs 25 41 | --output_dir results --label_file datasets/vqa/cache/trainval_ans2label.pkl 42 | --save_epoch 1 --seed 88 --evaluate_during_training --logging_steps 4000 --drop_out 43 | 0.3 --weight_decay 0.05 --warmup_steps 0 --loss_type bce --img_feat_format pt 44 | --classifier linear --cls_hidden_scale 3 --txt_data_dir vinvl/datasets/vqa 45 | ``` 46 | 47 | Script to finetune for Oscar large model. 48 | Large model is trained on train+val split and evaluated on the val split, for reproduce the paper's best result. 49 | 50 | 53 | Final server results: [results.txt](https://biglmdiag.blob.core.windows.net/vinvl/model_ckpts/vqa/large/test/results.txt).
54 | Model checkpoint: [.zip](https://biglmdiag.blob.core.windows.net/vinvl/model_ckpts/vqa/large/best.zip). 55 | ```bash 56 | python oscar/run_vqa.py -j 4 --img_feature_dim 2054 --max_img_seq_length 57 | 50 --data_label_type mask --img_feature_type faster_r-cnn --data_dir vinvl/datasets/vqa 58 | --model_type bert --model_name_or_path vinvl/model_ckpts/vqa/large/checkpoint-2000000 59 | --task_name vqa_text --do_train_val --do_lower_case --max_seq_length 128 --per_gpu_eval_batch_size 60 | 256 --per_gpu_train_batch_size 24 --learning_rate 3e-05 --num_train_epochs 25 61 | --label_file datasets/vqa/cache/trainval_ans2label.pkl --save_epoch 30 62 | --seed 88 --evaluate_during_training --logging_steps 4000 --drop_out 0.3 --weight_decay 63 | 0.05 --warmup_steps 0 --loss_type bce --save_after_epoch 15 --output_dir results --img_feat_format pt --classifier linear --cls_hidden_scale 3 --txt_data_dir vinvl/datasets/vqa 64 | ``` 65 | 66 | 67 | ## GQA 68 | Script to finetune for Oscar base model. 69 | 70 | 73 | Final server results: [results.txt](https://biglmdiag.blob.core.windows.net/vinvl/model_ckpts/gqa/base/results.txt).
74 | Model checkpoint: [.zip](https://biglmdiag.blob.core.windows.net/vinvl/model_ckpts/gqa/base/best.zip). 75 | ```bash 76 | python oscar/run_gqa.py -j 4 --img_feature_dim 2054 --max_img_seq_length 77 | 45 --data_dir vinvl/datasets/gqa --model_type bert --model_name_or_path vinvl/model_ckpts/vqa/base/checkpoint-2000000 78 | --task_name gqa --do_lower_case --max_seq_length 165 --per_gpu_eval_batch_size 79 | 256 --per_gpu_train_batch_size 48 --learning_rate 5e-05 --num_train_epochs 5 --output_dir 80 | results --label_file vinvl/datasets/gqa/trainval_testdev_all_ans2label.pkl 81 | --img_feature_type faster_r-cnn --data_label_type all --train_data_type all --eval_data_type 82 | bal --label2ans_file vinvl/datasets/gqa/trainval_testdev_all_label2ans.pkl 83 | --loss_type xe --save_epoch 2 --seed 88 --evaluate_during_training --logging_steps 84 | 4000 --drop_out 0.3 --do_train --weight_decay 0.05 --warmup_steps 0 85 | ``` 86 | 87 | ## NLVR2 88 | Script to finetune for Oscar base model. 89 | 90 | 93 | Final server results: [results.txt](https://biglmdiag.blob.core.windows.net/vinvl/model_ckpts/nlvr2/base/rvln_base_oscar_v2_71.5_86236_test_done_best/exp_rvln_base_oscar_v2_71.5_86236_test_b24_0.00003_55_41_dp_0.3_mlp_abch/stdout.txt).
94 | Model checkpoint: [.zip](https://biglmdiag.blob.core.windows.net/vinvl/model_ckpts/nlvr2/base/best.zip). 95 | ```bash 96 | python oscar/run_nlvr.py -j 4 --img_feature_dim 2054 --max_img_seq_length 97 | 40 --data_dir vinvl/datasets/nlvr2 --model_type bert --model_name_or_path vinvl/model_ckpts/vqa/base/checkpoint-2000000 98 | --task_name nlvr --do_lower_case --max_seq_length 55 --per_gpu_eval_batch_size 99 | 64 --per_gpu_train_batch_size 72 --learning_rate 3e-05 --num_train_epochs 20 --output_dir 100 | results --img_feature_type faster_r-cnn --data_label_type all --train_data_type 101 | all --eval_data_type all --loss_type xe --save_epoch -1 --seed 88 --evaluate_during_training 102 | --logging_steps -1 --drop_out 0.3 --do_train --weight_decay 0.05 --warmup_steps 103 | 10000 --classifier mlp --cls_hidden_scale 3 --num_choice 2 --use_pair 104 | ``` 105 | 106 | Script to finetune for Oscar large model. 107 | 108 | 111 | Final server results: [results.txt](https://biglmdiag.blob.core.windows.net/vinvl/model_ckpts/nlvr2/large/rvln_oscar_v2_large_99617_test_done_best/exp_rvln_oscar_v2_large_99617_test_b24_0.00003_55_50_dp_0.3_mlp_abce/stdout.txt).
112 | Model checkpoint: [.zip](https://biglmdiag.blob.core.windows.net/vinvl/model_ckpts/nlvr2/large/best.zip). 113 | ```bash 114 | python oscar/run_nlvr.py -j 4 --img_feature_dim 2054 --max_img_seq_length 115 | 40 --data_dir vinvl/datasets/nlvr2 --model_type bert --model_name_or_path vinvl/model_ckpts/vqa/large/checkpoint-2000000 116 | --task_name nlvr --do_lower_case --max_seq_length 55 --per_gpu_eval_batch_size 117 | 64 --per_gpu_train_batch_size 24 --learning_rate 3e-05 --num_train_epochs 20 --output_dir 118 | results --img_feature_type faster_r-cnn --data_label_type all --train_data_type 119 | all --eval_data_type all --loss_type xe --save_epoch -1 --seed 88 --evaluate_during_training 120 | --logging_steps -1 --drop_out 0.3 --do_train --weight_decay 0.05 --warmup_steps 121 | 5000 --classifier mlp --cls_hidden_scale 2 --num_choice 2 --use_pair 122 | ``` 123 | 124 | 138 | 139 | ## Image Text Retrieval 140 | Script to finetune for Oscarplus base model (8 V100 with 16G mem): 141 | 142 | Training logs: [train_logs](https://biglmdiag.blob.core.windows.net/vinvl/model_ckpts/coco_ir/base/train_logs/), 143 | 144 | Training logs: [test_logs](https://biglmdiag.blob.core.windows.net/vinvl/model_ckpts/coco_ir/base/test_logs/), 145 | 146 | Command [command](https://biglmdiag.blob.core.windows.net/vinvl/model_ckpts/coco_ir/base/philly.yaml). 147 | 148 | Model checkpoint: [ckeckpoint](https://biglmdiag.blob.core.windows.net/vinvl/model_ckpts/coco_ir/base/checkpoint-0132780/). 149 | 150 | ```bash 151 | python oscar/run_retrieval.py \ 152 | --model_name_or_path vinvl/coco_ir/base/checkpoint-1340000 \ 153 | --do_train \ 154 | --do_lower_case \ 155 | --evaluate_during_training \ 156 | --num_captions_per_img_val 20 \ 157 | --eval_caption_index_file minival_caption_indexs_top20.pt \ 158 | --per_gpu_train_batch_size 16 \ 159 | --learning_rate 0.00002 \ 160 | --num_train_epochs 30 \ 161 | --weight_decay 0.05 \ 162 | --save_steps 5000 \ 163 | --add_od_labels \ 164 | --od_label_type vg \ 165 | --max_seq_length 70 \ 166 | --max_img_seq_length 70 \ 167 | --output_dir output/ 168 | ``` 169 | 170 | Script to finetune for Oscarplus large model (8 V100 with 32G mem): 171 | 172 | Training logs: [train_logs](https://biglmdiag.blob.core.windows.net/vinvl/model_ckpts/coco_ir/large/train_logs/), 173 | 174 | Training logs: [test_logs](https://biglmdiag.blob.core.windows.net/vinvl/model_ckpts/coco_ir/large/test_logs/), 175 | 176 | Command [command](https://biglmdiag.blob.core.windows.net/vinvl/model_ckpts/coco_ir/large/philly.yaml). 177 | 178 | Model checkpoint: [ckeckpoint](https://biglmdiag.blob.core.windows.net/vinvl/model_ckpts/coco_ir/large/checkpoint-0132780/). 
179 | 180 | ```bash 181 | python oscar/run_retrieval.py \ 182 | --model_name_or_path vinvl/coco_ir/base/checkpoint-0660000 \ 183 | --do_train \ 184 | --do_lower_case \ 185 | --evaluate_during_training \ 186 | --num_captions_per_img_val 20 \ 187 | --eval_caption_index_file minival_caption_indexs_top20.pt \ 188 | --per_gpu_train_batch_size 16 \ 189 | --learning_rate 7.5e-06 \ 190 | --num_train_epochs 30 \ 191 | --save_steps 5000 \ 192 | --add_od_labels \ 193 | --od_label_type vg \ 194 | --max_seq_length 70 \ 195 | --max_img_seq_length 70 \ 196 | --output_dir output \ 197 | --img_feat_file vinvl/image_features/coco_X152C4_frcnnbig2_exp168model_0060000model.roi_heads.nm_filter_2_model.roi_heads.score_thresh_0.2/model_0060000/features.tsv 198 | ``` 199 | 200 | Script to inference on COCO 1K test set: 201 | ```bash 202 | python oscar/run_retrieval.py \ 203 | --do_test \ 204 | --do_eval \ 205 | --test_split test \ 206 | --num_captions_per_img_val 5 \ 207 | --eval_img_keys_file test_img_keys_1k.tsv \ 208 | --cross_image_eval \ 209 | --per_gpu_eval_batch_size 64 \ 210 | --img_feat_file vinvl/image_features/coco_X152C4_frcnnbig2_exp168model_0060000model.roi_heads.nm_filter_2_model.roi_heads.score_thresh_0.2/model_0060000/features.tsv \ 211 | --eval_model_dir your_model_for_evaluation # could be base/large models. 212 | ``` 213 | 214 | Script to inference on COCO 5K test set: 215 | ```bash 216 | python oscar/run_retrieval.py \ 217 | --do_test \ 218 | --do_eval \ 219 | --test_split test \ 220 | --num_captions_per_img_val 5 \ 221 | --eval_img_keys_file test_img_keys.tsv \ 222 | --cross_image_eval \ 223 | --per_gpu_eval_batch_size 64 \ 224 | --img_feat_file vinvl/image_features/coco_X152C4_frcnnbig2_exp168model_0060000model.roi_heads.nm_filter_2_model.roi_heads.score_thresh_0.2/model_0060000/features.tsv \ 225 | --eval_model_dir your_model_for_evaluation # could be base/large models. 226 | ``` 227 | 228 | 229 | ## Image Captioning on COCO 230 | Script to finetune for base model: 231 | 232 | Pretrained model checkpoint: [pretrained_base.zip](https://biglmdiag.blob.core.windows.net/vinvl/model_ckpts/image_captioning/pretrained_base.zip). 233 | Finetuned model checkpoint (w/ cross entropy): [coco_captioning_base_xe.zip](https://biglmdiag.blob.core.windows.net/vinvl/model_ckpts/image_captioning/coco_captioning_base_xe.zip). 234 | Finetuned model checkpoint (w/ CIDEr optimization): [coco_captioning_base_scst.zip](https://biglmdiag.blob.core.windows.net/vinvl/model_ckpts/image_captioning/coco_captioning_base_scst.zip). 
235 | 236 | 1) First train with cross-entropy loss (8 V100 with 16G mem): 237 | ```bash 238 | python oscar/run_captioning.py \ 239 | --model_name_or_path pretrained_models/image_captioning/pretrained_base \ 240 | --do_train \ 241 | --do_lower_case \ 242 | --add_od_labels \ 243 | --learning_rate 3e-5 \ 244 | --per_gpu_train_batch_size 64 \ 245 | --num_train_epochs 60 \ 246 | --tie_weights \ 247 | --freeze_embedding \ 248 | --label_smoothing 0.1 \ 249 | --drop_worst_ratio 0.2 \ 250 | --drop_worst_after 20000 \ 251 | --output_dir output/ 252 | ``` 253 | 2) Finetune with CIDEr optimization (8 V100 with 32G mem): 254 | ```bash 255 | python oscar/run_captioning.py \ 256 | --model_name_or_path your_checkpoint_from_cross_entropy \ 257 | --do_train \ 258 | --do_lower_case \ 259 | --add_od_labels \ 260 | --learning_rate 3e-6 \ 261 | --per_gpu_train_batch_size 16 \ 262 | --num_train_epochs 75 \ 263 | --tie_weights \ 264 | --freeze_embedding \ 265 | --scst \ 266 | --output_dir output/ 267 | ``` 268 | 269 | Script to finetune for large model: 270 | 271 | Pretrained model checkpoint: [pretrained_large.zip](https://biglmdiag.blob.core.windows.net/vinvl/model_ckpts/image_captioning/pretrained_large.zip). 272 | Finetuned model checkpoint (w/ cross entropy): [coco_captioning_large_xe.zip](https://biglmdiag.blob.core.windows.net/vinvl/model_ckpts/image_captioning/coco_captioning_large_xe.zip). 273 | Finetuned model checkpoint (w/ CIDEr optimization): [coco_captioning_large_scst.zip](https://biglmdiag.blob.core.windows.net/vinvl/model_ckpts/image_captioning/coco_captioning_large_scst.zip). 274 | 275 | 1) First train with cross-entropy loss (8 V100 with 32G mem): 276 | ```bash 277 | python oscar/run_captioning.py \ 278 | --model_name_or_path pretrained_models/image_captioning/pretrained_large \ 279 | --do_train \ 280 | --do_lower_case \ 281 | --add_od_labels \ 282 | --learning_rate 1e-5 \ 283 | --per_gpu_train_batch_size 64 \ 284 | --num_train_epochs 60 \ 285 | --tie_weights \ 286 | --freeze_embedding \ 287 | --label_smoothing 0.1 \ 288 | --drop_worst_ratio 0.2 \ 289 | --drop_worst_after 20000 \ 290 | --output_dir output/ 291 | ``` 292 | 2) Finetune with CIDEr optimization (8 V100 with 32G mem): 293 | ```bash 294 | python oscar/run_captioning.py \ 295 | --model_name_or_path your_checkpoint_from_cross_entropy \ 296 | --do_train \ 297 | --do_lower_case \ 298 | --add_od_labels \ 299 | --learning_rate 8e-7 \ 300 | --per_gpu_train_batch_size 6 \ 301 | --num_train_epochs 25 \ 302 | --tie_weights \ 303 | --freeze_embedding \ 304 | --scst \ 305 | --output_dir output/ 306 | ``` 307 | 308 | Script to inference on COCO test set: 309 | ```bash 310 | python oscar/run_captioning.py \ 311 | --do_test \ 312 | --do_eval \ 313 | --test_yaml test.yaml \ 314 | --per_gpu_eval_batch_size 64 \ 315 | --num_beams 5 \ 316 | --max_gen_length 20 \ 317 | --eval_model_dir your_model_for_evaluation # could be base or large models 318 | ``` 319 | 320 | ## Image Captioning on NoCaps 321 | Note that [NoCaps] (https://nocaps.org/) does not allow to use extra 322 | image-caption pairs for training except COCO. So the model is directly initialized 323 | from bert-base, and trained on COCO data. 324 | 325 | Script to train base model: 326 | 327 | Finetuned model checkpoint (w/ cross entropy): [nocaps_base_xe.zip](https://biglmdiag.blob.core.windows.net/vinvl/model_ckpts/image_captioning/nocaps_base_xe.zip). 
328 | Finetuned model checkpoint (w/ CIDEr optimization): [nocaps_base_scst.zip](https://biglmdiag.blob.core.windows.net/vinvl/model_ckpts/image_captioning/nocaps_base_scst.zip). 329 | 330 | 1) First train with cross-entropy loss (4 V100 with 16G mem): 331 | ```bash 332 | python oscar/run_captioning.py \ 333 | --model_name_or_path bert-base-uncased \ 334 | --do_train \ 335 | --do_lower_case \ 336 | --add_od_labels \ 337 | --learning_rate 0.0001 \ 338 | --per_gpu_train_batch_size 64 \ 339 | --num_train_epochs 30 \ 340 | --tie_weights \ 341 | --freeze_embedding \ 342 | --output_dir output/ 343 | ``` 344 | 2) Train with CIDEr optimization (8 V100 with 32G mem): 345 | ```bash 346 | python oscar/run_captioning.py \ 347 | --model_name_or_path your_checkpoint_from_cross_entropy \ 348 | --do_train \ 349 | --do_lower_case \ 350 | --add_od_labels \ 351 | --scheduler constant \ 352 | --learning_rate 5e-6 \ 353 | --per_gpu_train_batch_size 14 \ 354 | --num_train_epochs 50 \ 355 | --tie_weights \ 356 | --freeze_embedding \ 357 | --scst \ 358 | --output_dir output/ 359 | ``` 360 | 361 | Script to inference on NoCaps val set with Constrained Beam Search: 362 | ```bash 363 | python oscar/run_captioning.py \ 364 | --do_test \ 365 | --do_eval \ 366 | --data_dir datasets/nocaps \ 367 | --test_yaml val.yaml \ 368 | --per_gpu_eval_batch_size 2 \ 369 | --num_beams 5 \ 370 | --use_cbs \ 371 | --max_gen_length 20 \ 372 | --eval_model_dir your_model_for_evaluation 373 | ``` 374 | 375 | 383 | 384 | ## Oscarplus pretraining 385 | Table 16 below shows the statistics of image and text of the pre-training corpora. 386 | In our ablation study, we have corpora of three different sizes: [Small](https://biglmdiag.blob.core.windows.net/vinvl/pretrain_corpus/coco_flickr30k_gqa_x152c4big2exp168.yaml), [Medium](https://biglmdiag.blob.core.windows.net/vinvl/pretrain_corpus/coco_flickr30k_gqa_oi_x152c4big2exp168.yaml), [Large](https://biglmdiag.blob.core.windows.net/vinvl/pretrain_corpus/coco_flickr30k_googlecc_gqa_sbu_oi_x152c4big2exp168.yaml). 387 | Notice that we make use of image tagging datasets OpenImages, by generating captions using OSCAR's image captioning model to form triplets of ``(generated caption, image tags, image features)'' for the OSCAR+ pre-training. 388 | 389 | 390 | Script to perform oscar+ pretraining with the [large corpus](https://biglmdiag.blob.core.windows.net/vinvl/pretrain_corpus/coco_flickr30k_googlecc_gqa_sbu_oi_x152c4big2exp168.yaml). 391 | ```bash 392 | python -m torch.distributed.launch --nproc_per_node=8 oscar/run_oscarplus_pretrain.py \ 393 | --use_b 1 \ 394 | --max_grad_norm 10.0 --gradient_accumulation_steps 1 \ 395 | --use_img_layernorm 1 \ 396 | --output_dir \ 397 | --bert_model bert --model_name_or_path bert-base-uncased \ 398 | --do_lower_case --learning_rate 5e-05 399 | --warmup_steps 0 --do_train --max_seq_length 35 --on_memory \ 400 | --max_img_seq_length 50 --img_feature_dim 2054 \ 401 | --drop_out 0.1 --train_batch_size 8 \ 402 | --ckpt_period 10000 --max_iters 2000000 --log_period 100 \ 403 | --data_dir --dataset_file coco_flickr30k_googlecc_gqa_sbu_oi_x152c4big2exp168.yaml 404 | --textb_sample_mode 1 --texta_false_prob 0.25 405 | ``` 406 | 407 | 408 | One can perform the vanilla OSCAR pretraining by setting 409 | ```bash 410 | --textb_sample_mode 0 --texta_false_prob 0.0 411 | ``` 412 | 413 | One can also split the large pretraining corpus into two parts, i.e., coco_flickr30k_gqa + googlecc_sbu_oi, and use different textb_sample_modes for them. 
414 | Setting textb_sample_mode=2 for coco_flickr30k_gqa has the potential to emphasize the QA pairs in the small corpus.
415 | ```bash
416 |     --data_dir your_data_folder --dataset_file coco_flickr30k_gqa_x152c4big2exp168.yaml \
417 |     --textb_sample_mode 2 --texta_false_prob 0.25 \
418 |     --extra_dataset_file googlecc_sbu_oi_x152c4big2exp168.yaml \
419 |     --extra_textb_sample_mode 1 --extra_loss_weight 0.5
420 | ```
--------------------------------------------------------------------------------
/docs/oscar.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/Oscar/266075fef2486846bb7110fbb6232074e09e076d/docs/oscar.PNG
--------------------------------------------------------------------------------
/docs/oscar_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/Oscar/266075fef2486846bb7110fbb6232074e09e076d/docs/oscar_logo.png
--------------------------------------------------------------------------------
/docs/pretrain_corpus.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/microsoft/Oscar/266075fef2486846bb7110fbb6232074e09e076d/docs/pretrain_corpus.PNG
--------------------------------------------------------------------------------
/oscar/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.1.0"
2 | 
--------------------------------------------------------------------------------
/oscar/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | __version__ = "0.1.0"
2 | 
--------------------------------------------------------------------------------
/oscar/datasets/build.py:
--------------------------------------------------------------------------------
1 | import os
2 | import logging
3 | import torch
4 | from oscar.utils.misc import get_world_size
5 | from .oscar_tsv import OscarTSVDataset
6 | from transformers.pytorch_transformers import BertTokenizer
7 | 
8 | 
9 | class BatchCollator(object):
10 |     """
11 |     From a list of samples from the dataset,
12 |     returns the images and targets.
13 |     """
14 |     def __call__(self, batch):
15 |         return list(zip(*batch))
16 | 
17 | 
18 | def build_dataset(args):
19 |     """
20 |     Arguments:
21 |         args: configuration.
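    Returns:
        a list with one OscarTSVDataset built from args.dataset_file, plus a
        second dataset when args.extra_dataset_file is set.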
22 | """ 23 | full_yaml_file = os.path.join(args.data_dir, args.dataset_file) 24 | assert os.path.isfile(full_yaml_file) 25 | 26 | tokenizer = BertTokenizer.from_pretrained( 27 | args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, 28 | do_lower_case=args.do_lower_case) 29 | 30 | cfg = dict( 31 | yaml_file=full_yaml_file, 32 | args=args, 33 | seq_len=args.max_seq_length, 34 | on_memory=args.on_memory, 35 | tokenizer=tokenizer, 36 | ) 37 | # make dataset from factory 38 | datasets = [OscarTSVDataset(**cfg)] 39 | if args.extra_dataset_file: 40 | full_yaml_file = os.path.join(args.data_dir, args.extra_dataset_file) 41 | assert os.path.isfile(full_yaml_file) 42 | cfg['yaml_file'] = full_yaml_file 43 | cfg['textb_sample_mode'] = args.extra_textb_sample_mode 44 | datasets.append(OscarTSVDataset(**cfg)) 45 | 46 | return datasets 47 | 48 | 49 | def make_data_sampler(dataset, shuffle, distributed): 50 | if distributed: 51 | return torch.utils.data.distributed.DistributedSampler( 52 | dataset, shuffle=shuffle 53 | ) 54 | if shuffle: 55 | sampler = torch.utils.data.sampler.RandomSampler(dataset) 56 | else: 57 | sampler = torch.utils.data.sampler.SequentialSampler(dataset) 58 | return sampler 59 | 60 | 61 | class IterationBasedBatchSampler(torch.utils.data.sampler.BatchSampler): 62 | """ 63 | Wraps a BatchSampler, resampling from it until 64 | a specified number of iterations have been sampled 65 | """ 66 | 67 | def __init__(self, batch_sampler, num_iterations, start_iter=0): 68 | self.batch_sampler = batch_sampler 69 | self.num_iterations = num_iterations 70 | self.start_iter = start_iter 71 | 72 | def __iter__(self): 73 | iteration = self.start_iter 74 | while iteration <= self.num_iterations: 75 | # if the underlying sampler has a set_epoch method, like 76 | # DistributedSampler, used for making each process see 77 | # a different split of the dataset, then set it 78 | if hasattr(self.batch_sampler.sampler, "set_epoch"): 79 | self.batch_sampler.sampler.set_epoch(iteration) 80 | for batch in self.batch_sampler: 81 | iteration += 1 82 | if iteration > self.num_iterations: 83 | break 84 | yield batch 85 | 86 | def __len__(self): 87 | return self.num_iterations 88 | 89 | 90 | def make_batch_data_sampler( 91 | sampler, images_per_batch, num_iters=None, 92 | start_iter=0 93 | ): 94 | batch_sampler = torch.utils.data.sampler.BatchSampler( 95 | sampler, images_per_batch, drop_last=False 96 | ) 97 | if num_iters is not None and num_iters >= 0: 98 | batch_sampler = IterationBasedBatchSampler( 99 | batch_sampler, num_iters, start_iter 100 | ) 101 | return batch_sampler 102 | 103 | 104 | def make_data_loader(args, is_distributed=False, arguments=None): 105 | num_gpus = get_world_size() 106 | # figure out start iteration 107 | if arguments is None: 108 | start_iter = 0 109 | else: 110 | start_iter = arguments['iteration'] 111 | # figure out the batchsize 112 | grad_accumulate_steps = 1 113 | if hasattr(args, 'gradient_accumulation_steps'): 114 | grad_accumulate_steps = args.gradient_accumulation_steps 115 | assert ( 116 | args.train_batch_size % grad_accumulate_steps == 0 117 | ), "train_batch_size ({}) must be divisible by the number " 118 | "of Gradient accumulation ({}) used."\ 119 | .format(args.train_batch_size, grad_accumulate_steps) 120 | images_per_batch = args.train_batch_size//grad_accumulate_steps 121 | assert ( 122 | images_per_batch % num_gpus == 0 123 | ), "SOLVER.IMS_PER_BATCH ({}) must be divisible by the number " 124 | "of GPUs ({}) used.".format(images_per_batch, num_gpus) 
125 | images_per_gpu = images_per_batch // num_gpus 126 | logger = logging.getLogger(__name__) 127 | logger.info("Train with {} images per GPU".format(images_per_gpu)) 128 | shuffle = True 129 | num_iters = args.max_iters * grad_accumulate_steps 130 | 131 | # build dataset 132 | datasets = build_dataset(args) 133 | 134 | data_loaders = [] 135 | for i, dataset in enumerate(datasets): 136 | sampler = make_data_sampler(dataset, shuffle, is_distributed) 137 | 138 | batch_sampler = make_batch_data_sampler( 139 | sampler, images_per_gpu, num_iters, start_iter 140 | ) 141 | num_workers = args.num_workers 142 | data_loader = torch.utils.data.DataLoader( 143 | dataset, 144 | num_workers=num_workers, 145 | batch_sampler=batch_sampler, 146 | collate_fn=BatchCollator(), 147 | pin_memory=True, 148 | ) 149 | data_loaders.append(data_loader) 150 | return data_loaders 151 | -------------------------------------------------------------------------------- /oscar/modeling/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.1.0" 2 | -------------------------------------------------------------------------------- /oscar/run_oscarplus_pretrain.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import, division, print_function 2 | 3 | import argparse 4 | import datetime 5 | import json 6 | import logging 7 | import os 8 | import random 9 | import sys 10 | import time 11 | import math 12 | import shutil 13 | 14 | sys.path.insert(0, '.') 15 | 16 | import numpy as np 17 | import torch 18 | 19 | from oscar.modeling.modeling_bert import BertImgForPreTraining 20 | from transformers.pytorch_transformers import (WEIGHTS_NAME, BertConfig, 21 | BertTokenizer) 22 | 23 | from oscar.datasets.build import make_data_loader 24 | 25 | from transformers.pytorch_transformers import AdamW, WarmupLinearSchedule 26 | from oscar.utils.misc import mkdir, get_rank 27 | from oscar.utils.metric_logger import TensorboardLogger 28 | 29 | logger = logging.getLogger(__name__) 30 | 31 | ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig,)), ()) 32 | 33 | MODEL_CLASSES = { 34 | 'bert': (BertConfig, BertImgForPreTraining, BertTokenizer), 35 | } 36 | 37 | 38 | """ ****** Pretraining ****** """ 39 | 40 | 41 | def main(): 42 | parser = argparse.ArgumentParser() 43 | 44 | ## Required parameters 45 | parser.add_argument("--data_dir", default=None, type=str, required=False, 46 | help="The input data dir. 
" 47 | "Should contain the .yaml files for the task.") 48 | parser.add_argument("--dataset_file", default=None, type=str, required=True, 49 | help="The training dataset yaml file.") 50 | parser.add_argument("--extra_dataset_file", default=None, type=str, required=False, 51 | help="The extra training dataset yaml file.") 52 | parser.add_argument("--bert_model", default=None, type=str, required=True, 53 | help="Bert pre-trained model selected in the list: bert-base-uncased, " 54 | "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.") 55 | parser.add_argument("--output_dir", default=None, type=str, required=True, 56 | help="The output directory where the model checkpoints will be written.") 57 | 58 | # image chunks 59 | parser.add_argument("--chunk_start_id", default=-1, type=int, 60 | help="Image Chunk Start ID") 61 | parser.add_argument("--chunk_end_id", default=-1, type=int, 62 | help="Image Chunk End ID") 63 | 64 | ## Image parameters 65 | parser.add_argument("--max_img_seq_length", default=50, type=int, 66 | help="The maximum total input image sequence length.") 67 | parser.add_argument("--img_feature_dim", default=2054, type=int, 68 | help="The Image Feature Dimension.") 69 | parser.add_argument("--img_feature_type", default='faster_r-cnn', type=str, 70 | help="faster_r-cnn or mask_r-cnn") 71 | parser.add_argument("--use_layernorm", action='store_true', 72 | help="use_layernorm") 73 | 74 | parser.add_argument("--drop_out", default=0.1, type=float, 75 | help="Drop out for BERT.") 76 | 77 | parser.add_argument("--use_b", type=int, default=1, help="use_b") 78 | parser.add_argument("--textb_sample_mode", type=int, default=0, 79 | help="0: sample from both texta&textb, " 80 | "1: sample from textb, " 81 | "2: sample from QA answers") 82 | parser.add_argument("--extra_textb_sample_mode", type=int, default=1) 83 | parser.add_argument("--texta_false_prob", type=float, default=0.0, 84 | help="the probality that we sample wrong texta, should in [0.0, 0.5]") 85 | 86 | parser.add_argument("--model_name_or_path", default=None, type=str, 87 | required=True, 88 | help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join( 89 | ALL_MODELS)) 90 | parser.add_argument("--config_name", default="", type=str, 91 | help="Pretrained config name or path if not the same as model_name") 92 | parser.add_argument("--tokenizer_name", default="", type=str, 93 | help="Pretrained tokenizer name or path if not the same as model_name") 94 | parser.add_argument("--cache_dir", default="", type=str, 95 | help="Where do you want to store the pre-trained models downloaded from s3") 96 | 97 | parser.add_argument("--max_seq_length", default=35, type=int, 98 | help="The maximum total input sequence length after WordPiece tokenization. 
\n" 99 | "Sequences longer than this will be truncated, and sequences shorter than this will be padded.") 100 | parser.add_argument("--do_train", action='store_true', 101 | help="Whether to run training.") 102 | parser.add_argument("--learning_rate", default=5e-5, type=float, 103 | help="The initial learning rate for Adam.") 104 | parser.add_argument("--max_iters", default=2000000, type=int, 105 | help="Maximal number of training iterations.") 106 | parser.add_argument("--train_batch_size", default=1024, type=int, 107 | help="Batch size for training.") 108 | parser.add_argument("--num_workers", default=6, type=int, 109 | help="Number of workers for dataset.") 110 | parser.add_argument("--adam_epsilon", default=1e-8, type=float, 111 | help="Epsilon for Adam optimizer.") 112 | parser.add_argument("--optim", default='adamw', type=str, 113 | help="The optimizer used for Bert, [adamw, lamb], default: adamw") 114 | parser.add_argument("--max_grad_norm", default=-1.0, type=float, help="Max gradient norm.") 115 | parser.add_argument("--warmup_steps", default=0, type=int, 116 | help="Linear warmup over warmup_steps.") 117 | parser.add_argument("--no_cuda", action='store_true', 118 | help="Whether not to use CUDA when available") 119 | parser.add_argument("--on_memory", action='store_true', 120 | help="Whether to load train samples into memory or use disk") 121 | parser.add_argument("--do_lower_case", action='store_true', 122 | help="Whether to lower case the input text. True for uncased models, False for cased models.") 123 | parser.add_argument("--local_rank", type=int, default=-1, 124 | help="local_rank for distributed training on gpus") 125 | parser.add_argument('--seed', type=int, default=42, 126 | help="random seed for initialization") 127 | parser.add_argument('--gradient_accumulation_steps', type=int, default=1, 128 | help="Number of updates steps to accumualte before performing a backward/update pass.") 129 | 130 | parser.add_argument("--from_scratch", action='store_true', 131 | help="train from scratch") 132 | parser.add_argument("--use_img_layernorm", type=int, default=0, 133 | help="Normalize image features with bertlayernorm") 134 | parser.add_argument("--img_layer_norm_eps", default=1e-12, type=float, 135 | help="The eps in image feature laynorm layer") 136 | # distributed 137 | parser.add_argument('--gpu_ids', type=str, default='-1') 138 | parser.add_argument("--mask_loss_for_unmatched", type=int, default=1, 139 | help="masked language model loss for unmatched triplets") 140 | parser.add_argument("--extra_loss_weight", type=float, default=0.0, 141 | help="the loss weight for the extra train data batch (should be in [0,1])") 142 | parser.add_argument( 143 | "--use_gtlabels", 144 | type=int, default=1, 145 | help="use groundtruth labels for text b or not" 146 | ) 147 | # logging 148 | parser.add_argument('--ckpt_period', type=int, default=10000, 149 | help="Period for saving checkpoint") 150 | parser.add_argument('--log_period', type=int, default=100, 151 | help="Period for saving logging info") 152 | args = parser.parse_args() 153 | 154 | if args.gpu_ids != '-1': 155 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_ids 156 | 157 | args.num_gpus = int( 158 | os.environ["WORLD_SIZE"]) if "WORLD_SIZE" in os.environ else 1 159 | args.distributed = args.num_gpus > 1 160 | 161 | if args.gpu_ids != '-1': 162 | os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_ids 163 | 164 | if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train: 165 | logger.info("Output Directory 
Exists.") 166 | 167 | # Setup CUDA, GPU & distributed training 168 | if args.local_rank == -1 or args.no_cuda: 169 | device = torch.device( 170 | "cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") 171 | args.n_gpu = torch.cuda.device_count() 172 | else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs 173 | torch.cuda.set_device(args.local_rank) 174 | device = torch.device("cuda", args.local_rank) 175 | torch.distributed.init_process_group( 176 | backend='nccl', init_method="env://" 177 | ) 178 | args.n_gpu = 1 179 | args.device = device 180 | 181 | # Setup logging 182 | logging.basicConfig( 183 | format='%(asctime)s - %(levelname)s - %(name)s - %(message)s', 184 | datefmt='%m/%d/%Y %H:%M:%S', 185 | level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN) 186 | logger.warning( 187 | "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s", 188 | args.local_rank, device, args.n_gpu, bool(args.local_rank != -1) 189 | ) 190 | 191 | if args.gradient_accumulation_steps < 1: 192 | raise ValueError( 193 | "Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( 194 | args.gradient_accumulation_steps)) 195 | 196 | random.seed(args.seed) 197 | np.random.seed(args.seed) 198 | torch.manual_seed(args.seed) 199 | if args.n_gpu > 0: 200 | torch.cuda.manual_seed_all(args.seed) 201 | 202 | if not args.do_train: 203 | raise ValueError( 204 | "Training is currently the only implemented execution option. Please set `do_train`.") 205 | 206 | if not os.path.exists(args.output_dir): 207 | mkdir(args.output_dir) 208 | 209 | last_checkpoint_dir = None 210 | arguments = {"iteration": 0} 211 | if os.path.exists(args.output_dir): 212 | save_file = os.path.join(args.output_dir, "last_checkpoint") 213 | try: 214 | with open(save_file, "r") as f: 215 | last_saved = f.read() 216 | last_saved = last_saved.strip() 217 | except IOError: 218 | # if file doesn't exist, maybe because it has just been 219 | # deleted by a separate process 220 | last_saved = "" 221 | if last_saved: 222 | folder_name = os.path.splitext(last_saved.split('/')[0])[0] # in the form of checkpoint-00001 or checkpoint-00001/pytorch_model.bin 223 | last_checkpoint_dir = os.path.join(args.output_dir, folder_name) 224 | arguments["iteration"] = int(folder_name.split('-')[-1]) 225 | assert os.path.isfile(os.path.join(last_checkpoint_dir, WEIGHTS_NAME)), "Last_checkpoint detected, but file not found!" 
226 | 227 | # model first 228 | if get_rank() != 0: 229 | torch.distributed.barrier() 230 | config_class, model_class, tokenizer_class = MODEL_CLASSES[args.bert_model] 231 | if last_checkpoint_dir is not None: # recovery 232 | args.model_name_or_path = last_checkpoint_dir 233 | logger.info(" -> Recovering model from {}".format(last_checkpoint_dir)) 234 | 235 | config = config_class.from_pretrained( 236 | args.config_name if args.config_name else args.model_name_or_path, 237 | ) 238 | config.img_layer_norm_eps = args.img_layer_norm_eps 239 | config.use_img_layernorm = args.use_img_layernorm 240 | 241 | # discrete code 242 | config.img_feature_dim = args.img_feature_dim 243 | config.img_feature_type = args.img_feature_type 244 | config.hidden_dropout_prob = args.drop_out 245 | if args.texta_false_prob < 0.5 and (args.texta_false_prob > 0 or not args.use_b): 246 | args.num_contrast_classes = 3 247 | else: 248 | args.num_contrast_classes = 2 249 | config.num_contrast_classes = args.num_contrast_classes 250 | 251 | # Prepare model 252 | # model = BertForPreTraining.from_pretrained(args.bert_model) 253 | load_num = 0 254 | while load_num < 10: 255 | try: 256 | model = BertImgForPreTraining.from_pretrained( 257 | args.model_name_or_path, 258 | from_tf=bool('.ckpt' in args.model_name_or_path), 259 | config=config) 260 | break 261 | except: 262 | load_num += 1 263 | 264 | # train from scratch 265 | if args.from_scratch: 266 | if last_checkpoint_dir is None: 267 | logger.info("Training from scratch ... ") 268 | model.apply(model.init_weights) 269 | total_params = sum(p.numel() for p in model.parameters()) 270 | logger.info( 271 | 'Total Parameters: {}'.format(total_params)) 272 | 273 | for key, val in vars(config).items(): 274 | setattr(args, key, val) 275 | 276 | if get_rank() == 0 and args.local_rank != -1: 277 | torch.distributed.barrier() 278 | 279 | model.to(args.device) 280 | 281 | logger.info("Training/evaluation parameters %s", args) 282 | 283 | tb_log_dir = os.path.join(args.output_dir, 'train_logs') 284 | meters = TensorboardLogger( 285 | log_dir=tb_log_dir, 286 | delimiter=" ", 287 | ) 288 | 289 | # Prepare optimizer 290 | param_optimizer = list(model.named_parameters()) 291 | no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight'] 292 | optimizer_grouped_parameters = [ 293 | {'params': [p for n, p in param_optimizer if 294 | not any(nd in n for nd in no_decay)], 295 | 'weight_decay': 0.01}, 296 | {'params': [p for n, p in param_optimizer if 297 | any(nd in n for nd in no_decay)], 'weight_decay': 0.0} 298 | ] 299 | 300 | optimizer = AdamW(optimizer_grouped_parameters, 301 | lr=args.learning_rate, eps=args.adam_epsilon) 302 | scheduler = WarmupLinearSchedule(optimizer, 303 | warmup_steps=args.warmup_steps, 304 | t_total=args.max_iters) 305 | 306 | if arguments['iteration'] > 0 and os.path.isfile(os.path.join(last_checkpoint_dir, 'optimizer.pth')): # recovery 307 | logger.info( 308 | "Load BERT optimizer from {}".format(last_checkpoint_dir)) 309 | optimizer_to_load = torch.load( 310 | os.path.join(last_checkpoint_dir, 'optimizer.pth'), 311 | map_location=torch.device("cpu")) 312 | optimizer.load_state_dict(optimizer_to_load.pop("optimizer")) 313 | scheduler.load_state_dict(optimizer_to_load.pop("scheduler")) 314 | 315 | if args.distributed: 316 | model = torch.nn.parallel.DistributedDataParallel( 317 | model, device_ids=[args.local_rank], output_device=args.local_rank, 318 | find_unused_parameters=True) 319 | elif args.n_gpu > 1: 320 | model = torch.nn.DataParallel(model) 321 | 322 | # 
train_examples = None 323 | train_dataloaders = make_data_loader( 324 | args, is_distributed=args.distributed, arguments=arguments 325 | ) 326 | 327 | if isinstance(train_dataloaders, list): 328 | train_dataloader = train_dataloaders[0] 329 | else: 330 | train_dataloader = train_dataloaders 331 | train_dataloader_extra = [None] * len(train_dataloader) 332 | if isinstance(train_dataloaders, list) and len(train_dataloaders) > 1: 333 | logger.info("Having two train dataloaders!") 334 | train_dataloader_extra = train_dataloaders[1] 335 | tokenizer = train_dataloader.dataset.tokenizer 336 | 337 | # torch.backends.cudnn.benchmark = True 338 | 339 | max_iter = len(train_dataloader) 340 | start_iter = arguments["iteration"] 341 | logger.info("***** Running training *****") 342 | logger.info(" Num examples = {}".format(len(train_dataloader.dataset))) 343 | logger.info(" Instantaneous batch size = %d", 344 | args.train_batch_size // args.gradient_accumulation_steps) 345 | logger.info( 346 | " Total train batch size (w. parallel, distributed & accumulation) = %d", 347 | args.train_batch_size) 348 | logger.info(" Gradient Accumulation steps = %d", 349 | args.gradient_accumulation_steps) 350 | logger.info(" Total optimization steps = %d", 351 | max_iter // args.gradient_accumulation_steps) 352 | 353 | log_json = {} 354 | 355 | model.train() 356 | model.zero_grad() 357 | 358 | clock_started = False 359 | # Every args.ckpt_period, report train_score and save model 360 | tr_loss = 0 361 | nb_tr_examples, nb_tr_steps = 0, 0 362 | for step, (batch, batch_extra) in enumerate(zip(train_dataloader, train_dataloader_extra), start_iter): 363 | if not clock_started: 364 | start_training_time = time.time() 365 | end = time.time() 366 | clock_started = True 367 | 368 | def data_process(mini_batch): 369 | images, targets, qa_inds = \ 370 | mini_batch[0], mini_batch[1], mini_batch[2] 371 | targets_transposed = list(zip(*targets)) 372 | input_ids = torch.stack(targets_transposed[0]).to(args.device, non_blocking=True) 373 | input_mask = torch.stack(targets_transposed[1]).to(args.device, non_blocking=True) 374 | segment_ids = torch.stack(targets_transposed[2]).to(args.device, non_blocking=True) 375 | lm_label_ids = torch.stack(targets_transposed[3]).to(args.device, non_blocking=True) 376 | is_next = torch.stack(targets_transposed[4]).to(args.device, non_blocking=True) 377 | is_img_match = torch.stack(targets_transposed[5]).to(args.device, non_blocking=True) 378 | 379 | return images, input_ids, input_mask, segment_ids, lm_label_ids, is_next 380 | 381 | images1, input_ids1, input_mask1, segment_ids1, lm_label_ids1, is_next1 \ 382 | = data_process(batch) 383 | if batch_extra is not None: 384 | images2, input_ids2, input_mask2, segment_ids2, lm_label_ids2, is_next2 \ 385 | = data_process(batch_extra) 386 | 387 | data_time = time.time() - end 388 | 389 | def forward_backward(images, input_ids, input_mask, segment_ids, 390 | lm_label_ids, is_next, loss_weight=1.0): 391 | # feature as input 392 | image_features = torch.stack(images).to(args.device, non_blocking=True) 393 | 394 | outputs = model(input_ids, segment_ids, input_mask, 395 | lm_label_ids, is_next, img_feats=image_features) 396 | 397 | loss = loss_weight * outputs[0] 398 | 399 | if args.n_gpu > 1: 400 | loss = loss.mean() # mean() to average on multi-gpu. 
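# Scale the loss so that gradients accumulated over several micro-batches
# match a single pass over the full batch before the optimizer step.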
401 | 402 | if args.gradient_accumulation_steps > 1: 403 | loss = loss / args.gradient_accumulation_steps 404 | loss.backward() 405 | 406 | return loss.item(), input_ids.size(0) 407 | 408 | start1 = time.time() 409 | loss1, nb_tr_example1 = forward_backward( 410 | images1, input_ids1, input_mask1, 411 | segment_ids1, lm_label_ids1, is_next1, 412 | loss_weight=1.0-args.extra_loss_weight 413 | ) 414 | tr_loss += loss1 415 | nb_tr_examples += nb_tr_example1 416 | compute_time1 = time.time() - start1 417 | 418 | loss2, nb_tr_example2 = 0.0, 0 419 | compute_time2 = 0.0 420 | if batch_extra is not None: 421 | start2 = time.time() 422 | loss2, nb_tr_example2 = forward_backward( 423 | images2, input_ids2, input_mask2, 424 | segment_ids2, lm_label_ids2, is_next2, 425 | loss_weight=args.extra_loss_weight 426 | ) 427 | tr_loss += loss2 428 | nb_tr_examples += nb_tr_example2 429 | compute_time2 = time.time() - start2 430 | 431 | nb_tr_steps += 1 432 | arguments["iteration"] = step + 1 433 | 434 | if (step + 1) % args.gradient_accumulation_steps == 0: 435 | # do gradient clipping 436 | if args.max_grad_norm > 0: 437 | torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) 438 | # do the optimization steps 439 | optimizer.step() 440 | scheduler.step() # Update learning rate schedule 441 | optimizer.zero_grad() 442 | 443 | # measure elapsed time 444 | batch_time = time.time() - end 445 | end = time.time() 446 | metrics_to_log = { 447 | 'time_info': {'compute': batch_time, 'data': data_time, 448 | 'compute1': compute_time1, 449 | 'compute2': compute_time2}, 450 | 'batch_metrics': {'loss': loss1+loss2} 451 | } 452 | params_to_log = {'params': {'bert_lr': optimizer.param_groups[0]["lr"]}} 453 | meters.update_metrics(metrics_to_log) 454 | meters.update_params(params_to_log) 455 | 456 | if args.log_period > 0 and (step + 1) % args.log_period == 0: 457 | avg_time = meters.meters['time_info']['compute'].global_avg 458 | eta_seconds = avg_time * (max_iter - step - 1) 459 | eta_string = str( 460 | datetime.timedelta(seconds=int(eta_seconds))) 461 | logger.info( 462 | meters.delimiter.join( 463 | [ 464 | "eta: {eta}", 465 | "iter: {iter}", 466 | "max mem: {memory:.0f}", 467 | ] 468 | ).format( 469 | eta=eta_string, 470 | iter=step + 1, 471 | memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0, 472 | ) + "\n " + meters.get_logs(step + 1) 473 | ) 474 | 475 | if (step + 1) == max_iter or (step + 1) % args.ckpt_period == 0: # Save a trained model 476 | log_json[step+1] = tr_loss 477 | train_metrics_total = torch.Tensor([tr_loss, nb_tr_examples, nb_tr_steps]).to(args.device) 478 | torch.distributed.all_reduce(train_metrics_total) 479 | # reset metrics 480 | tr_loss = 0 481 | nb_tr_examples, nb_tr_steps = 0, 0 482 | 483 | if get_rank() == 0: 484 | # report metrics 485 | train_score_gathered = train_metrics_total[0] / \ 486 | train_metrics_total[2] 487 | logger.info("PROGRESS: {}%".format( 488 | round(100 * (step + 1) / max_iter, 4))) 489 | logger.info( 490 | "EVALERR: {}%".format(train_score_gathered)) 491 | meters.update_metrics( 492 | { 493 | 'epoch_metrics': {'ex_cnt': train_metrics_total[1], 494 | 'loss': train_score_gathered} 495 | } 496 | ) 497 | with open(os.path.join(args.output_dir, 'loss_logs.json'), 498 | 'w') as fp: 499 | json.dump(log_json, fp) 500 | 501 | # save checkpoint 502 | output_dir = os.path.join(args.output_dir, 503 | 'checkpoint-{:07d}'.format( 504 | step + 1)) 505 | if not os.path.exists(output_dir): 506 | os.makedirs(output_dir) 507 | model_to_save = model.module if 
hasattr( 508 | model, 509 | 'module') else model # Take care of distributed/parallel training 510 | optimizer_to_save = { 511 | "optimizer": optimizer.state_dict(), 512 | "scheduler": scheduler.state_dict()} 513 | 514 | save_num = 0 515 | while save_num < 10: 516 | try: 517 | model_to_save.save_pretrained(output_dir) 518 | torch.save(args, os.path.join(output_dir, 519 | 'training_args.bin')) 520 | tokenizer.save_pretrained(output_dir) 521 | torch.save(optimizer_to_save, 522 | os.path.join(output_dir, 523 | 'optimizer.pth')) 524 | save_file = os.path.join(args.output_dir, "last_checkpoint") 525 | with open(save_file, "w") as f: 526 | f.write('checkpoint-{:07d}/pytorch_model.bin'.format(step + 1)) 527 | break 528 | except: 529 | save_num += 1 530 | logger.info( 531 | "Saving model checkpoint {0} to {1}".format( 532 | step + 1, output_dir)) 533 | 534 | if clock_started: 535 | total_training_time = time.time() - start_training_time 536 | else: 537 | total_training_time = 0.0 538 | total_time_str = str(datetime.timedelta(seconds=total_training_time)) 539 | logger.info( 540 | "Total training time: {} ({:.4f} s / it)".format( 541 | total_time_str, total_training_time / max_iter 542 | ) 543 | ) 544 | # close the tb logger 545 | meters.close() 546 | 547 | 548 | if __name__ == "__main__": 549 | main() 550 | -------------------------------------------------------------------------------- /oscar/run_retrieval.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Microsoft Corporation. Licensed under the MIT license. 2 | 3 | from __future__ import absolute_import, division, print_function 4 | import argparse 5 | import os 6 | import base64 7 | import os.path as op 8 | import random, json 9 | import numpy as np 10 | import torch 11 | import torch.nn as nn 12 | from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler 13 | from tqdm import tqdm 14 | 15 | from oscar.utils.tsv_file import TSVFile 16 | from oscar.utils.logger import setup_logger 17 | from oscar.utils.misc import mkdir, set_seed 18 | from oscar.modeling.modeling_bert import ImageBertForSequenceClassification 19 | from transformers.pytorch_transformers import BertTokenizer, BertConfig 20 | from transformers.pytorch_transformers import AdamW, WarmupLinearSchedule, WarmupConstantSchedule 21 | 22 | 23 | class RetrievalDataset(Dataset): 24 | """ Image/Text Retrieval Dataset""" 25 | def __init__(self, tokenizer, args, split='train', is_train=True): 26 | """ 27 | tokenizer: tokenizer to process caption text. 28 | args: configureation parameters including max_seq_length, etc. 29 | split: used to infer the data used for training or testing. 
30 | All files are in .pt format of a dictionary with image keys and 31 | image features (pytorch tensors), captions (list of str, support multiple 32 | captions per image), labels (list of dictionary or str of all labels), 33 | 34 | """ 35 | super(RetrievalDataset, self).__init__() 36 | self.img_file = args.img_feat_file 37 | caption_file = op.join(args.data_dir, '{}_captions.pt'.format(split)) 38 | self.img_tsv = TSVFile(self.img_file) 39 | self.captions = torch.load(caption_file) 40 | self.img_keys = list(self.captions.keys()) # img_id as int 41 | if not type(self.captions[self.img_keys[0]]) == list: 42 | self.captions = {k: json.loads(self.captions[k]) for k in self.img_keys} 43 | 44 | # get the image image_id to index map 45 | imgid2idx_file = op.join(op.dirname(self.img_file), 'imageid2idx.json') 46 | self.image_id2idx = json.load(open(imgid2idx_file)) # img_id as string 47 | 48 | if args.add_od_labels: 49 | label_data_dir = op.dirname(self.img_file) 50 | label_file = os.path.join(label_data_dir, "predictions.tsv") 51 | self.label_tsv = TSVFile(label_file) 52 | self.labels = {} 53 | for line_no in range(self.label_tsv.num_rows()): 54 | row = self.label_tsv.seek(line_no) 55 | image_id = row[0] 56 | if int(image_id) in self.img_keys: 57 | results = json.loads(row[1]) 58 | objects = results['objects'] if type( 59 | results) == dict else results 60 | self.labels[int(image_id)] = { 61 | "image_h": results["image_h"] if type( 62 | results) == dict else 600, 63 | "image_w": results["image_w"] if type( 64 | results) == dict else 800, 65 | "class": [cur_d['class'] for cur_d in objects], 66 | "boxes": np.array([cur_d['rect'] for cur_d in objects], 67 | dtype=np.float32) 68 | } 69 | self.label_tsv._fp.close() 70 | self.label_tsv._fp = None 71 | 72 | if is_train: 73 | self.num_captions_per_img = args.num_captions_per_img_train 74 | else: 75 | self.num_captions_per_img = args.num_captions_per_img_val 76 | if args.eval_img_keys_file: 77 | # select a subset of image keys for evaluation. eg. COCO 1k and 5k 78 | # eval_img_keys_file is a list of image keys saved in tsv file 79 | with open(op.join(args.data_dir, args.eval_img_keys_file), 'r') as f: 80 | img_keys = f.readlines() 81 | self.img_keys = [int(k.strip()) for k in img_keys] 82 | self.captions = {k: self.captions[k] for k in self.img_keys} 83 | if args.add_od_labels: 84 | self.labels = {k: self.labels[k] for k in self.img_keys} 85 | 86 | if args.eval_caption_index_file: 87 | # hard negative image/caption indexs for retrieval re-rank setting. 88 | # useful for mini val set to monitor the performance during training. 89 | # However, it cannot be used together with cross image evaluation. 
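# caption_indexs maps each image key to a list of [img_key, cap_idx] candidate
# pairs that will be scored and re-ranked for that image.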
90 | self.has_caption_indexs = True 91 | assert not args.cross_image_eval 92 | caption_index_file = op.join(args.data_dir, args.eval_caption_index_file) 93 | self.caption_indexs = torch.load(caption_index_file) 94 | if not type(self.caption_indexs[self.img_keys[0]]) == list: 95 | self.caption_indexs = {k: json.loads(self.caption_indexs[k]) for k in self.img_keys} 96 | else: 97 | self.has_caption_indexs = False 98 | self.is_train = is_train 99 | self.output_mode = args.output_mode 100 | self.tokenizer = tokenizer 101 | self.max_seq_len = args.max_seq_length 102 | self.max_img_seq_len = args.max_img_seq_length 103 | self.args = args 104 | 105 | def get_image_caption_index(self, index): 106 | # return img_idx to access features and [img_key, cap_idx] to access caption 107 | if not self.is_train and self.args.cross_image_eval: 108 | img_idx = index // (self.num_captions_per_img * len(self.img_keys)) 109 | cap_idx = index % (self.num_captions_per_img * len(self.img_keys)) 110 | img_idx1 = cap_idx // self.num_captions_per_img 111 | cap_idx1 = cap_idx % self.num_captions_per_img 112 | return img_idx, [self.img_keys[img_idx1], cap_idx1] 113 | if not self.is_train and self.has_caption_indexs: 114 | img_idx = index // self.num_captions_per_img 115 | cap_idx = index % self.num_captions_per_img 116 | img_key1, cap_idx1 = self.caption_indexs[self.img_keys[img_idx]][cap_idx] 117 | return img_idx, [img_key1, cap_idx1] 118 | img_idx = index // self.num_captions_per_img 119 | cap_idx = index % self.num_captions_per_img 120 | return img_idx, [self.img_keys[img_idx], cap_idx] 121 | 122 | def get_label(self, index): 123 | img_idx, cap_idx = self.get_image_caption_index(index) 124 | return 1 if self.img_keys[img_idx] == cap_idx[0] else 0 125 | 126 | def get_od_labels(self, img_key): 127 | if self.args.add_od_labels: 128 | if type(self.labels[img_key]) == str: 129 | od_labels = self.labels[img_key] 130 | else: 131 | od_labels = ' '.join(self.labels[img_key]['class']) 132 | return od_labels 133 | 134 | def tensorize_example(self, text_a, img_feat, text_b=None, 135 | cls_token_segment_id=0, pad_token_segment_id=0, 136 | sequence_a_segment_id=0, sequence_b_segment_id=1): 137 | tokens_a = self.tokenizer.tokenize(text_a) 138 | if len(tokens_a) > self.args.max_seq_length - 2: 139 | tokens_a = tokens_a[:(self.args.max_seq_length - 2)] 140 | 141 | tokens = [self.tokenizer.cls_token] + tokens_a + [self.tokenizer.sep_token] 142 | segment_ids = [cls_token_segment_id] + [sequence_a_segment_id] * (len(tokens_a) + 1) 143 | seq_a_len = len(tokens) 144 | if text_b: 145 | tokens_b = self.tokenizer.tokenize(text_b) 146 | if len(tokens_b) > self.max_seq_len - len(tokens) - 1: 147 | tokens_b = tokens_b[: (self.max_seq_len - len(tokens) - 1)] 148 | tokens += tokens_b + [self.tokenizer.sep_token] 149 | segment_ids += [sequence_b_segment_id] * (len(tokens_b) + 1) 150 | 151 | seq_len = len(tokens) 152 | seq_padding_len = self.max_seq_len - seq_len 153 | tokens += [self.tokenizer.pad_token] * seq_padding_len 154 | segment_ids += [pad_token_segment_id] * seq_padding_len 155 | input_ids = self.tokenizer.convert_tokens_to_ids(tokens) 156 | 157 | # image features 158 | img_len = img_feat.shape[0] 159 | if img_len > self.max_img_seq_len: 160 | img_feat = img_feat[0 : self.max_img_seq_len, :] 161 | img_len = img_feat.shape[0] 162 | img_padding_len = 0 163 | else: 164 | img_padding_len = self.max_img_seq_len - img_len 165 | padding_matrix = torch.zeros((img_padding_len, img_feat.shape[1])) 166 | img_feat = torch.cat((img_feat, 
padding_matrix), 0) 167 | 168 | # generate attention_mask 169 | att_mask_type = self.args.att_mask_type 170 | if att_mask_type == "CLR": 171 | attention_mask = [1] * seq_len + [0] * seq_padding_len + \ 172 | [1] * img_len + [0] * img_padding_len 173 | else: 174 | # use 2D mask to represent the attention 175 | max_len = self.max_seq_len + self.max_img_seq_len 176 | attention_mask = torch.zeros((max_len, max_len), dtype=torch.long) 177 | # full attention of C-C, L-L, R-R 178 | c_start, c_end = 0, seq_a_len 179 | l_start, l_end = seq_a_len, seq_len 180 | r_start, r_end = self.max_seq_len, self.max_seq_len + img_len 181 | attention_mask[c_start : c_end, c_start : c_end] = 1 182 | attention_mask[l_start : l_end, l_start : l_end] = 1 183 | attention_mask[r_start : r_end, r_start : r_end] = 1 184 | if att_mask_type == 'CL': 185 | attention_mask[c_start : c_end, l_start : l_end] = 1 186 | attention_mask[l_start : l_end, c_start : c_end] = 1 187 | elif att_mask_type == 'CR': 188 | attention_mask[c_start : c_end, r_start : r_end] = 1 189 | attention_mask[r_start : r_end, c_start : c_end] = 1 190 | elif att_mask_type == 'LR': 191 | attention_mask[l_start : l_end, r_start : r_end] = 1 192 | attention_mask[r_start : r_end, l_start : l_end] = 1 193 | else: 194 | raise ValueError("Unsupported attention mask type {}".format(att_mask_type)) 195 | 196 | input_ids = torch.tensor(input_ids, dtype=torch.long) 197 | attention_mask = torch.tensor(attention_mask, dtype=torch.long) 198 | segment_ids = torch.tensor(segment_ids, dtype=torch.long) 199 | return (input_ids, attention_mask, segment_ids, img_feat) 200 | 201 | def __getitem__(self, index): 202 | if self.is_train: 203 | img_idx, cap_idxs = self.get_image_caption_index(index) 204 | img_key = self.img_keys[img_idx] 205 | feature = self.get_image(img_key) 206 | caption = self.captions[cap_idxs[0]][cap_idxs[1]] 207 | od_labels = self.get_od_labels(img_key) 208 | example = self.tensorize_example(caption, feature, text_b=od_labels) 209 | 210 | # select a negative pair 211 | neg_img_indexs = list(range(0, img_idx)) + list(range(img_idx + 1, len(self.img_keys))) 212 | img_idx_neg = random.choice(neg_img_indexs) 213 | if random.random() <= 0.5: 214 | # randomly select a negative caption from a different image. 
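# The positive image features and tags are reused, so only the caption is mismatched.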
215 | cap_idx_neg = random.randint(0, self.num_captions_per_img - 1) 216 | caption_neg = self.captions[self.img_keys[img_idx_neg]][cap_idx_neg] 217 | example_neg = self.tensorize_example(caption_neg, feature, text_b=od_labels) 218 | else: 219 | # randomly select a negative image 220 | feature_neg = self.get_image(self.img_keys[img_idx_neg]) 221 | od_labels_neg = self.get_od_labels(self.img_keys[img_idx_neg]) 222 | example_neg = self.tensorize_example(caption, feature_neg, text_b=od_labels_neg) 223 | 224 | example_pair = tuple(list(example) + [1] + list(example_neg) + [0]) 225 | return index, example_pair 226 | else: 227 | img_idx, cap_idxs = self.get_image_caption_index(index) 228 | img_key = self.img_keys[img_idx] 229 | feature = self.get_image(img_key) 230 | caption = self.captions[cap_idxs[0]][cap_idxs[1]] 231 | od_labels = self.get_od_labels(img_key) 232 | example = self.tensorize_example(caption, feature, text_b=od_labels) 233 | label = 1 if img_key == cap_idxs[0] else 0 234 | return index, tuple(list(example) + [label]) 235 | 236 | def get_image(self, image_id): 237 | image_idx = self.image_id2idx[str(image_id)] 238 | row = self.img_tsv.seek(image_idx) 239 | num_boxes = int(row[1]) 240 | features = np.frombuffer(base64.b64decode(row[-1]), 241 | dtype=np.float32).reshape((num_boxes, -1)) 242 | t_features = torch.from_numpy(features) 243 | return t_features 244 | 245 | def __len__(self): 246 | if not self.is_train and self.args.cross_image_eval: 247 | return len(self.img_keys) ** 2 * self.num_captions_per_img 248 | return len(self.img_keys) * self.num_captions_per_img 249 | 250 | 251 | def compute_score_with_logits(logits, labels): 252 | if logits.shape[1] > 1: 253 | logits = torch.max(logits, 1)[1].data # argmax 254 | scores = logits == labels 255 | else: 256 | scores = torch.zeros_like(labels).cuda() 257 | for i, (logit, label) in enumerate(zip(logits, labels)): 258 | logit_ = torch.sigmoid(logit) 259 | if (logit_ >= 0.5 and label == 1) or (logit_ < 0.5 and label == 0): 260 | scores[i] = 1 261 | return scores 262 | 263 | 264 | def compute_ranks(dataset, results): 265 | labels = np.array([dataset.get_label(i) for i in range(len(dataset))]) 266 | similarities = np.array([results[i] for i in range(len(dataset))]) 267 | if dataset.has_caption_indexs: 268 | num_captions_per_img = dataset.num_captions_per_img 269 | else: 270 | num_captions_per_img = len(dataset.img_keys) * dataset.num_captions_per_img 271 | labels = np.reshape(labels, [-1, num_captions_per_img]) 272 | similarities = np.reshape(similarities, [-1, num_captions_per_img]) 273 | i2t_ranks, t2i_ranks = [], [] 274 | for lab, sim in zip(labels, similarities): 275 | inds = np.argsort(sim)[::-1] 276 | rank = num_captions_per_img 277 | for r, ind in enumerate(inds): 278 | if lab[ind] == 1: 279 | rank = r 280 | break 281 | i2t_ranks.append(rank) 282 | if not dataset.has_caption_indexs: 283 | labels = np.swapaxes(labels, 0, 1) 284 | similarities = np.swapaxes(similarities, 0, 1) 285 | for lab, sim in zip(labels, similarities): 286 | inds = np.argsort(sim)[::-1] 287 | rank = num_captions_per_img 288 | for r, ind in enumerate(inds): 289 | if lab[ind] == 1: 290 | rank = r 291 | break 292 | t2i_ranks.append(rank) 293 | return i2t_ranks, t2i_ranks 294 | 295 | 296 | def save_checkpoint(model, tokenizer, args, epoch, global_step): 297 | checkpoint_dir = op.join(args.output_dir, 'checkpoint-{}-{}'.format( 298 | epoch, global_step)) 299 | mkdir(checkpoint_dir) 300 | model_to_save = model.module if hasattr(model, 'module') else model 301 | 
save_num = 0 302 | while (save_num < 10): 303 | try: 304 | model_to_save.save_pretrained(checkpoint_dir) 305 | torch.save(args, op.join(checkpoint_dir, 'training_args.bin')) 306 | tokenizer.save_pretrained(checkpoint_dir) 307 | logger.info("Save checkpoint to {}".format(checkpoint_dir)) 308 | break 309 | except: 310 | save_num += 1 311 | if save_num == 10: 312 | logger.info("Failed to save checkpoint after 10 trails.") 313 | return 314 | 315 | 316 | def train(args, train_dataset, val_dataset, model, tokenizer): 317 | args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) 318 | train_sampler = RandomSampler(train_dataset) 319 | train_dataloader = DataLoader(train_dataset, sampler=train_sampler, 320 | batch_size=args.train_batch_size, num_workers=args.num_workers) 321 | 322 | if args.max_steps > 0: 323 | t_total = args.max_steps 324 | args.num_train_epochs = args.max_steps // (len(train_dataloader) // \ 325 | args.gradient_accumulation_steps) + 1 326 | else: 327 | t_total = len(train_dataloader) // args.gradient_accumulation_steps \ 328 | * args.num_train_epochs 329 | 330 | # Prepare optimizer and scheduler 331 | no_decay = ['bias', 'LayerNorm.weight'] 332 | grouped_parameters = [ 333 | {'params': [p for n, p in model.named_parameters() if not \ 334 | any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay}, 335 | {'params': [p for n, p in model.named_parameters() if \ 336 | any(nd in n for nd in no_decay)], 'weight_decay': 0.0} 337 | ] 338 | optimizer = AdamW(grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon) 339 | if args.scheduler == "constant": 340 | scheduler = WarmupConstantSchedule( 341 | optimizer, warmup_steps=args.warmup_steps) 342 | elif args.scheduler == "linear": 343 | scheduler = WarmupLinearSchedule( 344 | optimizer, warmup_steps=args.warmup_steps, t_total=t_total) 345 | else: 346 | raise ValueError("Unknown scheduler type: {}".format(args.scheduler)) 347 | 348 | if args.n_gpu > 1: 349 | model = torch.nn.DataParallel(model) 350 | 351 | logger.info("***** Running training *****") 352 | logger.info(" Num examples = %d", len(train_dataset)) 353 | logger.info(" Num Epochs = %d", args.num_train_epochs) 354 | logger.info(" Batch size per GPU = %d", args.per_gpu_train_batch_size) 355 | logger.info(" Total train batch size (w. 
parallel, & accumulation) = %d", 356 | args.train_batch_size * args.gradient_accumulation_steps) 357 | logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) 358 | logger.info(" Total optimization steps = %d", t_total) 359 | 360 | global_step, global_loss, global_acc =0, 0.0, 0.0 361 | model.zero_grad() 362 | log_json = [] 363 | best_score = 0 364 | for epoch in range(int(args.num_train_epochs)): 365 | for step, (_, batch) in enumerate(train_dataloader): 366 | model.train() 367 | batch = tuple(t.to(args.device) for t in batch) 368 | inputs = { 369 | 'input_ids': torch.cat((batch[0], batch[5]), dim=0), 370 | 'attention_mask': torch.cat((batch[1], batch[6]), dim=0), 371 | 'token_type_ids': torch.cat((batch[2], batch[7]), dim=0), 372 | 'img_feats': torch.cat((batch[3], batch[8]), dim=0), 373 | 'labels': torch.cat((batch[4], batch[9]), dim=0) 374 | } 375 | outputs = model(**inputs) 376 | loss, logits = outputs[:2] 377 | if args.n_gpu > 1: 378 | loss = loss.mean() # mean() to average on multi-gpu parallel training 379 | if args.gradient_accumulation_steps > 1: 380 | loss = loss / args.gradient_accumulation_steps 381 | loss.backward() 382 | torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) 383 | batch_score = compute_score_with_logits(logits, inputs['labels']).sum() 384 | batch_acc = batch_score.item() / (args.train_batch_size * 2) 385 | global_loss += loss.item() 386 | global_acc += batch_acc 387 | if (step + 1) % args.gradient_accumulation_steps == 0: 388 | global_step += 1 389 | scheduler.step() 390 | optimizer.step() 391 | model.zero_grad() 392 | if global_step % args.logging_steps == 0: 393 | logger.info("Epoch: {}, global_step: {}, lr: {:.6f}, loss: {:.4f} ({:.4f}), " \ 394 | "score: {:.4f} ({:.4f})".format(epoch, global_step, 395 | optimizer.param_groups[0]["lr"], loss, global_loss / global_step, 396 | batch_acc, global_acc / global_step) 397 | ) 398 | 399 | if (args.save_steps > 0 and global_step % args.save_steps == 0) or \ 400 | global_step == t_total: 401 | save_checkpoint(model, tokenizer, args, epoch, global_step) 402 | # evaluation 403 | if args.evaluate_during_training: 404 | logger.info("Perform evaluation at step: %d" % (global_step)) 405 | test_result = test(args, model, val_dataset) 406 | eval_result = evaluate(val_dataset, test_result) 407 | rank_accs = eval_result['i2t_retrieval'] 408 | if rank_accs['R@1'] > best_score: 409 | best_score = rank_accs['R@1'] 410 | epoch_log = {'epoch': epoch, 'global_step': global_step, 411 | 'R1': rank_accs['R@1'], 'R5': rank_accs['R@5'], 412 | 'R10': rank_accs['R@10'], 'best_R1':best_score} 413 | log_json.append(epoch_log) 414 | with open(args.output_dir + '/eval_logs.json', 'w') as fp: 415 | json.dump(log_json, fp) 416 | return global_step, global_loss / global_step 417 | 418 | 419 | def test(args, model, eval_dataset): 420 | args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) 421 | eval_sampler = SequentialSampler(eval_dataset) 422 | eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, 423 | batch_size=args.eval_batch_size, num_workers=args.num_workers) 424 | 425 | logger.info("Num examples = {}".format(len(eval_dataset))) 426 | logger.info("Evaluation batch size = {}".format(args.eval_batch_size)) 427 | model.eval() 428 | results = {} 429 | softmax = nn.Softmax(dim=1) 430 | for indexs, batch in tqdm(eval_dataloader): 431 | batch = tuple(t.to(args.device) for t in batch) 432 | with torch.no_grad(): 433 | inputs = { 434 | 'input_ids': batch[0], 435 | 
'attention_mask': batch[1], 436 | 'token_type_ids': batch[2], 437 | 'img_feats': batch[3], 438 | 'labels': batch[4] 439 | } 440 | _, logits = model(**inputs)[:2] 441 | if args.num_labels == 2: 442 | probs = softmax(logits) 443 | result = probs[:, 1] # the confidence to be a matched pair 444 | else: 445 | result = logits 446 | result = [_.to(torch.device("cpu")) for _ in result] 447 | results.update({idx.item(): res.item() for idx, res in zip(indexs, result)}) 448 | return results 449 | 450 | 451 | def evaluate(eval_dataset, test_results): 452 | i2t_ranks, t2i_ranks = compute_ranks(eval_dataset, test_results) 453 | rank = [1, 5, 10] 454 | i2t_accs = [sum([_ < r for _ in i2t_ranks]) / len(i2t_ranks) for r in rank] 455 | logger.info("I2T Retrieval: {:.4f} @ R1, {:.4f} @ R5, {:.4f} @ R10".format( 456 | i2t_accs[0], i2t_accs[1], i2t_accs[2])) 457 | eval_result = {"i2t_retrieval": {"R@1": i2t_accs[0], "R@5": i2t_accs[1], "R@10": i2t_accs[2]}} 458 | if t2i_ranks: 459 | t2i_accs = [sum([_ < r for _ in t2i_ranks]) / len(t2i_ranks) for r in rank] 460 | logger.info("T2I Retrieval: {:.4f} @ R1, {:.4f} @ R5, {:.4f} @ R10".format( 461 | t2i_accs[0], t2i_accs[1], t2i_accs[2])) 462 | eval_result["t2i_retrieval"] = {"R@1": t2i_accs[0], "R@5": t2i_accs[1], "R@10": t2i_accs[2]} 463 | return eval_result 464 | 465 | 466 | def get_predict_file(args): 467 | cc = [] 468 | data = op.basename(op.join(args.data_dir, '')[:-1]) 469 | if data != 'coco_ir': 470 | cc.append(data) 471 | cc.append(args.test_split) 472 | if args.add_od_labels: 473 | cc.append('wlabels{}'.format(args.od_label_type)) 474 | return op.join(args.eval_model_dir, '{}.results.pt'.format('.'.join(cc))) 475 | 476 | 477 | def restore_training_settings(args): 478 | assert not args.do_train and (args.do_test or args.do_eval) 479 | train_args = torch.load(op.join(args.eval_model_dir, 'training_args.bin')) 480 | override_params = ['do_lower_case', 'img_feature_type', 'max_seq_length', 481 | 'max_img_seq_length', 'add_od_labels', 'od_label_type', 482 | 'use_img_layernorm', 'img_layer_norm_eps'] 483 | for param in override_params: 484 | if hasattr(train_args, param): 485 | train_v = getattr(train_args, param) 486 | test_v = getattr(args, param) 487 | if train_v != test_v: 488 | logger.warning('Override {} with train args: {} -> {}'.format(param, 489 | test_v, train_v)) 490 | setattr(args, param, train_v) 491 | return args 492 | 493 | 494 | def main(): 495 | parser = argparse.ArgumentParser() 496 | parser.add_argument("--data_dir", default='datasets/coco_ir', type=str, required=False, 497 | help="The input data dir with all required files.") 498 | parser.add_argument("--img_feat_file", default='datasets/coco_ir/features.tsv', type=str, required=False, 499 | help="The absolute address of the image feature file.") 500 | parser.add_argument("--model_name_or_path", default=None, type=str, required=False, 501 | help="Path to pre-trained model or model type. 
required for training.") 502 | parser.add_argument("--output_dir", default='output/', type=str, required=False, 503 | help="The output directory to save checkpoint and test results.") 504 | parser.add_argument("--loss_type", default='sfmx', type=str, 505 | help="Loss function types: support kl, sfmx") 506 | parser.add_argument("--config_name", default="", type=str, 507 | help="Pretrained config name or path if not the same as model_name.") 508 | parser.add_argument("--tokenizer_name", default="", type=str, 509 | help="Pretrained tokenizer name or path if not the same as model_name.") 510 | parser.add_argument("--max_seq_length", default=70, type=int, 511 | help="The maximum total input sequence length after tokenization. " 512 | "Sequences longer than this will be truncated, " 513 | "sequences shorter will be padded." 514 | "This number is calculated on COCO dataset" 515 | "If add object detection labels, the suggested length should be 70.") 516 | parser.add_argument("--do_train", action='store_true', help="Whether to run training.") 517 | parser.add_argument("--do_test", action='store_true', help="Whether to run inference.") 518 | parser.add_argument("--do_eval", action='store_true', help="Whether to run performance valuation." 519 | "do not activate if we want to inference on dataset without gt labels.") 520 | parser.add_argument("--test_split", default='test', type=str, help='data split name.') 521 | parser.add_argument("--eval_img_keys_file", default='', type=str, 522 | help="image key tsv to select a subset of images for evaluation. " 523 | "This is useful in 5-folds evaluation. The topn index file is not " 524 | "needed in this case.") 525 | parser.add_argument("--eval_caption_index_file", default='', type=str, 526 | help="index of a list of (img_key, cap_idx) for each image." 527 | "this is used to perform re-rank using hard negative samples." 528 | "useful for validation set to monitor the performance during training.") 529 | parser.add_argument("--cross_image_eval", action='store_true', 530 | help="perform cross image inference, ie. each image with all texts from other images.") 531 | parser.add_argument("--add_od_labels", default=False, action='store_true', 532 | help="Whether to add object detection labels or not.") 533 | parser.add_argument("--od_label_type", default='vg', type=str, 534 | help="label type, support vg, gt, oid") 535 | parser.add_argument("--att_mask_type", default='CLR', type=str, 536 | help="attention mask type, support ['CL', 'CR', 'LR', 'CLR']" 537 | "C: caption, L: labels, R: image regions; CLR is full attention by default." 538 | "CL means attention between caption and labels." 
539 | "please pay attention to the order CLR, which is the default concat order.") 540 | parser.add_argument("--do_lower_case", action='store_true', 541 | help="Set this flag if you are using an uncased model.") 542 | parser.add_argument("--drop_out", default=0.1, type=float, help="Drop out in BERT.") 543 | parser.add_argument("--max_img_seq_length", default=50, type=int, 544 | help="The maximum total input image sequence length.") 545 | parser.add_argument("--img_feature_dim", default=2054, type=int, 546 | help="The Image Feature Dimension.") 547 | parser.add_argument("--img_feature_type", default='frcnn', type=str, 548 | help="Image feature type.") 549 | parser.add_argument("--use_img_layernorm", type=int, default=1, 550 | help="Normalize image features with bertlayernorm") 551 | parser.add_argument("--img_layer_norm_eps", default=1e-12, type=float, 552 | help="The eps in image feature laynorm layer") 553 | parser.add_argument("--per_gpu_train_batch_size", default=32, type=int, 554 | help="Batch size per GPU/CPU for training.") 555 | parser.add_argument("--per_gpu_eval_batch_size", default=64, type=int, 556 | help="Batch size per GPU/CPU for evaluation.") 557 | parser.add_argument("--output_mode", default='classification', type=str, 558 | help="output mode, support classification or regression.") 559 | parser.add_argument("--num_labels", default=2, type=int, 560 | help="num_labels is 2 for classification and 1 for regression.") 561 | parser.add_argument("--num_captions_per_img_train", default=5, type=int, 562 | help="number of positive matched captions for each training image.") 563 | parser.add_argument("--num_captions_per_img_val", default=5, type=int, 564 | help="number of captions for each testing image.") 565 | parser.add_argument('--gradient_accumulation_steps', type=int, default=1, 566 | help="Number of updates steps to accumulate before backward.") 567 | parser.add_argument("--learning_rate", default=2e-5, type=float, help="The initial lr.") 568 | parser.add_argument("--weight_decay", default=0.05, type=float, help="Weight deay.") 569 | parser.add_argument("--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam.") 570 | parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.") 571 | parser.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup.") 572 | parser.add_argument("--scheduler", default='linear', type=str, help="constant or linear.") 573 | parser.add_argument("--num_workers", default=4, type=int, help="Workers in dataloader.") 574 | parser.add_argument("--num_train_epochs", default=20, type=int, 575 | help="Total number of training epochs to perform.") 576 | parser.add_argument("--max_steps", default=-1, type=int, 577 | help="Total number of training steps. Override num_train_epochs.") 578 | parser.add_argument('--logging_steps', type=int, default=20, help="Log every X steps.") 579 | parser.add_argument('--save_steps', type=int, default=-1, 580 | help="Save checkpoint every X steps. 
Will also perform evaluation.") 581 | parser.add_argument("--evaluate_during_training", action='store_true', 582 | help="Run evaluation during training at each save_steps.") 583 | parser.add_argument("--eval_model_dir", type=str, default='', 584 | help="Model directory for evaluation.") 585 | parser.add_argument("--no_cuda", action='store_true', help="Avoid using CUDA.") 586 | parser.add_argument('--seed', type=int, default=88, help="random seed for initialization.") 587 | args = parser.parse_args() 588 | 589 | global logger 590 | mkdir(args.output_dir) 591 | logger = setup_logger("vlpretrain", args.output_dir, 0) 592 | 593 | args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu") 594 | args.n_gpu = torch.cuda.device_count() 595 | set_seed(args.seed, args.n_gpu) 596 | logger.warning("Device: %s, n_gpu: %s", args.device, args.n_gpu) 597 | logger.info('output_mode: {}, #Labels: {}'.format(args.output_mode, args.num_labels)) 598 | 599 | config_class, tokenizer_class = BertConfig, BertTokenizer 600 | model_class = ImageBertForSequenceClassification 601 | if args.do_train: 602 | config = config_class.from_pretrained(args.config_name if args.config_name else \ 603 | args.model_name_or_path, num_labels=args.num_labels, finetuning_task='ir') 604 | tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name \ 605 | else args.model_name_or_path, do_lower_case=args.do_lower_case) 606 | config.img_feature_dim = args.img_feature_dim 607 | config.img_feature_type = args.img_feature_type 608 | config.hidden_dropout_prob = args.drop_out 609 | config.loss_type = args.loss_type 610 | config.img_layer_norm_eps = args.img_layer_norm_eps 611 | config.use_img_layernorm = args.use_img_layernorm 612 | model = model_class.from_pretrained(args.model_name_or_path, 613 | from_tf=bool('.ckpt' in args.model_name_or_path), config=config) 614 | else: 615 | checkpoint = args.eval_model_dir 616 | assert op.isdir(checkpoint) 617 | config = config_class.from_pretrained(checkpoint) 618 | tokenizer = tokenizer_class.from_pretrained(checkpoint) 619 | logger.info("Evaluate the following checkpoint: %s", checkpoint) 620 | model = model_class.from_pretrained(checkpoint, config=config) 621 | 622 | model.to(args.device) 623 | logger.info("Training/evaluation parameters %s", args) 624 | if args.do_train: 625 | train_dataset = RetrievalDataset(tokenizer, args, 'train', is_train=True) 626 | if args.evaluate_during_training: 627 | val_dataset = RetrievalDataset(tokenizer, args, 'minival', is_train=False) 628 | else: 629 | val_dataset = None 630 | global_step, avg_loss = train(args, train_dataset, val_dataset, model, tokenizer) 631 | logger.info("Training done: total_step = %s, avg loss = %s", global_step, avg_loss) 632 | 633 | # inference and evaluation 634 | if args.do_test or args.do_eval: 635 | args = restore_training_settings(args) 636 | test_dataset = RetrievalDataset(tokenizer, args, args.test_split, is_train=False) 637 | checkpoint = args.eval_model_dir 638 | assert op.isdir(checkpoint) 639 | logger.info("Evaluate the following checkpoint: %s", checkpoint) 640 | model = model_class.from_pretrained(checkpoint, config=config) 641 | model.to(args.device) 642 | if args.n_gpu > 1: 643 | model = torch.nn.DataParallel(model) 644 | 645 | pred_file = get_predict_file(args) 646 | if op.isfile(pred_file): 647 | logger.info("Prediction file exists, skipping inference.") 648 | if args.do_eval: 649 | test_result = torch.load(pred_file) 650 | else: 651 | test_result = test(args,
model, test_dataset) 652 | torch.save(test_result, pred_file) 653 | logger.info("Prediction results saved to {}.".format(pred_file)) 654 | 655 | if args.do_eval: 656 | eval_result = evaluate(test_dataset, test_result) 657 | result_file = op.splitext(pred_file)[0] + '.eval.json' 658 | with open(result_file, 'w') as f: 659 | json.dump(eval_result, f) 660 | logger.info("Evaluation results saved to {}.".format(result_file)) 661 | 662 | 663 | if __name__ == "__main__": 664 | main() 665 | -------------------------------------------------------------------------------- /oscar/utils/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.1.0" 2 | -------------------------------------------------------------------------------- /oscar/utils/caption_evaluate.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Microsoft Corporation. Licensed under the MIT license. 2 | 3 | from collections import OrderedDict, defaultdict 4 | import json 5 | import numpy as np 6 | import os.path as op 7 | from pprint import pprint 8 | import torch 9 | import re 10 | import subprocess 11 | import tempfile 12 | import time 13 | from typing import Dict, Optional 14 | 15 | from coco_caption.pycocotools.coco import COCO 16 | from coco_caption.pycocoevalcap.eval import COCOEvalCap 17 | from .cider.pyciderevalcap.ciderD.ciderD import CiderD 18 | 19 | 20 | def evaluate_on_nocaps(split, predict_file, data_dir='data/nocaps/', evaluate_file=None): 21 | ''' 22 | NOTE: Put the auth file in folder ~/.evalai/ 23 | ''' 24 | if not evaluate_file: 25 | evaluate_file = op.splitext(predict_file)[0] + '.eval.json' 26 | if op.isfile(evaluate_file): 27 | print('{} already exists'.format(evaluate_file)) 28 | with open(evaluate_file, 'r') as fp: 29 | metrics = json.load(fp) 30 | return metrics 31 | 32 | image_info_file = op.join(data_dir, 33 | 'nocaps_{}_image_info.json'.format(split)) 34 | image_info = json.load(open(image_info_file)) 35 | open_image_id2id = {} 36 | for it in image_info['images']: 37 | open_image_id2id[it['open_images_id']] = it['id'] 38 | predictions = [] 39 | cap_id = 0 40 | with open(predict_file, 'r') as fp: 41 | for line in fp: 42 | p = line.strip().split('\t') 43 | predictions.append( 44 | {'image_id': open_image_id2id[p[0]], 45 | 'caption': json.loads(p[1])[0]['caption'], 46 | 'id': cap_id}) 47 | cap_id += 1 48 | if split == 'test': 49 | print('Are you sure to submit test split result at: {}'.format(predict_file)) 50 | import ipdb;ipdb.set_trace() 51 | nocapseval = NocapsEvaluator(phase=split) 52 | metrics = nocapseval.evaluate(predictions) 53 | pprint(metrics) 54 | with open(evaluate_file, 'w') as fp: 55 | json.dump(metrics, fp) 56 | return metrics 57 | 58 | 59 | def evaluate_on_coco_caption(res_file, label_file, outfile=None): 60 | """ 61 | res_tsv: TSV file, each row is [image_key, json format list of captions]. 62 | Each caption is a dict, with fields "caption", "conf". 63 | label_file: JSON file of ground truth captions in COCO format. 
64 | """ 65 | assert label_file.endswith('.json') 66 | if res_file.endswith('.tsv'): 67 | res_file_coco = op.splitext(res_file)[0] + '_coco_format.json' 68 | convert_tsv_to_coco_format(res_file, res_file_coco) 69 | else: 70 | raise ValueError('unknown prediction result file format: {}'.format(res_file)) 71 | 72 | coco = COCO(label_file) 73 | cocoRes = coco.loadRes(res_file_coco) 74 | cocoEval = COCOEvalCap(coco, cocoRes, 'corpus') 75 | 76 | # evaluate on a subset of images by setting 77 | # cocoEval.params['image_id'] = cocoRes.getImgIds() 78 | # please remove this line when evaluating the full validation set 79 | cocoEval.params['image_id'] = cocoRes.getImgIds() 80 | 81 | # evaluate results 82 | # SPICE will take a few minutes the first time, but speeds up due to caching 83 | cocoEval.evaluate() 84 | result = cocoEval.eval 85 | if not outfile: 86 | print(result) 87 | else: 88 | with open(outfile, 'w') as fp: 89 | json.dump(result, fp, indent=4) 90 | return result 91 | 92 | 93 | def convert_tsv_to_coco_format(res_tsv, outfile, 94 | sep='\t', key_col=0, cap_col=1): 95 | results = [] 96 | with open(res_tsv) as fp: 97 | for line in fp: 98 | parts = line.strip().split(sep) 99 | key = parts[key_col] 100 | if cap_col < len(parts): 101 | caps = json.loads(parts[cap_col]) 102 | assert len(caps) == 1, 'cannot evaluate multiple captions per image' 103 | cap = caps[0].get('caption', '') 104 | else: 105 | # empty caption generated 106 | cap = "" 107 | results.append( 108 | {'image_id': key, 109 | 'caption': cap} 110 | ) 111 | with open(outfile, 'w') as fp: 112 | json.dump(results, fp) 113 | 114 | 115 | class ScstRewardCriterion(torch.nn.Module): 116 | CIDER_REWARD_WEIGHT = 1 117 | 118 | def __init__(self, cider_cached_tokens='corpus', baseline_type='greedy'): 119 | self.CiderD_scorer = CiderD(df=cider_cached_tokens) 120 | assert baseline_type in ['greedy', 'sample'] 121 | self.baseline_type = baseline_type 122 | self._cur_score = None 123 | super().__init__() 124 | 125 | def forward(self, gt_res, greedy_res, sample_res, sample_logprobs): 126 | batch_size = len(gt_res) 127 | sample_res_size = len(sample_res) 128 | seq_per_img = sample_res_size // batch_size 129 | 130 | gen_res = [] 131 | gen_res.extend(sample_res) 132 | gt_idx = [i // seq_per_img for i in range(sample_res_size)] 133 | if self.baseline_type == 'greedy': 134 | assert len(greedy_res) == batch_size 135 | gen_res.extend(greedy_res) 136 | gt_idx.extend([i for i in range(batch_size)]) 137 | 138 | scores = self._calculate_eval_scores(gen_res, gt_idx, gt_res) 139 | 140 | if self.baseline_type == 'greedy': 141 | baseline = scores[-batch_size:][:, np.newaxis] 142 | else: 143 | sc_ = scores.reshape(batch_size, seq_per_img) 144 | baseline = (sc_.sum(1, keepdims=True) - sc_) / (sc_.shape[1] - 1) 145 | 146 | # sample - baseline 147 | reward = scores[:sample_res_size].reshape(batch_size, seq_per_img) 148 | self._cur_score = reward.mean() 149 | reward = reward - baseline 150 | reward = reward.reshape(sample_res_size) 151 | 152 | reward = torch.as_tensor(reward, device=sample_logprobs.device, dtype=torch.float) 153 | loss = - sample_logprobs * reward 154 | loss = loss.mean() 155 | return loss 156 | 157 | def get_score(self): 158 | return self._cur_score 159 | 160 | def _calculate_eval_scores(self, gen_res, gt_idx, gt_res): 161 | ''' 162 | gen_res: generated captions, list of str 163 | gt_idx: list of int, of the same length as gen_res 164 | gt_res: ground truth captions, list of list of str. 
165 | gen_res[i] corresponds to gt_res[gt_idx[i]] 166 | Each image can have multiple ground truth captions 167 | ''' 168 | gen_res_size = len(gen_res) 169 | 170 | res = OrderedDict() 171 | for i in range(gen_res_size): 172 | res[i] = [self._wrap_sentence(gen_res[i])] 173 | 174 | gts = OrderedDict() 175 | gt_res_ = [ 176 | [self._wrap_sentence(gt_res[i][j]) for j in range(len(gt_res[i]))] 177 | for i in range(len(gt_res)) 178 | ] 179 | for i in range(gen_res_size): 180 | gts[i] = gt_res_[gt_idx[i]] 181 | 182 | res_ = [{'image_id':i, 'caption': res[i]} for i in range(len(res))] 183 | _, batch_cider_scores = self.CiderD_scorer.compute_score(gts, res_) 184 | scores = self.CIDER_REWARD_WEIGHT * batch_cider_scores 185 | return scores 186 | 187 | @classmethod 188 | def _wrap_sentence(self, s): 189 | # ensure the sentence ends with the <eos> token 190 | # in order to keep consistent with cider_cached_tokens 191 | r = s.strip() 192 | if r.endswith('.'): 193 | r = r[:-1] 194 | r += ' <eos>' 195 | return r 196 | 197 | 198 | class NocapsEvaluator(object): 199 | r""" 200 | Code from https://github.com/nocaps-org/updown-baseline/blob/master/updown/utils/evalai.py 201 | 202 | A utility class to submit model predictions on nocaps splits to EvalAI, and retrieve model 203 | performance based on captioning metrics (such as CIDEr, SPICE). 204 | 205 | Extended Summary 206 | ---------------- 207 | This class and the training script together serve as a working example for "EvalAI in the 208 | loop", showing how evaluation can be done remotely on privately held splits. Annotations 209 | (captions) and evaluation-specific tools (e.g. the ``coco-caption`` toolkit) 210 | are not required locally. This enables users to select the best checkpoint, perform early 211 | stopping, schedule the learning rate based on a metric, etc. without running the evaluation locally. 212 | 213 | Parameters 214 | ---------- 215 | phase: str, optional (default = "val") 216 | Which phase to evaluate on. One of "val" or "test". 217 | 218 | Notes 219 | ----- 220 | This class can be used for retrieving metrics on both the val and test splits. However, we 221 | recommend avoiding the test split (at least during training). The number of allowed 222 | submissions to the test split on EvalAI is small and can be exhausted in a few iterations, whereas 223 | the number of submissions to the val split is practically unlimited. 224 | """ 225 | 226 | def __init__(self, phase: str = "val"): 227 | 228 | # Constants specific to EvalAI. 229 | self._challenge_id = 355 230 | self._phase_id = 742 if phase == "val" else 743 231 | 232 | def evaluate( 233 | self, predictions, iteration: Optional[int] = None 234 | ) -> Dict[str, Dict[str, float]]: 235 | r""" 236 | Take the model predictions (in COCO format), submit them to EvalAI, and retrieve model 237 | performance based on captioning metrics. 238 | 239 | Parameters 240 | ---------- 241 | predictions: List[Prediction] 242 | Model predictions in COCO format. They are a list of dicts with keys 243 | ``{"image_id": int, "caption": str}``. 244 | iteration: int, optional (default = None) 245 | Training iteration where the checkpoint was evaluated. 246 | 247 | Returns 248 | ------- 249 | Dict[str, Dict[str, float]] 250 | Model performance based on all captioning metrics.
Nested dict structure:: 251 | 252 | { 253 | "B1": {"in-domain", "near-domain", "out-domain", "entire"}, # BLEU-1 254 | "B2": {"in-domain", "near-domain", "out-domain", "entire"}, # BLEU-2 255 | "B3": {"in-domain", "near-domain", "out-domain", "entire"}, # BLEU-3 256 | "B4": {"in-domain", "near-domain", "out-domain", "entire"}, # BLEU-4 257 | "METEOR": {"in-domain", "near-domain", "out-domain", "entire"}, 258 | "ROUGE-L": {"in-domain", "near-domain", "out-domain", "entire"}, 259 | "CIDEr": {"in-domain", "near-domain", "out-domain", "entire"}, 260 | "SPICE": {"in-domain", "near-domain", "out-domain", "entire"}, 261 | } 262 | 263 | """ 264 | # Save predictions as a json file first. 265 | _, predictions_filename = tempfile.mkstemp(suffix=".json", text=True) 266 | with open(predictions_filename, "w") as f: 267 | json.dump(predictions, f) 268 | 269 | submission_command = ( 270 | f"evalai challenge {self._challenge_id} phase {self._phase_id} " 271 | f"submit --file {predictions_filename}" 272 | ) 273 | 274 | submission_command_subprocess = subprocess.Popen( 275 | submission_command.split(), 276 | stdout=subprocess.PIPE, 277 | stdin=subprocess.PIPE, 278 | stderr=subprocess.STDOUT, 279 | ) 280 | 281 | # This terminal output will have submission ID we need to check. 282 | submission_command_stdout = submission_command_subprocess.communicate(input=b"N\n")[ 283 | 0 284 | ].decode("utf-8") 285 | 286 | submission_id_regex = re.search("evalai submission ([0-9]+)", submission_command_stdout) 287 | try: 288 | # Get an integer submission ID (as a string). 289 | submission_id = submission_id_regex.group(0).split()[-1] # type: ignore 290 | except: 291 | # Very unlikely, but submission may fail because of some glitch. Retry for that. 292 | return self.evaluate(predictions) 293 | 294 | if iteration is not None: 295 | print(f"Submitted predictions for iteration {iteration}, submission id: {submission_id}.") 296 | else: 297 | print(f"Submitted predictions, submission_id: {submission_id}") 298 | 299 | # Placeholder stdout for a pending submission. 300 | result_stdout: str = "The Submission is yet to be evaluated." 301 | num_tries: int = 0 302 | 303 | # Query every 10 seconds for result until it appears. 304 | while "CIDEr" not in result_stdout: 305 | 306 | time.sleep(10) 307 | result_stdout = subprocess.check_output( 308 | ["evalai", "submission", submission_id, "result"] 309 | ).decode("utf-8") 310 | num_tries += 1 311 | 312 | # Raise error if it takes more than 5 minutes. 313 | if num_tries == 30: 314 | raise ConnectionError("Unable to get results from EvalAI within 5 minutes!") 315 | 316 | # Convert result to json. 317 | metrics = json.loads(result_stdout, encoding="utf-8") 318 | 319 | # keys: {"in-domain", "near-domain", "out-domain", "entire"} 320 | # In each of these, keys: {"B1", "B2", "B3", "B4", "METEOR", "ROUGE-L", "CIDEr", "SPICE"} 321 | metrics = { 322 | "in-domain": metrics[0]["in-domain"], 323 | "near-domain": metrics[1]["near-domain"], 324 | "out-domain": metrics[2]["out-domain"], 325 | "entire": metrics[3]["entire"], 326 | } 327 | 328 | # Restructure the metrics dict for better tensorboard logging. 
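        # e.g. metrics["in-domain"]["CIDEr"] becomes flipped_metrics["CIDEr"]["in-domain"].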
329 | # keys: {"B1", "B2", "B3", "B4", "METEOR", "ROUGE-L", "CIDEr", "SPICE"} 330 | # In each of these, keys: keys: {"in-domain", "near-domain", "out-domain", "entire"} 331 | flipped_metrics: Dict[str, Dict[str, float]] = defaultdict(dict) 332 | for key, val in metrics.items(): 333 | for subkey, subval in val.items(): 334 | flipped_metrics[subkey][key] = subval 335 | 336 | return flipped_metrics 337 | 338 | -------------------------------------------------------------------------------- /oscar/utils/cider/pyciderevalcap/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /oscar/utils/cider/pyciderevalcap/cider/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /oscar/utils/cider/pyciderevalcap/cider/cider.py: -------------------------------------------------------------------------------- 1 | # Filename: cider.py 2 | # 3 | # 4 | # Description: Describes the class to compute the CIDEr 5 | # (Consensus-Based Image Description Evaluation) Metric 6 | # by Vedantam, Zitnick, and Parikh (http://arxiv.org/abs/1411.5726) 7 | # 8 | # Creation Date: Sun Feb 8 14:16:54 2015 9 | # 10 | # Authors: Ramakrishna Vedantam and 11 | # Tsung-Yi Lin 12 | from __future__ import absolute_import 13 | from __future__ import division 14 | from __future__ import print_function 15 | 16 | from .cider_scorer import CiderScorer 17 | 18 | 19 | class Cider: 20 | """ 21 | Main Class to compute the CIDEr metric 22 | 23 | """ 24 | def __init__(self, n=4, df="corpus"): 25 | """ 26 | Initialize the CIDEr scoring function 27 | : param n (int): n-gram size 28 | : param df (string): specifies where to get the IDF values from 29 | takes values 'corpus', 'coco-train' 30 | : return: None 31 | """ 32 | # set cider to sum over 1 to 4-grams 33 | self._n = n 34 | self._df = df 35 | self.cider_scorer = CiderScorer(n=self._n, df_mode=self._df) 36 | 37 | def compute_score(self, gts, res): 38 | """ 39 | Main function to compute CIDEr score 40 | : param gts (dict) : {image:tokenized reference sentence} 41 | : param res (dict) : {image:tokenized candidate sentence} 42 | : return: cider (float) : computed CIDEr score for the corpus 43 | """ 44 | 45 | # clear all the previous hypos and refs 46 | self.cider_scorer.clear() 47 | 48 | for res_id in res: 49 | 50 | hypo = res_id['caption'] 51 | ref = gts[res_id['image_id']] 52 | 53 | # Sanity check. 
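            # A minimal sketch of the inputs this loop expects (illustrative ids and
            # captions, not taken from this repo):
            #   res = [{'image_id': 0, 'caption': ['a dog runs in the park']}]
            #   gts = {0: ['a dog is running in a park', 'a brown dog in a park']}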
54 | assert(type(hypo) is list) 55 | assert(len(hypo) == 1) 56 | assert(type(ref) is list) 57 | assert(len(ref) > 0) 58 | self.cider_scorer += (hypo[0], ref) 59 | 60 | (score, scores) = self.cider_scorer.compute_score() 61 | 62 | return score, scores 63 | 64 | def method(self): 65 | return "CIDEr" 66 | -------------------------------------------------------------------------------- /oscar/utils/cider/pyciderevalcap/cider/cider_scorer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Tsung-Yi Lin 3 | # Ramakrishna Vedantam 4 | from __future__ import absolute_import 5 | from __future__ import division 6 | from __future__ import print_function 7 | 8 | import copy 9 | import six 10 | from six.moves import cPickle 11 | from collections import defaultdict 12 | import numpy as np 13 | import math 14 | import os 15 | 16 | def precook(s, n=4, out=False): 17 | """ 18 | Takes a string as input and returns an object that can be given to 19 | either cook_refs or cook_test. This is optional: cook_refs and cook_test 20 | can take string arguments as well. 21 | :param s: string : sentence to be converted into ngrams 22 | :param n: int : number of ngrams for which representation is calculated 23 | :return: term frequency vector for occuring ngrams 24 | """ 25 | words = s.split() 26 | counts = defaultdict(int) 27 | for k in range(1,n+1): 28 | for i in range(len(words)-k+1): 29 | ngram = tuple(words[i:i+k]) 30 | counts[ngram] += 1 31 | return counts 32 | 33 | def cook_refs(refs, n=4): ## lhuang: oracle will call with "average" 34 | '''Takes a list of reference sentences for a single segment 35 | and returns an object that encapsulates everything that BLEU 36 | needs to know about them. 37 | :param refs: list of string : reference sentences for some image 38 | :param n: int : number of ngrams for which (ngram) representation is calculated 39 | :return: result (list of dict) 40 | ''' 41 | return [precook(ref, n) for ref in refs] 42 | 43 | def cook_test(test, n=4): 44 | '''Takes a test sentence and returns an object that 45 | encapsulates everything that BLEU needs to know about it. 46 | :param test: list of string : hypothesis sentence for some image 47 | :param n: int : number of ngrams for which (ngram) representation is calculated 48 | :return: result (dict) 49 | ''' 50 | return precook(test, n, True) 51 | 52 | class CiderScorer(object): 53 | """CIDEr scorer. 
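    Accumulates (hypothesis, references) pairs via ``+=`` and computes the corpus-level
    CIDEr score with ``compute_score()``. A minimal usage sketch (illustrative captions,
    not taken from this repo):

        scorer = CiderScorer(df_mode="corpus", n=4)
        scorer += ("a dog runs in the park", ["a dog is running in a park"])
        scorer += ("a cat on a mat", ["a cat sits on a mat", "there is a cat on the mat"])
        mean_score, per_image_scores = scorer.compute_score()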
54 | """ 55 | 56 | def copy(self): 57 | ''' copy the refs.''' 58 | new = CiderScorer(n=self.n) 59 | new.ctest = copy.copy(self.ctest) 60 | new.crefs = copy.copy(self.crefs) 61 | return new 62 | 63 | def __init__(self, df_mode="corpus", test=None, refs=None, n=4, sigma=6.0): 64 | ''' singular instance ''' 65 | self.n = n 66 | self.sigma = sigma 67 | self.crefs = [] 68 | self.ctest = [] 69 | self.df_mode = df_mode 70 | self.ref_len = None 71 | if self.df_mode != "corpus": 72 | pkl_file = cPickle.load(open(os.path.join('data', df_mode + '.p'),'rb'), **(dict(encoding='latin1') if six.PY3 else {})) 73 | self.ref_len = np.log(float(pkl_file['ref_len'])) 74 | self.document_frequency = pkl_file['document_frequency'] 75 | self.cook_append(test, refs) 76 | 77 | def clear(self): 78 | self.crefs = [] 79 | self.ctest = [] 80 | 81 | def cook_append(self, test, refs): 82 | '''called by constructor and __iadd__ to avoid creating new instances.''' 83 | 84 | if refs is not None: 85 | self.crefs.append(cook_refs(refs)) 86 | if test is not None: 87 | self.ctest.append(cook_test(test)) ## N.B.: -1 88 | else: 89 | self.ctest.append(None) # lens of crefs and ctest have to match 90 | 91 | def size(self): 92 | assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest)) 93 | return len(self.crefs) 94 | 95 | def __iadd__(self, other): 96 | '''add an instance (e.g., from another sentence).''' 97 | 98 | if type(other) is tuple: 99 | ## avoid creating new CiderScorer instances 100 | self.cook_append(other[0], other[1]) 101 | else: 102 | self.ctest.extend(other.ctest) 103 | self.crefs.extend(other.crefs) 104 | 105 | return self 106 | def compute_doc_freq(self): 107 | ''' 108 | Compute term frequency for reference data. 109 | This will be used to compute idf (inverse document frequency later) 110 | The term frequency is stored in the object 111 | :return: None 112 | ''' 113 | for refs in self.crefs: 114 | # refs, k ref captions of one image 115 | for ngram in set([ngram for ref in refs for (ngram,count) in ref.items()]): 116 | self.document_frequency[ngram] += 1 117 | # maxcounts[ngram] = max(maxcounts.get(ngram,0), count) 118 | 119 | def compute_cider(self): 120 | def counts2vec(cnts): 121 | """ 122 | Function maps counts of ngram to vector of tfidf weights. 123 | The function returns vec, an array of dictionary that store mapping of n-gram and tf-idf weights. 124 | The n-th entry of array denotes length of n-grams. 125 | :param cnts: 126 | :return: vec (array of dict), norm (array of float), length (int) 127 | """ 128 | vec = [defaultdict(float) for _ in range(self.n)] 129 | length = 0 130 | norm = [0.0 for _ in range(self.n)] 131 | for (ngram,term_freq) in cnts.items(): 132 | # give word count 1 if it doesn't appear in reference corpus 133 | df = np.log(max(1.0, self.document_frequency[ngram])) 134 | # ngram index 135 | n = len(ngram)-1 136 | # tf (term_freq) * idf (precomputed idf) for n-grams 137 | vec[n][ngram] = float(term_freq)*(self.ref_len - df) 138 | # compute norm for the vector. the norm will be used for 139 | # computing similarity 140 | norm[n] += pow(vec[n][ngram], 2) 141 | 142 | if n == 1: 143 | length += term_freq 144 | norm = [np.sqrt(n) for n in norm] 145 | return vec, norm, length 146 | 147 | def sim(vec_hyp, vec_ref, norm_hyp, norm_ref, length_hyp, length_ref): 148 | ''' 149 | Compute the cosine similarity of two vectors. 
150 | :param vec_hyp: array of dictionary for vector corresponding to hypothesis 151 | :param vec_ref: array of dictionary for vector corresponding to reference 152 | :param norm_hyp: array of float for vector corresponding to hypothesis 153 | :param norm_ref: array of float for vector corresponding to reference 154 | :param length_hyp: int containing length of hypothesis 155 | :param length_ref: int containing length of reference 156 | :return: array of score for each n-grams cosine similarity 157 | ''' 158 | delta = float(length_hyp - length_ref) 159 | # measure consine similarity 160 | val = np.array([0.0 for _ in range(self.n)]) 161 | for n in range(self.n): 162 | # ngram 163 | for (ngram,count) in vec_hyp[n].items(): 164 | val[n] += vec_hyp[n][ngram] * vec_ref[n][ngram] 165 | 166 | if (norm_hyp[n] != 0) and (norm_ref[n] != 0): 167 | val[n] /= (norm_hyp[n]*norm_ref[n]) 168 | 169 | assert(not math.isnan(val[n])) 170 | return val 171 | 172 | # compute log reference length 173 | if self.df_mode == "corpus": 174 | self.ref_len = np.log(float(len(self.crefs))) 175 | 176 | scores = [] 177 | for test, refs in zip(self.ctest, self.crefs): 178 | # compute vector for test captions 179 | vec, norm, length = counts2vec(test) 180 | # compute vector for ref captions 181 | score = np.array([0.0 for _ in range(self.n)]) 182 | for ref in refs: 183 | vec_ref, norm_ref, length_ref = counts2vec(ref) 184 | score += sim(vec, vec_ref, norm, norm_ref, length, length_ref) 185 | # change by vrama91 - mean of ngram scores, instead of sum 186 | score_avg = np.mean(score) 187 | # divide by number of references 188 | score_avg /= len(refs) 189 | # multiply score by 10 190 | score_avg *= 10.0 191 | # append score of an image to the score list 192 | scores.append(score_avg) 193 | return scores 194 | 195 | def compute_score(self, option=None, verbose=0): 196 | # compute idf 197 | if self.df_mode == "corpus": 198 | self.document_frequency = defaultdict(float) 199 | self.compute_doc_freq() 200 | # assert to check document frequency 201 | assert(len(self.ctest) >= max(self.document_frequency.values())) 202 | # import json for now and write the corresponding files 203 | # compute cider score 204 | score = self.compute_cider() 205 | # debug 206 | # print score 207 | return np.mean(np.array(score)), np.array(score) 208 | -------------------------------------------------------------------------------- /oscar/utils/cider/pyciderevalcap/ciderD/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /oscar/utils/cider/pyciderevalcap/ciderD/ciderD.py: -------------------------------------------------------------------------------- 1 | # Filename: ciderD.py 2 | # 3 | # Description: Describes the class to compute the CIDEr-D (Consensus-Based Image Description Evaluation) Metric 4 | # by Vedantam, Zitnick, and Parikh (http://arxiv.org/abs/1411.5726) 5 | # 6 | # Creation Date: Sun Feb 8 14:16:54 2015 7 | # 8 | # Authors: Ramakrishna Vedantam and Tsung-Yi Lin 9 | from __future__ import absolute_import 10 | from __future__ import division 11 | from __future__ import print_function 12 | 13 | from .ciderD_scorer import CiderScorer 14 | import pdb 15 | 16 | class CiderD: 17 | """ 18 | Main Class to compute the CIDEr metric 19 | 20 | """ 21 | def __init__(self, n=4, sigma=6.0, df="corpus"): 22 | # set cider to sum over 1 to 4-grams 23 | self._n = n 24 | # set the standard deviation parameter 
for gaussian penalty 25 | self._sigma = sigma 26 | # set where to compute document frequencies from 27 | self._df = df 28 | self.cider_scorer = CiderScorer(n=self._n, df_mode=self._df) 29 | 30 | def compute_score(self, gts, res): 31 | """ 32 | Main function to compute CIDEr score 33 | :param gts (dict) : ground truth captions, mapping image_id to a list of tokenized reference sentences 34 | res (list of dict) : candidate captions, each a dict with keys 'image_id' and 'caption' (a singleton list) 35 | :return: cider (float) : computed CIDEr score for the corpus 36 | """ 37 | 38 | # clear all the previous hypos and refs 39 | tmp_cider_scorer = self.cider_scorer.copy_empty() 40 | tmp_cider_scorer.clear() 41 | for res_id in res: 42 | 43 | hypo = res_id['caption'] 44 | ref = gts[res_id['image_id']] 45 | 46 | # Sanity check. 47 | assert(type(hypo) is list) 48 | assert(len(hypo) == 1) 49 | assert(type(ref) is list) 50 | assert(len(ref) > 0) 51 | tmp_cider_scorer += (hypo[0], ref) 52 | 53 | (score, scores) = tmp_cider_scorer.compute_score() 54 | 55 | return score, scores 56 | 57 | def method(self): 58 | return "CIDEr-D" 59 | -------------------------------------------------------------------------------- /oscar/utils/cider/pyciderevalcap/ciderD/ciderD_scorer.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # Tsung-Yi Lin 3 | # Ramakrishna Vedantam 4 | from __future__ import absolute_import 5 | from __future__ import division 6 | from __future__ import print_function 7 | 8 | import copy 9 | from collections import defaultdict 10 | import numpy as np 11 | import pdb 12 | import math 13 | import six 14 | from six.moves import cPickle 15 | import os 16 | 17 | def precook(s, n=4, out=False): 18 | """ 19 | Takes a string as input and returns an object that can be given to 20 | either cook_refs or cook_test. This is optional: cook_refs and cook_test 21 | can take string arguments as well. 22 | :param s: string : sentence to be converted into ngrams 23 | :param n: int : number of ngrams for which representation is calculated 24 | :return: term frequency vector for occurring ngrams 25 | """ 26 | words = s.split() 27 | counts = defaultdict(int) 28 | for k in range(1,n+1): 29 | for i in range(len(words)-k+1): 30 | ngram = tuple(words[i:i+k]) 31 | counts[ngram] += 1 32 | return counts 33 | 34 | def cook_refs(refs, n=4): ## lhuang: oracle will call with "average" 35 | '''Takes a list of reference sentences for a single segment 36 | and returns an object that encapsulates everything that BLEU 37 | needs to know about them. 38 | :param refs: list of string : reference sentences for some image 39 | :param n: int : number of ngrams for which (ngram) representation is calculated 40 | :return: result (list of dict) 41 | ''' 42 | return [precook(ref, n) for ref in refs] 43 | 44 | def cook_test(test, n=4): 45 | '''Takes a test sentence and returns an object that 46 | encapsulates everything that BLEU needs to know about it. 47 | :param test: string : hypothesis sentence for some image 48 | :param n: int : number of ngrams for which (ngram) representation is calculated 49 | :return: result (dict) 50 | ''' 51 | return precook(test, n, True) 52 | 53 | class CiderScorer(object): 54 | """CIDEr scorer.
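    CIDEr-D variant of the scorer: in ``sim`` below, hypothesis n-gram counts are clipped
    against the reference counts and a length-based Gaussian penalty controlled by
    ``sigma`` is applied. ``copy_empty()`` creates a fresh scorer that shares any
    precomputed document frequencies and reference length (when ``df_mode`` points to a
    cached file), which lets ``CiderD.compute_score`` reuse those corpus statistics
    across calls.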
55 | """ 56 | 57 | def copy(self): 58 | ''' copy the refs.''' 59 | new = CiderScorer(n=self.n) 60 | new.ctest = copy.copy(self.ctest) 61 | new.crefs = copy.copy(self.crefs) 62 | return new 63 | 64 | def copy_empty(self): 65 | new = CiderScorer(df_mode="corpus", n=self.n, sigma=self.sigma) 66 | new.df_mode = self.df_mode 67 | new.ref_len = self.ref_len 68 | new.document_frequency = self.document_frequency 69 | return new 70 | 71 | def __init__(self, df_mode="corpus", test=None, refs=None, n=4, sigma=6.0): 72 | ''' singular instance ''' 73 | self.n = n 74 | self.sigma = sigma 75 | self.crefs = [] 76 | self.ctest = [] 77 | self.df_mode = df_mode 78 | self.ref_len = None 79 | if self.df_mode != "corpus": 80 | pkl_file = cPickle.load(open(df_mode,'rb'), **(dict(encoding='latin1') if six.PY3 else {})) 81 | self.ref_len = np.log(float(pkl_file['ref_len'])) 82 | self.document_frequency = pkl_file['document_frequency'] 83 | else: 84 | self.document_frequency = None 85 | self.cook_append(test, refs) 86 | 87 | def clear(self): 88 | self.crefs = [] 89 | self.ctest = [] 90 | 91 | def cook_append(self, test, refs): 92 | '''called by constructor and __iadd__ to avoid creating new instances.''' 93 | 94 | if refs is not None: 95 | self.crefs.append(cook_refs(refs)) 96 | if test is not None: 97 | self.ctest.append(cook_test(test)) ## N.B.: -1 98 | else: 99 | self.ctest.append(None) # lens of crefs and ctest have to match 100 | 101 | def size(self): 102 | assert len(self.crefs) == len(self.ctest), "refs/test mismatch! %d<>%d" % (len(self.crefs), len(self.ctest)) 103 | return len(self.crefs) 104 | 105 | def __iadd__(self, other): 106 | '''add an instance (e.g., from another sentence).''' 107 | 108 | if type(other) is tuple: 109 | ## avoid creating new CiderScorer instances 110 | self.cook_append(other[0], other[1]) 111 | else: 112 | self.ctest.extend(other.ctest) 113 | self.crefs.extend(other.crefs) 114 | 115 | return self 116 | def compute_doc_freq(self): 117 | ''' 118 | Compute term frequency for reference data. 119 | This will be used to compute idf (inverse document frequency later) 120 | The term frequency is stored in the object 121 | :return: None 122 | ''' 123 | for refs in self.crefs: 124 | # refs, k ref captions of one image 125 | for ngram in set([ngram for ref in refs for (ngram,count) in ref.items()]): 126 | self.document_frequency[ngram] += 1 127 | # maxcounts[ngram] = max(maxcounts.get(ngram,0), count) 128 | 129 | def compute_cider(self): 130 | def counts2vec(cnts): 131 | """ 132 | Function maps counts of ngram to vector of tfidf weights. 133 | The function returns vec, an array of dictionary that store mapping of n-gram and tf-idf weights. 134 | The n-th entry of array denotes length of n-grams. 135 | :param cnts: 136 | :return: vec (array of dict), norm (array of float), length (int) 137 | """ 138 | vec = [defaultdict(float) for _ in range(self.n)] 139 | length = 0 140 | norm = [0.0 for _ in range(self.n)] 141 | for (ngram,term_freq) in cnts.items(): 142 | # give word count 1 if it doesn't appear in reference corpus 143 | df = np.log(max(1.0, self.document_frequency[ngram])) 144 | # ngram index 145 | n = len(ngram)-1 146 | # tf (term_freq) * idf (precomputed idf) for n-grams 147 | vec[n][ngram] = float(term_freq)*(self.ref_len - df) 148 | # compute norm for the vector. 
the norm will be used for computing similarity 149 | norm[n] += pow(vec[n][ngram], 2) 150 | 151 | if n == 1: 152 | length += term_freq 153 | norm = [np.sqrt(n) for n in norm] 154 | return vec, norm, length 155 | 156 | def sim(vec_hyp, vec_ref, norm_hyp, norm_ref, length_hyp, length_ref): 157 | ''' 158 | Compute the cosine similarity of two vectors. 159 | :param vec_hyp: array of dictionary for vector corresponding to hypothesis 160 | :param vec_ref: array of dictionary for vector corresponding to reference 161 | :param norm_hyp: array of float for vector corresponding to hypothesis 162 | :param norm_ref: array of float for vector corresponding to reference 163 | :param length_hyp: int containing length of hypothesis 164 | :param length_ref: int containing length of reference 165 | :return: array of score for each n-grams cosine similarity 166 | ''' 167 | delta = float(length_hyp - length_ref) 168 | # measure consine similarity 169 | val = np.array([0.0 for _ in range(self.n)]) 170 | for n in range(self.n): 171 | # ngram 172 | for (ngram,count) in vec_hyp[n].items(): 173 | # vrama91 : added clipping 174 | val[n] += min(vec_hyp[n][ngram], vec_ref[n][ngram]) * vec_ref[n][ngram] 175 | 176 | if (norm_hyp[n] != 0) and (norm_ref[n] != 0): 177 | val[n] /= (norm_hyp[n]*norm_ref[n]) 178 | 179 | assert(not math.isnan(val[n])) 180 | # vrama91: added a length based gaussian penalty 181 | val[n] *= np.e**(-(delta**2)/(2*self.sigma**2)) 182 | return val 183 | 184 | # compute log reference length 185 | if self.df_mode == "corpus": 186 | self.ref_len = np.log(float(len(self.crefs))) 187 | #elif self.df_mode == "coco-val-df": 188 | # if coco option selected, use length of coco-val set 189 | # self.ref_len = np.log(float(40504)) 190 | 191 | scores = [] 192 | for test, refs in zip(self.ctest, self.crefs): 193 | # compute vector for test captions 194 | vec, norm, length = counts2vec(test) 195 | # compute vector for ref captions 196 | score = np.array([0.0 for _ in range(self.n)]) 197 | for ref in refs: 198 | vec_ref, norm_ref, length_ref = counts2vec(ref) 199 | score += sim(vec, vec_ref, norm, norm_ref, length, length_ref) 200 | # change by vrama91 - mean of ngram scores, instead of sum 201 | score_avg = np.mean(score) 202 | # divide by number of references 203 | score_avg /= len(refs) 204 | # multiply score by 10 205 | score_avg *= 10.0 206 | # append score of an image to the score list 207 | scores.append(score_avg) 208 | return scores 209 | 210 | def compute_score(self, option=None, verbose=0): 211 | # compute idf 212 | if self.df_mode == "corpus": 213 | self.document_frequency = defaultdict(float) 214 | self.compute_doc_freq() 215 | # assert to check document frequency 216 | assert(len(self.ctest) >= max(self.document_frequency.values())) 217 | # import json for now and write the corresponding files 218 | # compute cider score 219 | score = self.compute_cider() 220 | # debug 221 | # print score 222 | return np.mean(np.array(score)), np.array(score) 223 | -------------------------------------------------------------------------------- /oscar/utils/logger.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Microsoft Corporation. Licensed under the MIT license. 2 | 3 | import logging 4 | from logging import StreamHandler, Handler, getLevelName 5 | import os 6 | import sys 7 | 8 | 9 | # this class is a copy of logging.FileHandler except we end self.close() 10 | # at the end of each emit. 
While closing file and reopening file after each 11 | # write is not efficient, it allows us to see partial logs when writing to 12 | # fused Azure blobs, which is very convenient 13 | class FileHandler(StreamHandler): 14 | """ 15 | A handler class which writes formatted logging records to disk files. 16 | """ 17 | def __init__(self, filename, mode='a', encoding=None, delay=False): 18 | """ 19 | Open the specified file and use it as the stream for logging. 20 | """ 21 | # Issue #27493: add support for Path objects to be passed in 22 | filename = os.fspath(filename) 23 | #keep the absolute path, otherwise derived classes which use this 24 | #may come a cropper when the current directory changes 25 | self.baseFilename = os.path.abspath(filename) 26 | self.mode = mode 27 | self.encoding = encoding 28 | self.delay = delay 29 | if delay: 30 | #We don't open the stream, but we still need to call the 31 | #Handler constructor to set level, formatter, lock etc. 32 | Handler.__init__(self) 33 | self.stream = None 34 | else: 35 | StreamHandler.__init__(self, self._open()) 36 | 37 | def close(self): 38 | """ 39 | Closes the stream. 40 | """ 41 | self.acquire() 42 | try: 43 | try: 44 | if self.stream: 45 | try: 46 | self.flush() 47 | finally: 48 | stream = self.stream 49 | self.stream = None 50 | if hasattr(stream, "close"): 51 | stream.close() 52 | finally: 53 | # Issue #19523: call unconditionally to 54 | # prevent a handler leak when delay is set 55 | StreamHandler.close(self) 56 | finally: 57 | self.release() 58 | 59 | def _open(self): 60 | """ 61 | Open the current base file with the (original) mode and encoding. 62 | Return the resulting stream. 63 | """ 64 | return open(self.baseFilename, self.mode, encoding=self.encoding) 65 | 66 | def emit(self, record): 67 | """ 68 | Emit a record. 69 | 70 | If the stream was not opened because 'delay' was specified in the 71 | constructor, open it before calling the superclass's emit. 72 | """ 73 | if self.stream is None: 74 | self.stream = self._open() 75 | StreamHandler.emit(self, record) 76 | self.close() 77 | 78 | def __repr__(self): 79 | level = getLevelName(self.level) 80 | return '<%s %s (%s)>' % (self.__class__.__name__, self.baseFilename, level) 81 | 82 | 83 | def setup_logger(name, save_dir, distributed_rank, filename="log.txt"): 84 | logger = logging.getLogger(name) 85 | logger.setLevel(logging.DEBUG) 86 | # don't log results for the non-master process 87 | if distributed_rank > 0: 88 | return logger 89 | ch = logging.StreamHandler(stream=sys.stdout) 90 | ch.setLevel(logging.DEBUG) 91 | formatter = logging.Formatter("%(asctime)s %(name)s %(levelname)s: %(message)s") 92 | ch.setFormatter(formatter) 93 | logger.addHandler(ch) 94 | 95 | if save_dir: 96 | fh = FileHandler(os.path.join(save_dir, filename)) 97 | fh.setLevel(logging.DEBUG) 98 | fh.setFormatter(formatter) 99 | logger.addHandler(fh) 100 | 101 | return logger 102 | 103 | -------------------------------------------------------------------------------- /oscar/utils/metric_logger.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from collections import defaultdict 3 | from collections import deque 4 | import os 5 | 6 | import torch 7 | 8 | from .misc import is_main_process 9 | 10 | 11 | class SmoothedValue(object): 12 | """Track a series of values and provide access to smoothed values over a 13 | window or the global series average. 
14 | """ 15 | 16 | def __init__(self, window_size=20): 17 | self.deque = deque(maxlen=window_size) 18 | # self.series = [] 19 | self.total = 0.0 20 | self.count = 0 21 | 22 | def update(self, value): 23 | self.deque.append(value) 24 | # self.series.append(value) 25 | self.count += 1 26 | self.total += value 27 | 28 | @property 29 | def median(self): 30 | d = torch.tensor(list(self.deque)) 31 | return d.median().item() 32 | 33 | @property 34 | def avg(self): 35 | d = torch.tensor(list(self.deque)) 36 | return d.mean().item() 37 | 38 | @property 39 | def global_avg(self): 40 | return self.total / self.count 41 | 42 | @property 43 | def last_value(self): 44 | return self.deque[-1] 45 | 46 | 47 | class MetricLogger(object): 48 | def __init__(self, delimiter="\t"): 49 | self.meters = {} 50 | self.params = {} 51 | self.delimiter = delimiter 52 | 53 | def update_params(self, update_dict): 54 | for param_group, group_dict in update_dict.items(): 55 | if param_group not in self.params: 56 | self.params[param_group] = {} 57 | for param_name, param_value in group_dict.items(): 58 | # skipping parameters if they start with '_' 59 | if param_name.startswith('_'): 60 | continue 61 | if isinstance(param_value, torch.Tensor): 62 | param_value = param_value.item() 63 | assert isinstance(param_value, (float, int)) 64 | self.params[param_group][param_name] = param_value 65 | 66 | def update_metrics(self, update_dict): 67 | for metric_group, group_dict in update_dict.items(): 68 | if metric_group not in self.meters: 69 | self.meters[metric_group] = defaultdict(SmoothedValue) 70 | for metric_name, metric_value in group_dict.items(): 71 | # skipping metrics if they start with '_' 72 | if metric_name.startswith('_'): 73 | continue 74 | if isinstance(metric_value, torch.Tensor): 75 | metric_value = metric_value.item() 76 | assert isinstance(metric_value, (float, int)) 77 | self.meters[metric_group][metric_name].update(metric_value) 78 | 79 | def get_logs(self, iteration): 80 | return_str = [] 81 | if len(self.meters) > 0: 82 | offset_m = max([len(group_name) for group_name in self.meters.keys()]) 83 | else: 84 | offset_m = 0 85 | if len(self.params) > 0: 86 | offset_p = max([len(group_name) for group_name in self.params.keys()]) 87 | else: 88 | offset_p = 0 89 | offset = max(offset_m, offset_p) 90 | 91 | for group_name, values in sorted(self.meters.items(), 92 | key=lambda x: x[0]): 93 | loss_str = [] 94 | for name, meter in values.items(): 95 | loss_str.append("{}: {:.4f} ({:.4f})".format( 96 | name, meter.median, meter.global_avg, 97 | )) 98 | return_str.append( 99 | "{:{offset}s} - {}".format( 100 | group_name, self.delimiter.join(loss_str), offset=offset, 101 | ), 102 | ) 103 | for group_name, values in self.params.items(): 104 | loss_str = [] 105 | for name, param in values.items(): 106 | loss_str.append("{}: {:.6f}".format(name, param)) 107 | return_str.append( 108 | "{:{offset}s} - {}".format( 109 | group_name, self.delimiter.join(loss_str), offset=offset, 110 | ), 111 | ) 112 | return "\n ".join(return_str) 113 | 114 | 115 | class TensorboardLogger(MetricLogger): 116 | def __init__(self, 117 | log_dir, 118 | delimiter='\t'): 119 | super(TensorboardLogger, self).__init__(delimiter) 120 | try: 121 | from tensorboardX import SummaryWriter 122 | except ImportError: 123 | raise ImportError( 124 | 'To use tensorboard please install tensorboardX ' 125 | '[ pip install tensorboardx ].' 
126 | ) 127 | self.philly_tb_logger = None 128 | self.philly_tb_logger_avg = None 129 | self.philly_tb_logger_med = None 130 | if is_main_process(): 131 | self.tb_logger = SummaryWriter(log_dir) 132 | self.tb_logger_avg = SummaryWriter(os.path.join(log_dir, 'avg')) 133 | self.tb_logger_med = SummaryWriter(os.path.join(log_dir, 'med')) 134 | else: 135 | self.tb_logger = None 136 | self.tb_logger_avg = None 137 | self.tb_logger_med = None 138 | 139 | def get_logs(self, iteration): 140 | if self.tb_logger: 141 | for group_name, values in self.meters.items(): 142 | for name, meter in values.items(): 143 | self.tb_logger.add_scalar( 144 | '{}/{}'.format(group_name, name), 145 | meter.last_value, iteration, 146 | ) 147 | self.tb_logger_avg.add_scalar( 148 | '{}/{}'.format(group_name, name), 149 | meter.avg, iteration, 150 | ) 151 | self.tb_logger_med.add_scalar( 152 | '{}/{}'.format(group_name, name), 153 | meter.median, iteration, 154 | ) 155 | if self.philly_tb_logger: 156 | self.philly_tb_logger.add_scalar( 157 | '{}/{}'.format(group_name, name), 158 | meter.last_value, iteration, 159 | ) 160 | self.philly_tb_logger_avg.add_scalar( 161 | '{}/{}'.format(group_name, name), 162 | meter.avg, iteration, 163 | ) 164 | self.philly_tb_logger_med.add_scalar( 165 | '{}/{}'.format(group_name, name), 166 | meter.median, iteration, 167 | ) 168 | for group_name, values in self.params.items(): 169 | for name, param in values.items(): 170 | self.tb_logger.add_scalar( 171 | '{}/{}'.format(group_name, name), 172 | param, iteration, 173 | ) 174 | if self.philly_tb_logger: 175 | self.philly_tb_logger.add_scalar( 176 | '{}/{}'.format(group_name, name), 177 | param, iteration, 178 | ) 179 | return super(TensorboardLogger, self).get_logs(iteration) 180 | 181 | def close(self): 182 | if is_main_process(): 183 | self.tb_logger.close() 184 | self.tb_logger_avg.close() 185 | self.tb_logger_med.close() 186 | -------------------------------------------------------------------------------- /oscar/utils/misc.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Microsoft Corporation. Licensed under the MIT license. 2 | 3 | import errno 4 | import os 5 | import os.path as op 6 | import yaml 7 | import random 8 | import torch 9 | import numpy as np 10 | import torch.distributed as dist 11 | 12 | 13 | def mkdir(path): 14 | # if it is the current folder, skip. 
15 | if path == '': 16 | return 17 | try: 18 | os.makedirs(path) 19 | except OSError as e: 20 | if e.errno != errno.EEXIST: 21 | raise 22 | 23 | 24 | def set_seed(seed, n_gpu): 25 | random.seed(seed) 26 | np.random.seed(seed) 27 | torch.manual_seed(seed) 28 | if n_gpu > 0: 29 | torch.cuda.manual_seed_all(seed) 30 | 31 | 32 | def load_from_yaml_file(yaml_file): 33 | with open(yaml_file, 'r') as fp: 34 | return yaml.load(fp) 35 | 36 | 37 | def find_file_path_in_yaml(fname, root): 38 | if fname is not None: 39 | if op.isfile(fname): 40 | return fname 41 | elif op.isfile(op.join(root, fname)): 42 | return op.join(root, fname) 43 | else: 44 | raise FileNotFoundError( 45 | errno.ENOENT, os.strerror(errno.ENOENT), op.join(root, fname) 46 | ) 47 | 48 | 49 | def get_rank(): 50 | if not dist.is_available(): 51 | return 0 52 | if not dist.is_initialized(): 53 | return 0 54 | return dist.get_rank() 55 | 56 | 57 | def is_main_process(): 58 | return get_rank() == 0 59 | 60 | 61 | def get_world_size(): 62 | if not dist.is_available(): 63 | return 1 64 | if not dist.is_initialized(): 65 | return 1 66 | return dist.get_world_size() 67 | -------------------------------------------------------------------------------- /oscar/utils/task_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Microsoft Corporation. Licensed under the MIT license. 2 | 3 | from __future__ import absolute_import, division, print_function 4 | 5 | import csv, json 6 | import logging 7 | import os 8 | import sys 9 | from io import open 10 | import _pickle as cPickle 11 | import torch 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | class InputInstance(object): 17 | """A single training/test example for simple sequence classification.""" 18 | 19 | def __init__(self, guid, text_a, text_b=None, label=None, score=None, img_key=None, q_id=None): 20 | """Constructs a InputExample. 21 | 22 | Args: 23 | guid: Unique id for the example. 24 | text_a: string. The untokenized text of the first sequence. For single 25 | sequence tasks, only this sequence must be specified. 26 | text_b: (Optional) string. The untokenized text of the second sequence. 27 | Only must be specified for sequence pair tasks. 28 | label: (Optional) string. The label of the example. This should be 29 | specified for train and dev examples, but not for test examples. 
30 | """ 31 | 32 | self.guid = guid 33 | self.text_a = text_a 34 | self.text_b = text_b 35 | self.label = label 36 | self.score = score 37 | self.img_key = img_key 38 | self.q_id = q_id 39 | 40 | 41 | class InputFeat(object): 42 | """A single set of features of data.""" 43 | 44 | def __init__(self, input_ids, input_mask, segment_ids, label_id, score, img_feat): 45 | self.input_ids = input_ids 46 | self.input_mask = input_mask 47 | self.segment_ids = segment_ids 48 | self.label_id = label_id 49 | self.score = score 50 | self.img_feat = img_feat 51 | 52 | 53 | class DataProcessor(object): 54 | """Base class for data converters for sequence classification data sets.""" 55 | 56 | def get_train_examples(self, data_dir): 57 | """Gets a collection of `InputExample`s for the train set.""" 58 | raise NotImplementedError() 59 | 60 | def get_dev_examples(self, data_dir): 61 | """Gets a collection of `InputExample`s for the dev set.""" 62 | raise NotImplementedError() 63 | 64 | def get_labels(self): 65 | """Gets the list of labels for this data set.""" 66 | raise NotImplementedError() 67 | 68 | @classmethod 69 | def _read_tsv(cls, input_file, quotechar=None): 70 | """Reads a tab separated value file.""" 71 | with open(input_file, "r", encoding="utf-8-sig") as f: 72 | reader = csv.reader(f, delimiter="\t", quotechar=quotechar) 73 | lines = [] 74 | for line in reader: 75 | if sys.version_info[0] == 2: 76 | line = list(unicode(cell, 'utf-8') for cell in line) 77 | lines.append(line) 78 | return lines 79 | 80 | 81 | class VQATextProcessor(DataProcessor): 82 | """ Processor for the VQA Text data set. """ 83 | 84 | def get_train_examples(self, data_dir, file_name='train2014_qla.json'): 85 | """ See base class.""" 86 | 87 | lines = json.load(open(os.path.join(data_dir, file_name))) 88 | return self._create_examples(lines, "train") 89 | 90 | #return self._create_examples(self._read_tsv(os.path.join(data_dir, "train2014_qla.tsv")), "train") 91 | 92 | def get_dev_examples(self, data_dir, file_name='val2014_qla.json'): 93 | """ See base class.""" 94 | 95 | lines = json.load(open(os.path.join(data_dir, file_name))) 96 | return self._create_examples(lines, "dev") 97 | 98 | #return self._create_examples(self._read_tsv(os.path.join(data_dir, "val2014_qla.tsv")), "dev") 99 | 100 | def get_test_examples(self, data_dir, file_name='test2015_qla.json'): 101 | """ See base class.""" 102 | 103 | lines = json.load(open(os.path.join(data_dir, file_name))) 104 | return self._create_examples(lines, "test") 105 | 106 | def get_labels(self, label_file): 107 | """ See base class.""" 108 | 109 | ans2label = cPickle.load(open(label_file, 'rb')) 110 | return list(ans2label.values()) 111 | #return ["entailment", "not_entailment"] 112 | 113 | def _create_examples(self, lines, set_type): 114 | """Creates examples for the training and dev sets.""" 115 | 116 | examples = [] 117 | for (i, line) in enumerate(lines): 118 | if set_type!='test' and len(line['an']) == 0: continue 119 | 120 | guid = "%s-%s" % (set_type, str(i)) 121 | text_a = line['q'] 122 | text_b = line['o'].replace(';', ' ').strip() #line['o'] 123 | label = None if set_type.startswith('test') else line['an'] 124 | score = None if set_type.startswith('test') else line['s'] 125 | img_key = line['img_id'] 126 | q_id = int(line['q_id']) if set_type.startswith('test') else 0 127 | examples.append(InputInstance(guid=guid, text_a=text_a, text_b=text_b, label=label, score=score, img_key=img_key, q_id=q_id)) 128 | return examples 129 | 130 | class VQATextAProcessor(DataProcessor): 
131 | """ Processor for the VQA Text data set. """ 132 | 133 | def get_train_examples(self, data_dir, file_name='train2014_qla.json'): 134 | """ See base class.""" 135 | 136 | lines = json.load(open(os.path.join(data_dir, file_name))) 137 | return self._create_examples(lines, "train") 138 | 139 | #return self._create_examples(self._read_tsv(os.path.join(data_dir, "train2014_qla.tsv")), "train") 140 | 141 | def get_dev_examples(self, data_dir, file_name='val2014_qla.json'): 142 | """ See base class.""" 143 | 144 | lines = json.load(open(os.path.join(data_dir, file_name))) 145 | return self._create_examples(lines, "dev") 146 | 147 | #return self._create_examples(self._read_tsv(os.path.join(data_dir, "val2014_qla.tsv")), "dev") 148 | 149 | def get_test_examples(self, data_dir, file_name='test2015_qla.json'): 150 | """ See base class.""" 151 | 152 | lines = json.load(open(os.path.join(data_dir, file_name))) 153 | return self._create_examples(lines, "test") 154 | 155 | def get_labels(self, label_file): 156 | """ See base class.""" 157 | 158 | ans2label = cPickle.load(open(label_file, 'rb')) 159 | return list(ans2label.values()) 160 | 161 | def _create_examples(self, lines, set_type): 162 | """Creates examples for the training and dev sets.""" 163 | 164 | examples = [] 165 | for (i, line) in enumerate(lines): 166 | if set_type!='test' and len(line['an']) == 0: continue 167 | 168 | guid = "%s-%s" % (set_type, str(i)) 169 | text_a = line['q'] 170 | text_b = None # line['o'] # or None 171 | label = None if set_type.startswith('test') else line['an'] 172 | score = None if set_type.startswith('test') else line['s'] 173 | img_key = line['img_id'] 174 | q_id = int(line['q_id']) if set_type.startswith('test') else 0 175 | examples.append(InputInstance(guid=guid, text_a=text_a, text_b=text_b, label=label, score=score, img_key=img_key, q_id=q_id)) 176 | return examples 177 | 178 | class GQAProcessor(DataProcessor): 179 | """ Processor for the GQA data set. 
""" 180 | 181 | def get_train_examples(self, data_dir, file_name='train2014_qla.json'): 182 | """ See base class.""" 183 | 184 | lines = json.load(open(os.path.join(data_dir, file_name))) 185 | return self._create_examples(lines, "train") 186 | 187 | #return self._create_examples(self._read_tsv(os.path.join(data_dir, "train2014_qla.tsv")), "train") 188 | 189 | def get_dev_examples(self, data_dir, file_name='val2014_qla.json'): 190 | """ See base class.""" 191 | 192 | lines = json.load(open(os.path.join(data_dir, file_name))) 193 | return self._create_examples(lines, "dev") 194 | 195 | #return self._create_examples(self._read_tsv(os.path.join(data_dir, "val2014_qla.tsv")), "dev") 196 | 197 | def get_test_examples(self, data_dir, file_name='test2015_qla.json'): 198 | """ See base class.""" 199 | 200 | lines = json.load(open(os.path.join(data_dir, file_name))) 201 | return self._create_examples(lines, "test") 202 | 203 | def get_labels(self, label_file='trainval_testdev_all_ans2label.pkl'): 204 | """ See base class.""" 205 | 206 | ans2label = cPickle.load(open(label_file, 'rb')) 207 | return list(ans2label.values()) 208 | 209 | def _create_examples(self, lines, set_type): 210 | """Creates examples for the training and dev sets.""" 211 | 212 | examples = [] 213 | for (i, line) in enumerate(lines): 214 | if set_type!='test' and len(line['an']) == 0: continue 215 | 216 | guid = "%s-%s" % (set_type, str(i)) 217 | text_a = line['q'] 218 | text_b = line['o'] # or None 219 | label = None if set_type.startswith('test') else line['an'] 220 | score = 0 221 | img_key = line['img_id'] 222 | q_id = int(line['q_id']) if set_type.startswith('test') else 0 223 | examples.append(InputInstance(guid=guid, text_a=text_a, text_b=text_b, label=label, score=score, img_key=img_key, q_id=q_id)) 224 | return examples 225 | 226 | class NLVRProcessor(DataProcessor): 227 | """ Processor for the NLVR data set. """ 228 | 229 | def get_train_examples(self, data_dir, use_label_seq=True, file_name='nlvr2_train.json'): 230 | """ See base class.""" 231 | 232 | lines = json.load(open(os.path.join(data_dir, file_name))) 233 | return self._create_examples(lines, "train", use_label_seq) 234 | 235 | #return self._create_examples(self._read_tsv(os.path.join(data_dir, "train2014_qla.tsv")), "train") 236 | 237 | def get_dev_examples(self, data_dir, use_label_seq=True, file_name='nlvr2_dev.json'): 238 | """ See base class.""" 239 | 240 | lines = json.load(open(os.path.join(data_dir, file_name))) 241 | return self._create_examples(lines, "dev", use_label_seq) 242 | 243 | #return self._create_examples(self._read_tsv(os.path.join(data_dir, "val2014_qla.tsv")), "dev") 244 | 245 | def get_test_examples(self, data_dir, use_label_seq=True, file_name='nlvr2_test1.json'): 246 | """ See base class.""" 247 | 248 | lines = json.load(open(os.path.join(data_dir, file_name))) 249 | return self._create_examples(lines, "test", use_label_seq) 250 | 251 | def get_labels(self, label_file=None): 252 | """ See base class.""" 253 | 254 | #ans2label = cPickle.load(open(label_file, 'rb')) 255 | #return list(ans2label.values()) 256 | return [0, 1] 257 | 258 | def _create_examples(self, lines, set_type, use_label_seq=True): 259 | """ Creates examples for the training and dev sets. 
""" 260 | 261 | examples = [] 262 | for (i, line) in enumerate(lines): 263 | guid = "%s-%s" % (set_type, str(i)) 264 | text_a = line['q'] 265 | text_b = line['o'] if use_label_seq else None 266 | label = line['label'] #None if set_type.startswith('test') else line['label'] 267 | score = 0 268 | img_key = line['img_id'] #[line['img_left'], line['img_left']] 269 | q_id = 0 #int(line['q_id']) if set_type.startswith('test') else 0 270 | examples.append(InputInstance(guid=guid, text_a=text_a, text_b=text_b, label=label, score=score, img_key=img_key, q_id=q_id)) 271 | return examples 272 | 273 | class VCR_Q_A_Processor(DataProcessor): 274 | """ Processor for the VCR (q -> a) (Det) data set. """ 275 | 276 | def get_train_examples(self, data_dir, file_name='vcr_train.json'): 277 | """ See base class.""" 278 | 279 | lines = json.load(open(os.path.join(data_dir, file_name))) 280 | return self._create_examples(lines, "train") 281 | 282 | def get_dev_examples(self, data_dir, file_name='vcr_val.json'): 283 | """ See base class.""" 284 | 285 | lines = json.load(open(os.path.join(data_dir, file_name))) 286 | return self._create_examples(lines, "dev") 287 | 288 | def get_test_examples(self, data_dir, file_name='vcr_test.json'): 289 | """ See base class.""" 290 | 291 | lines = json.load(open(os.path.join(data_dir, file_name))) 292 | return self._create_examples(lines, "test") 293 | 294 | def get_labels(self, label_file=None): 295 | """ See base class.""" 296 | 297 | #ans2label = cPickle.load(open(label_file, 'rb')) 298 | #return list(ans2label.values()) 299 | return [0, 1] 300 | 301 | def _create_examples(self, lines, set_type): 302 | """ Creates examples for the training and dev sets. """ 303 | 304 | examples = [] 305 | for (i, line) in enumerate(lines): 306 | #if set_type!='test': continue 307 | 308 | guid = "%s-%s" % (set_type, str(i)) 309 | text_a = line['q'] # question 310 | choices = line['choices'] 311 | label = None if set_type.startswith('test') else line['label'] 312 | img_key = line['img_id'] 313 | q_id = int(line['annot_id'].split('-')[-1]) #int(line['q_id']) if set_type.startswith('test') else 0 314 | score = line['objects'] if 'objects' in line else None 315 | examples.append(InputInstance(guid=guid, text_a=text_a, text_b=choices, label=label, score=score, img_key=img_key, q_id=q_id)) 316 | return examples 317 | 318 | class VCR_QA_R_Processor(DataProcessor): 319 | """ Processor for the VCR (qa -> r) QA_R data set. """ 320 | 321 | def get_train_examples(self, data_dir, file_name='vcr_train.json'): 322 | """ See base class.""" 323 | 324 | lines = json.load(open(os.path.join(data_dir, file_name))) 325 | return self._create_examples(lines, "train") 326 | 327 | def get_dev_examples(self, data_dir, file_name='vcr_val.json'): 328 | """ See base class.""" 329 | 330 | lines = json.load(open(os.path.join(data_dir, file_name))) 331 | return self._create_examples(lines, "dev") 332 | 333 | def get_test_examples(self, data_dir, file_name='vcr_test.json'): 334 | """ See base class.""" 335 | 336 | lines = json.load(open(os.path.join(data_dir, file_name))) 337 | return self._create_examples(lines, "test") 338 | 339 | def get_labels(self, label_file=None): 340 | """ See base class.""" 341 | 342 | #ans2label = cPickle.load(open(label_file, 'rb')) 343 | #return list(ans2label.values()) 344 | return [0, 1] 345 | 346 | def _create_examples(self, lines, set_type): 347 | """ Creates examples for the training and dev sets. 
""" 348 | 349 | examples = [] 350 | for (i, line) in enumerate(lines): 351 | #if set_type!='test': continue 352 | 353 | guid = "%s-%s" % (set_type, str(i)) 354 | text_a = line['q'] + ' ' + line['choices'][line['label']] # question_choice 355 | choices = line['rational_choices'] # rational_choice 356 | label = None if set_type.startswith('test') else line['rational_label'] # rational_label 357 | img_key = line['img_id'] 358 | q_id = int(line['annot_id'].split('-')[-1]) #int(line['q_id']) if set_type.startswith('test') else 0 359 | examples.append(InputInstance(guid=guid, text_a=text_a, text_b=choices, label=label, score=None, img_key=img_key, q_id=q_id)) 360 | return examples 361 | 362 | class VCR_QAR_Processor(DataProcessor): 363 | """ Processor for the VCR (q->a, qa->r) data set. """ 364 | 365 | def get_train_examples(self, data_dir, file_name='vcr_train.json'): 366 | """ See base class.""" 367 | 368 | lines = json.load(open(os.path.join(data_dir, file_name))) 369 | return self._create_examples(lines, "train") 370 | 371 | def get_dev_examples(self, data_dir, file_name='vcr_val.json'): 372 | """ See base class.""" 373 | 374 | lines = json.load(open(os.path.join(data_dir, file_name))) 375 | return self._create_examples(lines, "dev") 376 | 377 | def get_test_examples(self, data_dir, file_name='vcr_test.json'): 378 | """ See base class.""" 379 | 380 | lines = json.load(open(os.path.join(data_dir, file_name))) 381 | return self._create_examples(lines, "test") 382 | 383 | def get_labels(self, label_file=None): 384 | """ See base class.""" 385 | 386 | #ans2label = cPickle.load(open(label_file, 'rb')) 387 | #return list(ans2label.values()) 388 | return [0, 1] 389 | 390 | def _create_examples(self, lines, set_type): 391 | """ Creates examples for the training and dev sets. 
""" 392 | 393 | examples = [] 394 | for (i, line) in enumerate(lines): 395 | #if set_type!='test': continue 396 | 397 | guid = "%s-%s-q-a" % (set_type, str(i)) 398 | text_a = line['q'] # question 399 | choices = line['choices'] 400 | label = None if set_type.startswith('test') else line['label'] 401 | img_key = line['img_id'] 402 | q_id = int(line['annot_id'].split('-')[-1]) #int(line['q_id']) if set_type.startswith('test') else 0 403 | score = line['objects'] if 'objects' in line else None 404 | examples.append(InputInstance(guid=guid, text_a=text_a, text_b=choices, label=label, score=score, img_key=img_key, q_id=q_id)) 405 | 406 | if set_type == 'train': # qa -> r 407 | guid = "%s-%s-qa-r" % (set_type, str(i)) 408 | text_a = line['q'] + ' ' + line['choices'][line['label']] # question_choice 409 | choices = line['rational_choices'] # rational_choice 410 | label = None if set_type.startswith('test') else line['rational_label'] # rational_label 411 | img_key = line['img_id'] 412 | q_id = int(line['annot_id'].split('-')[-1]) # int(line['q_id']) if set_type.startswith('test') else 0 413 | score = line['objects'] if 'objects' in line else None 414 | examples.append(InputInstance(guid=guid, text_a=text_a, text_b=choices, label=label, score=score, img_key=img_key, q_id=q_id)) 415 | return examples 416 | 417 | 418 | def convert_examples_to_features_vqa(examples, img_feats, label_list, max_img_seq_length, max_seq_length, 419 | tokenizer, output_mode, 420 | cls_token_at_end=False, pad_on_left=False, 421 | cls_token='[CLS]', sep_token='[SEP]', pad_token=0, 422 | sequence_a_segment_id=0, sequence_b_segment_id=1, 423 | cls_token_segment_id=1, pad_token_segment_id=0, 424 | mask_padding_with_zero=True): 425 | """ Loads a data file into a list of `InputBatch`s 426 | `cls_token_at_end` define the location of the CLS token: 427 | - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP] 428 | - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS] 429 | `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet) 430 | """ 431 | 432 | label_map = {label:i for i, label in enumerate(label_list)} 433 | 434 | features = [] 435 | #debug: 436 | debug_size = 500 437 | 438 | for (ex_index, example) in enumerate(examples[0: ]): 439 | if len(example.label) == 0: continue 440 | if ex_index % 10000 == 0: 441 | logger.info("Writing example %d of %d" % (ex_index, len(examples))) 442 | 443 | tokens_a = tokenizer.tokenize(example.text_a) 444 | 445 | tokens_b = None 446 | if example.text_b: 447 | tokens_b = tokenizer.tokenize(example.text_b) 448 | # Modifies `tokens_a` and `tokens_b` in place so that the total 449 | # length is less than the specified length. 450 | # Account for [CLS], [SEP], [SEP] with "- 3" 451 | _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) 452 | else: 453 | # Account for [CLS] and [SEP] with "- 2" 454 | if len(tokens_a) > max_seq_length - 2: 455 | tokens_a = tokens_a[:(max_seq_length - 2)] 456 | 457 | # The convention in BERT is: 458 | # (a) For sequence pairs: 459 | # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] 460 | # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 461 | # (b) For single sequences: 462 | # tokens: [CLS] the dog is hairy . [SEP] 463 | # type_ids: 0 0 0 0 0 0 0 464 | # 465 | # Where "type_ids" are used to indicate whether this is the first 466 | # sequence or the second sequence. 
The embedding vectors for `type=0` and 467 | # `type=1` were learned during pre-training and are added to the wordpiece 468 | # embedding vector (and position vector). This is not *strictly* necessary 469 | # since the [SEP] token unambiguously separates the sequences, but it makes 470 | # it easier for the model to learn the concept of sequences. 471 | # 472 | # For classification tasks, the first vector (corresponding to [CLS]) is 473 | # used as as the "sentence vector". Note that this only makes sense because 474 | # the entire model is fine-tuned. 475 | tokens = tokens_a + [sep_token] 476 | segment_ids = [sequence_a_segment_id] * len(tokens) 477 | 478 | if tokens_b: 479 | tokens += tokens_b + [sep_token] 480 | segment_ids += [sequence_b_segment_id] * (len(tokens_b) + 1) 481 | 482 | if cls_token_at_end: 483 | tokens = tokens + [cls_token] 484 | segment_ids = segment_ids + [cls_token_segment_id] 485 | else: 486 | tokens = [cls_token] + tokens 487 | segment_ids = [cls_token_segment_id] + segment_ids 488 | 489 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 490 | 491 | # The mask has 1 for real tokens and 0 for padding tokens. Only real 492 | # tokens are attended to. 493 | input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids) 494 | 495 | # Zero-pad up to the sequence length. 496 | padding_length = max_seq_length - len(input_ids) 497 | if pad_on_left: 498 | input_ids = ([pad_token] * padding_length) + input_ids 499 | input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask 500 | segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids 501 | else: 502 | input_ids = input_ids + ([pad_token] * padding_length) 503 | input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length) 504 | segment_ids = segment_ids + ([pad_token_segment_id] * padding_length) 505 | 506 | assert len(input_ids) == max_seq_length 507 | assert len(input_mask) == max_seq_length 508 | assert len(segment_ids) == max_seq_length 509 | 510 | # image features 511 | #img_feat = img_feats[example.img_key] # torch 512 | img_feat = img_feats.item().get(example.img_key) # numpy 513 | if img_feat.shape[0] > max_img_seq_length: 514 | img_feat = img_feat[0:max_img_seq_length, ] 515 | if max_img_seq_length > 0: 516 | input_mask = input_mask + [1 if mask_padding_with_zero else 0] * img_feat.shape[0] 517 | #segment_ids += [sequence_b_segment_id] * img_feat.shape[0] 518 | else: 519 | if max_img_seq_length > 0: 520 | input_mask = input_mask + [1 if mask_padding_with_zero else 0] * img_feat.shape[0] 521 | #segment_ids = segment_ids + [sequence_b_segment_id] * img_feat.shape[0] 522 | padding_matrix = torch.zeros((max_img_seq_length - img_feat.shape[0], img_feat.shape[1])) 523 | img_feat = torch.cat((img_feat, padding_matrix), 0) 524 | if max_img_seq_length > 0: 525 | input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_matrix.shape[0]) 526 | #segment_ids = segment_ids + [pad_token_segment_id] * padding_matrix.shape[0] 527 | 528 | if output_mode == "classification": 529 | label_id = [label_map[l] for l in example.label] 530 | score = example.score 531 | elif output_mode == "regression": 532 | label_id = float(example.label) 533 | else: 534 | raise KeyError(output_mode) 535 | 536 | if ex_index < 5: 537 | logger.info("*** Example ***") 538 | logger.info("guid: %s" % (example.guid)) 539 | logger.info("tokens: %s" % " ".join([str(x) for x in tokens])) 540 | logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) 541 | 
logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) 542 | logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) 543 | logger.info("label: %s (id = %s)" % (example.label, label_id)) 544 | logger.info("score: %s (score = %s)" % (example.score, score)) 545 | 546 | features.append(InputFeat(input_ids=input_ids, input_mask=input_mask, segment_ids=segment_ids, label_id=label_id, score=score, img_feat=img_feat)) 547 | return features 548 | 549 | 550 | def _truncate_seq_pair(tokens_a, tokens_b, max_length): 551 | """Truncates a sequence pair in place to the maximum length.""" 552 | 553 | # This is a simple heuristic which will always truncate the longer sequence 554 | # one token at a time. This makes more sense than truncating an equal percent 555 | # of tokens from each, since if one sequence is very short then each token 556 | # that's truncated likely contains more information than a longer sequence. 557 | while True: 558 | total_length = len(tokens_a) + len(tokens_b) 559 | if total_length <= max_length: 560 | break 561 | if len(tokens_a) > len(tokens_b): 562 | tokens_a.pop() 563 | else: 564 | tokens_b.pop() 565 | 566 | 567 | processors = { 568 | "vqa_text": VQATextProcessor, 569 | "vqa_text_a": VQATextAProcessor, 570 | "gqa": GQAProcessor, 571 | "nlvr": NLVRProcessor, 572 | "vcr_q_a": VCR_Q_A_Processor, 573 | "vcr_qa_r": VCR_QA_R_Processor, 574 | "vcr_qar": VCR_QAR_Processor, 575 | } 576 | 577 | output_modes = { 578 | "vqa_text": "classification", 579 | "vqa_text_a": "classification", 580 | "gqa": "classification", 581 | "nlvr": "classification", 582 | "vcr_q_a": "classification", 583 | "vcr_qa_r": "classification", 584 | "vcr_qar": "classification", 585 | } 586 | 587 | GLUE_TASKS_NUM_LABELS = { 588 | "vqa_text": 3129, 589 | "vqa_text_a": 3129, 590 | "gqa": 1853, 591 | "nlvr": 2, 592 | "vcr_q_a": 2, 593 | "vcr_qa_r": 2, 594 | "vcr_qar": 2, 595 | } -------------------------------------------------------------------------------- /oscar/utils/tsv_file.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Microsoft Corporation. Licensed under the MIT license. 2 | 3 | import logging 4 | import os 5 | import os.path as op 6 | 7 | 8 | def generate_lineidx_file(filein, idxout): 9 | idxout_tmp = idxout + '.tmp' 10 | with open(filein, 'r') as tsvin, open(idxout_tmp,'w') as tsvout: 11 | fsize = os.fstat(tsvin.fileno()).st_size 12 | fpos = 0 13 | while fpos!=fsize: 14 | tsvout.write(str(fpos)+"\n") 15 | tsvin.readline() 16 | fpos = tsvin.tell() 17 | os.rename(idxout_tmp, idxout) 18 | 19 | 20 | class TSVFile(object): 21 | def __init__(self, tsv_file, generate_lineidx=False): 22 | self.tsv_file = tsv_file 23 | self.lineidx = op.splitext(tsv_file)[0] + '.lineidx' 24 | self._fp = None 25 | self._lineidx = None 26 | # the process always keeps the process which opens the file. 27 | # If the pid is not equal to the currrent pid, we will re-open the file. 
28 | self.pid = None 29 | # generate lineidx if not exist 30 | if not op.isfile(self.lineidx) and generate_lineidx: 31 | generate_lineidx_file(self.tsv_file, self.lineidx) 32 | 33 | def __del__(self): 34 | if self._fp: 35 | self._fp.close() 36 | 37 | def __str__(self): 38 | return "TSVFile(tsv_file='{}')".format(self.tsv_file) 39 | 40 | def __repr__(self): 41 | return str(self) 42 | 43 | def num_rows(self): 44 | self._ensure_lineidx_loaded() 45 | return len(self._lineidx) 46 | 47 | def seek(self, idx): 48 | self._ensure_tsv_opened() 49 | self._ensure_lineidx_loaded() 50 | try: 51 | pos = self._lineidx[idx] 52 | except: 53 | logging.info('{}-{}'.format(self.tsv_file, idx)) 54 | raise 55 | self._fp.seek(pos) 56 | return [s.strip() for s in self._fp.readline().split('\t')] 57 | 58 | def seek_first_column(self, idx): 59 | self._ensure_tsv_opened() 60 | self._ensure_lineidx_loaded() 61 | pos = self._lineidx[idx] 62 | self._fp.seek(pos) 63 | return read_to_character(self._fp, '\t') 64 | 65 | def __getitem__(self, index): 66 | return self.seek(index) 67 | 68 | def __len__(self): 69 | return self.num_rows() 70 | 71 | def _ensure_lineidx_loaded(self): 72 | if self._lineidx is None: 73 | logging.info('loading lineidx: {}'.format(self.lineidx)) 74 | with open(self.lineidx, 'r') as fp: 75 | self._lineidx = [int(i.strip()) for i in fp.readlines()] 76 | 77 | def _ensure_tsv_opened(self): 78 | if self._fp is None: 79 | self._fp = open(self.tsv_file, 'r') 80 | self.pid = os.getpid() 81 | 82 | if self.pid != os.getpid(): 83 | logging.info('re-open {} because the process id changed'.format(self.tsv_file)) 84 | self._fp = open(self.tsv_file, 'r') 85 | self.pid = os.getpid() 86 | -------------------------------------------------------------------------------- /oscar/utils/tsv_file_ops.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2020 Microsoft Corporation. Licensed under the MIT license. 
2 | 3 | import logging 4 | import numpy as np 5 | import os 6 | import os.path as op 7 | import shutil 8 | from .misc import mkdir 9 | from .tsv_file import TSVFile 10 | 11 | 12 | def tsv_writer(values, tsv_file_name, sep='\t'): 13 | mkdir(os.path.dirname(tsv_file_name)) 14 | tsv_file_name_tmp = tsv_file_name + '.tmp' 15 | with open(tsv_file_name_tmp, 'wb') as fp: 16 | assert values is not None 17 | for value in values: 18 | assert value is not None 19 | v = sep.join(map(lambda v: v.decode() if type(v) == bytes else str(v), value)) + '\n' 20 | v = v.encode() 21 | fp.write(v) 22 | os.rename(tsv_file_name_tmp, tsv_file_name) 23 | 24 | 25 | def concat_files(ins, out): 26 | out_tmp = out + '.tmp' 27 | with open(out_tmp, 'wb') as fp_out: 28 | for i, f in enumerate(ins): 29 | with open(f, 'rb') as fp_in: 30 | shutil.copyfileobj(fp_in, fp_out, 1024*1024*10) 31 | os.rename(out_tmp, out) 32 | 33 | 34 | def concat_tsv_files(tsvs, out_tsv, generate_lineidx=False): 35 | concat_files(tsvs, out_tsv) 36 | if generate_lineidx: 37 | sizes = [os.stat(t).st_size for t in tsvs] 38 | sizes = np.cumsum(sizes) 39 | all_idx = [] 40 | for i, t in enumerate(tsvs): 41 | for idx in load_list_file(op.splitext(t)[0] + '.lineidx'): 42 | if i == 0: 43 | all_idx.append(idx) 44 | else: 45 | all_idx.append(str(int(idx) + sizes[i - 1])) 46 | with open(op.splitext(out_tsv)[0] + '.lineidx', 'w') as f: 47 | f.write('\n'.join(all_idx)) 48 | 49 | 50 | def load_list_file(fname): 51 | with open(fname, 'r') as fp: 52 | lines = fp.readlines() 53 | result = [line.strip() for line in lines] 54 | if len(result) > 0 and result[-1] == '': 55 | result = result[:-1] 56 | return result 57 | 58 | 59 | def reorder_tsv_keys(in_tsv_file, ordered_keys, out_tsv_file): 60 | tsv = TSVFile(in_tsv_file, generate_lineidx=True) 61 | keys = [tsv.seek(i)[0] for i in range(len(tsv))] 62 | key_to_idx = {key: i for i, key in enumerate(keys)} 63 | def gen_rows(): 64 | for key in ordered_keys: 65 | idx = key_to_idx[key] 66 | yield tsv.seek(idx) 67 | tsv_writer(gen_rows(), out_tsv_file) 68 | 69 | 70 | def delete_tsv_files(tsvs): 71 | for t in tsvs: 72 | if op.isfile(t): 73 | try_delete(t) 74 | line = op.splitext(t)[0] + '.lineidx' 75 | if op.isfile(line): 76 | try_delete(line) 77 | 78 | 79 | def try_once(func): 80 | def func_wrapper(*args, **kwargs): 81 | try: 82 | return func(*args, **kwargs) 83 | except Exception as e: 84 | logging.info('ignore error \n{}'.format(str(e))) 85 | return func_wrapper 86 | 87 | 88 | @try_once 89 | def try_delete(f): 90 | os.remove(f) 91 | 92 | 93 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | tqdm 2 | pyyaml 3 | matplotlib 4 | requests 5 | scikit-image 6 | anytree 7 | regex 8 | boto3 9 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/python 2 | 3 | from __future__ import print_function 4 | import os 5 | import sys 6 | import re 7 | import os.path as op 8 | from setuptools import find_packages, setup 9 | 10 | # change directory to this module path 11 | try: 12 | this_file = __file__ 13 | except NameError: 14 | this_file = sys.argv[0] 15 | this_file = os.path.abspath(this_file) 16 | if op.dirname(this_file): 17 | os.chdir(op.dirname(this_file)) 18 | script_dir = os.getcwd() 19 | 20 | def readme(fname): 21 | """Read text out of a file in the same directory 
as setup.py. 22 | """ 23 | return open(op.join(script_dir, fname)).read() 24 | 25 | 26 | def find_version(fname): 27 | version_file = readme(fname) 28 | version_match = re.search(r"^__version__ = ['\"]([^'\"]*)['\"]", 29 | version_file, re.M) 30 | if version_match: 31 | return version_match.group(1) 32 | raise RuntimeError("Unable to find version string.") 33 | 34 | 35 | setup( 36 | name="oscar", 37 | version=find_version("oscar/__init__.py"), 38 | url='https://github.com/xjli/Oscar', 39 | description="Oscar for vision and language tasks", 40 | long_description=readme('README.md'), 41 | packages=find_packages(), 42 | classifiers=[ 43 | 'Intended Audience :: Developers', 44 | "Programming Language :: Python", 45 | 'Topic :: Software Development', 46 | ] 47 | ) 48 | --------------------------------------------------------------------------------
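
The processors, output_modes, and GLUE_TASKS_NUM_LABELS dictionaries defined alongside the processor classes act as a small task registry keyed by task name. Below is a minimal sketch of how a fine-tuning script might consume it; the import path oscar.utils.task_utils is inferred from the repository layout, and the label-file and data-dir paths are placeholders, not files shipped with the code.

    from oscar.utils.task_utils import processors, output_modes, GLUE_TASKS_NUM_LABELS

    task_name = "vqa_text"
    processor = processors[task_name]()              # VQATextProcessor
    output_mode = output_modes[task_name]            # "classification" for every task registered here
    num_labels = GLUE_TASKS_NUM_LABELS[task_name]    # 3129 answer classes for VQA

    # get_labels() turns an answer->index pickle into the label list used by the
    # feature converter; get_train_examples() reads <data_dir>/train2014_qla.json.
    label_list = processor.get_labels("data/vqa/trainval_ans2label.pkl")   # placeholder path
    train_examples = processor.get_train_examples("data/vqa")              # placeholder path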
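
Each VQA example carries a list of answer labels (line['an']) together with per-answer scores (line['s']), and convert_examples_to_features_vqa keeps them as the parallel lists label_id and score. A training loop would typically scatter these into a dense target vector before a BCE-style loss; the helper below is a hypothetical illustration of that step, not code from the repository.

    import torch

    def to_soft_target(label_id, score, num_labels):
        """Scatter per-example answer indices and soft scores into a dense vector."""
        target = torch.zeros(num_labels)
        target[torch.tensor(label_id, dtype=torch.long)] = torch.tensor(score, dtype=torch.float)
        return target

    # Two acceptable answers with VQA accuracy-style soft scores 1.0 and 0.3.
    target = to_soft_target([12, 345], [1.0, 0.3], num_labels=3129)
    print(target[12], target[345])   # tensor(1.) and tensor(0.3000)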
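
convert_examples_to_features_vqa builds one attention mask covering both modalities: the word pieces are padded to max_seq_length, then one mask entry is appended per image region, with zeros (and zero-padded feature rows) when fewer than max_img_seq_length regions are available. A toy illustration of the resulting layout, with made-up lengths:

    max_seq_length, max_img_seq_length = 8, 4
    tokens = ['[CLS]', 'is', 'this', 'a', 'dog', '?', '[SEP]']
    num_regions = 3                            # region features actually present for this image

    input_mask = [1] * len(tokens) + [0] * (max_seq_length - len(tokens))         # text + text padding
    input_mask += [1] * num_regions + [0] * (max_img_seq_length - num_regions)    # regions + region padding

    print(len(input_mask))   # 12, i.e. max_seq_length + max_img_seq_length
    print(input_mask)        # [1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0]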
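
_truncate_seq_pair always trims whichever of the two token lists is currently longer, one token at a time, so a short question survives intact while a long answer or object-tag sequence is shortened. A small example (the import path is assumed from the repository layout):

    from oscar.utils.task_utils import _truncate_seq_pair

    tokens_a = ['what', 'color', 'is', 'the', 'bus']
    tokens_b = ['bus', 'person', 'tree', 'road', 'sky', 'building', 'window']
    _truncate_seq_pair(tokens_a, tokens_b, max_length=9)
    print(tokens_a)   # ['what', 'color', 'is', 'the', 'bus']   (left untouched)
    print(tokens_b)   # ['bus', 'person', 'tree', 'road']       (popped from the end until 5 + 4 <= 9)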
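
TSVFile gives random access into a large TSV by pairing it with a .lineidx sidecar that stores the byte offset of every row; generate_lineidx_file writes that sidecar, and seek() jumps straight to the requested row. A small end-to-end sketch with throwaway file names:

    from oscar.utils.tsv_file import TSVFile, generate_lineidx_file

    with open('toy.tsv', 'w') as f:
        f.write('img_001\tlabel_a\n')
        f.write('img_002\tlabel_b\n')
    generate_lineidx_file('toy.tsv', 'toy.lineidx')   # one byte offset per row

    tsv = TSVFile('toy.tsv')   # TSVFile('toy.tsv', generate_lineidx=True) would build the sidecar itself
    print(len(tsv))            # 2
    print(tsv[1])              # ['img_002', 'label_b'], read by seeking to the stored offset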
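
tsv_writer streams rows into a .tmp file and renames it only once the generator is exhausted, so an interrupted job never leaves a truncated TSV behind; concat_tsv_files can then stitch shards together and rebuild a combined .lineidx by shifting each shard's offsets by the cumulative byte size of the shards before it. A usage sketch with placeholder paths; it assumes mkdir from oscar.utils.misc tolerates an existing output directory.

    from oscar.utils.tsv_file import TSVFile
    from oscar.utils.tsv_file_ops import tsv_writer, concat_tsv_files

    tsv_writer([('img_001', 'label_a'), ('img_002', 'label_b')], 'out/part_a.tsv')
    tsv_writer([('img_003', 'label_c')], 'out/part_b.tsv')

    # concat_tsv_files(..., generate_lineidx=True) reads each shard's .lineidx,
    # so build the sidecars first; constructing TSVFile with generate_lineidx=True does that.
    TSVFile('out/part_a.tsv', generate_lineidx=True)
    TSVFile('out/part_b.tsv', generate_lineidx=True)

    concat_tsv_files(['out/part_a.tsv', 'out/part_b.tsv'], 'out/all.tsv', generate_lineidx=True)
    print(TSVFile('out/all.tsv')[2])   # ['img_003', 'label_c'], its offset shifted into the merged file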