├── .gitignore
├── LICENSE
├── README.md
├── baselines
    ├── README.md
    ├── clip
    │   ├── contextual.py
    │   ├── contra.py
    │   ├── dataset.py
    │   ├── evaluate_clip.py
    │   ├── evaluate_contextual.py
    │   ├── extras.py
    │   ├── nocontra.py
    │   ├── vilbert-and-bert-config.json
    │   ├── volta_src
    │   │   ├── config.py
    │   │   ├── embeddings.py
    │   │   ├── encoders.py
    │   │   ├── losses.py
    │   │   └── utils.py
    │   └── zero_shot.py
    └── crossencoders
    │   ├── analyze_results.py
    │   ├── contextual.py
    │   ├── contra.py
    │   ├── ctrl_uniter_base.json
    │   ├── nocontra.py
    │   ├── task_config
    │       ├── contextual.yml
    │       ├── contra.yml
    │       ├── nocontra.yml
    │       └── zero_shot.yml
    │   ├── vilbert_base.json
    │   ├── volta
    │       ├── OLD_encoders.py
    │       ├── __init__.py
    │       ├── config.py
    │       ├── datasets
    │       │   ├── __init__.py
    │       │   ├── _image_features_reader.py
    │       │   ├── concept_cap_dataset.py
    │       │   ├── flickr_grounding_dataset.py
    │       │   ├── gqa_dataset.py
    │       │   ├── guesswhat_dataset.py
    │       │   ├── guesswhat_pointing_dataset.py
    │       │   ├── nlvr2_dataset.py
    │       │   ├── refer_dense_caption.py
    │       │   ├── refer_expression_dataset.py
    │       │   ├── retrieval_dataset.py
    │       │   ├── vcr_dataset.py
    │       │   ├── visdial_dataset.py
    │       │   ├── vismadlibs_dataset.py
    │       │   ├── visual7w_pointing_dataset.py
    │       │   ├── visual_entailment_dataset.py
    │       │   ├── visual_genome_dataset.py
    │       │   └── vqa_dataset.py
    │       ├── embeddings.py
    │       ├── encoders.py
    │       ├── extras.py
    │       ├── losses.py
    │       ├── optimization.py
    │       ├── task_utils.py
    │       ├── train_utils.py
    │       └── utils.py
    │   └── zero_shot.py
├── data
    ├── analysis
    │   ├── annotator_agreement.py
    │   ├── annotator_bias.py
    │   ├── annotator_split_valid.json
    │   ├── calc_accuracies.py
    │   ├── compare_dataset_statistics.py
    │   ├── convert.py
    │   ├── convert_zeroshot.py
    │   ├── counter2key_test.json
    │   ├── counter2key_train.json
    │   ├── counter2key_val.json
    │   ├── img_similarity.py
    │   ├── manual_annotation_valid.yaml
    │   └── shortid2id.json
    ├── test_data_unlabeled.json
    ├── train_data.json
    ├── train_simple.json
    ├── valid_data.json
    ├── valid_simple.json
    └── vilbert_data_format
    │   ├── test.json
    │   ├── test_target.pkl
    │   ├── train.json
    │   ├── train_target.pkl
    │   ├── trainval_ans2label.pkl
    │   ├── trainval_label2ans.pkl
    │   ├── val.json
    │   └── val_target.pkl
├── example.png
└── install.sh


/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | pip-wheel-metadata/
 24 | share/python-wheels/
 25 | *.egg-info/
 26 | .installed.cfg
 27 | *.egg
 28 | MANIFEST
 29 | 
 30 | # PyInstaller
 31 | #  Usually these files are written by a python script from a template
 32 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 33 | *.manifest
 34 | *.spec
 35 | 
 36 | # Installer logs
 37 | pip-log.txt
 38 | pip-delete-this-directory.txt
 39 | 
 40 | # Unit test / coverage reports
 41 | htmlcov/
 42 | .tox/
 43 | .nox/
 44 | .coverage
 45 | .coverage.*
 46 | .cache
 47 | nosetests.xml
 48 | coverage.xml
 49 | *.cover
 50 | *.py,cover
 51 | .hypothesis/
 52 | .pytest_cache/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | target/
 76 | 
 77 | # Jupyter Notebook
 78 | .ipynb_checkpoints
 79 | 
 80 | # IPython
 81 | profile_default/
 82 | ipython_config.py
 83 | 
 84 | # pyenv
 85 | .python-version
 86 | 
 87 | # pipenv
 88 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 89 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 90 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 91 | #   install all needed dependencies.
 92 | #Pipfile.lock
 93 | 
 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
 95 | __pypackages__/
 96 | 
 97 | # Celery stuff
 98 | celerybeat-schedule
 99 | celerybeat.pid
100 | 
101 | # SageMath parsed files
102 | *.sage.py
103 | 
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 | 
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 | 
117 | # Rope project settings
118 | .ropeproject
119 | 
120 | # mkdocs documentation
121 | /site
122 | 
123 | # mypy
124 | .mypy_cache/
125 | .dmypy.json
126 | dmypy.json
127 | 
128 | # Pyre type checker
129 | .pyre/
130 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2020 Emanuele Bugliarello
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # ImageCoDe
  2 | 
  3 | [![arxiv](https://img.shields.io/badge/arXiv-2203.15867-b31b1b.svg)](https://arxiv.org/abs/2203.15867)
  4 | 
  5 | This repository contains code and data for ImageCoDe: [Image Retrieval from Contextual Descriptions](https://arxiv.org/abs/2203.15867). 
  6 | 
  7 | ![Example](https://github.com/mcgill-nlp/ImageCoDe/blob/main/example.png?raw=true)
  8 | 
  9 | ## Updates
 10 | 
 11 | [29/2/2024] **We release a simpler version of ImageCoDe to facilitate progress on the task, called *ImageCoDe-simple***. Instead of 10 images per example, it is now only 2 images, to allow current state-of-the-art models to easily process and reason across them at once. You can find more details under the Data section below. TODO: reformat the secret test set into the simplified format.
 12 | 
 13 | ## Data
 14 | All collected descriptions for the training and validation set are under [`data/train_data.json`](data/train_data.json) and [`data/valid_data.json`](data/valid_data.json), as well as [`data/train_simple.json`](data/train_simple.json) and [`data/valid_simple.json`](data/valid_simple.json) for ImageCoDe-simple.
 15 | 
 16 | Image sets can be downloaded on [Zenodo](https://zenodo.org/record/6518944#.YnLboHWZPUQ) or [GoogleDrive](https://drive.google.com/file/d/1OIKNyU0F9lThbaZZ3Jvm7AlF94n1MzDk/view?usp=sharing) and should be unzipped in `data/`.
 17 | 
 18 | You can download from the commandline via:
 19 | 
 20 | ```
 21 | wget https://zenodo.org/record/6518944/files/image-sets.zip
 22 | ```
 23 | 
 24 | Alternatively, you can use [HuggingFace Datasets](https://huggingface.co/datasets/BennoKrojer/ImageCoDe) for working with ImageCoDe.
 25 | 
 26 | For ViLBERT experiments, you need to download a pretrained ViLBERT checkpoint from volta [here](https://github.com/e-bug/volta/blob/main/MODELS.md), simply by clicking on ViLBERT in the table. Save the downloaded file as `baselines/crossencoders/vilbert-pretrained.bin`.
 27 | Since ViLBERT uses image features from Faster R-CNN, you also have to download these for all ImageCoDe images here: [Google Drive link](https://drive.google.com/drive/folders/1Gm22SlCM1V63oZIVS0riqWlySL_g5DJc?usp=sharing). Save the file as `data/rcnn-features36-36.lmdb`.
 28 | The same procedure applies for UNITER.
 29 | 
 30 | The format for [`data/train_simple.json`](data/train_simple.json) looks like this:
 31 | 
 32 | ```json
 33 | [
 34 |     {
 35 |         "directory": "open-images-1815_f91d6f546e63f20d",
 36 |         "pos_idx": 5,
 37 |         "neg_idx": 3,
 38 |         "caption": "A deer head is mounted horizontally next to a painting"
 39 |     },
 40 | {"..."}
 41 | ]
 42 | ```
 43 | And the images under `data/` have the following structure. Each folder contains 10 images. If the images are video frames, the number X in imgX.jpg indicates the frame number:
 44 | ```
 45 |   .
 46 |   ├── MSR-VTT-videoTrainValVideo_video2044-shot1_0
 47 |       │   ├── img0.jpg
 48 |       │   ├── img7.jpg
 49 |       │   ├── ...
 50 |   ├── video-storytelling-videochristmas_56Nm66j-i5Q-shot14_2
 51 |       │   ├── ...
 52 | ```
 53 | 
 54 | Checkpoints for our CLIP models in the paper can be downloaded [here](https://drive.google.com/drive/folders/1PTvUgtyKAqzUPQLRn8792QQx7G-5zs9V?usp=sharing). For example `CONTRA-...` is used to initialize the CLIP vision and text encoder for our strongest baseline model `TEMP-CONTEXTUAL-...`.
 55 | ### Leaderboard
 56 | 
 57 | Based on this you can train your model and test on the unlabeled test set:
 58 | ```json
 59 | {
 60 |   "MSR-VTT-videoTestVideo_video7763-shot2_1": [
 61 |     "The team name on shirt is visible without a number, but all letters can be seen for team name.",
 62 |     "the player can be seen with him on the left close to the logo on the pitch on the right and can be clearly seen"
 63 |   ],
 64 |   "...":
 65 |   ["..."]
 66 | }
 67 | ```
 68 | 
 69 | In order to appear on the leaderboard, please format your results in the following format:
 70 | ```json
 71 | {
 72 |   "MSR-VTT-videoTestVideo_video7763-shot2_1": [
 73 |     1,
 74 |     2
 75 |   ],
 76 |   "...":
 77 |   ["..."]
 78 | }
 79 | ```
 80 | Here, the example values "1" and "2" are image indices, which range from 0 to 9.
 81 | You can submit to the leaderboard by sending your test set file (or a download link) to benno.krojer@mila.quebec and we will update the leaderboard quickly (max. 1-2 days).
 82 | The leaderboard is maintained on the [project website](https://mcgill-nlp.github.io/imagecode/) and might change its submission procedure at some point.
 83 | 
 84 | ## Installations
 85 | 
 86 | Run [`install.sh`](install.sh) for running CLIP experiments.
 87 | For VilBERT follow the [instructions for volta](https://github.com/e-bug/volta#repository-setup). 
 88 | 
 89 | ## Code
 90 | 
 91 | Code for CLIP is under [baselines/clip](https://github.com/BennoKrojer/ImageCoDe/tree/main/baselines/clip) and code for ViLBERT/UNITER is under [baselines/crossencoders](https://github.com/BennoKrojer/ImageCoDe/tree/main/baselines/crossencoders).
 92 | 
 93 | For detailed commands to run each model variant shown in the paper, have a look at the [README in baselines](https://github.com/BennoKrojer/ImageCoDe/tree/main/baselines).
 94 | 
 95 | For example to train the best performing model CLIP+TemporalEmbeddings, run:
 96 | 
 97 | ```
 98 | python3 contextual.py --lr 2e-6 --lr_head 1e-4 -b 36 -m ViT-B/16 --fusion mult -a gelu --logit_scale 1000 --finetuned_checkpoint_path checkpoints/CONTRA_clip_best__36_4e-06_30_1395526.pt --add_input --frozen_clip --positional
 99 | ```
100 | 
101 | ## Data Analysis
102 | 
103 | Our manual annotation of various phenomena (negation, nuances, ...) in our validation set can be found under `data/analysis/manual_annotation_valid.yaml`.
104 | 
105 | ## License
106 | 
107 | This work is licensed under the MIT license. See [`LICENSE`](LICENSE) for details. 
108 | Third-party software and data sets are subject to their respective licenses. <br>
109 | If you want to cite our paper, please use:
110 | ```
111 | @inproceedings{krojer_contextual_2022,
112 |   address = {Online},
113 |   title = {Image Retrieval from Contextual Descriptions},
114 |   booktitle = {Proceedings of the 60th {Annual} {Meeting} of the {Association} for {Computational} {Linguistics}},
115 |   publisher = {Association for Computational Linguistics},
116 |   author = {Krojer, Benno and Adlakha, Vaibhav and Vineet, Vibhav and Goyal, Yash and Ponti, Edoardo and Reddy, Siva},
117 |   month = may,
118 |   year = {2022},
119 | }
120 | ```
121 | 
122 | ## Acknowledgement
123 | Our data (specifically the image sets) are built upon 3 video datasets and Open Images:
124 | - [MSR-VTT](https://www.microsoft.com/en-us/research/publication/msr-vtt-a-large-video-description-dataset-for-bridging-video-and-language/)
125 | - [Video Storytelling](https://zenodo.org/record/2383739#.Yizc2Iz0nUR)
126 | - [YouCook](https://web.eecs.umich.edu/~jjcorso/r/youcook/)
127 | - [Open Images](https://storage.googleapis.com/openimages/web/index.html)
128 | 
129 | We also use the [volta repository](https://github.com/e-bug/volta) for the ViLBERT and UNITER baseline variants.
130 | 
131 | For questions or feedback, don't hesitate to contact the author: benno.krojer@mila.quebec
132 | 
133 | 


--------------------------------------------------------------------------------
/baselines/README.md:
--------------------------------------------------------------------------------
 1 | Below are all commands for baseline experiments shown in the paper.
 2 | 
 3 | ## CLIP
 4 | **CLIP zero-shot:**
 5 | 
 6 | `python3 zero_shot.py`
 7 | 
 8 | **CLIP (non-contrastive):**
 9 | 
10 | `python3 nocontra.py`
11 | 
12 | **CLIP +ContextBatch (contrastive):**
13 | 
14 | `python3 contra.py`
15 | 
16 | For evaluation of nocontra.py and contra.py, run:
17 | 
18 | `python3 evaluate_clip.py --checkpoint checkpoints/<MY_CHECKPOINT_FILE.pt> --test_descr_path ../../data/test_data.json`
19 | 
20 | **CLIP +ContextModule:**
21 | 
22 | `python3 contextual.py --lr 2e-6 --lr_head 1e-4 -b 36 -m ViT-B/16 --fusion mult -a gelu --logit_scale 1000 --finetuned_checkpoint_path checkpoints/CONTRA_clip_best__36_4e-06_30_1395526.pt --add_input --frozen_clip`
23 | 
24 | **CLIP +TemporalEmbedding:**
25 | 
26 | `python3 contextual.py --lr 2e-6 --lr_head 1e-4 -b 36 -m ViT-B/16 --fusion mult -a gelu --logit_scale 1000 --finetuned_checkpoint_path checkpoints/CONTRA_clip_best__36_4e-06_30_1395526.pt --add_input --frozen_clip --positional`
27 | 
28 | For evaluation of contextual.py, run:
29 | 
30 | `python3 evaluate_contextual.py --checkpoint checkpoints/<MY_CHECKPOINT_FILE.pt> --test_descr_path ../../data/test_data.json`
31 | `python3 evaluate_contextual.py --checkpoint checkpoints/<MY_CHECKPOINT_FILE.pt> --test_descr_path ../../data/test_data.json --positional`
32 | 
33 | ## ViLBERT
34 | 
35 | **ViLBERT zero-shot**
36 | 
37 | `python3 zero_shot.py --config_file vilbert_base.json --tasks_config_file task_config/vilbert_zero_shot.yml --task 3 --output_dir results/vilbert-zero-shot --from_pretrained vilbert-pretrained.bin`
38 | 
39 | **ViLBERT (non-contrastive):**
40 | 
41 | `python nocontra.py --config_file vilbert_base.json --tasks_config_file task_config/vilbert_nocontra.yml --task 3 --adam_epsilon 1e-6 --adam_betas 0.9 0.999 --adam_correct_bias --weight_decay 0.0001 --warmup_proportion 0.1 --clip_grad_norm 1.0 --output_dir checkpoints/vilbert --logdir logs/vilbert --from_pretrained vilbert-pretrained.bin --num_train_epochs 10 --lr 0.000005 --save_name FINAL-NOCONTRA`
42 | 
43 | **ViLBERT +ContextBatch (contrastive):**
44 | 
45 | `python3 contra.py --config_file vilbert_base.json --tasks_config_file task_config/vilbert_contra.yml --task 3 --adam_epsilon 1e-6 --adam_betas 0.9 0.999 --adam_correct_bias --weight_decay 0.0001 --warmup_proportion 0.1 --clip_grad_norm 1.0 --output_dir checkpoints/contravilbert --logdir logs/contravilbert --from_pretrained vilbert-pretrained.bin --num_train_epochs 25 --lr 0.00004 --save_name FINAL-CONTRA --temperature 0.15 --ce_loss`
46 | 
47 | **ViLBERT +ContextModule:**
48 | 
49 | `python3 contextual.py --config_file vilbert_base.json --tasks_config_file task_config/vilbert_contextual.yml --task 3 --adam_epsilon 1e-6 --adam_betas 0.9 0.999 --adam_correct_bias --weight_decay 0.0001 --warmup_proportion 0.1 --clip_grad_norm 1.0 --output_dir checkpoints/contravilbert --logdir logs/contravilbert --from_pretrained vilbert-pretrained.bin --num_train_epochs 25 --lr 0.00002 --save_name FINAL-CONTEXT --add_inputs --transformer_layers 4 --all_pos`
50 | 
51 | **ViLBERT +TemporalEmbedding:**
52 | 
53 | `python3 contextual.py --config_file vilbert_base.json --tasks_config_file task_config/vilbert_contextual.yml --task 3 --adam_epsilon 1e-6 --adam_betas 0.9 0.999 --adam_correct_bias --weight_decay 0.0001 --warmup_proportion 0.1 --clip_grad_norm 1.0 --output_dir checkpoints/contravilbert --logdir logs/contravilbert --from_pretrained vilbert-pretrained.bin --num_train_epochs 25 --lr 0.00002 --save_name FINAL-TEMP-CONTEXT --add_inputs --transformer_layers 4 --positional --all_pos`
54 | 
55 | ## UNITER
56 | 
57 | **UNITER zero-shot**
58 | 
59 | `python3 zero_shot.py --config_file ctrl_uniter_base.json --tasks_config_file task_config/vilbert_zero_shot.yml --task 3 --output_dir results/vilbert-zero-shot --from_pretrained uniter-pretrained.bin`
60 | 
61 | **UNITER (non-contrastive):**
62 | 
63 | `python nocontra.py --config_file ctrl_uniter_base.json --tasks_config_file task_config/vilbert_nocontra.yml --task 3 --adam_epsilon 1e-6 --adam_betas 0.9 0.999 --adam_correct_bias --weight_decay 0.0001 --warmup_proportion 0.1 --clip_grad_norm 1.0 --output_dir checkpoints/vilbert --logdir logs/vilbert --from_pretrained uniter-pretrained.bin --num_train_epochs 10 --lr 0.000008 --save_name FINAL-NOCONTRA`
64 | 
65 | **UNITER +ContextBatch (contrastive):**
66 | 
67 | `python3 contra.py --config_file ctrl_uniter_base.json --tasks_config_file task_config/vilbert_contra.yml --task 3 --adam_epsilon 1e-6 --adam_betas 0.9 0.999 --adam_correct_bias --weight_decay 0.0001 --warmup_proportion 0.1 --clip_grad_norm 1.0 --output_dir checkpoints/contravilbert --logdir logs/contravilbert --from_pretrained uniter-pretrained.bin --num_train_epochs 25 --lr 0.000007 --save_name FINAL-CONTRA --temperature 0.15 --ce_loss`
68 | 
69 | **UNITER +ContextModule:**
70 | 
71 | `python3 contextual.py --config_file ctrl_uniter_base.json --tasks_config_file task_config/vilbert_contextual.yml --task 3 --adam_epsilon 1e-6 --adam_betas 0.9 0.999 --adam_correct_bias --weight_decay 0.0001 --warmup_proportion 0.1 --clip_grad_norm 1.0 --output_dir checkpoints/contravilbert --logdir logs/contravilbert --from_pretrained uniter-pretrained.bin --num_train_epochs 25 --lr 0.000006 --save_name FINAL-CONTEXT --add_inputs --transformer_layers 5 --all_pos`
72 | 
73 | **UNITER +TemporalEmbedding:**
74 | 
75 | `python3 contextual.py --config_file ctrl_uniter_base.json --tasks_config_file task_config/vilbert_contextual.yml --task 3 --adam_epsilon 1e-6 --adam_betas 0.9 0.999 --adam_correct_bias --weight_decay 0.0001 --warmup_proportion 0.1 --clip_grad_norm 1.0 --output_dir checkpoints/contravilbert --logdir logs/contravilbert --from_pretrained uniter-pretrained.bin --num_train_epochs 25 --lr 0.000006 --save_name FINAL-TEMP-CONTEXT --add_inputs --transformer_layers 5 --positional --all_pos`
76 | 
77 | Evaluation for ViLBERT and UNITER already happens during training. This is not ideal, but it ensured that there was no slight accuracy difference between the model during training and the saved model.
78 | 


--------------------------------------------------------------------------------
/baselines/clip/contra.py:
--------------------------------------------------------------------------------
  1 | # inspired from: https://github.com/openai/CLIP/issues/83
  2 | # https://github.com/openai/CLIP/issues/83
  3 | from importlib import import_module
  4 | import json
  5 | import os
  6 | import random
  7 | import wandb
  8 | import clip
  9 | from clip import model
 10 | import torch
 11 | from torch import autograd
 12 | from torch.utils.data import DataLoader
 13 | from dataset import ImageCoDeDataset
 14 | import tqdm
 15 | from torch import nn, optim
 16 | from PIL import Image
 17 | from pathlib import Path
 18 | from collections import defaultdict
 19 | import argparse
 20 | from functools import partial
 21 | random.seed(10)
 22 | torch.manual_seed(10)
 23 | wandb.init(project='clip-ViT-L14-336px-T5', settings=wandb.Settings(start_method='fork'))
 24 | 
 25 | def find_best_matches(text_features, photo_features):
 26 |     similarities = (photo_features @ text_features.T).squeeze(1)
 27 |     best_photo_idx = (-similarities).argsort()
 28 |     similarities = -similarities
 29 |     similarities.sort()
 30 |     return best_photo_idx, similarities
 31 | 
 32 | 
 33 | def convert_models_to_fp32(model):
 34 |     for name, p in model.named_parameters():
 35 |         p.data = p.data.float()
 36 |         p.grad.data = p.grad.data.float()
 37 | 
 38 | config = wandb.config
 39 | parser = argparse.ArgumentParser()
 40 | parser.add_argument('--batchsize', type=int, default=36)
 41 | parser.add_argument('--grad_accumulation', type=int, default=1)
 42 | parser.add_argument('--lr', type=float, default=4e-6)
 43 | parser.add_argument('--max_lr', type=float, default=4e-6)
 44 | parser.add_argument('--vit', type=str)
 45 | parser.add_argument('--decay', default=0.01, type=float)
 46 | parser.add_argument('--epochs', type=int, default=30)
 47 | parser.add_argument('--data_dir', type=str, default='../../data/')
 48 | parser.add_argument('--imgs_path', type=str, default='/network/scratch/b/benno.krojer/dataset/games')
 49 | parser.add_argument('--save_model', action='store_true')
 50 | parser.add_argument('--cycle_scheduler', action='store_true')
 51 | parser.add_argument("--job_id")
 52 | 
 53 | args = parser.parse_args()
 54 | wandb.config.update(args)
 55 | 
 56 | DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 57 | print(f'DEVICE USED: {DEVICE}')
 58 | model, preprocess = clip.load(args.vit, device=DEVICE, jit=False)
 59 | wandb.watch(model)
 60 | if DEVICE == "cpu":
 61 |     model.float()
 62 | else:
 63 |     clip.model.convert_weights(model)  # Actually this line is unnecessary since clip by default already on float16
 64 | 
 65 | 
 66 | dataset_train = ImageCoDeDataset(
 67 |     data_dir=args.data_dir,
 68 |     split='train',
 69 |     image_transform=preprocess,
 70 |     text_transform=partial(clip.tokenize, truncate=True)
 71 | )
 72 | dataloader_train = DataLoader(
 73 |     dataset=dataset_train,
 74 |     batch_size=args.batchsize,
 75 |     shuffle=True,
 76 |     num_workers=8,
 77 |     pin_memory=True
 78 | )
 79 | dataset_valid = ImageCoDeDataset(
 80 |     data_dir=args.data_dir,
 81 |     split='valid',
 82 |     image_transform=preprocess,
 83 |     text_transform=partial(clip.tokenize, truncate=True)
 84 | )
 85 | dataloader_valid = DataLoader(
 86 |     dataset=dataset_valid,
 87 |     batch_size=1,
 88 |     shuffle=False,
 89 |     num_workers=8,
 90 |     pin_memory=True
 91 | )
 92 | 
 93 | loss_img = nn.CrossEntropyLoss()
 94 | loss_txt = nn.CrossEntropyLoss()
 95 | optimizer = optim.AdamW(model.parameters(), lr=config.lr, betas=(0.9, 0.98), eps=1e-6, weight_decay=args.decay)
 96 | if args.cycle_scheduler:
 97 |     scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=args.max_lr, steps_per_epoch=len(dataloader_train)//(args.batchsize*args.grad_accumulation), epochs = args.epochs)
 98 | 
 99 | best_val = 0
100 | 
101 | for i in range(args.epochs):
102 |     save_model = False
103 |     # EVALUATE
104 |     if i > 0:
105 |         correct = 0
106 |         total = 0
107 |         for image, text, target, is_video in tqdm.tqdm(dataloader_valid):
108 |             image = image.to(DEVICE)
109 |             text = text.to(DEVICE)
110 |             target = target.to(DEVICE)
111 |             is_video = is_video.to(DEVICE)
112 | 
113 |             with torch.no_grad():
114 |                 image_features = model.encode_image(image.flatten(0, 1)).view(*image.shape[:2], -1)
115 |                 text_features = model.encode_text(text.squeeze(1))
116 |             
117 |             image_features /= image_features.norm(dim=-1, keepdim=True)
118 |             text_features /= text_features.norm(dim=-1, keepdim=True)
119 | 
120 |             similarity = (image_features @ text_features.unsqueeze(2)).squeeze()
121 |             prediction = similarity.argmax()
122 | 
123 |             total += 1
124 |             correct += (prediction == target)
125 |         acc = correct / total
126 |         acc = acc.item()
127 |         print(f'accuracy: {acc:.4f}')
128 |         wandb.log({'val_acc': acc})
129 |         if acc > best_val:
130 |             best_val = acc
131 |             save_model = True
132 |             string = ''
133 |             for key, val in list(vars(args).items()):
134 |                 if 'path' not in key:
135 |                     string += f'_{val}'
136 |             if args.save_model:
137 |                 torch.save({
138 |                     'epoch': i,
139 |                     'model_state_dict': model.state_dict(),
140 |                     'optimizer_state_dict': optimizer.state_dict(),
141 |                 }, f"checkpoints/CONTRA_clip_best_{string.replace('/', '')}.pt")
142 | 
143 |     print(f'EPOCH: {i}')
144 |     for step, (images, text, target, is_video) in tqdm.tqdm(enumerate(dataloader_train)):
145 |         images = images.to(DEVICE)
146 |         text = text.to(DEVICE)
147 |         target = target.to(DEVICE)
148 |         is_video = is_video.to(DEVICE)
149 | 
150 |         image_features_ = model.encode_image(images.flatten(0, 1)).reshape(*images.shape[:2], -1)
151 |         text_features_ = model.encode_text(text.squeeze(1))
152 |         image_features = image_features_ / image_features_.norm(dim=-1, keepdim=True)
153 |         text_features = text_features_ / text_features_.norm(dim=-1, keepdim=True)
154 | 
155 |         similarity = (image_features @ text_features.unsqueeze(2)).squeeze() * model.logit_scale.exp()
156 |         ground_truth = torch.tensor(target).long()  # the index of the correct one
157 |         loss = loss_txt(similarity, ground_truth)
158 |         loss.backward()
159 |         if step % args.grad_accumulation == 0:
160 |             print(loss.item())
161 |             wandb.log({'loss': loss})
162 |             if DEVICE == "cpu":
163 |                 optimizer.step()
164 |             else:
165 |                 convert_models_to_fp32(model)
166 |                 optimizer.step()
167 |                 clip.model.convert_weights(model)
168 |             optimizer.zero_grad()
169 |             if args.cycle_scheduler:
170 |                 scheduler.step()
171 | 


--------------------------------------------------------------------------------
/baselines/clip/dataset.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | from pathlib import Path
 3 | from functools import partial
 4 | 
 5 | import torch
 6 | from torch.utils.data import Dataset
 7 | from torchvision.transforms import Pad, Resize, ToTensor, Compose
 8 | # from transformers import BertTokenizerFast
 9 | from PIL import Image
10 | 
11 | 
def default_image_transform(img, img_size=224):
    """Turn a PIL image into an ``img_size`` x ``img_size`` tensor.

    The image is converted to RGB, padded along its shorter dimension by
    half the width/height difference on each side, resized and finally
    converted with ``ToTensor``.
    """
    rgb = img.convert('RGB')
    width, height = rgb.size
    if width > height:
        # Landscape: pad top and bottom.
        padding = [0, (width - height) // 2]
    else:
        # Portrait (or square, where the padding is zero): pad left and right.
        padding = [(height - width) // 2, 0]
    pipeline = Compose([
        Pad(padding),
        Resize([img_size, img_size]),
        ToTensor(),
    ])
    return pipeline(rgb)
21 | 
22 | 
def default_text_transform(text, tokenizer, max_length=77):
    """Tokenize ``text`` with ``tokenizer`` and return whatever it produces.

    The tokenizer is called with padding to ``max_length``, truncation
    enabled and ``return_tensors='np'``.
    """
    tokenizer_options = {
        'padding': 'max_length',
        'max_length': max_length,
        'truncation': True,
        'return_tensors': 'np',
    }
    return tokenizer(text, **tokenizer_options)
32 | 
33 | 
class ImageCoDeDataset(Dataset):
    """ImageCoDe examples: one image set plus one description per item.

    Each item is a tuple ``(images, text, img_idx, is_video)``:

    * ``images``   -- the transformed images of the set, stacked along axis 0
      in ascending order of the number in ``img<N>.jpg``;
    * ``text``     -- the description after ``text_transform`` (the raw
      string when no transform was given);
    * ``img_idx``  -- the integer key the description is stored under in the
      JSON file (the training script uses it as the classification target);
    * ``is_video`` -- tensor 0 for sets whose name contains ``open-images``,
      1 otherwise.

    Args:
        data_dir: directory containing ``<split>_data.json``.
        split: ``'train'`` or ``'valid'``.
        image_transform: callable applied to every opened PIL image;
            defaults to ``default_image_transform``.
        text_transform: callable applied to the description, or ``None`` to
            return the description unchanged.
        video_only: drop all sets whose name contains ``open-images``.
        imgs_path: root directory holding one sub-directory per image set;
            defaults to ``DEFAULT_IMGS_PATH``.
    """

    # Image location that used to be hard-coded in __init__; kept as the
    # default so existing callers behave exactly as before.
    DEFAULT_IMGS_PATH = '/network/scratch/b/benno.krojer/dataset/games'

    def __init__(self, data_dir, split, image_transform=None, text_transform=None, video_only=False,
                 imgs_path=None):
        super().__init__()
        assert split in ['train', 'valid']

        if image_transform is not None:
            self.image_transform = image_transform
        else:
            self.image_transform = default_image_transform

        self.text_transform = text_transform

        if imgs_path is None:
            imgs_path = self.DEFAULT_IMGS_PATH
        self.data = self.load_data(Path(data_dir), imgs_path, split, video_only)

    @staticmethod
    def load_data(data_dir, img_path, split, video_only=False):
        """Read ``<split>_data.json`` and list one example per description.

        Returns:
            A list of ``(img_dir, img_files, img_idx, text)`` tuples, where
            ``img_files`` are the ``*.jpg`` paths of the set sorted by the
            number that follows the three-character ``img`` prefix.
        """
        with open(data_dir / f'{split}_data.json', encoding='utf-8') as f:
            json_file = json.load(f)

        dataset = []
        for img_dir, data in json_file.items():
            static = 'open-images' in img_dir
            if video_only and static:
                # Filtered out anyway, so skip globbing its images.
                continue
            # Path.name instead of splitting on '/' keeps the key independent
            # of the platform's path separator.
            img_files = sorted(
                Path(img_path, img_dir).glob('*.jpg'),
                key=lambda p: int(p.name.split('.')[0][3:]),
            )
            dataset.extend(
                (img_dir, img_files, int(img_idx), text)
                for img_idx, text in data.items()
            )

        return dataset

    def __getitem__(self, idx):
        img_dir, img_files, img_idx, text = self.data[idx]

        images = [self.image_transform(Image.open(img_file)) for img_file in img_files]
        img = torch.stack(images, dim=0)

        # Without a text transform the raw description is returned instead of
        # failing on a call to None.
        txt = self.text_transform(text) if self.text_transform is not None else text
        is_video = torch.tensor(1 if 'open-images' not in img_dir else 0)

        return img, txt, img_idx, is_video

    def __len__(self):
        return len(self.data)


--------------------------------------------------------------------------------
/baselines/clip/evaluate_clip.py:
--------------------------------------------------------------------------------
  1 | # based on: https://github.com/haltakov/natural-language-image-search#on-your-machine
  2 | from tqdm import tqdm
  3 | import json
  4 | from collections import defaultdict
  5 | from glob import glob
  6 | import os
  7 | import numpy as np
  8 | import clip
  9 | import torch
 10 | from PIL import Image
 11 | from pathlib import Path
 12 | import statistics
 13 | import argparse
 14 | 
 15 | def encode_images(photos_batch):
 16 |     photos = [Image.open(photo_file) for photo_file in photos_batch]
 17 |     photos_preprocessed = torch.stack([preprocess(photo) for photo in photos]).to(device)
 18 | 
 19 |     with torch.no_grad():
 20 |         photos_features = model.encode_image(photos_preprocessed)
 21 |         photos_features /= photos_features.norm(dim=-1, keepdim=True)
 22 |     return photos_features.cpu().numpy()
 23 | 
 24 | 
 25 | def encode_text(search_query):
 26 |     with torch.no_grad():
 27 |         text_encoded = model.encode_text(clip.tokenize(search_query, truncate=True).to(device))
 28 |         text_encoded /= text_encoded.norm(dim=-1, keepdim=True)
 29 |     return text_encoded.cpu().numpy()
 30 | 
 31 | 
 32 | def find_best_matches(text_features, photo_features):
 33 |     similarities = (photo_features @ text_features.T).squeeze(1)
 34 |     best_photo_idx = (-similarities).argsort()
 35 |     similarities = -similarities
 36 |     similarities.sort()
 37 |     return best_photo_idx, similarities
 38 | 
 39 | parser = argparse.ArgumentParser()
 40 | parser.add_argument('--checkpoint', type=str)
 41 | parser.add_argument('--test_descr_path', type=str, default='../../data/test_data.json')
 42 | parser.add_argument('--imgs_path', type=str, default='/network/scratch/b/benno.krojer/dataset/games')
 43 | parser.add_argument("--job_id")
 44 | 
 45 | args = parser.parse_args()
 46 | 
 47 | 
 48 | device = "cuda" if torch.cuda.is_available() else "cpu"
 49 | print(f'USING DEVICE: {device}')
 50 | model, preprocess = clip.load('ViT-B/16', device=device, jit=False)  # Must set jit=False for training
 51 | 
 52 | checkpoint = torch.load(args.checkpoint)
 53 | model.load_state_dict(checkpoint['model_state_dict'])
 54 | print(checkpoint['epoch'])
 55 | clip.model.convert_weights(model)  # Actually this line is unnecessary since clip by default already on float16
 56 | model.eval()
 57 | 
 58 | correct = 0
 59 | total = 0
 60 | vid_correct = 0
 61 | vid_total = 0
 62 | img_correct= 0
 63 | img_total = 0
 64 | 
 65 | 
 66 | img_dirs = args.imgs_path
 67 | descriptions = json.load(open(args.test_descr_path, 'r'))
 68 | valid = []
 69 | for img_dir, data in descriptions.items():
 70 |     for img_idx, text in data.items():
 71 |         valid.append((img_dir, img_idx, text))
 72 | 
 73 | results = defaultdict(dict)
 74 | for img_dir, img_idx, text in tqdm(valid):
 75 |     text = [text]
 76 |     img_idx = int(img_idx)
 77 |     img_files = list((Path(img_dirs) / img_dir).glob("*.jpg"))
 78 |     img_files = sorted(img_files, key=lambda x: int(str(x).split('/')[-1].split('.')[0][3:]))
 79 | 
 80 | 
 81 |     images = [Image.open(photo_file) for photo_file in img_files]
 82 |     images = torch.stack([preprocess(photo) for photo in images]).to(device)
 83 |     text = clip.tokenize(text, truncate=True).to(device)
 84 |     with torch.no_grad():
 85 |         image_features = model.encode_image(images)
 86 |         text_features = model.encode_text(text)
 87 |         # normalized features
 88 |         image_features = image_features / image_features.norm(dim=-1, keepdim=True)
 89 |         text_features = text_features / text_features.norm(dim=-1, keepdim=True)
 90 | 
 91 |         logits = (image_features @ text_features.T).squeeze(1)
 92 | 
 93 |     pred = torch.argmax(logits).squeeze()
 94 |     if img_idx == pred:
 95 |         correct += 1
 96 |     if 'open-images' in img_dir:
 97 |         img_total += 1
 98 |         if img_idx == pred:
 99 |             img_correct += 1
100 |     else:
101 |         vid_total += 1
102 |         if img_idx == pred:
103 |             vid_correct += 1        
104 | 
105 | 
106 | print('OVERALL ACC: ' + str(round(correct/len(valid),4)))
107 | print('VIDEO ACC: ' + str(round(vid_correct/vid_total,4)))
108 | print('IMG ACC: ' + str(round(img_correct/img_total,4)))
109 | json.dump(results, open(f'results/nocontra-test-data.json', 'w'), indent=2)
110 | 


--------------------------------------------------------------------------------
/baselines/clip/evaluate_contextual.py:
--------------------------------------------------------------------------------
  1 | # inspired from: https://github.com/openai/CLIP/issues/83
  2 | # https://github.com/openai/CLIP/issues/83
  3 | import json
  4 | import os
  5 | import random
  6 | import wandb
  7 | import clip
  8 | from clip import model
  9 | import torch
 10 | from torch import autograd
 11 | import tqdm
 12 | from torch import nn, optim
 13 | from PIL import Image
 14 | from pathlib import Path
 15 | from collections import defaultdict
 16 | import sys
 17 | from volta_src.config import BertConfig
 18 | from volta_src.embeddings import BertLayerNorm
 19 | from volta_src.encoders import GeLU
 20 | from extras import convert_sents_to_features, BertLayer
 21 | import argparse
 22 | 
# Seed Python's `random` module and torch with a fixed value.
random.seed(10)
torch.manual_seed(10)
# Start the Weights & Biases run used by the rest of this script.
wandb.init(project='contextualclip', notes="fixed pos emb", entity='bennokrojer', settings=wandb.Settings(start_method="fork"))
 26 | 
 27 | 
 28 | def find_best_matches(text_features, photo_features):
 29 |     similarities = (photo_features @ text_features.T).squeeze(1)
 30 |     best_photo_idx = (-similarities).argsort()
 31 |     similarities = -similarities
 32 |     similarities.sort()
 33 |     return best_photo_idx, similarities
 34 | 
 35 | 
 36 | def convert_models_to_fp32(model):
 37 |     for p in model.parameters():
 38 |         if p.grad is not None:
 39 |             p.data = p.data.float()
 40 |             p.grad.data = p.grad.data.float()
 41 | 
 42 | class ContextualCLIP(torch.nn.Module):
 43 |     def __init__(self, bert_config, args):
 44 |         super(ContextualCLIP, self).__init__()
 45 |         self.clip, self.preprocess = clip.load('ViT-B/16', device=device, jit=False)
 46 |         config = BertConfig.from_dict(bert_config)
 47 |         self.fusion = args.fusion
 48 |         if self.fusion == 'concat':
 49 |             hidden_size = 1024
 50 |         else:
 51 |             hidden_size = 512
 52 | 
 53 |         config.hidden_size =  hidden_size
 54 |         config.num_attention_heads = 8
 55 |         self.transformer = nn.ModuleList([BertLayer(config) for _ in range(args.transformer_layers)])
 56 |         self.transformer.cuda()
 57 |         self.prediction_layer = nn.Linear(config.hidden_size, 1).cuda()
 58 |         self.batch_size = 1
 59 |         self.logit_scale = float(args.logit_scale)
 60 |         self.frozen_clip = args.frozen_clip
 61 |         self.add_input = args.add_input
 62 |         self.positional = args.positional
 63 |         if args.positional:
 64 |             self.positional_emb = torch.nn.Embedding(10,hidden_size).cuda()
 65 | 
 66 |     def forward(self, images, text, pos_mask):
 67 |         if self.frozen_clip:
 68 |             with torch.no_grad():
 69 |                 image_features = self.clip.encode_image(images)
 70 |                 text_features = self.clip.encode_text(text)
 71 |         else:
 72 |             image_features = self.clip.encode_image(images)
 73 |             text_features = self.clip.encode_text(text)
 74 |         # normalized features
 75 |         image_features = image_features / image_features.norm(dim=-1, keepdim=True)
 76 |         text_features = text_features / text_features.norm(dim=-1, keepdim=True)
 77 |         text_features = torch.cat(10 * [text_features])
 78 |         if self.fusion == 'concat':
 79 |             x = torch.cat((image_features, text_features), dim=1)
 80 |         else:
 81 |             x = (self.logit_scale * image_features) * text_features
 82 |         x_ = torch.unsqueeze(x,dim=0)
 83 |         if self.positional:
 84 |             embs = self.positional_emb(torch.arange(10).cuda())
 85 |             embs = embs * pos_mask
 86 |             x_pos = x_ + embs
 87 |         else:
 88 |             x_pos = x_
 89 |         attention_mask = torch.ones((self.batch_size,1,1,10)).cuda()
 90 |         x = self.transformer[0](x_pos, attention_mask)
 91 |         for layer_module in self.transformer[1:]:
 92 |             x = layer_module(x, attention_mask) #TODO: remove hard-coding of 10
 93 |         if self.add_input:
 94 |             x = x + x_
 95 |         preds = self.prediction_layer(x.half())
 96 |         return preds
 97 | 
 98 |     def encode_images(self, photos_batch):
 99 |         photos = [Image.open(photo_file) for photo_file in photos_batch]
100 |         photos_preprocessed = torch.stack([self.preprocess(photo) for photo in photos]).to(device)
101 | 
102 |         with torch.no_grad():
103 |             photos_features = self.clip.encode_image(photos_preprocessed)
104 |             photos_features /= photos_features.norm(dim=-1, keepdim=True)
105 |         return photos_features.cpu().numpy()
106 | 
107 |     def encode_text(self, search_query):
108 |         with torch.no_grad():
109 |             text_encoded = self.clip.encode_text(clip.tokenize(search_query, truncate=True).to(device))
110 |             text_encoded /= text_encoded.norm(dim=-1, keepdim=True)
111 |         return text_encoded.cpu().numpy()
112 | 
113 | 
def _str2bool(value):
    """Parse a command-line boolean such as ``True``/``false``/``1``/``0``."""
    if isinstance(value, bool):
        return value
    lowered = str(value).strip().lower()
    if lowered in ('true', 't', 'yes', 'y', '1'):
        return True
    if lowered in ('false', 'f', 'no', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError(f'expected a boolean value, got {value!r}')


def _accuracy(n_correct, n_total):
    """Return ``n_correct / n_total`` rounded to 4 places, or NaN when ``n_total`` is 0."""
    return round(n_correct / n_total, 4) if n_total else float('nan')


config = wandb.config
parser = argparse.ArgumentParser()
parser.add_argument('--checkpoint', type=str)
parser.add_argument('--test_descr_path', type=str, default='../../data/test_data.json')
parser.add_argument('--imgs_path', type=str, default='/network/scratch/b/benno.krojer/dataset/games')
parser.add_argument("-b", "--batchsize", type=int, default=36)
parser.add_argument("--fusion", type=str, default='mult')
parser.add_argument("-a", "--activation", default='gelu')
parser.add_argument("-s", "--logit_scale", default=1000)
# Without an explicit type, "--frozen_clip False" would yield the truthy string "False".
parser.add_argument("--frozen_clip", type=_str2bool, default=True)
parser.add_argument("--add_input", type=_str2bool, default=True)
parser.add_argument("--positional", action="store_true")
parser.add_argument("--head_scheduler", default= 1.0, type=float)
parser.add_argument("--base_scheduler", default= 1.0, type=float)
parser.add_argument("--transformer_layers", default=2, type=int)
parser.add_argument("--job_id")

args = parser.parse_args()
assert args.fusion in ['concat', 'mult']
assert args.activation in ['leaky-relu', 'relu', 'gelu']
wandb.config.update(args)

img_dirs = args.imgs_path
with open(args.test_descr_path, 'r') as descr_file:
    valid_data = json.load(descr_file)
# Flatten {img_dir: {img_idx: text}} into one (img_dir, img_idx, text) triple per description.
valid = []
for img_dir, data in valid_data.items():
    for img_idx, text in data.items():
        valid.append((img_dir, img_idx, text))
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f'DEVICE USED: {device}')

with open('vilbert-and-bert-config.json', 'r') as config_file:
    bert_config = json.load(config_file)
contextual_clip = ContextualCLIP(bert_config, args)
# map_location lets a checkpoint written on a GPU be restored on a CPU-only host.
checkpoint = torch.load(args.checkpoint, map_location=device)
contextual_clip.load_state_dict(checkpoint['model_state_dict'])

wandb.watch(contextual_clip)
if device == "cpu":
    contextual_clip.float()
else:
    clip.model.convert_weights(contextual_clip)
# Evaluation only: switch dropout layers of the transformer head off.
contextual_clip.eval()


correct = 0
total = 0
vid_correct = 0
vid_total = 0
img_correct = 0
img_total = 0

results = defaultdict(dict)
for img_dir, img_idx, text in tqdm.tqdm(valid):
    img_idx = int(img_idx)
    img_files = list((Path(img_dirs) / img_dir).glob("*.jpg"))
    # Order the candidates by the integer that follows the 3-character filename prefix.
    img_files = sorted(img_files, key=lambda x: int(str(x).split('/')[-1].split('.')[0][3:]))
    preprocessed = []
    for photo_file in img_files:
        # Close each file as soon as it has been converted by the preprocess transform.
        with Image.open(photo_file) as photo:
            preprocessed.append(contextual_clip.preprocess(photo))
    images = torch.stack(preprocessed).to(device)
    tokens = clip.tokenize([text], truncate=True).to(device)
    # Positional embeddings are zeroed for 'open-images' directories and kept otherwise.
    if "open-images" in str(img_dir):
        pos_mask = torch.zeros((10, 1), device=device)
    else:
        pos_mask = torch.ones((10, 1), device=device)
    with torch.no_grad():
        logits = contextual_clip(images, tokens, pos_mask).squeeze()
    pred = int(torch.argmax(logits).item())
    hit = int(pred == img_idx)
    correct += hit
    if 'open-images' in img_dir:
        img_total += 1
        img_correct += hit
    else:
        vid_total += 1
        vid_correct += hit

    total += 1
    results[img_dir].update({
        f'raw_preds_{img_idx}': logits.squeeze().tolist(),
        f'clip_pred_{img_idx}': pred,
        f'correct_{img_idx}': hit,
    })

print('OVERALL ACC: ' + str(_accuracy(correct, len(valid))))
print('VIDEO ACC: ' + str(_accuracy(vid_correct, vid_total)))
print('IMG ACC: ' + str(_accuracy(img_correct, img_total)))
os.makedirs('results', exist_ok=True)
# NOTE(review): this first dump reuses the non-contextual results filename and looks
# like a copy-paste leftover; kept for compatibility -- confirm whether it can go.
with open('results/nocontra-test-data.json', 'w') as results_file:
    json.dump(results, results_file, indent=2)
with open('results/CONTEXTUAL_test_set.json', 'w') as results_file:
    json.dump(results, results_file, indent=2)


--------------------------------------------------------------------------------
/baselines/clip/extras.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import sys
  3 | 
  4 | import torch
  5 | from torch import nn
  6 | 
  7 | from volta_src.embeddings import BertLayerNorm
  8 | from volta_src.encoders import ACT2FN
  9 | 
 10 | 
 11 | class BertSelfAttention(nn.Module):
 12 |     def __init__(self, config):
 13 |         super(BertSelfAttention, self).__init__()
 14 |         if config.hidden_size % config.num_attention_heads != 0:
 15 |             raise ValueError(
 16 |                 "The hidden size (%d) is not a multiple of the number of attention "
 17 |                 "heads (%d)" % (config.hidden_size, config.num_attention_heads))
 18 |         self.num_attention_heads = config.num_attention_heads
 19 |         self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
 20 |         self.all_head_size = self.num_attention_heads * self.attention_head_size
 21 | 
 22 |         self.query = nn.Linear(config.hidden_size, self.all_head_size)
 23 |         self.key = nn.Linear(config.hidden_size, self.all_head_size)
 24 |         self.value = nn.Linear(config.hidden_size, self.all_head_size)
 25 | 
 26 |         self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
 27 | 
 28 |     def transpose_for_scores(self, x):
 29 |         new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
 30 |         x = x.view(*new_x_shape)
 31 |         return x.permute(0, 2, 1, 3)
 32 | 
 33 |     def forward(self, hidden_states, attention_mask=None):
 34 |         mixed_query_layer = self.query(hidden_states.half())
 35 |         mixed_key_layer = self.key(hidden_states.half())
 36 |         mixed_value_layer = self.value(hidden_states.half())
 37 | 
 38 |         query_layer = self.transpose_for_scores(mixed_query_layer).float()
 39 |         key_layer = self.transpose_for_scores(mixed_key_layer).float()
 40 |         value_layer = self.transpose_for_scores(mixed_value_layer).float()
 41 | 
 42 |         # Take the dot product between "query" and "key" to get the raw attention scores.
 43 |         attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
 44 |         attention_scores = attention_scores / math.sqrt(self.attention_head_size)
 45 |         # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
 46 |         if attention_mask is not None:
 47 |             attention_scores = attention_scores + attention_mask
 48 | 
 49 |         # Normalize the attention scores to probabilities.
 50 |         attention_probs = nn.Softmax(dim=-1)(attention_scores)
 51 | 
 52 |         # This is actually dropping out entire tokens to attend to, which might
 53 |         # seem a bit unusual, but is taken from the original Transformer paper.
 54 |         attention_probs = self.dropout(attention_probs)
 55 | 
 56 |         context_layer = torch.matmul(attention_probs, value_layer)
 57 |         context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
 58 |         new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
 59 |         context_layer = context_layer.view(*new_context_layer_shape)
 60 |         return context_layer
 61 | 
 62 | 
 63 | class BertSelfOutput(nn.Module):
 64 |     def __init__(self, config):
 65 |         super(BertSelfOutput, self).__init__()
 66 |         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
 67 |         self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
 68 |         self.dropout = nn.Dropout(config.hidden_dropout_prob)
 69 | 
 70 |     def forward(self, hidden_states, input_tensor):
 71 |         hidden_states = hidden_states.half()
 72 |         hidden_states = self.dense(hidden_states)
 73 |         hidden_states = self.dropout(hidden_states)
 74 |         hidden_states = self.LayerNorm(hidden_states + input_tensor)
 75 |         return hidden_states
 76 | 
 77 | 
 78 | class BertAttention(nn.Module):
 79 |     def __init__(self, config):
 80 |         super(BertAttention, self).__init__()
 81 |         self.self = BertSelfAttention(config)
 82 |         self.output = BertSelfOutput(config).cuda()
 83 | 
 84 |     def forward(self, input_tensor, attention_mask):
 85 |         self_output = self.self(input_tensor, attention_mask)
 86 |         attention_output = self.output(self_output, input_tensor)
 87 |         return attention_output
 88 | 
 89 | 
 90 | class BertIntermediate(nn.Module):
 91 |     def __init__(self, config):
 92 |         super(BertIntermediate, self).__init__()
 93 |         self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
 94 |         if isinstance(config.hidden_act, str) or (sys.version_info[0] == 2 and isinstance(config.hidden_act, unicode)):
 95 |             self.intermediate_act_fn = ACT2FN[config.hidden_act]
 96 |         else:
 97 |             self.intermediate_act_fn = config.hidden_act
 98 | 
 99 |     def forward(self, hidden_states):
100 |         hidden_states = self.dense(hidden_states)
101 |         hidden_states = self.intermediate_act_fn(hidden_states)
102 |         return hidden_states
103 | 
104 | 
class BertOutput(nn.Module):
    """Projects the intermediate activations back to the hidden size with a residual layer norm."""

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        """Return ``LayerNorm(dropout(dense(hidden_states)) + input_tensor)``."""
        projected = self.dropout(self.dense(hidden_states))
        return self.LayerNorm(projected + input_tensor)
117 | 
118 | 
class BertLayer(nn.Module):
    """One transformer block: attention, intermediate expansion and output projection."""

    def __init__(self, config):
        super().__init__()
        self.attention = BertAttention(config)
        self.intermediate = BertIntermediate(config)
        self.output = BertOutput(config)

    def forward(self, hidden_states, attention_mask):
        """Run the block; activations are cast to half before each feed-forward stage."""
        attended_fp16 = self.attention(hidden_states, attention_mask).half()
        expanded = self.intermediate(attended_fp16)
        return self.output(expanded.half(), attended_fp16)
131 | 
132 | 
133 | """
134 | The above modules are copied from BERT.
135 | """
136 | 
137 | 
class InputFeatures:
    """Container for the token ids, attention mask and segment ids of one sentence."""

    def __init__(self, input_ids, input_mask, segment_ids):
        self.input_ids, self.input_mask, self.segment_ids = (
            input_ids,
            input_mask,
            segment_ids,
        )
145 | 
146 | 
def convert_sents_to_features(sents, max_seq_length, tokenizer):
    """Tokenize sentences and pad them to a common length.

    Each sentence is stripped, tokenized with ``tokenizer.tokenize``, truncated
    to ``max_seq_length - 2`` tokens and wrapped in ``[CLS]`` / ``[SEP]``.  All
    sequences are then zero-padded to the length of the longest sequence in
    the batch (not to ``max_seq_length``).

    Args:
        sents: iterable of sentence strings.
        max_seq_length: maximum sequence length, special tokens included.
        tokenizer: object providing ``tokenize(str)`` and
            ``convert_tokens_to_ids(list)``.

    Returns:
        A list with one ``InputFeatures`` per sentence; an empty list when
        ``sents`` is empty.
    """
    # Account for [CLS] and [SEP] with "- 2"
    budget = max_seq_length - 2
    tokenized_sentences = []
    for sent in sents:
        tokens_a = tokenizer.tokenize(sent.strip())
        if len(tokens_a) > budget:
            tokens_a = tokens_a[:budget]
        tokenized_sentences.append(["[CLS]"] + tokens_a + ["[SEP]"])

    # max() below would raise ValueError on an empty batch.
    if not tokenized_sentences:
        return []

    max_len = max(len(tokens) for tokens in tokenized_sentences)

    features = []
    for tokens in tokenized_sentences:
        input_ids = list(tokenizer.convert_tokens_to_ids(tokens))
        n_pad = max_len - len(input_ids)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
        input_mask = [1] * len(input_ids) + [0] * n_pad
        # Keep segment id which allows loading BERT-weights.
        segment_ids = [0] * len(tokens) + [0] * n_pad
        input_ids = input_ids + [0] * n_pad

        assert len(input_ids) == max_len
        assert len(input_mask) == max_len
        assert len(segment_ids) == max_len

        features.append(
            InputFeatures(input_ids=input_ids,
                          input_mask=input_mask,
                          segment_ids=segment_ids))
    return features
190 | 
191 | 
192 | """
193 | The above modules are copied from LXMERT.
194 | """
195 | 


--------------------------------------------------------------------------------
/baselines/clip/nocontra.py:
--------------------------------------------------------------------------------
  1 | # inspired from: https://github.com/openai/CLIP/issues/83
  2 | # https://github.com/openai/CLIP/issues/83
  3 | import json
  4 | import os
  5 | import random
  6 | import wandb
  7 | import clip
  8 | from clip import model
  9 | import torch
 10 | from torch import autograd
 11 | import tqdm
 12 | from torch import nn, optim
 13 | from PIL import Image
 14 | from pathlib import Path
 15 | from collections import defaultdict
 16 | import argparse
# Seed Python's `random` module and torch with a fixed value.
random.seed(10)
torch.manual_seed(10)
# Start the Weights & Biases run used by the rest of this script.
wandb.init(project='noncontra-finetune-clip', settings=wandb.Settings(start_method='fork'))
 20 | 
 21 | def encode_images(photos_batch):
 22 |     photos = [Image.open(photo_file) for photo_file in photos_batch]
 23 |     photos_preprocessed = torch.stack([preprocess(photo) for photo in photos]).to(device)
 24 | 
 25 |     with torch.no_grad():
 26 |         photos_features = model.encode_image(photos_preprocessed)
 27 |         photos_features /= photos_features.norm(dim=-1, keepdim=True)
 28 |     return photos_features.cpu().numpy()
 29 | 
 30 | 
 31 | def encode_text(search_query):
 32 |     with torch.no_grad():
 33 |         text_encoded = model.encode_text(clip.tokenize(search_query, truncate=True).to(device))
 34 |         text_encoded /= text_encoded.norm(dim=-1, keepdim=True)
 35 |     return text_encoded.cpu().numpy()
 36 | 
 37 | 
 38 | def find_best_matches(text_features, photo_features):
 39 |     similarities = (photo_features @ text_features.T).squeeze(1)
 40 |     best_photo_idx = (-similarities).argsort()
 41 |     similarities = -similarities
 42 |     similarities.sort()
 43 |     return best_photo_idx, similarities
 44 | 
 45 | 
 46 | def convert_models_to_fp32(model):
 47 |     for p in model.parameters():
 48 |         p.data = p.data.float()
 49 |         p.grad.data = p.grad.data.float()
 50 | 
 51 | config = wandb.config
 52 | parser = argparse.ArgumentParser()
 53 | parser.add_argument('--batchsize', type=int, default=36)
 54 | parser.add_argument('--lr', type=float, default=4e-6)
 55 | parser.add_argument('--epochs', type=int, default=30)
 56 | parser.add_argument('--valid_descr_path', type=str, default='../../data/valid_data.json')
 57 | parser.add_argument('--train_descr_path', type=str, default='../../data/train_data.json')
 58 | parser.add_argument('--imgs_path', type=str, default='/network/scratch/b/benno.krojer/dataset/games')
 59 | parser.add_argument("--job_id")
 60 | 
 61 | args = parser.parse_args()
 62 | wandb.config.update(args)
 63 | device = "cuda" if torch.cuda.is_available() else "cpu"
 64 | print(f'DEVICE USED: {device}')
 65 | model, preprocess = clip.load('ViT-B/16', device=device, jit=False)
 66 | wandb.watch(model)
 67 | if device == "cpu":
 68 |     model.float()
 69 | else:
 70 |     clip.model.convert_weights(model)  # Actually this line is unnecessary since clip by default already on float16
 71 | 
 72 | img_dirs = args.imgs_path
 73 | valid_data = json.load(open(args.valid_descr_path, 'r'))
 74 | train_data = json.load(open(args.train_descr_path, 'r'))
 75 | valid = []
 76 | for img_dir, data in valid_data.items():
 77 |     for img_idx, text in data.items():
 78 |         valid.append((img_dir, int(img_idx), text))
 79 | train = []
 80 | all_img = []
 81 | for img_dir, data in train_data.items():
 82 |     img_files = list((Path(img_dirs) / img_dir).glob("*.jpg"))
 83 |     all_img += img_files  
 84 | for img_dir, data in train_data.items():
 85 |     img_files = list((Path(img_dirs) / img_dir).glob("*.jpg"))
 86 |     img_files = sorted(img_files, key=lambda x: int(str(x).split('/')[-1].split('.')[0][3:]))
 87 |     for img_idx, text in data.items():
 88 |         true_img = img_files[int(img_idx)]
 89 |         distr_imgs = random.sample(all_img, 9)
 90 |         img_files = [true_img] + distr_imgs
 91 |         train.append((img_files, text))
 92 | 
 93 | criterion = nn.CrossEntropyLoss()
 94 | optimizer = optim.Adam(model.parameters(), lr=config.lr, betas=(0.9, 0.98), eps=1e-6, weight_decay=0.2)
 95 | best_val = 0
 96 | 
 97 | sigm = nn.Sigmoid()
 98 | for i in range(args.epochs):
 99 |     save_model = False
100 |     # EVALUATE
101 |     if i != 0:
102 |         correct = 0
103 |         ranks = defaultdict(int)
104 |         for img_dir, img_idx, text in tqdm.tqdm(valid):
105 |             img_files = list((Path(img_dirs)/img_dir).glob("*.jpg"))
106 |             img_files = sorted(img_files, key=lambda x: int(str(x).split('/')[-1].split('.')[0][3:]))
107 |             img_embs = encode_images(img_files)
108 |             text_emb = encode_text(text.strip())
109 |             ranked_idx, sim = find_best_matches(text_emb, img_embs)
110 |             ranked_files = [str(img_files[rank]).split('/')[-1][:-4] for rank in ranked_idx]
111 |             target = str(img_files[int(img_idx)]).split('/')[-1][:-4]
112 |             if ranked_files[0] == target:
113 |                 correct += 1
114 |             ranks[ranked_files.index(target)+1] += 1
115 |         print(correct)
116 |         print(len(valid))
117 |         print(ranks)
118 |         acc = correct / len(valid)
119 |         wandb.log({'val_acc': acc})
120 |         if acc > best_val:
121 |             best_val = acc
122 |             save_model = True
123 |             string = ''
124 |             for key, val in list(vars(args).items()):
125 |                 if 'path' not in key:
126 |                     string += f'_{val}'
127 |             torch.save({
128 |                 'epoch': i,
129 |                 'model_state_dict': model.state_dict(),
130 |                 'optimizer_state_dict': optimizer.state_dict(),
131 |             }, f"checkpoints/NOCONTRA_clip_best_{string.replace('/', '')}.pt")
132 |         print('------------------------------')
133 | 
134 | 
135 |     print(f'EPOCH: {i}')
136 |     step = 0
137 |     random.shuffle(train)
138 |     for img_files, text in train:
139 |         step += 1
140 |         text = [text]
141 |         images = [Image.open(photo_file) for photo_file in img_files]
142 |         images = torch.stack([preprocess(photo) for photo in images]).to(device)
143 |         text = clip.tokenize(text, truncate=True).to(device)
144 |         logits, _ = model(images, text)
145 |         logits = logits.permute(1,0)
146 |         ground_truth = torch.tensor([0]).long().to(device)  # the index of the correct one
147 |         loss = criterion(logits, ground_truth)
148 |         loss.backward()
149 |         if step % config.batchsize == 0:
150 |             print('STEP: '+ str(step))
151 |             print(f'TOTAL LOSS: {loss}')
152 |             wandb.log({'loss': loss})
153 |             if device == "cpu":
154 |                 optimizer.step()
155 |             else:
156 |                 convert_models_to_fp32(model)
157 |                 optimizer.step()
158 |                 clip.model.convert_weights(model)
159 |             optimizer.zero_grad()
160 | 


--------------------------------------------------------------------------------
/baselines/clip/vilbert-and-bert-config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "attention_probs_dropout_prob": 0.1,
 3 |   "hidden_act": "gelu",
 4 |   "hidden_dropout_prob": 0.1,
 5 |   "hidden_size": 768,
 6 |   "initializer_range": 0.02,
 7 |   "intermediate_size": 3072,
 8 |   "max_position_embeddings": 512,
 9 |   "num_attention_heads": 12,
10 |   "pooler_size": 1024,
11 |   "type_vocab_size": 2,
12 |   "vocab_size": 30522,
13 |   "bert_model": "bert-base-uncased",
14 |   "do_lower_case": true,
15 |   "num_locs": 5,
16 |   "add_global_imgfeat": "first",
17 |   "image_embeddings": "vilbert",
18 |   "v_attention_probs_dropout_prob": 0.1,
19 |   "v_hidden_act": "gelu",
20 |   "v_hidden_dropout_prob": 0.1,
21 |   "v_feature_size": 2048,
22 |   "visual_target_weights": {"0": 1.0},
23 |   "v_hidden_size": 768,
24 |   "v_initializer_range": 0.02,
25 |   "v_pooler_size": 1024,
26 |   "v_num_attention_heads": 12,
27 |   "v_intermediate_size": 3072,
28 |   "fusion_method": "mul",
29 |   "clf_hidden_size": 1536,
30 |   "tt_attn_sublayers": [0,2,4,6,8,10,14,18,22,26,30,34],
31 |   "tv_attn_sublayers": [12,16,20,24,28,32],
32 |   "vt_attn_sublayers": [12,16,20,24,28,32],
33 |   "vv_attn_sublayers": [14,18,22,26,30,34],
34 |   "t_ff_sublayers": [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35],
35 |   "v_ff_sublayers": [13,15,17,19,21,23,25,27,29,31,33,35],
36 |   "shared_sublayers": [],
37 |   "single_ln_sublayers": [],
38 |   "sublayer2attn_hidden_size": {},
39 |   "sublayer2num_attention_heads": {},
40 |   "sublayer2intermediate_size": {},
41 |   "sublayer2v_attn_hidden_size": {},
42 |   "sublayer2v_num_attention_heads": {},
43 |   "sublayer2v_intermediate_size": {},
44 |   "bert_layer2attn_sublayer": {
45 |     "0": 0, "1": 2, "2": 4, "3": 6, "4": 8, "5": 10,
46 |     "6": 14, "7": 18, "8": 22, "9": 26, "10": 30, "11": 34
47 |   },
48 |   "bert_layer2ff_sublayer": {
49 |     "0": 1, "1": 3, "2": 5, "3": 7, "4": 9, "5": 11,
50 |     "6": 15, "7": 19, "8": 23, "9": 27, "10": 31, "11": 35
51 |   }
52 | }


--------------------------------------------------------------------------------
/baselines/clip/volta_src/config.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2020, Emanuele Bugliarello (@e-bug).
  2 | 
  3 | # This source code is licensed under the MIT license found in the
  4 | # LICENSE file in the root directory of this source tree.
  5 | 
  6 | import json
  7 | import copy
  8 | from io import open
  9 | 
 10 | 
 11 | class BertConfig(object):
 12 |     """Configuration class to store the configuration of a `BertModel`.
 13 |     """
 14 | 
 15 |     def __init__(
 16 |         self,
 17 |         vocab_size_or_config_json_file,
 18 |         hidden_size=768,
 19 |         num_attention_heads=12,
 20 |         intermediate_size=3072,
 21 |         pooler_size=768,
 22 |         hidden_act="gelu",
 23 |         hidden_dropout_prob=0.1,
 24 |         attention_probs_dropout_prob=0.1,
 25 |         max_position_embeddings=512,
 26 |         type_vocab_size=2,
 27 |         do_lower_case=True,
 28 |         num_locs=5,
 29 |         v_coordinate_embeddings_dim=None,
 30 |         add_global_imgfeat=None,
 31 |         image_embeddings="vilbert",
 32 |         initializer_range=0.02,
 33 |         v_feature_size=2048,
 34 |         v_hidden_size=768,
 35 |         v_num_attention_heads=12,
 36 |         v_intermediate_size=3072,
 37 |         v_pooler_size=1024,
 38 |         v_attention_probs_dropout_prob=0.1,
 39 |         v_hidden_act="gelu",
 40 |         v_hidden_dropout_prob=0.1,
 41 |         v_initializer_range=0.2,
 42 |         visual_target_weights={"0": 1},
 43 |         qa_task_weight=0.0,
 44 |         qa_num_answers=0,
 45 |         fixed_layers=[],
 46 |         fusion_method="mul",
 47 |         objective=0,
 48 |         clf_hidden_size=1536,
 49 |         image_head_ln=True,
 50 |         bert_model="bert-base-uncased",
 51 |         visualization=False,
 52 |         tt_attn_sublayers=[],
 53 |         tv_attn_sublayers=[],
 54 |         vt_attn_sublayers=[],
 55 |         vv_attn_sublayers=[],
 56 |         t_ff_sublayers=[],
 57 |         v_ff_sublayers=[],
 58 |         shared_sublayers=[],
 59 |         single_ln_sublayers=[],
 60 |         sublayer2attn_hidden_size={},
 61 |         sublayer2num_attention_heads={},
 62 |         sublayer2intermediate_size={},
 63 |         sublayer2v_attn_hidden_size={},
 64 |         sublayer2v_num_attention_heads={},
 65 |         sublayer2v_intermediate_size={},
 66 |         bert_layer2attn_sublayer={},
 67 |         bert_layer2ff_sublayer={},
 68 |     ):
 69 |         """Constructs BertConfig.
 70 | 
 71 |         Args:
 72 |             vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`.
 73 |             hidden_size: Size of the encoder layers and the pooler layer.
 74 |             num_hidden_layers: Number of hidden layers in the Transformer encoder.
 75 |             num_attention_heads: Number of attention heads for each attention layer in
 76 |                 the Transformer encoder.
 77 |             intermediate_size: The size of the "intermediate" (i.e., feed-forward)
 78 |                 layer in the Transformer encoder.
 79 |             hidden_act: The non-linear activation function (function or string) in the
 80 |                 encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
 81 |             hidden_dropout_prob: The dropout probabilitiy for all fully connected
 82 |                 layers in the embeddings, encoder, and pooler.
 83 |             attention_probs_dropout_prob: The dropout ratio for the attention
 84 |                 probabilities.
 85 |             max_position_embeddings: The maximum sequence length that this model might
 86 |                 ever be used with. Typically set this to something large just in case
 87 |                 (e.g., 512 or 1024 or 2048).
 88 |             type_vocab_size: The vocabulary size of the `token_type_ids` passed into
 89 |                 `BertModel`.
 90 |             initializer_range: The sttdev of the truncated_normal_initializer for
 91 |                 initializing all weight matrices.
 92 |         """
 93 |         if isinstance(vocab_size_or_config_json_file, str):
 94 |             with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
 95 |                 json_config = json.loads(reader.read())
 96 |             for key, value in json_config.items():
 97 |                 self.__dict__[key] = value
 98 |         elif isinstance(vocab_size_or_config_json_file, int):
 99 |             # Text
100 |             self.vocab_size = vocab_size_or_config_json_file
101 |             self.hidden_size = hidden_size
102 |             self.num_attention_heads = num_attention_heads
103 |             self.hidden_act = hidden_act
104 |             self.intermediate_size = intermediate_size
105 |             self.hidden_dropout_prob = hidden_dropout_prob
106 |             self.attention_probs_dropout_prob = attention_probs_dropout_prob
107 |             self.max_position_embeddings = max_position_embeddings
108 |             self.type_vocab_size = type_vocab_size
109 |             self.do_lower_case = do_lower_case
110 |             self.initializer_range = initializer_range
111 |             self.pooler_size = pooler_size
112 |             # Vision
113 |             self.num_locs = num_locs
114 |             self.v_coordinate_embeddings_dim = v_coordinate_embeddings_dim
115 |             self.add_global_imgfeat = add_global_imgfeat
116 |             self.image_embeddings = image_embeddings
117 |             self.v_feature_size = v_feature_size
118 |             self.v_hidden_size = v_hidden_size
119 |             self.v_num_attention_heads = v_num_attention_heads
120 |             self.v_intermediate_size = v_intermediate_size
121 |             self.v_attention_probs_dropout_prob = v_attention_probs_dropout_prob
122 |             self.v_hidden_act = v_hidden_act
123 |             self.v_hidden_dropout_prob = v_hidden_dropout_prob
124 |             self.v_initializer_range = v_initializer_range
125 |             self.v_pooler_size = v_pooler_size
126 |             self.qa_task_weight = qa_task_weight
127 |             self.qa_num_answers = qa_num_answers
128 |             # Text-Vision
129 |             self.tt_attn_sublayers = tt_attn_sublayers
130 |             self.tv_attn_sublayers = tv_attn_sublayers
131 |             self.vt_attn_sublayers = vt_attn_sublayers
132 |             self.vv_attn_sublayers = vv_attn_sublayers
133 |             self.t_ff_sublayers = t_ff_sublayers
134 |             self.v_ff_sublayers = v_ff_sublayers
135 |             self.shared_sublayers = shared_sublayers
136 |             self.single_ln_sublayers = single_ln_sublayers
137 |             self.sublayer2attn_hidden_size = sublayer2attn_hidden_size
138 |             self.sublayer2num_attention_heads = sublayer2num_attention_heads
139 |             self.sublayer2intermediate_size = sublayer2intermediate_size
140 |             self.sublayer2v_attn_hidden_size = sublayer2v_attn_hidden_size
141 |             self.sublayer2v_num_attention_heads = sublayer2v_num_attention_heads
142 |             self.sublayer2v_intermediate_size = sublayer2v_intermediate_size
143 |             self.bert_layer2attn_sublayer = bert_layer2attn_sublayer
144 |             self.bert_layer2ff_sublayer = bert_layer2ff_sublayer
145 |             self.image_head_ln = image_head_ln
146 |             # Else
147 |             self.visual_target_weights = visual_target_weights
148 |             self.fixed_layers = fixed_layers
149 |             self.bert_model = bert_model
150 |             # Pre-training
151 |             self.fusion_method = fusion_method
152 |             self.objective = objective
153 |             # Fine-tuning
154 |             self.clf_hidden_size = clf_hidden_size
155 |             self.visualization = visualization
156 |         else:
157 |             raise ValueError(
158 |                 "First argument must be either a vocabulary size (int)"
159 |                 "or the path to a pretrained model config file (str)"
160 |             )
161 | 
162 |     @classmethod
163 |     def from_dict(cls, json_object):
164 |         """Constructs a `BertConfig` from a Python dictionary of parameters."""
165 |         config = BertConfig(vocab_size_or_config_json_file=-1)
166 |         for key, value in json_object.items():
167 |             config.__dict__[key] = value
168 |         return config
169 | 
170 |     @classmethod
171 |     def from_json_file(cls, json_file):
172 |         """Constructs a `BertConfig` from a json file of parameters."""
173 |         with open(json_file, "r", encoding="utf-8") as reader:
174 |             text = reader.read()
175 |         return cls.from_dict(json.loads(text))
176 | 
177 |     def __repr__(self):
178 |         return str(self.to_json_string())
179 | 
180 |     def to_dict(self):
181 |         """Serializes this instance to a Python dictionary."""
182 |         output = copy.deepcopy(self.__dict__)
183 |         return output
184 | 
185 |     def to_json_string(self):
186 |         """Serializes this instance to a JSON string."""
187 |         return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"


--------------------------------------------------------------------------------
/baselines/clip/volta_src/losses.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2020, Emanuele Bugliarello (@e-bug).
  2 | 
  3 | # This source code is licensed under the MIT license found in the
  4 | # LICENSE file in the root directory of this source tree.
  5 | 
  6 | import copy
  7 | 
  8 | import torch
  9 | import torch.nn as nn
 10 | import torch.nn.functional as F
 11 | 
 12 | 
 13 | # ==================================================================================================================== #
 14 | #                                                  Vision Pretraining                                                  #
 15 | # ==================================================================================================================== #
 16 | def kl_1601(prediction_scores_v, weight, label, image_cls, image_feat, obj_labels, obj_confs, attr_labels, attr_confs):
 17 |     if (weight > 0) and (image_cls is not None):
 18 |         image_target = image_cls
 19 |         loss = nn.KLDivLoss(reduction="none")(F.log_softmax(prediction_scores_v, dim=2), image_target)
 20 |         return weight * torch.sum(loss * (label == 1).unsqueeze(2).float()) / max(torch.sum((label == 1)), 1)
 21 |     else:
 22 |         return 0
 23 | 
 24 | 
 25 | def mse_2048(prediction_scores_v, weight, label, image_cls, image_feat, obj_labels, obj_confs, attr_labels, attr_confs):
 26 |     # regress the feature
 27 |     if (weight > 0) and (image_feat is not None):
 28 |         image_target = copy.deepcopy(image_feat)
 29 |         loss = nn.MSELoss(reduction="none")(prediction_scores_v, image_target)
 30 |         return weight * torch.sum(loss * (label == 1).unsqueeze(2).float()) / \
 31 |             max(torch.sum((label == 1).unsqueeze(2).expand_as(loss)), 1)
 32 |     else:
 33 |         return 0
 34 | 
 35 | 
 36 | def nce_2048(prediction_scores_v, weight, label, image_cls, image_feat, obj_labels, obj_confs, attr_labels, attr_confs):
 37 |     # NCE loss
 38 |     num_negative = 128
 39 |     if (weight > 0) and (image_feat is not None):
 40 | 
 41 |         image_target = copy.deepcopy(image_feat)
 42 | 
 43 |         # generate negative sampled index.
 44 |         num_across_batch = int(num_negative * 0.7)
 45 |         num_inside_batch = int(num_negative * 0.3)
 46 | 
 47 |         batch_size, num_regions, _ = prediction_scores_v.size()
 48 |         assert batch_size != 0
 49 |         # random negative across batches.
 50 |         row_across_index = image_target.new(batch_size, num_regions, num_across_batch).random_(0, batch_size - 1)
 51 |         col_across_index = image_target.new(batch_size, num_regions, num_across_batch).random_(0, num_regions)
 52 | 
 53 |         for i in range(batch_size - 1):
 54 |             row_across_index[i][row_across_index[i] == i] = batch_size - 1
 55 |         final_across_index = row_across_index * num_regions + col_across_index
 56 | 
 57 |         # random negative inside batches.
 58 |         row_inside_index = image_target.new(batch_size, num_regions, num_inside_batch).zero_()
 59 |         col_inside_index = image_target.new(batch_size, num_regions, num_inside_batch).random_(0, num_regions - 1)
 60 | 
 61 |         for i in range(batch_size):
 62 |             row_inside_index[i] = i
 63 |         for i in range(num_regions - 1):
 64 |             col_inside_index[:, i, :][col_inside_index[:, i, :] == i] = num_regions - 1
 65 |         final_inside_index = row_inside_index * num_regions + col_inside_index
 66 | 
 67 |         final_index = torch.cat((final_across_index, final_inside_index), dim=2)
 68 | 
 69 |         # Let's first sample where we need to compute.
 70 |         predict_v = prediction_scores_v[label == 1]
 71 |         neg_index_v = final_index[label == 1]
 72 | 
 73 |         flat_image_target = image_target.view(batch_size * num_regions, -1)
 74 |         # we also need to append the target feature at the beginning.
 75 |         negative_v = flat_image_target[neg_index_v]
 76 |         positive_v = image_target[label == 1]
 77 |         sample_v = torch.cat((positive_v.unsqueeze(1), negative_v), dim=1)
 78 | 
 79 |         # calculate the loss.
 80 |         score = torch.bmm(sample_v, predict_v.unsqueeze(2)).squeeze(2)
 81 |         return weight * nn.CrossEntropyLoss()(score, image_target.new(score.size(0)).zero_())
 82 |     else:
 83 |         return 0
 84 | 
 85 | 
 86 | def xent_1600(prediction_scores_v, weight, label, image_cls, image_feat, obj_labels, obj_confs, attr_labels, attr_confs):
 87 |     if (weight > 0) and (obj_labels is not None) and (obj_confs is not None):
 88 |         # hard object labels
 89 |         image_target, mask_conf = obj_labels, obj_confs
 90 |         loss = nn.CrossEntropyLoss(reduction='none')(prediction_scores_v.reshape(-1, 1600), image_target.view(-1,))
 91 |         loss = loss * mask_conf.view(-1)
 92 |         return weight * torch.sum(loss * (label.view(-1) == 1)) / max(torch.sum((label == 1)), 1)
 93 |     else:
 94 |         return 0
 95 | 
 96 | 
 97 | def xent_400(prediction_scores_v, weight, label, image_cls, image_feat, obj_labels, obj_confs, attr_labels, attr_confs):
 98 |     if (weight > 0) and (attr_labels is not None) and (attr_confs is not None):
 99 |         # hard attribute labels
100 |         image_target, mask_conf = attr_labels, attr_confs
101 |         loss = nn.CrossEntropyLoss(reduction='none')(prediction_scores_v.reshape(-1, 400), image_target.view(-1,))
102 |         loss = loss * mask_conf.view(-1)
103 |         return weight * torch.sum(loss * (label.view(-1) == 1)) / max(torch.sum((label == 1)), 1)
104 |     else:
105 |         return 0
106 | 
107 | 
def huber_2048(prediction_scores_v, weight, label, image_cls, image_feat, obj_labels, obj_confs, attr_labels, attr_confs):
    """Smooth-L1 (Huber) criterion that regresses the target features.

    The element-wise smooth-L1 error between `prediction_scores_v` and
    `image_feat` is summed over the positions where `label == 1` and divided
    by the number of selected elements (at least 1).

    Returns 0 when `weight` is not positive or `image_feat` is None. The other
    arguments are unused; they exist so every criterion shares one signature.
    """
    if not (weight > 0) or image_feat is None:
        return 0

    # The target is only read, so a detached view is enough: it keeps
    # gradients from reaching the features without copying the whole tensor
    # (copy.deepcopy also fails on non-leaf tensors).
    image_target = image_feat.detach()
    loss = F.smooth_l1_loss(prediction_scores_v, image_target, reduction='none')
    selected = (label == 1).unsqueeze(2)
    # max(..., 1) avoids a division by zero when nothing is selected.
    return weight * torch.sum(loss * selected.float()) / \
        max(torch.sum(selected.expand_as(loss)), 1)
117 | 
118 | 
def xent_1601(prediction_scores_v, weight, label, image_cls, image_feat, obj_labels, obj_confs, attr_labels, attr_confs):
    """Cross-entropy criterion over 1601 classes.

    The scores are flattened to rows of 1601 logits and compared with the hard
    labels in `obj_labels`. Only rows where `label == 1` contribute, and the
    sum is divided by the number of such rows (at least 1).

    Returns 0 when `weight` is not positive or `obj_labels` is None. The other
    arguments are unused; they exist so every criterion shares one signature.
    """
    if not (weight > 0) or obj_labels is None:
        return 0

    flat_logits = prediction_scores_v.reshape(-1, 1601)
    per_row = F.cross_entropy(flat_logits, obj_labels.view(-1,), reduction='none')
    keep = label.view(-1) == 1
    # max(..., 1) avoids a division by zero when nothing is selected.
    return weight * torch.sum(per_row * keep) / max(torch.sum((label == 1)), 1)
127 | 
128 | 
# Target size registered for each visual objective id ("0" .. "6"); the ids
# are the same string keys used by `pre_vis_criterions`.
pre_vis_targets = {
    str(objective_id): target_size
    for objective_id, target_size in enumerate(
        (1601, 2048, 2048, 1600, 400, 2048, 1601)
    )
}
138 | 
# Loss function registered for each visual objective id ("0" .. "6"); the ids
# are the same string keys used by `pre_vis_targets`.
pre_vis_criterions = {
    str(objective_id): criterion
    for objective_id, criterion in enumerate(
        (kl_1601, mse_2048, nce_2048, xent_1600, xent_400, huber_2048, xent_1601)
    )
}


--------------------------------------------------------------------------------
/baselines/clip/zero_shot.py:
--------------------------------------------------------------------------------
 1 | # based on: https://github.com/haltakov/natural-language-image-search
 2 | from tqdm import tqdm
 3 | import json
 4 | from collections import defaultdict
 5 | from glob import glob
 6 | import os
 7 | import numpy as np
 8 | import clip
 9 | import torch
10 | from PIL import Image
11 | from pathlib import Path
12 | import argparse
13 | 
def encode_images(photos_batch):
    """Encode image files into L2-normalised feature vectors.

    Each file in `photos_batch` is opened, passed through the module-level
    `preprocess` transform, stacked into one batch on `device` and encoded
    with `model.encode_image` without tracking gradients.

    Returns:
        A NumPy array with one unit-norm feature row per input file.
    """
    preprocessed = []
    for photo_file in photos_batch:
        # Image.open keeps the underlying file open; the context manager
        # releases the handle as soon as the image has been preprocessed, so
        # large evaluations do not accumulate open file descriptors.
        with Image.open(photo_file) as photo:
            preprocessed.append(preprocess(photo))
    photos_preprocessed = torch.stack(preprocessed).to(device)

    with torch.no_grad():
        photos_features = model.encode_image(photos_preprocessed)
        photos_features /= photos_features.norm(dim=-1, keepdim=True)
    return photos_features.cpu().numpy()
22 | 
23 | 
def encode_text(search_query):
    """Encode a text query into an L2-normalised feature vector.

    The query is tokenized with `clip.tokenize` (with `truncate=True`), moved
    to `device` and encoded with `model.encode_text` without tracking
    gradients.

    Returns:
        A NumPy array holding the unit-norm text features.
    """
    with torch.no_grad():
        tokens = clip.tokenize(search_query, truncate=True).to(device)
        text_encoded = model.encode_text(tokens)
        norms = text_encoded.norm(dim=-1, keepdim=True)
        text_encoded /= norms
    return text_encoded.cpu().numpy()
29 | 
30 | 
def find_best_matches(text_features, photo_features):
    """Rank photos by similarity to a text query.

    The similarity of each photo is its dot product with the text features.
    Note that both returned score arrays hold the NEGATED similarities.

    Returns:
        A tuple `(best_photo_idx, similarities, unsorted_sims)`:
        photo indices ordered from most to least similar, the negated
        similarities sorted ascending, and the negated similarities in the
        original photo order.
    """
    negated = -(photo_features @ text_features.T).squeeze(1)
    # Ascending order of the negated scores is descending similarity.
    ranking = negated.argsort()
    ordered = np.sort(negated)
    return ranking, ordered, negated
38 | 
parser = argparse.ArgumentParser()
parser.add_argument('--descr_path', type=str, default='../../data/valid_data.json')
parser.add_argument('--imgs_path', type=str, default='/network/scratch/b/benno.krojer/dataset/games')
args = parser.parse_args()
clip_type = 'ViT-B/16'
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f'USING DEVICE: {device}')
model, preprocess = clip.load(clip_type, device=device, jit=False)  # Must set jit=False for training


def _format_accuracy(num_correct, num_total):
    """Return the accuracy rounded to 4 digits as a string, or 'n/a' if nothing was evaluated."""
    if num_total == 0:
        return 'n/a'
    return str(round(num_correct / num_total, 4))


# Running counts: overall, and split by whether the image-set name contains
# 'open-images' (counted as "img") or not (counted as "vid").
correct = 0
total = 0
vid_correct = 0
vid_total = 0
img_correct = 0
img_total = 0

img_dirs = args.imgs_path
with open(args.descr_path, 'r') as descr_file:
    descriptions = json.load(descr_file)
results = defaultdict(dict)
for img_dir, data in tqdm(descriptions.items()):
    if not data:
        continue
    # The candidate images only depend on the directory, so list and encode
    # them once instead of once per description.
    img_files = sorted(
        (Path(img_dirs) / img_dir).glob("*.jpg"),
        # File names are expected to look like "<3-char prefix><number>.jpg";
        # sort numerically on that number.
        key=lambda x: int(x.name.split('.')[0][3:]),
    )
    img_embs = encode_images(img_files)
    for img_idx, text in data.items():
        text_emb = encode_text(text.strip())
        ranked_idx, sim, unsorted_sims = find_best_matches(text_emb, img_embs)
        predicted = img_files[ranked_idx[0]].stem
        target = img_files[int(img_idx)].stem
        is_correct = predicted == target
        total += 1
        results[img_dir].update({
            f'raw_preds_{img_idx}': unsorted_sims.tolist(),
            f'clip_pred_{img_idx}': int(ranked_idx[0]),
            f'correct_{img_idx}': 1 if is_correct else 0,
        })
        if is_correct:
            correct += 1
        if 'open-images' in img_dir:
            img_total += 1
            if is_correct:
                img_correct += 1
        else:
            vid_total += 1
            if is_correct:
                vid_correct += 1


# A split without examples must not abort the run before the results are saved.
print('OVERALL ACC: ' + _format_accuracy(correct, total))
print('VIDEO ACC: ' + _format_accuracy(vid_correct, vid_total))
print('IMG ACC: ' + _format_accuracy(img_correct, img_total))
os.makedirs('results', exist_ok=True)
with open('results/zero_clip_test.json', 'w') as results_file:
    json.dump(results, results_file, indent=2)
85 | 


--------------------------------------------------------------------------------
/baselines/crossencoders/analyze_results.py:
--------------------------------------------------------------------------------
 1 | from torch.nn.functional import softmax
 2 | import torch
 3 | import json
 4 | import sys
 5 | 
 6 | from collections import defaultdict
 7 | counter2key = json.load(open('counter2key_test.json', 'r'))
 8 | results = json.load(open(sys.argv[1], 'r'))
 9 | groups = defaultdict(list)
10 | 
11 | final = dict()
12 | 
13 | for pred in results:
14 |     q_id = str(pred['question_id'])
15 |     alignment = pred['prediction_aligned']
16 |     not_alignment = pred['prediction_notaligned']
17 |     p1, p2 = softmax(torch.Tensor([alignment, not_alignment]))
18 |     p1 = p1.item()
19 |     folder = q_id[:-2]
20 |     groups[folder].append([q_id, p1])
21 | 
22 | total = 0
23 | total_corr = 0
24 | for _, preds in groups.items():
25 |     preds = sorted(preds, key = lambda x: x[1], reverse=True)
26 |     rank = -1
27 |     for i, p in enumerate(preds):
28 |         key = p[0]
29 |         if key[-1] == key [-2]:
30 |             rank = i+1
31 |     if rank == 1:
32 |         total_corr += 1
33 |     total += 1
34 | 
35 | print(total)
36 | print(total_corr)
37 | print(total_corr / total)
38 | 


--------------------------------------------------------------------------------
/baselines/crossencoders/ctrl_uniter_base.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "attention_probs_dropout_prob": 0.1,
 3 |   "hidden_act": "gelu",
 4 |   "hidden_dropout_prob": 0.1,
 5 |   "hidden_size": 768,
 6 |   "initializer_range": 0.02,
 7 |   "intermediate_size": 3072,
 8 |   "max_position_embeddings": 512,
 9 |   "num_attention_heads": 12,
10 |   "pooler_size": 1024,
11 |   "type_vocab_size": 2,
12 |   "vocab_size": 30522,
13 |   "bert_model": "bert-base-uncased",
14 |   "do_lower_case": true,
15 |   "num_locs": 5,
16 |   "add_global_imgfeat": "first",
17 |   "image_embeddings": "uniter",
18 |   "v_attention_probs_dropout_prob": 0.1,
19 |   "v_hidden_act": "gelu",
20 |   "v_hidden_dropout_prob": 0.1,
21 |   "v_feature_size": 2048,
22 |   "visual_target_weights": {"0": 1.0},
23 |   "v_hidden_size": 768,
24 |   "v_initializer_range": 0.02,
25 |   "v_pooler_size": 1024,
26 |   "v_num_attention_heads": 12,
27 |   "v_intermediate_size": 3072,
28 |   "fusion_method": "mul",
29 |   "clf_hidden_size": 1536,
30 |   "tt_attn_sublayers": [0,2,4,6,8,10,12,14,16,18,20,22],
31 |   "tv_attn_sublayers": [0,2,4,6,8,10,12,14,16,18,20,22],
32 |   "vt_attn_sublayers": [0,2,4,6,8,10,12,14,16,18,20,22],
33 |   "vv_attn_sublayers": [0,2,4,6,8,10,12,14,16,18,20,22],
34 |   "t_ff_sublayers": [1,3,5,7,9,11,13,15,17,19,21,23],
35 |   "v_ff_sublayers": [1,3,5,7,9,11,13,15,17,19,21,23],
36 |   "shared_sublayers": [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23],
37 |   "single_ln_sublayers": [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23],
38 |   "sublayer2attn_hidden_size": {},
39 |   "sublayer2num_attention_heads": {},
40 |   "sublayer2intermediate_size": {},
41 |   "sublayer2v_attn_hidden_size": {},
42 |   "sublayer2v_num_attention_heads": {},
43 |   "sublayer2v_intermediate_size": {},
44 |   "bert_layer2attn_sublayer": {
45 |     "0": 0, "1": 2, "2": 4, "3": 6, "4": 8, "5": 10,
46 |     "6": 12, "7": 14, "8": 16, "9": 18, "10": 20, "11": 22
47 |   },
48 |   "bert_layer2ff_sublayer": {
49 |     "0": 1, "1": 3, "2": 5, "3": 7, "4": 9, "5": 11,
50 |     "6": 13, "7": 15, "8": 17, "9": 19, "10": 21, "11": 23
51 |   }
52 | }
53 | 


--------------------------------------------------------------------------------
/baselines/crossencoders/task_config/contextual.yml:
--------------------------------------------------------------------------------
 1 | TASK3:
 2 |   name: VQA
 3 |   type: VL-classifier
 4 |   num_labels: 2
 5 |   loss: BCEWithLogitLoss
 6 |   process: normal
 7 |   task_id: 3
 8 |   dataroot: ../../data/vilbert_data_format
 9 |   features_h5path1: ../../data/rcnn-features36-36.lmdb
10 |   features_h5path2: ''
11 |   train_annotations_jsonpath: ''
12 |   val_annotations_jsonpath: ''
13 |   max_seq_length: 120
14 |   max_region_num: 36
15 |   #  batch_size: 256
16 |   #  eval_batch_size: 256
17 |   train_batch_size: 15
18 |   val_batch_size: 4
19 |   train_split: train
20 |   val_split: val
21 | 


--------------------------------------------------------------------------------
/baselines/crossencoders/task_config/contra.yml:
--------------------------------------------------------------------------------
 1 | TASK3:
 2 |   name: VQA
 3 |   type: VL-classifier
 4 |   num_labels: 2
 5 |   loss: BCEWithLogitLoss
 6 |   process: normal
 7 |   task_id: 3
 8 |   dataroot: ../../data/vilbert_data_format
 9 |   #  features_h5path1: ../../data/rcnn-features36-36.lmdb
10 |   features_h5path1: /network/scratch/b/benno.krojer/dataset/rcnn-features36-36.lmdb
11 |   features_h5path2: ''
12 |   train_annotations_jsonpath: ''
13 |   val_annotations_jsonpath: ''
14 |   max_seq_length: 120
15 |   max_region_num: 36
16 |   #  batch_size: 256
17 |   #  eval_batch_size: 256
18 |   train_batch_size: 15
19 |   val_batch_size: 8
20 |   train_split: train
21 |   val_split: val
22 | 


--------------------------------------------------------------------------------
/baselines/crossencoders/task_config/nocontra.yml:
--------------------------------------------------------------------------------
 1 | TASK3:
 2 |   name: VQA
 3 |   type: VL-classifier
 4 |   num_labels: 2
 5 |   loss: BCEWithLogitLoss
 6 |   process: normal
 7 |   task_id: 3
 8 |   dataroot: ../../data/vilbert_data_format
 9 |   #  features_h5path1: ../../data/rcnn-features36-36.lmdb
10 |   features_h5path1: /network/scratch/b/benno.krojer/dataset/rcnn-features36-36.lmdb
11 |   features_h5path2: ''
12 |   train_annotations_jsonpath: ''
13 |   val_annotations_jsonpath: ''
14 |   max_seq_length: 120
15 |   max_region_num: 36
16 |   #  batch_size: 256
17 |   #  eval_batch_size: 256
18 |   train_batch_size: 128
19 |   val_batch_size: 32
20 |   train_split: train
21 |   val_split: val
22 | 


--------------------------------------------------------------------------------
/baselines/crossencoders/task_config/zero_shot.yml:
--------------------------------------------------------------------------------
 1 | TASK3:
 2 |   name: VQA
 3 |   type: VL-classifier
 4 |   num_labels: 2
 5 |   loss: BCEWithLogitLoss
 6 |   process: normal
 7 |   task_id: 3
 8 |   dataroot: ../../data/vilbert_data_format
 9 |   #  features_h5path1: ../../data/rcnn-features36-36.lmdb
10 |   features_h5path1: /network/scratch/b/benno.krojer/dataset/rcnn-features36-36.lmdb
11 |   features_h5path2: ''
12 |   train_annotations_jsonpath: ''
13 |   val_annotations_jsonpath: ''
14 |   max_seq_length: 120
15 |   max_region_num: 36
16 |   batch_size: 32
17 |   eval_batch_size: 256
18 |   train_split: ''
19 |   val_split: test
20 | 


--------------------------------------------------------------------------------
/baselines/crossencoders/vilbert_base.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "attention_probs_dropout_prob": 0.1,
 3 |   "hidden_act": "gelu",
 4 |   "hidden_dropout_prob": 0.1,
 5 |   "hidden_size": 768,
 6 |   "initializer_range": 0.02,
 7 |   "intermediate_size": 3072,
 8 |   "max_position_embeddings": 512,
 9 |   "num_attention_heads": 12,
10 |   "pooler_size": 1024,
11 |   "type_vocab_size": 2,
12 |   "vocab_size": 30522,
13 |   "bert_model": "bert-base-uncased",
14 |   "do_lower_case": true,
15 |   "num_locs": 5,
16 |   "add_global_imgfeat": "first",
17 |   "image_embeddings": "vilbert",
18 |   "v_attention_probs_dropout_prob": 0.1,
19 |   "v_hidden_act": "gelu",
20 |   "v_hidden_dropout_prob": 0.1,
21 |   "v_feature_size": 2048,
22 |   "visual_target_weights": {"0": 1.0},
23 |   "v_hidden_size": 1024,
24 |   "v_initializer_range": 0.02,
25 |   "v_pooler_size": 1024,
26 |   "v_num_attention_heads": 8,
27 |   "v_intermediate_size": 1024,
28 |   "fusion_method": "mul",
29 |   "clf_hidden_size": 1536,
30 |   "tt_attn_sublayers": [0,2,4,6,8,10,14,18,22,26,30,34],
31 |   "tv_attn_sublayers": [12,16,20,24,28,32],
32 |   "vt_attn_sublayers": [12,16,20,24,28,32],
33 |   "vv_attn_sublayers": [14,18,22,26,30,34],
34 |   "t_ff_sublayers": [1,3,5,7,9,11,13,15,17,19,21,23,25,27,29,31,33,35],
35 |   "v_ff_sublayers": [13,15,17,19,21,23,25,27,29,31,33,35],
36 |   "shared_sublayers": [],
37 |   "single_ln_sublayers": [],
38 |   "sublayer2attn_hidden_size": {"12":1024,"16":1024,"20":1024,"24":1024,"28":1024,"32":1024},
39 |   "sublayer2num_attention_heads": {"12":8,"16":8,"20":8,"24":8,"28":8,"32":8},
40 |   "sublayer2intermediate_size": {},
41 |   "sublayer2v_attn_hidden_size": {},
42 |   "sublayer2v_num_attention_heads": {},
43 |   "sublayer2v_intermediate_size": {},
44 |   "bert_layer2attn_sublayer": {
45 |     "0": 0, "1": 2, "2": 4, "3": 6, "4": 8, "5": 10,
46 |     "6": 14, "7": 18, "8": 22, "9": 26, "10": 30, "11": 34
47 |   },
48 |   "bert_layer2ff_sublayer": {
49 |     "0": 1, "1": 3, "2": 5, "3": 7, "4": 9, "5": 11,
50 |     "6": 15, "7": 19, "8": 23, "9": 27, "10": 31, "11": 35
51 |   }
52 | }
53 | 


--------------------------------------------------------------------------------
/baselines/crossencoders/volta/__init__.py:
--------------------------------------------------------------------------------
1 | # This source code is licensed under the MIT license found in the
2 | # LICENSE file in the root directory of this source tree.
3 | 


--------------------------------------------------------------------------------
/baselines/crossencoders/volta/config.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2020, Emanuele Bugliarello (@e-bug).
  2 | 
  3 | # This source code is licensed under the MIT license found in the
  4 | # LICENSE file in the root directory of this source tree.
  5 | 
  6 | import json
  7 | import copy
  8 | from io import open
  9 | 
 10 | 
 11 | class BertConfig(object):
 12 |     """Configuration class to store the configuration of a `BertModel`.
 13 |     """
 14 | 
 15 |     def __init__(
 16 |         self,
 17 |         vocab_size_or_config_json_file,
 18 |         hidden_size=768,
 19 |         num_attention_heads=12,
 20 |         intermediate_size=3072,
 21 |         pooler_size=768,
 22 |         hidden_act="gelu",
 23 |         hidden_dropout_prob=0.1,
 24 |         attention_probs_dropout_prob=0.1,
 25 |         max_position_embeddings=512,
 26 |         type_vocab_size=2,
 27 |         do_lower_case=True,
 28 |         num_locs=5,
 29 |         v_coordinate_embeddings_dim=None,
 30 |         add_global_imgfeat=None,
 31 |         image_embeddings="vilbert",
 32 |         initializer_range=0.02,
 33 |         v_feature_size=2048,
 34 |         v_hidden_size=768,
 35 |         v_num_attention_heads=12,
 36 |         v_intermediate_size=3072,
 37 |         v_pooler_size=1024,
 38 |         v_attention_probs_dropout_prob=0.1,
 39 |         v_hidden_act="gelu",
 40 |         v_hidden_dropout_prob=0.1,
 41 |         v_initializer_range=0.2,
 42 |         visual_target_weights={"0": 1},
 43 |         qa_task_weight=0.0,
 44 |         qa_num_answers=0,
 45 |         fixed_layers=[],
 46 |         fusion_method="mul",
 47 |         objective=0,
 48 |         clf_hidden_size=1536,
 49 |         image_head_ln=True,
 50 |         bert_model="bert-base-uncased",
 51 |         visualization=False,
 52 |         tt_attn_sublayers=[],
 53 |         tv_attn_sublayers=[],
 54 |         vt_attn_sublayers=[],
 55 |         vv_attn_sublayers=[],
 56 |         t_ff_sublayers=[],
 57 |         v_ff_sublayers=[],
 58 |         shared_sublayers=[],
 59 |         single_ln_sublayers=[],
 60 |         sublayer2attn_hidden_size={},
 61 |         sublayer2num_attention_heads={},
 62 |         sublayer2intermediate_size={},
 63 |         sublayer2v_attn_hidden_size={},
 64 |         sublayer2v_num_attention_heads={},
 65 |         sublayer2v_intermediate_size={},
 66 |         bert_layer2attn_sublayer={},
 67 |         bert_layer2ff_sublayer={},
 68 |     ):
 69 |         """Constructs BertConfig.
 70 | 
 71 |         Args:
 72 |             vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`.
 73 |             hidden_size: Size of the encoder layers and the pooler layer.
 74 |             num_hidden_layers: Number of hidden layers in the Transformer encoder.
 75 |             num_attention_heads: Number of attention heads for each attention layer in
 76 |                 the Transformer encoder.
 77 |             intermediate_size: The size of the "intermediate" (i.e., feed-forward)
 78 |                 layer in the Transformer encoder.
 79 |             hidden_act: The non-linear activation function (function or string) in the
 80 |                 encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
 81 |             hidden_dropout_prob: The dropout probabilitiy for all fully connected
 82 |                 layers in the embeddings, encoder, and pooler.
 83 |             attention_probs_dropout_prob: The dropout ratio for the attention
 84 |                 probabilities.
 85 |             max_position_embeddings: The maximum sequence length that this model might
 86 |                 ever be used with. Typically set this to something large just in case
 87 |                 (e.g., 512 or 1024 or 2048).
 88 |             type_vocab_size: The vocabulary size of the `token_type_ids` passed into
 89 |                 `BertModel`.
 90 |             initializer_range: The sttdev of the truncated_normal_initializer for
 91 |                 initializing all weight matrices.
 92 |         """
 93 |         if isinstance(vocab_size_or_config_json_file, str):
 94 |             with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
 95 |                 json_config = json.loads(reader.read())
 96 |             for key, value in json_config.items():
 97 |                 self.__dict__[key] = value
 98 |         elif isinstance(vocab_size_or_config_json_file, int):
 99 |             # Text
100 |             self.vocab_size = vocab_size_or_config_json_file
101 |             self.hidden_size = hidden_size
102 |             self.num_attention_heads = num_attention_heads
103 |             self.hidden_act = hidden_act
104 |             self.intermediate_size = intermediate_size
105 |             self.hidden_dropout_prob = hidden_dropout_prob
106 |             self.attention_probs_dropout_prob = attention_probs_dropout_prob
107 |             self.max_position_embeddings = max_position_embeddings
108 |             self.type_vocab_size = type_vocab_size
109 |             self.do_lower_case = do_lower_case
110 |             self.initializer_range = initializer_range
111 |             self.pooler_size = pooler_size
112 |             # Vision
113 |             self.num_locs = num_locs
114 |             self.v_coordinate_embeddings_dim = v_coordinate_embeddings_dim
115 |             self.add_global_imgfeat = add_global_imgfeat
116 |             self.image_embeddings = image_embeddings
117 |             self.v_feature_size = v_feature_size
118 |             self.v_hidden_size = v_hidden_size
119 |             self.v_num_attention_heads = v_num_attention_heads
120 |             self.v_intermediate_size = v_intermediate_size
121 |             self.v_attention_probs_dropout_prob = v_attention_probs_dropout_prob
122 |             self.v_hidden_act = v_hidden_act
123 |             self.v_hidden_dropout_prob = v_hidden_dropout_prob
124 |             self.v_initializer_range = v_initializer_range
125 |             self.v_pooler_size = v_pooler_size
126 |             self.qa_task_weight = qa_task_weight
127 |             self.qa_num_answers = qa_num_answers
128 |             # Text-Vision
129 |             self.tt_attn_sublayers = tt_attn_sublayers
130 |             self.tv_attn_sublayers = tv_attn_sublayers
131 |             self.vt_attn_sublayers = vt_attn_sublayers
132 |             self.vv_attn_sublayers = vv_attn_sublayers
133 |             self.t_ff_sublayers = t_ff_sublayers
134 |             self.v_ff_sublayers = v_ff_sublayers
135 |             self.shared_sublayers = shared_sublayers
136 |             self.single_ln_sublayers = single_ln_sublayers
137 |             self.sublayer2attn_hidden_size = sublayer2attn_hidden_size
138 |             self.sublayer2num_attention_heads = sublayer2num_attention_heads
139 |             self.sublayer2intermediate_size = sublayer2intermediate_size
140 |             self.sublayer2v_attn_hidden_size = sublayer2v_attn_hidden_size
141 |             self.sublayer2v_num_attention_heads = sublayer2v_num_attention_heads
142 |             self.sublayer2v_intermediate_size = sublayer2v_intermediate_size
143 |             self.bert_layer2attn_sublayer = bert_layer2attn_sublayer
144 |             self.bert_layer2ff_sublayer = bert_layer2ff_sublayer
145 |             self.image_head_ln = image_head_ln
146 |             # Else
147 |             self.visual_target_weights = visual_target_weights
148 |             self.fixed_layers = fixed_layers
149 |             self.bert_model = bert_model
150 |             # Pre-training
151 |             self.fusion_method = fusion_method
152 |             self.objective = objective
153 |             # Fine-tuning
154 |             self.clf_hidden_size = clf_hidden_size
155 |             self.visualization = visualization
156 |         else:
157 |             raise ValueError(
158 |                 "First argument must be either a vocabulary size (int)"
159 |                 "or the path to a pretrained model config file (str)"
160 |             )
161 | 
162 |     @classmethod
163 |     def from_dict(cls, json_object):
164 |         """Constructs a `BertConfig` from a Python dictionary of parameters."""
165 |         config = BertConfig(vocab_size_or_config_json_file=-1)
166 |         for key, value in json_object.items():
167 |             config.__dict__[key] = value
168 |         return config
169 | 
170 |     @classmethod
171 |     def from_json_file(cls, json_file):
172 |         """Constructs a `BertConfig` from a json file of parameters."""
173 |         with open(json_file, "r", encoding="utf-8") as reader:
174 |             text = reader.read()
175 |         return cls.from_dict(json.loads(text))
176 | 
177 |     def __repr__(self):
178 |         return str(self.to_json_string())
179 | 
180 |     def to_dict(self):
181 |         """Serializes this instance to a Python dictionary."""
182 |         output = copy.deepcopy(self.__dict__)
183 |         return output
184 | 
185 |     def to_json_string(self):
186 |         """Serializes this instance to a JSON string."""
187 |         return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
188 | 


--------------------------------------------------------------------------------
/baselines/crossencoders/volta/datasets/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright (c) Facebook, Inc. and its affiliates.
 2 | # Copyright (c) 2020, Emanuele Bugliarello (@e-bug).
 3 | 
 4 | # This source code is licensed under the MIT license found in the
 5 | # LICENSE file in the root directory of this source tree.
 6 | 
 7 | from .concept_cap_dataset import ConceptCapLoaderTrain, ConceptCapLoaderVal
 8 | from .vqa_dataset import VQAClassificationDataset, ContrastiveVQAClassificationDataset
 9 | from .nlvr2_dataset import NLVR2Dataset
10 | from .refer_expression_dataset import ReferExpressionDataset
11 | from .retrieval_dataset import RetrievalDataset, RetrievalDatasetVal
12 | from .vcr_dataset import VCRDataset
13 | from .visual_entailment_dataset import VisualEntailmentDataset
14 | from .refer_dense_caption import ReferDenseCpationDataset
15 | from .visual_genome_dataset import GenomeQAClassificationDataset
16 | from .gqa_dataset import GQAClassificationDataset
17 | from .guesswhat_dataset import GuessWhatDataset
18 | from .visual7w_pointing_dataset import Visual7wPointingDataset
19 | from .guesswhat_pointing_dataset import GuessWhatPointingDataset
20 | from .flickr_grounding_dataset import FlickrGroundingDataset
21 | 
22 | 
# Names exported by `from <package> import *`. Every entry must be an attribute
# of this module: an empty-string entry makes the star-import raise
# AttributeError, so none is listed.
__all__ = [
    "VQAClassificationDataset",
    "ContrastiveVQAClassificationDataset",
    "GenomeQAClassificationDataset",
    "ConceptCapLoaderTrain",
    "ConceptCapLoaderVal",
    "NLVR2Dataset",
    "ReferExpressionDataset",
    "RetrievalDataset",
    "RetrievalDatasetVal",
    "VCRDataset",
    "VisualEntailmentDataset",
    "GQAClassificationDataset",
    "GuessWhatDataset",
    "Visual7wPointingDataset",
    "GuessWhatPointingDataset",
    "FlickrGroundingDataset",
]
42 | 
# Task-name string -> dataset class. Judging by the name, this is the table
# consulted when building training splits (confirm against the callers).
# Several task names intentionally share one class (e.g. both VCR tasks, and
# the three refcoco variants).
DatasetMapTrain = {
    "VQA": VQAClassificationDataset,
    "VQAcontra": ContrastiveVQAClassificationDataset,
    "GenomeQA": GenomeQAClassificationDataset,
    "VCR_Q-A": VCRDataset,
    "VCR_QA-R": VCRDataset,
    "RetrievalCOCO": RetrievalDataset,
    "RetrievalFlickr30k": RetrievalDataset,
    "refcoco": ReferExpressionDataset,
    "refcoco+": ReferExpressionDataset,
    "refcocog": ReferExpressionDataset,
    "NLVR2": NLVR2Dataset,
    "VisualEntailment": VisualEntailmentDataset,
    "GQA": GQAClassificationDataset,
    "GuessWhat": GuessWhatDataset,
    "Visual7w": Visual7wPointingDataset,
    "GuessWhatPointing": GuessWhatPointingDataset,
    "FlickrGrounding": FlickrGroundingDataset,
}
62 | 
# Task-name string -> dataset class. Judging by the name, this is the table
# consulted when building evaluation splits (confirm against the callers).
# The two retrieval tasks map to RetrievalDatasetVal here rather than
# RetrievalDataset.
# NOTE(review): there is no "VQAcontra" entry here although DatasetMapTrain
# has one -- confirm that omission is intentional.
DatasetMapEval = {
    "VQA": VQAClassificationDataset,
    "GenomeQA": GenomeQAClassificationDataset,
    "VCR_Q-A": VCRDataset,
    "VCR_QA-R": VCRDataset,
    "RetrievalCOCO": RetrievalDatasetVal,
    "RetrievalFlickr30k": RetrievalDatasetVal,
    "refcoco": ReferExpressionDataset,
    "refcoco+": ReferExpressionDataset,
    "refcocog": ReferExpressionDataset,
    "NLVR2": NLVR2Dataset,
    "VisualEntailment": VisualEntailmentDataset,
    "GQA": GQAClassificationDataset,
    "GuessWhat": GuessWhatDataset,
    "Visual7w": Visual7wPointingDataset,
    "GuessWhatPointing": GuessWhatPointingDataset,
    "FlickrGrounding": FlickrGroundingDataset,
}
81 | 


--------------------------------------------------------------------------------
/baselines/crossencoders/volta/datasets/_image_features_reader.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates.
  2 | # Copyright (c) 2020, Emanuele Bugliarello (@e-bug).
  3 | 
  4 | # This source code is licensed under the MIT license found in the
  5 | # LICENSE file in the root directory of this source tree.
  6 | 
  7 | import copy
  8 | import lmdb  # install lmdb by "pip install lmdb"
  9 | import base64
 10 | import pickle
 11 | from typing import List
 12 | 
 13 | import numpy as np
 14 | 
 15 | 
 16 | class ImageFeaturesH5Reader(object):
 17 |     """
 18 |     A reader for H5 files containing pre-extracted image features. A typical
 19 |     H5 file is expected to have a column named "image_id", and another column
 20 |     named "features".
 21 | 
 22 |     Example of an H5 file:
 23 |     ```
 24 |     faster_rcnn_bottomup_features.h5
 25 |        |--- "image_id" [shape: (num_images, )]
 26 |        |--- "features" [shape: (num_images, num_proposals, feature_size)]
 27 |        +--- .attrs ("split", "train")
 28 |     ```
 29 |     # TODO (kd): Add support to read boxes, classes and scores.
 30 | 
 31 |     Parameters
 32 |     ----------
 33 |     features_h5path : str
 34 |         Path to an H5 file containing COCO train / val image features.
 35 |     in_memory : bool
 36 |         Whether to load the whole H5 file in memory. Beware, these files are
 37 |         sometimes tens of GBs in size. Set this to true if you have sufficient
 38 |         RAM - trade-off between speed and memory.
 39 |     """
 40 | 
 41 |     def __init__(self, features_path: str, config, in_memory: bool = False):
 42 |         self.features_path = features_path
 43 |         self._in_memory = in_memory
 44 | 
 45 |         # If not loaded in memory, then list of None.
 46 |         self.env = lmdb.open(
 47 |             self.features_path,
 48 |             max_readers=1,
 49 |             readonly=True,
 50 |             lock=False,
 51 |             readahead=False,
 52 |             meminit=False,
 53 |         )
 54 | 
 55 |         with self.env.begin(write=False) as txn:
 56 |             self._image_ids = pickle.loads(txn.get("keys".encode()))
 57 |         self._image_ids = [x for x in self._image_ids]
 58 | 
 59 |         self.features = [None] * len(self._image_ids)
 60 |         self.num_boxes = [None] * len(self._image_ids)
 61 |         self.boxes = [None] * len(self._image_ids)
 62 |         self.boxes_ori = [None] * len(self._image_ids)
 63 |         self.feature_size = config.v_feature_size
 64 |         self.num_locs = config.num_locs
 65 |         self.add_global_imgfeat = config.add_global_imgfeat
 66 | 
 67 |     def __len__(self):
 68 |         return len(self._image_ids)
 69 | 
 70 |     def __getitem__(self, image_id):
 71 |         image_id = str(image_id).encode()
 72 |         index = self._image_ids.index(image_id)
 73 |         if self._in_memory:
 74 |             # Load features during first epoch, all not loaded together as it has a slow start.
 75 |             if self.features[index] is not None:
 76 |                 features = self.features[index]
 77 |                 num_boxes = self.num_boxes[index]
 78 |                 image_location = self.boxes[index]
 79 |                 image_location_ori = self.boxes_ori[index]
 80 |             else:
 81 |                 with self.env.begin(write=False) as txn:
 82 |                     item = pickle.loads(txn.get(image_id))
 83 |                     image_h = int(item["img_h"])
 84 |                     image_w = int(item["img_w"])
 85 | 
 86 |                     features = np.frombuffer(base64.b64decode(item["features"]), dtype=np.float32).reshape(-1, self.feature_size)
 87 |                     boxes = np.frombuffer(base64.b64decode(item['boxes']), dtype=np.float32).reshape(-1, 4)
 88 | 
 89 |                     image_location = np.zeros((boxes.shape[0], self.num_locs), dtype=np.float32)
 90 |                     image_location[:, :4] = boxes
 91 |                     if self.num_locs == 5:
 92 |                         image_location[:, 4] = (
 93 |                                 (image_location[:, 3] - image_location[:, 1])
 94 |                                 * (image_location[:, 2] - image_location[:, 0])
 95 |                                 / (float(image_w) * float(image_h))
 96 |                         )
 97 | 
 98 |                     image_location_ori = copy.deepcopy(image_location)
 99 |                     image_location[:, 0] = image_location[:, 0] / float(image_w)
100 |                     image_location[:, 1] = image_location[:, 1] / float(image_h)
101 |                     image_location[:, 2] = image_location[:, 2] / float(image_w)
102 |                     image_location[:, 3] = image_location[:, 3] / float(image_h)
103 | 
104 |                     num_boxes = features.shape[0]
105 |                     if self.add_global_imgfeat == "first":
106 |                         g_feat = np.sum(features, axis=0) / num_boxes
107 |                         num_boxes = num_boxes + 1
108 |                         features = np.concatenate([np.expand_dims(g_feat, axis=0), features], axis=0)
109 | 
110 |                         g_location = [0, 0, 1, 1] + [1] * (self.num_locs - 4)
111 |                         image_location = np.concatenate([np.expand_dims(g_location, axis=0), image_location], axis=0)
112 | 
113 |                         g_location_ori = np.array([0, 0, image_w, image_h] + [image_w * image_h] * (self.num_locs - 4))
114 |                         image_location_ori = np.concatenate(
115 |                             [np.expand_dims(g_location_ori, axis=0), image_location_ori], axis=0
116 |                         )
117 | 
118 |                     elif self.add_global_imgfeat == "last":
119 |                         g_feat = np.sum(features, axis=0) / num_boxes
120 |                         num_boxes = num_boxes + 1
121 |                         features = np.concatenate([features, np.expand_dims(g_feat, axis=0)], axis=0)
122 | 
123 |                         g_location = [0, 0, 1, 1] + [1] * (self.num_locs - 4)
124 |                         image_location = np.concatenate([image_location, np.expand_dims(g_location, axis=0)], axis=0)
125 | 
126 |                         g_location_ori = np.array([0, 0, image_w, image_h] + [image_w * image_h] * (self.num_locs - 4))
127 |                         image_location_ori = np.concatenate(
128 |                             [image_location_ori, np.expand_dims(g_location_ori, axis=0)], axis=0
129 |                         )
130 | 
131 |                     self.features[index] = features
132 |                     self.boxes[index] = image_location
133 |                     self.boxes_ori[index] = image_location_ori
134 |                     self.num_boxes[index] = num_boxes
135 |         else:
136 |             # Read chunk from file everytime if not loaded in memory.
137 |             with self.env.begin(write=False) as txn:
138 |                 item = pickle.loads(txn.get(image_id))
139 |                 image_h = int(item["img_h"])
140 |                 image_w = int(item["img_w"])
141 | 
142 |                 features = np.frombuffer(base64.b64decode(item["features"]), dtype=np.float32).reshape(-1, self.feature_size)
143 |                 boxes = np.frombuffer(base64.b64decode(item['boxes']), dtype=np.float32).reshape(-1, 4)
144 | 
145 |                 image_location = np.zeros((boxes.shape[0], self.num_locs), dtype=np.float32)
146 |                 image_location[:, :4] = boxes
147 |                 if self.num_locs == 5:
148 |                     image_location[:, 4] = (
149 |                             (image_location[:, 3] - image_location[:, 1])
150 |                             * (image_location[:, 2] - image_location[:, 0])
151 |                             / (float(image_w) * float(image_h))
152 |                     )
153 | 
154 |                 image_location_ori = copy.deepcopy(image_location)
155 |                 image_location[:, 0] = image_location[:, 0] / float(image_w)
156 |                 image_location[:, 1] = image_location[:, 1] / float(image_h)
157 |                 image_location[:, 2] = image_location[:, 2] / float(image_w)
158 |                 image_location[:, 3] = image_location[:, 3] / float(image_h)
159 | 
160 |                 num_boxes = features.shape[0]
161 |                 if self.add_global_imgfeat == "first":
162 |                     g_feat = np.sum(features, axis=0) / num_boxes
163 |                     num_boxes = num_boxes + 1
164 |                     features = np.concatenate([np.expand_dims(g_feat, axis=0), features], axis=0)
165 | 
166 |                     g_location = [0, 0, 1, 1] + [1] * (self.num_locs - 4)
167 |                     image_location = np.concatenate([np.expand_dims(g_location, axis=0), image_location], axis=0)
168 | 
169 |                     g_location_ori = np.array([0, 0, image_w, image_h] + [image_w * image_h] * (self.num_locs - 4))
170 |                     image_location_ori = np.concatenate(
171 |                         [np.expand_dims(g_location_ori, axis=0), image_location_ori], axis=0
172 |                     )
173 | 
174 |                 elif self.add_global_imgfeat == "last":
175 |                     g_feat = np.sum(features, axis=0) / num_boxes
176 |                     num_boxes = num_boxes + 1
177 |                     features = np.concatenate([features, np.expand_dims(g_feat, axis=0)], axis=0)
178 | 
179 |                     g_location = [0, 0, 1, 1] + [1] * (self.num_locs - 4)
180 |                     image_location = np.concatenate([image_location, np.expand_dims(g_location, axis=0)], axis=0)
181 | 
182 |                     g_location_ori = np.array([0, 0, image_w, image_h] + [image_w * image_h] * (self.num_locs - 4))
183 |                     image_location_ori = np.concatenate(
184 |                         [image_location_ori, np.expand_dims(g_location_ori, axis=0)], axis=0
185 |                     )
186 | 
187 |         return features, num_boxes, image_location, image_location_ori
188 | 
189 |     def keys(self) -> List[int]:
190 |         return self._image_ids
191 | 


--------------------------------------------------------------------------------
/baselines/crossencoders/volta/datasets/gqa_dataset.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates.
  2 | # Copyright (c) 2020, Emanuele Bugliarello (@e-bug).
  3 | 
  4 | # This source code is licensed under the MIT license found in the
  5 | # LICENSE file in the root directory of this source tree.
  6 | 
  7 | import os
  8 | import json
  9 | import logging
 10 | import _pickle as cPickle
 11 | 
 12 | import numpy as np
 13 | 
 14 | import torch
 15 | from torch.utils.data import Dataset
 16 | 
 17 | from transformers import AutoTokenizer
 18 | from ._image_features_reader import ImageFeaturesH5Reader
 19 | 
 20 | 
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Sets HDF5_USE_FILE_LOCKING to "FALSE" for the whole process at import time.
# NOTE(review): presumably a leftover from an HDF5-based feature reader; the
# reader imported above goes through LMDB -- confirm this is still needed.
os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"
 23 | 
 24 | 
 25 | def assert_eq(real, expected):
 26 |     assert real == expected, "%s (true) vs %s (expected)" % (real, expected)
 27 | 
 28 | 
 29 | def _create_entry(item):
 30 |     entry = {
 31 |         "question_id": int(item["question_id"]),
 32 |         "image_id": item["image_id"],
 33 |         "question": item["question"],
 34 |         "answer": item,
 35 |     }
 36 |     return entry
 37 | 
 38 | 
 39 | def _load_dataset(dataroot, name):
 40 |     """Load entries
 41 | 
 42 |     dataroot: root path of dataset
 43 |     name: 'train', 'val', 'trainval', 'test'
 44 |     """
 45 |     if name == "train" or name == "val":
 46 |         items_path = os.path.join(dataroot, "%s_target.pkl" % name)
 47 |         items = cPickle.load(open(items_path, "rb"))
 48 |         items = sorted(items, key=lambda x: x["question_id"])
 49 |     elif name == "trainval":
 50 |         items_path = os.path.join(dataroot, "%s_target.pkl" % name)
 51 |         items = cPickle.load(open(items_path, "rb"))
 52 |         items = sorted(items, key=lambda x: x["question_id"])
 53 |         items = items[:-3000]
 54 |     elif name == "minval":
 55 |         items_path = os.path.join(dataroot, "trainval_target.pkl")
 56 |         items = cPickle.load(open(items_path, "rb"))
 57 |         items = sorted(items, key=lambda x: x["question_id"])
 58 |         items = items[-3000:]
 59 |     elif name == "test":
 60 |         items_path = os.path.join(dataroot, "testdev_balanced_questions.json")
 61 |         items = json.load(open(items_path, "rb"))
 62 |     else:
 63 |         assert False, "data split is not recognized."
 64 | 
 65 |     if "test" in name:
 66 |         entries = []
 67 |         for item in items:
 68 |             it = items[item]
 69 |             entry = {
 70 |                 "question_id": int(item),
 71 |                 "image_id": it["imageId"],
 72 |                 "question": it["question"],
 73 |             }
 74 |             entries.append(entry)
 75 |     else:
 76 |         entries = []
 77 |         for item in items:
 78 |             entries.append(_create_entry(item))
 79 |     return entries
 80 | 
 81 | 
 82 | class GQAClassificationDataset(Dataset):
 83 |     def __init__(
 84 |         self,
 85 |         task: str,
 86 |         dataroot: str,
 87 |         annotations_jsonpath: str,
 88 |         split: str,
 89 |         image_features_reader: ImageFeaturesH5Reader,
 90 |         gt_image_features_reader: ImageFeaturesH5Reader,
 91 |         tokenizer: AutoTokenizer,
 92 |         bert_model,
 93 |         padding_index: int = 0,
 94 |         max_seq_length: int = 16,
 95 |         max_region_num: int = 37,
 96 |         num_locs=5,
 97 |         add_global_imgfeat=None,
 98 |         append_mask_sep=False,
 99 |     ):
100 |         super().__init__()
101 |         self.split = split
102 |         ans2label_path = os.path.join(dataroot, "trainval_ans2label.pkl")
103 |         label2ans_path = os.path.join(dataroot, "trainval_label2ans.pkl")
104 |         self.ans2label = cPickle.load(open(ans2label_path, "rb"))
105 |         self.label2ans = cPickle.load(open(label2ans_path, "rb"))
106 |         self.num_labels = len(self.ans2label)
107 |         self._max_region_num = max_region_num + int(add_global_imgfeat is not None)
108 |         self._max_seq_length = max_seq_length
109 |         self._image_features_reader = image_features_reader
110 |         self._tokenizer = tokenizer
111 |         self._padding_index = padding_index
112 |         self._num_locs = num_locs
113 |         self._add_global_imgfeat = add_global_imgfeat
114 | 
115 |         if "roberta" in bert_model:
116 |             cache_path = os.path.join(
117 |                 dataroot,
118 |                 "cache",
119 |                 task
120 |                 + "_"
121 |                 + split
122 |                 + "_"
123 |                 + "roberta"
124 |                 + "_"
125 |                 + str(max_seq_length)
126 |                 + ".pkl",
127 |             )
128 |         else:
129 |             cache_path = os.path.join(
130 |                 dataroot,
131 |                 "cache",
132 |                 task
133 |                 + "_"
134 |                 + split
135 |                 + "_"
136 |                 + str(max_seq_length)
137 |                 + ".pkl",
138 |             )
139 | 
140 |         if not os.path.exists(cache_path):
141 |             self.entries = _load_dataset(dataroot, split)
142 |             self.tokenize(max_seq_length)
143 |             self.tensorize()
144 |             cPickle.dump(self.entries, open(cache_path, "wb"))
145 |         else:
146 |             logger.info("Loading from %s" % cache_path)
147 |             self.entries = cPickle.load(open(cache_path, "rb"))
148 | 
149 |         self.qid2imgid = {e["question_id"]: e["image_id"] for e in self.entries}
150 | 
151 |     def tokenize(self, max_length=16):
152 |         """Tokenizes the questions.
153 | 
154 |         This will add q_token in each entry of the dataset.
155 |         -1 represent nil, and should be treated as padding_index in embedding
156 |         """
157 |         for entry in self.entries:
158 |             tokens = self._tokenizer.encode(entry["question"])
159 |             tokens = [tokens[0]] + tokens[1:-1][: self._max_seq_length - 2] + [tokens[-1]]
160 | 
161 |             segment_ids = [0] * len(tokens)
162 |             input_mask = [1] * len(tokens)
163 | 
164 |             if len(tokens) < max_length:
165 |                 # Note here we pad in front of the sentence
166 |                 padding = [self._padding_index] * (max_length - len(tokens))
167 |                 tokens = tokens + padding
168 |                 input_mask += padding
169 |                 segment_ids += padding
170 | 
171 |             assert_eq(len(tokens), max_length)
172 |             entry["q_token"] = tokens
173 |             entry["q_input_mask"] = input_mask
174 |             entry["q_segment_ids"] = segment_ids
175 | 
176 |     def tensorize(self):
177 |         for entry in self.entries:
178 |             question = torch.from_numpy(np.array(entry["q_token"]))
179 |             entry["q_token"] = question
180 | 
181 |             q_input_mask = torch.from_numpy(np.array(entry["q_input_mask"]))
182 |             entry["q_input_mask"] = q_input_mask
183 | 
184 |             q_segment_ids = torch.from_numpy(np.array(entry["q_segment_ids"]))
185 |             entry["q_segment_ids"] = q_segment_ids
186 | 
187 |             if "test" not in self.split:
188 |                 answer = entry["answer"]
189 |                 labels = np.array(answer["labels"])
190 |                 scores = np.array(answer["scores"], dtype=np.float32)
191 |                 if len(labels):
192 |                     labels = torch.from_numpy(labels)
193 |                     scores = torch.from_numpy(scores)
194 |                     entry["answer"]["labels"] = labels
195 |                     entry["answer"]["scores"] = scores
196 |                 else:
197 |                     entry["answer"]["labels"] = None
198 |                     entry["answer"]["scores"] = None
199 | 
200 |     def __getitem__(self, index):
201 |         entry = self.entries[index]
202 |         image_id = entry["image_id"]
203 |         question_id = entry["question_id"]
204 |         features, num_boxes, boxes, _ = self._image_features_reader[image_id]
205 | 
206 |         mix_num_boxes = min(int(num_boxes), self._max_region_num)
207 |         mix_boxes_pad = np.zeros((self._max_region_num, self._num_locs))
208 |         mix_features_pad = np.zeros((self._max_region_num, 2048))
209 | 
210 |         image_mask = [1] * (int(mix_num_boxes))
211 |         while len(image_mask) < self._max_region_num:
212 |             image_mask.append(0)
213 | 
214 |         mix_boxes_pad[:mix_num_boxes] = boxes[:mix_num_boxes]
215 |         mix_features_pad[:mix_num_boxes] = features[:mix_num_boxes]
216 | 
217 |         features = torch.tensor(mix_features_pad).float()
218 |         image_mask = torch.tensor(image_mask).long()
219 |         spatials = torch.tensor(mix_boxes_pad).float()
220 | 
221 |         question = entry["q_token"]
222 |         input_mask = entry["q_input_mask"]
223 |         segment_ids = entry["q_segment_ids"]
224 | 
225 |         target = torch.zeros(self.num_labels)
226 | 
227 |         if "test" not in self.split:
228 |             answer = entry["answer"]
229 |             labels = answer["labels"]
230 |             scores = answer["scores"]
231 |             if labels is not None:
232 |                 target.scatter_(0, labels, scores)
233 | 
234 |         return features, spatials, image_mask, question, target, input_mask, segment_ids, question_id
235 | 
236 |     def __len__(self):
237 |         return len(self.entries)
238 | 


--------------------------------------------------------------------------------
/baselines/crossencoders/volta/datasets/guesswhat_dataset.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates.
  2 | 
  3 | # This source code is licensed under the MIT license found in the
  4 | # LICENSE file in the root directory of this source tree.
  5 | 
  6 | import os
  7 | import jsonlines
  8 | import _pickle as cPickle
  9 | import logging
 10 | 
 11 | import numpy as np
 12 | import torch
 13 | from torch.utils.data import Dataset
 14 | from transformers import AutoTokenizer
 15 | 
 16 | from ._image_features_reader import ImageFeaturesH5Reader
 17 | 
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
# Sets HDF5_USE_FILE_LOCKING to "FALSE" for the whole process at import time.
# NOTE(review): presumably a leftover from an HDF5-based feature reader --
# confirm this is still needed.
os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"

# Answer string -> integer class label; three classes in total.
LABEL_MAP = {"Yes": 0, "No": 1, "N/A": 2}
 22 | 
 23 | 
 24 | def assert_eq(real, expected):
 25 |     assert real == expected, "%s (true) vs %s (expected)" % (real, expected)
 26 | 
 27 | 
 28 | def _create_entry(item):
 29 |     entry = {
 30 |         "question_id": item["question_id"],
 31 |         "image_id": item["image_id"],
 32 |         "question": item["question"],
 33 |         "answer": item,
 34 |     }
 35 |     return entry
 36 | 
 37 | 
 38 | def _load_dataset(dataroot, name):
 39 |     """Load entries
 40 | 
 41 |     dataroot: root path of dataset
 42 |     name: 'train', 'valid', 'test'
 43 |     """
 44 |     if name == "train" or name == "valid" or name == "test":
 45 |         annotations_path = os.path.join(dataroot, "guesswhat.%s.jsonl" % name)
 46 |         with jsonlines.open(annotations_path) as reader:
 47 |             # Build an index which maps image id with a list of qa annotations.
 48 |             items = []
 49 |             for annotation in reader:
 50 |                 for q in annotation["qas"]:
 51 |                     dictionary = {}
 52 |                     dictionary["image_id"] = annotation["image"]["id"]
 53 |                     dictionary["question_id"] = q["id"]
 54 |                     dictionary["question"] = q["question"]
 55 |                     dictionary["labels"] = [int(LABEL_MAP[str(q["answer"])])]
 56 |                     dictionary["scores"] = [1.0]
 57 |                     items.append(dictionary)
 58 |     else:
 59 |         assert False, "data split is not recognized."
 60 | 
 61 |     if "test" in name:
 62 |         entries = []
 63 |         for item in items:
 64 |             entries.append(item)
 65 |     else:
 66 |         entries = []
 67 |         for item in items:
 68 |             entries.append(_create_entry(item))
 69 |     return entries
 70 | 
 71 | 
 72 | class GuessWhatDataset(Dataset):
 73 |     def __init__(
 74 |         self,
 75 |         task: str,
 76 |         dataroot: str,
 77 |         annotations_jsonpath: str,
 78 |         split: str,
 79 |         image_features_reader: ImageFeaturesH5Reader,
 80 |         gt_image_features_reader: ImageFeaturesH5Reader,
 81 |         tokenizer: AutoTokenizer,
 82 |         padding_index: int = 0,
 83 |         max_seq_length: int = 16,
 84 |         max_region_num: int = 37,
 85 |     ):
 86 |         super().__init__()
 87 |         self.split = split
 88 |         self.num_labels = 3
 89 |         self._max_region_num = max_region_num
 90 |         self._max_seq_length = max_seq_length
 91 |         self._image_features_reader = image_features_reader
 92 |         self._tokenizer = tokenizer
 93 |         self._padding_index = padding_index
 94 |         cache_path = os.path.join(
 95 |             dataroot, "cache", task + "_" + split + "_" + str(max_seq_length) + ".pkl"
 96 |         )
 97 |         if not os.path.exists(cache_path):
 98 |             self.entries = _load_dataset(dataroot, split)
 99 |             self.tokenize(max_seq_length)
100 |             self.tensorize()
101 |             cPickle.dump(self.entries, open(cache_path, "wb"))
102 |         else:
103 |             logger.info("Loading from %s" % cache_path)
104 |             self.entries = cPickle.load(open(cache_path, "rb"))
105 | 
106 |     def tokenize(self, max_length=16):
107 |         """Tokenizes the questions.
108 | 
109 |         This will add q_token in each entry of the dataset.
110 |         -1 represent nil, and should be treated as padding_index in embedding
111 |         """
112 |         for entry in self.entries:
113 |             # tokens = self._tokenizer.tokenize(entry["question"])
114 |             # tokens = ["[CLS]"] + tokens + ["[SEP]"]
115 | 
116 |             # tokens = [
117 |             #     self._tokenizer.vocab.get(w, self._tokenizer.vocab["[UNK]"])
118 |             #     for w in tokens
119 |             # ]
120 |             tokens = self._tokenizer.encode(entry["question"])
121 |             tokens = [tokens[0]] + tokens[1:-1][: self._max_seq_length - 2] + [tokens[-1]]
122 | 
123 |             # tokens = tokens[:max_length]
124 |             segment_ids = [0] * len(tokens)
125 |             input_mask = [1] * len(tokens)
126 | 
127 |             if len(tokens) < max_length:
128 |                 # Note here we pad in front of the sentence
129 |                 padding = [self._padding_index] * (max_length - len(tokens))
130 |                 tokens = tokens + padding
131 |                 input_mask += padding
132 |                 segment_ids += padding
133 | 
134 |             assert_eq(len(tokens), max_length)
135 |             entry["q_token"] = tokens
136 |             entry["q_input_mask"] = input_mask
137 |             entry["q_segment_ids"] = segment_ids
138 | 
139 |     def tensorize(self):
140 | 
141 |         for entry in self.entries:
142 |             question = torch.from_numpy(np.array(entry["q_token"]))
143 |             entry["q_token"] = question
144 | 
145 |             q_input_mask = torch.from_numpy(np.array(entry["q_input_mask"]))
146 |             entry["q_input_mask"] = q_input_mask
147 | 
148 |             q_segment_ids = torch.from_numpy(np.array(entry["q_segment_ids"]))
149 |             entry["q_segment_ids"] = q_segment_ids
150 | 
151 |             if "test" not in self.split:
152 |                 answer = entry["answer"]
153 |                 labels = np.array(answer["labels"])
154 |                 scores = np.array(answer["scores"], dtype=np.float32)
155 |                 if len(labels):
156 |                     labels = torch.from_numpy(labels)
157 |                     scores = torch.from_numpy(scores)
158 |                     entry["answer"]["labels"] = labels
159 |                     entry["answer"]["scores"] = scores
160 |                 else:
161 |                     entry["answer"]["labels"] = None
162 |                     entry["answer"]["scores"] = None
163 | 
164 |     def __getitem__(self, index):
165 |         entry = self.entries[index]
166 |         image_id = entry["image_id"]
167 |         question_id = entry["question_id"]
168 |         features, num_boxes, boxes, _ = self._image_features_reader[image_id]
169 | 
170 |         mix_num_boxes = min(int(num_boxes), self._max_region_num)
171 |         mix_boxes_pad = np.zeros((self._max_region_num, 5))
172 |         mix_features_pad = np.zeros((self._max_region_num, 2048))
173 | 
174 |         image_mask = [1] * (int(mix_num_boxes))
175 |         while len(image_mask) < self._max_region_num:
176 |             image_mask.append(0)
177 | 
178 |         mix_boxes_pad[:mix_num_boxes] = boxes[:mix_num_boxes]
179 |         mix_features_pad[:mix_num_boxes] = features[:mix_num_boxes]
180 | 
181 |         features = torch.tensor(mix_features_pad).float()
182 |         image_mask = torch.tensor(image_mask).long()
183 |         spatials = torch.tensor(mix_boxes_pad).float()
184 | 
185 |         question = entry["q_token"]
186 |         input_mask = entry["q_input_mask"]
187 |         segment_ids = entry["q_segment_ids"]
188 | 
189 |         co_attention_mask = torch.zeros((self._max_region_num, self._max_seq_length))
190 |         target = torch.zeros(self.num_labels)
191 | 
192 |         if "test" not in self.split:
193 |             answer = entry["answer"]
194 |             labels = answer["labels"]
195 |             scores = answer["scores"]
196 |             if labels is not None:
197 |                 target.scatter_(0, labels, scores)
198 | 
199 |         return (
200 |             features,
201 |             spatials,
202 |             image_mask,
203 |             question,
204 |             target,
205 |             input_mask,
206 |             segment_ids,
207 |             co_attention_mask,
208 |             question_id,
209 |         )
210 | 
211 |     def __len__(self):
212 |         return len(self.entries)
213 | 


--------------------------------------------------------------------------------
/baselines/crossencoders/volta/datasets/nlvr2_dataset.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates.
  2 | # Copyright (c) 2020, Emanuele Bugliarello (@e-bug).
  3 | 
  4 | # This source code is licensed under the MIT license found in the
  5 | # LICENSE file in the root directory of this source tree.
  6 | 
  7 | import os
  8 | import logging
  9 | import jsonlines
 10 | import _pickle as cPickle
 11 | 
 12 | import numpy as np
 13 | 
 14 | import torch
 15 | from torch.utils.data import Dataset
 16 | 
 17 | 
 18 | logger = logging.getLogger(__name__)
 19 | os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"
 20 | 
 21 | 
 22 | def assert_eq(real, expected):
 23 |     assert real == expected, "%s (true) vs %s (expected)" % (real, expected)
 24 | 
 25 | 
 26 | def _create_entry(item):
 27 |     entry = {
 28 |         "question_id": item["question_id"],
 29 |         "image_id_0": item["image_id_0"],
 30 |         "image_id_1": item["image_id_1"],
 31 |         "sentence": item["sentence"],
 32 |         "answer": item,
 33 |     }
 34 |     return entry
 35 | 
 36 | 
 37 | def _load_dataset(dataroot, name):
 38 |     """Load entries
 39 | 
 40 |     dataroot: root path of dataset
 41 |     name: 'train', 'dev', 'test'
 42 |     """
 43 |     if name == "train" or name == "dev" or name == "test":
 44 |         annotations_path = os.path.join(dataroot, "%s.json" % name)
 45 |         with jsonlines.open(annotations_path) as reader:
 46 |             # Build an index which maps image id with a list of hypothesis annotations.
 47 |             items = []
 48 |             count = 0
 49 |             for annotation in reader:
 50 |                 dictionary = {}
 51 |                 dictionary["id"] = annotation["identifier"]
 52 |                 dictionary["image_id_0"] = (
 53 |                     "-".join(annotation["identifier"].split("-")[:-1]) + "-img0"
 54 |                 )
 55 |                 dictionary["image_id_1"] = (
 56 |                     "-".join(annotation["identifier"].split("-")[:-1]) + "-img1"
 57 |                 )
 58 |                 dictionary["question_id"] = count
 59 |                 dictionary["sentence"] = str(annotation["sentence"])
 60 |                 dictionary["labels"] = [0 if str(annotation["label"]) == "False" else 1]
 61 |                 dictionary["scores"] = [1.0]
 62 |                 items.append(dictionary)
 63 |                 count += 1
 64 |     else:
 65 |         assert False, "data split is not recognized."
 66 | 
 67 |     entries = []
 68 |     for item in items:
 69 |         entries.append(_create_entry(item))
 70 |     return entries
 71 | 
 72 | 
 73 | class NLVR2Dataset(Dataset):
 74 |     def __init__(
 75 |         self,
 76 |         task,
 77 |         dataroot,
 78 |         annotations_jsonpath,
 79 |         split,
 80 |         image_features_reader,
 81 |         gt_image_features_reader,
 82 |         tokenizer,
 83 |         bert_model,
 84 |         padding_index=0,
 85 |         max_seq_length=16,
 86 |         max_region_num=37,
 87 |         num_locs=5,
 88 |         add_global_imgfeat=None,
 89 |         append_mask_sep=False,
 90 |     ):
 91 |         super().__init__()
 92 |         self.split = split
 93 |         self.num_labels = 2
 94 |         self._max_region_num = max_region_num + int(add_global_imgfeat is not None)
 95 |         self._max_seq_length = max_seq_length
 96 |         self._image_features_reader = image_features_reader
 97 |         self._tokenizer = tokenizer
 98 |         self._padding_index = padding_index
 99 |         self._num_locs = num_locs
100 |         self._add_global_imgfeat = add_global_imgfeat
101 | 
102 |         if "roberta" in bert_model:
103 |             cache_path = os.path.join(
104 |                 dataroot,
105 |                 "cache",
106 |                 task
107 |                 + "_"
108 |                 + split
109 |                 + "_"
110 |                 + "roberta"
111 |                 + "_"
112 |                 + str(max_seq_length)
113 |                 + ".pkl",
114 |             )
115 |         else:
116 |             cache_path = os.path.join(
117 |                 dataroot,
118 |                 "cache",
119 |                 task
120 |                 + "_"
121 |                 + split
122 |                 + "_"
123 |                 + str(max_seq_length)
124 |                 + ".pkl",
125 |             )
126 |         if not os.path.exists(cache_path):
127 |             self.entries = _load_dataset(dataroot, split)
128 |             self.tokenize(max_seq_length)
129 |             self.tensorize()
130 |             cPickle.dump(self.entries, open(cache_path, "wb"))
131 |         else:
132 |             logger.info("Loading from %s" % cache_path)
133 |             self.entries = cPickle.load(open(cache_path, "rb"))
134 | 
135 |     def tokenize(self, max_length=16):
136 |         """Tokenizes the questions.
137 | 
138 |         This will add q_token in each entry of the dataset.
139 |         -1 represent nil, and should be treated as padding_index in embedding
140 |         """
141 |         for entry in self.entries:
142 |             tokens = self._tokenizer.encode(entry["sentence"])
143 |             tokens = [tokens[0]] + tokens[1:-1][: self._max_seq_length - 2] + [tokens[-1]]
144 | 
145 |             segment_ids = [0] * len(tokens)
146 |             input_mask = [1] * len(tokens)
147 | 
148 |             if len(tokens) < max_length:
149 |                 # Note here we pad in front of the sentence
150 |                 padding = [self._padding_index] * (max_length - len(tokens))
151 |                 tokens = tokens + padding
152 |                 input_mask += padding
153 |                 segment_ids += padding
154 | 
155 |             assert_eq(len(tokens), max_length)
156 |             entry["q_token"] = tokens
157 |             entry["q_input_mask"] = input_mask
158 |             entry["q_segment_ids"] = segment_ids
159 | 
160 |     def tensorize(self):
161 |         for entry in self.entries:
162 |             question = torch.from_numpy(np.array(entry["q_token"]))
163 |             entry["q_token"] = question
164 | 
165 |             q_input_mask = torch.from_numpy(np.array(entry["q_input_mask"]))
166 |             entry["q_input_mask"] = q_input_mask
167 | 
168 |             q_segment_ids = torch.from_numpy(np.array(entry["q_segment_ids"]))
169 |             entry["q_segment_ids"] = q_segment_ids
170 | 
171 |             answer = entry["answer"]
172 |             labels = np.array(answer["labels"])
173 |             scores = np.array(answer["scores"], dtype=np.float32)
174 |             if len(labels):
175 |                 labels = torch.from_numpy(labels)
176 |                 scores = torch.from_numpy(scores)
177 |                 entry["answer"]["labels"] = labels
178 |                 entry["answer"]["scores"] = scores
179 |             else:
180 |                 entry["answer"]["labels"] = None
181 |                 entry["answer"]["scores"] = None
182 | 
183 |     def __getitem__(self, index):
184 |         entry = self.entries[index]
185 |         image_id_0 = entry["image_id_0"]
186 |         image_id_1 = entry["image_id_1"]
187 |         question_id = entry["question_id"]
188 |         features_0, num_boxes_0, boxes_0, _ = self._image_features_reader[image_id_0]
189 |         features_1, num_boxes_1, boxes_1, _ = self._image_features_reader[image_id_1]
190 | 
191 |         mix_num_boxes = min(int(num_boxes_0) + int(num_boxes_1), self._max_region_num * 2)
192 |         mix_boxes_pad = np.zeros((self._max_region_num * 2, self._num_locs))
193 |         mix_features_pad = np.zeros((self._max_region_num * 2, 2048))
194 | 
195 |         image_mask = [1] * (int(mix_num_boxes))
196 |         while len(image_mask) < self._max_region_num * 2:
197 |             image_mask.append(0)
198 | 
199 |         mix_boxes_pad[:mix_num_boxes] = np.concatenate((boxes_0, boxes_1), axis=0)[:mix_num_boxes]
200 |         mix_features_pad[:mix_num_boxes] = np.concatenate((features_0, features_1), axis=0)[:mix_num_boxes]
201 | 
202 |         img_segment_ids = np.zeros((mix_features_pad.shape[0]))
203 |         img_segment_ids[:boxes_0.shape[0]] = 0
204 |         img_segment_ids[boxes_0.shape[0]:] = 1
205 | 
206 |         features = torch.tensor(mix_features_pad).float()
207 |         image_mask = torch.tensor(image_mask).long()
208 |         spatials = torch.tensor(mix_boxes_pad).float()
209 | 
210 |         question = entry["q_token"]
211 |         input_mask = entry["q_input_mask"]
212 |         segment_ids = entry["q_segment_ids"]
213 | 
214 |         target = torch.zeros(self.num_labels)
215 | 
216 |         answer = entry["answer"]
217 |         labels = answer["labels"]
218 |         scores = answer["scores"]
219 |         if labels is not None:
220 |             target.scatter_(0, labels, scores)
221 | 
222 |         return features, spatials, image_mask, question, target, input_mask, segment_ids, question_id
223 | 
224 |     def __len__(self):
225 |         return len(self.entries)
226 | 


--------------------------------------------------------------------------------
/baselines/crossencoders/volta/datasets/refer_dense_caption.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates.
  2 | 
  3 | # This source code is licensed under the MIT license found in the
  4 | # LICENSE file in the root directory of this source tree.
  5 | 
  6 | import json
  7 | from typing import Any, Dict, List
  8 | import random
  9 | import os
 10 | 
 11 | import torch
 12 | from torch.utils.data import Dataset
 13 | import numpy as np
 14 | 
 15 | from transformers import AutoTokenizer
 16 | from ._image_features_reader import ImageFeaturesH5Reader
 17 | import _pickle as cPickle
 18 | import sys
 19 | import pdb
 20 | 
 21 | 
 22 | def iou(anchors, gt_boxes):
 23 |     """
 24 |     anchors: (N, 4) ndarray of float
 25 |     gt_boxes: (K, 4) ndarray of float
 26 |     overlaps: (N, K) ndarray of overlap between boxes and query_boxes
 27 |     """
 28 |     N = anchors.size(0)
 29 |     K = gt_boxes.size(0)
 30 | 
 31 |     gt_boxes_area = (
 32 |         (gt_boxes[:, 2] - gt_boxes[:, 0] + 1) * (gt_boxes[:, 3] - gt_boxes[:, 1] + 1)
 33 |     ).view(1, K)
 34 | 
 35 |     anchors_area = (
 36 |         (anchors[:, 2] - anchors[:, 0] + 1) * (anchors[:, 3] - anchors[:, 1] + 1)
 37 |     ).view(N, 1)
 38 | 
 39 |     boxes = anchors.view(N, 1, 4).expand(N, K, 4)
 40 |     query_boxes = gt_boxes.view(1, K, 4).expand(N, K, 4)
 41 | 
 42 |     iw = (
 43 |         torch.min(boxes[:, :, 2], query_boxes[:, :, 2])
 44 |         - torch.max(boxes[:, :, 0], query_boxes[:, :, 0])
 45 |         + 1
 46 |     )
 47 |     iw[iw < 0] = 0
 48 | 
 49 |     ih = (
 50 |         torch.min(boxes[:, :, 3], query_boxes[:, :, 3])
 51 |         - torch.max(boxes[:, :, 1], query_boxes[:, :, 1])
 52 |         + 1
 53 |     )
 54 |     ih[ih < 0] = 0
 55 | 
 56 |     ua = anchors_area + gt_boxes_area - (iw * ih)
 57 |     overlaps = iw * ih / ua
 58 | 
 59 |     return overlaps
 60 | 
 61 | 
 62 | def assert_eq(real, expected):
 63 |     assert real == expected, "%s (true) vs %s (expected)" % (real, expected)
 64 | 
 65 | 
 66 | class ReferDenseCpationDataset(Dataset):
 67 |     def __init__(
 68 |         self,
 69 |         task: str,
 70 |         dataroot: str,
 71 |         annotations_jsonpath: str,
 72 |         split: str,
 73 |         image_features_reader: ImageFeaturesH5Reader,
 74 |         gt_image_features_reader: ImageFeaturesH5Reader,
 75 |         tokenizer: AutoTokenizer,
 76 |         bert_model,
 77 |         padding_index: int = 0,
 78 |         max_seq_length: int = 20,
 79 |         max_region_num: int = 60,
 80 |     ):
 81 |         self.split = split
 82 |         self.num_labels = 1
 83 |         self._image_features_reader = image_features_reader
 84 |         self._gt_image_features_reader = gt_image_features_reader
 85 |         self._tokenizer = tokenizer
 86 | 
 87 |         self._padding_index = padding_index
 88 |         self._max_seq_length = max_seq_length
 89 | 
 90 |         self.entries = self._load_annotations(annotations_jsonpath)
 91 |         self.max_region_num = max_region_num
 92 | 
 93 |         if "roberta" in bert_model:
 94 |             cache_path = os.path.join(
 95 |                 dataroot,
 96 |                 "cache",
 97 |                 task
 98 |                 + "_"
 99 |                 + split
100 |                 + "_"
101 |                 + "roberta"
102 |                 + "_"
103 |                 + str(max_seq_length)
104 |                 + "_"
105 |                 + str(max_region_num)
106 |                 + ".pkl",
107 |             )
108 |         else:
109 |             cache_path = os.path.join(
110 |                 dataroot,
111 |                 "cache",
112 |                 task
113 |                 + "_"
114 |                 + split
115 |                 + "_"
116 |                 + str(max_seq_length)
117 |                 + "_"
118 |                 + str(max_region_num)
119 |                 + ".pkl",
120 |             )
121 | 
122 |         if not os.path.exists(cache_path):
123 |             self.tokenize()
124 |             self.tensorize()
125 |             cPickle.dump(self.entries, open(cache_path, "wb"))
126 |         else:
127 |             print("loading entries from %s" % (cache_path))
128 |             self.entries = cPickle.load(open(cache_path, "rb"))
129 | 
130 |     def _load_annotations(self, annotations_jsonpath):
131 | 
132 |         # annotations_json: Dict[str, Any] = json.load(open(annotations_jsonpath))
133 |         annotations = json.load(open(annotations_jsonpath, "r"))
134 | 
135 |         # we simply use the last 5000 for val and test
136 |         if self.split == "train":
137 |             annotations = annotations[:-10000]
138 |         elif self.split == "val":
139 |             annotations = annotations[-10000:-5000]
140 |         elif self.split == "test":
141 |             annotations = annotations[-5000:]
142 | 
143 |         entries = []
144 |         for img in annotations:
145 |             image_id = img["id"]
146 |             for region in img["regions"]:
147 |                 phrase = region["phrase"]
148 |                 region_id = region["region_id"]
149 |                 bbox = [region["x"], region["y"], region["width"], region["height"]]
150 |                 entries.append(
151 |                     {
152 |                         "phrase": phrase,
153 |                         "region_id": region_id,
154 |                         "image_id": image_id,
155 |                         "bbox": bbox,
156 |                     }
157 |                 )
158 | 
159 |         return entries
160 | 
161 |     def tokenize(self):
162 |         """Tokenizes the captions.
163 | 
164 |         This will add caption_tokens in each entry of the dataset.
165 |         -1 represents nil, and should be treated as padding_idx in embedding.
166 |         """
167 |         count = 0
168 |         for entry in self.entries:
169 | 
170 |             # sentence_tokens = self._tokenizer.tokenize(entry["phrase"])
171 |             # sentence_tokens = ["[CLS]"] + sentence_tokens + ["[SEP]"]
172 | 
173 |             # tokens = [
174 |             #     self._tokenizer.vocab.get(w, self._tokenizer.vocab["[UNK]"])
175 |             #     for w in sentence_tokens
176 |             # ]
177 | 
178 |             tokens = self._tokenizer.encode(entry["phrase"])
179 |             tokens = [tokens[0]] + tokens[1:-1][: self._max_seq_length - 2] + [tokens[-1]]
180 | 
181 |             tokens = tokens[: self._max_seq_length]
182 |             segment_ids = [0] * len(tokens)
183 |             input_mask = [1] * len(tokens)
184 | 
185 |             if len(tokens) < self._max_seq_length:
186 |                 # Note here we pad in front of the sentence
187 |                 padding = [self._padding_index] * (self._max_seq_length - len(tokens))
188 |                 tokens = tokens + padding
189 |                 input_mask += padding
190 |                 segment_ids += padding
191 | 
192 |             assert_eq(len(tokens), self._max_seq_length)
193 |             entry["token"] = tokens
194 |             entry["input_mask"] = input_mask
195 |             entry["segment_ids"] = segment_ids
196 | 
197 |             sys.stdout.write("%d/%d\r" % (count, len(self.entries)))
198 |             sys.stdout.flush()
199 |             count += 1
200 | 
201 |     def tensorize(self):
202 |         for entry in self.entries:
203 |             token = torch.from_numpy(np.array(entry["token"]))
204 |             entry["token"] = token
205 | 
206 |             input_mask = torch.from_numpy(np.array(entry["input_mask"]))
207 |             entry["input_mask"] = input_mask
208 | 
209 |             segment_ids = torch.from_numpy(np.array(entry["segment_ids"]))
210 |             entry["segment_ids"] = segment_ids
211 | 
212 |     def __getitem__(self, index):
213 |         entry = self.entries[index]
214 | 
215 |         image_id = entry["image_id"]
216 |         ref_box = entry["bbox"]
217 | 
218 |         ref_box = [
219 |             ref_box[0],
220 |             ref_box[1],
221 |             ref_box[0] + ref_box[2],
222 |             ref_box[1] + ref_box[3],
223 |         ]
224 |         features, num_boxes, boxes, boxes_ori = self._image_features_reader[image_id]
225 | 
226 |         boxes_ori = boxes_ori[:num_boxes]
227 |         boxes = boxes[:num_boxes]
228 |         features = features[:num_boxes]
229 | 
230 |         mix_boxes_ori = boxes_ori
231 |         mix_boxes = boxes
232 |         mix_features = features
233 |         mix_num_boxes = min(int(num_boxes), self.max_region_num)
234 |         mix_target = iou(
235 |             torch.tensor(mix_boxes_ori[:, :4]).float(), torch.tensor([ref_box]).float()
236 |         )
237 | 
238 |         image_mask = [1] * (mix_num_boxes)
239 |         while len(image_mask) < self.max_region_num:
240 |             image_mask.append(0)
241 | 
242 |         mix_boxes_pad = np.zeros((self.max_region_num, 5))
243 |         mix_features_pad = np.zeros((self.max_region_num, 2048))
244 | 
245 |         # random sample index for the target.
246 |         pdb.set_trace()
247 | 
248 |         mix_boxes_pad[:mix_num_boxes] = mix_boxes
249 |         mix_features_pad[:mix_num_boxes] = mix_features
250 | 
251 |         # appending the target feature.
252 |         features = torch.tensor(mix_features_pad).float()
253 |         image_mask = torch.tensor(image_mask).long()
254 |         spatials = torch.tensor(mix_boxes_pad).float()
255 | 
256 |         target = torch.zeros((self.max_region_num, 1)).float()
257 |         target[:mix_num_boxes] = mix_target
258 | 
259 |         spatials_ori = torch.tensor(mix_boxes_ori).float()
260 |         co_attention_mask = torch.zeros((self.max_region_num, self._max_seq_length))
261 | 
262 |         caption = entry["token"]
263 |         input_mask = entry["input_mask"]
264 |         segment_ids = entry["segment_ids"]
265 | 
266 |         return (
267 |             features,
268 |             spatials,
269 |             image_mask,
270 |             caption,
271 |             target,
272 |             input_mask,
273 |             segment_ids,
274 |             co_attention_mask,
275 |             image_id,
276 |         )
277 | 
278 |     def __len__(self):
279 |         return len(self.entries)
280 | 


--------------------------------------------------------------------------------
/baselines/crossencoders/volta/datasets/refer_expression_dataset.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates.
  2 | # Copyright (c) 2020, Emanuele Bugliarello (@e-bug).
  3 | 
  4 | # This source code is licensed under the MIT license found in the
  5 | # LICENSE file in the root directory of this source tree.
  6 | 
  7 | import os
  8 | import _pickle as cPickle
  9 | 
 10 | import numpy as np
 11 | 
 12 | import torch
 13 | from torch.utils.data import Dataset
 14 | 
 15 | from transformers import AutoTokenizer
 16 | from ._image_features_reader import ImageFeaturesH5Reader
 17 | 
 18 | from tools.refer.refer import REFER
 19 | 
 20 | 
 21 | def iou(anchors, gt_boxes):
 22 |     """
 23 |     anchors: (N, 4) ndarray of float
 24 |     gt_boxes: (K, 4) ndarray of float
 25 |     overlaps: (N, K) ndarray of overlap between boxes and query_boxes
 26 |     """
 27 |     N = anchors.size(0)
 28 |     K = gt_boxes.size(0)
 29 | 
 30 |     gt_boxes_area = (
 31 |         (gt_boxes[:, 2] - gt_boxes[:, 0] + 1) * (gt_boxes[:, 3] - gt_boxes[:, 1] + 1)
 32 |     ).view(1, K)
 33 | 
 34 |     anchors_area = (
 35 |         (anchors[:, 2] - anchors[:, 0] + 1) * (anchors[:, 3] - anchors[:, 1] + 1)
 36 |     ).view(N, 1)
 37 | 
 38 |     boxes = anchors.view(N, 1, 4).expand(N, K, 4)
 39 |     query_boxes = gt_boxes.view(1, K, 4).expand(N, K, 4)
 40 | 
 41 |     iw = (
 42 |         torch.min(boxes[:, :, 2], query_boxes[:, :, 2])
 43 |         - torch.max(boxes[:, :, 0], query_boxes[:, :, 0])
 44 |         + 1
 45 |     )
 46 |     iw[iw < 0] = 0
 47 | 
 48 |     ih = (
 49 |         torch.min(boxes[:, :, 3], query_boxes[:, :, 3])
 50 |         - torch.max(boxes[:, :, 1], query_boxes[:, :, 1])
 51 |         + 1
 52 |     )
 53 |     ih[ih < 0] = 0
 54 | 
 55 |     ua = anchors_area + gt_boxes_area - (iw * ih)
 56 |     overlaps = iw * ih / ua
 57 | 
 58 |     return overlaps
 59 | 
 60 | 
 61 | def assert_eq(real, expected):
 62 |     assert real == expected, "%s (true) vs %s (expected)" % (real, expected)
 63 | 
 64 | 
 65 | class ReferExpressionDataset(Dataset):
 66 |     def __init__(
 67 |         self,
 68 |         task: str,
 69 |         dataroot: str,
 70 |         annotations_jsonpath: str,
 71 |         split: str,
 72 |         image_features_reader: ImageFeaturesH5Reader,
 73 |         gt_image_features_reader: ImageFeaturesH5Reader,
 74 |         tokenizer: AutoTokenizer,
 75 |         bert_model,
 76 |         padding_index: int = 0,
 77 |         max_seq_length: int = 20,
 78 |         max_region_num: int = 60,
 79 |         num_locs=5,
 80 |         add_global_imgfeat=None,
 81 |         append_mask_sep=False,
 82 |     ):
 83 |         self.split = split
 84 | 
 85 |         if task == "refcocog":
 86 |             self.refer = REFER(dataroot, dataset=task, splitBy="umd")
 87 |         else:
 88 |             self.refer = REFER(dataroot, dataset=task, splitBy="unc")
 89 | 
 90 |         if self.split == "mteval":
 91 |             self.ref_ids = self.refer.getRefIds(split="train")
 92 |         else:
 93 |             self.ref_ids = self.refer.getRefIds(split=split)
 94 | 
 95 |         print("%s refs are in split [%s]." % (len(self.ref_ids), split))
 96 | 
 97 |         self.num_labels = 1
 98 |         self._image_features_reader = image_features_reader
 99 |         self._gt_image_features_reader = gt_image_features_reader
100 |         self._tokenizer = tokenizer
101 | 
102 |         self._padding_index = padding_index
103 |         self._max_seq_length = max_seq_length
104 |         self.dataroot = dataroot
105 |         self.entries = self._load_annotations()
106 | 
107 |         self._max_region_num = max_region_num + int(add_global_imgfeat is not None)
108 |         self._num_locs = num_locs
109 |         self._add_global_imgfeat = add_global_imgfeat
110 | 
111 |         if "roberta" in bert_model:
112 |             cache_path = os.path.join(
113 |                 dataroot,
114 |                 "cache",
115 |                 task
116 |                 + "_"
117 |                 + split
118 |                 + "_"
119 |                 + "roberta"
120 |                 + "_"
121 |                 + str(max_seq_length)
122 |                 + "_"
123 |                 + str(max_region_num)
124 |                 + ".pkl",
125 |             )
126 |         else:
127 |             cache_path = os.path.join(
128 |                 dataroot,
129 |                 "cache",
130 |                 task
131 |                 + "_"
132 |                 + split
133 |                 + "_"
134 |                 + str(max_seq_length)
135 |                 + "_"
136 |                 + str(max_region_num)
137 |                 + ".pkl",
138 |             )
139 | 
140 |         if not os.path.exists(cache_path):
141 |             self.tokenize()
142 |             self.tensorize()
143 |             cPickle.dump(self.entries, open(cache_path, "wb"))
144 |         else:
145 |             print("loading entries from %s" % (cache_path))
146 |             self.entries = cPickle.load(open(cache_path, "rb"))
147 | 
148 |     def _load_annotations(self):
149 |         # Build an index which maps image id with a list of caption annotations.
150 |         entries = []
151 |         remove_ids = []
152 |         if self.split == "mteval":
153 |             remove_ids = np.load(
154 |                 os.path.join(self.dataroot, "cache", "coco_test_ids.npy")
155 |             )
156 |             remove_ids = [int(x) for x in remove_ids]
157 | 
158 |         for ref_id in self.ref_ids:
159 |             ref = self.refer.Refs[ref_id]
160 |             image_id = ref["image_id"]
161 |             if self.split == "train" and int(image_id) in remove_ids:
162 |                 continue
163 |             elif self.split == "mteval" and int(image_id) not in remove_ids:
164 |                 continue
165 |             ref_id = ref["ref_id"]
166 |             refBox = self.refer.getRefBox(ref_id)
167 |             for sent, sent_id in zip(ref["sentences"], ref["sent_ids"]):
168 |                 caption = sent["raw"]
169 |                 entries.append(
170 |                     {
171 |                         "caption": caption,
172 |                         "sent_id": sent_id,
173 |                         "image_id": image_id,
174 |                         "refBox": refBox,
175 |                         "ref_id": ref_id,
176 |                     }
177 |                 )
178 | 
179 |         return entries
180 | 
181 |     def tokenize(self):
182 |         """Tokenizes the captions.
183 | 
184 |         This will add caption_tokens in each entry of the dataset.
185 |         -1 represents nil, and should be treated as padding_idx in embedding.
186 |         """
187 |         for entry in self.entries:
188 |             tokens = self._tokenizer.encode(entry["caption"])
189 |             tokens = [tokens[0]] + tokens[1:-1][: self._max_seq_length - 2] + [tokens[-1]]
190 | 
191 |             segment_ids = [0] * len(tokens)
192 |             input_mask = [1] * len(tokens)
193 | 
194 |             if len(tokens) < self._max_seq_length:
195 |                 # Note here we pad in front of the sentence
196 |                 padding = [self._padding_index] * (self._max_seq_length - len(tokens))
197 |                 tokens = tokens + padding
198 |                 input_mask += padding
199 |                 segment_ids += padding
200 | 
201 |             assert_eq(len(tokens), self._max_seq_length)
202 |             entry["token"] = tokens
203 |             entry["input_mask"] = input_mask
204 |             entry["segment_ids"] = segment_ids
205 | 
206 |     def tensorize(self):
207 |         for entry in self.entries:
208 |             token = torch.from_numpy(np.array(entry["token"]))
209 |             entry["token"] = token
210 | 
211 |             input_mask = torch.from_numpy(np.array(entry["input_mask"]))
212 |             entry["input_mask"] = input_mask
213 | 
214 |             segment_ids = torch.from_numpy(np.array(entry["segment_ids"]))
215 |             entry["segment_ids"] = segment_ids
216 | 
217 |     def __getitem__(self, index):
218 |         entry = self.entries[index]
219 | 
220 |         image_id = entry["image_id"]
221 |         ref_box = entry["refBox"]
222 | 
223 |         ref_box = [
224 |             ref_box[0],
225 |             ref_box[1],
226 |             ref_box[0] + ref_box[2],
227 |             ref_box[1] + ref_box[3],
228 |         ]
229 |         features, num_boxes, boxes, boxes_ori = self._image_features_reader[image_id]
230 | 
231 |         boxes_ori = boxes_ori[:num_boxes]
232 |         boxes = boxes[:num_boxes]
233 |         features = features[:num_boxes]
234 | 
235 |         mix_boxes_ori = boxes_ori
236 |         mix_boxes = boxes
237 |         mix_features = features
238 |         mix_num_boxes = min(int(num_boxes), self._max_region_num)
239 |         mix_target = iou(
240 |             torch.tensor(mix_boxes_ori[:, :4]).float(),
241 |             torch.tensor([ref_box]).float(),
242 |         )
243 | 
244 |         image_mask = [1] * (mix_num_boxes)
245 |         while len(image_mask) < self._max_region_num:
246 |             image_mask.append(0)
247 | 
248 |         mix_boxes_pad = np.zeros((self._max_region_num, self._num_locs))
249 |         mix_features_pad = np.zeros((self._max_region_num, 2048))
250 | 
251 |         mix_boxes_pad[:mix_num_boxes] = mix_boxes[:mix_num_boxes]
252 |         mix_features_pad[:mix_num_boxes] = mix_features[:mix_num_boxes]
253 | 
254 |         # appending the target feature.
255 |         features = torch.tensor(mix_features_pad).float()
256 |         image_mask = torch.tensor(image_mask).long()
257 |         spatials = torch.tensor(mix_boxes_pad).float()
258 | 
259 |         target = torch.zeros((self._max_region_num, 1)).float()
260 |         target[:mix_num_boxes] = mix_target[:mix_num_boxes]
261 | 
262 |         spatials_ori = torch.tensor(mix_boxes_ori).float()
263 | 
264 |         caption = entry["token"]
265 |         input_mask = entry["input_mask"]
266 |         segment_ids = entry["segment_ids"]
267 | 
268 |         return features, spatials, image_mask, caption, target, input_mask, segment_ids, image_id
269 | 
270 |     def __len__(self):
271 |         return len(self.entries)
272 | 


--------------------------------------------------------------------------------
/baselines/crossencoders/volta/datasets/visdial_dataset.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates.
  2 | 
  3 | # This source code is licensed under the MIT license found in the
  4 | # LICENSE file in the root directory of this source tree.
  5 | 
  6 | import json
  7 | import random
  8 | import os
  9 | import logging
 10 | 
 11 | import torch
 12 | from torch.utils.data import Dataset
 13 | import numpy as np
 14 | import _pickle as cPickle
 15 | 
 16 | from transformers import AutoTokenizer
 17 | from ._image_features_reader import ImageFeaturesH5Reader
 18 | import pdb
 19 | import csv
 20 | import sys
 21 | import copy
 22 | 
 23 | logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
 24 | os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"
 25 | 
 26 | 
 27 | def assert_eq(real, expected):
 28 |     assert real == expected, "%s (true) vs %s (expected)" % (real, expected)
 29 | 
 30 | 
 31 | def _load_dataset(annotations_jsonpath, clean_datasets):
 32 |     """Build an index out of FOIL annotations, mapping each image ID with its corresponding captions."""
 33 |     entries = []
 34 |     captions = []
 35 |     remove_ids = []
 36 |     if clean_datasets:
 37 |         remove_ids = np.load(os.path.join(dataroot, "cache", "genome_test_ids.npy"))
 38 |         remove_ids = [int(x) for x in remove_ids]
 39 |     print("Loading dataset from %s" % annotations_jsonpath)
 40 |     annotations = json.load(open(annotations_jsonpath, "r"))["data"]
 41 |     print("Finish loading ...")
 42 |     for i, dialog in enumerate(annotations["dialogs"]):
 43 |         image_id = dialog["image_id"]
 44 |         if int(image_id) in remove_ids:
 45 |             continue
 46 |         captions.append(dialog["caption"])
 47 |         entries.append({"image_id": image_id, "dialog": dialog["dialog"], "caption": i})
 48 | 
 49 |     return entries, annotations["questions"], annotations["answers"], captions
 50 | 
 51 | 
 52 | class VisDialDataset(Dataset):
 53 |     def __init__(
 54 |         self,
 55 |         task,
 56 |         dataroot,
 57 |         annotations_jsonpath,
 58 |         split,
 59 |         image_features_reader,
 60 |         gt_image_features_reader,
 61 |         tokenizer,
 62 |         bert_model,
 63 |         clean_datasets,
 64 |         padding_index=0,
 65 |         max_seq_length=16,
 66 |         max_region_num=101,
 67 |     ):
 68 | 
 69 |         self._image_features_reader = image_features_reader
 70 |         self._tokenizer = tokenizer
 71 | 
 72 |         self._padding_index = padding_index
 73 |         self._max_seq_length = max_seq_length
 74 |         self._max_region_num = max_region_num
 75 |         self._total_seq_length = 50
 76 |         self.num_labels = 1
 77 | 
 78 |         self.max_round_num = 3
 79 |         self.max_num_option = 4
 80 |         self.ans_option = 100
 81 |         self.CLS = self._tokenizer.convert_tokens_to_ids(["[CLS]"])[0]
 82 |         self.SEP = self._tokenizer.convert_tokens_to_ids(["[SEP]"])[0]
 83 | 
 84 |         clean_train = "_cleaned" if clean_datasets else ""
 85 | 
 86 |         if "roberta" in bert_model:
 87 |             cache_path = os.path.join(
 88 |                 dataroot,
 89 |                 "cache",
 90 |                 task
 91 |                 + "_"
 92 |                 + split
 93 |                 + "_"
 94 |                 + "roberta"
 95 |                 + "_"
 96 |                 + str(max_seq_length)
 97 |                 + clean_train
 98 |                 + ".pkl",
 99 |             )
100 |         else:
101 |             cache_path = os.path.join(
102 |                 dataroot,
103 |                 "cache",
104 |                 task + "_" + split + "_" + str(max_seq_length) + clean_train + ".pkl",
105 |             )
106 | 
107 |         if not os.path.exists(cache_path):
108 |             self._entries, questions, answers, captions = _load_dataset(
109 |                 annotations_jsonpath, clean_datasets
110 |             )
111 |             self._questions, self._answers, self._captions = self.tokenizeQA(
112 |                 questions, answers, captions
113 |             )
114 |             file_save = {}
115 |             file_save["entries"] = self._entries
116 |             file_save["questions"] = self._questions
117 |             file_save["answers"] = self._answers
118 |             file_save["captions"] = self._captions
119 |             cPickle.dump(file_save, open(cache_path, "wb"))
120 |         else:
121 |             logger.info("Loading from %s" % cache_path)
122 |             file_save = cPickle.load(open(cache_path, "rb"))
123 |             self._entries = file_save["entries"]
124 |             self._questions = file_save["questions"]
125 |             self._answers = file_save["answers"]
126 |             self._captions = file_save["captions"]
127 | 
128 |     def tokenizeQA(self, questions, answers, captions):
129 |         """Tokenizes the captions.
130 | 
131 |         This will add caption_tokens in each entry of the dataset.
132 |         -1 represents nil, and should be treated as padding_idx in embedding.
133 |         """
134 |         question_token = []
135 |         answer_token = []
136 |         caption_token = []
137 | 
138 |         for question in questions:
139 |             # replace with name
140 |             question_token.append(
141 |                 self._tokenizer.convert_tokens_to_ids(
142 |                     self._tokenizer.tokenize(question)
143 |                 )
144 |             )
145 | 
146 |         for answer in answers:
147 |             # replace with name
148 |             answer_token.append(
149 |                 self._tokenizer.convert_tokens_to_ids(self._tokenizer.tokenize(answer))
150 |             )
151 | 
152 |         for caption in captions:
153 |             # replace with name
154 |             caption_token.append(
155 |                 self._tokenizer.convert_tokens_to_ids(self._tokenizer.tokenize(caption))
156 |             )
157 | 
158 |         return question_token, answer_token, caption_token
159 | 
160 |     def _truncate_seq(self, tokens_a, max_length):
161 |         """Truncates a sequence pair in place to the maximum length."""
162 | 
163 |         # This is a simple heuristic which will always truncate the longer sequence
164 |         # one token at a time. This makes more sense than truncating an equal percent
165 |         # of tokens from each, since if one sequence is very short then each token
166 |         # that's truncated likely contains more information than a longer sequence.
167 |         while True:
168 |             total_length = len(tokens_a)
169 |             if total_length <= max_length:
170 |                 break
171 | 
172 |             tokens_a.pop(0)
173 | 
174 |         return tokens_a
175 | 
176 |     def __getitem__(self, index):
177 | 
178 |         entry = self._entries[index]
179 |         image_id = entry["image_id"]
180 |         features, num_boxes, boxes, _ = self._image_features_reader[image_id]
181 |         image_mask = [1] * (int(num_boxes))
182 |         while len(image_mask) < self._max_region_num:
183 |             image_mask.append(0)
184 | 
185 |         features = torch.tensor(features).float()
186 |         image_mask = torch.tensor(image_mask).long()
187 |         spatials = torch.tensor(boxes).float()
188 | 
189 |         # Let's sample one dialog at a time.
190 |         caption = self._captions[entry["caption"]]
191 | 
192 |         input_ids_all = []
193 |         input_mask_all = []
194 |         segment_ids_all = []
195 | 
196 |         for rnd in range(10):
197 |             ques = self._questions[entry["dialog"][rnd]["question"]]
198 |             # fact is all previous question+answer
199 |             tokens_fact = []
200 |             for j in range(rnd):
201 |                 if rnd - self.max_round_num <= j:
202 |                     fact_q = self._questions[entry["dialog"][j]["question"]]
203 |                     fact_a = self._answers[entry["dialog"][j]["answer"]]
204 |                     if len(tokens_fact) == 0:
205 |                         tokens_fact = tokens_fact + fact_q + [self.SEP] + fact_a
206 |                     else:
207 |                         tokens_fact = (
208 |                             tokens_fact + [self.SEP] + fact_q + [self.SEP] + fact_a
209 |                         )
210 | 
211 |             token_q = ques
212 | 
213 |             if len(tokens_fact) == 0:
214 |                 tokens_f = caption
215 |             else:
216 |                 tokens_f = tokens_fact + [self.SEP] + caption
217 |             answer_candidate = []
218 |             answer_candidate.append(entry["dialog"][rnd]["gt_index"])
219 |             rand_idx = np.random.permutation(self.ans_option)
220 |             count = 0
221 |             while len(answer_candidate) < self.max_num_option:
222 |                 if rand_idx[count] != entry["dialog"][rnd]["gt_index"]:
223 |                     answer_candidate.append(rand_idx[count])
224 |                 count += 1
225 | 
226 |             input_ids_rnd = []
227 |             input_mask_rnd = []
228 |             segment_ids_rnd = []
229 | 
230 |             for i, ans_idx in enumerate(answer_candidate):
231 |                 tokens_a = self._answers[
232 |                     entry["dialog"][rnd]["answer_options"][ans_idx]
233 |                 ]
234 |                 tokens_f_new = self._truncate_seq(
235 |                     copy.deepcopy(tokens_f),
236 |                     self._total_seq_length - len(token_q) - len(tokens_a) - 4,
237 |                 )
238 | 
239 |                 tokens = []
240 |                 segment_ids = []
241 | 
242 |                 tokens.append(self.CLS)
243 |                 segment_ids.append(0)
244 |                 for token in token_q:
245 |                     tokens.append(token)
246 |                     segment_ids.append(0)
247 | 
248 |                 tokens.append(self.SEP)
249 |                 segment_ids.append(0)
250 | 
251 |                 for token in tokens_a:
252 |                     tokens.append(token)
253 |                     segment_ids.append(1)
254 | 
255 |                 tokens.append(self.SEP)
256 |                 segment_ids.append(1)
257 | 
258 |                 for token in tokens_f_new:
259 |                     tokens.append(token)
260 |                     segment_ids.append(0)
261 | 
262 |                 tokens.append(self.SEP)
263 |                 segment_ids.append(0)
264 | 
265 |                 input_mask = [1] * (len(tokens))
266 |                 # Zero-pad up to the sequence length.
267 |                 while len(tokens) < self._total_seq_length:
268 |                     tokens.append(0)
269 |                     input_mask.append(0)
270 |                     segment_ids.append(0)
271 | 
272 |                 input_ids_rnd.append(tokens)
273 |                 input_mask_rnd.append(input_mask)
274 |                 segment_ids_rnd.append(segment_ids)
275 | 
276 |             input_ids_all.append(input_ids_rnd)
277 |             input_mask_all.append(input_mask_rnd)
278 |             segment_ids_all.append(segment_ids_rnd)
279 | 
280 |         input_ids = torch.from_numpy(np.array(input_ids_all))
281 |         input_mask = torch.from_numpy(np.array(input_mask_all))
282 |         segment_ids = torch.from_numpy(np.array(segment_ids_all))
283 |         co_attention_mask = torch.zeros(
284 |             (10, self.max_num_option, self._max_region_num, self._total_seq_length)
285 |         )
286 |         target = torch.zeros(10).long()
287 |         return (
288 |             features,
289 |             spatials,
290 |             image_mask,
291 |             input_ids,
292 |             target,
293 |             input_mask,
294 |             segment_ids,
295 |             co_attention_mask,
296 |             image_id,
297 |         )
298 | 
299 |     def __len__(self):
300 |         return len(self._entries)
301 | 


--------------------------------------------------------------------------------
/baselines/crossencoders/volta/datasets/vismadlibs_dataset.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates.
  2 | 
  3 | # This source code is licensed under the MIT license found in the
  4 | # LICENSE file in the root directory of this source tree.
  5 | 
  6 | import os
  7 | import json
  8 | import _pickle as cPickle
  9 | 
 10 | # import cPickle
 11 | import numpy as np
 12 | import torch
 13 | from torch.utils.data import Dataset
 14 | 
 15 | from ._image_features_reader import ImageFeaturesH5Reader
 16 | 
 17 | 
 18 | os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"
 19 | 
 20 | 
 21 | def assert_eq(real, expected):
 22 |     assert real == expected, "%s (true) vs %s (expected)" % (real, expected)
 23 | 
 24 | 
 25 | def _create_entry(question, answer):
 26 |     answer.pop("image_id")
 27 |     answer.pop("question_id")
 28 |     entry = {
 29 |         "question_id": question["question_id"],
 30 |         "image_id": question["image_id"],
 31 |         "question": question["question"],
 32 |         "answer": answer,
 33 |     }
 34 |     return entry
 35 | 
 36 | 
 37 | def _load_dataset(dataroot, name):
 38 |     """Load entries
 39 | 
 40 |     dataroot: root path of dataset
 41 |     name: 'train', 'val'
 42 |     """
 43 |     question_path = os.path.join(
 44 |         dataroot, "v2_OpenEnded_mscoco_%s2014_questions.json" % name
 45 |     )
 46 |     questions = sorted(
 47 |         json.load(open(question_path))["questions"], key=lambda x: x["question_id"]
 48 |     )
 49 |     answer_path = os.path.join(dataroot, "cache", "%s_target.pkl" % name)
 50 |     answers = cPickle.load(open(answer_path, "rb"))
 51 |     answers = sorted(answers, key=lambda x: x["question_id"])
 52 | 
 53 |     assert_eq(len(questions), len(answers))
 54 |     entries = []
 55 |     for question, answer in zip(questions, answers):
 56 |         assert_eq(question["question_id"], answer["question_id"])
 57 |         assert_eq(question["image_id"], answer["image_id"])
 58 |         entries.append(_create_entry(question, answer))
 59 | 
 60 |     return entries
 61 | 
 62 | 
 63 | class VMMultipleChoiceDataset(Dataset):
 64 |     def __init__(
 65 |         self, name, image_features_reader, tokenizer, dataroot="data", padding_index=0
 66 |     ):
 67 |         super().__init__()
 68 |         assert name in ["train", "val"]
 69 | 
 70 |         # ans2label_path = os.path.join(dataroot, "cache", "trainval_ans2label.pkl")
 71 |         # label2ans_path = os.path.join(dataroot, "cache", "trainval_label2ans.pkl")
 72 |         # self.ans2label = cPickle.load(open(ans2label_path, "rb"))
 73 |         # self.label2ans = cPickle.load(open(label2ans_path, "rb"))
 74 |         # self.num_ans_candidates = len(self.ans2label)
 75 | 
 76 |         self._image_features_reader = image_features_reader
 77 |         self._tokenizer = tokenizer
 78 |         self._padding_index = padding_index
 79 | 
 80 |         self.entries = _load_dataset(dataroot, name)
 81 | 
 82 |         # cache file path data/cache/train_ques
 83 |         madlibs = "data/VisualMadlibs/madlibs_train_v1/"
 84 |         if not os.path.exists(ques_cache_path):
 85 |             self.tokenize()
 86 |             self.tensorize()
 87 |             # cPickle.dump(self.entries, open(ques_cache_path, 'wb'))
 88 |         else:
 89 |             self.entries = cPickle.load(open(ques_cache_path, "rb"))
 90 | 
 91 |     def tokenize(self, max_length=16):
 92 |         """Tokenizes the questions.
 93 | 
 94 |         This will add q_token in each entry of the dataset.
 95 |         -1 represent nil, and should be treated as padding_index in embedding
 96 |         """
 97 |         for entry in self.entries:
 98 |             # sentence_tokens = self._tokenizer.tokenize(entry["question"])
 99 |             # sentence_tokens = ["[CLS]"] + sentence_tokens + ["[SEP]"]
100 | 
101 |             # tokens = [
102 |             #     self._tokenizer.vocab.get(w, self._tokenizer.vocab["[UNK]"])
103 |             #     for w in sentence_tokens
104 |             # ]
105 |             tokens = self._tokenizer.encode(entry["question"])
106 | 
107 |             tokens = tokens[:max_length]
108 |             segment_ids = [0] * len(tokens)
109 |             input_mask = [1] * len(tokens)
110 | 
111 |             if len(tokens) < max_length:
112 |                 # Note here we pad in front of the sentence
113 |                 padding = [self._padding_index] * (max_length - len(tokens))
114 |                 tokens = tokens + padding
115 |                 input_mask += padding
116 |                 segment_ids += padding
117 | 
118 |             assert_eq(len(tokens), max_length)
119 |             entry["q_token"] = tokens
120 |             entry["q_input_mask"] = input_mask
121 |             entry["q_segment_ids"] = segment_ids
122 | 
123 |     def tensorize(self):
124 | 
125 |         for entry in self.entries:
126 |             question = torch.from_numpy(np.array(entry["q_token"]))
127 |             entry["q_token"] = question
128 | 
129 |             q_input_mask = torch.from_numpy(np.array(entry["q_input_mask"]))
130 |             entry["q_input_mask"] = q_input_mask
131 | 
132 |             q_segment_ids = torch.from_numpy(np.array(entry["q_segment_ids"]))
133 |             entry["q_segment_ids"] = q_segment_ids
134 | 
135 |             answer = entry["answer"]
136 |             labels = np.array(answer["labels"])
137 |             scores = np.array(answer["scores"], dtype=np.float32)
138 |             if len(labels):
139 |                 labels = torch.from_numpy(labels)
140 |                 scores = torch.from_numpy(scores)
141 |                 entry["answer"]["labels"] = labels
142 |                 entry["answer"]["scores"] = scores
143 |             else:
144 |                 entry["answer"]["labels"] = None
145 |                 entry["answer"]["scores"] = None
146 | 
147 |     def __getitem__(self, index):
148 |         entry = self.entries[index]
149 |         image_id = entry["image_id"]
150 |         features = torch.tensor(self._image_features_reader[image_id])
151 |         spatials = -1
152 | 
153 |         question = entry["q_token"]
154 |         answer = entry["answer"]
155 |         labels = answer["labels"]
156 |         scores = answer["scores"]
157 |         input_mask = entry["q_input_mask"]
158 |         segment_ids = entry["q_segment_ids"]
159 | 
160 |         target = torch.zeros(self.num_ans_candidates)
161 |         if labels is not None:
162 |             target.scatter_(0, labels, scores)
163 | 
164 |         return features, spatials, question, target, input_mask, segment_ids
165 | 
166 |     def __len__(self):
167 |         return len(self.entries)
168 | 


--------------------------------------------------------------------------------
/baselines/crossencoders/volta/datasets/visual_entailment_dataset.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates.
  2 | # Copyright (c) 2020, Emanuele Bugliarello (@e-bug).
  3 | 
  4 | # This source code is licensed under the MIT license found in the
  5 | # LICENSE file in the root directory of this source tree.
  6 | 
  7 | import os
  8 | import logging
  9 | import jsonlines
 10 | import _pickle as cPickle
 11 | 
 12 | import numpy as np
 13 | 
 14 | import torch
 15 | from torch.utils.data import Dataset
 16 | 
 17 | from transformers import AutoTokenizer
 18 | from ._image_features_reader import ImageFeaturesH5Reader
 19 | 
 20 | 
 21 | logger = logging.getLogger(__name__)
 22 | os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"
 23 | 
 24 | LABEL_MAP = {"contradiction": 0, "neutral": 1, "entailment": 2}
 25 | 
 26 | 
 27 | def assert_eq(real, expected):
 28 |     assert real == expected, "%s (true) vs %s (expected)" % (real, expected)
 29 | 
 30 | 
 31 | def _create_entry(item):
 32 |     entry = {
 33 |         "question_id": item["question_id"],
 34 |         "image_id": item["image_id"],
 35 |         "hypothesis": item["hypothesis"],
 36 |         "answer": item,
 37 |     }
 38 |     return entry
 39 | 
 40 | 
 41 | def _load_dataset(dataroot, name):
 42 |     """Load entries
 43 | 
 44 |     dataroot: root path of dataset
 45 |     name: 'train', 'dev', 'test'
 46 |     """
 47 |     if name == "train" or name == "dev" or name == "test":
 48 |         annotations_path = os.path.join(dataroot, "snli_ve_%s.jsonl" % name)
 49 |         with jsonlines.open(annotations_path) as reader:
 50 |             # Build an index which maps image id with a list of hypothesis annotations.
 51 |             items = []
 52 |             count = 0
 53 |             for annotation in reader:
 54 |                 dictionary = {}
 55 |                 dictionary["image_id"] = int(annotation["Flikr30kID"].split(".")[0])
 56 |                 dictionary["question_id"] = count
 57 |                 dictionary["hypothesis"] = str(annotation["sentence2"])
 58 |                 if str(annotation["gold_label"]) == "-":
 59 |                     dictionary["labels"] = []
 60 |                     dictionary["scores"] = []
 61 |                 else:
 62 |                     dictionary["labels"] = [
 63 |                         int(LABEL_MAP[str(annotation["gold_label"])])
 64 |                     ]
 65 |                     dictionary["scores"] = [1.0]
 66 |                 items.append(dictionary)
 67 |                 count += 1
 68 |     else:
 69 |         assert False, "data split is not recognized."
 70 |     entries = []
 71 |     for item in items:
 72 |         entries.append(_create_entry(item))
 73 |     return entries
 74 | 
 75 | 
 76 | class VisualEntailmentDataset(Dataset):
 77 |     def __init__(
 78 |         self,
 79 |         task: str,
 80 |         dataroot: str,
 81 |         annotations_jsonpath: str,
 82 |         split: str,
 83 |         image_features_reader: ImageFeaturesH5Reader,
 84 |         gt_image_features_reader: ImageFeaturesH5Reader,
 85 |         tokenizer: AutoTokenizer,
 86 |         bert_model,
 87 |         padding_index: int = 0,
 88 |         max_seq_length: int = 16,
 89 |         max_region_num: int = 37,
 90 |         num_locs=5,
 91 |         add_global_imgfeat=None,
 92 |         append_mask_sep=False,
 93 |     ):
 94 |         super().__init__()
 95 |         self.split = split
 96 |         self.num_labels = 3
 97 |         self._max_region_num = max_region_num
 98 |         self._max_seq_length = max_seq_length
 99 |         self._image_features_reader = image_features_reader
100 |         self._tokenizer = tokenizer
101 |         self._padding_index = padding_index
102 |         self._num_locs = num_locs
103 |         self._add_global_imgfeat = add_global_imgfeat
104 | 
105 |         if "roberta" in bert_model:
106 |             cache_path = os.path.join(
107 |                 dataroot,
108 |                 "cache",
109 |                 task
110 |                 + "_"
111 |                 + split
112 |                 + "_"
113 |                 + "roberta"
114 |                 + "_"
115 |                 + str(max_seq_length)
116 |                 + ".pkl",
117 |             )
118 |         else:
119 |             cache_path = os.path.join(
120 |                 dataroot,
121 |                 "cache",
122 |                 task
123 |                 + "_"
124 |                 + split
125 |                 + "_"
126 |                 + str(max_seq_length)
127 |                 + ".pkl",
128 |             )
129 |         if not os.path.exists(cache_path):
130 |             self.entries = _load_dataset(dataroot, split)
131 |             self.tokenize(max_seq_length)
132 |             self.tensorize()
133 |             cPickle.dump(self.entries, open(cache_path, "wb"))
134 |         else:
135 |             logger.info("Loading from %s" % cache_path)
136 |             self.entries = cPickle.load(open(cache_path, "rb"))
137 | 
138 |     def tokenize(self, max_length=16):
139 |         """Tokenizes the questions.
140 | 
141 |         This will add q_token in each entry of the dataset.
142 |         -1 represent nil, and should be treated as padding_index in embedding
143 |         """
144 |         for entry in self.entries:
145 |             tokens = self._tokenizer.encode(entry["hypothesis"])
146 |             tokens = [tokens[0]] + tokens[1:-1][: self._max_seq_length - 2] + [tokens[-1]]
147 | 
148 |             segment_ids = [0] * len(tokens)
149 |             input_mask = [1] * len(tokens)
150 | 
151 |             if len(tokens) < max_length:
152 |                 # Note here we pad in front of the sentence
153 |                 padding = [self._padding_index] * (max_length - len(tokens))
154 |                 tokens = tokens + padding
155 |                 input_mask += padding
156 |                 segment_ids += padding
157 | 
158 |             assert_eq(len(tokens), max_length)
159 |             entry["q_token"] = tokens
160 |             entry["q_input_mask"] = input_mask
161 |             entry["q_segment_ids"] = segment_ids
162 | 
163 |     def tensorize(self):
164 |         for entry in self.entries:
165 |             question = torch.from_numpy(np.array(entry["q_token"]))
166 |             entry["q_token"] = question
167 | 
168 |             q_input_mask = torch.from_numpy(np.array(entry["q_input_mask"]))
169 |             entry["q_input_mask"] = q_input_mask
170 | 
171 |             q_segment_ids = torch.from_numpy(np.array(entry["q_segment_ids"]))
172 |             entry["q_segment_ids"] = q_segment_ids
173 | 
174 |             answer = entry["answer"]
175 |             labels = np.array(answer["labels"])
176 |             scores = np.array(answer["scores"], dtype=np.float32)
177 |             if len(labels):
178 |                 labels = torch.from_numpy(labels)
179 |                 scores = torch.from_numpy(scores)
180 |                 entry["answer"]["labels"] = labels
181 |                 entry["answer"]["scores"] = scores
182 |             else:
183 |                 entry["answer"]["labels"] = None
184 |                 entry["answer"]["scores"] = None
185 | 
186 |     def __getitem__(self, index):
187 |         entry = self.entries[index]
188 |         image_id = entry["image_id"]
189 |         question_id = entry["question_id"]
190 |         features, num_boxes, boxes, _ = self._image_features_reader[image_id]
191 | 
192 |         mix_num_boxes = min(int(num_boxes), self._max_region_num)
193 |         mix_boxes_pad = np.zeros((self._max_region_num, self._num_locs))
194 |         mix_features_pad = np.zeros((self._max_region_num, 2048))
195 | 
196 |         image_mask = [1] * (int(mix_num_boxes))
197 |         while len(image_mask) < self._max_region_num:
198 |             image_mask.append(0)
199 | 
200 |         mix_boxes_pad[:mix_num_boxes] = boxes[:mix_num_boxes]
201 |         mix_features_pad[:mix_num_boxes] = features[:mix_num_boxes]
202 | 
203 |         features = torch.tensor(mix_features_pad).float()
204 |         image_mask = torch.tensor(image_mask).long()
205 |         spatials = torch.tensor(mix_boxes_pad).float()
206 | 
207 |         hypothesis = entry["q_token"]
208 |         input_mask = entry["q_input_mask"]
209 |         segment_ids = entry["q_segment_ids"]
210 | 
211 |         target = torch.zeros(self.num_labels)
212 | 
213 |         answer = entry["answer"]
214 |         labels = answer["labels"]
215 |         scores = answer["scores"]
216 |         if labels is not None:
217 |             target.scatter_(0, labels, scores)
218 | 
219 |         return features, spatials, image_mask, hypothesis, target, input_mask, segment_ids, question_id
220 | 
221 |     def __len__(self):
222 |         return len(self.entries)
223 | 


--------------------------------------------------------------------------------
/baselines/crossencoders/volta/datasets/visual_genome_dataset.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) Facebook, Inc. and its affiliates.
  2 | 
  3 | # This source code is licensed under the MIT license found in the
  4 | # LICENSE file in the root directory of this source tree.
  5 | 
  6 | import os
  7 | import json
  8 | import _pickle as cPickle
  9 | import logging
 10 | 
 11 | import numpy as np
 12 | import torch
 13 | from torch.utils.data import Dataset
 14 | from transformers import AutoTokenizer
 15 | 
 16 | from ._image_features_reader import ImageFeaturesH5Reader
 17 | 
# Module-level logger named after this module.
logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
# Disable HDF5 file locking for this process (set at import time).
# NOTE(review): presumably needed so the image-feature files can be opened by several
# readers/workers at once — confirm against the feature reader.
os.environ["HDF5_USE_FILE_LOCKING"] = "FALSE"
 20 | 
 21 | 
 22 | def assert_eq(real, expected):
 23 |     assert real == expected, "%s (true) vs %s (expected)" % (real, expected)
 24 | 
 25 | 
 26 | def _create_entry(item):
 27 |     entry = {
 28 |         "question_id": item["question_id"],
 29 |         "image_id": item["image_id"],
 30 |         "question": item["question"],
 31 |         "answer": item,
 32 |     }
 33 |     return entry
 34 | 
 35 | 
 36 | def _load_dataset(dataroot, name, clean_datasets):
 37 |     """Load entries
 38 | 
 39 |     dataroot: root path of dataset
 40 |     name: 'train', 'val'
 41 |     """
 42 |     if name == "train":
 43 |         items_path = os.path.join(dataroot, "cache", "trainval_target.pkl")
 44 |         items = cPickle.load(open(items_path, "rb"))
 45 |         items = sorted(items, key=lambda x: x["question_id"])
 46 |         items = items[:-5000]
 47 |     elif name == "val":
 48 |         items_path = os.path.join(dataroot, "cache", "trainval_target.pkl")
 49 |         items = cPickle.load(open(items_path, "rb"))
 50 |         items = sorted(items, key=lambda x: x["question_id"])
 51 |         items = items[-5000:]
 52 |     else:
 53 |         assert False, "data split is not recognized."
 54 | 
 55 |     if "test" in name:
 56 |         entries = []
 57 |         for item in items:
 58 |             entries.append(item)
 59 |     else:
 60 |         entries = []
 61 |         remove_ids = []
 62 |         if clean_datasets:
 63 |             remove_ids = np.load(os.path.join(dataroot, "cache", "genome_test_ids.npy"))
 64 |             remove_ids = [int(x) for x in remove_ids]
 65 |         for item in items:
 66 |             if int(item["image_id"]) in remove_ids:
 67 |                 continue
 68 |             entries.append(_create_entry(item))
 69 |     return entries
 70 | 
 71 | 
 72 | class GenomeQAClassificationDataset(Dataset):
 73 |     def __init__(
 74 |         self,
 75 |         task: str,
 76 |         dataroot: str,
 77 |         annotations_jsonpath: str,
 78 |         split: str,
 79 |         image_features_reader: ImageFeaturesH5Reader,
 80 |         gt_image_features_reader: ImageFeaturesH5Reader,
 81 |         tokenizer: AutoTokenizer,
 82 |         bert_model,
 83 |         clean_datasets,
 84 |         padding_index: int = 0,
 85 |         max_seq_length: int = 16,
 86 |         max_region_num: int = 37,
 87 |     ):
 88 |         super().__init__()
 89 |         self.split = split
 90 |         ans2label_path = os.path.join(dataroot, "cache", "trainval_ans2label.pkl")
 91 |         label2ans_path = os.path.join(dataroot, "cache", "trainval_label2ans.pkl")
 92 |         self.ans2label = cPickle.load(open(ans2label_path, "rb"))
 93 |         self.label2ans = cPickle.load(open(label2ans_path, "rb"))
 94 |         self.num_labels = len(self.ans2label)
 95 |         self._max_region_num = max_region_num
 96 |         self._max_seq_length = max_seq_length
 97 |         self._image_features_reader = image_features_reader
 98 |         self._tokenizer = tokenizer
 99 |         self._padding_index = padding_index
100 | 
101 |         clean_train = "_cleaned" if clean_datasets else ""
102 | 
103 |         if "roberta" in bert_model:
104 |             cache_path = os.path.join(
105 |                 dataroot,
106 |                 "cache",
107 |                 task
108 |                 + "_"
109 |                 + split
110 |                 + "_"
111 |                 + "roberta"
112 |                 + "_"
113 |                 + str(max_seq_length)
114 |                 + clean_train
115 |                 + ".pkl",
116 |             )
117 |         else:
118 |             cache_path = os.path.join(
119 |                 dataroot,
120 |                 "cache",
121 |                 task + "_" + split + "_" + str(max_seq_length) + clean_train + ".pkl",
122 |             )
123 | 
124 |         if not os.path.exists(cache_path):
125 |             self.entries = _load_dataset(dataroot, split, clean_datasets)
126 |             self.tokenize(max_seq_length)
127 |             self.tensorize()
128 |             cPickle.dump(self.entries, open(cache_path, "wb"))
129 |         else:
130 |             logger.info("Loading from %s" % cache_path)
131 |             self.entries = cPickle.load(open(cache_path, "rb"))
132 | 
133 |     def tokenize(self, max_length=16):
134 |         """Tokenizes the questions.
135 | 
136 |         This will add q_token in each entry of the dataset.
137 |         -1 represent nil, and should be treated as padding_index in embedding
138 |         """
139 |         for entry in self.entries:
140 |             # tokens = self._tokenizer.tokenize(entry["question"])
141 |             # tokens = ["[CLS]"] + tokens + ["[SEP]"]
142 | 
143 |             # tokens = [
144 |             #     self._tokenizer.vocab.get(w, self._tokenizer.vocab["[UNK]"])
145 |             #     for w in tokens
146 |             # ]
147 |             tokens = self._tokenizer.encode(entry["question"])
148 |             tokens = [tokens[0]] + tokens[1:-1][: self._max_seq_length - 2] + [tokens[-1]]
149 | 
150 |             segment_ids = [0] * len(tokens)
151 |             input_mask = [1] * len(tokens)
152 | 
153 |             if len(tokens) < max_length:
154 |                 # Note here we pad in front of the sentence
155 |                 padding = [self._padding_index] * (max_length - len(tokens))
156 |                 tokens = tokens + padding
157 |                 input_mask += padding
158 |                 segment_ids += padding
159 | 
160 |             assert_eq(len(tokens), max_length)
161 |             entry["q_token"] = tokens
162 |             entry["q_input_mask"] = input_mask
163 |             entry["q_segment_ids"] = segment_ids
164 | 
165 |     def tensorize(self):
166 | 
167 |         for entry in self.entries:
168 |             question = torch.from_numpy(np.array(entry["q_token"]))
169 |             entry["q_token"] = question
170 | 
171 |             q_input_mask = torch.from_numpy(np.array(entry["q_input_mask"]))
172 |             entry["q_input_mask"] = q_input_mask
173 | 
174 |             q_segment_ids = torch.from_numpy(np.array(entry["q_segment_ids"]))
175 |             entry["q_segment_ids"] = q_segment_ids
176 | 
177 |             if "test" not in self.split:
178 |                 answer = entry["answer"]
179 |                 labels = np.array(answer["labels"])
180 |                 scores = np.array(answer["scores"], dtype=np.float32)
181 |                 if len(labels):
182 |                     labels = torch.from_numpy(labels)
183 |                     scores = torch.from_numpy(scores)
184 |                     entry["answer"]["labels"] = labels
185 |                     entry["answer"]["scores"] = scores
186 |                 else:
187 |                     entry["answer"]["labels"] = None
188 |                     entry["answer"]["scores"] = None
189 | 
190 |     def __getitem__(self, index):
191 |         entry = self.entries[index]
192 |         image_id = entry["image_id"]
193 |         question_id = entry["question_id"]
194 |         features, num_boxes, boxes, _ = self._image_features_reader[image_id]
195 | 
196 |         mix_num_boxes = min(int(num_boxes), self._max_region_num)
197 |         mix_boxes_pad = np.zeros((self._max_region_num, 5))
198 |         mix_features_pad = np.zeros((self._max_region_num, 2048))
199 | 
200 |         image_mask = [1] * (int(mix_num_boxes))
201 |         while len(image_mask) < self._max_region_num:
202 |             image_mask.append(0)
203 | 
204 |         mix_boxes_pad[:mix_num_boxes] = boxes[:mix_num_boxes]
205 |         mix_features_pad[:mix_num_boxes] = features[:mix_num_boxes]
206 | 
207 |         features = torch.tensor(mix_features_pad).float()
208 |         image_mask = torch.tensor(image_mask).long()
209 |         spatials = torch.tensor(mix_boxes_pad).float()
210 | 
211 |         question = entry["q_token"]
212 |         input_mask = entry["q_input_mask"]
213 |         segment_ids = entry["q_segment_ids"]
214 | 
215 |         co_attention_mask = torch.zeros((self._max_region_num, self._max_seq_length))
216 |         target = torch.zeros(self.num_labels)
217 | 
218 |         if "test" not in self.split:
219 |             answer = entry["answer"]
220 |             labels = answer["labels"]
221 |             scores = answer["scores"]
222 |             if labels is not None:
223 |                 target.scatter_(0, labels, scores)
224 | 
225 |         return (
226 |             features,
227 |             spatials,
228 |             image_mask,
229 |             question,
230 |             target,
231 |             input_mask,
232 |             segment_ids,
233 |             co_attention_mask,
234 |             question_id,
235 |         )
236 | 
237 |     def __len__(self):
238 |         return len(self.entries)
239 | 


--------------------------------------------------------------------------------
/baselines/crossencoders/volta/extras.py:
--------------------------------------------------------------------------------
  1 | import math
  2 | import sys
  3 | 
  4 | import torch
  5 | from torch import nn
  6 | 
  7 | from .embeddings import BertLayerNorm
  8 | 
  9 | def gelu(x):
 10 |     """Implementation of the gelu activation function.
 11 |         For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
 12 |         0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
 13 |         Also see https://arxiv.org/abs/1606.08415
 14 |     """
 15 |     return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
 16 | 
 17 | def swish(x):
 18 |     return x * torch.sigmoid(x)
 19 | 
# Maps an activation name (as given by ``config.hidden_act``) to the callable implementing it.
ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish}
 21 | 
 22 | 
 23 | 
 24 | class BertSelfAttention(nn.Module):
 25 |     def __init__(self, config):
 26 |         super(BertSelfAttention, self).__init__()
 27 |         if config.hidden_size % config.num_attention_heads != 0:
 28 |             raise ValueError(
 29 |                 "The hidden size (%d) is not a multiple of the number of attention "
 30 |                 "heads (%d)" % (config.hidden_size, config.num_attention_heads))
 31 |         self.num_attention_heads = config.num_attention_heads
 32 |         self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
 33 |         self.all_head_size = self.num_attention_heads * self.attention_head_size
 34 | 
 35 |         self.query = nn.Linear(config.hidden_size, self.all_head_size)
 36 |         self.key = nn.Linear(config.hidden_size, self.all_head_size)
 37 |         self.value = nn.Linear(config.hidden_size, self.all_head_size)
 38 | 
 39 |         self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
 40 | 
 41 |     def transpose_for_scores(self, x):
 42 |         new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
 43 |         x = x.view(*new_x_shape)
 44 |         return x.permute(0, 2, 1, 3)
 45 | 
 46 |     def forward(self, hidden_states, attention_mask=None):
 47 |         mixed_query_layer = self.query(hidden_states)
 48 |         mixed_key_layer = self.key(hidden_states)
 49 |         mixed_value_layer = self.value(hidden_states)
 50 | 
 51 |         query_layer = self.transpose_for_scores(mixed_query_layer)
 52 |         key_layer = self.transpose_for_scores(mixed_key_layer)
 53 |         value_layer = self.transpose_for_scores(mixed_value_layer)
 54 | 
 55 |         # Take the dot product between "query" and "key" to get the raw attention scores.
 56 |         attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
 57 |         attention_scores = attention_scores / math.sqrt(self.attention_head_size)
 58 |         # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
 59 |         if attention_mask is not None:
 60 |             attention_scores = attention_scores + attention_mask
 61 | 
 62 |         # Normalize the attention scores to probabilities.
 63 |         attention_probs = nn.Softmax(dim=-1)(attention_scores)
 64 | 
 65 |         # This is actually dropping out entire tokens to attend to, which might
 66 |         # seem a bit unusual, but is taken from the original Transformer paper.
 67 |         attention_probs = self.dropout(attention_probs)
 68 | 
 69 |         context_layer = torch.matmul(attention_probs, value_layer)
 70 |         context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
 71 |         new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
 72 |         context_layer = context_layer.view(*new_context_layer_shape)
 73 |         return context_layer
 74 | 
 75 | 
 76 | class BertSelfOutput(nn.Module):
 77 |     def __init__(self, config):
 78 |         super(BertSelfOutput, self).__init__()
 79 |         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
 80 |         self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
 81 |         self.dropout = nn.Dropout(config.hidden_dropout_prob)
 82 | 
 83 |     def forward(self, hidden_states, input_tensor):
 84 |         hidden_states = hidden_states
 85 |         hidden_states = self.dense(hidden_states)
 86 |         hidden_states = self.dropout(hidden_states)
 87 |         hidden_states = self.LayerNorm(hidden_states + input_tensor)
 88 |         return hidden_states
 89 | 
 90 | 
 91 | class BertAttention(nn.Module):
 92 |     def __init__(self, config):
 93 |         super(BertAttention, self).__init__()
 94 |         self.self = BertSelfAttention(config)
 95 |         self.output = BertSelfOutput(config).cuda()
 96 | 
 97 |     def forward(self, input_tensor, attention_mask):
 98 |         self_output = self.self(input_tensor, attention_mask)
 99 |         attention_output = self.output(self_output, input_tensor)
100 |         return attention_output
101 | 
102 | 
class BertIntermediate(nn.Module):
    """Feed-forward expansion: linear layer to ``intermediate_size`` plus activation."""

    def __init__(self, config):
        """Build the layer.

        ``config.hidden_act`` is either an activation name looked up in ``ACT2FN``
        or a callable that is used directly.
        """
        super(BertIntermediate, self).__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        hidden_act = config.hidden_act
        if isinstance(hidden_act, str):
            self.intermediate_act_fn = ACT2FN[hidden_act]
        else:
            self.intermediate_act_fn = hidden_act

    def forward(self, hidden_states):
        """Return ``act(dense(hidden_states))``."""
        return self.intermediate_act_fn(self.dense(hidden_states))
116 | 
117 | 
class BertOutput(nn.Module):
    """Feed-forward contraction back to ``hidden_size`` with dropout and residual layer-norm."""

    def __init__(self, config):
        super(BertOutput, self).__init__()
        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
        self.LayerNorm = BertLayerNorm(config.hidden_size, eps=1e-12)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(self, hidden_states, input_tensor):
        """Return ``LayerNorm(dropout(dense(hidden_states)) + input_tensor)``."""
        contracted = self.dropout(self.dense(hidden_states))
        return self.LayerNorm(contracted + input_tensor)
130 | 
131 | 
class BertLayer(nn.Module):
    """One transformer encoder layer: attention block followed by the feed-forward block."""

    def __init__(self, config):
        super(BertLayer, self).__init__()
        self.attention = BertAttention(config)
        self.intermediate = BertIntermediate(config)
        self.output = BertOutput(config)

    def forward(self, hidden_states, attention_mask):
        """Run attention, then the feed-forward block with the attention output as residual."""
        attended = self.attention(hidden_states, attention_mask)
        expanded = self.intermediate(attended)
        return self.output(expanded, attended)
144 | 
145 | 
146 | """
147 | The above modules are copied from BERT.
148 | """
149 | 
150 | 
class InputFeatures(object):
    """Container for the features of one sentence: token ids, attention mask, segment ids."""

    def __init__(self, input_ids, input_mask, segment_ids):
        self.input_ids, self.input_mask, self.segment_ids = input_ids, input_mask, segment_ids
158 | 
159 | 
def convert_sents_to_features(sents, max_seq_length, tokenizer):
    """Convert sentences into ``InputFeatures`` padded to the longest sentence of the batch.

    Each sentence is stripped, tokenized, truncated to ``max_seq_length - 2`` tokens
    and wrapped in ``[CLS]`` / ``[SEP]``. Ids, mask and segment ids are then
    zero-padded to the length of the longest tokenized sentence (not to
    ``max_seq_length``).

    Args:
        sents: iterable of strings.
        max_seq_length: maximum length including the two special tokens.
        tokenizer: object providing ``tokenize(text)`` and ``convert_tokens_to_ids(tokens)``.

    Returns:
        A list with one ``InputFeatures`` per sentence; an empty list for empty input.
    """
    tokenized_sentences = []
    for sent in sents:
        # Account for [CLS] and [SEP] with "- 2".
        tokens_a = tokenizer.tokenize(sent.strip())[:(max_seq_length - 2)]
        # Keep segment id which allows loading BERT-weights.
        tokenized_sentences.append(["[CLS]"] + tokens_a + ["[SEP]"])

    if not tokenized_sentences:
        # Nothing to pad against: ``max()`` below would raise on an empty sequence.
        return []

    max_len = max(len(tokens) for tokens in tokenized_sentences)

    features = []
    for tokens in tokenized_sentences:
        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # Zero-pad up to the batch length. The mask has 1 for real tokens and 0 for
        # padding tokens; only real tokens are attended to.
        padding = [0] * (max_len - len(input_ids))
        input_mask = [1] * len(input_ids) + padding
        segment_ids = [0] * len(tokens) + padding
        input_ids = input_ids + padding

        assert len(input_ids) == max_len
        assert len(input_mask) == max_len
        assert len(segment_ids) == max_len

        features.append(
            InputFeatures(input_ids=input_ids,
                          input_mask=input_mask,
                          segment_ids=segment_ids))
    return features
203 | 
204 | 
205 | """
206 | The above modules are copied from LXMERT.
207 | """
208 | 


--------------------------------------------------------------------------------
/baselines/crossencoders/volta/losses.py:
--------------------------------------------------------------------------------
  1 | # Copyright (c) 2020, Emanuele Bugliarello (@e-bug).
  2 | 
  3 | # This source code is licensed under the MIT license found in the
  4 | # LICENSE file in the root directory of this source tree.
  5 | 
  6 | import copy
  7 | 
  8 | import torch
  9 | import torch.nn as nn
 10 | import torch.nn.functional as F
 11 | 
 12 | 
 13 | # ==================================================================================================================== #
 14 | #                                                  Vision Pretraining                                                  #
 15 | # ==================================================================================================================== #
 16 | def kl_1601(prediction_scores_v, weight, label, image_cls, image_feat, obj_labels, obj_confs, attr_labels, attr_confs):
 17 |     if (weight > 0) and (image_cls is not None):
 18 |         image_target = image_cls
 19 |         loss = nn.KLDivLoss(reduction="none")(F.log_softmax(prediction_scores_v, dim=2), image_target)
 20 |         return weight * torch.sum(loss * (label == 1).unsqueeze(2).float()) / max(torch.sum((label == 1)), 1)
 21 |     else:
 22 |         return 0
 23 | 
 24 | 
 25 | def mse_2048(prediction_scores_v, weight, label, image_cls, image_feat, obj_labels, obj_confs, attr_labels, attr_confs):
 26 |     # regress the feature
 27 |     if (weight > 0) and (image_feat is not None):
 28 |         image_target = copy.deepcopy(image_feat)
 29 |         loss = nn.MSELoss(reduction="none")(prediction_scores_v, image_target)
 30 |         return weight * torch.sum(loss * (label == 1).unsqueeze(2).float()) / \
 31 |             max(torch.sum((label == 1).unsqueeze(2).expand_as(loss)), 1)
 32 |     else:
 33 |         return 0
 34 | 
 35 | 
 36 | def nce_2048(prediction_scores_v, weight, label, image_cls, image_feat, obj_labels, obj_confs, attr_labels, attr_confs):
 37 |     # NCE loss
 38 |     num_negative = 128
 39 |     if (weight > 0) and (image_feat is not None):
 40 | 
 41 |         image_target = copy.deepcopy(image_feat)
 42 | 
 43 |         # generate negative sampled index.
 44 |         num_across_batch = int(num_negative * 0.7)
 45 |         num_inside_batch = int(num_negative * 0.3)
 46 | 
 47 |         batch_size, num_regions, _ = prediction_scores_v.size()
 48 |         assert batch_size != 0
 49 |         # random negative across batches.
 50 |         row_across_index = image_target.new(batch_size, num_regions, num_across_batch).random_(0, batch_size - 1)
 51 |         col_across_index = image_target.new(batch_size, num_regions, num_across_batch).random_(0, num_regions)
 52 | 
 53 |         for i in range(batch_size - 1):
 54 |             row_across_index[i][row_across_index[i] == i] = batch_size - 1
 55 |         final_across_index = row_across_index * num_regions + col_across_index
 56 | 
 57 |         # random negative inside batches.
 58 |         row_inside_index = image_target.new(batch_size, num_regions, num_inside_batch).zero_()
 59 |         col_inside_index = image_target.new(batch_size, num_regions, num_inside_batch).random_(0, num_regions - 1)
 60 | 
 61 |         for i in range(batch_size):
 62 |             row_inside_index[i] = i
 63 |         for i in range(num_regions - 1):
 64 |             col_inside_index[:, i, :][col_inside_index[:, i, :] == i] = num_regions - 1
 65 |         final_inside_index = row_inside_index * num_regions + col_inside_index
 66 | 
 67 |         final_index = torch.cat((final_across_index, final_inside_index), dim=2)
 68 | 
 69 |         # Let's first sample where we need to compute.
 70 |         predict_v = prediction_scores_v[label == 1]
 71 |         neg_index_v = final_index[label == 1]
 72 | 
 73 |         flat_image_target = image_target.view(batch_size * num_regions, -1)
 74 |         # we also need to append the target feature at the beginning.
 75 |         negative_v = flat_image_target[neg_index_v]
 76 |         positive_v = image_target[label == 1]
 77 |         sample_v = torch.cat((positive_v.unsqueeze(1), negative_v), dim=1)
 78 | 
 79 |         # calculate the loss.
 80 |         score = torch.bmm(sample_v, predict_v.unsqueeze(2)).squeeze(2)
 81 |         return weight * nn.CrossEntropyLoss()(score, image_target.new(score.size(0)).zero_())
 82 |     else:
 83 |         return 0
 84 | 
 85 | 
 86 | def xent_1600(prediction_scores_v, weight, label, image_cls, image_feat, obj_labels, obj_confs, attr_labels, attr_confs):
 87 |     if (weight > 0) and (obj_labels is not None) and (obj_confs is not None):
 88 |         # hard object labels
 89 |         image_target, mask_conf = obj_labels, obj_confs
 90 |         loss = nn.CrossEntropyLoss(reduction='none')(prediction_scores_v.reshape(-1, 1600), image_target.view(-1,))
 91 |         loss = loss * mask_conf.view(-1)
 92 |         return weight * torch.sum(loss * (label.view(-1) == 1)) / max(torch.sum((label == 1)), 1)
 93 |     else:
 94 |         return 0
 95 | 
 96 | 
 97 | def xent_400(prediction_scores_v, weight, label, image_cls, image_feat, obj_labels, obj_confs, attr_labels, attr_confs):
 98 |     if (weight > 0) and (attr_labels is not None) and (attr_confs is not None):
 99 |         # hard attribute labels
100 |         image_target, mask_conf = attr_labels, attr_confs
101 |         loss = nn.CrossEntropyLoss(reduction='none')(prediction_scores_v.reshape(-1, 400), image_target.view(-1,))
102 |         loss = loss * mask_conf.view(-1)
103 |         return weight * torch.sum(loss * (label.view(-1) == 1)) / max(torch.sum((label == 1)), 1)
104 |     else:
105 |         return 0
106 | 
107 | 
def huber_2048(prediction_scores_v, weight, label, image_cls, image_feat, obj_labels, obj_confs, attr_labels, attr_confs):
    """Smooth-L1 (Huber) regression loss between predicted and target region features.

    Only regions with ``label == 1`` contribute; the sum is divided by the number
    of selected feature elements (at least 1) and scaled by ``weight``.
    Returns 0 when ``weight`` is not positive or ``image_feat`` is missing.
    """
    if not ((weight > 0) and (image_feat is not None)):
        return 0
    # regress the feature
    image_target = copy.deepcopy(image_feat)
    per_element = F.smooth_l1_loss(prediction_scores_v, image_target, reduction='none')
    region_mask = (label == 1).unsqueeze(2)
    numerator = weight * torch.sum(per_element * region_mask.float())
    return numerator / max(torch.sum(region_mask.expand_as(per_element)), 1)
117 | 
118 | 
def xent_1601(prediction_scores_v, weight, label, image_cls, image_feat, obj_labels, obj_confs, attr_labels, attr_confs):
    """Cross-entropy over 1601 object classes (no confidence weighting).

    Per-region losses of the regions with ``label == 1`` are summed, divided by
    the number of such regions (at least 1) and scaled by ``weight``.
    Returns 0 when ``weight`` is not positive or ``obj_labels`` is missing.
    """
    if not ((weight > 0) and (obj_labels is not None)):
        return 0
    # hard object labels
    logits = prediction_scores_v.reshape(-1, 1601)
    per_region = F.cross_entropy(logits, obj_labels.view(-1,), reduction='none')
    selected = label.view(-1) == 1
    return weight * torch.sum(per_region * selected) / max(torch.sum((label == 1)), 1)
127 | 
128 | 
# Integer associated with each visual pre-training objective id (string key).
# NOTE(review): presumably the output size of the visual prediction head for that
# objective (it matches the numeric suffix of the loss function below) — confirm
# against the model code.
pre_vis_targets = {
    "0": 1601,
    "1": 2048,
    "2": 2048,
    "3": 1600,
    "4": 400,
    "5": 2048,
    "6": 1601
}

# Loss function for each visual pre-training objective id; same keys as
# `pre_vis_targets`. All functions share one signature:
# (prediction_scores_v, weight, label, image_cls, image_feat,
#  obj_labels, obj_confs, attr_labels, attr_confs).
pre_vis_criterions = {
    "0": kl_1601,
    "1": mse_2048,
    "2": nce_2048,
    "3": xent_1600,
    "4": xent_400,
    "5": huber_2048,
    "6": xent_1601,
}
148 | 


--------------------------------------------------------------------------------
/baselines/crossencoders/volta/optimization.py:
--------------------------------------------------------------------------------
  1 | # This source code is licensed under the MIT license found in the
  2 | # LICENSE file in the root directory of this source tree.
  3 | 
  4 | import math
  5 | import torch
  6 | from torch.optim import Optimizer
  7 | 
  8 | 
  9 | class RAdam(Optimizer):
 10 |     def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0):
 11 |         defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
 12 |         self.buffer = [[None, None, None] for ind in range(10)]
 13 |         super(RAdam, self).__init__(params, defaults)
 14 | 
 15 |     def __setstate__(self, state):
 16 |         super(RAdam, self).__setstate__(state)
 17 | 
 18 |     def step(self, closure=None):
 19 | 
 20 |         loss = None
 21 |         if closure is not None:
 22 |             loss = closure()
 23 | 
 24 |         for group in self.param_groups:
 25 | 
 26 |             for p in group["params"]:
 27 |                 if p.grad is None:
 28 |                     continue
 29 |                 grad = p.grad.data.float()
 30 |                 if grad.is_sparse:
 31 |                     raise RuntimeError("RAdam does not support sparse gradients")
 32 | 
 33 |                 p_data_fp32 = p.data.float()
 34 | 
 35 |                 state = self.state[p]
 36 | 
 37 |                 if len(state) == 0:
 38 |                     state["step"] = 0
 39 |                     state["exp_avg"] = torch.zeros_like(p_data_fp32)
 40 |                     state["exp_avg_sq"] = torch.zeros_like(p_data_fp32)
 41 |                 else:
 42 |                     state["exp_avg"] = state["exp_avg"].type_as(p_data_fp32)
 43 |                     state["exp_avg_sq"] = state["exp_avg_sq"].type_as(p_data_fp32)
 44 | 
 45 |                 exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]
 46 |                 beta1, beta2 = group["betas"]
 47 | 
 48 |                 exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
 49 |                 exp_avg.mul_(beta1).add_(1 - beta1, grad)
 50 | 
 51 |                 state["step"] += 1
 52 |                 buffered = self.buffer[int(state["step"] % 10)]
 53 |                 if state["step"] == buffered[0]:
 54 |                     N_sma, step_size = buffered[1], buffered[2]
 55 |                 else:
 56 |                     buffered[0] = state["step"]
 57 |                     beta2_t = beta2 ** state["step"]
 58 |                     N_sma_max = 2 / (1 - beta2) - 1
 59 |                     N_sma = N_sma_max - 2 * state["step"] * beta2_t / (1 - beta2_t)
 60 |                     buffered[1] = N_sma
 61 | 
 62 |                     # more conservative since it's an approximated value
 63 |                     if N_sma >= 5:
 64 |                         step_size = (
 65 |                             group["lr"]
 66 |                             * math.sqrt(
 67 |                                 (1 - beta2_t)
 68 |                                 * (N_sma - 4)
 69 |                                 / (N_sma_max - 4)
 70 |                                 * (N_sma - 2)
 71 |                                 / N_sma
 72 |                                 * N_sma_max
 73 |                                 / (N_sma_max - 2)
 74 |                             )
 75 |                             / (1 - beta1 ** state["step"])
 76 |                         )
 77 |                     else:
 78 |                         step_size = group["lr"] / (1 - beta1 ** state["step"])
 79 |                     buffered[2] = step_size
 80 | 
 81 |                 if group["weight_decay"] != 0:
 82 |                     p_data_fp32.add_(-group["weight_decay"] * group["lr"], p_data_fp32)
 83 | 
 84 |                 # more conservative since it's an approximated value
 85 |                 if N_sma >= 5:
 86 |                     denom = exp_avg_sq.sqrt().add_(group["eps"])
 87 |                     p_data_fp32.addcdiv_(-step_size, exp_avg, denom)
 88 |                 else:
 89 |                     p_data_fp32.add_(-step_size, exp_avg)
 90 | 
 91 |                 p.data.copy_(p_data_fp32)
 92 | 
 93 |         return loss
 94 | 
 95 | 
 96 | class PlainRAdam(Optimizer):
 97 |     def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0):
 98 |         defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
 99 | 
100 |         super(PlainRAdam, self).__init__(params, defaults)
101 | 
102 |     def __setstate__(self, state):
103 |         super(PlainRAdam, self).__setstate__(state)
104 | 
105 |     def step(self, closure=None):
106 | 
107 |         loss = None
108 |         if closure is not None:
109 |             loss = closure()
110 | 
111 |         for group in self.param_groups:
112 | 
113 |             for p in group["params"]:
114 |                 if p.grad is None:
115 |                     continue
116 |                 grad = p.grad.data.float()
117 |                 if grad.is_sparse:
118 |                     raise RuntimeError("RAdam does not support sparse gradients")
119 | 
120 |                 p_data_fp32 = p.data.float()
121 | 
122 |                 state = self.state[p]
123 | 
124 |                 if len(state) == 0:
125 |                     state["step"] = 0
126 |                     state["exp_avg"] = torch.zeros_like(p_data_fp32)
127 |                     state["exp_avg_sq"] = torch.zeros_like(p_data_fp32)
128 |                 else:
129 |                     state["exp_avg"] = state["exp_avg"].type_as(p_data_fp32)
130 |                     state["exp_avg_sq"] = state["exp_avg_sq"].type_as(p_data_fp32)
131 | 
132 |                 exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]
133 |                 beta1, beta2 = group["betas"]
134 | 
135 |                 exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
136 |                 exp_avg.mul_(beta1).add_(1 - beta1, grad)
137 | 
138 |                 state["step"] += 1
139 |                 beta2_t = beta2 ** state["step"]
140 |                 N_sma_max = 2 / (1 - beta2) - 1
141 |                 N_sma = N_sma_max - 2 * state["step"] * beta2_t / (1 - beta2_t)
142 | 
143 |                 if group["weight_decay"] != 0:
144 |                     p_data_fp32.add_(-group["weight_decay"] * group["lr"], p_data_fp32)
145 | 
146 |                 # more conservative since it's an approximated value
147 |                 if N_sma >= 5:
148 |                     step_size = (
149 |                         group["lr"]
150 |                         * math.sqrt(
151 |                             (1 - beta2_t)
152 |                             * (N_sma - 4)
153 |                             / (N_sma_max - 4)
154 |                             * (N_sma - 2)
155 |                             / N_sma
156 |                             * N_sma_max
157 |                             / (N_sma_max - 2)
158 |                         )
159 |                         / (1 - beta1 ** state["step"])
160 |                     )
161 |                     denom = exp_avg_sq.sqrt().add_(group["eps"])
162 |                     p_data_fp32.addcdiv_(-step_size, exp_avg, denom)
163 |                 else:
164 |                     step_size = group["lr"] / (1 - beta1 ** state["step"])
165 |                     p_data_fp32.add_(-step_size, exp_avg)
166 | 
167 |                 p.data.copy_(p_data_fp32)
168 | 
169 |         return loss
170 | 


--------------------------------------------------------------------------------
/baselines/crossencoders/zero_shot.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import sys
  3 | import json
  4 | import yaml
  5 | import random
  6 | import logging
  7 | import argparse
  8 | from io import open
  9 | from tqdm import tqdm
 10 | from easydict import EasyDict as edict
 11 | 
 12 | import numpy as np
 13 | 
 14 | import torch
 15 | import torch.nn as nn
 16 | import torch.distributed as dist
 17 | 
 18 | from volta.config import BertConfig
 19 | from volta.encoders import BertForVLTasks, BertForVLPreTraining
 20 | from volta.train_utils import tbLogger
 21 | from volta.task_utils import LoadDatasetEval, LoadLoss, EvaluatingModel
 22 | 
 23 | 
 24 | logging.basicConfig(
 25 |     format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
 26 |     datefmt="%m/%d/%Y %H:%M:%S",
 27 |     level=logging.INFO,
 28 | )
 29 | logger = logging.getLogger(__name__)
 30 | 
 31 | 
 32 | def parse_args():
 33 |     parser = argparse.ArgumentParser()
 34 | 
 35 |     # Model
 36 |     parser.add_argument("--from_pretrained", default="bert-base-uncased", type=str,
 37 |                         help="Bert pre-trained model selected in the list: bert-base-uncased, "
 38 |                              "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
 39 |     parser.add_argument("--config_file", default="config/bert_config.json", type=str,
 40 |                         help="The config file which specified the model details.")
 41 |     # Output
 42 |     parser.add_argument("--output_dir", default="results", type=str,
 43 |                         help="The output directory where the model checkpoints will be written.")
 44 |     parser.add_argument("--save_name", default="", type=str,
 45 |                         help="save name for training.")
 46 |     # Task
 47 |     parser.add_argument("--tasks_config_file", default="config_tasks/vilbert_trainval_tasks.yml", type=str,
 48 |                         help="The config file which specified the tasks details.")
 49 |     parser.add_argument("--task", default="", type=str,
 50 |                         help="training task number")
 51 |     # Evaluation
 52 |     parser.add_argument("--split", default="", type=str,
 53 |                         help="which split to use.")
 54 |     parser.add_argument("--batch_size", default=30, type=int,
 55 |                         help="batch size.")
 56 |     parser.add_argument("--drop_last", action="store_true",
 57 |                         help="whether to drop last incomplete batch")
 58 |     # Seed
 59 |     parser.add_argument("--seed", type=int, default=42,
 60 |                         help="random seed for initialization")
 61 |     # Distributed
 62 |     parser.add_argument("--local_rank", type=int, default=-1,
 63 |                         help="local_rank for distributed training on gpus")
 64 |     parser.add_argument("--num_workers", type=int, default=16,
 65 |                         help="Number of workers in the dataloader.")
 66 |     parser.add_argument("--in_memory", default=False, type=bool,
 67 |                         help="whether use chunck for parallel training.")
 68 |     parser.add_argument("--use_chunk", default=0, type=float,
 69 |                         help="whether use chunck for parallel training.")
 70 | 
 71 |     return parser.parse_args()
 72 | 
 73 | 
 74 | def main():
 75 |     args = parse_args()
 76 | 
 77 |     # Devices
 78 |     if args.local_rank == -1:
 79 |         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 80 |         n_gpu = torch.cuda.device_count()
 81 |     else:
 82 |         torch.cuda.set_device(args.local_rank)
 83 |         device = torch.device("cuda", args.local_rank)
 84 |         n_gpu = 1
 85 |         torch.distributed.init_process_group(backend="nccl")
 86 |     default_gpu = False
 87 |     if dist.is_available() and args.local_rank != -1:
 88 |         rank = dist.get_rank()
 89 |         if rank == 0:
 90 |             default_gpu = True
 91 |     else:
 92 |         default_gpu = True
 93 |     logger.info(f"device: {device} n_gpu: {n_gpu}, distributed training: {bool(args.local_rank != -1)}")
 94 | 
 95 |     # Load config
 96 |     config = BertConfig.from_json_file(args.config_file)
 97 | 
 98 |     # Load task config
 99 |     with open(args.tasks_config_file, "r") as f:
100 |         task_cfg = edict(yaml.safe_load(f))
101 |     task_id = args.task.strip()
102 |     task = "TASK" + task_id
103 |     task_name = task_cfg[task]["name"]
104 |     if task_cfg[task].get("fusion_method", None):
105 |         # VL-BERT pooling for VQA
106 |         config.fusion_method = task_cfg[task]["fusion_method"]
107 | 
108 |     # Output dirs
109 |     timeStamp = args.from_pretrained.split("/")[-1] + "-" + args.save_name
110 |     savePath = os.path.join(args.output_dir, timeStamp)
111 |     if default_gpu and not os.path.exists(savePath):
112 |         os.makedirs(savePath)
113 | 
114 |     # Seed
115 |     random.seed(args.seed)
116 |     np.random.seed(args.seed)
117 |     torch.manual_seed(args.seed)
118 | 
119 |     # Dataset
120 |     batch_size, task2num_iters, dset_val, dl_val = LoadDatasetEval(args, config, task_cfg, args.task)
121 | 
122 |     # Logging
123 |     tb_logger = tbLogger(timeStamp, savePath, [task_name], [task], task2num_iters,
124 |                          1, save_logger=False, txt_name="eval.txt")
125 | 
126 |     # Model
127 | #    model = BertForVLTasks.from_pretrained(args.from_pretrained, config=config, task_cfg=task_cfg, task_ids=[task])
128 |     model = BertForVLPreTraining.from_pretrained(args.from_pretrained, config=config)
129 | 
130 |     # Optimization details
131 |     criterion = LoadLoss(task_cfg, args.task)
132 | 
133 |     # Move to GPU(s)
134 |     model.to(device)
135 |     if args.local_rank != -1:
136 |         try:
137 |             from apex.parallel import DistributedDataParallel as DDP
138 |         except ImportError:
139 |             raise ImportError(
140 |                 "Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training."
141 |             )
142 |         model = DDP(model, delay_allreduce=True)
143 |     elif n_gpu > 1:
144 |         model = nn.DataParallel(model)
145 | 
146 |     # Print summary
147 |     if default_gpu:
148 |         print("***** Running evaluation *****")
149 |         print("  Num Iters: ", task2num_iters[task])
150 |         print("  Batch size: ", batch_size)
151 | 
152 |     # Evaluate
153 |     model.eval()
154 |     results = []
155 |     others = []
156 |     for i, batch in tqdm(enumerate(dl_val), total=task2num_iters[task]):
157 |         loss, score, batch_size, results, others = EvaluatingModel(config, task_cfg, device, task, batch,
158 |                                                                    model, dl_val, criterion, results, others)
159 | 
160 |         tb_logger.step_val(0, float(loss), float(score), task, batch_size, "val")
161 |         sys.stdout.write("%d/%d\r" % (i, len(dl_val)))
162 |         sys.stdout.flush()
163 |     # save the result or evaluate the result.
164 |     ave_score = tb_logger.showLossVal(task)
165 | 
166 |     if args.split:
167 |         json_path = os.path.join(savePath, args.split)
168 |     else:
169 |         json_path = os.path.join(savePath, task_cfg[task]["val_split"])
170 |     json.dump(results, open(json_path + "_result.json", "w"), indent=2)
171 | 
172 | 
173 | if __name__ == "__main__":
174 |     main()
175 | 


--------------------------------------------------------------------------------
/data/analysis/annotator_agreement.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import random
 3 | 
 4 | import numpy as np
 5 | import krippendorff
 6 | 
 7 | raw_dataset = json.load(open('dataset.json', 'r'))
 8 | train = json.load(open('train_data.json', 'r'))
 9 | valid = json.load(open('valid_data.json', 'r'))
10 | test = json.load(open('test_data.json', 'r'))
11 | 
12 | matrix = []
13 | 
14 | # FOR TEST/VAL
15 | for img_set, val in raw_dataset.items():
16 |     if img_set not in test:
17 |         continue
18 |     for idx, info in val.items():
19 |         rets = info['data']['retrieval']
20 |         if len(rets) > 1:
21 |             correct = 0
22 |             for pred in rets:
23 |                 if pred == int(idx):
24 |                     correct += 1
25 |             if correct >= 2:
26 |                 if len(rets) == 2:
27 |                     rets = np.array(rets + [np.nan])
28 |                 else:
29 |                     rets = np.array(rets)
30 |                 matrix.append(rets)
31 | 
32 | 
33 | matrix = np.array(matrix)
34 | matrix = matrix.transpose([1, 0])
35 | print(matrix.shape)
36 | print(round(krippendorff.alpha(reliability_data=matrix, level_of_measurement='nominal'), 6))
37 | print(round(krippendorff.alpha(reliability_data=matrix, level_of_measurement='interval'), 6))
38 | 


--------------------------------------------------------------------------------
/data/analysis/annotator_bias.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | 
 3 | best_fine = json.load(open('results/clip/NOCONTRA_clip_valid_set.json', 'r'))
 4 | ann_split = json.load(open('annotator_split_valid.json', 'r'))
 5 | 
 6 | acc_unseen = 0
 7 | total_unseen = 0
 8 | acc_seen = 0
 9 | total_seen = 0
10 | 
11 | for img_set, preds in best_fine.items():
12 |     for k, v in preds.items():
13 |         if 'correct_' in k:
14 |             acc = v
15 |             idx = k[-1]
16 |             worker_type = ann_split[img_set][idx]
17 |             if worker_type == 'unseen_worker':
18 |                 acc_unseen += acc
19 |                 total_unseen += 1
20 |             elif worker_type == 'train_worker':
21 |                 acc_seen += acc
22 |                 total_seen += 1
23 |             else:
24 |                 print('wtf')
25 | 
26 | print(f'Performance on unseen workers: {round(acc_unseen/total_unseen,3)}')
27 | print(f'Performance on seen workers: {round(acc_seen/total_seen,3)}')


--------------------------------------------------------------------------------
/data/analysis/calc_accuracies.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | from collections import defaultdict
 3 | import sys
 4 | import numpy as np
 5 | import yaml
 6 | 
 7 | best_fine = json.load(open(sys.argv[1], 'r'))
 8 | 
 9 | img_acc = 0
10 | img_total = 0
11 | vid_acc = 0
12 | vid_total = 0
13 | 
14 | for img_set, preds in best_fine.items():
15 |     for k, v in preds.items():
16 |         if 'correct_' in k:
17 |             acc = v
18 |             idx = k[-1]
19 |             if 'open-images' in img_set:
20 |                 img_total += 1
21 |                 img_acc += acc
22 |             else:
23 |                 vid_total += 1
24 |                 vid_acc += acc
25 | 
26 | print(f'Accuracy of CLIP on videos: {round(vid_acc/vid_total, 3)}')
27 | print(f'Accuracy of CLIP on images: {round(img_acc/img_total, 3)}')
28 | print(f'Accuracy: {round((img_acc+vid_acc)/(img_total+vid_total), 4)}')
29 | 


--------------------------------------------------------------------------------
/data/analysis/compare_dataset_statistics.py:
--------------------------------------------------------------------------------
 1 | import matplotlib.pyplot as plt
 2 | import matplotlib
 3 | import json
 4 | from collections import defaultdict
 5 | import spacy
 6 | from collections import Counter
 7 | 
 8 | import yaml
 9 | import seaborn
seaborn.set()
seaborn.set_style('ticks')
# csfont = {'fontname':'Times New Roman'}
matplotlib.rc('font',family='Times New Roman')

nlp = spacy.load("en_core_web_sm")
nlp.add_pipe("sentencizer")


def _load_json(path):
    """Parse one JSON file and close its handle afterwards."""
    with open(path, 'r') as f:
        return json.load(f)


def _read_lines(path):
    """Return all lines of a text file and close its handle afterwards."""
    with open(path, 'r') as f:
        return f.readlines()


def _subtree_depth(node, depth=0):
    """Return the depth of the deepest token below *node*, where *node* sits at *depth*.

    Depth is tracked per token, so two tokens with the same surface text can no
    longer overwrite each other's depth.
    """
    return max([depth] + [_subtree_depth(child, depth + 1) for child in node.children])


raw_dataset = _load_json('dataset.json')
train = _load_json('train_data.json')
valid = _load_json('valid_data.json')
test = _load_json('test_data.json')

nlvr2 = _read_lines('other_datasets_rawtext/all_data_NLVR2.txt')
spot = _read_lines('other_datasets_rawtext/all_data_spotdiff.txt')
cid = _read_lines('all_data.txt')
dataset = train | valid | test
# with open("all_data.txt", 'w') as f:
#     for k, v in dataset.items():
#         for idx, text in v.items():
#             f.write(text + '\n')

plt.xlabel('number of tokens')
plt.ylabel('% of descriptions')

for name, data in {'CID': cid, 'NLVR2': nlvr2, 'spot-the-diff': spot}.items():
    print("----------------------\n\n")
    print(name)
    sent_lengths = defaultdict(int)   # number of sentences -> number of descriptions
    numb_toks = defaultdict(int)      # number of tokens -> number of descriptions
    num_descr = 0
    count_tokens = 0
    types = set()
    dep_tree_depth = 0
    all_depths = []
    for text in data:
        text = text.strip()
        doc = nlp(text)
        sentences = list(doc.sents)
        # Deepest dependency level over all sentences; 0 for an empty description.
        max_depth = max((_subtree_depth(sent.root) for sent in sentences), default=0)
        dep_tree_depth += max_depth
        all_depths.append((max_depth, text))
        sent_lengths[len(sentences)] += 1
        count = 0
        for token in doc:
            if token.pos_ != 'SPACE':
                count += 1
                types.add(token.text.lower())
        numb_toks[count] += 1
        num_descr += 1
        count_tokens += count

    print(f'Distrubtion of number of sentences per description: {sent_lengths}')
    descrs = 0
    sents = 0
    for k,v in sent_lengths.items():
        descrs += v
        sents += k * v
    print(f'Avg sentences per descr: {sents/descrs}')
    print(f'Average tokens per description: {count_tokens / num_descr}')
    print(f'Distrubtion of number of words per description: {numb_toks}')
    print(f'Number of types in dataset {len(types)}')
    print(f'Average dependency tree depth: {dep_tree_depth / num_descr}')
    all_depths = sorted(all_depths, key= lambda x: x[0], reverse=True)
    print(f'Top 20 depths {all_depths[:20]}')
    # Fraction of descriptions per token count, ordered by token count.
    tok_fractions = sorted((n_tok, n / num_descr) for n_tok, n in numb_toks.items())
    xs = [pair[0] for pair in tok_fractions]
    ys = [pair[1] for pair in tok_fractions]

    plt.plot(xs, ys, label=name)

# Finalise the figure once, after all curves are drawn.  A bare plt.grid()
# toggles the grid, so calling it once per dataset made the result depend on
# the number of datasets; request it explicitly instead.
plt.legend(loc='upper right', shadow=True, fontsize='medium')
plt.grid(True)

plt.savefig("numb_tokens.png")
97 | 


--------------------------------------------------------------------------------
/data/analysis/convert.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import math
 3 | from collections import defaultdict
 4 | import torch
 5 | from torch.nn.functional import softmax
 6 | import sys
 7 | 
 8 | filename = sys.argv[1]
 9 | split = sys.argv[2]
10 | data = json.load(open(filename, 'r'))
11 | mapping = json.load(open(f'counter2key_{split}.json', 'r'))
12 | ids = json.load(open('shortid2id.json', 'r'))
13 | 
14 | results = defaultdict(dict)
15 | for img_set, v in data.items():
16 |     real_id = ids[mapping[v[0][0]]]
17 |     real_id = real_id.split('___')[0]
18 |     target = int(str(v[0][0])[-2])
19 | 
20 |     preds = []
21 |     pred_idx = 0
22 |     best_pred = -math.inf
23 |     for j in range(10):
24 |         pred = v[j][1][0]
25 |         preds.append(pred)
26 |         if pred > best_pred:
27 |             best_pred = pred
28 |             pred_idx = j
29 | 
30 |     results[real_id].update({f'raw_preds_{target}': preds, f'model_pred_{target}': pred_idx ,f'correct_{target}': 1 if pred_idx == target else 0})
31 | 
32 | json.dump(results, open(filename, 'w'), indent=2)
33 | 


--------------------------------------------------------------------------------
/data/analysis/convert_zeroshot.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | from collections import defaultdict
 3 | import torch
 4 | from torch.nn.functional import softmax
 5 | import sys
 6 | 
 7 | data = list(json.load(open(sys.argv[1], 'r')))
 8 | mapping = json.load(open('counter2key_val.json', 'r'))
 9 | ids = json.load(open('shortid2id.json', 'r'))
10 | 
11 | results = defaultdict(dict)
12 | for i in range(0,len(data)-10, 10):
13 |     real_id = ids[mapping[str(data[i]['question_id'])]]
14 |     real_id = real_id.split('___')[0]
15 |     target = int(str(data[i]['question_id'])[-2])
16 | 
17 |     preds = []
18 |     pred_idx = 0
19 |     best_pred = 0
20 |     for j in range(10):
21 |         pred1 = data[i+j]['prediction_aligned']
22 |         pred2 = data[i+j]['prediction_notaligned']
23 |         p1, p2 = softmax(torch.Tensor([pred1, pred2]))
24 |         preds.append(p1.item())
25 |         if p1 > best_pred:
26 |             best_pred = p1
27 |             pred_idx = j
28 | 
29 |     results[real_id].update({f'raw_preds_{target}': preds, f'model_pred_{target}': pred_idx ,f'correct_{target}': 1 if pred_idx == target else 0})
30 | 
31 | json.dump(results, open(sys.argv[1], 'w'), indent=2)
32 | 


--------------------------------------------------------------------------------
/data/analysis/img_similarity.py:
--------------------------------------------------------------------------------
 1 | import json
 2 | import os
 3 | import pickle
 4 | import random
 5 | import shutil
 6 | import sys
 7 | from pathlib import Path
 8 | 
 9 | import pytesseract
10 | import yaml
11 | from tqdm import tqdm
12 | import cv2
13 | from glob import glob
14 | from PIL import Image
15 | import torch
16 | import numpy as np
17 | from decord import VideoReader, cpu
18 | from brisque import BRISQUE
19 | import traceback
20 | import clip
21 | 
# Use the GPU when torch can see one, otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print('USING DEVICE: ' + device)
# Must set jit=False for training
model, preprocess = clip.load("ViT-B/32", jit=False, device=device)
25 | 
26 | 
def encode_images(photos_batch):
    """Encode a batch of image files into L2-normalised feature vectors.

    Args:
        photos_batch: iterable of image file paths.

    Returns:
        A NumPy array with one row per image, holding the output of
        ``model.encode_image`` divided by its norm along the last dimension.
    """
    preprocessed = []
    for photo_file in photos_batch:
        # Close each image file as soon as it has been preprocessed instead of
        # keeping every handle open until garbage collection.
        with Image.open(photo_file) as photo:
            preprocessed.append(preprocess(photo))
    photos_preprocessed = torch.stack(preprocessed).to(device)

    with torch.no_grad():
        photos_features = model.encode_image(photos_preprocessed)
        photos_features /= photos_features.norm(dim=-1, keepdim=True)
    return photos_features.cpu().numpy()
35 | 
36 | 
37 | # train = json.load(open('train_data.json', 'r'))
38 | # valid = json.load(open('valid_data.json', 'r'))
39 | # test = json.load(open('test_data.json', 'r'))
40 | #
41 | # dataset = train | valid | test
42 | 
def _average(total, count, ndigits):
    """Return ``total / count`` rounded, or NaN when nothing was counted."""
    return round(total / count, ndigits) if count else float('nan')


# safe_load: yaml.load without an explicit Loader is rejected by recent PyYAML,
# and this file (written below with yaml.dump) needs nothing beyond plain types.
with open('ann_valid_data_rich.yaml', 'r') as f:
    dataset = yaml.safe_load(f)
total_images = 0
diffs_images = 0
total_vids = 0
diffs_vids = 0

for img_set, val in tqdm(list(dataset.items())):
    img_files = list((Path('games') / img_set).glob("*.jpg"))
    # Order files by the integer after the 3-character prefix of the file name;
    # Path.name keeps this independent of the path separator.
    img_files = sorted(img_files, key=lambda x: int(x.name.split('.')[0][3:]))
    img_embs = encode_images(img_files)
    for idx, details in val.items():
        target = int(idx)
        example_diff = 0
        # Euclidean distance between the target embedding and every other
        # image of the same set.
        for j in range(len(img_embs)):
            if j == target:
                continue
            dist = float(np.linalg.norm(img_embs[target] - img_embs[j]))
            example_diff += dist
            if 'open-images' in img_set:
                diffs_images += dist
                total_images += 1
            else:
                diffs_vids += dist
                total_vids += 1
        # details is the dict stored in dataset[img_set][idx], so this updates it in place.
        details['sum_image_differences'] = str(round(example_diff,4))

with open('ann_valid_data_rich.yaml', 'w') as f:
    yaml.dump(dataset, f, default_style='"', sort_keys=False)

# NOTE: the printed values are average embedding distances (larger = less similar).
print(f'Average video similarity: {_average(diffs_vids, total_vids, 4)}')
print(f'Average image similarity: {_average(diffs_images, total_images, 4)}')


--------------------------------------------------------------------------------
/data/vilbert_data_format/test_target.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/McGill-NLP/imagecode/609f07611aed2599f946c30d730f40a41af1079b/data/vilbert_data_format/test_target.pkl


--------------------------------------------------------------------------------
/data/vilbert_data_format/train_target.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/McGill-NLP/imagecode/609f07611aed2599f946c30d730f40a41af1079b/data/vilbert_data_format/train_target.pkl


--------------------------------------------------------------------------------
/data/vilbert_data_format/trainval_ans2label.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/McGill-NLP/imagecode/609f07611aed2599f946c30d730f40a41af1079b/data/vilbert_data_format/trainval_ans2label.pkl


--------------------------------------------------------------------------------
/data/vilbert_data_format/trainval_label2ans.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/McGill-NLP/imagecode/609f07611aed2599f946c30d730f40a41af1079b/data/vilbert_data_format/trainval_label2ans.pkl


--------------------------------------------------------------------------------
/data/vilbert_data_format/val_target.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/McGill-NLP/imagecode/609f07611aed2599f946c30d730f40a41af1079b/data/vilbert_data_format/val_target.pkl


--------------------------------------------------------------------------------
/example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/McGill-NLP/imagecode/609f07611aed2599f946c30d730f40a41af1079b/example.png


--------------------------------------------------------------------------------
/install.sh:
--------------------------------------------------------------------------------
# Install PyTorch 1.7.1 and torchvision from the pytorch channel.
conda install --yes -c pytorch pytorch=1.7.1 torchvision
# Install CLIP directly from its GitHub repository.
pip install git+https://github.com/openai/CLIP.git
# --yes keeps this step non-interactive, matching the first conda command.
conda install --yes -c conda-forge wandb boto3
4 | 


--------------------------------------------------------------------------------