├── .gitignore ├── CLIP-ViL ├── LICENSE ├── clip │ ├── LICENSE │ ├── __init__.py │ ├── adapter_config.py │ ├── bpe_simple_vocab_16e6.txt.gz │ ├── clip.py │ ├── model.py │ └── simple_tokenizer.py ├── data │ ├── gqa │ │ ├── trainval_ans2label.json │ │ └── trainval_label2ans.json │ ├── mscoco │ │ └── README.md │ └── vqa │ │ ├── trainval_ans2label.json │ │ └── trainval_label2ans.json ├── readme.md ├── scripts │ ├── gqa_adapters.sh │ ├── gqa_baseline.sh │ ├── pretrain.bash │ ├── snli-ve_adapters.sh │ ├── snli-ve_baseline.sh │ ├── vqa_adapters.sh │ └── vqa_baseline.sh └── src │ ├── lxrt │ ├── adapters │ │ ├── __init__.py │ │ ├── adapter_configuration.py │ │ ├── adapter_controller.py │ │ ├── adapter_hypernetwork.py │ │ ├── adapter_modeling.py │ │ ├── adapter_outputs.py │ │ ├── adapter_utils.py │ │ ├── config.py │ │ ├── hypercomplex │ │ │ ├── __init__.py │ │ │ ├── inits.py │ │ │ ├── kronecker.py │ │ │ └── layers.py │ │ └── low_rank_layer.py │ ├── entry.py │ ├── file_utils.py │ ├── modeling.py │ ├── optimization.py │ ├── tokenization.py │ └── visual_transformers.py │ ├── param.py │ ├── pretrain │ ├── lxmert_data.py │ ├── lxmert_pretrain.py │ └── qa_answer_table.py │ ├── tasks │ ├── gqa.py │ ├── gqa_data.py │ ├── gqa_model.py │ ├── snli.py │ ├── snli_data.py │ ├── vision_helpers.py │ ├── vqa.py │ ├── vqa_data.py │ └── vqa_model.py │ ├── tools │ ├── lmdb_dataset.py │ ├── load_stagte_dict.py │ ├── resize_images.py │ ├── sharearray.py │ └── vision_helpers.py │ └── utils.py ├── LICENSE ├── README.md ├── VL-T5 ├── inference │ ├── README.md │ ├── extracting_data.py │ ├── modeling_frcnn.py │ ├── processing_image.py │ ├── utils.py │ └── visualizing_image.py ├── requirements.txt ├── scripts │ ├── image │ │ ├── full_finetuning.sh │ │ ├── hyperformer.sh │ │ ├── multiple_adapters.sh │ │ ├── multiple_compacters.sh │ │ ├── multiple_lora.sh │ │ ├── multiple_prompts.sh │ │ ├── single_adapter.sh │ │ ├── single_compacter.sh │ │ ├── single_lora.sh │ │ └── single_prompt.sh │ └── video │ │ ├── full_finetuning.sh │ │ ├── single_adapter.sh │ │ ├── single_lora.sh │ │ └── single_prompt.sh └── src │ ├── activitynet.py │ ├── activitynet_data.py │ ├── activitynet_model.py │ ├── adapters │ ├── __init__.py │ ├── adapter_configuration.py │ ├── adapter_controller.py │ ├── adapter_hypernetwork.py │ ├── adapter_modeling.py │ ├── adapter_outputs.py │ ├── adapter_utils.py │ ├── config.py │ ├── hypercomplex │ │ ├── __init__.py │ │ ├── inits.py │ │ ├── kronecker.py │ │ └── layers.py │ └── low_rank_layer.py │ ├── caption.py │ ├── caption_clip_data.py │ ├── caption_data.py │ ├── caption_model.py │ ├── caption_raw_data.py │ ├── classification.py │ ├── classification_clip_data.py │ ├── classification_model.py │ ├── classification_raw_data.py │ ├── clip │ ├── __init__.py │ ├── bpe_simple_vocab_16e6.txt.gz │ ├── clip.py │ ├── model.py │ └── simple_tokenizer.py │ ├── clip_prepro_feats.py │ ├── dist_utils.py │ ├── gqa.py │ ├── gqa_clip_data.py │ ├── gqa_data.py │ ├── gqa_model.py │ ├── gqa_raw_data.py │ ├── how2qa.py │ ├── lora │ ├── __init__.py │ ├── config.py │ ├── controller.py │ ├── layers.py │ └── utils.py │ ├── mmt.py │ ├── mmt_data.py │ ├── mmt_model.py │ ├── modeling_bart.py │ ├── modeling_prefix_bart.py │ ├── modeling_t5.py │ ├── multitask.py │ ├── multitask_data.py │ ├── multitask_model.py │ ├── multitask_video.py │ ├── my_deepspeed.py │ ├── my_transformers │ ├── __init__.py │ ├── modeling_bart.py │ └── modeling_t5.py │ ├── nlvr.py │ ├── nlvr_clip_data.py │ ├── nlvr_data.py │ ├── nlvr_model.py │ ├── nlvr_raw_data.py │ ├── param.py │ 
├── preprocess.py │ ├── pretrain.py │ ├── pretrain_data.py │ ├── pretrain_model.py │ ├── pretrain_raw_data.py │ ├── pretrain_vcr.py │ ├── pretrain_vcr_data.py │ ├── prompt │ ├── __init__.py │ ├── config.py │ ├── prompt_controller.py │ └── prompt_modeling.py │ ├── qa_answer_table.py │ ├── refcoco.py │ ├── refcoco_data.py │ ├── refcoco_model.py │ ├── refcoco_utils.py │ ├── tokenization.py │ ├── trainer_base.py │ ├── tvc.py │ ├── tvqa.py │ ├── utils.py │ ├── vcr.py │ ├── vcr_data.py │ ├── vcr_model.py │ ├── video │ ├── how2qa_data.py │ ├── tvc_data.py │ ├── tvqa_data.py │ ├── tvqa_matching_data.py │ ├── tvr_data.py │ ├── video_matching_model.py │ ├── video_model.py │ └── yc2c_data.py │ ├── vis_encoder.py │ ├── vqa.py │ ├── vqa_clip_data.py │ ├── vqa_data.py │ ├── vqa_model.py │ ├── vqa_raw_data.py │ └── yc2c.py ├── assets └── vl_adapter_teaser.png ├── download_backbones.py ├── feature_extraction ├── README.md ├── coco_CLIP.py ├── coco_gt.py ├── coco_proposal.py ├── coco_val_compact.py ├── detectron2_given_box_maxnms.py ├── detectron2_proposal_maxnms.py ├── flickr30k_proposal.py ├── process.sh ├── refcocog_gt.py ├── refcocog_mattnet.py ├── tsv_to_h5.py ├── vcr_gt.py └── vcr_proposal.py ├── inference_example.ipynb └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | # Initially taken from Github's Python gitignore file 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # tests and logs 12 | tests/fixtures/* 13 | !tests/fixtures/sample_text_no_unicode.txt 14 | logs/ 15 | lightning_logs/ 16 | lang_code_data/ 17 | **/slurm* 18 | **/wandb 19 | **/snap 20 | datasets 21 | 22 | # Distribution / packaging 23 | .Python 24 | build/ 25 | develop-eggs/ 26 | dist/ 27 | downloads/ 28 | eggs/ 29 | .eggs/ 30 | lib/ 31 | lib64/ 32 | parts/ 33 | sdist/ 34 | var/ 35 | wheels/ 36 | *.egg-info/ 37 | .installed.cfg 38 | *.egg 39 | MANIFEST 40 | 41 | # PyInstaller 42 | # Usually these files are written by a python script from a template 43 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
44 | *.manifest 45 | *.spec 46 | 47 | # Installer logs 48 | pip-log.txt 49 | pip-delete-this-directory.txt 50 | 51 | # Unit test / coverage reports 52 | htmlcov/ 53 | .tox/ 54 | .nox/ 55 | .coverage 56 | .coverage.* 57 | .cache 58 | nosetests.xml 59 | coverage.xml 60 | *.cover 61 | .hypothesis/ 62 | .pytest_cache/ 63 | 64 | # Translations 65 | *.mo 66 | *.pot 67 | 68 | # Django stuff: 69 | *.log 70 | local_settings.py 71 | db.sqlite3 72 | 73 | # Flask stuff: 74 | instance/ 75 | .webassets-cache 76 | 77 | # Scrapy stuff: 78 | .scrapy 79 | 80 | # Sphinx documentation 81 | docs/_build/ 82 | 83 | # PyBuilder 84 | target/ 85 | 86 | # Jupyter Notebook 87 | .ipynb_checkpoints 88 | 89 | # IPython 90 | profile_default/ 91 | ipython_config.py 92 | 93 | # pyenv 94 | .python-version 95 | 96 | # celery beat schedule file 97 | celerybeat-schedule 98 | 99 | # SageMath parsed files 100 | *.sage.py 101 | 102 | # Environments 103 | .env 104 | .venv 105 | env/ 106 | venv/ 107 | ENV/ 108 | env.bak/ 109 | venv.bak/ 110 | 111 | # Spyder project settings 112 | .spyderproject 113 | .spyproject 114 | 115 | # Rope project settings 116 | .ropeproject 117 | 118 | # mkdocs documentation 119 | /site 120 | 121 | # mypy 122 | .mypy_cache/ 123 | .dmypy.json 124 | dmypy.json 125 | 126 | # Pyre type checker 127 | .pyre/ 128 | 129 | # vscode 130 | .vs 131 | .vscode 132 | 133 | # Pycharm 134 | .idea 135 | 136 | # TF code 137 | tensorflow_code 138 | 139 | # Models 140 | proc_data 141 | 142 | # examples 143 | runs 144 | /runs_old 145 | /wandb 146 | /examples/runs 147 | /examples/**/*.args 148 | /examples/rag/sweep 149 | 150 | # data 151 | /data 152 | serialization_dir 153 | 154 | # emacs 155 | *.*~ 156 | debug.env 157 | 158 | # vim 159 | .*.swp 160 | 161 | #ctags 162 | tags 163 | 164 | # pre-commit 165 | .pre-commit* 166 | 167 | # .lock 168 | *.lock 169 | -------------------------------------------------------------------------------- /CLIP-ViL/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Hao Tan 4 | Copyright (c) 2021 Liunian Harold Li 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 
23 | -------------------------------------------------------------------------------- /CLIP-ViL/clip/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 OpenAI 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /CLIP-ViL/clip/__init__.py: -------------------------------------------------------------------------------- 1 | from .clip import * 2 | from .adapter_config import VisionAdapterConfig 3 | -------------------------------------------------------------------------------- /CLIP-ViL/clip/adapter_config.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | 4 | @dataclass 5 | class VisionAdapterConfig(object): 6 | """Implements the adapter configuration proposed by Houlsby et. 
al, 2019 7 | in https://arxiv.org/abs/1902.00751.""" 8 | 9 | reduction_factor: int = 1 10 | -------------------------------------------------------------------------------- /CLIP-ViL/clip/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ylsung/VL_adapter/545fcbbdbbaec4c442de35567f6ae477ff4e8265/CLIP-ViL/clip/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /CLIP-ViL/clip/clip.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import os 3 | import urllib 4 | import warnings 5 | from typing import Union, List 6 | 7 | import torch 8 | from PIL import Image 9 | from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize 10 | from tqdm import tqdm 11 | 12 | from .model import build_model 13 | from .simple_tokenizer import SimpleTokenizer as _Tokenizer 14 | 15 | __all__ = ["available_models", "load", "tokenize"] 16 | _tokenizer = _Tokenizer() 17 | 18 | _MODELS = { 19 | "RN50": "https://openaipublic.azureedge.net/clip/models/afeb0e10f9e5a86da6080e35cf09123aca3b358a0c3e3b6c78a7b63bc04b6762/RN50.pt", 20 | "RN101": "https://openaipublic.azureedge.net/clip/models/8fa8567bab74a42d41c5915025a8e4538c3bdbe8804a470a72f30b0d94fab599/RN101.pt", 21 | "RN50x4": "https://openaipublic.azureedge.net/clip/models/7e526bd135e493cef0776de27d5f42653e6b4c8bf9e0f653bb11773263205fdd/RN50x4.pt", 22 | "ViT-B/32": "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt", 23 | } 24 | 25 | 26 | 27 | def _download(url: str, root: str = os.path.expanduser("~/.cache/clip")): 28 | os.makedirs(root, exist_ok=True) 29 | filename = os.path.basename(url) 30 | 31 | expected_sha256 = url.split("/")[-2] 32 | download_target = os.path.join(root, filename) 33 | 34 | if os.path.exists(download_target) and not os.path.isfile(download_target): 35 | raise RuntimeError(f"{download_target} exists and is not a regular file") 36 | 37 | if os.path.isfile(download_target): 38 | if hashlib.sha256(open(download_target, "rb").read()).hexdigest() == expected_sha256: 39 | return download_target 40 | else: 41 | warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file") 42 | 43 | with urllib.request.urlopen(url) as source, open(download_target, "wb") as output: 44 | with tqdm(total=int(source.info().get("Content-Length")), ncols=80) as loop: 45 | while True: 46 | buffer = source.read(8192) 47 | if not buffer: 48 | break 49 | 50 | output.write(buffer) 51 | loop.update(len(buffer)) 52 | 53 | if hashlib.sha256(open(download_target, "rb").read()).hexdigest() != expected_sha256: 54 | raise RuntimeError(f"Model has been downloaded but the SHA256 checksum does not not match") 55 | 56 | return download_target 57 | 58 | 59 | def available_models(): 60 | return list(_MODELS.keys()) 61 | 62 | 63 | def load(name: str, device: Union[str, torch.device] = "cuda" if torch.cuda.is_available() else "cpu", jit=True, adapter_config=None): 64 | if name not in _MODELS: 65 | raise RuntimeError(f"Model {name} not found; available models = {available_models()}") 66 | 67 | model_path = _download(_MODELS[name]) 68 | model = torch.jit.load(model_path, map_location=device if jit else "cpu").eval() 69 | n_px = model.input_resolution.item() 70 | 71 | transform = Compose([ 72 | Resize(n_px, interpolation=Image.BICUBIC), 73 | 
CenterCrop(n_px), 74 | lambda image: image.convert("RGB"), 75 | ToTensor(), 76 | Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)), 77 | ]) 78 | 79 | if not jit: 80 | model = build_model(model.state_dict(), adapter_config).to(device) 81 | if str(device) == "cpu": 82 | model.float() 83 | return model, transform 84 | 85 | # patch the device names 86 | device_holder = torch.jit.trace(lambda: torch.ones([]).to(torch.device(device)), example_inputs=[]) 87 | device_node = [n for n in device_holder.graph.findAllNodes("prim::Constant") if "Device" in repr(n)][-1] 88 | 89 | def patch_device(module): 90 | graphs = [module.graph] if hasattr(module, "graph") else [] 91 | if hasattr(module, "forward1"): 92 | graphs.append(module.forward1.graph) 93 | 94 | for graph in graphs: 95 | for node in graph.findAllNodes("prim::Constant"): 96 | if "value" in node.attributeNames() and str(node["value"]).startswith("cuda"): 97 | node.copyAttributes(device_node) 98 | 99 | model.apply(patch_device) 100 | patch_device(model.encode_image) 101 | patch_device(model.encode_text) 102 | 103 | # patch dtype to float32 on CPU 104 | if str(device) == "cpu": 105 | float_holder = torch.jit.trace(lambda: torch.ones([]).float(), example_inputs=[]) 106 | float_input = list(float_holder.graph.findNode("aten::to").inputs())[1] 107 | float_node = float_input.node() 108 | 109 | def patch_float(module): 110 | graphs = [module.graph] if hasattr(module, "graph") else [] 111 | if hasattr(module, "forward1"): 112 | graphs.append(module.forward1.graph) 113 | 114 | for graph in graphs: 115 | for node in graph.findAllNodes("aten::to"): 116 | inputs = list(node.inputs()) 117 | for i in [1, 2]: # dtype can be the second or third argument to aten::to() 118 | if inputs[i].node()["value"] == 5: 119 | inputs[i].node().copyAttributes(float_node) 120 | 121 | model.apply(patch_float) 122 | patch_float(model.encode_image) 123 | patch_float(model.encode_text) 124 | 125 | model.float() 126 | 127 | return model, transform 128 | 129 | 130 | def tokenize(texts: Union[str, List[str]], context_length: int = 77): 131 | if isinstance(texts, str): 132 | texts = [texts] 133 | 134 | sot_token = _tokenizer.encoder["<|startoftext|>"] 135 | eot_token = _tokenizer.encoder["<|endoftext|>"] 136 | all_tokens = [[sot_token] + _tokenizer.encode(text) + [eot_token] for text in texts] 137 | result = torch.zeros(len(all_tokens), context_length, dtype=torch.long) 138 | 139 | for i, tokens in enumerate(all_tokens): 140 | if len(tokens) > context_length: 141 | raise RuntimeError(f"Input {texts[i]} is too long for context length {context_length}") 142 | result[i, :len(tokens)] = torch.tensor(tokens) 143 | 144 | return result 145 | -------------------------------------------------------------------------------- /CLIP-ViL/clip/simple_tokenizer.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import html 3 | import os 4 | from functools import lru_cache 5 | 6 | import ftfy 7 | import regex as re 8 | 9 | 10 | @lru_cache() 11 | def default_bpe(): 12 | return os.path.join(os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz") 13 | 14 | 15 | @lru_cache() 16 | def bytes_to_unicode(): 17 | """ 18 | Returns list of utf-8 byte and a corresponding list of unicode strings. 19 | The reversible bpe codes work on unicode strings. 20 | This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. 
21 | When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. 22 | This is a signficant percentage of your normal, say, 32K bpe vocab. 23 | To avoid that, we want lookup tables between utf-8 bytes and unicode strings. 24 | And avoids mapping to whitespace/control characters the bpe code barfs on. 25 | """ 26 | bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) 27 | cs = bs[:] 28 | n = 0 29 | for b in range(2**8): 30 | if b not in bs: 31 | bs.append(b) 32 | cs.append(2**8+n) 33 | n += 1 34 | cs = [chr(n) for n in cs] 35 | return dict(zip(bs, cs)) 36 | 37 | 38 | def get_pairs(word): 39 | """Return set of symbol pairs in a word. 40 | Word is represented as tuple of symbols (symbols being variable-length strings). 41 | """ 42 | pairs = set() 43 | prev_char = word[0] 44 | for char in word[1:]: 45 | pairs.add((prev_char, char)) 46 | prev_char = char 47 | return pairs 48 | 49 | 50 | def basic_clean(text): 51 | text = ftfy.fix_text(text) 52 | text = html.unescape(html.unescape(text)) 53 | return text.strip() 54 | 55 | 56 | def whitespace_clean(text): 57 | text = re.sub(r'\s+', ' ', text) 58 | text = text.strip() 59 | return text 60 | 61 | 62 | class SimpleTokenizer(object): 63 | def __init__(self, bpe_path: str = default_bpe()): 64 | self.byte_encoder = bytes_to_unicode() 65 | self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} 66 | merges = gzip.open(bpe_path).read().decode("utf-8").split('\n') 67 | merges = merges[1:49152-256-2+1] 68 | merges = [tuple(merge.split()) for merge in merges] 69 | vocab = list(bytes_to_unicode().values()) 70 | vocab = vocab + [v+'' for v in vocab] 71 | for merge in merges: 72 | vocab.append(''.join(merge)) 73 | vocab.extend(['<|startoftext|>', '<|endoftext|>']) 74 | self.encoder = dict(zip(vocab, range(len(vocab)))) 75 | self.decoder = {v: k for k, v in self.encoder.items()} 76 | self.bpe_ranks = dict(zip(merges, range(len(merges)))) 77 | self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'} 78 | self.pat = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE) 79 | 80 | def bpe(self, token): 81 | if token in self.cache: 82 | return self.cache[token] 83 | word = tuple(token[:-1]) + ( token[-1] + '',) 84 | pairs = get_pairs(word) 85 | 86 | if not pairs: 87 | return token+'' 88 | 89 | while True: 90 | bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf'))) 91 | if bigram not in self.bpe_ranks: 92 | break 93 | first, second = bigram 94 | new_word = [] 95 | i = 0 96 | while i < len(word): 97 | try: 98 | j = word.index(first, i) 99 | new_word.extend(word[i:j]) 100 | i = j 101 | except: 102 | new_word.extend(word[i:]) 103 | break 104 | 105 | if word[i] == first and i < len(word)-1 and word[i+1] == second: 106 | new_word.append(first+second) 107 | i += 2 108 | else: 109 | new_word.append(word[i]) 110 | i += 1 111 | new_word = tuple(new_word) 112 | word = new_word 113 | if len(word) == 1: 114 | break 115 | else: 116 | pairs = get_pairs(word) 117 | word = ' '.join(word) 118 | self.cache[token] = word 119 | return word 120 | 121 | def encode(self, text): 122 | bpe_tokens = [] 123 | text = whitespace_clean(basic_clean(text)).lower() 124 | for token in re.findall(self.pat, text): 125 | token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) 126 | bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' ')) 
127 | return bpe_tokens 128 | 129 | def decode(self, tokens): 130 | text = ''.join([self.decoder[token] for token in tokens]) 131 | text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('', ' ') 132 | return text 133 | -------------------------------------------------------------------------------- /CLIP-ViL/data/mscoco/README.md: -------------------------------------------------------------------------------- 1 | # Put raw COCO (train2014m, val2014, test2015) images here 2 | -------------------------------------------------------------------------------- /CLIP-ViL/scripts/gqa_adapters.sh: -------------------------------------------------------------------------------- 1 | # The name of this experiment. 2 | name=$2 3 | 4 | # Save logs and models under snap/vqa; make backup. 5 | output=$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # export PYTHONPATH=$PYTHONPATH:/local/harold/ubert/clip_vlp/CLIP 11 | 12 | # See Readme.md for option details. 13 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 14 | unbuffer python -m torch.distributed.launch --master_port=$3 --nproc_per_node=$4 src/tasks/gqa.py \ 15 | --distributed \ 16 | --train train,valid --valid testdev \ 17 | --tqdm --output $output \ 18 | --input_raw_images \ 19 | --use_clip \ 20 | --numWorkers 10 \ 21 | --batchSize 2 --optim bert --lr 1e-5 --epochs 10 \ 22 | --llayers 12 --xlayers 0 --rlayers 0 \ 23 | --visualbert_style \ 24 | --vqa_style_transform \ 25 | --loadLXMERTQA snap/pretrained/CLIP_VL_RN50x4 \ 26 | --fp16 \ 27 | --add_zero_padding \ 28 | --gradient_accumulation_steps 8 \ 29 | --warmup_ratio 0.05 \ 30 | --report_step 400 \ 31 | --use_separate_optimizer_for_visual \ 32 | --sgd_lr 0.001 \ 33 | --sgd_momentum 0.0 \ 34 | --schedule 3 \ 35 | --use_positional_embedding \ 36 | --pos_num 25 \ 37 | --clip_model_name RN50x4 \ 38 | --loss_scale 500 \ 39 | --use_adapter \ 40 | --reduction_factor 4 \ 41 | ${@:5} | tee $output/log.log 42 | 43 | 44 | # bash scripts/gqa_2.sh 2 snap/gqa/adapter4 9599 1 --gradient_accumulation_steps 8 --batchSize 32 --lr 1e-5 --warmup_ratio 0.05 --report_step 400 --use_separate_optimizer_for_visual --sgd_lr 0.001 --sgd_momentum 0.0 --epoch 5 --schedule 3 --use_positional_embedding --pos_num 25 --clip_model_name RN50x4 --loss_scale 500 45 | 46 | # bash run/finetune/gqa.bash 4,5,6,7 snap/gqa/final_e20_small_lr 9595 4 --gradient_accumulation_steps 8 --batchSize 8 --lr 5e-5 --warmup_ratio 0.05 --report_step 400 --loadLXMERTQA /local/harold/ubert/clip_vlp/lxmert/snap/pretrain/clip_full_20_no_qa_50x4_new_continue_from_9/Epoch11 --use_separate_optimizer_for_visual --sgd_lr 0.001 --sgd_momentum 0.0 --epoch 5 --schedule 3 --use_positional_embedding --pos_num 25 --clip_model_name RN50x4 --loss_scale 500 47 | 48 | # bash run/finetune/gqa.bash 3,4,5,6 snap/gqa/freeze_50x4 9595 4 --gradient_accumulation_steps 8 --batchSize 8 --lr 5e-5 --warmup_ratio 0.05 --report_step 400 --use_separate_optimizer_for_visual --sgd_lr 0.001 --sgd_momentum 0.0 --epoch 5 --schedule 3 --use_positional_embedding --pos_num 25 --clip_model_name RN50x4 --loss_scale 500 --freeze_clip 49 | 50 | # bash run/finetune/gqa.bash 5 snap/gqa/test 9595 1 --gradient_accumulation_steps 8 --batchSize 8 --lr 5e-5 --warmup_ratio 0.05 --report_step 400 --use_separate_optimizer_for_visual --sgd_lr 0.001 --sgd_momentum 0.0 --epoch 5 --schedule 3 --use_positional_embedding --pos_num 25 --clip_model_name RN50x4 --loss_scale 500 --test submit 51 | 52 | # bash 
run/finetune/gqa.bash 4,5,6,7 snap/gqa/final_e20_RN50_large_lr 9595 4 --gradient_accumulation_steps 8 --batchSize 8 --lr 5e-5 --warmup_ratio 0.05 --report_step 400 --loadLXMERTQA /local/harold/ubert/clip_vlp/lxmert/snap/pretrain/clip_full_20_no_qa_continue_from_17/Epoch11 --use_separate_optimizer_for_visual --sgd_lr 0.001 --sgd_momentum 0.0 --epoch 5 --schedule 3 --use_positional_embedding --pos_num 25 --loss_scale 500 --lr 5e-5 53 | 54 | 55 | # # bash run/finetune/gqa.bash 0 snap/gqa/test_rn50 9545 1 --gradient_accumulation_steps 1 --batchSize 8 --lr 5e-5 --warmup_ratio 0.05 --report_step 400 --freeze_clip --epoch 5 --schedule 3 --use_positional_embedding --pos_num 25 --loss_scale 500 --lr 3e-5 --test submit --load snap/gqa/final_e20_RN50/BEST 56 | 57 | # # bash run/finetune/gqa.bash 3,4,5,6 snap/gqa/scratch_50x4_FU_TRUE 9595 4 --gradient_accumulation_steps 1 --batchSize 8 --lr 5e-5 --warmup_ratio 0.05 --report_step 400 --freeze_clip --epoch 5 --schedule 3 --use_positional_embedding --pos_num 25 --loss_scale 500 --lr 3e-5 --clip_model_name RN50x4 -------------------------------------------------------------------------------- /CLIP-ViL/scripts/gqa_baseline.sh: -------------------------------------------------------------------------------- 1 | # The name of this experiment. 2 | name=$2 3 | 4 | # Save logs and models under snap/vqa; make backup. 5 | output=$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # export PYTHONPATH=$PYTHONPATH:/local/harold/ubert/clip_vlp/CLIP 11 | 12 | # See Readme.md for option details. 13 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 14 | unbuffer python -m torch.distributed.launch --master_port=$3 --nproc_per_node=$4 src/tasks/gqa.py \ 15 | --distributed \ 16 | --train train,valid --valid testdev \ 17 | --tqdm --output $output \ 18 | --input_raw_images \ 19 | --use_clip \ 20 | --numWorkers 10 \ 21 | --batchSize 2 --optim bert --lr 1e-5 --epochs 10 \ 22 | --llayers 12 --xlayers 0 --rlayers 0 \ 23 | --loadLXMERTQA snap/pretrained/CLIP_VL_RN50x4 \ 24 | --visualbert_style \ 25 | --vqa_style_transform \ 26 | --fp16 \ 27 | --add_zero_padding \ 28 | --gradient_accumulation_steps 8 \ 29 | --warmup_ratio 0.05 \ 30 | --report_step 400 \ 31 | --use_separate_optimizer_for_visual \ 32 | --sgd_lr 0.001 \ 33 | --sgd_momentum 0.0 \ 34 | --schedule 3 \ 35 | --use_positional_embedding \ 36 | --pos_num 25 \ 37 | --clip_model_name RN50x4 \ 38 | --loss_scale 500 \ 39 | ${@:5} | tee $output/log.log 40 | 41 | 42 | # bash scripts/gqa.sh 0 snap/gqa/full 9595 1 --gradient_accumulation_steps 8 --batchSize 32 --lr 5e-5 --warmup_ratio 0.05 --report_step 400 --loadLXMERTQA snap/pretrained/CLIP_VL_RN50x4_LXRT --use_separate_optimizer_for_visual --sgd_lr 0.001 --sgd_momentum 0.0 --epoch 5 --schedule 3 --use_positional_embedding --pos_num 25 --clip_model_name RN50x4 --loss_scale 500 43 | 44 | # bash run/finetune/gqa.bash 4,5,6,7 snap/gqa/final_e20_small_lr 9595 4 --gradient_accumulation_steps 8 --batchSize 8 --lr 5e-5 --warmup_ratio 0.05 --report_step 400 --loadLXMERTQA /local/harold/ubert/clip_vlp/lxmert/snap/pretrain/clip_full_20_no_qa_50x4_new_continue_from_9/Epoch11 --use_separate_optimizer_for_visual --sgd_lr 0.001 --sgd_momentum 0.0 --epoch 5 --schedule 3 --use_positional_embedding --pos_num 25 --clip_model_name RN50x4 --loss_scale 500 45 | 46 | # bash run/finetune/gqa.bash 3,4,5,6 snap/gqa/freeze_50x4 9595 4 --gradient_accumulation_steps 8 --batchSize 8 --lr 5e-5 --warmup_ratio 0.05 --report_step 400 
--use_separate_optimizer_for_visual --sgd_lr 0.001 --sgd_momentum 0.0 --epoch 5 --schedule 3 --use_positional_embedding --pos_num 25 --clip_model_name RN50x4 --loss_scale 500 --freeze_clip 47 | 48 | # bash run/finetune/gqa.bash 5 snap/gqa/test 9595 1 --gradient_accumulation_steps 8 --batchSize 8 --lr 5e-5 --warmup_ratio 0.05 --report_step 400 --use_separate_optimizer_for_visual --sgd_lr 0.001 --sgd_momentum 0.0 --epoch 5 --schedule 3 --use_positional_embedding --pos_num 25 --clip_model_name RN50x4 --loss_scale 500 --test submit 49 | 50 | # bash run/finetune/gqa.bash 4,5,6,7 snap/gqa/final_e20_RN50_large_lr 9595 4 --gradient_accumulation_steps 8 --batchSize 8 --lr 5e-5 --warmup_ratio 0.05 --report_step 400 --loadLXMERTQA /local/harold/ubert/clip_vlp/lxmert/snap/pretrain/clip_full_20_no_qa_continue_from_17/Epoch11 --use_separate_optimizer_for_visual --sgd_lr 0.001 --sgd_momentum 0.0 --epoch 5 --schedule 3 --use_positional_embedding --pos_num 25 --loss_scale 500 --lr 5e-5 51 | 52 | 53 | # # bash run/finetune/gqa.bash 0 snap/gqa/test_rn50 9545 1 --gradient_accumulation_steps 1 --batchSize 8 --lr 5e-5 --warmup_ratio 0.05 --report_step 400 --freeze_clip --epoch 5 --schedule 3 --use_positional_embedding --pos_num 25 --loss_scale 500 --lr 3e-5 --test submit --load snap/gqa/final_e20_RN50/BEST 54 | 55 | # # bash run/finetune/gqa.bash 3,4,5,6 snap/gqa/scratch_50x4_FU_TRUE 9595 4 --gradient_accumulation_steps 1 --batchSize 8 --lr 5e-5 --warmup_ratio 0.05 --report_step 400 --freeze_clip --epoch 5 --schedule 3 --use_positional_embedding --pos_num 25 --loss_scale 500 --lr 3e-5 --clip_model_name RN50x4 -------------------------------------------------------------------------------- /CLIP-ViL/scripts/pretrain.bash: -------------------------------------------------------------------------------- 1 | # The name of this experiment. 2 | name=$2 3 | 4 | # Save logs and models under snap/vqa; make backup. 5 | output=snap/pretrain/$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # See Readme.md for option details. 11 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 12 | unbuffer python -m torch.distributed.launch --master_port=$3 --nproc_per_node=$4 src/pretrain/lxmert_pretrain.py \ 13 | --taskMaskLM --taskMatched \ 14 | --visualLosses obj,attr,feat \ 15 | --wordMaskRate 0.15 \ 16 | --train mscoco_train,mscoco_nominival,vgnococo --valid mscoco_minival \ 17 | --batchSize 256 --optim bert --lr 1e-4 --epochs 20 \ 18 | --tqdm \ 19 | --llayers 12 --xlayers 0 --rlayers 0 \ 20 | --visualbert_style \ 21 | --input_raw_images \ 22 | --vqa_style_transform \ 23 | --objMaskRate 0.0 \ 24 | --numWorkers 0\ 25 | --clip_model_name RN50\ 26 | --use_clip \ 27 | --distributed \ 28 | --output $output\ 29 | ${@:5} | tee $output/log.log 30 | -------------------------------------------------------------------------------- /CLIP-ViL/scripts/snli-ve_adapters.sh: -------------------------------------------------------------------------------- 1 | # The name of this experiment. 2 | name=$2 3 | 4 | # Save logs and models under snap/vqa; make backup. 5 | output=$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # export PYTHONPATH=$PYTHONPATH:/local/harold/ubert/clip_vlp/CLIP 11 | 12 | # See Readme.md for option details. 
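# Usage sketch (argument order inferred from this script; the output path below is
# illustrative, not a path shipped with the repo):
#   bash scripts/snli-ve_adapters.sh <gpu_ids> <output_dir> <master_port> <num_gpus> [extra flags]
#   e.g. bash scripts/snli-ve_adapters.sh 0 snap/snli-ve/adapters 9595 1
# Here $1 sets CUDA_VISIBLE_DEVICES, $2 names the output directory, $3 is the
# torch.distributed master port, $4 is --nproc_per_node, and ${@:5} is forwarded to
# src/tasks/snli.py.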
13 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 14 | unbuffer python -m torch.distributed.launch --master_port=$3 --nproc_per_node=$4 src/tasks/snli.py \ 15 | --distributed \ 16 | --train train --valid valid \ 17 | --tqdm --output $output \ 18 | --input_raw_images \ 19 | --use_clip \ 20 | --numWorkers 10 \ 21 | --batchSize 2 --optim bert --lr 1e-5 --epochs 10 \ 22 | --llayers 12 --xlayers 0 --rlayers 0 \ 23 | --visualbert_style \ 24 | --vqa_style_transform \ 25 | --clip_model_name RN50x4 \ 26 | --loadLXMERT snap/pretrained/CLIP_VL_RN50x4 \ 27 | --fp16 \ 28 | --use_adapter \ 29 | --reduction_factor 4 \ 30 | --add_zero_padding \ 31 | --gradient_accumulation_steps 8 \ 32 | --report_step 400 \ 33 | --warmup_ratio 0.05 \ 34 | --use_separate_optimizer_for_visual \ 35 | --sgd_lr 0.001 \ 36 | --sgd_momentum 0.0 \ 37 | --schedule 1 \ 38 | --use_positional_embedding \ 39 | --pos_num 25 \ 40 | --clip_model_name RN50x4 \ 41 | ${@:5} | tee $output/log.log 42 | 43 | 44 | 45 | #bash run/finetune/snli_ve.bash 5 snap/snli/test 9595 1 --gradient_accumulation_steps 1 --batchSize 12 --lr 5e-5 --freeze_clip --loss_scale 500 --warmup_ratio 0.05 46 | 47 | # bash run/finetune/snli_ve.bash 4,5,6,7 snap/snli/final_e20_schedule_2 9595 4 --gradient_accumulation_steps 8 --batchSize 8 --lr 5e-5 --warmup_ratio 0.05 --report_step 400 --loadLXMERT /local/harold/ubert/clip_vlp/lxmert/snap/pretrain/clip_full_20_no_qa_50x4_new_continue_from_9/Epoch11 --use_separate_optimizer_for_visual --sgd_lr 0.001 --sgd_momentum 0.0 --epoch 2 --schedule 1 --use_positional_embedding --pos_num 25 --clip_model_name RN50x4 48 | 49 | # bash run/finetune/snli_ve.bash 4,5,6,7 snap/snli/final_e20_RN50_schedule_2 9595 4 --gradient_accumulation_steps 8 --batchSize 8 --lr 5e-5 --warmup_ratio 0.05 --report_step 400 --loadLXMERT /local/harold/ubert/clip_vlp/lxmert/snap/pretrain/clip_full_20_no_qa_continue_from_17/Epoch11 --use_separate_optimizer_for_visual --sgd_lr 0.001 --sgd_momentum 0.0 --epoch 2 --schedule 1 --use_positional_embedding --pos_num 25 50 | 51 | # bash run/finetune/snli_ve.bash 5 snap/snli/test 9595 1 --gradient_accumulation_steps 1 --batchSize 8 --lr 5e-5 --warmup_ratio 0.05 --report_step 400 --freeze_clip --use_positional_embedding --pos_num 25 -------------------------------------------------------------------------------- /CLIP-ViL/scripts/snli-ve_baseline.sh: -------------------------------------------------------------------------------- 1 | # The name of this experiment. 2 | name=$2 3 | 4 | # Save logs and models under snap/vqa; make backup. 5 | output=$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # export PYTHONPATH=$PYTHONPATH:/local/harold/ubert/clip_vlp/CLIP 11 | 12 | # See Readme.md for option details. 
13 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 14 | unbuffer python -m torch.distributed.launch --master_port=$3 --nproc_per_node=$4 src/tasks/snli.py \ 15 | --distributed \ 16 | --train train --valid valid \ 17 | --tqdm --output $output \ 18 | --input_raw_images \ 19 | --use_clip \ 20 | --numWorkers 10 \ 21 | --batchSize 32 --optim bert --lr 5e-5 --epochs 2 \ 22 | --llayers 12 --xlayers 0 --rlayers 0 \ 23 | --visualbert_style \ 24 | --vqa_style_transform \ 25 | --clip_model_name RN50x4 \ 26 | --load snap/snli-ve/full_finetuning/BEST \ 27 | --fp16 \ 28 | --add_zero_padding \ 29 | --gradient_accumulation_steps 8 \ 30 | --report_step 400 \ 31 | --warmup_ratio 0.05 \ 32 | --use_separate_optimizer_for_visual \ 33 | --sgd_lr 0.001 \ 34 | --sgd_momentum 0.0 \ 35 | --schedule 1 \ 36 | --use_positional_embedding \ 37 | --pos_num 25 \ 38 | --clip_model_name RN50x4 \ 39 | ${@:5} | tee $output/log.log 40 | 41 | 42 | 43 | 44 | #bash run/finetune/snli_ve.bash 5 snap/snli/test 9595 1 --gradient_accumulation_steps 1 --batchSize 12 --lr 5e-5 --freeze_clip --loss_scale 500 --warmup_ratio 0.05 45 | 46 | # bash run/finetune/snli_ve.bash 4,5,6,7 snap/snli/final_e20_schedule_2 9595 4 --gradient_accumulation_steps 8 --batchSize 8 --lr 5e-5 --warmup_ratio 0.05 --report_step 400 --loadLXMERT /local/harold/ubert/clip_vlp/lxmert/snap/pretrain/clip_full_20_no_qa_50x4_new_continue_from_9/Epoch11 --use_separate_optimizer_for_visual --sgd_lr 0.001 --sgd_momentum 0.0 --epoch 2 --schedule 1 --use_positional_embedding --pos_num 25 --clip_model_name RN50x4 47 | 48 | # bash run/finetune/snli_ve.bash 4,5,6,7 snap/snli/final_e20_RN50_schedule_2 9595 4 --gradient_accumulation_steps 8 --batchSize 8 --lr 5e-5 --warmup_ratio 0.05 --report_step 400 --loadLXMERT /local/harold/ubert/clip_vlp/lxmert/snap/pretrain/clip_full_20_no_qa_continue_from_17/Epoch11 --use_separate_optimizer_for_visual --sgd_lr 0.001 --sgd_momentum 0.0 --epoch 2 --schedule 1 --use_positional_embedding --pos_num 25 49 | 50 | # bash run/finetune/snli_ve.bash 5 snap/snli/test 9595 1 --gradient_accumulation_steps 1 --batchSize 8 --lr 5e-5 --warmup_ratio 0.05 --report_step 400 --freeze_clip --use_positional_embedding --pos_num 25 -------------------------------------------------------------------------------- /CLIP-ViL/scripts/vqa_adapters.sh: -------------------------------------------------------------------------------- 1 | # The name of this experiment. 2 | name=$2 3 | 4 | # Save logs and models under snap/vqa; make backup. 5 | output=$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # export PYTHONPATH=$PYTHONPATH:/local/harold/ubert/clip_vlp/CLIP 11 | 12 | # See Readme.md for option details. 
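# Usage sketch (argument order inferred from this script; the output path below is
# illustrative): $1 -> CUDA_VISIBLE_DEVICES, $2 -> output directory, $3 -> master port,
# $4 -> --nproc_per_node, ${@:5} -> extra flags forwarded to src/tasks/vqa.py.
#   e.g. bash scripts/vqa_adapters.sh 0,1 snap/vqa/adapters 9595 2
# Compared with vqa_baseline.sh, this script adds --use_adapter with --reduction_factor 4
# (reduction_factor divides the hidden size to get the adapter bottleneck width; see the
# Adapter class in src/lxrt/adapters/adapter_modeling.py) and uses --lr 5e-4 instead of 5e-5.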
13 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 14 | unbuffer python -m torch.distributed.launch --master_port=$3 --nproc_per_node=$4 src/tasks/vqa.py \ 15 | --distributed \ 16 | --train train,nominival --valid minival \ 17 | --tqdm --output $output \ 18 | --input_raw_images \ 19 | --use_clip \ 20 | --numWorkers 10 \ 21 | --batchSize 32 --optim bert --lr 5e-4 --epochs 5 \ 22 | --llayers 12 --xlayers 0 --rlayers 0 \ 23 | --visualbert_style \ 24 | --vqa_style_transform \ 25 | --clip_model_name RN50x4 \ 26 | --add_zero_padding \ 27 | --gradient_accumulation_steps 8 \ 28 | --loss_scale 500 \ 29 | --warmup_ratio 0.05 \ 30 | --report_step 400 \ 31 | --use_separate_optimizer_for_visual \ 32 | --sgd_lr 0.0001 \ 33 | --sgd_momentum 0.0 \ 34 | --schedule 2 \ 35 | --use_positional_embedding \ 36 | --pos_num 25 \ 37 | --fp16 \ 38 | --use_adapter \ 39 | --reduction_factor 4 \ 40 | --clip_model_name RN50x4 \ 41 | --loadLXMERTQA snap/pretrained/CLIP_VL_RN50x4 \ 42 | ${@:5} | tee $output/log.log 43 | 44 | 45 | # CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 46 | # unbuffer python -m torch.distributed.launch --master_port=$3 --nproc_per_node=$4 src/tasks/vqa.py \ 47 | # --distributed \ 48 | # --train train,nominival --valid minival \ 49 | # --test test \ 50 | # --tqdm --output $output \ 51 | # --input_raw_images \ 52 | # --use_clip \ 53 | # --numWorkers 10 \ 54 | # --batchSize 32 --optim bert --lr 4e-5 --epochs 5 \ 55 | # --llayers 12 --xlayers 0 --rlayers 0 \ 56 | # --visualbert_style \ 57 | # --vqa_style_transform \ 58 | # --clip_model_name RN50x4 \ 59 | # --add_zero_padding \ 60 | # --gradient_accumulation_steps 8 \ 61 | # --loss_scale 500 \ 62 | # --warmup_ratio 0.05 \ 63 | # --report_step 400 \ 64 | # --use_separate_optimizer_for_visual \ 65 | # --sgd_lr 0.001 \ 66 | # --sgd_momentum 0.0 \ 67 | # --schedule 2 \ 68 | # --use_positional_embedding \ 69 | # --pos_num 25 \ 70 | # --fp16 \ 71 | # --use_adapter \ 72 | # --reduction_factor 4 \ 73 | # --clip_model_name RN50x4 \ 74 | # --load snap/vqa/vqa_clip_rn50x4_LMadapter4_5e-4/BEST 75 | # ${@:5} | tee $output/log.log 76 | -------------------------------------------------------------------------------- /CLIP-ViL/scripts/vqa_baseline.sh: -------------------------------------------------------------------------------- 1 | # The name of this experiment. 2 | name=$2 3 | 4 | # Save logs and models under snap/vqa; make backup. 5 | output=$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # export PYTHONPATH=$PYTHONPATH:/local/harold/ubert/clip_vlp/CLIP 11 | 12 | # See Readme.md for option details. 
13 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 14 | unbuffer python -m torch.distributed.launch --master_port=$3 --nproc_per_node=$4 src/tasks/vqa.py \ 15 | --distributed \ 16 | --train train,nominival --valid minival \ 17 | --tqdm --output $output \ 18 | --input_raw_images \ 19 | --use_clip \ 20 | --numWorkers 10 \ 21 | --batchSize 32 --optim bert --lr 5e-5 --epochs 5 \ 22 | --llayers 12 --xlayers 0 --rlayers 0 \ 23 | --visualbert_style \ 24 | --vqa_style_transform \ 25 | --clip_model_name RN50x4 \ 26 | --add_zero_padding \ 27 | --gradient_accumulation_steps 8 \ 28 | --loss_scale 500 \ 29 | --warmup_ratio 0.05 \ 30 | --report_step 400 \ 31 | --use_separate_optimizer_for_visual \ 32 | --sgd_lr 0.001 \ 33 | --sgd_momentum 0.0 \ 34 | --schedule 2 \ 35 | --use_positional_embedding \ 36 | --pos_num 25 \ 37 | --fp16 \ 38 | --clip_model_name RN50x4 \ 39 | --loadLXMERTQA snap/pretrained/CLIP_VL_RN50x4 40 | ${@:5} | tee $output/log.log 41 | 42 | 43 | # CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 44 | # unbuffer python -m torch.distributed.launch --master_port=$3 --nproc_per_node=$4 src/tasks/vqa.py \ 45 | # --distributed \ 46 | # --train train,nominival --valid minival \ 47 | # --test test \ 48 | # --tqdm --output $output \ 49 | # --input_raw_images \ 50 | # --use_clip \ 51 | # --numWorkers 10 \ 52 | # --batchSize 32 --optim bert --lr 5e-5 --epochs 5 \ 53 | # --llayers 12 --xlayers 0 --rlayers 0 \ 54 | # --visualbert_style \ 55 | # --vqa_style_transform \ 56 | # --clip_model_name RN50x4 \ 57 | # --add_zero_padding \ 58 | # --gradient_accumulation_steps 8 \ 59 | # --loss_scale 500 \ 60 | # --warmup_ratio 0.05 \ 61 | # --report_step 400 \ 62 | # --use_separate_optimizer_for_visual \ 63 | # --sgd_lr 0.001 \ 64 | # --sgd_momentum 0.0 \ 65 | # --schedule 2 \ 66 | # --use_positional_embedding \ 67 | # --pos_num 25 \ 68 | # --fp16 \ 69 | # --clip_model_name RN50x4 \ 70 | # --load snap/vqa/vqa_clip_rn50x4/BEST \ 71 | # ${@:5} | tee $output/log.log 72 | -------------------------------------------------------------------------------- /CLIP-ViL/src/lxrt/adapters/__init__.py: -------------------------------------------------------------------------------- 1 | # The codes are borrowed from https://github.com/rabeehk/compacter 2 | 3 | from .config import MetaAdapterConfig, AdapterConfig, CompactorConfig, LRAdapterConfig 4 | from .adapter_modeling import Adapter, HyperComplexAdapter, OutputAdapter 5 | from .adapter_controller import AdapterController, AdapterLayer, MetaLayersAdapterController, OutputParallelAdapterLayer 6 | from .adapter_hypernetwork import AdapterLayersHyperNetController, AdapterLayersOneHyperNetController 7 | from .adapter_utils import TaskEmbeddingController -------------------------------------------------------------------------------- /CLIP-ViL/src/lxrt/adapters/adapter_configuration.py: -------------------------------------------------------------------------------- 1 | """Implements the adapters and other parameter-efficient finetuning methods' configurations.""" 2 | 3 | from collections import OrderedDict 4 | from dataclasses import dataclass 5 | 6 | import torch.nn as nn 7 | 8 | @dataclass 9 | class AdapterConfig(object): 10 | """Implements the adapter configuration proposed by Houlsby et. al, 2019 11 | in https://arxiv.org/abs/1902.00751. 
12 | We additionally pass all the configuration of parameter-efficient finetuning 13 | methods with this config.""" 14 | add_layer_norm_before_adapter: bool = False 15 | add_layer_norm_after_adapter: bool = True 16 | non_linearity: str = "swish" 17 | task_reduction_factor: int = 16 18 | add_adapter_in_feed_forward = True 19 | add_adapter_in_self_attention = True 20 | hidden_dim = 128 21 | task_adapter_layers_encoder = None 22 | task_adapter_layers_decoder = None 23 | task_adapter_in_decoder = True 24 | intrinsic_dim = 100 25 | normalize_intrinsic_projections = False 26 | # This can be either random, or fastfood. 27 | intrinsic_projection = "random" 28 | 29 | # Hypercomplex adapters parameters 30 | hypercomplex_adapters = False 31 | hypercomplex_division = 8 32 | learn_phm = True 33 | hypercomplex_nonlinearity="glorot-uniform" 34 | shared_phm_rule = False 35 | factorized_phm = False 36 | shared_W_phm = False 37 | factorized_phm_rule = False 38 | phm_c_init = "normal" 39 | phm_rank = 1 40 | phm_init_range=0.01 41 | 42 | # prefix-tuning parameters. 43 | prefix_dim = 100 44 | init_prefix_from_vocab = False 45 | kronecker_prod = False 46 | 47 | # BitFit configuration. 48 | bitfit = False 49 | 50 | # Low-rank adapters. 51 | low_rank_adapters = False 52 | low_rank_w_init = "glorot-uniform" 53 | low_rank_rank = 1 54 | 55 | 56 | ADAPTER_CONFIG_MAPPING = OrderedDict( 57 | [("adapter", AdapterConfig)]) 58 | 59 | 60 | class AutoAdapterConfig(nn.Module): 61 | """Generic Adapter config class to instantiate different adapter configs.""" 62 | 63 | @classmethod 64 | def get(cls, config_name: str): 65 | if config_name in ADAPTER_CONFIG_MAPPING: 66 | return ADAPTER_CONFIG_MAPPING[config_name]() 67 | raise ValueError( 68 | "Unrecognized adapter config type identifier: {}. Should contain one of {}" 69 | .format(config_name, ", ".join(ADAPTER_CONFIG_MAPPING.keys()))) 70 | -------------------------------------------------------------------------------- /CLIP-ViL/src/lxrt/adapters/adapter_modeling.py: -------------------------------------------------------------------------------- 1 | """Implements an Adapter, Low-rank adapters and Hyper-adapter Layers.""" 2 | import torch 3 | import torch.nn as nn 4 | from .adapter_utils import Activations 5 | 6 | from .hypercomplex.layers import PHMLinear 7 | from .low_rank_layer import LowRankLinear 8 | 9 | 10 | class LowRankAdapter(nn.Module): 11 | """This is the low-rank adapter, in which each adapter is composed of two rank-one matrices. 
12 | """ 13 | def __init__(self, config): 14 | super().__init__() 15 | self.config = config 16 | self.input_dim = config.input_dim 17 | self.down_sample_size = self.input_dim // config.reduction_factor 18 | self.activation = Activations(config.non_linearity.lower()) 19 | self.down_sampler = LowRankLinear(self.input_dim, self.down_sample_size, 20 | w_init=config.low_rank_w_init, 21 | rank=config.low_rank_rank) 22 | self.up_sampler = LowRankLinear(self.down_sample_size, self.input_dim, 23 | w_init=config.low_rank_w_init, 24 | rank=config.low_rank_rank) 25 | 26 | def forward(self, x): 27 | z = self.down_sampler(x) 28 | z = self.activation(z) 29 | output = self.up_sampler(z) 30 | return output 31 | 32 | 33 | class Adapter(nn.Module): 34 | """Conventional Adapter layer, in which the weights of up and down sampler modules 35 | are parameters and are optimized.""" 36 | 37 | def __init__(self, config): 38 | super().__init__() 39 | self.config = config 40 | self.input_dim = config.d_model 41 | reduction_factor = config.reduction_factor 42 | self.down_sample_size = self.input_dim // reduction_factor 43 | self.activation = Activations(config.non_linearity.lower()) 44 | self.down_sampler = nn.Linear(self.input_dim, self.down_sample_size) 45 | self.up_sampler = nn.Linear(self.down_sample_size, self.input_dim) 46 | 47 | if config.use_gate: 48 | self.gate = nn.Parameter(torch.zeros(1)) 49 | else: 50 | self.gate = None 51 | 52 | def forward(self, x): 53 | z = self.down_sampler(x) 54 | z = self.activation(z) 55 | output = self.up_sampler(z) 56 | 57 | if self.gate is not None: 58 | output = self.gate * output 59 | 60 | return output 61 | 62 | 63 | class OutputAdapter(nn.Module): 64 | """Conventional Adapter layer, in which the weights of up and down sampler modules 65 | are parameters and are optimized.""" 66 | 67 | def __init__(self, config, output_dim): 68 | super().__init__() 69 | self.config = config 70 | self.input_dim = config.d_model 71 | reduction_factor = 16 72 | self.down_sample_size = self.input_dim // reduction_factor 73 | self.activation = Activations(config.non_linearity.lower()) 74 | self.down_sampler = nn.Linear(self.input_dim, self.down_sample_size) 75 | self.up_sampler = nn.Linear(self.down_sample_size, output_dim) 76 | 77 | def forward(self, x): 78 | z = self.down_sampler(x) 79 | z = self.activation(z) 80 | output = self.up_sampler(z) 81 | return output 82 | 83 | def resize_up_sampler(self, resized_size): 84 | self.up_sampler = nn.Linear(self.down_sample_size, resized_size) 85 | 86 | 87 | class HyperComplexAdapter(nn.Module): 88 | """Hypercomplex Adapter layer, in which the weights of up and down sampler modules 89 | are parameters are 1/n times of the conventional adapter layers, where n is 90 | hypercomplex division number.""" 91 | 92 | def __init__(self, config): 93 | super().__init__() 94 | self.config = config 95 | self.input_dim = config.input_dim 96 | self.down_sample_size = self.input_dim // config.reduction_factor 97 | self.activation = Activations(config.non_linearity.lower()) 98 | self.down_sampler = PHMLinear(in_features=self.input_dim, 99 | out_features=self.down_sample_size, 100 | bias=True, 101 | c_init=config.phm_c_init, 102 | phm_dim=config.hypercomplex_division, 103 | learn_phm=config.learn_phm, 104 | w_init=config.hypercomplex_nonlinearity, 105 | shared_phm_rule=config.shared_phm_rule, 106 | factorized_phm=config.factorized_phm, 107 | shared_W_phm=config.shared_W_phm, 108 | factorized_phm_rule=config.factorized_phm_rule, 109 | phm_rank=config.phm_rank, 110 | 
phm_init_range=config.phm_init_range, 111 | kronecker_prod=config.kronecker_prod) 112 | self.up_sampler = PHMLinear(in_features=self.down_sample_size, 113 | out_features=self.input_dim, 114 | bias=True, 115 | c_init=config.phm_c_init, 116 | phm_dim=config.hypercomplex_division, 117 | learn_phm=config.learn_phm, 118 | w_init=config.hypercomplex_nonlinearity, 119 | shared_phm_rule=config.shared_phm_rule, 120 | factorized_phm=config.factorized_phm, 121 | shared_W_phm=config.shared_W_phm, 122 | factorized_phm_rule=config.factorized_phm_rule, 123 | phm_rank=config.phm_rank, 124 | phm_init_range=config.phm_init_range, 125 | kronecker_prod=config.kronecker_prod) 126 | 127 | def forward(self, x): 128 | z = self.down_sampler(x) 129 | z = self.activation(z) 130 | return self.up_sampler(z) -------------------------------------------------------------------------------- /CLIP-ViL/src/lxrt/adapters/adapter_outputs.py: -------------------------------------------------------------------------------- 1 | """Defines the output class for the adapter layers' parameters.""" 2 | import torch 3 | from dataclasses import dataclass 4 | 5 | 6 | @dataclass 7 | class SamplerOutput: 8 | """Base class for the base and weights of each adapter.""" 9 | weight: torch.FloatTensor = None 10 | bias: torch.FloatTensor = None 11 | 12 | 13 | @dataclass 14 | class LayerNormOutput: 15 | """Base class for the base and weights of the conditional 16 | layer norms.""" 17 | weight: torch.FloatTensor = None 18 | bias: torch.FloatTensor = None 19 | 20 | 21 | @dataclass 22 | class AdapterOutput: 23 | """Base class for each adapter weights""" 24 | up: SamplerOutput = None 25 | down: SamplerOutput = None 26 | pre_norm: LayerNormOutput = None 27 | post_norm: LayerNormOutput = None 28 | 29 | 30 | @dataclass 31 | class AdapterT5BlockOutput: 32 | """ 33 | Base class for adapter layer's outputs. 
34 | """ 35 | feed_forward: AdapterOutput = None 36 | self_attention: AdapterOutput = None 37 | cross_attention: AdapterOutput = None -------------------------------------------------------------------------------- /CLIP-ViL/src/lxrt/adapters/adapter_utils.py: -------------------------------------------------------------------------------- 1 | """Implementation of different utility functions for adapter layers.""" 2 | import torch 3 | import torch.nn as nn 4 | from transformers.activations import get_activation 5 | 6 | 7 | class Activations(nn.Module): 8 | def __init__(self, activation_type): 9 | super().__init__() 10 | self.f = get_activation(activation_type) 11 | 12 | def forward(self, x): 13 | return self.f(x) 14 | 15 | 16 | def init_linear_layer(linear_layer, std=1e-2): 17 | """Initializes the given linear module as explained in adapter paper.""" 18 | nn.init.normal_(linear_layer.weight, std=std) 19 | nn.init.zeros_(linear_layer.bias) 20 | 21 | 22 | def linear_layer(input_dim, output_dim, std=1e-2): 23 | """Generates a linear module and initializes it.""" 24 | linear = nn.Linear(input_dim, output_dim) 25 | init_linear_layer(linear, std=std) 26 | return linear 27 | 28 | 29 | class TaskHyperNet(nn.Module): 30 | """This module generates the task-embeddings from the initial feeded task embeddings.""" 31 | 32 | def __init__(self, config, input_dim): 33 | super(TaskHyperNet, self).__init__() 34 | self.task_hidden_dim = config.task_hidden_dim 35 | self.projected_task_embedding_dim = config.projected_task_embedding_dim 36 | self.task_embeding_generator = nn.Sequential( 37 | linear_layer(input_dim, self.task_hidden_dim), 38 | nn.ReLU(), 39 | linear_layer(self.task_hidden_dim, self.projected_task_embedding_dim)) 40 | 41 | def forward(self, task_embedding): 42 | task_embedding = task_embedding.view(-1) 43 | return self.task_embeding_generator(task_embedding).view(-1) 44 | 45 | 46 | class LayerNormHyperNet(nn.Module): 47 | """This module generates the weight and bias for the task conditioned layer norm.""" 48 | 49 | def __init__(self, config): 50 | super(LayerNormHyperNet, self).__init__() 51 | self.task_embedding_dim = config.projected_task_embedding_dim \ 52 | if config.train_task_embeddings else config.task_embedding_dim 53 | self.weight_generator = linear_layer(self.task_embedding_dim, config.input_dim) 54 | self.bias_generator = linear_layer(self.task_embedding_dim, config.input_dim) 55 | 56 | def forward(self, input): 57 | return self.weight_generator(input), self.bias_generator(input) 58 | 59 | 60 | class TaskEmbeddingController(nn.Module): 61 | """Main module controlling task embeddings.""" 62 | 63 | def __init__(self, config): 64 | super(TaskEmbeddingController, self).__init__() 65 | # self.device = config.device 66 | self.task_embedding_dim = config.task_embedding_dim 67 | self.tasks = config.tasks 68 | self.task_to_task_embeddings = {task: task for task in self.tasks} 69 | if config.task_to_embeddings is not None: 70 | self.task_to_task_embeddings = config.task_to_embeddings 71 | self.tasks = self.task_to_task_embeddings.values() 72 | self.set_task_embeddings(self.tasks) 73 | self.train_task_embeddings = config.train_task_embeddings 74 | if self.train_task_embeddings: 75 | self.task_hyper_net = TaskHyperNet(config) 76 | 77 | def get_task(self, task): 78 | return self.task_to_task_embeddings[task] 79 | 80 | def set_task_embeddings(self, tasks): 81 | self.task_to_embeddings = nn.ParameterDict(dict()) 82 | for task in tasks: 83 | task_embedding = 
torch.Tensor(torch.randn(self.task_embedding_dim)) 84 | self.task_to_embeddings[task] = nn.Parameter(task_embedding) 85 | 86 | def forward(self, task): 87 | task_mapped = self.get_task(task) 88 | task_embedding = self.task_to_embeddings[task_mapped] 89 | if self.train_task_embeddings: 90 | return self.task_hyper_net(task_embedding) 91 | return task_embedding 92 | -------------------------------------------------------------------------------- /CLIP-ViL/src/lxrt/adapters/config.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | 4 | @dataclass 5 | class AdapterConfig(object): 6 | """Implements the adapter configuration proposed by Houlsby et. al, 2019 7 | in https://arxiv.org/abs/1902.00751.""" 8 | add_layer_norm_before_adapter: bool = False 9 | add_layer_norm_after_adapter: bool = False 10 | non_linearity: str = "gelu_new" 11 | reduction_factor: int = 16 12 | weight_init_range = 1e-2 13 | # Whether to use conditional layer norms for adapters. 14 | conditional_layer_norm = False 15 | hidden_dim = 128 16 | # Whether to add adapter blocks, this is used in case we need 17 | # to tune only layer norms. 18 | train_adapters_blocks = True 19 | 20 | task_adapter_layers_encoder = None 21 | task_adapter_layers_decoder = None 22 | task_adapter_in_decoder = True 23 | intrinsic_dim = 100 24 | normalize_intrinsic_projections = False 25 | # This can be either random, or fastfood. 26 | intrinsic_projection = "random" 27 | 28 | # Hypercomplex adapters parameters 29 | hypercomplex_adapters = False 30 | hypercomplex_division = 8 31 | learn_phm = True 32 | hypercomplex_nonlinearity="glorot-uniform" 33 | shared_phm_rule = False 34 | factorized_phm = False 35 | shared_W_phm = False 36 | factorized_phm_rule = False 37 | phm_c_init = "normal" 38 | phm_rank = 1 39 | phm_init_range=0.01 40 | 41 | # prefix-tuning parameters. 42 | prefix_dim = 100 43 | init_prefix_from_vocab = False 44 | kronecker_prod = False 45 | 46 | # BitFit configuration. 47 | bitfit = False 48 | 49 | # Low-rank adapters. 50 | low_rank_adapters = False 51 | low_rank_w_init = "glorot-uniform" 52 | low_rank_rank = 1 53 | 54 | # whether using single adapter for all tasks 55 | use_single_adapter = True 56 | 57 | 58 | class MetaAdapterConfig(AdapterConfig): 59 | """Implements Meta adapter in which a hyper-network generates the parameters of 60 | adapter layers. In this case we have a task embeddings which is feed to the 61 | hyper-network to allow it generate the weights for the adapter layers.""" 62 | task_embedding_dim = 512 63 | task_embedding_dir = None 64 | hidden_dim = 128 65 | train_task_embeddings = False 66 | non_linearity: str = "gelu_new" 67 | projected_task_embedding_dim = 64 68 | task_hidden_dim = 128 69 | parametric_task_embedding = False 70 | # If Specified, uses one hypernet to generates the adapters weights. 71 | unique_hyper_net = True 72 | unique_hyper_net_layer_norm = True 73 | # We consider only one hyper-net for all the blocks of transformer. 74 | efficient_unique_hyper_net = False 75 | task_to_embeddings=None 76 | 77 | 78 | @dataclass 79 | class CompactorConfig(object): 80 | add_layer_norm_before_adapter: bool = False 81 | add_layer_norm_after_adapter: bool = False 82 | non_linearity: str = "gelu_new" 83 | reduction_factor: int = 16 84 | weight_init_range = 1e-2 85 | # Whether to use conditional layer norms for adapters. 86 | hidden_dim = 128 87 | # Whether to add adapter blocks, this is used in case we need 88 | # to tune only layer norms. 
89 | task_adapter_layers_encoder = None 90 | task_adapter_layers_decoder = None 91 | task_adapter_in_decoder = True 92 | intrinsic_dim = 100 93 | normalize_intrinsic_projections = False 94 | # This can be either random, or fastfood. 95 | intrinsic_projection = "random" 96 | 97 | # Hypercomplex adapters parameters 98 | hypercomplex_adapters = True 99 | hypercomplex_division = 4 100 | train_task_adapters = True 101 | learn_phm = True 102 | hypercomplex_nonlinearity="glorot-uniform" 103 | shared_phm_rule = True 104 | factorized_phm = True 105 | shared_W_phm = False 106 | factorized_phm_rule = False 107 | phm_c_init = "normal" 108 | phm_rank = 1 109 | phm_init_range=0.0001 110 | 111 | # prefix-tuning parameters. 112 | prefix_dim = 100 113 | init_prefix_from_vocab = False 114 | kronecker_prod = False 115 | 116 | # BitFit configuration. 117 | bitfit = False 118 | 119 | # Low-rank adapters. 120 | low_rank_adapters = False 121 | low_rank_w_init = "glorot-uniform" 122 | low_rank_rank = 1 123 | 124 | # whether using single adapter for all tasks 125 | use_single_adapter = False 126 | 127 | 128 | @dataclass 129 | class LRAdapterConfig(object): 130 | add_layer_norm_before_adapter: bool = False 131 | add_layer_norm_after_adapter: bool = False 132 | non_linearity: str = "gelu_new" 133 | reduction_factor: int = 16 134 | weight_init_range = 1e-2 135 | # Whether to use conditional layer norms for adapters. 136 | hidden_dim = 128 137 | # Whether to add adapter blocks, this is used in case we need 138 | # to tune only layer norms. 139 | task_adapter_layers_encoder = None 140 | task_adapter_layers_decoder = None 141 | task_adapter_in_decoder = True 142 | intrinsic_dim = 100 143 | normalize_intrinsic_projections = False 144 | # This can be either random, or fastfood. 145 | intrinsic_projection = "random" 146 | 147 | # Hypercomplex adapters parameters 148 | hypercomplex_adapters = False 149 | hypercomplex_division = 4 150 | train_task_adapters = True 151 | learn_phm = True 152 | hypercomplex_nonlinearity="glorot-uniform" 153 | shared_phm_rule = True 154 | factorized_phm = True 155 | shared_W_phm = False 156 | factorized_phm_rule = False 157 | phm_c_init = "normal" 158 | phm_rank = 1 159 | phm_init_range=0.0001 160 | 161 | # prefix-tuning parameters. 162 | prefix_dim = 100 163 | init_prefix_from_vocab = False 164 | kronecker_prod = False 165 | 166 | # BitFit configuration. 167 | bitfit = False 168 | 169 | # Low-rank adapters. 
170 | low_rank_adapters = True 171 | low_rank_w_init = "glorot-uniform" 172 | low_rank_rank = 1 173 | 174 | # whether using single adapter for all tasks 175 | use_single_adapter = False -------------------------------------------------------------------------------- /CLIP-ViL/src/lxrt/adapters/hypercomplex/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ylsung/VL_adapter/545fcbbdbbaec4c442de35567f6ae477ff4e8265/CLIP-ViL/src/lxrt/adapters/hypercomplex/__init__.py -------------------------------------------------------------------------------- /CLIP-ViL/src/lxrt/adapters/hypercomplex/inits.py: -------------------------------------------------------------------------------- 1 | # The codes are from https://github.com/bayer-science-for-a-better-life/phc-gnn 2 | import torch 3 | import math 4 | 5 | 6 | def glorot_normal(tensor: torch.Tensor): 7 | return torch.nn.init.xavier_normal_(tensor, gain=math.sqrt(2)) 8 | 9 | def glorot_uniform(tensor: torch.Tensor): 10 | return torch.nn.init.xavier_uniform_(tensor, gain=math.sqrt(2)) -------------------------------------------------------------------------------- /CLIP-ViL/src/lxrt/adapters/hypercomplex/kronecker.py: -------------------------------------------------------------------------------- 1 | # The codes are from https://github.com/bayer-science-for-a-better-life/phc-gnn 2 | import torch 3 | 4 | # TODO: change this with torch.kron 5 | """A part of the pylabyk library: numpytorch.py at https://github.com/yulkang/pylabyk""" 6 | def kronecker_product(a, b): 7 | """ 8 | Kronecker product of matrices a and b with leading batch dimensions. 9 | Batch dimensions are broadcast. The number of them mush 10 | :type a: torch.Tensor 11 | :type b: torch.Tensor 12 | :rtype: torch.Tensor 13 | """ 14 | #return torch.stack([torch.kron(ai, bi) for ai, bi in zip(a,b)], dim=0) 15 | siz1 = torch.Size(torch.tensor(a.shape[-2:]) * torch.tensor(b.shape[-2:])) 16 | res = a.unsqueeze(-1).unsqueeze(-3) * b.unsqueeze(-2).unsqueeze(-4) 17 | siz0 = res.shape[:-4] 18 | out = res.reshape(siz0 + siz1) 19 | return out 20 | 21 | 22 | def kronecker_product_einsum_batched(A: torch.Tensor, B: torch.Tensor): 23 | """ 24 | Batched Version of Kronecker Products 25 | :param A: has shape (b, a, c) 26 | :param B: has shape (b, k, p) 27 | :return: (b, ak, cp) 28 | """ 29 | assert A.dim() == 3 and B.dim() == 3 30 | res = torch.einsum('bac,bkp->bakcp', A, B).view(A.size(0), 31 | A.size(1)*B.size(1), 32 | A.size(2)*B.size(2)) 33 | return res -------------------------------------------------------------------------------- /CLIP-ViL/src/lxrt/adapters/low_rank_layer.py: -------------------------------------------------------------------------------- 1 | """This script implements a low-rank linear layer.""" 2 | import torch 3 | import torch.nn as nn 4 | 5 | from .hypercomplex.inits import glorot_uniform, glorot_normal 6 | 7 | class LowRankLinear(torch.nn.Module): 8 | def __init__(self, input_dim: int, output_dim: int, rank: int = 1, 9 | bias: bool = True, w_init: str = "glorot-uniform"): 10 | super(LowRankLinear, self).__init__() 11 | self.input_dim = input_dim 12 | self.output_dim = output_dim 13 | self.rank = rank 14 | self.bias = bias 15 | self.w_init = w_init 16 | self.W_left = nn.Parameter(torch.Tensor(size=(input_dim, rank)), requires_grad=True) 17 | self.W_right = nn.Parameter(torch.Tensor(size=(rank, output_dim)), requires_grad=True) 18 | if bias: 19 | self.b = nn.Parameter(torch.Tensor(output_dim)) 20 | 
self.reset_parameters() 21 | 22 | def reset_parameters(self): 23 | if self.bias: 24 | self.b.data = torch.zeros_like(self.b.data) 25 | if self.w_init == "glorot-uniform": 26 | self.W_left.data = glorot_uniform(self.W_left.data) 27 | self.W_right.data = glorot_uniform(self.W_right.data) 28 | elif self.w_init == "glorot-normal": 29 | self.W_left.data = glorot_normal(self.W_left.data) 30 | self.W_right.data = glorot_normal(self.W_right.data) 31 | else: 32 | raise ValueError 33 | 34 | def forward(self, x: torch.Tensor) -> torch.Tensor: 35 | W = self.W_left.matmul(self.W_right) 36 | output = torch.matmul(input=x, other=W) 37 | if self.bias: 38 | output += self.b 39 | return output 40 | -------------------------------------------------------------------------------- /CLIP-ViL/src/lxrt/visual_transformers.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import json 3 | import logging 4 | import math 5 | import os 6 | import shutil 7 | import tarfile 8 | import tempfile 9 | import sys 10 | from io import open 11 | import torch.nn.functional as F 12 | 13 | import torch 14 | from torch import nn 15 | from torch.nn import CrossEntropyLoss, SmoothL1Loss 16 | import numpy as np 17 | def resize_pos_embed(posemb, posemb_new): 18 | # Rescale the grid of position embeddings when loading from state_dict. Adapted from 19 | # https://github.com/google-research/vision_transformer/blob/00883dd691c63a6830751563748663526e811cee/vit_jax/checkpoint.py#L224 20 | ntok_new = posemb_new.shape[1] 21 | if True: 22 | posemb_tok, posemb_grid = posemb[:, :1], posemb[0, 1:] 23 | ntok_new -= 1 24 | else: 25 | posemb_tok, posemb_grid = posemb[:, :0], posemb[0] 26 | gs_old = int(math.sqrt(len(posemb_grid))) 27 | gs_new = int(math.sqrt(ntok_new)) 28 | #_logger.info('Position embedding grid-size from %s to %s', gs_old, gs_new) 29 | posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, -1).permute(0, 3, 1, 2) 30 | posemb_grid = F.interpolate(posemb_grid, size=(gs_new, gs_new), mode='bilinear') 31 | posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_new * gs_new, -1) 32 | posemb = torch.cat([posemb_tok, posemb_grid], dim=1) 33 | return posemb 34 | 35 | def initialize_clip(VISUAL_CONFIG, num_patches = 240, adapter_config=None): 36 | import clip 37 | clip_model, preprocess = clip.load(VISUAL_CONFIG.clip_model_name, jit=False, adapter_config=adapter_config) 38 | if VISUAL_CONFIG.clip_model_name == "ViT-B/32" and VISUAL_CONFIG.reset_pos_embedding: 39 | 40 | #from timm.models.vision_transformer import resize_pos_embed 41 | pos_embed = nn.Parameter(torch.zeros(num_patches + 1, 768).float()) 42 | pos_embed.weight = resize_pos_embed(clip_model.visual.positional_embedding.unsqueeze(0), pos_embed.unsqueeze(0)) 43 | clip_model.visual.positional_embedding = pos_embed 44 | # model.visual.positional_embedding = model.visual.positional_embedding.to("cuda") 45 | #print(model.visual.positional_embedding.device) 46 | # pass 47 | if VISUAL_CONFIG.freeze_clip: 48 | for parameter in clip_model.parameters(): 49 | parameter.requires_grad = False 50 | return clip_model 51 | 52 | def initialize_vit(VISUAL_CONFIG, model_type = "ViT-B_32", pretrained_dir = "data/ViT-B_32.npz", img_size = (384, 640), num_patches = 240): 53 | from vit.models.modeling import VisionTransformer, CONFIGS 54 | config = CONFIGS[model_type] 55 | model = VisionTransformer(config, img_size = 224, zero_head=True, num_classes=1) 56 | model.load_from(np.load(pretrained_dir)) 57 | 58 | pos_embed = 
nn.Parameter(torch.zeros(num_patches + 1, 768).float()) 59 | pos_embed.weight = resize_pos_embed(model.transformer.embeddings.position_embeddings, pos_embed.unsqueeze(0)) 60 | model.transformer.embeddings.position_embeddings = pos_embed 61 | if VISUAL_CONFIG.freeze_clip: 62 | for parameter in model.parameters(): 63 | parameter.requires_grad = False 64 | return model 65 | 66 | def initialize_optimizer(visual_model, lr, momentum, weight_decay): 67 | optimizer = torch.optim.SGD(visual_model.parameters(), lr, 68 | momentum=momentum, 69 | weight_decay=weight_decay) 70 | return optimizer 71 | 72 | def adjust_learning_rate(optimizer, epoch, args): 73 | """Decay the learning rate based on schedule""" 74 | lr = args.sgd_lr 75 | 76 | for milestone in args.schedule: 77 | lr *= 0.1 if epoch >= milestone else 1. 78 | for param_group in optimizer.param_groups: 79 | param_group['lr'] = lr 80 | 81 | from torch.optim import Optimizer 82 | 83 | class FusedOptimizer(Optimizer): 84 | def __init__(self, optimizers): 85 | self.optimizers = optimizers 86 | param_groups = [] 87 | for optimizer in self.optimizers: 88 | param_groups += optimizer.param_groups 89 | #super(FusedOptimizer, self).__init__([], {}) 90 | self.param_groups = param_groups 91 | 92 | def step(self): 93 | for optimizer in self.optimizers: 94 | optimizer.step() 95 | -------------------------------------------------------------------------------- /CLIP-ViL/src/pretrain/qa_answer_table.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyleft 2019 project LXRT. 3 | 4 | import json 5 | import torch 6 | 7 | 8 | class AnswerTable: 9 | ANS_CONVERT = { 10 | "a man": "man", 11 | "the man": "man", 12 | "a woman": "woman", 13 | "the woman": "woman", 14 | 'one': '1', 15 | 'two': '2', 16 | 'three': '3', 17 | 'four': '4', 18 | 'five': '5', 19 | 'six': '6', 20 | 'seven': '7', 21 | 'eight': '8', 22 | 'nine': '9', 23 | 'ten': '10', 24 | 'grey': 'gray', 25 | } 26 | 27 | def __init__(self, dsets=None): 28 | self.all_ans = json.load(open("data/lxmert/all_ans.json")) 29 | if dsets is not None: 30 | dsets = set(dsets) 31 | # If the answer is used in the dsets 32 | self.anss = [ans['ans'] for ans in self.all_ans if 33 | len(set(ans['dsets']) & dsets) > 0] 34 | else: 35 | self.anss = [ans['ans'] for ans in self.all_ans] 36 | self.ans_set = set(self.anss) 37 | 38 | self._id2ans_map = self.anss 39 | self._ans2id_map = {ans: ans_id for ans_id, ans in enumerate(self.anss)} 40 | 41 | assert len(self._id2ans_map) == len(self._ans2id_map) 42 | for ans_id, ans in enumerate(self._id2ans_map): 43 | assert self._ans2id_map[ans] == ans_id 44 | 45 | def convert_ans(self, ans): 46 | if len(ans) == 0: 47 | return "" 48 | ans = ans.lower() 49 | if ans[-1] == '.': 50 | ans = ans[:-1].strip() 51 | if ans.startswith("a "): 52 | ans = ans[2:].strip() 53 | if ans.startswith("an "): 54 | ans = ans[3:].strip() 55 | if ans.startswith("the "): 56 | ans = ans[4:].strip() 57 | if ans in self.ANS_CONVERT: 58 | ans = self.ANS_CONVERT[ans] 59 | return ans 60 | 61 | def ans2id(self, ans): 62 | return self._ans2id_map[ans] 63 | 64 | def id2ans(self, ans_id): 65 | return self._id2ans_map[ans_id] 66 | 67 | def ans2id_map(self): 68 | return self._ans2id_map.copy() 69 | 70 | def id2ans_map(self): 71 | return self._id2ans_map.copy() 72 | 73 | def used(self, ans): 74 | return ans in self.ans_set 75 | 76 | def all_answers(self): 77 | return self.anss.copy() 78 | 79 | @property 80 | def num_answers(self): 81 | return len(self.anss) 82 | 83 | from 
tools.load_stagte_dict import load_state_dict_flexible, load_state_dict_flexible_with_fp16 84 | def load_lxmert_qa(path, model, label2ans): 85 | """ 86 | Load model weights from LXMERT pre-training. 87 | The answers in the fine-tuned QA task (indicated by label2ans) 88 | would also be properly initialized with LXMERT pre-trained 89 | QA heads. 90 | 91 | :param path: Path to LXMERT snapshot. 92 | :param model: LXRT model instance. 93 | :param label2ans: The label2ans dict of fine-tuned QA datasets, like 94 | {0: 'cat', 1: 'dog', ...} 95 | :return: 96 | """ 97 | print("Load QA pre-trained LXMERT from %s " % path) 98 | loaded_state_dict = torch.load("%s_LXRT.pth" % path, "cpu") 99 | model_state_dict = model.state_dict() 100 | 101 | # Handle Multi-GPU pre-training --> Single GPU fine-tuning 102 | for key in list(loaded_state_dict.keys()): 103 | loaded_state_dict[key.replace("module.", '')] = loaded_state_dict.pop(key) 104 | 105 | # Isolate bert model 106 | bert_state_dict = {} 107 | for key, value in loaded_state_dict.items(): 108 | if key.startswith('bert.'): 109 | bert_state_dict[key] = value 110 | 111 | # Isolate answer head 112 | answer_state_dict = {} 113 | for key, value in loaded_state_dict.items(): 114 | if key.startswith("answer_head."): 115 | answer_state_dict[key.replace('answer_head.', '')] = value 116 | 117 | # Do surgery on answer state dict 118 | ans_weight = answer_state_dict['logit_fc.3.weight'] 119 | ans_bias = answer_state_dict['logit_fc.3.bias'] 120 | import copy 121 | new_answer_weight = copy.deepcopy(model_state_dict['logit_fc.3.weight']) 122 | new_answer_bias = copy.deepcopy(model_state_dict['logit_fc.3.bias']) 123 | answer_table = AnswerTable() 124 | loaded = 0 125 | unload = 0 126 | if type(label2ans) is list: 127 | label2ans = {label: ans for label, ans in enumerate(label2ans)} 128 | for label, ans in label2ans.items(): 129 | new_ans = answer_table.convert_ans(ans) 130 | if answer_table.used(new_ans): 131 | ans_id_9500 = answer_table.ans2id(new_ans) 132 | new_answer_weight[label] = ans_weight[ans_id_9500] 133 | new_answer_bias[label] = ans_bias[ans_id_9500] 134 | loaded += 1 135 | else: 136 | new_answer_weight[label] = 0. 137 | new_answer_bias[label] = 0. 138 | unload += 1 139 | print("Loaded %d answers from LXRTQA pre-training and %d not" % (loaded, unload)) 140 | print() 141 | answer_state_dict['logit_fc.3.weight'] = new_answer_weight 142 | answer_state_dict['logit_fc.3.bias'] = new_answer_bias 143 | 144 | # Load Bert Weights 145 | bert_model_keys = set(model.lxrt_encoder.model.state_dict().keys()) 146 | bert_loaded_keys = set(bert_state_dict.keys()) 147 | # assert len(bert_model_keys - bert_loaded_keys) == 0 148 | load_state_dict_flexible_with_fp16(model.lxrt_encoder.model, bert_state_dict) 149 | #model.lxrt_encoder.model.load_state_dict(bert_state_dict, strict=False) 150 | 151 | # Load Answer Logic FC Weights 152 | model_keys = set(model.state_dict().keys()) 153 | ans_loaded_keys = set(answer_state_dict.keys()) 154 | # assert len(ans_loaded_keys - model_keys) == 0 155 | 156 | #model.load_state_dict(answer_state_dict, strict=False) 157 | load_state_dict_flexible_with_fp16(model, answer_state_dict) 158 | 159 | 160 | 161 | -------------------------------------------------------------------------------- /CLIP-ViL/src/tasks/gqa_model.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyleft 2019 project LXRT. 
3 | 4 | import torch.nn as nn 5 | 6 | from param import args 7 | from lxrt.entry import LXRTEncoder 8 | from lxrt.modeling import BertLayerNorm, GeLU 9 | 10 | # Max length including and 11 | MAX_GQA_LENGTH = 20 12 | 13 | 14 | class GQAModel(nn.Module): 15 | def __init__(self, num_answers): 16 | super().__init__() 17 | self.lxrt_encoder = LXRTEncoder( 18 | args, 19 | max_seq_length=MAX_GQA_LENGTH 20 | ) 21 | hid_dim = self.lxrt_encoder.dim 22 | self.logit_fc = nn.Sequential( 23 | nn.Linear(hid_dim, hid_dim * 2), 24 | GeLU(), 25 | BertLayerNorm(hid_dim * 2, eps=1e-12), 26 | nn.Linear(hid_dim * 2, num_answers) 27 | ) 28 | self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights) 29 | self.task = "vqa" 30 | 31 | def forward(self, feat, pos, sent): 32 | """ 33 | b -- batch_size, o -- object_number, f -- visual_feature_size 34 | 35 | :param feat: (b, o, f) 36 | :param pos: (b, o, 4) 37 | :param sent: (b,) Type -- list of string 38 | :param leng: (b,) Type -- int numpy array 39 | :return: (b, num_answer) The logit of each answers. 40 | """ 41 | x = self.lxrt_encoder(sent, (feat, pos), task=self.task) 42 | logit = self.logit_fc(x) 43 | 44 | return logit 45 | 46 | 47 | -------------------------------------------------------------------------------- /CLIP-ViL/src/tasks/vqa_model.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyleft 2019 project LXRT. 3 | 4 | import torch.nn as nn 5 | 6 | from param import args 7 | from lxrt.entry import LXRTEncoder 8 | from lxrt.modeling import BertLayerNorm, GeLU 9 | 10 | # Max length including and 11 | MAX_VQA_LENGTH = 20 12 | 13 | 14 | class VQAModel(nn.Module): 15 | def __init__(self, num_answers): 16 | super().__init__() 17 | 18 | # Build LXRT encoder 19 | self.lxrt_encoder = LXRTEncoder( 20 | args, 21 | max_seq_length=MAX_VQA_LENGTH 22 | ) 23 | hid_dim = self.lxrt_encoder.dim 24 | 25 | # VQA Answer heads 26 | self.logit_fc = nn.Sequential( 27 | nn.Linear(hid_dim, hid_dim * 2), 28 | GeLU(), 29 | BertLayerNorm(hid_dim * 2, eps=1e-12), 30 | nn.Linear(hid_dim * 2, num_answers) 31 | ) 32 | self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights) 33 | 34 | self.task = "vqa" 35 | 36 | def forward(self, feat, pos, sent): 37 | """ 38 | b -- batch_size, o -- object_number, f -- visual_feature_size 39 | 40 | :param feat: (b, o, f) 41 | :param pos: (b, o, 4) 42 | :param sent: (b,) Type -- list of string 43 | :param leng: (b,) Type -- int numpy array 44 | :return: (b, num_answer) The logit of each answers. 
45 | """ 46 | # print(len(sent), feat.shape, pos.shape) 47 | x = self.lxrt_encoder(sent, (feat, pos), task=self.task) 48 | logit = self.logit_fc(x) 49 | 50 | return logit 51 | 52 | 53 | if __name__ == "__main__": 54 | model = VQAModel(4000) -------------------------------------------------------------------------------- /CLIP-ViL/src/tools/lmdb_dataset.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | class TrainingMeter(): 3 | def __init__(self): 4 | self.counter_dict = defaultdict(float) 5 | self.true_dict = defaultdict(float) 6 | 7 | def update(self, loss_dict): 8 | for key, item in loss_dict.items(): 9 | self.counter_dict[key] += 1 10 | self.true_dict[key] += item 11 | 12 | def report(self, logger = None): 13 | keys = list(self.counter_dict.keys()) 14 | keys.sort() 15 | for key in keys: 16 | if logger is None: 17 | print(" {} : {:.7}".format(key, self.true_dict[key] / self.counter_dict[key])) 18 | else: 19 | logger.info(" {} : {:.7}".format(key, self.true_dict[key] / self.counter_dict[key])) 20 | 21 | def clean(self): 22 | self.counter_dict = defaultdict(float) 23 | self.true_dict = defaultdict(float) 24 | 25 | 26 | from lz4.frame import compress, decompress 27 | from collections import defaultdict 28 | from contextlib import contextmanager 29 | import io 30 | import json 31 | from os.path import exists 32 | import msgpack 33 | import msgpack_numpy 34 | import collections 35 | import lmdb 36 | msgpack_numpy.patch() 37 | 38 | class TxtLmdb(object): 39 | def __init__(self, db_dir, readonly=True, readahead=False): 40 | self.readonly = readonly 41 | if readonly: 42 | # training 43 | self.env = lmdb.open(db_dir, 44 | readonly=True, create=False, 45 | readahead=readahead) 46 | self.txn = self.env.begin(buffers=True) 47 | self.write_cnt = None 48 | else: 49 | # prepro 50 | self.env = lmdb.open(db_dir, readonly=False, create=True, 51 | map_size=4 * 1024**4) 52 | self.txn = self.env.begin(write=True) 53 | self.write_cnt = 0 54 | 55 | def __del__(self): 56 | if self.write_cnt: 57 | self.txn.commit() 58 | self.env.close() 59 | 60 | def __getitem__(self, key): 61 | return msgpack.loads(decompress(self.txn.get(key.encode('utf-8'))), 62 | raw=False) 63 | 64 | def __setitem__(self, key, value): 65 | # NOTE: not thread safe 66 | if self.readonly: 67 | raise ValueError('readonly text DB') 68 | ret = self.txn.put(key.encode('utf-8'), 69 | compress(msgpack.dumps(value, use_bin_type=True))) 70 | self.write_cnt += 1 71 | if self.write_cnt % 1000 == 0: 72 | self.txn.commit() 73 | self.txn = self.env.begin(write=True) 74 | self.write_cnt = 0 75 | return ret 76 | -------------------------------------------------------------------------------- /CLIP-ViL/src/tools/load_stagte_dict.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | def load_state_dict_flexible(model, state_dict): 4 | try: 5 | model.load_state_dict(state_dict) 6 | except: 7 | print("Full loading failed!! 
Try partial loading!!") 8 | 9 | own_state = model.state_dict() 10 | 11 | for name, param in state_dict.items(): 12 | if name not in own_state: 13 | print("Skipped: " + name) 14 | continue 15 | if isinstance(param, torch.nn.Parameter): 16 | # backwards compatibility for serialized parameters 17 | param = param.data 18 | try: 19 | own_state[name].copy_(param) 20 | print("Successfully loaded: "+name) 21 | except: 22 | print("Part load failed: " + name) 23 | 24 | def load_state_dict_flexible_with_fp16(model, state_dict): 25 | try: 26 | model.load_state_dict(state_dict) 27 | except: 28 | print("Full loading failed!! Try partial loading!!") 29 | 30 | own_state = model.state_dict() 31 | 32 | for name, param in state_dict.items(): 33 | if name not in own_state: 34 | print("Skipped: " + name) 35 | continue 36 | if isinstance(param, torch.nn.Parameter): 37 | # backwards compatibility for serialized parameters 38 | param = param.data 39 | try: 40 | #print("Name {}, original_type: {}, load type".format(name, own_state[name].dtype, param.dtype)) 41 | param = param.to(own_state[name].device) 42 | own_state[name].copy_(param) 43 | print("Successfully loaded: "+name) 44 | except: 45 | print("Part load failed: " + name) -------------------------------------------------------------------------------- /CLIP-ViL/src/tools/resize_images.py: -------------------------------------------------------------------------------- 1 | folder = "/local/harold/ubert/clip_vlp/lxmert/data/mscoco/val2014/" 2 | root = "/local/harold/ubert/clip_vlp/lxmert/data/mscoco/val_" 3 | out_folder = "/local/harold/ubert/clip_vlp/lxmert/data/mscoco/val2014_small.lmdb" 4 | import torch 5 | import os 6 | import json 7 | from PIL import Image 8 | 9 | from tqdm import tqdm 10 | from vlm.vok_utilis import TxtLmdb 11 | import numpy as np 12 | def vokenize_and_cache_dataset(output_path, dataset, vokenizer, tokenizer): 13 | ## Let's use lmdb 14 | 15 | 16 | data_loader = DataLoader(dataset, shuffle=False, batch_size=1) 17 | for index, batch in enumerate(tqdm(data_loader)): 18 | top_scores, top_idxs, input_tokens, top_paths = vokenize_batch(batch, tokenizer, vokenizer) 19 | 20 | top_paths = top_paths[0] 21 | top_idxs = top_idxs[0].cpu().numpy().tolist() 22 | input_tokens = input_tokens[0] 23 | top_scores = top_scores[0].cpu().numpy().tolist() 24 | lmdb_dataset[str(index)] = { 25 | "top_paths": top_paths, 26 | "top_idxs": top_idxs, 27 | "input_tokens": input_tokens, 28 | "top_scores": top_scores 29 | } 30 | 31 | del lmdb_dataset 32 | 33 | from torchvision.transforms import Compose, CenterCrop, ToTensor, Normalize, ColorJitter 34 | from vision_helpers import Resize, PadToGivenSize 35 | 36 | min_size = 384 37 | max_size = 640 38 | flip_horizontal_prob = 0.0 39 | flip_vertical_prob = 0.0 40 | brightness = 0.0 41 | contrast = 0.0 42 | saturation = 0.0 43 | hue = 0.0 44 | color_jitter = ColorJitter( 45 | brightness=brightness, 46 | contrast=contrast, 47 | saturation=saturation, 48 | hue=hue, 49 | ) 50 | transform = Compose( 51 | [ 52 | Resize(min_size, max_size) 53 | #lambda image: image.convert("RGB"), 54 | #Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)), 55 | ] 56 | ) 57 | import copy 58 | import os 59 | import random 60 | 61 | import h5py 62 | import torch 63 | from torch.utils.data import DataLoader, Dataset 64 | from torch.nn.utils.rnn import pad_sequence 65 | from tqdm import tqdm 66 | 67 | #class ToyDataset(Dataset): 68 | # def __init__(self, ) 69 | all_image_files = [] 70 | for _, dirs, files in 
os.walk(folder, topdown=False): 71 | for image_file in tqdm(files): 72 | if image_file.endswith("jpg"): 73 | all_image_files.append(image_file) 74 | #with open(root+"image_ids.json", "w") as f: 75 | # json.dump(all_image_files, f) 76 | 77 | #with open("/local/harold/vqa/google_concetual/image_ids.json") as f: 78 | # all_image_files = json.load(f) 79 | 80 | from PIL import Image 81 | import io 82 | 83 | def image_to_byte_array(image): 84 | imgByteArr = io.BytesIO() 85 | image.save(imgByteArr, format="JPEG") 86 | imgByteArr = imgByteArr.getvalue() 87 | return imgByteArr 88 | 89 | def byte_array_to_image(byte): 90 | imgByteArr = io.BytesIO(byte) 91 | imgByteArr.seek(0) 92 | return Image.open(imgByteArr) 93 | 94 | from tqdm import tqdm 95 | lmdb_dataset = TxtLmdb(out_folder, readonly=False) 96 | valid_images = {} 97 | skipped = 0 98 | for image in tqdm(all_image_files): 99 | try: 100 | feats = transform(Image.open(os.path.join(folder, image))) # Raw image as a tensor: 3 x 224 x 224 101 | lmdb_dataset[image] = image_to_byte_array(feats) 102 | valid_images[image] = feats.size 103 | except KeyboardInterrupt: 104 | del lmdb_dataset 105 | assert (0) 106 | except: 107 | skipped += 1 108 | if skipped % 100 == 0: 109 | print("{} skipped.".format(skipped)) 110 | pass 111 | 112 | with open(root + "image_size.json", "w") as f: 113 | json.dump(valid_images, f) 114 | 115 | ''' 116 | all_image_files = [] 117 | for root, dirs, files in os.walk(folder, topdown=False): 118 | for image_file in files: 119 | if image_file.endswith("jpg"): 120 | all_image_files.append(image_file) 121 | with open("/local/harold/vqa/google_concetual/image_ids.json", "w") as f: 122 | json.dump(all_image_files, f)''' -------------------------------------------------------------------------------- /CLIP-ViL/src/utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyleft 2019 Project LXRT 3 | 4 | import sys 5 | import csv 6 | import base64 7 | import time 8 | 9 | import numpy as np 10 | from collections import defaultdict 11 | class TrainingMeter(): 12 | def __init__(self): 13 | self.counter_dict = defaultdict(float) 14 | self.true_dict = defaultdict(float) 15 | 16 | def update(self, loss_dict): 17 | for key, item in loss_dict.items(): 18 | self.counter_dict[key] += 1 19 | self.true_dict[key] += item 20 | 21 | def report(self, logger = None): 22 | keys = list(self.counter_dict.keys()) 23 | keys.sort() 24 | for key in keys: 25 | if logger is None: 26 | print(" {} : {:.7}".format(key, self.true_dict[key] / self.counter_dict[key])) 27 | else: 28 | logger.info(" {} : {:.7}".format(key, self.true_dict[key] / self.counter_dict[key])) 29 | 30 | def clean(self): 31 | self.counter_dict = defaultdict(float) 32 | self.true_dict = defaultdict(float) 33 | 34 | 35 | csv.field_size_limit(sys.maxsize) 36 | FIELDNAMES = ["img_id", "img_h", "img_w", "objects_id", "objects_conf", 37 | "attrs_id", "attrs_conf", "num_boxes", "boxes", "features"] 38 | 39 | 40 | def load_obj_tsv(fname, topk=None): 41 | """Load object features from tsv file. 42 | 43 | :param fname: The path to the tsv file. 44 | :param topk: Only load features for top K images (lines) in the tsv file. 45 | Will load all the features if topk is either -1 or None. 46 | :return: A list of image object features where each feature is a dict. 47 | See FILENAMES above for the keys in the feature dict. 
48 | """ 49 | data = [] 50 | start_time = time.time() 51 | print("Start to load Faster-RCNN detected objects from %s" % fname) 52 | with open(fname) as f: 53 | reader = csv.DictReader(f, FIELDNAMES, delimiter="\t") 54 | for i, item in enumerate(reader): 55 | 56 | for key in ['img_h', 'img_w', 'num_boxes']: 57 | item[key] = int(item[key]) 58 | 59 | boxes = item['num_boxes'] 60 | decode_config = [ 61 | ('objects_id', (boxes, ), np.int64), 62 | ('objects_conf', (boxes, ), np.float32), 63 | ('attrs_id', (boxes, ), np.int64), 64 | ('attrs_conf', (boxes, ), np.float32), 65 | ('boxes', (boxes, 4), np.float32), 66 | ('features', (boxes, -1), np.float32), 67 | ] 68 | for key, shape, dtype in decode_config: 69 | item[key] = np.frombuffer(base64.b64decode(item[key]), dtype=dtype) 70 | item[key] = item[key].reshape(shape) 71 | item[key].setflags(write=False) 72 | 73 | data.append(item) 74 | if topk is not None and len(data) == topk: 75 | break 76 | elapsed_time = time.time() - start_time 77 | print("Loaded %d images in file %s in %d seconds." % (len(data), fname, elapsed_time)) 78 | return data 79 | 80 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 YI-LIN SUNG 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # VL-Adapter 2 | 3 | * Authors: [Yi-Lin Sung](https://ylsung.github.io/), [Jaemin Cho](https://j-min.io/), [Mohit Bansal](https://www.cs.unc.edu/~mbansal/) 4 | * Paper: ["VL-Adapter: Parameter-Efficient Transfer Learning for Vision-and-Language Tasks"](https://arxiv.org/abs/2112.06825) (CVPR 2022) 5 | 6 | We evaluate VL-adapter in a unified multi-task 7 | setup on both image-text and video-text benchmarks. For the image-text tasks, we use four diverse V&L datasets: VQAv2, GQA, NLVR2, and MSCOCO image captioning. For video-text tasks, we use TVQA, How2QA, TVC, and YC2C. 8 | 9 | Our results demonstrate that training the adapter with the weight-sharing technique (4.18% of total parameters for image-text tasks and 3.39% for video-text tasks) can match 10 | the performance of fine-tuning the entire model. 
11 | 12 | ![](assets/vl_adapter_teaser.png) 13 | 14 | ** Note ** 15 | Please go into CLIP-ViL folder and follow the README there for running the experiments of adapters on CLIP-ViL. This README is for adapters on VL-Bart. 16 | 17 | 18 | ## Installation 19 | 20 | ``` 21 | # Create python environment (optional) 22 | conda create -n vlt5 python=3.8 23 | source activate vlt5 24 | 25 | # Install python dependencies 26 | pip install -r requirements.txt 27 | 28 | # Download T5/BART backbone checkpoint 29 | python download_backbones.py 30 | 31 | # For MSCOCO captioning evaluation (optional; for captioning only) 32 | python -c "import language_evaluation; language_evaluation.download('coco')" 33 | ``` 34 | 35 | ## Code structure 36 | ```bash 37 | # Store images, features, and annotations 38 | ./datasets 39 | COCO/ 40 | images/ 41 | clip_featuers/ 42 | VG/ 43 | images/ 44 | clip_features/ 45 | GQA/ 46 | images/ 47 | clip_features/ 48 | nlvr/ 49 | images/ 50 | clip_features/ 51 | vqa/ 52 | lxmert/ 53 | 54 | video/ 55 | ann/ 56 | vis_features 57 | 58 | # Train VL-T5 with adapters 59 | ./VL-T5/ 60 | src/ 61 | modeling_t5.py modeling_bart.py <= VL-T5/VL-BART model classes 62 | pretrain.py, pretrain_data.py, pretrain_model.py <= pretraining 63 | vqa.py, vqa_data.py vqa_model.py ... <= fine-tuning on downstream tasks (ex. VQA, GQA, NLVR2) 64 | multitask.py, multitask_data.py multiask_model.py <= multitask learning on 7 downstream tasks 65 | param.py <= (argparse) configuration 66 | tokenization.py <= custom tokenizer 67 | utils.py, dist_utils.py <= utility functions 68 | snap/ <= store weight checkpoints 69 | scripts/ <= bash scripts for pretraining and finetuning 70 | ``` 71 | 72 | ## Data 73 | 74 | ### Image-text dataset 75 | Please go to [link](https://drive.google.com/file/d/1O_RU1iFh_sbItZCTkOHUrbVIQQ_89Djj/view?usp=sharing) to download the processed CLIP features. We suggest to use [gdrive](https://github.com/prasmussen/gdrive) to download it. Unzip the downloaded file and arrange the folders following the format which is shown in the "Code Structure." 76 | 77 | If you would like to use dgrive to download the data, please try the following command 78 | 79 | ``` 80 | gdrive download 1O_RU1iFh_sbItZCTkOHUrbVIQQ_89Djj 81 | ``` 82 | 83 | ### Extract your own CLIP features 84 | Please refer to `feature_extraction` for more details. 85 | 86 | ### Video-text dataset 87 | Please go to [VALUE](https://github.com/VALUE-Leaderboard/DataRelease) to download the ViT processed data. 88 | 89 | ## Run different approaches 90 | The following scripts can run every approach with the best hyper-parameters. 
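In every script, the first positional argument is the number of GPUs (it is passed to `torch.distributed.launch` as `--nproc_per_node`), and any arguments after it are forwarded to the training script via `${@:2}`. For example, assuming argparse keeps the last occurrence of a repeated flag so the forwarded value overrides the script default:

```bash
# Run the single-adapter image experiment on 2 GPUs and override the epoch count.
bash scripts/image/single_adapter.sh 2 --epochs 10
```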
91 | 92 | ### Image dataset 93 | 94 | ```bash 95 | # Full fine-tuning 96 | cd VL-T5/ 97 | bash scripts/image/full_finetuning.sh 1 98 | 99 | # Single Adapter 100 | cd VL-T5/ 101 | bash scripts/image/single_adapter.sh 1 102 | 103 | # Multiple Adapters 104 | cd VL-T5/ 105 | bash scripts/image/multiple_adapters.sh 1 106 | 107 | # Hyperformer 108 | cd VL-T5/ 109 | bash scripts/image/hyperformer.sh 1 110 | 111 | # Single Compacter 112 | cd VL-T5/ 113 | bash scripts/image/single_compacter.sh 1 114 | 115 | # Multiple Compacters 116 | cd VL-T5/ 117 | bash scripts/image/multiple_compacters.sh 1 118 | 119 | # Single LoRA 120 | cd VL-T5/ 121 | bash scripts/image/single_lora.sh 1 122 | 123 | # Multiple LoRA 124 | cd VL-T5/ 125 | bash scripts/image/multiple_lora.sh 1 126 | 127 | # Single Prompt 128 | cd VL-T5/ 129 | bash scripts/image/single_prompt.sh 1 130 | 131 | # Multiple Prompts 132 | cd VL-T5/ 133 | bash scripts/image/multiple_prompts.sh 1 134 | ``` 135 | 136 | ### Video dataset 137 | 138 | ```bash 139 | # Full fine-tuning 140 | cd VL-T5/ 141 | bash scripts/video/full_finetuning.sh 1 142 | 143 | # Single Adapter 144 | cd VL-T5/ 145 | bash scripts/video/single_adapter.sh 1 146 | 147 | # Single LoRA 148 | cd VL-T5/ 149 | bash scripts/video/single_lora.sh 1 150 | 151 | # Single Prompt 152 | cd VL-T5/ 153 | bash scripts/video/single_prompt.sh 1 154 | 155 | ``` 156 | 157 | 158 | ## Acknowledgement 159 | 160 | This repo is adapted from [VLT5](https://github.com/j-min/VL-T5). I also borrow some codes from [CLIP](https://github.com/openai/CLIP), [CLIP-ViL](https://github.com/clip-vil/CLIP-ViL), [Compacter](https://github.com/ylsung/compacter), [Hyperformer](https://github.com/rabeehk/hyperformer) and [Prefix-tuning](https://github.com/XiangLi1999/PrefixTuning). 161 | 162 | 163 | ## Reference 164 | 165 | Please cite our paper if you use our models in your project. 166 | 167 | ```bibtex 168 | @inproceedings{sung2022vladapter, 169 | title = {VL-Adapter: Parameter-Efficient Transfer Learning for Vision-and-Language Tasks}, 170 | author = {Yi-Lin Sung, Jaemin Cho, Mohit Bansal}, 171 | booktitle = {CVPR}, 172 | year = {2022} 173 | } 174 | ``` -------------------------------------------------------------------------------- /VL-T5/inference/README.md: -------------------------------------------------------------------------------- 1 | Utility scripts for inference on custom images. 2 | The Faster R-CNN inference scripts are adapted from [Huggingface transformers LXMERT example](https://github.com/huggingface/transformers/blob/master/examples/research_projects/lxmert/). 
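Based on the `getopt` options in `extracting_data.py` below (`-i/--inputdir`, `-o/--outfile`, `-b/--batch_size`, `-s/--subset_list`), a typical invocation might look like the following; the paths are placeholders, and the output file must not already exist:

```bash
python extracting_data.py -i ./custom_images -o ./datasets/custom_frcnn.arrow -b 2
```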
-------------------------------------------------------------------------------- /VL-T5/inference/extracting_data.py: -------------------------------------------------------------------------------- 1 | import getopt 2 | import json 3 | import os 4 | 5 | # import numpy as np 6 | import sys 7 | from collections import OrderedDict 8 | 9 | import datasets 10 | import numpy as np 11 | import torch 12 | 13 | from modeling_frcnn import GeneralizedRCNN 14 | from processing_image import Preprocess 15 | from utils import Config 16 | 17 | 18 | """ 19 | USAGE: 20 | ``python extracting_data.py -i -o .datasets `` 21 | """ 22 | 23 | 24 | TEST = False 25 | CONFIG = Config.from_pretrained("unc-nlp/frcnn-vg-finetuned") 26 | DEFAULT_SCHEMA = datasets.Features( 27 | OrderedDict( 28 | { 29 | "attr_ids": datasets.Sequence(length=CONFIG.MAX_DETECTIONS, feature=datasets.Value("float32")), 30 | "attr_probs": datasets.Sequence(length=CONFIG.MAX_DETECTIONS, feature=datasets.Value("float32")), 31 | "boxes": datasets.Array2D((CONFIG.MAX_DETECTIONS, 4), dtype="float32"), 32 | "img_id": datasets.Value("int32"), 33 | "obj_ids": datasets.Sequence(length=CONFIG.MAX_DETECTIONS, feature=datasets.Value("float32")), 34 | "obj_probs": datasets.Sequence(length=CONFIG.MAX_DETECTIONS, feature=datasets.Value("float32")), 35 | "roi_features": datasets.Array2D((CONFIG.MAX_DETECTIONS, 2048), dtype="float32"), 36 | "sizes": datasets.Sequence(length=2, feature=datasets.Value("float32")), 37 | "preds_per_image": datasets.Value(dtype="int32"), 38 | } 39 | ) 40 | ) 41 | 42 | 43 | class Extract: 44 | def __init__(self, argv=sys.argv[1:]): 45 | inputdir = None 46 | outputfile = None 47 | subset_list = None 48 | batch_size = 1 49 | opts, args = getopt.getopt(argv, "i:o:b:s", ["inputdir=", "outfile=", "batch_size=", "subset_list="]) 50 | for opt, arg in opts: 51 | if opt in ("-i", "--inputdir"): 52 | inputdir = arg 53 | elif opt in ("-o", "--outfile"): 54 | outputfile = arg 55 | elif opt in ("-b", "--batch_size"): 56 | batch_size = int(arg) 57 | elif opt in ("-s", "--subset_list"): 58 | subset_list = arg 59 | 60 | assert inputdir is not None # and os.path.isdir(inputdir), f"{inputdir}" 61 | assert outputfile is not None and not os.path.isfile(outputfile), f"{outputfile}" 62 | if subset_list is not None: 63 | with open(os.path.realpath(subset_list)) as f: 64 | self.subset_list = set(map(lambda x: self._vqa_file_split()[0], tryload(f))) 65 | else: 66 | self.subset_list = None 67 | 68 | self.config = CONFIG 69 | if torch.cuda.is_available(): 70 | self.config.model.device = "cuda" 71 | self.inputdir = os.path.realpath(inputdir) 72 | self.outputfile = os.path.realpath(outputfile) 73 | self.preprocess = Preprocess(self.config) 74 | self.model = GeneralizedRCNN.from_pretrained("unc-nlp/frcnn-vg-finetuned", config=self.config) 75 | self.batch = batch_size if batch_size != 0 else 1 76 | self.schema = DEFAULT_SCHEMA 77 | 78 | def _vqa_file_split(self, file): 79 | img_id = int(file.split(".")[0].split("_")[-1]) 80 | filepath = os.path.join(self.inputdir, file) 81 | return (img_id, filepath) 82 | 83 | @property 84 | def file_generator(self): 85 | batch = [] 86 | for i, file in enumerate(os.listdir(self.inputdir)): 87 | if self.subset_list is not None and i not in self.subset_list: 88 | continue 89 | batch.append(self._vqa_file_split(file)) 90 | if len(batch) == self.batch: 91 | temp = batch 92 | batch = [] 93 | yield list(map(list, zip(*temp))) 94 | 95 | for i in range(1): 96 | yield list(map(list, zip(*batch))) 97 | 98 | def __call__(self): 99 | # make 
writer 100 | if not TEST: 101 | writer = datasets.ArrowWriter(features=self.schema, path=self.outputfile) 102 | # do file generator 103 | for i, (img_ids, filepaths) in enumerate(self.file_generator): 104 | images, sizes, scales_yx = self.preprocess(filepaths) 105 | output_dict = self.model( 106 | images, 107 | sizes, 108 | scales_yx=scales_yx, 109 | padding="max_detections", 110 | max_detections=self.config.MAX_DETECTIONS, 111 | pad_value=0, 112 | return_tensors="np", 113 | location="cpu", 114 | ) 115 | output_dict["boxes"] = output_dict.pop("normalized_boxes") 116 | if not TEST: 117 | output_dict["img_id"] = np.array(img_ids) 118 | batch = self.schema.encode_batch(output_dict) 119 | writer.write_batch(batch) 120 | if TEST: 121 | break 122 | # finalizer the writer 123 | if not TEST: 124 | num_examples, num_bytes = writer.finalize() 125 | print(f"Success! You wrote {num_examples} entry(s) and {num_bytes >> 20} mb") 126 | 127 | 128 | def tryload(stream): 129 | try: 130 | data = json.load(stream) 131 | try: 132 | data = list(data.keys()) 133 | except Exception: 134 | data = [d["img_id"] for d in data] 135 | except Exception: 136 | try: 137 | data = eval(stream.read()) 138 | except Exception: 139 | data = stream.read().split("\n") 140 | return data 141 | 142 | 143 | if __name__ == "__main__": 144 | extract = Extract(sys.argv[1:]) 145 | extract() 146 | if not TEST: 147 | dataset = datasets.Dataset.from_file(extract.outputfile) 148 | # wala! 149 | # print(np.array(dataset[0:2]["roi_features"]).shape) 150 | -------------------------------------------------------------------------------- /VL-T5/requirements.txt: -------------------------------------------------------------------------------- 1 | ftfy 2 | timm -------------------------------------------------------------------------------- /VL-T5/scripts/image/full_finetuning.sh: -------------------------------------------------------------------------------- 1 | task=multitask 2 | 3 | # or bart 4 | model="bart" 5 | 6 | echo $model 7 | 8 | if [ $model == "t5" ] 9 | then 10 | folder_prefix="VLT5" 11 | backbone="t5-base" 12 | batch_size=300 13 | elif [ $model == "bart" ] 14 | then 15 | folder_prefix="VLBart" 16 | backbone="facebook/bart-base" 17 | batch_size=500 18 | fi 19 | 20 | echo $folder_prefix 21 | echo $backbone 22 | 23 | feature=RN101 24 | 25 | lr=1e-4 26 | name=4tasks_hard_${feature}_LMfull_bs${batch_size}_image224_lr${lr} 27 | output=snap/${folder_prefix}_${task}/$name 28 | 29 | TOKENIZERS_PARALLELISM=True PYTHONPATH=$PYTHONPATH:./src \ 30 | python -m torch.distributed.launch \ 31 | --nproc_per_node=$1 \ 32 | --master_port=26757 \ 33 | src/${task}.py \ 34 | --distributed --multiGPU \ 35 | --optim adamw \ 36 | --warmup_ratio 0.1 \ 37 | --clip_grad_norm 5 \ 38 | --lr ${lr} \ 39 | --epochs 20 \ 40 | --num_workers 4 \ 41 | --backbone ${backbone} \ 42 | --output $output ${@:2} \ 43 | --num_beams 5 \ 44 | --batch_size ${batch_size} \ 45 | --valid_batch_size ${batch_size} \ 46 | --unfreeze_language_model \ 47 | --tasks "vqa,gqa,nlvr,caption" \ 48 | --feature ${feature} --n_boxes 36 --downsample \ 49 | --image_size "(224,224)" \ 50 | --run_name $name 51 | -------------------------------------------------------------------------------- /VL-T5/scripts/image/hyperformer.sh: -------------------------------------------------------------------------------- 1 | task=multitask 2 | 3 | # or bart 4 | model="bart" 5 | 6 | echo $model 7 | 8 | if [ $model == "t5" ] 9 | then 10 | folder_prefix="VLT5" 11 | backbone="t5-base" 12 | batch_size=300 13 | elif [ 
$model == "bart" ] 14 | then 15 | folder_prefix="VLBart" 16 | backbone="facebook/bart-base" 17 | batch_size=500 18 | fi 19 | 20 | echo $folder_prefix 21 | echo $backbone 22 | 23 | feature=RN101 24 | 25 | lr=1e-3 26 | 27 | projected_task_embedding_dim=8 28 | 29 | name=4tasks_hard_${feature}_LMhyperformer${projected_task_embedding_dim}+r8+ln_bs${batch_size}_image224_lr${lr} 30 | output=snap/${folder_prefix}_${task}/$name 31 | 32 | TOKENIZERS_PARALLELISM=True PYTHONPATH=$PYTHONPATH:./src \ 33 | python -m torch.distributed.launch \ 34 | --nproc_per_node=$1 \ 35 | --master_port=26763 \ 36 | src/${task}.py \ 37 | --distributed --multiGPU \ 38 | --optim adamw \ 39 | --warmup_ratio 0.1 \ 40 | --clip_grad_norm 5 \ 41 | --lr ${lr} \ 42 | --epochs 20 \ 43 | --num_workers 4 \ 44 | --backbone ${backbone} \ 45 | --output $output ${@:2} \ 46 | --num_beams 5 \ 47 | --batch_size ${batch_size} \ 48 | --valid_batch_size ${batch_size} \ 49 | --use_hyperformer \ 50 | --unique_hyper_net \ 51 | --unfreeze_layer_norms \ 52 | --projected_task_embedding_dim ${projected_task_embedding_dim} \ 53 | --reduction_factor 8 \ 54 | --tasks "vqa,gqa,nlvr,caption" \ 55 | --feature ${feature} --n_boxes 36 --downsample \ 56 | --image_size "(224,224)" \ 57 | --run_name $name -------------------------------------------------------------------------------- /VL-T5/scripts/image/multiple_adapters.sh: -------------------------------------------------------------------------------- 1 | task=multitask 2 | 3 | # or bart 4 | model="bart" 5 | 6 | echo $model 7 | 8 | if [ $model == "t5" ] 9 | then 10 | folder_prefix="VLT5" 11 | backbone="t5-base" 12 | batch_size=300 13 | elif [ $model == "bart" ] 14 | then 15 | folder_prefix="VLBart" 16 | backbone="facebook/bart-base" 17 | batch_size=500 18 | fi 19 | 20 | echo $folder_prefix 21 | echo $backbone 22 | 23 | feature=RN101 24 | 25 | lr=3e-4 26 | name=4tasks_hard_${feature}_LMadapter+r8+ln_bs${batch_size}_image224_lr${lr} 27 | output=snap/${folder_prefix}_${task}/$name 28 | 29 | TOKENIZERS_PARALLELISM=True PYTHONPATH=$PYTHONPATH:./src \ 30 | python -m torch.distributed.launch \ 31 | --nproc_per_node=$1 \ 32 | --master_port=26757 \ 33 | src/${task}.py \ 34 | --distributed --multiGPU \ 35 | --optim adamw \ 36 | --warmup_ratio 0.1 \ 37 | --clip_grad_norm 5 \ 38 | --lr ${lr} \ 39 | --epochs 20 \ 40 | --num_workers 4 \ 41 | --backbone ${backbone} \ 42 | --output $output ${@:2} \ 43 | --num_beams 5 \ 44 | --batch_size ${batch_size} \ 45 | --valid_batch_size ${batch_size} \ 46 | --use_adapter \ 47 | --unfreeze_layer_norms \ 48 | --reduction_factor 8 \ 49 | --tasks "vqa,gqa,nlvr,caption" \ 50 | --feature ${feature} --n_boxes 36 --downsample \ 51 | --image_size "(224,224)" \ 52 | --run_name $name 53 | -------------------------------------------------------------------------------- /VL-T5/scripts/image/multiple_compacters.sh: -------------------------------------------------------------------------------- 1 | task=multitask 2 | 3 | # or bart 4 | model="bart" 5 | 6 | echo $model 7 | 8 | if [ $model == "t5" ] 9 | then 10 | folder_prefix="VLT5" 11 | backbone="t5-base" 12 | batch_size=300 13 | elif [ $model == "bart" ] 14 | then 15 | folder_prefix="VLBart" 16 | backbone="facebook/bart-base" 17 | batch_size=500 18 | fi 19 | 20 | echo $folder_prefix 21 | echo $backbone 22 | 23 | feature=RN101 24 | 25 | lr=1e-3 26 | 27 | hypercomplex_division=2 28 | 29 | name=4tasks_hard_${feature}_LMcompacter+hdiv${hypercomplex_division}+noshare+nofac+ln+prompt_bs${batch_size}_image224_lr${lr} 30 | 
output=snap/${folder_prefix}_${task}/$name 31 | 32 | TOKENIZERS_PARALLELISM=True PYTHONPATH=$PYTHONPATH:./src \ 33 | python -m torch.distributed.launch \ 34 | --nproc_per_node=$1 \ 35 | --master_port=26764 \ 36 | src/${task}.py \ 37 | --distributed --multiGPU \ 38 | --optim adamw \ 39 | --warmup_ratio 0.1 \ 40 | --clip_grad_norm 5 \ 41 | --lr ${lr} \ 42 | --epochs 20 \ 43 | --num_workers 4 \ 44 | --backbone ${backbone} \ 45 | --output $output ${@:2} \ 46 | --num_beams 5 \ 47 | --batch_size ${batch_size} \ 48 | --valid_batch_size ${batch_size} \ 49 | --use_compacter \ 50 | --shared_phm_rule False \ 51 | --factorized_phm False \ 52 | --unfreeze_layer_norms \ 53 | --hypercomplex_division ${hypercomplex_division} \ 54 | --reduction_factor 8 \ 55 | --tasks "vqa,gqa,nlvr,caption" \ 56 | --feature ${feature} --n_boxes 36 --downsample \ 57 | --image_size "(224,224)" \ 58 | --run_name $name 59 | -------------------------------------------------------------------------------- /VL-T5/scripts/image/multiple_lora.sh: -------------------------------------------------------------------------------- 1 | task=multitask 2 | 3 | # or bart 4 | model="bart" 5 | 6 | echo $model 7 | 8 | if [ $model == "t5" ] 9 | then 10 | folder_prefix="VLT5" 11 | backbone="t5-base" 12 | batch_size=300 13 | elif [ $model == "bart" ] 14 | then 15 | folder_prefix="VLBart" 16 | backbone="facebook/bart-base" 17 | batch_size=500 18 | fi 19 | 20 | echo $folder_prefix 21 | echo $backbone 22 | 23 | feature=RN101 24 | 25 | lr=1e-3 26 | 27 | lora_dim=128 28 | 29 | name=${feature}_LMmultilora${lora_dim}+lr${lr}_bs${batch_size}_image224 30 | output=snap/${folder_prefix}_${task}/$name 31 | 32 | TOKENIZERS_PARALLELISM=True PYTHONPATH=$PYTHONPATH:./src \ 33 | python -m torch.distributed.launch \ 34 | --nproc_per_node=$1 \ 35 | --master_port=26786 \ 36 | src/${task}.py \ 37 | --distributed --multiGPU \ 38 | --optim adamw \ 39 | --warmup_ratio 0.1 \ 40 | --clip_grad_norm 5 \ 41 | --lr ${lr} \ 42 | --epochs 20 \ 43 | --num_workers 4 \ 44 | --backbone ${backbone} \ 45 | --output $output ${@:2} \ 46 | --num_beams 5 \ 47 | --use_lora \ 48 | --lora_dim ${lora_dim} \ 49 | --batch_size ${batch_size} \ 50 | --valid_batch_size ${batch_size} \ 51 | --tasks "vqa,gqa,nlvr,caption" \ 52 | --feature ${feature} --n_boxes 36 --downsample \ 53 | --image_size "(224,224)" \ 54 | --run_name $name 55 | -------------------------------------------------------------------------------- /VL-T5/scripts/image/multiple_prompts.sh: -------------------------------------------------------------------------------- 1 | task=multitask 2 | 3 | # or bart 4 | model="bart" 5 | 6 | echo $model 7 | 8 | if [ $model == "t5" ] 9 | then 10 | folder_prefix="VLT5" 11 | backbone="t5-base" 12 | batch_size=300 13 | elif [ $model == "bart" ] 14 | then 15 | folder_prefix="VLBart" 16 | backbone="facebook/bart-base" 17 | batch_size=500 18 | fi 19 | 20 | echo $folder_prefix 21 | echo $backbone 22 | 23 | feature=RN101 24 | 25 | lr=1e-3 26 | name=4tasks_hard_${feature}_LMprompt40_bs${batch_size}_image224_lr${lr} 27 | output=snap/${folder_prefix}_${task}/$name 28 | 29 | TOKENIZERS_PARALLELISM=True PYTHONPATH=$PYTHONPATH:./src \ 30 | python -m torch.distributed.launch \ 31 | --nproc_per_node=$1 \ 32 | --master_port=26757 \ 33 | src/${task}.py \ 34 | --distributed --multiGPU \ 35 | --optim adamw \ 36 | --warmup_ratio 0.1 \ 37 | --clip_grad_norm 5 \ 38 | --lr ${lr} \ 39 | --epochs 20 \ 40 | --num_workers 4 \ 41 | --backbone ${backbone} \ 42 | --output $output ${@:2} \ 43 | --num_beams 5 \ 44 | 
--batch_size ${batch_size} \ 45 | --valid_batch_size ${batch_size} \ 46 | --encoder_prompt_len 40 \ 47 | --mid_dim 800 \ 48 | --tasks "vqa,gqa,nlvr,caption" \ 49 | --feature ${feature} --n_boxes 36 --downsample \ 50 | --image_size "(224,224)" \ 51 | --run_name $name 52 | -------------------------------------------------------------------------------- /VL-T5/scripts/image/single_adapter.sh: -------------------------------------------------------------------------------- 1 | task=multitask 2 | 3 | # or bart 4 | model="bart" 5 | 6 | echo $model 7 | 8 | if [ $model == "t5" ] 9 | then 10 | folder_prefix="VLT5" 11 | backbone="t5-base" 12 | batch_size=300 13 | elif [ $model == "bart" ] 14 | then 15 | folder_prefix="VLBart" 16 | backbone="facebook/bart-base" 17 | batch_size=500 18 | fi 19 | 20 | echo $folder_prefix 21 | echo $backbone 22 | 23 | feature=RN101 24 | 25 | lr=1e-3 26 | name=4tasks_hard_${feature}_LMOneadapter+r8+ln_bs${batch_size}_image224_lr${lr} 27 | output=snap/${folder_prefix}_${task}/$name 28 | 29 | TOKENIZERS_PARALLELISM=True PYTHONPATH=$PYTHONPATH:./src \ 30 | python -m torch.distributed.launch \ 31 | --nproc_per_node=$1 \ 32 | --master_port=26757 \ 33 | src/${task}.py \ 34 | --distributed --multiGPU \ 35 | --optim adamw \ 36 | --warmup_ratio 0.1 \ 37 | --clip_grad_norm 5 \ 38 | --lr ${lr} \ 39 | --epochs 20 \ 40 | --num_workers 4 \ 41 | --backbone ${backbone} \ 42 | --output $output ${@:2} \ 43 | --num_beams 5 \ 44 | --batch_size ${batch_size} \ 45 | --valid_batch_size ${batch_size} \ 46 | --use_adapter \ 47 | --unfreeze_layer_norms \ 48 | --reduction_factor 8 \ 49 | --use_single_adapter \ 50 | --use_tasks_prompts \ 51 | --tasks "vqa,gqa,nlvr,caption" \ 52 | --feature ${feature} --n_boxes 36 --downsample \ 53 | --image_size "(224,224)" \ 54 | --run_name $name 55 | -------------------------------------------------------------------------------- /VL-T5/scripts/image/single_compacter.sh: -------------------------------------------------------------------------------- 1 | task=multitask 2 | 3 | # or bart 4 | model="bart" 5 | 6 | echo $model 7 | 8 | if [ $model == "t5" ] 9 | then 10 | folder_prefix="VLT5" 11 | backbone="t5-base" 12 | batch_size=300 13 | elif [ $model == "bart" ] 14 | then 15 | folder_prefix="VLBart" 16 | backbone="facebook/bart-base" 17 | batch_size=500 18 | fi 19 | 20 | echo $folder_prefix 21 | echo $backbone 22 | 23 | feature=RN101 24 | 25 | lr=1e-3 26 | 27 | hypercomplex_division=2 28 | 29 | name=4tasks_hard_${feature}_LMOnecompacter+hdiv${hypercomplex_division}+noshare+nofac+ln+prompt_bs${batch_size}_image224_lr${lr} 30 | output=snap/${folder_prefix}_${task}/$name 31 | 32 | TOKENIZERS_PARALLELISM=True PYTHONPATH=$PYTHONPATH:./src \ 33 | python -m torch.distributed.launch \ 34 | --nproc_per_node=$1 \ 35 | --master_port=26764 \ 36 | src/${task}.py \ 37 | --distributed --multiGPU \ 38 | --optim adamw \ 39 | --warmup_ratio 0.1 \ 40 | --clip_grad_norm 5 \ 41 | --lr ${lr} \ 42 | --epochs 20 \ 43 | --num_workers 4 \ 44 | --backbone ${backbone} \ 45 | --output $output ${@:2} \ 46 | --num_beams 5 \ 47 | --batch_size ${batch_size} \ 48 | --valid_batch_size ${batch_size} \ 49 | --use_compacter \ 50 | --shared_phm_rule False \ 51 | --factorized_phm False \ 52 | --unfreeze_layer_norms \ 53 | --use_single_adapter \ 54 | --use_tasks_prompts \ 55 | --hypercomplex_division ${hypercomplex_division} \ 56 | --reduction_factor 8 \ 57 | --tasks "vqa,gqa,nlvr,caption" \ 58 | --feature ${feature} --n_boxes 36 --downsample \ 59 | --image_size "(224,224)" \ 60 | --run_name $name 61 | 
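The compacter scripts above (`--use_compacter` with `--hypercomplex_division 2`) replace each adapter's dense projections with PHM layers, whose weight matrices are sums of Kronecker products (see `src/adapters/hypercomplex/kronecker.py`). A minimal sketch of that construction, not the repository's actual layer code:

```python
import torch

def phm_weight(A, B):
    # A: (n, n, n) "rule" factors, B: (n, d_in // n, d_out // n) weight blocks,
    # with n = hypercomplex_division. The full (d_in, d_out) weight is the sum
    # of the n Kronecker products A[i] (x) B[i], so far fewer parameters are stored.
    n = A.size(0)
    return sum(torch.kron(A[i], B[i]) for i in range(n))
```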
-------------------------------------------------------------------------------- /VL-T5/scripts/image/single_lora.sh: -------------------------------------------------------------------------------- 1 | task=multitask 2 | 3 | # or bart 4 | model="bart" 5 | 6 | echo $model 7 | 8 | if [ $model == "t5" ] 9 | then 10 | folder_prefix="VLT5" 11 | backbone="t5-base" 12 | batch_size=300 13 | elif [ $model == "bart" ] 14 | then 15 | folder_prefix="VLBart" 16 | backbone="facebook/bart-base" 17 | batch_size=500 18 | fi 19 | 20 | echo $folder_prefix 21 | echo $backbone 22 | 23 | feature=RN101 24 | 25 | lr=1e-3 26 | 27 | lora_dim=128 28 | 29 | name=${feature}_LMsinglelora${lora_dim}+lr${lr}_bs${batch_size}_image224 30 | output=snap/${folder_prefix}_${task}/$name 31 | 32 | TOKENIZERS_PARALLELISM=True PYTHONPATH=$PYTHONPATH:./src \ 33 | python -m torch.distributed.launch \ 34 | --nproc_per_node=$1 \ 35 | --master_port=26786 \ 36 | src/${task}.py \ 37 | --distributed --multiGPU \ 38 | --optim adamw \ 39 | --warmup_ratio 0.1 \ 40 | --clip_grad_norm 5 \ 41 | --lr ${lr} \ 42 | --epochs 20 \ 43 | --num_workers 4 \ 44 | --backbone ${backbone} \ 45 | --output $output ${@:2} \ 46 | --num_beams 5 \ 47 | --use_lora \ 48 | --lora_dim ${lora_dim} \ 49 | --use_single_lora \ 50 | --use_tasks_prompts \ 51 | --batch_size ${batch_size} \ 52 | --valid_batch_size ${batch_size} \ 53 | --tasks "vqa,gqa,nlvr,caption" \ 54 | --feature ${feature} --n_boxes 36 --downsample \ 55 | --image_size "(224,224)" \ 56 | --run_name $name 57 | -------------------------------------------------------------------------------- /VL-T5/scripts/image/single_prompt.sh: -------------------------------------------------------------------------------- 1 | task=multitask 2 | 3 | # or bart 4 | model="bart" 5 | 6 | echo $model 7 | 8 | if [ $model == "t5" ] 9 | then 10 | folder_prefix="VLT5" 11 | backbone="t5-base" 12 | batch_size=300 13 | elif [ $model == "bart" ] 14 | then 15 | folder_prefix="VLBart" 16 | backbone="facebook/bart-base" 17 | batch_size=500 18 | fi 19 | 20 | echo $folder_prefix 21 | echo $backbone 22 | 23 | feature=RN101 24 | 25 | lr=1e-3 26 | name=4tasks_hard_${feature}_LMOneprompt40_bs${batch_size}_image224_lr${lr} 27 | output=snap/${folder_prefix}_${task}/$name 28 | 29 | TOKENIZERS_PARALLELISM=True PYTHONPATH=$PYTHONPATH:./src \ 30 | python -m torch.distributed.launch \ 31 | --nproc_per_node=$1 \ 32 | --master_port=26757 \ 33 | src/${task}.py \ 34 | --distributed --multiGPU \ 35 | --optim adamw \ 36 | --warmup_ratio 0.1 \ 37 | --clip_grad_norm 5 \ 38 | --lr ${lr} \ 39 | --epochs 20 \ 40 | --num_workers 4 \ 41 | --backbone ${backbone} \ 42 | --output $output ${@:2} \ 43 | --num_beams 5 \ 44 | --batch_size ${batch_size} \ 45 | --valid_batch_size ${batch_size} \ 46 | --encoder_prompt_len 40 \ 47 | --mid_dim 800 \ 48 | --use_single_prompt \ 49 | --use_tasks_prompts \ 50 | --tasks "vqa,gqa,nlvr,caption" \ 51 | --feature ${feature} --n_boxes 36 --downsample \ 52 | --image_size "(224,224)" \ 53 | --run_name $name 54 | -------------------------------------------------------------------------------- /VL-T5/scripts/video/full_finetuning.sh: -------------------------------------------------------------------------------- 1 | task=multitask_video 2 | 3 | # or bart 4 | model="bart" 5 | 6 | echo $model 7 | 8 | if [ $model == "t5" ] 9 | then 10 | folder_prefix="VLT5" 11 | backbone="t5-base" 12 | batch_size=30 13 | elif [ $model == "bart" ] 14 | then 15 | folder_prefix="VLBart" 16 | backbone="facebook/bart-base" 17 | batch_size=50 18 | fi 19 | 20 | 
echo $folder_prefix 21 | echo $backbone 22 | 23 | feature=ViT 24 | 25 | lr=3e-5 26 | 27 | name=${feature}_LMfull_bs${batch_size}_image224_lr${lr}_subs_epoch7 28 | output=snap/${folder_prefix}_${task}/$name 29 | 30 | TOKENIZERS_PARALLELISM=True PYTHONPATH=$PYTHONPATH:./src \ 31 | python -m torch.distributed.launch \ 32 | --nproc_per_node=$1 \ 33 | --master_port=26791 \ 34 | src/${task}.py \ 35 | --distributed --multiGPU \ 36 | --optim adamw \ 37 | --warmup_ratio 0.1 \ 38 | --clip_grad_norm 5 \ 39 | --lr ${lr} \ 40 | --epochs 7 \ 41 | --num_workers 4 \ 42 | --backbone ${backbone} \ 43 | --output $output ${@:2} \ 44 | --num_beams 5 \ 45 | --unfreeze_language_model \ 46 | --batch_size ${batch_size} \ 47 | --valid_batch_size ${batch_size} \ 48 | --use_tasks_prompts \ 49 | --tasks "tvqa,how2qa,tvc,yc2c" \ 50 | --feature ${feature} --n_boxes 64 --downsample \ 51 | --image_size "(224,224)" \ 52 | --run_name $name 53 | -------------------------------------------------------------------------------- /VL-T5/scripts/video/single_adapter.sh: -------------------------------------------------------------------------------- 1 | task=multitask_video 2 | 3 | # or bart 4 | model="bart" 5 | 6 | echo $model 7 | 8 | if [ $model == "t5" ] 9 | then 10 | folder_prefix="VLT5" 11 | backbone="t5-base" 12 | batch_size=30 13 | elif [ $model == "bart" ] 14 | then 15 | folder_prefix="VLBart" 16 | backbone="facebook/bart-base" 17 | batch_size=50 18 | fi 19 | 20 | echo $folder_prefix 21 | echo $backbone 22 | 23 | feature=ViT 24 | 25 | lr=3e-4 26 | 27 | name=${feature}_LMadapter_bs${batch_size}_image224_lr${lr}_subs_epoch7 28 | output=snap/${folder_prefix}_${task}/$name 29 | 30 | TOKENIZERS_PARALLELISM=True PYTHONPATH=$PYTHONPATH:./src \ 31 | python -m torch.distributed.launch \ 32 | --nproc_per_node=$1 \ 33 | --master_port=26792 \ 34 | src/${task}.py \ 35 | --distributed --multiGPU \ 36 | --optim adamw \ 37 | --warmup_ratio 0.1 \ 38 | --clip_grad_norm 5 \ 39 | --lr ${lr} \ 40 | --epochs 7 \ 41 | --num_workers 4 \ 42 | --backbone ${backbone} \ 43 | --output $output ${@:2} \ 44 | --num_beams 5 \ 45 | --use_adapter \ 46 | --use_single_adapter \ 47 | --unfreeze_layer_norms \ 48 | --reduction_factor 8 \ 49 | --batch_size ${batch_size} \ 50 | --valid_batch_size ${batch_size} \ 51 | --use_tasks_prompts \ 52 | --tasks "tvqa,how2qa,tvc,yc2c" \ 53 | --feature ${feature} --n_boxes 64 --downsample \ 54 | --image_size "(224,224)" \ 55 | --run_name $name 56 | -------------------------------------------------------------------------------- /VL-T5/scripts/video/single_lora.sh: -------------------------------------------------------------------------------- 1 | task=multitask_video 2 | 3 | # or bart 4 | model="bart" 5 | 6 | echo $model 7 | 8 | if [ $model == "t5" ] 9 | then 10 | folder_prefix="VLT5" 11 | backbone="t5-base" 12 | batch_size=300 13 | elif [ $model == "bart" ] 14 | then 15 | folder_prefix="VLBart" 16 | backbone="facebook/bart-base" 17 | batch_size=50 18 | fi 19 | 20 | echo $folder_prefix 21 | echo $backbone 22 | 23 | feature=ViT 24 | 25 | lr=3e-4 26 | 27 | name=${feature}_LMlora_bs${batch_size}_image224_lr${lr}_subs_epoch7 28 | output=snap/${folder_prefix}_${task}/$name 29 | 30 | TOKENIZERS_PARALLELISM=True PYTHONPATH=$PYTHONPATH:./src \ 31 | python -m torch.distributed.launch \ 32 | --nproc_per_node=$1 \ 33 | --master_port=26799 \ 34 | src/${task}.py \ 35 | --distributed --multiGPU \ 36 | --optim adamw \ 37 | --warmup_ratio 0.1 \ 38 | --clip_grad_norm 5 \ 39 | --lr ${lr} \ 40 | --epochs 7 \ 41 | --num_workers 4 \ 42 | 
--backbone ${backbone} \ 43 | --output $output ${@:2} \ 44 | --num_beams 5 \ 45 | --use_lora \ 46 | --use_single_lora \ 47 | --lora_dim 128 \ 48 | --batch_size ${batch_size} \ 49 | --valid_batch_size ${batch_size} \ 50 | --use_tasks_prompts \ 51 | --tasks "tvqa,how2qa,tvc,yc2c" \ 52 | --feature ${feature} --n_boxes 64 --downsample \ 53 | --image_size "(224,224)" \ 54 | --run_name $name 55 | -------------------------------------------------------------------------------- /VL-T5/scripts/video/single_prompt.sh: -------------------------------------------------------------------------------- 1 | task=multitask_video 2 | 3 | # or bart 4 | model="bart" 5 | 6 | echo $model 7 | 8 | if [ $model == "t5" ] 9 | then 10 | folder_prefix="VLT5" 11 | backbone="t5-base" 12 | batch_size=300 13 | elif [ $model == "bart" ] 14 | then 15 | folder_prefix="VLBart" 16 | backbone="facebook/bart-base" 17 | batch_size=50 18 | fi 19 | 20 | echo $folder_prefix 21 | echo $backbone 22 | 23 | feature=ViT 24 | 25 | lr=3e-4 26 | 27 | name=${feature}_LMprompt_bs${batch_size}_image224_lr${lr}_subs_epoch7 28 | output=snap/${folder_prefix}_${task}/$name 29 | 30 | TOKENIZERS_PARALLELISM=True PYTHONPATH=$PYTHONPATH:./src \ 31 | python -m torch.distributed.launch \ 32 | --nproc_per_node=$1 \ 33 | --master_port=26798 \ 34 | src/${task}.py \ 35 | --distributed --multiGPU \ 36 | --optim adamw \ 37 | --warmup_ratio 0.1 \ 38 | --clip_grad_norm 5 \ 39 | --lr ${lr} \ 40 | --epochs 7 \ 41 | --num_workers 4 \ 42 | --backbone ${backbone} \ 43 | --output $output ${@:2} \ 44 | --num_beams 5 \ 45 | --batch_size ${batch_size} \ 46 | --valid_batch_size ${batch_size} \ 47 | --encoder_prompt_len 40 \ 48 | --mid_dim 800 \ 49 | --use_single_prompt \ 50 | --use_tasks_prompts \ 51 | --tasks "tvqa,how2qa,tvc,yc2c" \ 52 | --feature ${feature} --n_boxes 64 --downsample \ 53 | --image_size "(224,224)" \ 54 | --run_name $name 55 | -------------------------------------------------------------------------------- /VL-T5/src/adapters/__init__.py: -------------------------------------------------------------------------------- 1 | # The codes are borrowed from https://github.com/rabeehk/compacter 2 | 3 | from .config import MetaAdapterConfig, AdapterConfig, CompactorConfig, LRAdapterConfig 4 | from .adapter_modeling import Adapter, HyperComplexAdapter, OutputAdapter 5 | from .adapter_controller import AdapterController, AdapterLayer, MetaLayersAdapterController, OutputParallelAdapterLayer 6 | from .adapter_hypernetwork import AdapterLayersHyperNetController, AdapterLayersOneHyperNetController 7 | from .adapter_utils import TaskEmbeddingController -------------------------------------------------------------------------------- /VL-T5/src/adapters/adapter_configuration.py: -------------------------------------------------------------------------------- 1 | """Implements the adapters and other parameter-efficient finetuning methods' configurations.""" 2 | 3 | from collections import OrderedDict 4 | from dataclasses import dataclass 5 | 6 | import torch.nn as nn 7 | 8 | @dataclass 9 | class AdapterConfig(object): 10 | """Implements the adapter configuration proposed by Houlsby et. al, 2019 11 | in https://arxiv.org/abs/1902.00751. 
12 | We additionally pass all the configuration of parameter-efficient finetuning 13 | methods with this config.""" 14 | add_layer_norm_before_adapter: bool = False 15 | add_layer_norm_after_adapter: bool = True 16 | non_linearity: str = "swish" 17 | task_reduction_factor: int = 16 18 | add_adapter_in_feed_forward = True 19 | add_adapter_in_self_attention = True 20 | hidden_dim = 128 21 | task_adapter_layers_encoder = None 22 | task_adapter_layers_decoder = None 23 | task_adapter_in_decoder = True 24 | intrinsic_dim = 100 25 | normalize_intrinsic_projections = False 26 | # This can be either random, or fastfood. 27 | intrinsic_projection = "random" 28 | 29 | # Hypercomplex adapters parameters 30 | hypercomplex_adapters = False 31 | hypercomplex_division = 8 32 | learn_phm = True 33 | hypercomplex_nonlinearity="glorot-uniform" 34 | shared_phm_rule = False 35 | factorized_phm = False 36 | shared_W_phm = False 37 | factorized_phm_rule = False 38 | phm_c_init = "normal" 39 | phm_rank = 1 40 | phm_init_range=0.01 41 | 42 | # prefix-tuning parameters. 43 | prefix_dim = 100 44 | init_prefix_from_vocab = False 45 | kronecker_prod = False 46 | 47 | # BitFit configuration. 48 | bitfit = False 49 | 50 | # Low-rank adapters. 51 | low_rank_adapters = False 52 | low_rank_w_init = "glorot-uniform" 53 | low_rank_rank = 1 54 | 55 | 56 | ADAPTER_CONFIG_MAPPING = OrderedDict( 57 | [("adapter", AdapterConfig)]) 58 | 59 | 60 | class AutoAdapterConfig(nn.Module): 61 | """Generic Adapter config class to instantiate different adapter configs.""" 62 | 63 | @classmethod 64 | def get(cls, config_name: str): 65 | if config_name in ADAPTER_CONFIG_MAPPING: 66 | return ADAPTER_CONFIG_MAPPING[config_name]() 67 | raise ValueError( 68 | "Unrecognized adapter config type identifier: {}. Should contain one of {}" 69 | .format(config_name, ", ".join(ADAPTER_CONFIG_MAPPING.keys()))) 70 | -------------------------------------------------------------------------------- /VL-T5/src/adapters/adapter_modeling.py: -------------------------------------------------------------------------------- 1 | """Implements an Adapter, Low-rank adapters and Hyper-adapter Layers.""" 2 | import torch.nn as nn 3 | from .adapter_utils import Activations 4 | 5 | from .hypercomplex.layers import PHMLinear 6 | from .low_rank_layer import LowRankLinear 7 | 8 | 9 | class LowRankAdapter(nn.Module): 10 | """This is the low-rank adapter, in which each adapter is composed of two rank-one matrices. 
11 | """ 12 | def __init__(self, config): 13 | super().__init__() 14 | self.config = config 15 | self.input_dim = config.input_dim 16 | self.down_sample_size = self.input_dim // config.reduction_factor 17 | self.activation = Activations(config.non_linearity.lower()) 18 | self.down_sampler = LowRankLinear(self.input_dim, self.down_sample_size, 19 | w_init=config.low_rank_w_init, 20 | rank=config.low_rank_rank) 21 | self.up_sampler = LowRankLinear(self.down_sample_size, self.input_dim, 22 | w_init=config.low_rank_w_init, 23 | rank=config.low_rank_rank) 24 | 25 | self.track_z = config.track_z 26 | 27 | def forward(self, x): 28 | z = self.down_sampler(x) 29 | z = self.activation(z) 30 | if self.track_z: 31 | self.z = z 32 | output = self.up_sampler(z) 33 | return output 34 | 35 | 36 | class Adapter(nn.Module): 37 | """Conventional Adapter layer, in which the weights of up and down sampler modules 38 | are parameters and are optimized.""" 39 | 40 | def __init__(self, config): 41 | super().__init__() 42 | self.config = config 43 | self.input_dim = config.d_model 44 | reduction_factor = config.reduction_factor 45 | self.down_sample_size = self.input_dim // reduction_factor 46 | self.activation = Activations(config.non_linearity.lower()) 47 | self.down_sampler = nn.Linear(self.input_dim, self.down_sample_size) 48 | self.up_sampler = nn.Linear(self.down_sample_size, self.input_dim) 49 | 50 | self.track_z = config.track_z 51 | 52 | def forward(self, x): 53 | z = self.down_sampler(x) 54 | z = self.activation(z) 55 | if self.track_z: 56 | self.z = z 57 | output = self.up_sampler(z) 58 | return output 59 | 60 | 61 | class OutputAdapter(nn.Module): 62 | """Conventional Adapter layer, in which the weights of up and down sampler modules 63 | are parameters and are optimized.""" 64 | 65 | def __init__(self, config, output_dim): 66 | super().__init__() 67 | self.config = config 68 | self.input_dim = config.d_model 69 | reduction_factor = 16 70 | self.down_sample_size = self.input_dim // reduction_factor 71 | self.activation = Activations(config.non_linearity.lower()) 72 | self.down_sampler = nn.Linear(self.input_dim, self.down_sample_size) 73 | self.up_sampler = nn.Linear(self.down_sample_size, output_dim) 74 | 75 | def forward(self, x): 76 | z = self.down_sampler(x) 77 | z = self.activation(z) 78 | output = self.up_sampler(z) 79 | return output 80 | 81 | def resize_up_sampler(self, resized_size): 82 | self.up_sampler = nn.Linear(self.down_sample_size, resized_size) 83 | 84 | 85 | class HyperComplexAdapter(nn.Module): 86 | """Hypercomplex Adapter layer, in which the weights of up and down sampler modules 87 | are parameters are 1/n times of the conventional adapter layers, where n is 88 | hypercomplex division number.""" 89 | 90 | def __init__(self, config): 91 | super().__init__() 92 | self.config = config 93 | self.input_dim = config.input_dim 94 | self.down_sample_size = self.input_dim // config.reduction_factor 95 | self.activation = Activations(config.non_linearity.lower()) 96 | self.down_sampler = PHMLinear(in_features=self.input_dim, 97 | out_features=self.down_sample_size, 98 | bias=True, 99 | c_init=config.phm_c_init, 100 | phm_dim=config.hypercomplex_division, 101 | learn_phm=config.learn_phm, 102 | w_init=config.hypercomplex_nonlinearity, 103 | shared_phm_rule=config.shared_phm_rule, 104 | factorized_phm=config.factorized_phm, 105 | shared_W_phm=config.shared_W_phm, 106 | factorized_phm_rule=config.factorized_phm_rule, 107 | phm_rank=config.phm_rank, 108 | phm_init_range=config.phm_init_range, 
109 | kronecker_prod=config.kronecker_prod) 110 | self.up_sampler = PHMLinear(in_features=self.down_sample_size, 111 | out_features=self.input_dim, 112 | bias=True, 113 | c_init=config.phm_c_init, 114 | phm_dim=config.hypercomplex_division, 115 | learn_phm=config.learn_phm, 116 | w_init=config.hypercomplex_nonlinearity, 117 | shared_phm_rule=config.shared_phm_rule, 118 | factorized_phm=config.factorized_phm, 119 | shared_W_phm=config.shared_W_phm, 120 | factorized_phm_rule=config.factorized_phm_rule, 121 | phm_rank=config.phm_rank, 122 | phm_init_range=config.phm_init_range, 123 | kronecker_prod=config.kronecker_prod) 124 | 125 | self.track_z = config.track_z 126 | 127 | def forward(self, x): 128 | z = self.down_sampler(x) 129 | z = self.activation(z) 130 | if self.track_z: 131 | self.z = z 132 | return self.up_sampler(z) -------------------------------------------------------------------------------- /VL-T5/src/adapters/adapter_outputs.py: -------------------------------------------------------------------------------- 1 | """Defines the output class for the adapter layers' parameters.""" 2 | import torch 3 | from dataclasses import dataclass 4 | 5 | 6 | @dataclass 7 | class SamplerOutput: 8 | """Base class for the base and weights of each adapter.""" 9 | weight: torch.FloatTensor = None 10 | bias: torch.FloatTensor = None 11 | 12 | 13 | @dataclass 14 | class LayerNormOutput: 15 | """Base class for the base and weights of the conditional 16 | layer norms.""" 17 | weight: torch.FloatTensor = None 18 | bias: torch.FloatTensor = None 19 | 20 | 21 | @dataclass 22 | class AdapterOutput: 23 | """Base class for each adapter weights""" 24 | up: SamplerOutput = None 25 | down: SamplerOutput = None 26 | pre_norm: LayerNormOutput = None 27 | post_norm: LayerNormOutput = None 28 | 29 | 30 | @dataclass 31 | class AdapterT5BlockOutput: 32 | """ 33 | Base class for adapter layer's outputs. 
34 | """ 35 | feed_forward: AdapterOutput = None 36 | self_attention: AdapterOutput = None 37 | cross_attention: AdapterOutput = None -------------------------------------------------------------------------------- /VL-T5/src/adapters/adapter_utils.py: -------------------------------------------------------------------------------- 1 | """Implementation of different utility functions for adapter layers.""" 2 | import torch 3 | import torch.nn as nn 4 | from transformers.activations import get_activation 5 | 6 | 7 | class Activations(nn.Module): 8 | def __init__(self, activation_type): 9 | super().__init__() 10 | self.f = get_activation(activation_type) 11 | 12 | def forward(self, x): 13 | return self.f(x) 14 | 15 | 16 | def init_linear_layer(linear_layer, std=1e-2): 17 | """Initializes the given linear module as explained in adapter paper.""" 18 | nn.init.normal_(linear_layer.weight, std=std) 19 | nn.init.zeros_(linear_layer.bias) 20 | 21 | 22 | def linear_layer(input_dim, output_dim, std=1e-2): 23 | """Generates a linear module and initializes it.""" 24 | linear = nn.Linear(input_dim, output_dim) 25 | init_linear_layer(linear, std=std) 26 | return linear 27 | 28 | 29 | class TaskHyperNet(nn.Module): 30 | """This module generates the task-embeddings from the initial feeded task embeddings.""" 31 | 32 | def __init__(self, config, input_dim): 33 | super(TaskHyperNet, self).__init__() 34 | self.task_hidden_dim = config.task_hidden_dim 35 | self.projected_task_embedding_dim = config.projected_task_embedding_dim 36 | self.task_embeding_generator = nn.Sequential( 37 | linear_layer(input_dim, self.task_hidden_dim), 38 | nn.ReLU(), 39 | linear_layer(self.task_hidden_dim, self.projected_task_embedding_dim)) 40 | 41 | def forward(self, task_embedding): 42 | task_embedding = task_embedding.view(-1) 43 | return self.task_embeding_generator(task_embedding).view(-1) 44 | 45 | 46 | class LayerNormHyperNet(nn.Module): 47 | """This module generates the weight and bias for the task conditioned layer norm.""" 48 | 49 | def __init__(self, config): 50 | super(LayerNormHyperNet, self).__init__() 51 | self.task_embedding_dim = config.projected_task_embedding_dim \ 52 | if config.train_task_embeddings else config.task_embedding_dim 53 | self.weight_generator = linear_layer(self.task_embedding_dim, config.input_dim) 54 | self.bias_generator = linear_layer(self.task_embedding_dim, config.input_dim) 55 | 56 | def forward(self, input): 57 | return self.weight_generator(input), self.bias_generator(input) 58 | 59 | 60 | class TaskEmbeddingController(nn.Module): 61 | """Main module controlling task embeddings.""" 62 | 63 | def __init__(self, config): 64 | super(TaskEmbeddingController, self).__init__() 65 | # self.device = config.device 66 | self.task_embedding_dim = config.task_embedding_dim 67 | self.tasks = config.tasks 68 | self.task_to_task_embeddings = {task: task for task in self.tasks} 69 | if config.task_to_embeddings is not None: 70 | self.task_to_task_embeddings = config.task_to_embeddings 71 | self.tasks = self.task_to_task_embeddings.values() 72 | self.set_task_embeddings(self.tasks) 73 | self.train_task_embeddings = config.train_task_embeddings 74 | if self.train_task_embeddings: 75 | self.task_hyper_net = TaskHyperNet(config) 76 | 77 | def get_task(self, task): 78 | return self.task_to_task_embeddings[task] 79 | 80 | def set_task_embeddings(self, tasks): 81 | self.task_to_embeddings = nn.ParameterDict(dict()) 82 | for task in tasks: 83 | task_embedding = torch.Tensor(torch.randn(self.task_embedding_dim)) 
84 | self.task_to_embeddings[task] = nn.Parameter(task_embedding) 85 | 86 | def forward(self, task): 87 | task_mapped = self.get_task(task) 88 | task_embedding = self.task_to_embeddings[task_mapped] 89 | if self.train_task_embeddings: 90 | return self.task_hyper_net(task_embedding) 91 | return task_embedding 92 | -------------------------------------------------------------------------------- /VL-T5/src/adapters/config.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | 4 | @dataclass 5 | class AdapterConfig(object): 6 | """Implements the adapter configuration proposed by Houlsby et. al, 2019 7 | in https://arxiv.org/abs/1902.00751.""" 8 | add_layer_norm_before_adapter: bool = False 9 | add_layer_norm_after_adapter: bool = False 10 | non_linearity: str = "gelu_new" 11 | reduction_factor: int = 16 12 | weight_init_range = 1e-2 13 | # Whether to use conditional layer norms for adapters. 14 | conditional_layer_norm = False 15 | hidden_dim = 128 16 | # Whether to add adapter blocks, this is used in case we need 17 | # to tune only layer norms. 18 | train_adapters_blocks = True 19 | 20 | task_adapter_layers_encoder = None 21 | task_adapter_layers_decoder = None 22 | task_adapter_in_decoder = True 23 | intrinsic_dim = 100 24 | normalize_intrinsic_projections = False 25 | # This can be either random, or fastfood. 26 | intrinsic_projection = "random" 27 | 28 | # Hypercomplex adapters parameters 29 | hypercomplex_adapters = False 30 | hypercomplex_division = 8 31 | learn_phm = True 32 | hypercomplex_nonlinearity="glorot-uniform" 33 | shared_phm_rule = False 34 | factorized_phm = False 35 | shared_W_phm = False 36 | factorized_phm_rule = False 37 | phm_c_init = "normal" 38 | phm_rank = 1 39 | phm_init_range=0.01 40 | 41 | # prefix-tuning parameters. 42 | prefix_dim = 100 43 | init_prefix_from_vocab = False 44 | kronecker_prod = False 45 | 46 | # BitFit configuration. 47 | bitfit = False 48 | 49 | # Low-rank adapters. 50 | low_rank_adapters = False 51 | low_rank_w_init = "glorot-uniform" 52 | low_rank_rank = 1 53 | 54 | # whether using single adapter for all tasks 55 | use_single_adapter = False 56 | 57 | 58 | class MetaAdapterConfig(AdapterConfig): 59 | """Implements Meta adapter in which a hyper-network generates the parameters of 60 | adapter layers. In this case we have a task embeddings which is feed to the 61 | hyper-network to allow it generate the weights for the adapter layers.""" 62 | task_embedding_dim = 512 63 | task_embedding_dir = None 64 | hidden_dim = 128 65 | train_task_embeddings = False 66 | non_linearity: str = "gelu_new" 67 | projected_task_embedding_dim = 64 68 | task_hidden_dim = 128 69 | parametric_task_embedding = False 70 | # If Specified, uses one hypernet to generates the adapters weights. 71 | unique_hyper_net = True 72 | unique_hyper_net_layer_norm = True 73 | # We consider only one hyper-net for all the blocks of transformer. 74 | efficient_unique_hyper_net = False 75 | task_to_embeddings=None 76 | 77 | 78 | @dataclass 79 | class CompactorConfig(object): 80 | add_layer_norm_before_adapter: bool = False 81 | add_layer_norm_after_adapter: bool = False 82 | non_linearity: str = "gelu_new" 83 | reduction_factor: int = 16 84 | weight_init_range = 1e-2 85 | # Whether to use conditional layer norms for adapters. 86 | hidden_dim = 128 87 | # Whether to add adapter blocks, this is used in case we need 88 | # to tune only layer norms. 
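# The defaults further down make this the compacter setting: hypercomplex
# (PHM) adapters enabled with hypercomplex_division=4, shared_phm_rule=True
# and factorized_phm=True. The training scripts can presumably override these
# (scripts/image/single_compacter.sh passes --hypercomplex_division 2 and
# turns sharing/factorization off).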
89 | task_adapter_layers_encoder = None 90 | task_adapter_layers_decoder = None 91 | task_adapter_in_decoder = True 92 | intrinsic_dim = 100 93 | normalize_intrinsic_projections = False 94 | # This can be either random, or fastfood. 95 | intrinsic_projection = "random" 96 | 97 | # Hypercomplex adapters parameters 98 | hypercomplex_adapters = True 99 | hypercomplex_division = 4 100 | train_task_adapters = True 101 | learn_phm = True 102 | hypercomplex_nonlinearity="glorot-uniform" 103 | shared_phm_rule = True 104 | factorized_phm = True 105 | shared_W_phm = False 106 | factorized_phm_rule = False 107 | phm_c_init = "normal" 108 | phm_rank = 1 109 | phm_init_range=0.0001 110 | 111 | # prefix-tuning parameters. 112 | prefix_dim = 100 113 | init_prefix_from_vocab = False 114 | kronecker_prod = False 115 | 116 | # BitFit configuration. 117 | bitfit = False 118 | 119 | # Low-rank adapters. 120 | low_rank_adapters = False 121 | low_rank_w_init = "glorot-uniform" 122 | low_rank_rank = 1 123 | 124 | # whether using single adapter for all tasks 125 | use_single_adapter = False 126 | 127 | 128 | @dataclass 129 | class LRAdapterConfig(object): 130 | add_layer_norm_before_adapter: bool = False 131 | add_layer_norm_after_adapter: bool = False 132 | non_linearity: str = "gelu_new" 133 | reduction_factor: int = 16 134 | weight_init_range = 1e-2 135 | # Whether to use conditional layer norms for adapters. 136 | hidden_dim = 128 137 | # Whether to add adapter blocks, this is used in case we need 138 | # to tune only layer norms. 139 | task_adapter_layers_encoder = None 140 | task_adapter_layers_decoder = None 141 | task_adapter_in_decoder = True 142 | intrinsic_dim = 100 143 | normalize_intrinsic_projections = False 144 | # This can be either random, or fastfood. 145 | intrinsic_projection = "random" 146 | 147 | # Hypercomplex adapters parameters 148 | hypercomplex_adapters = False 149 | hypercomplex_division = 4 150 | train_task_adapters = True 151 | learn_phm = True 152 | hypercomplex_nonlinearity="glorot-uniform" 153 | shared_phm_rule = True 154 | factorized_phm = True 155 | shared_W_phm = False 156 | factorized_phm_rule = False 157 | phm_c_init = "normal" 158 | phm_rank = 1 159 | phm_init_range=0.0001 160 | 161 | # prefix-tuning parameters. 162 | prefix_dim = 100 163 | init_prefix_from_vocab = False 164 | kronecker_prod = False 165 | 166 | # BitFit configuration. 167 | bitfit = False 168 | 169 | # Low-rank adapters. 
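# Unlike CompactorConfig above, this config keeps hypercomplex adapters off
# and enables the low-rank variant below, so each adapter's down/up
# projections are parameterized by LowRankLinear (a rank-`low_rank_rank`
# product of two matrices, see low_rank_layer.py) instead of full or PHM
# layers.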
170 | low_rank_adapters = True 171 | low_rank_w_init = "glorot-uniform" 172 | low_rank_rank = 1 173 | 174 | # whether using single adapter for all tasks 175 | use_single_adapter = False -------------------------------------------------------------------------------- /VL-T5/src/adapters/hypercomplex/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ylsung/VL_adapter/545fcbbdbbaec4c442de35567f6ae477ff4e8265/VL-T5/src/adapters/hypercomplex/__init__.py -------------------------------------------------------------------------------- /VL-T5/src/adapters/hypercomplex/inits.py: -------------------------------------------------------------------------------- 1 | # The codes are from https://github.com/bayer-science-for-a-better-life/phc-gnn 2 | import torch 3 | import math 4 | 5 | 6 | def glorot_normal(tensor: torch.Tensor): 7 | return torch.nn.init.xavier_normal_(tensor, gain=math.sqrt(2)) 8 | 9 | def glorot_uniform(tensor: torch.Tensor): 10 | return torch.nn.init.xavier_uniform_(tensor, gain=math.sqrt(2)) -------------------------------------------------------------------------------- /VL-T5/src/adapters/hypercomplex/kronecker.py: -------------------------------------------------------------------------------- 1 | # The codes are from https://github.com/bayer-science-for-a-better-life/phc-gnn 2 | import torch 3 | 4 | # TODO: change this with torch.kron 5 | """A part of the pylabyk library: numpytorch.py at https://github.com/yulkang/pylabyk""" 6 | def kronecker_product(a, b): 7 | """ 8 | Kronecker product of matrices a and b with leading batch dimensions. 9 | Batch dimensions are broadcast. The number of them mush 10 | :type a: torch.Tensor 11 | :type b: torch.Tensor 12 | :rtype: torch.Tensor 13 | """ 14 | #return torch.stack([torch.kron(ai, bi) for ai, bi in zip(a,b)], dim=0) 15 | siz1 = torch.Size(torch.tensor(a.shape[-2:]) * torch.tensor(b.shape[-2:])) 16 | res = a.unsqueeze(-1).unsqueeze(-3) * b.unsqueeze(-2).unsqueeze(-4) 17 | siz0 = res.shape[:-4] 18 | out = res.reshape(siz0 + siz1) 19 | return out 20 | 21 | 22 | def kronecker_product_einsum_batched(A: torch.Tensor, B: torch.Tensor): 23 | """ 24 | Batched Version of Kronecker Products 25 | :param A: has shape (b, a, c) 26 | :param B: has shape (b, k, p) 27 | :return: (b, ak, cp) 28 | """ 29 | assert A.dim() == 3 and B.dim() == 3 30 | res = torch.einsum('bac,bkp->bakcp', A, B).view(A.size(0), 31 | A.size(1)*B.size(1), 32 | A.size(2)*B.size(2)) 33 | return res -------------------------------------------------------------------------------- /VL-T5/src/adapters/low_rank_layer.py: -------------------------------------------------------------------------------- 1 | """This script implements a low-rank linear layer.""" 2 | import torch 3 | import torch.nn as nn 4 | 5 | from .hypercomplex.inits import glorot_uniform, glorot_normal 6 | 7 | class LowRankLinear(torch.nn.Module): 8 | def __init__(self, input_dim: int, output_dim: int, rank: int = 1, 9 | bias: bool = True, w_init: str = "glorot-uniform"): 10 | super(LowRankLinear, self).__init__() 11 | self.input_dim = input_dim 12 | self.output_dim = output_dim 13 | self.rank = rank 14 | self.bias = bias 15 | self.w_init = w_init 16 | self.W_left = nn.Parameter(torch.Tensor(size=(input_dim, rank)), requires_grad=True) 17 | self.W_right = nn.Parameter(torch.Tensor(size=(rank, output_dim)), requires_grad=True) 18 | if bias: 19 | self.b = nn.Parameter(torch.Tensor(output_dim)) 20 | self.reset_parameters() 21 | 22 | def 
reset_parameters(self): 23 | if self.bias: 24 | self.b.data = torch.zeros_like(self.b.data) 25 | if self.w_init == "glorot-uniform": 26 | self.W_left.data = glorot_uniform(self.W_left.data) 27 | self.W_right.data = glorot_uniform(self.W_right.data) 28 | elif self.w_init == "glorot-normal": 29 | self.W_left.data = glorot_normal(self.W_left.data) 30 | self.W_right.data = glorot_normal(self.W_right.data) 31 | else: 32 | raise ValueError 33 | 34 | def forward(self, x: torch.Tensor) -> torch.Tensor: 35 | W = self.W_left.matmul(self.W_right) 36 | output = torch.matmul(input=x, other=W) 37 | if self.bias: 38 | output += self.b 39 | return output 40 | -------------------------------------------------------------------------------- /VL-T5/src/caption_model.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | import torch.nn as nn 4 | 5 | 6 | from modeling_t5 import VLT5 7 | class VLT5COCOCaption(VLT5): 8 | def __init__(self, config): 9 | super().__init__(config) 10 | 11 | def train_step(self, batch): 12 | device = next(self.parameters()).device 13 | 14 | batch = self.vis_forward(batch, device) 15 | task = batch["task"] 16 | vis_feats = batch['vis_feats'].to(device) 17 | input_ids = batch['input_ids'].to(device) 18 | vis_pos = batch['boxes'].to(device) 19 | 20 | lm_labels = batch["target_ids"].to(device) 21 | 22 | reduce_loss = True 23 | output = self( 24 | input_ids=input_ids, 25 | vis_inputs=(vis_feats, vis_pos), 26 | labels=lm_labels, 27 | reduce_loss=reduce_loss, 28 | task=task, 29 | ) 30 | 31 | lm_mask = lm_labels != -100 32 | B, L = lm_labels.size() 33 | 34 | loss = output['loss'] 35 | 36 | result = { 37 | 'loss': loss 38 | } 39 | return result 40 | 41 | def test_step(self, batch, **kwargs): 42 | device = next(self.parameters()).device 43 | 44 | batch = self.vis_forward(batch, device) 45 | task = batch["task"] 46 | vis_feats = batch['vis_feats'].to(device) 47 | input_ids = batch['input_ids'].to(device) 48 | vis_pos = batch['boxes'].to(device) 49 | 50 | output = self.generate( 51 | input_ids=input_ids, 52 | vis_inputs=(vis_feats, vis_pos), 53 | task=task, 54 | **kwargs, 55 | ) 56 | 57 | generated_sents = self.tokenizer.batch_decode(output, skip_special_tokens=True) 58 | 59 | result = {} 60 | result['pred'] = generated_sents 61 | 62 | return result 63 | 64 | 65 | from modeling_bart import VLBart 66 | class VLBartCOCOCaption(VLBart): 67 | def __init__(self, config): 68 | super().__init__(config) 69 | 70 | def train_step(self, batch): 71 | device = next(self.parameters()).device 72 | 73 | batch = self.vis_forward(batch, device) 74 | task = batch["task"] 75 | vis_feats = batch['vis_feats'].to(device) 76 | input_ids = batch['input_ids'].to(device) 77 | vis_pos = batch['boxes'].to(device) 78 | 79 | lm_labels = batch["target_ids"].to(device) 80 | 81 | reduce_loss = True 82 | output = self( 83 | input_ids=input_ids, 84 | vis_inputs=(vis_feats, vis_pos), 85 | labels=lm_labels, 86 | reduce_loss=reduce_loss, 87 | task=task, 88 | ) 89 | 90 | lm_mask = lm_labels != -100 91 | B, L = lm_labels.size() 92 | 93 | loss = output['loss'] 94 | 95 | result = { 96 | 'loss': loss 97 | } 98 | return result 99 | 100 | def test_step(self, batch, **kwargs): 101 | device = next(self.parameters()).device 102 | 103 | batch = self.vis_forward(batch, device) 104 | task = batch["task"] 105 | vis_feats = batch['vis_feats'].to(device) 106 | input_ids = batch['input_ids'].to(device) 107 | vis_pos = batch['boxes'].to(device) 108 | 109 | output = self.generate( 110 | 
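# decoding options such as num_beams or max_length arrive through **kwargs and
# are passed straight to generate(), alongside the visual inputs and task name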
input_ids=input_ids, 111 | vis_inputs=(vis_feats, vis_pos), 112 | task=task, 113 | **kwargs 114 | ) 115 | 116 | generated_sents = self.tokenizer.batch_decode(output, skip_special_tokens=True) 117 | 118 | result = {} 119 | result['pred'] = generated_sents 120 | 121 | return result -------------------------------------------------------------------------------- /VL-T5/src/clip/__init__.py: -------------------------------------------------------------------------------- 1 | from .clip import * 2 | -------------------------------------------------------------------------------- /VL-T5/src/clip/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ylsung/VL_adapter/545fcbbdbbaec4c442de35567f6ae477ff4e8265/VL-T5/src/clip/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /VL-T5/src/clip/simple_tokenizer.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import html 3 | import os 4 | from functools import lru_cache 5 | 6 | import ftfy 7 | import regex as re 8 | 9 | 10 | @lru_cache() 11 | def default_bpe(): 12 | return os.path.join(os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz") 13 | 14 | 15 | @lru_cache() 16 | def bytes_to_unicode(): 17 | """ 18 | Returns list of utf-8 byte and a corresponding list of unicode strings. 19 | The reversible bpe codes work on unicode strings. 20 | This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. 21 | When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. 22 | This is a signficant percentage of your normal, say, 32K bpe vocab. 23 | To avoid that, we want lookup tables between utf-8 bytes and unicode strings. 24 | And avoids mapping to whitespace/control characters the bpe code barfs on. 25 | """ 26 | bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) 27 | cs = bs[:] 28 | n = 0 29 | for b in range(2**8): 30 | if b not in bs: 31 | bs.append(b) 32 | cs.append(2**8+n) 33 | n += 1 34 | cs = [chr(n) for n in cs] 35 | return dict(zip(bs, cs)) 36 | 37 | 38 | def get_pairs(word): 39 | """Return set of symbol pairs in a word. 40 | Word is represented as tuple of symbols (symbols being variable-length strings). 
41 | """ 42 | pairs = set() 43 | prev_char = word[0] 44 | for char in word[1:]: 45 | pairs.add((prev_char, char)) 46 | prev_char = char 47 | return pairs 48 | 49 | 50 | def basic_clean(text): 51 | text = ftfy.fix_text(text) 52 | text = html.unescape(html.unescape(text)) 53 | return text.strip() 54 | 55 | 56 | def whitespace_clean(text): 57 | text = re.sub(r'\s+', ' ', text) 58 | text = text.strip() 59 | return text 60 | 61 | 62 | class SimpleTokenizer(object): 63 | def __init__(self, bpe_path: str = default_bpe()): 64 | self.byte_encoder = bytes_to_unicode() 65 | self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} 66 | merges = gzip.open(bpe_path).read().decode("utf-8").split('\n') 67 | merges = merges[1:49152-256-2+1] 68 | merges = [tuple(merge.split()) for merge in merges] 69 | vocab = list(bytes_to_unicode().values()) 70 | vocab = vocab + [v+'' for v in vocab] 71 | for merge in merges: 72 | vocab.append(''.join(merge)) 73 | vocab.extend(['<|startoftext|>', '<|endoftext|>']) 74 | self.encoder = dict(zip(vocab, range(len(vocab)))) 75 | self.decoder = {v: k for k, v in self.encoder.items()} 76 | self.bpe_ranks = dict(zip(merges, range(len(merges)))) 77 | self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'} 78 | self.pat = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE) 79 | 80 | def bpe(self, token): 81 | if token in self.cache: 82 | return self.cache[token] 83 | word = tuple(token[:-1]) + ( token[-1] + '',) 84 | pairs = get_pairs(word) 85 | 86 | if not pairs: 87 | return token+'' 88 | 89 | while True: 90 | bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf'))) 91 | if bigram not in self.bpe_ranks: 92 | break 93 | first, second = bigram 94 | new_word = [] 95 | i = 0 96 | while i < len(word): 97 | try: 98 | j = word.index(first, i) 99 | new_word.extend(word[i:j]) 100 | i = j 101 | except: 102 | new_word.extend(word[i:]) 103 | break 104 | 105 | if word[i] == first and i < len(word)-1 and word[i+1] == second: 106 | new_word.append(first+second) 107 | i += 2 108 | else: 109 | new_word.append(word[i]) 110 | i += 1 111 | new_word = tuple(new_word) 112 | word = new_word 113 | if len(word) == 1: 114 | break 115 | else: 116 | pairs = get_pairs(word) 117 | word = ' '.join(word) 118 | self.cache[token] = word 119 | return word 120 | 121 | def encode(self, text): 122 | bpe_tokens = [] 123 | text = whitespace_clean(basic_clean(text)).lower() 124 | for token in re.findall(self.pat, text): 125 | token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) 126 | bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' ')) 127 | return bpe_tokens 128 | 129 | def decode(self, tokens): 130 | text = ''.join([self.decoder[token] for token in tokens]) 131 | text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('', ' ') 132 | return text 133 | -------------------------------------------------------------------------------- /VL-T5/src/gqa_model.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | import torch.nn as nn 4 | import numpy as np 5 | 6 | 7 | from modeling_t5 import VLT5 8 | class VLT5GQA(VLT5): 9 | def __init__(self, config): 10 | super().__init__(config) 11 | 12 | def train_step(self, batch): 13 | 14 | device = next(self.parameters()).device 15 | 16 | batch = self.vis_forward(batch, device) 17 | task = batch["task"] 18 | 
vis_feats = batch['vis_feats'].to(device) 19 | input_ids = batch['input_ids'].to(device) 20 | vis_pos = batch['boxes'].to(device) 21 | 22 | lm_labels = batch["target_ids"].to(device) 23 | 24 | output = self( 25 | input_ids=input_ids, 26 | vis_inputs=(vis_feats, vis_pos), 27 | labels=lm_labels, 28 | return_dict=True, 29 | task=task, 30 | ) 31 | assert 'loss' in output 32 | 33 | lm_mask = lm_labels != -100 34 | B, L = lm_labels.size() 35 | 36 | loss = output['loss'] 37 | 38 | loss = loss.view(B, L) * lm_mask 39 | 40 | loss = loss.sum(dim=1) / lm_mask.sum(dim=1).clamp(min=1) # B 41 | 42 | loss = loss.mean() 43 | 44 | result = { 45 | 'loss': loss 46 | } 47 | return result 48 | 49 | def test_step(self, batch, **kwargs): 50 | device = next(self.parameters()).device 51 | 52 | batch = self.vis_forward(batch, device) 53 | task = batch["task"] 54 | vis_feats = batch['vis_feats'].to(device) 55 | input_ids = batch['input_ids'].to(device) 56 | vis_pos = batch['boxes'].to(device) 57 | 58 | 59 | output = self.generate( 60 | input_ids=input_ids, 61 | vis_inputs=(vis_feats, vis_pos), 62 | task=task, 63 | **kwargs 64 | ) 65 | 66 | generated_sents = self.tokenizer.batch_decode(output, skip_special_tokens=True) 67 | 68 | result = {} 69 | result['pred_ans'] = generated_sents 70 | 71 | return result 72 | 73 | 74 | from modeling_bart import VLBart 75 | class VLBartGQA(VLBart): 76 | def __init__(self, config): 77 | super().__init__(config) 78 | 79 | def train_step(self, batch): 80 | 81 | device = next(self.parameters()).device 82 | 83 | batch = self.vis_forward(batch, device) 84 | task = batch["task"] 85 | vis_feats = batch['vis_feats'].to(device) 86 | input_ids = batch['input_ids'].to(device) 87 | vis_pos = batch['boxes'].to(device) 88 | 89 | lm_labels = batch["target_ids"].to(device) 90 | 91 | output = self( 92 | input_ids=input_ids, 93 | vis_inputs=(vis_feats, vis_pos), 94 | labels=lm_labels, 95 | return_dict=True, 96 | task=task, 97 | ) 98 | assert 'loss' in output 99 | 100 | lm_mask = lm_labels != -100 101 | B, L = lm_labels.size() 102 | 103 | loss = output['loss'] 104 | 105 | loss = loss.view(B, L) * lm_mask 106 | 107 | loss = loss.sum(dim=1) / lm_mask.sum(dim=1).clamp(min=1) # B 108 | 109 | loss = loss.mean() 110 | 111 | result = { 112 | 'loss': loss 113 | } 114 | return result 115 | 116 | def test_step(self, batch, **kwargs): 117 | device = next(self.parameters()).device 118 | 119 | batch = self.vis_forward(batch, device) 120 | task = batch["task"] 121 | vis_feats = batch['vis_feats'].to(device) 122 | input_ids = batch['input_ids'].to(device) 123 | vis_pos = batch['boxes'].to(device) 124 | 125 | 126 | output = self.generate( 127 | input_ids=input_ids, 128 | vis_inputs=(vis_feats, vis_pos), 129 | task=task, 130 | **kwargs 131 | ) 132 | 133 | generated_sents = self.tokenizer.batch_decode(output, skip_special_tokens=True) 134 | 135 | result = {} 136 | result['pred_ans'] = generated_sents 137 | 138 | return result 139 | -------------------------------------------------------------------------------- /VL-T5/src/lora/__init__.py: -------------------------------------------------------------------------------- 1 | # The codes in the folder are copied from https://github.com/microsoft/LoRA/tree/aa68d8a021c7ba08973e35fdfdc76338fdbfad57/loralib 2 | 3 | name = "lora" 4 | 5 | from .layers import * 6 | from .utils import * 7 | from .config import * 8 | from .controller import LoRALinearController -------------------------------------------------------------------------------- /VL-T5/src/lora/config.py: 
-------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | 4 | @dataclass 5 | class LoraConfig(object): 6 | lora_dim = 4 7 | lora_alpha = 32 8 | lora_dropout = 0.1 9 | -------------------------------------------------------------------------------- /VL-T5/src/lora/controller.py: -------------------------------------------------------------------------------- 1 | """Implements Adapter Controller, a module that keeps multiple 2 | layers of Adapters, and controls which adapter layer to use.""" 3 | import os 4 | import math 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | from .layers import LoRALayer 9 | 10 | 11 | class LoRALinearController(nn.Linear, LoRALayer): 12 | """Implements Adapter controller module which controls the logics of 13 | putting adapter layers within transformer's layers.""" 14 | 15 | def __init__( 16 | self, 17 | in_features: int, 18 | out_features: int, 19 | fan_in_fan_out: bool = False, # Set this to True if the layer to replace stores weight like (fan_in, fan_out) 20 | config=None, 21 | **kwargs 22 | ): 23 | nn.Linear.__init__(self, in_features, out_features, **kwargs) 24 | 25 | self.tasks = config.tasks 26 | self.use_single_lora = config.use_single_lora 27 | 28 | r = config.lora_dim 29 | lora_alpha = config.lora_alpha 30 | lora_dropout = config.lora_dropout 31 | 32 | LoRALayer.__init__(self, r=r, lora_alpha=lora_alpha, lora_dropout=lora_dropout, 33 | merge_weights=True) 34 | 35 | self.fan_in_fan_out = fan_in_fan_out 36 | self.lora_As = nn.ParameterDict(dict()) 37 | self.lora_Bs = nn.ParameterDict(dict()) 38 | # Actual trainable parameters 39 | if r > 0: 40 | self.lora_As, self.lora_Bs = self.construct_lora_weights(self.tasks) 41 | self.scaling = self.lora_alpha / self.r 42 | # Freezing the pre-trained weight matrix 43 | self.weight.requires_grad = False 44 | self.reset_parameters() 45 | if fan_in_fan_out: 46 | self.weight.data = self.weight.data.T 47 | 48 | def reset_parameters(self): 49 | nn.Linear.reset_parameters(self) 50 | if hasattr(self, 'lora_As'): 51 | # initialize A the same way as the default for nn.Linear and B to zero 52 | for task in self.tasks: 53 | nn.init.kaiming_uniform_(self.lora_As[task], a=math.sqrt(5)) 54 | nn.init.zeros_(self.lora_Bs[task]) 55 | 56 | def forward(self, x, task): 57 | def T(w): 58 | return w.T if self.fan_in_fan_out else w 59 | 60 | result = F.linear(x, T(self.weight), bias=self.bias) 61 | 62 | lora_A = self.lora_As[task] 63 | lora_B = self.lora_Bs[task] 64 | 65 | if self.training: 66 | result += (self.lora_dropout(x) @ lora_A.T @ lora_B.T) * self.scaling 67 | else: 68 | result += (x @ lora_A.T @ lora_B.T) * self.scaling 69 | 70 | return result 71 | 72 | def get_task(self, task): 73 | return task 74 | 75 | def construct_lora_weights(self, tasks): 76 | if self.use_single_lora: 77 | lora_A = nn.Parameter(self.weight.new_zeros((self.r, self.in_features))) 78 | lora_B = nn.Parameter(self.weight.new_zeros((self.out_features, self.r))) 79 | for task in tasks: 80 | self.lora_As[task] = lora_A 81 | self.lora_Bs[task] = lora_B 82 | else: 83 | for task in tasks: 84 | self.lora_As[task] = nn.Parameter(self.weight.new_zeros((self.r, self.in_features))) 85 | self.lora_Bs[task] = nn.Parameter(self.weight.new_zeros((self.out_features, self.r))) 86 | 87 | return self.lora_As, self.lora_Bs 88 | -------------------------------------------------------------------------------- /VL-T5/src/lora/utils.py: 
-------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------ 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License (MIT). See LICENSE in the repo root for license information. 4 | # ------------------------------------------------------------------------------------------ 5 | import torch 6 | import torch.nn as nn 7 | 8 | from typing import Dict 9 | 10 | from .layers import LoRALayer 11 | 12 | 13 | def mark_only_lora_as_trainable(model: nn.Module, bias: str = 'none') -> None: 14 | for n, p in model.named_parameters(): 15 | if 'lora_' not in n: 16 | p.requires_grad = False 17 | if bias == 'none': 18 | return 19 | elif bias == 'all': 20 | for n, p in model.named_parameters(): 21 | if 'bias' in n: 22 | p.requires_grad = True 23 | elif bias == 'lora_only': 24 | for m in model.modules(): 25 | if isinstance(m, LoRALayer) and \ 26 | hasattr(m, 'bias') and \ 27 | m.bias is not None: 28 | m.bias.requires_grad = True 29 | else: 30 | raise NotImplementedError 31 | 32 | 33 | def lora_state_dict(model: nn.Module, bias: str = 'none') -> Dict[str, torch.Tensor]: 34 | my_state_dict = model.state_dict() 35 | if bias == 'none': 36 | return {k: my_state_dict[k] for k in my_state_dict if 'lora_' in k} 37 | elif bias == 'all': 38 | return {k: my_state_dict[k] for k in my_state_dict if 'lora_' in k or 'bias' in k} 39 | elif bias == 'lora_only': 40 | to_return = {} 41 | for k in my_state_dict: 42 | if 'lora_' in k: 43 | to_return[k] = my_state_dict[k] 44 | bias_name = k.split('lora_')[0]+'bias' 45 | if bias_name in my_state_dict: 46 | to_return[bias_name] = my_state_dict[bias_name] 47 | return to_return 48 | else: 49 | raise NotImplementedError -------------------------------------------------------------------------------- /VL-T5/src/mmt_model.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | import torch.nn as nn 4 | 5 | 6 | from modeling_t5 import VLT5 7 | class VLT5MMT(VLT5): 8 | def __init__(self, config): 9 | super().__init__(config) 10 | 11 | def train_step(self, batch): 12 | 13 | device = next(self.parameters()).device 14 | vis_feats = batch['vis_feats'].to(device) 15 | input_ids = batch['input_ids'].to(device) 16 | vis_pos = batch['boxes'].to(device) 17 | 18 | vis_attention_mask = batch['vis_attention_mask'].to(device) 19 | 20 | lm_labels = batch["target_ids"].to(device) 21 | 22 | output = self( 23 | input_ids=input_ids, 24 | vis_inputs=(vis_feats, vis_pos), 25 | vis_attention_mask=vis_attention_mask, 26 | labels=lm_labels, 27 | reduce_loss=True, 28 | return_dict=True 29 | ) 30 | 31 | loss = output['loss'] 32 | 33 | result = { 34 | 'loss': loss 35 | } 36 | return result 37 | 38 | def test_step(self, batch, **kwargs): 39 | device = next(self.parameters()).device 40 | vis_feats = batch['vis_feats'].to(device) 41 | input_ids = batch['input_ids'].to(device) 42 | vis_pos = batch['boxes'].to(device) 43 | 44 | vis_attention_mask = batch['vis_attention_mask'].to(device) 45 | 46 | output = self.generate( 47 | input_ids=input_ids, 48 | vis_inputs=(vis_feats, vis_pos), 49 | vis_attention_mask=vis_attention_mask, 50 | **kwargs 51 | ) 52 | 53 | generated_sents = self.tokenizer.batch_decode(output, skip_special_tokens=True) 54 | 55 | result = {} 56 | result['pred'] = generated_sents 57 | 58 | return result 59 | 60 | 61 | from modeling_bart import VLBart 62 | class VLBartMMT(VLBart): 63 | def 
__init__(self, config): 64 | super().__init__(config) 65 | 66 | def train_step(self, batch): 67 | 68 | device = next(self.parameters()).device 69 | vis_feats = batch['vis_feats'].to(device) 70 | input_ids = batch['input_ids'].to(device) 71 | vis_pos = batch['boxes'].to(device) 72 | 73 | vis_attention_mask = batch['vis_attention_mask'].to(device) 74 | 75 | lm_labels = batch["target_ids"].to(device) 76 | 77 | output = self( 78 | input_ids=input_ids, 79 | vis_inputs=(vis_feats, vis_pos), 80 | vis_attention_mask=vis_attention_mask, 81 | labels=lm_labels, 82 | reduce_loss=True, 83 | return_dict=True 84 | ) 85 | 86 | loss = output['loss'] 87 | 88 | result = { 89 | 'loss': loss 90 | } 91 | return result 92 | 93 | def test_step(self, batch, **kwargs): 94 | device = next(self.parameters()).device 95 | vis_feats = batch['vis_feats'].to(device) 96 | input_ids = batch['input_ids'].to(device) 97 | vis_pos = batch['boxes'].to(device) 98 | 99 | vis_attention_mask = batch['vis_attention_mask'].to(device) 100 | 101 | output = self.generate( 102 | input_ids=input_ids, 103 | vis_inputs=(vis_feats, vis_pos), 104 | vis_attention_mask=vis_attention_mask, 105 | **kwargs 106 | ) 107 | 108 | generated_sents = self.tokenizer.batch_decode(output, skip_special_tokens=True) 109 | 110 | result = {} 111 | result['pred'] = generated_sents 112 | 113 | return result 114 | -------------------------------------------------------------------------------- /VL-T5/src/multitask_data.py: -------------------------------------------------------------------------------- 1 | import more_itertools 2 | from typing import Any, Dict, Iterable, Union, List, Mapping 3 | import vqa_data 4 | import refcoco_data 5 | import itertools 6 | import random 7 | 8 | class MultitaskLoader(object): 9 | def __init__(self, loaders, shuffle=True, drop_last=False, sampling='roundrobin', n_batches=None, verbose=True): 10 | self.loaders = loaders 11 | self.verbose = verbose 12 | # self.loader_lens = [len(loader) for loader in self.loaders] 13 | self.task2len = {loader.task: len(loader) for loader in self.loaders} 14 | if self.verbose: 15 | print('Task2len:', self.task2len) 16 | self.task2loader = {loader.task: loader for loader in self.loaders} 17 | # print('loader lens:', self.loader_lens) 18 | 19 | self.shuffle = shuffle 20 | self.drop_last = drop_last 21 | self.sampling = sampling 22 | self.epoch_tasks = None 23 | self.n_batches = n_batches 24 | self.set_epoch(0) 25 | # print('loader indices:', self.loader_indices) 26 | 27 | def __iter__(self): 28 | self.task2iter = {loader.task: iter(loader) for loader in self.loaders} 29 | # self.loader_iters = [iter(loader) for loader in self.loaders] 30 | 31 | return self 32 | 33 | def set_epoch(self, epoch): 34 | for loader in self.loaders: 35 | loader.sampler.set_epoch(epoch) 36 | 37 | if self.sampling == 'roundrobin': 38 | epoch_tasks = [] 39 | for task, loader in self.task2loader.items(): 40 | n_batches = len(loader) 41 | epoch_tasks.extend([task]*n_batches) 42 | elif self.sampling == 'balanced': 43 | if self.n_batches is None: 44 | n_batches = sum(self.task2len.values()) // len(self.loaders) 45 | else: 46 | n_batches = self.n_batches 47 | if self.verbose: 48 | print('# batches:', n_batches) 49 | epoch_tasks = [] 50 | for task, loader in self.task2loader.items(): 51 | epoch_tasks.extend([task]*n_batches) 52 | 53 | if self.shuffle: 54 | random.Random(epoch).shuffle(epoch_tasks) 55 | self.epoch_tasks = epoch_tasks 56 | if self.verbose: 57 | print('# epoch_tasks:', len(self.epoch_tasks)) 58 | 59 | def __next__(self): 
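# pops the next task scheduled for this epoch and draws one batch from that
# task's iterator; StopIteration ends the epoch once the schedule is exhausted.
# A minimal usage sketch, assuming each wrapped loader exposes `.task` and a
# DistributedSampler-style `.sampler.set_epoch()` (vqa_loader / gqa_loader are
# placeholder names):
#   loader = MultitaskLoader([vqa_loader, gqa_loader], sampling='roundrobin')
#   for epoch in range(num_epochs):
#       loader.set_epoch(epoch)
#       for batch in loader:
#           ...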
60 | if len(self.epoch_tasks) > 0: 61 | task = self.epoch_tasks.pop() 62 | loader_iter = self.task2iter[task] 63 | return next(loader_iter) 64 | else: 65 | raise StopIteration 66 | 67 | def __len__(self): 68 | return len(self.epoch_tasks) 69 | 70 | 71 | 72 | def _chunked_iterator(i: Iterable, chunk_size: int, drop_last: bool): 73 | chunks = more_itertools.chunked(i, chunk_size) 74 | if drop_last: 75 | return (chunk for chunk in chunks if len(chunk) == chunk_size) 76 | else: 77 | return chunks 78 | -------------------------------------------------------------------------------- /VL-T5/src/my_deepspeed.py: -------------------------------------------------------------------------------- 1 | # from transformers.deepspeed import HfDeepSpeedConfig 2 | import json 3 | 4 | def deepspeed_init(trainer, resume_from_checkpoint=None): 5 | """ 6 | Init DeepSpeed, after updating the DeepSpeed configuration with any relevant Trainer's args. 7 | If ``resume_from_checkpoint`` was passed then an attempt to resume from a previously saved checkpoint will be made. 8 | Args: 9 | trainer: Trainer object 10 | num_training_steps: per single gpu 11 | resume_from_checkpoint: path to a checkpoint if to resume from after normal DeepSpeedEngine load 12 | Returns: model, optimizer, lr_scheduler 13 | """ 14 | import deepspeed 15 | from deepspeed.utils import logger as ds_logger 16 | 17 | model = trainer.model 18 | args = trainer.args 19 | 20 | optimizer = trainer.optim 21 | lr_scheduler = trainer.lr_scheduler 22 | 23 | with open(args.deepspeed, "r") as f: 24 | ds_config = json.load(f) 25 | 26 | if args.fp16: 27 | ds_config["fp16"] = {"enabled": True, "loss_scale": 0} 28 | 29 | ds_config["gradient_clipping"] = args.clip_grad_norm 30 | ds_config["train_micro_batch_size_per_gpu"] = args.batch_size 31 | ds_config["zero_allow_untested_optimizer"] = True 32 | 33 | # hf_deepspeed_config = args.hf_deepspeed_config 34 | # hf_deepspeed_config.trainer_config_finalize(args, model, num_training_steps) 35 | 36 | # resume config update - some bits like `model` and `num_training_steps` only become available during train 37 | # config = HfDeepSpeedConfig(args.deepspeed) 38 | config = ds_config 39 | 40 | # keep for quick debug: 41 | # from pprint import pprint; pprint(config) 42 | 43 | # set the Deepspeed log level consistent with the trainer 44 | # ds_logger.setLevel(args.get_process_log_level()) 45 | 46 | model_parameters = filter(lambda p: p.requires_grad, model.parameters()) 47 | 48 | model, optimizer, _, lr_scheduler = deepspeed.initialize( 49 | model=model, 50 | model_parameters=model_parameters, 51 | config_params=config, 52 | optimizer=optimizer, 53 | lr_scheduler=lr_scheduler, 54 | ) 55 | 56 | if resume_from_checkpoint is not None: 57 | 58 | # it's possible that the user is trying to resume from model_path, which doesn't necessarily 59 | # contain a deepspeed checkpoint. e.g. examples just check if the dir exists and assume it's 60 | # a resume from a checkpoint and not just a local pretrained weight. 
So we check here if the 61 | # path contains what looks like a deepspeed checkpoint 62 | import glob 63 | 64 | deepspeed_checkpoint_dirs = sorted(glob.glob(f"{resume_from_checkpoint}/global_step*")) 65 | 66 | if len(deepspeed_checkpoint_dirs) > 0: 67 | logger.info(f"Attempting to resume from {resume_from_checkpoint}") 68 | # this magically updates self.optimizer and self.lr_scheduler 69 | load_path, _ = model.load_checkpoint( 70 | resume_from_checkpoint, load_optimizer_states=True, load_lr_scheduler_states=True 71 | ) 72 | if load_path is None: 73 | raise ValueError(f"[deepspeed] failed to resume from checkpoint {resume_from_checkpoint}") 74 | else: 75 | logger.info(f"{resume_from_checkpoint} doesn't have deepspeed checkpoints, doing nothing") 76 | 77 | return model, optimizer, lr_scheduler 78 | -------------------------------------------------------------------------------- /VL-T5/src/my_transformers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ylsung/VL_adapter/545fcbbdbbaec4c442de35567f6ae477ff4e8265/VL-T5/src/my_transformers/__init__.py -------------------------------------------------------------------------------- /VL-T5/src/prompt/__init__.py: -------------------------------------------------------------------------------- 1 | from .config import EncoderPromptConfig, DecoderPromptConfig 2 | from .prompt_controller import PromptController -------------------------------------------------------------------------------- /VL-T5/src/prompt/config.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | 4 | @dataclass 5 | class EncoderPromptConfig(object): 6 | seq_len = 0 7 | input_dim = 768 8 | mid_dim = 768 9 | use_input_prompt = True 10 | use_single_prompt = False 11 | 12 | @dataclass 13 | class DecoderPromptConfig(object): 14 | seq_len = 0 15 | input_dim = 768 16 | mid_dim = 768 17 | use_input_prompt = True 18 | use_single_prompt = False -------------------------------------------------------------------------------- /VL-T5/src/prompt/prompt_controller.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from .prompt_modeling import InputPrompts 4 | 5 | 6 | class PromptController(nn.Module): 7 | """Implements Adapter controller module which controls the logics of 8 | putting adapter layers within transformer's layers.""" 9 | 10 | def __init__(self, config): 11 | super().__init__() 12 | self.config = config 13 | self.prompts = nn.ModuleDict(dict()) 14 | self.tasks = config.tasks 15 | self.use_input_prompt = config.use_input_prompt 16 | self.use_single_prompt = config.use_single_prompt 17 | self.prompts = self.construct_prompts(self.tasks) 18 | 19 | def get_task(self, task): 20 | return task 21 | 22 | def construct_prompts(self, tasks): 23 | """ 24 | Constructs adapter layers and adds them to a dictionary for the given 25 | tasks. 26 | Args: 27 | tasks: A list of string containing the task names. 
28 | """ 29 | 30 | if self.use_single_prompt: 31 | if self.use_input_prompt: 32 | prompt = InputPrompts(self.config) 33 | 34 | for task in tasks: 35 | self.prompts[task] = prompt 36 | 37 | else: 38 | for task in tasks: 39 | if self.use_input_prompt: 40 | prompt = InputPrompts(self.config) 41 | 42 | self.prompts[task] = prompt 43 | 44 | return self.prompts 45 | 46 | def convert_to_list(self, tasks): 47 | if isinstance(tasks, list): 48 | return tasks 49 | return [tasks] 50 | 51 | def get_prompt(self, task): 52 | """Given a task returns its corresponding adapter layer. 53 | Args: 54 | task: Input task name. 55 | Returns: 56 | Adapter layer corresponding to the given task. 57 | """ 58 | return self.prompts[task] 59 | 60 | def forward(self, bsz, device, task): 61 | """ 62 | Retrieves the adapter layer corresponding to the given 63 | task. It freezes the adapter layers for all the other tasks 64 | and call the selected adapter layer. 65 | Args: 66 | task: the name of the current task. 67 | inputs: the inputs to feed in in the adapter layer. 68 | Returns: 69 | outputs of the adapter layer. 70 | """ 71 | task = self.get_task(task) 72 | # Enables the adapter layer for the given task. 73 | prompt_module = self.get_prompt(task) 74 | 75 | trainable_prompt = prompt_module.get_prompt(bsz, device) 76 | 77 | return trainable_prompt 78 | 79 | -------------------------------------------------------------------------------- /VL-T5/src/prompt/prompt_modeling.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class InputPrompts(nn.Module): 5 | def __init__(self, config): 6 | super().__init__() 7 | 8 | self.prompt_len = config.prompt_len 9 | self.input_dim = config.input_dim 10 | self.mid_dim = config.mid_dim 11 | 12 | self.prefix_tokens = torch.arange(self.prompt_len).long() 13 | self.prefix_embedding = nn.Sequential( 14 | nn.Embedding(self.prompt_len, self.input_dim), 15 | nn.Linear(self.input_dim, self.mid_dim), 16 | nn.Tanh(), 17 | nn.Linear(self.mid_dim, self.input_dim), 18 | ) 19 | 20 | def get_prompt(self, bsz, device): 21 | input_tokens = self.prefix_tokens.unsqueeze(0).expand(bsz, -1).to(device) # (B, L) 22 | prefix_prompt = self.prefix_embedding(input_tokens) # (B, L, d_model * n_heads * n_layer) 23 | 24 | return prefix_prompt -------------------------------------------------------------------------------- /VL-T5/src/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | import numpy as np 3 | import torch 4 | import torch.distributed as dist 5 | import collections 6 | import logging 7 | 8 | def get_area(pos): 9 | """ 10 | Args 11 | pos: [B, N, 4] 12 | (x1, x2, y1, y2) 13 | 14 | Return 15 | area : [B, N] 16 | """ 17 | # [B, N] 18 | height = pos[:, :, 3] - pos[:, :, 2] 19 | width = pos[:, :, 1] - pos[:, :, 0] 20 | area = height * width 21 | return area 22 | 23 | def get_relative_distance(pos): 24 | """ 25 | Args 26 | pos: [B, N, 4] 27 | (x1, x2, y1, y2) 28 | 29 | Return 30 | out : [B, N, N, 4] 31 | """ 32 | # B, N = pos.size()[:-1] 33 | 34 | # [B, N, N, 4] 35 | relative_distance = pos.unsqueeze(1) - pos.unsqueeze(2) 36 | 37 | return relative_distance 38 | 39 | 40 | class LossMeter(object): 41 | def __init__(self, maxlen=100): 42 | """Computes and stores the running average""" 43 | self.vals = collections.deque([], maxlen=maxlen) 44 | 45 | def __len__(self): 46 | return len(self.vals) 47 | 48 | def update(self, new_val): 49 | self.vals.append(new_val) 50 | 51 | 
@property 52 | def val(self): 53 | return sum(self.vals) / len(self.vals) 54 | 55 | def __repr__(self): 56 | return str(self.val) 57 | 58 | 59 | def count_parameters(model): 60 | return sum(p.numel() for p in model.parameters() if p.requires_grad) 61 | 62 | 63 | def load_state_dict(state_dict_path, loc='cpu'): 64 | state_dict = torch.load(state_dict_path, map_location=loc) 65 | # Change Multi GPU to single GPU 66 | original_keys = list(state_dict.keys()) 67 | for key in original_keys: 68 | if key.startswith("module."): 69 | new_key = key[len("module."):] 70 | state_dict[new_key] = state_dict.pop(key) 71 | return state_dict 72 | 73 | 74 | def set_global_logging_level(level=logging.ERROR, prefices=[""]): 75 | """ 76 | Override logging levels of different modules based on their name as a prefix. 77 | It needs to be invoked after the modules have been loaded so that their loggers have been initialized. 78 | 79 | Args: 80 | - level: desired level. e.g. logging.INFO. Optional. Default is logging.ERROR 81 | - prefices: list of one or more str prefices to match (e.g. ["transformers", "torch"]). Optional. 82 | Default is `[""]` to match all active loggers. 83 | The match is a case-sensitive `module_name.startswith(prefix)` 84 | """ 85 | prefix_re = re.compile(fr'^(?:{ "|".join(prefices) })') 86 | for name in logging.root.manager.loggerDict: 87 | if re.match(prefix_re, name): 88 | logging.getLogger(name).setLevel(level) 89 | 90 | 91 | def get_iou(anchors, gt_boxes): 92 | """ 93 | anchors: (N, 4) torch floattensor 94 | gt_boxes: (K, 4) torch floattensor 95 | overlaps: (N, K) ndarray of overlap between boxes and query_boxes 96 | """ 97 | N = anchors.size(0) 98 | 99 | if gt_boxes.size() == (4,): 100 | gt_boxes = gt_boxes.view(1, 4) 101 | K = gt_boxes.size(0) 102 | 103 | gt_boxes_area = ( 104 | (gt_boxes[:, 2] - gt_boxes[:, 0] + 1) * 105 | (gt_boxes[:, 3] - gt_boxes[:, 1] + 1) 106 | ).view(1, K) 107 | 108 | anchors_area = ( 109 | (anchors[:, 2] - anchors[:, 0] + 1) * 110 | (anchors[:, 3] - anchors[:, 1] + 1) 111 | ).view(N, 1) 112 | 113 | boxes = anchors.view(N, 1, 4).expand(N, K, 4) 114 | query_boxes = gt_boxes.view(1, K, 4).expand(N, K, 4) 115 | 116 | iw = ( 117 | torch.min(boxes[:, :, 2], query_boxes[:, :, 2]) 118 | - torch.max(boxes[:, :, 0], query_boxes[:, :, 0]) 119 | + 1 120 | ) 121 | iw[iw < 0] = 0 122 | 123 | ih = ( 124 | torch.min(boxes[:, :, 3], query_boxes[:, :, 3]) 125 | - torch.max(boxes[:, :, 1], query_boxes[:, :, 1]) 126 | + 1 127 | ) 128 | ih[ih < 0] = 0 129 | 130 | ua = anchors_area + gt_boxes_area - (iw * ih) 131 | overlaps = iw * ih / ua 132 | 133 | return overlaps 134 | 135 | 136 | def xywh_to_xyxy(boxes): 137 | """Convert [x y w h] box format to [x1 y1 x2 y2] format.""" 138 | return np.hstack((boxes[:, 0:2], boxes[:, 0:2] + boxes[:, 2:4] - 1)) 139 | 140 | 141 | from torch.optim import Optimizer 142 | 143 | class FusedOptimizer(Optimizer): 144 | def __init__(self, optimizers): 145 | self.optimizers = optimizers 146 | param_groups = [] 147 | for optimizer in self.optimizers: 148 | param_groups += optimizer.param_groups 149 | #super(FusedOptimizer, self).__init__([], {}) 150 | self.param_groups = param_groups 151 | 152 | def step(self): 153 | for optimizer in self.optimizers: 154 | optimizer.step() 155 | -------------------------------------------------------------------------------- /assets/vl_adapter_teaser.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ylsung/VL_adapter/545fcbbdbbaec4c442de35567f6ae477ff4e8265/assets/vl_adapter_teaser.png -------------------------------------------------------------------------------- /download_backbones.py: -------------------------------------------------------------------------------- 1 | 2 | from transformers import T5ForConditionalGeneration, T5Tokenizer 3 | from transformers import BartForConditionalGeneration, BartTokenizer 4 | 5 | if __name__ == '__main__': 6 | 7 | 8 | print('Downloading checkpoints if not cached') 9 | print('T5-base') 10 | model = T5ForConditionalGeneration.from_pretrained('t5-base') 11 | tokenizer = T5Tokenizer.from_pretrained('t5-base') 12 | print('BART-base') 13 | tokenizer = BartTokenizer.from_pretrained("facebook/bart-base") 14 | model = BartForConditionalGeneration.from_pretrained("facebook/bart-base") 15 | print('Done!') 16 | 17 | -------------------------------------------------------------------------------- /feature_extraction/README.md: -------------------------------------------------------------------------------- 1 | # Feature extraction 2 | 3 | 4 | ## Feature extraction using CLIP 5 | The commands to process COCO images: 6 | ```bash 7 | model_type=$1 # one of [RN50, RN101, RN50x4, ViT-B/32, vit_base_patch32_224_in21k]. The code uses RN101. 8 | GPU=$2 9 | 10 | train_image_root=[The directory that stores the training images] 11 | val_image_root=[The directory that stores the validation images] 12 | test_image_root=[The directory that stores the test images] 13 | 14 | output_dir=[A folder that stores all clip_features] 15 | 16 | echo Use ${model_type} to extract features 17 | 18 | CUDA_VISIBLE_DEVICES=$2 python coco_CLIP.py --model_type ${model_type} --images_root ${train_image_root} --output_dir ${output_dir} 19 | CUDA_VISIBLE_DEVICES=$2 python coco_CLIP.py --model_type ${model_type} --images_root ${val_image_root} --output_dir ${output_dir} 20 | CUDA_VISIBLE_DEVICES=$2 python coco_CLIP.py --model_type ${model_type} --images_root ${test_image_root} --output_dir ${output_dir} 21 | ``` 22 | 23 | --- 24 | The following describes feature extraction using other vision encoders. 25 | 26 | 27 | 28 | We use [Hao Tan's Detectron2 implementation of 'Bottom-up feature extractor'](https://github.com/airsplay/py-bottom-up-attention), which is compatible with [the original Caffe implementation](https://github.com/peteanderson80/bottom-up-attention). 29 | 30 | Following LXMERT, we use the feature extractor which outputs 36 boxes per image. 31 | We store features in hdf5 format (a minimal reading sketch is shown after the installation note below). 32 | 33 | 34 | ## Download features 35 | 36 | Download the `datasets` folder from [Google Drive](https://drive.google.com/drive/folders/1MBBhlkP83VMKS2Qe0SmFfzkHhMpIG5wf?usp=sharing). 37 | 38 | 39 | ## Install feature extractor (optional) 40 | 41 | Please follow [the original installation guide](https://github.com/airsplay/py-bottom-up-attention#installation).
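For reference, here is a minimal sketch of reading the extracted features back with `h5py`. The file name below is a placeholder, and the keys follow what `tsv_to_h5.py` writes (`features`, `boxes`, `obj_id`, `obj_conf`, `attr_id`, `attr_conf`, `img_w`, `img_h`); adjust the path to your own output.

```python
import h5py

# Open one of the converted feature files, e.g. the COCO train split.
with h5py.File('train2014_obj36.h5', 'r') as f:
    img_id = next(iter(f.keys()))        # each top-level group is one image id
    grp = f[img_id]
    feats = grp['features'][()]          # (num_boxes, 2048) region features
    boxes = grp['boxes'][()]             # (num_boxes, 4) box coordinates
    print(img_id, feats.shape, boxes.shape,
          int(grp['img_w'][()]), int(grp['img_h'][()]))
```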
42 | 43 | ## Manually extract & convert features (optional) 44 | 45 | * `_proposal.py`: extract features from 36 detected boxes 46 | * `_gt.py`: extract features from ground truth boxes 47 | * `_mattnet.py`: extract features from box predictions shared from [MattNet](https://github.com/lichengunc/MAttNet#pre-computed-detectionsmasks) 48 | 49 | ```bash 50 | # Pretrain/VQA: Download LXMERT's COCO features (tsv) and convert to hdf5 51 | wget https://nlp.cs.unc.edu/data/lxmert_data/mscoco_imgfeat/train2014_obj36.zip 52 | wget https://nlp.cs.unc.edu/data/lxmert_data/mscoco_imgfeat/val2014_obj36.zip 53 | python tsv_to_h5.py --tsv_path train2014_obj36.tsv --h5_path train2014_obj36.h5 54 | python tsv_to_h5.py --tsv_path val2014_obj36.tsv --h5_path val2014_obj36.h5 55 | # Get resplit_val_obj36.h5 from val2014_obj36.h5 56 | python coco_val_compact.py 57 | 58 | # Pretrain(VG)/GQA: Download LXMERT's VG features (tsv) and convert to hdf5 59 | wget https://nlp.cs.unc.edu/data/lxmert_data/vg_gqa_imgfeat/vg_gqa_obj36.zip 60 | python tsv_to_h5.py --tsv_path vg_gqa_obj36.tsv --h5_path vg_gqa_obj36.h5 61 | 62 | # RefCOCOg 63 | python refcocog_gt.py --split train 64 | python refcocog_mattnet.py --split val 65 | python refcocog_mattnet.py --split test 66 | 67 | # NLVR2: Download LXMERT's COCO features (tsv) and convert to hdf5 68 | wget https://nlp.cs.unc.edu/data/lxmert_data/nlvr2_imgfeat/train_obj36.zip 69 | wget https://nlp.cs.unc.edu/data/lxmert_data/nlvr2_imgfeat/valid_obj36.zip 70 | wget https://nlp.cs.unc.edu/data/lxmert_data/nlvr2_imgfeat/test_obj36.zip 71 | python tsv_to_h5.py --tsv_path train_obj36.tsv --h5_path train_obj36.h5 72 | python tsv_to_h5.py --tsv_path valid_obj36.tsv --h5_path valid_obj36.h5 73 | python tsv_to_h5.py --tsv_path test_obj36.tsv --h5_path test_obj36.h5 74 | 75 | # Multi30K 76 | # Download images following https://github.com/multi30k/dataset 77 | python flickr30k_proposal.py --split trainval 78 | python flickr30k_proposal.py --split test2017 79 | python flickr30k_proposal.py --split test2018 80 | ``` -------------------------------------------------------------------------------- /feature_extraction/coco_gt.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | # from detectron2_proposal_maxnms import collate_fn, extract, NUM_OBJECTS, DIM 4 | from detectron2_given_box_maxnms import extract, DIM 5 | from torch.utils.data import Dataset, DataLoader 6 | import cv2 7 | from tqdm import tqdm 8 | from pathlib import Path 9 | import argparse 10 | 11 | from pycocotools.coco import COCO 12 | import json 13 | import numpy as np 14 | 15 | 16 | class COCODataset(Dataset): 17 | def __init__(self, image_dir, box_ann_path, split='val2014'): 18 | self.image_dir = image_dir 19 | 20 | box_ann_path = str(box_ann_path) 21 | 22 | self.coco = COCO(box_ann_path) 23 | 24 | self.split = split 25 | with open(box_ann_path) as f: 26 | box_ann = json.load(f) 27 | id2name = {} 28 | for cat2name in box_ann['categories']: 29 | id2name[cat2name['id']] = cat2name['name'] 30 | self.id2name = id2name 31 | 32 | img_ids = [] 33 | boxes = [] 34 | captions = [] 35 | for img_id, anns in self.coco.imgToAnns.items(): 36 | img_ids.append(img_id) 37 | 38 | boxes.append([ann['bbox'] for ann in anns]) 39 | captions.append([self.id2name[ann['category_id']] for ann in anns]) 40 | 41 | assert len(img_ids) == len(boxes) 42 | assert len(img_ids) == len(captions) 43 | 44 | self.img_ids = img_ids 45 | self.boxes = boxes 46 | self.captions = captions 47 | 48 | def
__len__(self): 49 | return len(self.coco.imgToAnns) 50 | 51 | def __getitem__(self, idx): 52 | 53 | image_id = self.img_ids[idx] 54 | 55 | image_name = f'COCO_{self.split}_{str(image_id).zfill(12)}' 56 | 57 | image_path = self.image_dir.joinpath(f'{image_name}.jpg') 58 | 59 | image_id = image_path.stem 60 | 61 | img = cv2.imread(str(image_path)) 62 | 63 | H, W, _ = img.shape 64 | 65 | boxes = [] 66 | for box in self.boxes[idx]: 67 | x, y, width, height = box 68 | x1 = x 69 | y1 = y 70 | x2 = x + width 71 | y2 = y + height 72 | boxes.append([x1, y1, x2, y2]) 73 | 74 | assert len(boxes) > 0 75 | 76 | boxes = np.array(boxes) 77 | 78 | captions = self.captions[idx] 79 | 80 | return { 81 | 'img_id': image_name, 82 | 'img': img, 83 | 'boxes': boxes, 84 | 'captions': captions 85 | } 86 | 87 | 88 | def collate_fn(batch): 89 | img_ids = [] 90 | imgs = [] 91 | boxes = [] 92 | captions = [] 93 | 94 | for i, entry in enumerate(batch): 95 | img_ids.append(entry['img_id']) 96 | imgs.append(entry['img']) 97 | boxes.append(entry['boxes']) 98 | captions.append(entry['captions']) 99 | 100 | batch_out = {} 101 | batch_out['img_ids'] = img_ids 102 | batch_out['imgs'] = imgs 103 | 104 | batch_out['boxes'] = boxes 105 | 106 | batch_out['captions'] = captions 107 | 108 | return batch_out 109 | 110 | 111 | if __name__ == "__main__": 112 | 113 | parser = argparse.ArgumentParser() 114 | parser.add_argument('--batchsize', default=1, type=int, help='batch_size') 115 | parser.add_argument('--cocoroot', type=str, default='/ssd-playpen/home/jmincho/workspace/datasets/COCO/') 116 | parser.add_argument('--split', type=str, default='valid', choices=['train', 'valid', 'test']) 117 | 118 | args = parser.parse_args() 119 | 120 | SPLIT2DIR = { 121 | 'train': 'train2014', 122 | 'valid': 'val2014', 123 | 'test': 'test2015', 124 | } 125 | 126 | coco_dir = Path(args.cocoroot).resolve() 127 | coco_img_dir = coco_dir.joinpath('images') 128 | coco_img_split_dir = coco_img_dir.joinpath(SPLIT2DIR[args.split]) 129 | box_ann_path = coco_dir.joinpath('annotations').joinpath(f'instances_{SPLIT2DIR[args.split]}.json') 130 | 131 | dataset_name = 'COCO' 132 | 133 | out_dir = coco_dir.joinpath('features') 134 | if not out_dir.exists(): 135 | out_dir.mkdir() 136 | 137 | print('Load images from', coco_img_split_dir) 138 | print('# Images:', len(list(coco_img_split_dir.iterdir()))) 139 | 140 | dataset = COCODataset(coco_img_split_dir, box_ann_path, SPLIT2DIR[args.split]) 141 | print('# Annotated Images:', len(dataset)) 142 | 143 | dataloader = DataLoader(dataset, batch_size=args.batchsize, 144 | shuffle=False, collate_fn=collate_fn, num_workers=4) 145 | 146 | output_fname = out_dir.joinpath(f'{SPLIT2DIR[args.split]}_GT.h5') 147 | print('features will be saved at', output_fname) 148 | 149 | desc = f'{dataset_name}_{SPLIT2DIR[args.split]}_{DIM}' 150 | 151 | extract(output_fname, dataloader, desc) 152 | -------------------------------------------------------------------------------- /feature_extraction/coco_proposal.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | from detectron2_proposal_maxnms import collate_fn, extract, NUM_OBJECTS, DIM 4 | from torch.utils.data import Dataset, DataLoader 5 | import cv2 6 | from tqdm import tqdm 7 | from pathlib import Path 8 | import argparse 9 | 10 | 11 | class COCODataset(Dataset): 12 | def __init__(self, image_dir): 13 | self.image_dir = image_dir 14 | self.image_path_list = list(tqdm(image_dir.iterdir())) 15 | self.n_images = 
len(self.image_path_list) 16 | 17 | # self.transform = image_transform 18 | 19 | def __len__(self): 20 | return self.n_images 21 | 22 | def __getitem__(self, idx): 23 | image_path = self.image_path_list[idx] 24 | image_id = image_path.stem 25 | 26 | img = cv2.imread(str(image_path)) 27 | 28 | return { 29 | 'img_id': image_id, 30 | 'img': img 31 | } 32 | 33 | if __name__ == "__main__": 34 | 35 | parser = argparse.ArgumentParser() 36 | parser.add_argument('--batchsize', default=1, type=int, help='batch_size') 37 | parser.add_argument('--cocoroot', type=str, default='/ssd-playpen/home/jmincho/workspace/datasets/COCO/') 38 | parser.add_argument('--split', type=str, default='valid', choices=['train', 'valid', 'test']) 39 | 40 | args = parser.parse_args() 41 | 42 | SPLIT2DIR = { 43 | 'train': 'train2014', 44 | 'valid': 'val2014', 45 | 'test': 'test2015', 46 | } 47 | 48 | coco_dir = Path(args.cocoroot).resolve() 49 | coco_img_dir = coco_dir.joinpath('images') 50 | coco_img_split_dir = coco_img_dir.joinpath(SPLIT2DIR[args.split]) 51 | 52 | dataset_name = 'COCO' 53 | 54 | out_dir = coco_dir.joinpath('features') 55 | if not out_dir.exists(): 56 | out_dir.mkdir() 57 | 58 | print('Load images from', coco_img_split_dir) 59 | print('# Images:', len(list(coco_img_split_dir.iterdir()))) 60 | 61 | dataset = COCODataset(coco_img_split_dir) 62 | 63 | dataloader = DataLoader(dataset, batch_size=args.batchsize, 64 | shuffle=False, collate_fn=collate_fn, num_workers=4) 65 | 66 | output_fname = out_dir.joinpath(f'{args.split}_boxes{NUM_OBJECTS}.h5') 67 | print('features will be saved at', output_fname) 68 | 69 | desc = f'{dataset_name}_{args.split}_{(NUM_OBJECTS, DIM)}' 70 | 71 | extract(output_fname, dataloader, desc) 72 | -------------------------------------------------------------------------------- /feature_extraction/coco_val_compact.py: -------------------------------------------------------------------------------- 1 | import h5py 2 | from tqdm import tqdm 3 | import json 4 | import pathlib 5 | import argparse 6 | 7 | if __name__ == '__main__': 8 | 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--batchsize', default=1, type=int, help='batch_size') 11 | parser.add_argument('--data_dir', type=str, 12 | default='.') 13 | 14 | args = parser.parse_args() 15 | 16 | data_dir = pathlib.Path(args.data_dir).resolve() 17 | coco_dir = data_dir.joinpath('COCO') 18 | 19 | with open(data_dir.joinpath('lxmert/mscoco_resplit_val.json'))as f: 20 | val_data = json.load(f) 21 | 22 | print(len(val_data)) 23 | 24 | source_f = h5py.File(coco_dir.joinpath('features/val2014_obj36.h5'), 'r') 25 | target_f = h5py.File(coco_dir.joinpath('features/resplit_val_obj36.h5'), 'w') 26 | 27 | img_id = val_data[0]['img_id'] 28 | 29 | keys = list(source_f[img_id].keys()) 30 | 31 | for datum in tqdm(val_data, ncols=50): 32 | img_id = datum['img_id'] 33 | 34 | grp = target_f.create_group(str(img_id)) 35 | for k in keys: 36 | grp[k] = source_f[f'{img_id}/{k}'][()] 37 | -------------------------------------------------------------------------------- /feature_extraction/flickr30k_proposal.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | from detectron2_proposal_maxnms import collate_fn, extract, NUM_OBJECTS, DIM 4 | from torch.utils.data import Dataset, DataLoader 5 | import cv2 6 | from tqdm import tqdm 7 | from pathlib import Path 8 | import argparse 9 | 10 | 11 | class Flickr30KDataset(Dataset): 12 | def __init__(self, image_dir): 13 | self.image_dir = image_dir 
14 | self.image_path_list = list(tqdm(image_dir.iterdir())) 15 | self.n_images = len(self.image_path_list) 16 | 17 | # self.transform = image_transform 18 | 19 | def __len__(self): 20 | return self.n_images 21 | 22 | def __getitem__(self, idx): 23 | image_path = self.image_path_list[idx] 24 | image_id = image_path.stem 25 | 26 | img = cv2.imread(str(image_path)) 27 | 28 | return { 29 | 'img_id': image_id, 30 | 'img': img 31 | } 32 | 33 | if __name__ == "__main__": 34 | 35 | parser = argparse.ArgumentParser() 36 | parser.add_argument('--batchsize', default=1, type=int, help='batch_size') 37 | parser.add_argument('--flickrroot', type=str, 38 | default='/ssd-playpen/home/jmincho/workspace/datasets/flickr30k/') 39 | parser.add_argument('--split', type=str, default=None, choices=['trainval', 'test2017', 'test2018']) 40 | 41 | args = parser.parse_args() 42 | 43 | SPLIT2DIR = { 44 | 'trainval': 'flickr30k_images', 45 | 'test2017': 'test_2017_flickr_images', 46 | 'test2018': 'test_2018_flickr_images', 47 | } 48 | 49 | flickr_dir = Path(args.flickrroot).resolve() 50 | flickr_img_dir = flickr_dir.joinpath('flickr30k_images/').joinpath(SPLIT2DIR[args.split]) 51 | 52 | dataset_name = 'Flickr30K' 53 | 54 | out_dir = flickr_dir.joinpath('features') 55 | if not out_dir.exists(): 56 | out_dir.mkdir() 57 | 58 | print('Load images from', flickr_img_dir) 59 | print('# Images:', len(list(flickr_img_dir.iterdir()))) 60 | 61 | dataset = Flickr30KDataset(flickr_img_dir) 62 | 63 | dataloader = DataLoader(dataset, batch_size=args.batchsize, 64 | shuffle=False, collate_fn=collate_fn, num_workers=4) 65 | 66 | output_fname = out_dir.joinpath(f'{args.split}_boxes{NUM_OBJECTS}.h5') 67 | print('features will be saved at', output_fname) 68 | 69 | desc = f'{dataset_name}_{args.split}_{(NUM_OBJECTS, DIM)}' 70 | 71 | extract(output_fname, dataloader, desc) 72 | -------------------------------------------------------------------------------- /feature_extraction/process.sh: -------------------------------------------------------------------------------- 1 | # wget https://nlp.cs.unc.edu/data/lxmert_data/mscoco_imgfeat/train2014_obj36.zip 2 | # wget https://nlp.cs.unc.edu/data/lxmert_data/mscoco_imgfeat/val2014_obj36.zip 3 | unzip train2014_obj36.zip -d . 4 | unzip val2014_obj36.zip -d . 5 | python tsv_to_h5.py --tsv_path train2014_obj36.tsv --h5_path train2014_obj36.h5 6 | python tsv_to_h5.py --tsv_path val2014_obj36.tsv --h5_path val2014_obj36.h5 7 | # Get resplit_val_obj36.h5 from val2014_obj36.h5 8 | python coco_val_compact.py 9 | 10 | # Pretrain(VG)/GQA: Download LXMERT's VG features (tsv) and convert to hdf5 11 | # wget https://nlp.cs.unc.edu/data/lxmert_data/vg_gqa_imgfeat/vg_gqa_obj36.zip 12 | unzip vg_gqa_obj36.zip -d . 13 | python tsv_to_h5.py --tsv_path vg_gqa_obj36.tsv --h5_path vg_gqa_obj36.h5 14 | 15 | # RefCOCOg 16 | python refcocog_gt.py --split train 17 | python refcocog_mattnet.py --split val 18 | python refcocog_mattnet.py --split test 19 | 20 | # NLVR2: Download LXMERT's COCO features (tsv) and convert to hdf5 21 | # wget https://nlp.cs.unc.edu/data/lxmert_data/nlvr2_imgfeat/train_obj36.zip 22 | # wget https://nlp.cs.unc.edu/data/lxmert_data/nlvr2_imgfeat/valid_obj36.zip 23 | # wget https://nlp.cs.unc.edu/data/lxmert_data/nlvr2_imgfeat/test_obj36.zip 24 | unzip train_obj36.zip -d . 25 | unzip valid_obj36.zip -d . 26 | unzip test_obj36.zip -d .
27 | 28 | python tsv_to_h5.py --tsv_path train_obj36.tsv --h5_path train_obj36.h5 29 | python tsv_to_h5.py --tsv_path valid_obj36.tsv --h5_path valid_obj36.h5 30 | python tsv_to_h5.py --tsv_path test_obj36.tsv --h5_path test_obj36.h5 31 | 32 | # Multi30K 33 | # Download images following https://github.com/multi30k/dataset 34 | python flickr30k_proposal.py --split trainval 35 | python flickr30k_proposal.py --split test2017 36 | python flickr30k_proposal.py --split test2018 37 | -------------------------------------------------------------------------------- /feature_extraction/refcocog_gt.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | from pathlib import Path 4 | import argparse 5 | import json 6 | 7 | import cv2 8 | import numpy as np 9 | from tqdm import tqdm 10 | from torch.utils.data import Dataset, DataLoader 11 | 12 | from detectron2_given_box_maxnms import extract, DIM 13 | 14 | from pycocotools.coco import COCO 15 | 16 | 17 | class RefCOCODataset(Dataset): 18 | def __init__(self, refcoco_dir, refcoco_images_dir, coco_dir, split='val'): 19 | 20 | self.image_dir = refcoco_images_dir 21 | 22 | # coco_train_annFile = coco_dir.joinpath('annotations/instances_train2014.json') 23 | # self.coco = COCO(coco_train_annFile) 24 | 25 | assert split in ['train', 'val', 'test'] 26 | 27 | workspace_dir = Path(__file__).resolve().parent.parent 28 | refcoco_util_dir = workspace_dir.joinpath('refcoco_utils') 29 | import sys 30 | sys.path.append(str(refcoco_util_dir)) 31 | from refer import REFER 32 | self.refer = REFER('refcocog', 'umd') 33 | 34 | ref_ids = self.refer.getRefIds(split=split) 35 | 36 | id2dets = {} 37 | img_ids = [] 38 | image_fns = [] 39 | for ref_id in ref_ids: 40 | ref = self.refer.Refs[ref_id] 41 | img_id = ref['image_id'] 42 | 43 | if img_id not in img_ids: 44 | img_ids.append(img_id) 45 | 46 | fn_ann = ref['file_name'] 47 | 48 | # COCO_train2014_000000419645_398406.jpg 49 | # COCO_train2014_000000419645.jpg 50 | 51 | suffix = fn_ann.split('.')[-1] 52 | 53 | fname = '_'.join(fn_ann.split('_')[:-1]) + '.' 
+ suffix 54 | 55 | image_fns.append(fname) 56 | 57 | detections = self.refer.imgToAnns[img_id] 58 | 59 | id2dets[img_id] = detections 60 | 61 | self.image_ids = img_ids 62 | self.image_fns = image_fns 63 | self.id2dets = id2dets 64 | 65 | def __len__(self): 66 | return len(self.image_ids) 67 | 68 | def __getitem__(self, idx): 69 | 70 | image_id = self.image_ids[idx] 71 | image_fn = self.image_fns[idx] 72 | image_path = self.image_dir.joinpath(image_fn) 73 | 74 | assert Path(image_path).exists(), image_path 75 | 76 | img = cv2.imread(str(image_path)) 77 | 78 | H, W, C = img.shape 79 | 80 | dets = self.id2dets[image_id] 81 | # cat_names = [det['category_name'] for det in dets] 82 | 83 | boxes = [] 84 | for i, region in enumerate([det['bbox'] for det in dets]): 85 | # (x1, y1, x2, y2) 86 | x, y, w, h = region[:4] 87 | x1, y1, x2, y2 = x, y, x+w, y+h 88 | 89 | # x1, y1, x2, y2 = region[:4] 90 | 91 | assert x2 <= W, (image_id, i, region) 92 | assert y2 <= H, (image_id, i, region) 93 | 94 | box = [x1, y1, x2, y2] 95 | boxes.append(box) 96 | 97 | boxes = np.array(boxes) 98 | 99 | return { 100 | 'img_id': str(image_id), 101 | 'img_fn': image_fn, 102 | 'img': img, 103 | 'boxes': boxes, 104 | # 'captions': cat_names 105 | } 106 | 107 | def collate_fn(batch): 108 | img_ids = [] 109 | imgs = [] 110 | 111 | boxes = [] 112 | 113 | captions = [] 114 | 115 | for i, entry in enumerate(batch): 116 | img_ids.append(entry['img_id']) 117 | imgs.append(entry['img']) 118 | boxes.append(entry['boxes']) 119 | # captions.append(entry['captions']) 120 | 121 | batch_out = {} 122 | batch_out['img_ids'] = img_ids 123 | batch_out['imgs'] = imgs 124 | 125 | batch_out['boxes'] = boxes 126 | 127 | # batch_out['captions'] = captions 128 | 129 | return batch_out 130 | 131 | 132 | if __name__ == "__main__": 133 | 134 | parser = argparse.ArgumentParser() 135 | parser.add_argument('--batchsize', default=1, type=int, help='batch_size') 136 | parser.add_argument('--refcocoroot', type=str, default='RefCOCO/') 137 | parser.add_argument('--cocoroot', type=str, default='COCO/') 138 | parser.add_argument('--split', type=str, default='val', choices=['train', 'val', 'test']) 139 | 140 | args = parser.parse_args() 141 | 142 | refcoco_dir = Path(args.refcocoroot).resolve() 143 | refcocog_dir = refcoco_dir.joinpath('refcocog') 144 | coco_dir = Path(args.cocoroot).resolve() 145 | refcoco_images_dir = coco_dir.joinpath('images/train2014') 146 | dataset_name = 'RefCOCOg' 147 | 148 | out_dir = refcocog_dir.joinpath('features') 149 | if not out_dir.exists(): 150 | out_dir.mkdir() 151 | 152 | dataset = RefCOCODataset(refcoco_dir, refcoco_images_dir, coco_dir, args.split) 153 | print('# Images:', len(dataset)) 154 | 155 | dataloader = DataLoader(dataset, batch_size=args.batchsize, 156 | shuffle=False, collate_fn=collate_fn, num_workers=4) 157 | 158 | output_fname = out_dir.joinpath(f'{args.split}_boxes_GT.h5') 159 | print('features will be saved at', output_fname) 160 | 161 | desc = f'{dataset_name}_given_boxes_({DIM})' 162 | 163 | extract(output_fname, dataloader, desc) 164 | -------------------------------------------------------------------------------- /feature_extraction/refcocog_mattnet.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | from pathlib import Path 4 | import argparse 5 | import json 6 | 7 | import cv2 8 | import numpy as np 9 | from tqdm import tqdm 10 | from torch.utils.data import Dataset, DataLoader 11 | 12 | from detectron2_given_box_maxnms import extract, DIM 13 
| 14 | class RefCOCODataset(Dataset): 15 | def __init__(self, refcoco_dir, refcoco_images_dir, split='val'): 16 | 17 | self.image_dir = refcoco_images_dir 18 | 19 | mattnet_maskrcnn_detections_path = refcoco_dir.joinpath('detections/refcocog_umd/res101_coco_minus_refer_notime_dets.json') 20 | with open(mattnet_maskrcnn_detections_path) as f: 21 | mattnet_maskrcnn_detections = json.load(f) 22 | 23 | id2dets = {} 24 | for det in mattnet_maskrcnn_detections: 25 | image_id = det['image_id'] 26 | if image_id not in id2dets: 27 | id2dets[image_id] = [] 28 | id2dets[image_id].append(det) 29 | self.id2dets = id2dets 30 | 31 | print('Load mattnet detections from', mattnet_maskrcnn_detections_path) 32 | 33 | assert split in ['train', 'val', 'test'] 34 | 35 | workspace_dir = Path(__file__).resolve().parent.parent 36 | refcoco_util_dir = workspace_dir.joinpath('refcoco_utils') 37 | import sys 38 | sys.path.append(str(refcoco_util_dir)) 39 | from refer import REFER 40 | self.refer = REFER('refcocog', 'umd') 41 | 42 | ref_ids = self.refer.getRefIds(split=split) 43 | img_ids = [] 44 | image_fns = [] 45 | for ref_id in ref_ids: 46 | ref = self.refer.Refs[ref_id] 47 | img_id = ref['image_id'] 48 | 49 | if img_id not in img_ids: 50 | img_ids.append(img_id) 51 | 52 | fn_ann = ref['file_name'] 53 | 54 | # COCO_train2014_000000419645_398406.jpg 55 | # COCO_train2014_000000419645.jpg 56 | 57 | suffix = fn_ann.split('.')[-1] 58 | 59 | fname = '_'.join(fn_ann.split('_')[:-1]) + '.' + suffix 60 | 61 | image_fns.append(fname) 62 | 63 | self.image_ids = img_ids 64 | self.image_fns = image_fns 65 | 66 | def __len__(self): 67 | return len(self.image_ids) 68 | 69 | def __getitem__(self, idx): 70 | 71 | image_id = self.image_ids[idx] 72 | image_fn = self.image_fns[idx] 73 | image_path = self.image_dir.joinpath(image_fn) 74 | 75 | assert Path(image_path).exists(), image_path 76 | 77 | img = cv2.imread(str(image_path)) 78 | 79 | H, W, C = img.shape 80 | 81 | dets = self.id2dets[image_id] 82 | cat_names = [det['category_name'] for det in dets] 83 | 84 | boxes = [] 85 | for i, region in enumerate([det['box'] for det in dets]): 86 | # (x1, y1, x2, y2) 87 | x,y,w,h = region[:4] 88 | 89 | x1, y1, x2, y2 = x, y, x+w, y+h 90 | 91 | assert x2 <= W, (image_id, i, region) 92 | assert y2 <= H, (image_id, i, region) 93 | 94 | box = [x1, y1, x2, y2] 95 | boxes.append(box) 96 | 97 | boxes = np.array(boxes) 98 | 99 | return { 100 | 'img_id': str(image_id), 101 | 'img_fn': image_fn, 102 | 'img': img, 103 | 'boxes': boxes, 104 | 'captions': cat_names 105 | } 106 | 107 | 108 | def collate_fn(batch): 109 | img_ids = [] 110 | imgs = [] 111 | 112 | boxes = [] 113 | 114 | captions = [] 115 | 116 | for i, entry in enumerate(batch): 117 | img_ids.append(entry['img_id']) 118 | imgs.append(entry['img']) 119 | boxes.append(entry['boxes']) 120 | captions.append(entry['captions']) 121 | 122 | batch_out = {} 123 | batch_out['img_ids'] = img_ids 124 | batch_out['imgs'] = imgs 125 | 126 | batch_out['boxes'] = boxes 127 | 128 | batch_out['captions'] = captions 129 | 130 | return batch_out 131 | 132 | 133 | if __name__ == "__main__": 134 | 135 | parser = argparse.ArgumentParser() 136 | parser.add_argument('--batchsize', default=1, type=int, help='batch_size') 137 | parser.add_argument('--refcocoroot', type=str, default='/ssd-playpen/home/jmincho/workspace/datasets/RefCOCO/') 138 | parser.add_argument('--cocoroot', type=str, default='/ssd-playpen/home/jmincho/workspace/datasets/COCO/') 139 | parser.add_argument('--split', type=str, default='val', 
choices=['train', 'val', 'test']) 140 | 141 | args = parser.parse_args() 142 | 143 | refcoco_dir = Path(args.refcocoroot).resolve() 144 | refcocog_dir = refcoco_dir.joinpath('refcocog') 145 | coco_dir = Path(args.cocoroot).resolve() 146 | refcoco_images_dir = coco_dir.joinpath('images/train2014') 147 | dataset_name = 'RefCOCOg' 148 | 149 | out_dir = refcocog_dir.joinpath('features') 150 | if not out_dir.exists(): 151 | out_dir.mkdir() 152 | 153 | dataset = RefCOCODataset(refcoco_dir, refcoco_images_dir, args.split) 154 | print('# Images:', len(dataset)) 155 | 156 | dataloader = DataLoader(dataset, batch_size=args.batchsize, 157 | shuffle=False, collate_fn=collate_fn, num_workers=4) 158 | 159 | output_fname = out_dir.joinpath(f'{args.split}_boxes_mattnet.h5') 160 | print('features will be saved at', output_fname) 161 | 162 | desc = f'{dataset_name}_given_boxes_({DIM})' 163 | 164 | extract(output_fname, dataloader, desc) 165 | -------------------------------------------------------------------------------- /feature_extraction/tsv_to_h5.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyleft 2019 Project LXRT 3 | 4 | import sys 5 | import csv 6 | import base64 7 | import time 8 | from tqdm import tqdm 9 | import numpy as np 10 | import h5py 11 | import argparse 12 | 13 | csv.field_size_limit(sys.maxsize) 14 | FIELDNAMES = ["img_id", "img_h", "img_w", "objects_id", "objects_conf", 15 | "attrs_id", "attrs_conf", "num_boxes", "boxes", "features"] 16 | 17 | 18 | def load_obj_tsv(fname, topk=None): 19 | """Load object features from tsv file. 20 | :param fname: The path to the tsv file. 21 | :param topk: Only load features for top K images (lines) in the tsv file. 22 | Will load all the features if topk is either -1 or None. 23 | :return: A list of image object features where each feature is a dict. 24 | See FIELDNAMES above for the keys in the feature dict. 25 | """ 26 | data = [] 27 | start_time = time.time() 28 | print("Start to load Faster-RCNN detected objects from %s" % fname) 29 | with open(fname) as f: 30 | reader = csv.DictReader(f, FIELDNAMES, delimiter="\t") 31 | for i, item in tqdm(enumerate(reader), ncols=150): 32 | 33 | for key in ['img_h', 'img_w', 'num_boxes']: 34 | item[key] = int(item[key]) 35 | 36 | boxes = item['num_boxes'] 37 | decode_config = [ 38 | ('objects_id', (boxes, ), np.int64), 39 | ('objects_conf', (boxes, ), np.float32), 40 | ('attrs_id', (boxes, ), np.int64), 41 | ('attrs_conf', (boxes, ), np.float32), 42 | ('boxes', (boxes, 4), np.float32), 43 | ('features', (boxes, -1), np.float32), 44 | ] 45 | for key, shape, dtype in decode_config: 46 | item[key] = np.frombuffer( 47 | base64.b64decode(item[key]), dtype=dtype) 48 | item[key] = item[key].reshape(shape) 49 | item[key].setflags(write=False) 50 | 51 | data.append(item) 52 | if topk is not None and len(data) == topk: 53 | break 54 | elapsed_time = time.time() - start_time 55 | print("Loaded %d images in file %s in %d seconds."
% 56 | (len(data), fname, elapsed_time)) 57 | return data 58 | 59 | if __name__ == '__main__': 60 | 61 | parser = argparse.ArgumentParser() 62 | parser.add_argument('--tsv_path', type=str, 63 | default='val2014_obj36.tsv') 64 | parser.add_argument('--h5_path', type=str, 65 | default='val2014_obj36.h5') 66 | 67 | args = parser.parse_args() 68 | dim = 2048 69 | 70 | print('Load ', args.tsv_path) 71 | data = load_obj_tsv(args.tsv_path) 72 | print('# data:', len(data)) 73 | 74 | output_fname = args.h5_path 75 | print('features will be saved at', output_fname) 76 | 77 | with h5py.File(output_fname, 'w') as f: 78 | for i, datum in tqdm(enumerate(data), 79 | ncols=150,): 80 | 81 | img_id = datum['img_id'] 82 | 83 | num_boxes = datum['num_boxes'] 84 | 85 | grp = f.create_group(img_id) 86 | grp['features'] = datum['features'].reshape(num_boxes, 2048) 87 | grp['obj_id'] = datum['objects_id'] 88 | grp['obj_conf'] = datum['objects_conf'] 89 | grp['attr_id'] = datum['attrs_id'] 90 | grp['attr_conf'] = datum['attrs_conf'] 91 | grp['boxes'] = datum['boxes'] 92 | grp['img_w'] = datum['img_w'] 93 | grp['img_h'] = datum['img_h'] 94 | -------------------------------------------------------------------------------- /feature_extraction/vcr_gt.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | from pathlib import Path 4 | import argparse 5 | import json 6 | 7 | import cv2 8 | import numpy as np 9 | from tqdm import tqdm 10 | from torch.utils.data import Dataset, DataLoader 11 | 12 | from detectron2_given_box_maxnms import extract, DIM 13 | 14 | class VCRDataset(Dataset): 15 | def __init__(self, vcr_dir, vcr_images_dir, split='val'): 16 | 17 | self.image_dir = vcr_images_dir 18 | ann_path = vcr_dir.joinpath(f'{split}.jsonl') 19 | 20 | with open(ann_path, 'r') as f: 21 | _items = [json.loads(s) for s in f] 22 | print('Load images from', ann_path) 23 | 24 | image_ids = [] 25 | image_paths = [] 26 | items = [] 27 | for item in _items: 28 | if item['img_id'] not in image_ids: 29 | items.append(item) 30 | image_ids.append(item['img_id']) 31 | image_paths.append(item['img_fn']) 32 | 33 | self.items = items 34 | self.n_images = len(items) 35 | 36 | def __len__(self): 37 | return self.n_images 38 | 39 | def __getitem__(self, idx): 40 | 41 | item = self.items[idx] 42 | image_path = item['img_fn'] 43 | image_id = item['img_id'] 44 | 45 | image_path = self.image_dir.joinpath(image_path) 46 | 47 | assert Path(image_path).exists() 48 | 49 | img = cv2.imread(str(image_path)) 50 | 51 | metadata_path = self.image_dir.joinpath(item['metadata_fn']) 52 | with open(metadata_path) as f: 53 | metadata = json.load(f) 54 | boxes = [] 55 | regions = metadata['boxes'] 56 | 57 | for i, region in enumerate(regions): 58 | # (x1, y1, x2, y2) 59 | x1, y1, x2, y2 = region[:4] 60 | 61 | # assert x2 <= W, (image_id, i, region) 62 | # assert y2 <= H, (image_id, i, region) 63 | 64 | box = [x1, y1, x2, y2] 65 | boxes.append(box) 66 | 67 | 68 | boxes = np.array(boxes) 69 | 70 | return { 71 | 'img_id': image_id, 72 | 'img': img, 73 | 'boxes': boxes, 74 | 'captions': metadata['names'] 75 | } 76 | 77 | 78 | def collate_fn(batch): 79 | img_ids = [] 80 | imgs = [] 81 | 82 | boxes = [] 83 | 84 | captions = [] 85 | 86 | for i, entry in enumerate(batch): 87 | img_ids.append(entry['img_id']) 88 | imgs.append(entry['img']) 89 | boxes.append(entry['boxes']) 90 | captions.append(entry['captions']) 91 | 92 | batch_out = {} 93 | batch_out['img_ids'] = img_ids 94 | batch_out['imgs'] = imgs 95 | 96 | 
batch_out['boxes'] = boxes 97 | 98 | batch_out['captions'] = captions 99 | 100 | return batch_out 101 | 102 | 103 | if __name__ == "__main__": 104 | 105 | parser = argparse.ArgumentParser() 106 | parser.add_argument('--batchsize', default=1, type=int, help='batch_size') 107 | parser.add_argument('--vcrroot', type=str, default='/ssd-playpen/home/jmincho/workspace/datasets/VCR/') 108 | parser.add_argument('--split', type=str, default='val', choices=['train', 'val', 'test']) 109 | 110 | args = parser.parse_args() 111 | 112 | vcr_dir = Path(args.vcrroot).resolve() 113 | vcr_images_dir = vcr_dir.joinpath('vcr1images') 114 | dataset_name = 'VCR' 115 | 116 | out_dir = vcr_dir.joinpath('features') 117 | if not out_dir.exists(): 118 | out_dir.mkdir() 119 | 120 | dataset = VCRDataset(vcr_dir, vcr_images_dir, args.split) 121 | print('# Images:', len(dataset)) 122 | 123 | dataloader = DataLoader(dataset, batch_size=args.batchsize, 124 | shuffle=False, collate_fn=collate_fn, num_workers=4) 125 | 126 | output_fname = out_dir.joinpath(f'{args.split}_boxes_GT.h5') 127 | print('features will be saved at', output_fname) 128 | 129 | desc = f'{dataset_name}_given_boxes_({DIM})' 130 | 131 | extract(output_fname, dataloader, desc) 132 | -------------------------------------------------------------------------------- /feature_extraction/vcr_proposal.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | from detectron2_proposal_maxnms import collate_fn, extract, NUM_OBJECTS, DIM 4 | from torch.utils.data import Dataset, DataLoader 5 | import h5py 6 | import torch 7 | import cv2 8 | from tqdm import tqdm 9 | from pathlib import Path 10 | import argparse 11 | import json 12 | 13 | 14 | class VCRDataset(Dataset): 15 | def __init__(self, vcr_dir, vcr_images_dir, split='val'): 16 | 17 | self.image_dir = vcr_images_dir 18 | ann_path = vcr_dir.joinpath(f'{split}.jsonl') 19 | 20 | with open(ann_path, 'r') as f: 21 | _items = [json.loads(s) for s in f] 22 | print('Load images from', ann_path) 23 | 24 | image_ids = [] 25 | image_paths = [] 26 | items = [] 27 | for item in _items: 28 | if item['img_id'] not in image_ids: 29 | items.append(item) 30 | image_ids.append(item['img_id']) 31 | image_paths.append(item['img_fn']) 32 | 33 | self.items = items 34 | self.n_images = len(items) 35 | 36 | def __len__(self): 37 | return self.n_images 38 | 39 | def __getitem__(self, idx): 40 | 41 | item = self.items[idx] 42 | image_path = item['img_fn'] 43 | image_id = item['img_id'] 44 | 45 | image_path = self.image_dir.joinpath(image_path) 46 | 47 | assert Path(image_path).exists() 48 | 49 | img = cv2.imread(str(image_path)) 50 | 51 | return { 52 | 'img_id': image_id, 53 | 'img': img 54 | } 55 | 56 | if __name__ == "__main__": 57 | 58 | parser = argparse.ArgumentParser() 59 | parser.add_argument('--batchsize', default=1, type=int, help='batch_size') 60 | parser.add_argument('--vcrroot', type=str, default='/ssd-playpen/home/jmincho/workspace/datasets/VCR/') 61 | parser.add_argument('--split', type=str, default='val', choices=['train', 'val', 'test']) 62 | 63 | args = parser.parse_args() 64 | 65 | vcr_dir = Path(args.vcrroot).resolve() 66 | vcr_images_dir = vcr_dir.joinpath('vcr1images') 67 | dataset_name = 'VCR' 68 | 69 | out_dir = vcr_dir.joinpath('features') 70 | if not out_dir.exists(): 71 | out_dir.mkdir() 72 | 73 | # print('Load images from', coco_img_split_dir) 74 | 75 | dataset = VCRDataset(vcr_dir, vcr_images_dir, args.split) 76 | print('# Images:', len(dataset)) 77 | 78 | 
dataloader = DataLoader(dataset, batch_size=args.batchsize, 79 | shuffle=False, collate_fn=collate_fn, num_workers=4) 80 | 81 | output_fname = out_dir.joinpath(f'{args.split}_boxes{NUM_OBJECTS}.h5') 82 | print('features will be saved at', output_fname) 83 | 84 | desc = f'{dataset_name}_{args.split}_{(NUM_OBJECTS, DIM)}' 85 | 86 | extract(output_fname, dataloader, desc) 87 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=1.6.0 2 | transformers==4.2.1 3 | sentencepiece 4 | h5py 5 | wandb 6 | tqdm 7 | numpy 8 | pandas 9 | matplotlib 10 | ftfy 11 | timm 12 | pyyaml 13 | sacrebleu 14 | git+git://github.com/j-min/language-evaluation@master 15 | wget --------------------------------------------------------------------------------