├── .gitignore ├── CLIP-ViL ├── LICENSE ├── clip │ ├── LICENSE │ ├── __init__.py │ ├── adapter_config.py │ ├── bpe_simple_vocab_16e6.txt.gz │ ├── clip.py │ ├── model.py │ └── simple_tokenizer.py ├── data │ ├── gqa │ │ ├── trainval_ans2label.json │ │ └── trainval_label2ans.json │ ├── mscoco │ │ └── README.md │ └── vqa │ │ ├── trainval_ans2label.json │ │ └── trainval_label2ans.json ├── readme.md ├── scripts │ ├── gqa_adapters.sh │ ├── gqa_baseline.sh │ ├── pretrain.bash │ ├── snli-ve_adapters.sh │ ├── snli-ve_baseline.sh │ ├── vqa_adapters.sh │ └── vqa_baseline.sh └── src │ ├── lxrt │ ├── adapters │ │ ├── __init__.py │ │ ├── adapter_configuration.py │ │ ├── adapter_controller.py │ │ ├── adapter_hypernetwork.py │ │ ├── adapter_modeling.py │ │ ├── adapter_outputs.py │ │ ├── adapter_utils.py │ │ ├── config.py │ │ ├── hypercomplex │ │ │ ├── __init__.py │ │ │ ├── inits.py │ │ │ ├── kronecker.py │ │ │ └── layers.py │ │ └── low_rank_layer.py │ ├── entry.py │ ├── file_utils.py │ ├── modeling.py │ ├── optimization.py │ ├── tokenization.py │ └── visual_transformers.py │ ├── param.py │ ├── pretrain │ ├── lxmert_data.py │ ├── lxmert_pretrain.py │ └── qa_answer_table.py │ ├── tasks │ ├── gqa.py │ ├── gqa_data.py │ ├── gqa_model.py │ ├── snli.py │ ├── snli_data.py │ ├── vision_helpers.py │ ├── vqa.py │ ├── vqa_data.py │ └── vqa_model.py │ ├── tools │ ├── lmdb_dataset.py │ ├── load_stagte_dict.py │ ├── resize_images.py │ ├── sharearray.py │ └── vision_helpers.py │ └── utils.py ├── LICENSE ├── README.md ├── VL-T5 ├── inference │ ├── README.md │ ├── extracting_data.py │ ├── modeling_frcnn.py │ ├── processing_image.py │ ├── utils.py │ └── visualizing_image.py ├── requirements.txt ├── scripts │ ├── image │ │ ├── full_finetuning.sh │ │ ├── hyperformer.sh │ │ ├── multiple_adapters.sh │ │ ├── multiple_compacters.sh │ │ ├── multiple_lora.sh │ │ ├── multiple_prompts.sh │ │ ├── single_adapter.sh │ │ ├── single_compacter.sh │ │ ├── single_lora.sh │ │ └── single_prompt.sh │ └── video │ │ ├── full_finetuning.sh │ │ ├── single_adapter.sh │ │ ├── single_lora.sh │ │ └── single_prompt.sh └── src │ ├── activitynet.py │ ├── activitynet_data.py │ ├── activitynet_model.py │ ├── adapters │ ├── __init__.py │ ├── adapter_configuration.py │ ├── adapter_controller.py │ ├── adapter_hypernetwork.py │ ├── adapter_modeling.py │ ├── adapter_outputs.py │ ├── adapter_utils.py │ ├── config.py │ ├── hypercomplex │ │ ├── __init__.py │ │ ├── inits.py │ │ ├── kronecker.py │ │ └── layers.py │ └── low_rank_layer.py │ ├── caption.py │ ├── caption_clip_data.py │ ├── caption_data.py │ ├── caption_model.py │ ├── caption_raw_data.py │ ├── classification.py │ ├── classification_clip_data.py │ ├── classification_model.py │ ├── classification_raw_data.py │ ├── clip │ ├── __init__.py │ ├── bpe_simple_vocab_16e6.txt.gz │ ├── clip.py │ ├── model.py │ └── simple_tokenizer.py │ ├── clip_prepro_feats.py │ ├── dist_utils.py │ ├── gqa.py │ ├── gqa_clip_data.py │ ├── gqa_data.py │ ├── gqa_model.py │ ├── gqa_raw_data.py │ ├── how2qa.py │ ├── lora │ ├── __init__.py │ ├── config.py │ ├── controller.py │ ├── layers.py │ └── utils.py │ ├── mmt.py │ ├── mmt_data.py │ ├── mmt_model.py │ ├── modeling_bart.py │ ├── modeling_prefix_bart.py │ ├── modeling_t5.py │ ├── multitask.py │ ├── multitask_data.py │ ├── multitask_model.py │ ├── multitask_video.py │ ├── my_deepspeed.py │ ├── my_transformers │ ├── __init__.py │ ├── modeling_bart.py │ └── modeling_t5.py │ ├── nlvr.py │ ├── nlvr_clip_data.py │ ├── nlvr_data.py │ ├── nlvr_model.py │ ├── nlvr_raw_data.py │ ├── param.py │ 
├── preprocess.py │ ├── pretrain.py │ ├── pretrain_data.py │ ├── pretrain_model.py │ ├── pretrain_raw_data.py │ ├── pretrain_vcr.py │ ├── pretrain_vcr_data.py │ ├── prompt │ ├── __init__.py │ ├── config.py │ ├── prompt_controller.py │ └── prompt_modeling.py │ ├── qa_answer_table.py │ ├── refcoco.py │ ├── refcoco_data.py │ ├── refcoco_model.py │ ├── refcoco_utils.py │ ├── tokenization.py │ ├── trainer_base.py │ ├── tvc.py │ ├── tvqa.py │ ├── utils.py │ ├── vcr.py │ ├── vcr_data.py │ ├── vcr_model.py │ ├── video │ ├── how2qa_data.py │ ├── tvc_data.py │ ├── tvqa_data.py │ ├── tvqa_matching_data.py │ ├── tvr_data.py │ ├── video_matching_model.py │ ├── video_model.py │ └── yc2c_data.py │ ├── vis_encoder.py │ ├── vqa.py │ ├── vqa_clip_data.py │ ├── vqa_data.py │ ├── vqa_model.py │ ├── vqa_raw_data.py │ └── yc2c.py ├── assets └── vl_adapter_teaser.png ├── download_backbones.py ├── feature_extraction ├── README.md ├── coco_CLIP.py ├── coco_gt.py ├── coco_proposal.py ├── coco_val_compact.py ├── detectron2_given_box_maxnms.py ├── detectron2_proposal_maxnms.py ├── flickr30k_proposal.py ├── process.sh ├── refcocog_gt.py ├── refcocog_mattnet.py ├── tsv_to_h5.py ├── vcr_gt.py └── vcr_proposal.py ├── inference_example.ipynb └── requirements.txt /.gitignore: -------------------------------------------------------------------------------- 1 | # Initially taken from Github's Python gitignore file 2 | 3 | # Byte-compiled / optimized / DLL files 4 | __pycache__/ 5 | *.py[cod] 6 | *$py.class 7 | 8 | # C extensions 9 | *.so 10 | 11 | # tests and logs 12 | tests/fixtures/* 13 | !tests/fixtures/sample_text_no_unicode.txt 14 | logs/ 15 | lightning_logs/ 16 | lang_code_data/ 17 | **/slurm* 18 | **/wandb 19 | **/snap 20 | datasets 21 | 22 | # Distribution / packaging 23 | .Python 24 | build/ 25 | develop-eggs/ 26 | dist/ 27 | downloads/ 28 | eggs/ 29 | .eggs/ 30 | lib/ 31 | lib64/ 32 | parts/ 33 | sdist/ 34 | var/ 35 | wheels/ 36 | *.egg-info/ 37 | .installed.cfg 38 | *.egg 39 | MANIFEST 40 | 41 | # PyInstaller 42 | # Usually these files are written by a python script from a template 43 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
44 | *.manifest 45 | *.spec 46 | 47 | # Installer logs 48 | pip-log.txt 49 | pip-delete-this-directory.txt 50 | 51 | # Unit test / coverage reports 52 | htmlcov/ 53 | .tox/ 54 | .nox/ 55 | .coverage 56 | .coverage.* 57 | .cache 58 | nosetests.xml 59 | coverage.xml 60 | *.cover 61 | .hypothesis/ 62 | .pytest_cache/ 63 | 64 | # Translations 65 | *.mo 66 | *.pot 67 | 68 | # Django stuff: 69 | *.log 70 | local_settings.py 71 | db.sqlite3 72 | 73 | # Flask stuff: 74 | instance/ 75 | .webassets-cache 76 | 77 | # Scrapy stuff: 78 | .scrapy 79 | 80 | # Sphinx documentation 81 | docs/_build/ 82 | 83 | # PyBuilder 84 | target/ 85 | 86 | # Jupyter Notebook 87 | .ipynb_checkpoints 88 | 89 | # IPython 90 | profile_default/ 91 | ipython_config.py 92 | 93 | # pyenv 94 | .python-version 95 | 96 | # celery beat schedule file 97 | celerybeat-schedule 98 | 99 | # SageMath parsed files 100 | *.sage.py 101 | 102 | # Environments 103 | .env 104 | .venv 105 | env/ 106 | venv/ 107 | ENV/ 108 | env.bak/ 109 | venv.bak/ 110 | 111 | # Spyder project settings 112 | .spyderproject 113 | .spyproject 114 | 115 | # Rope project settings 116 | .ropeproject 117 | 118 | # mkdocs documentation 119 | /site 120 | 121 | # mypy 122 | .mypy_cache/ 123 | .dmypy.json 124 | dmypy.json 125 | 126 | # Pyre type checker 127 | .pyre/ 128 | 129 | # vscode 130 | .vs 131 | .vscode 132 | 133 | # Pycharm 134 | .idea 135 | 136 | # TF code 137 | tensorflow_code 138 | 139 | # Models 140 | proc_data 141 | 142 | # examples 143 | runs 144 | /runs_old 145 | /wandb 146 | /examples/runs 147 | /examples/**/*.args 148 | /examples/rag/sweep 149 | 150 | # data 151 | /data 152 | serialization_dir 153 | 154 | # emacs 155 | *.*~ 156 | debug.env 157 | 158 | # vim 159 | .*.swp 160 | 161 | #ctags 162 | tags 163 | 164 | # pre-commit 165 | .pre-commit* 166 | 167 | # .lock 168 | *.lock 169 | -------------------------------------------------------------------------------- /CLIP-ViL/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Hao Tan 4 | Copyright (c) 2021 Liunian Harold Li 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 
23 | -------------------------------------------------------------------------------- /CLIP-ViL/clip/LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 OpenAI 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | 23 | -------------------------------------------------------------------------------- /CLIP-ViL/clip/__init__.py: -------------------------------------------------------------------------------- 1 | from .clip import * 2 | from .adapter_config import VisionAdapterConfig 3 | -------------------------------------------------------------------------------- /CLIP-ViL/clip/adapter_config.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | 4 | @dataclass 5 | class VisionAdapterConfig(object): 6 | """Implements the adapter configuration proposed by Houlsby et. 
al, 2019 7 | in https://arxiv.org/abs/1902.00751.""" 8 | 9 | reduction_factor: int = 1 10 | -------------------------------------------------------------------------------- /CLIP-ViL/clip/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ylsung/VL_adapter/545fcbbdbbaec4c442de35567f6ae477ff4e8265/CLIP-ViL/clip/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /CLIP-ViL/clip/clip.py: -------------------------------------------------------------------------------- 1 | import hashlib 2 | import os 3 | import urllib 4 | import warnings 5 | from typing import Union, List 6 | 7 | import torch 8 | from PIL import Image 9 | from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize 10 | from tqdm import tqdm 11 | 12 | from .model import build_model 13 | from .simple_tokenizer import SimpleTokenizer as _Tokenizer 14 | 15 | __all__ = ["available_models", "load", "tokenize"] 16 | _tokenizer = _Tokenizer() 17 | 18 | _MODELS = { 19 | "RN50": "https://openaipublic.azureedge.net/clip/models/afeb0e10f9e5a86da6080e35cf09123aca3b358a0c3e3b6c78a7b63bc04b6762/RN50.pt", 20 | "RN101": "https://openaipublic.azureedge.net/clip/models/8fa8567bab74a42d41c5915025a8e4538c3bdbe8804a470a72f30b0d94fab599/RN101.pt", 21 | "RN50x4": "https://openaipublic.azureedge.net/clip/models/7e526bd135e493cef0776de27d5f42653e6b4c8bf9e0f653bb11773263205fdd/RN50x4.pt", 22 | "ViT-B/32": "https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt", 23 | } 24 | 25 | 26 | 27 | def _download(url: str, root: str = os.path.expanduser("~/.cache/clip")): 28 | os.makedirs(root, exist_ok=True) 29 | filename = os.path.basename(url) 30 | 31 | expected_sha256 = url.split("/")[-2] 32 | download_target = os.path.join(root, filename) 33 | 34 | if os.path.exists(download_target) and not os.path.isfile(download_target): 35 | raise RuntimeError(f"{download_target} exists and is not a regular file") 36 | 37 | if os.path.isfile(download_target): 38 | if hashlib.sha256(open(download_target, "rb").read()).hexdigest() == expected_sha256: 39 | return download_target 40 | else: 41 | warnings.warn(f"{download_target} exists, but the SHA256 checksum does not match; re-downloading the file") 42 | 43 | with urllib.request.urlopen(url) as source, open(download_target, "wb") as output: 44 | with tqdm(total=int(source.info().get("Content-Length")), ncols=80) as loop: 45 | while True: 46 | buffer = source.read(8192) 47 | if not buffer: 48 | break 49 | 50 | output.write(buffer) 51 | loop.update(len(buffer)) 52 | 53 | if hashlib.sha256(open(download_target, "rb").read()).hexdigest() != expected_sha256: 54 | raise RuntimeError(f"Model has been downloaded but the SHA256 checksum does not not match") 55 | 56 | return download_target 57 | 58 | 59 | def available_models(): 60 | return list(_MODELS.keys()) 61 | 62 | 63 | def load(name: str, device: Union[str, torch.device] = "cuda" if torch.cuda.is_available() else "cpu", jit=True, adapter_config=None): 64 | if name not in _MODELS: 65 | raise RuntimeError(f"Model {name} not found; available models = {available_models()}") 66 | 67 | model_path = _download(_MODELS[name]) 68 | model = torch.jit.load(model_path, map_location=device if jit else "cpu").eval() 69 | n_px = model.input_resolution.item() 70 | 71 | transform = Compose([ 72 | Resize(n_px, interpolation=Image.BICUBIC), 73 | 
CenterCrop(n_px), 74 | lambda image: image.convert("RGB"), 75 | ToTensor(), 76 | Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)), 77 | ]) 78 | 79 | if not jit: 80 | model = build_model(model.state_dict(), adapter_config).to(device) 81 | if str(device) == "cpu": 82 | model.float() 83 | return model, transform 84 | 85 | # patch the device names 86 | device_holder = torch.jit.trace(lambda: torch.ones([]).to(torch.device(device)), example_inputs=[]) 87 | device_node = [n for n in device_holder.graph.findAllNodes("prim::Constant") if "Device" in repr(n)][-1] 88 | 89 | def patch_device(module): 90 | graphs = [module.graph] if hasattr(module, "graph") else [] 91 | if hasattr(module, "forward1"): 92 | graphs.append(module.forward1.graph) 93 | 94 | for graph in graphs: 95 | for node in graph.findAllNodes("prim::Constant"): 96 | if "value" in node.attributeNames() and str(node["value"]).startswith("cuda"): 97 | node.copyAttributes(device_node) 98 | 99 | model.apply(patch_device) 100 | patch_device(model.encode_image) 101 | patch_device(model.encode_text) 102 | 103 | # patch dtype to float32 on CPU 104 | if str(device) == "cpu": 105 | float_holder = torch.jit.trace(lambda: torch.ones([]).float(), example_inputs=[]) 106 | float_input = list(float_holder.graph.findNode("aten::to").inputs())[1] 107 | float_node = float_input.node() 108 | 109 | def patch_float(module): 110 | graphs = [module.graph] if hasattr(module, "graph") else [] 111 | if hasattr(module, "forward1"): 112 | graphs.append(module.forward1.graph) 113 | 114 | for graph in graphs: 115 | for node in graph.findAllNodes("aten::to"): 116 | inputs = list(node.inputs()) 117 | for i in [1, 2]: # dtype can be the second or third argument to aten::to() 118 | if inputs[i].node()["value"] == 5: 119 | inputs[i].node().copyAttributes(float_node) 120 | 121 | model.apply(patch_float) 122 | patch_float(model.encode_image) 123 | patch_float(model.encode_text) 124 | 125 | model.float() 126 | 127 | return model, transform 128 | 129 | 130 | def tokenize(texts: Union[str, List[str]], context_length: int = 77): 131 | if isinstance(texts, str): 132 | texts = [texts] 133 | 134 | sot_token = _tokenizer.encoder["<|startoftext|>"] 135 | eot_token = _tokenizer.encoder["<|endoftext|>"] 136 | all_tokens = [[sot_token] + _tokenizer.encode(text) + [eot_token] for text in texts] 137 | result = torch.zeros(len(all_tokens), context_length, dtype=torch.long) 138 | 139 | for i, tokens in enumerate(all_tokens): 140 | if len(tokens) > context_length: 141 | raise RuntimeError(f"Input {texts[i]} is too long for context length {context_length}") 142 | result[i, :len(tokens)] = torch.tensor(tokens) 143 | 144 | return result 145 | -------------------------------------------------------------------------------- /CLIP-ViL/clip/simple_tokenizer.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import html 3 | import os 4 | from functools import lru_cache 5 | 6 | import ftfy 7 | import regex as re 8 | 9 | 10 | @lru_cache() 11 | def default_bpe(): 12 | return os.path.join(os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz") 13 | 14 | 15 | @lru_cache() 16 | def bytes_to_unicode(): 17 | """ 18 | Returns list of utf-8 byte and a corresponding list of unicode strings. 19 | The reversible bpe codes work on unicode strings. 20 | This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. 
21 | When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. 22 | This is a signficant percentage of your normal, say, 32K bpe vocab. 23 | To avoid that, we want lookup tables between utf-8 bytes and unicode strings. 24 | And avoids mapping to whitespace/control characters the bpe code barfs on. 25 | """ 26 | bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) 27 | cs = bs[:] 28 | n = 0 29 | for b in range(2**8): 30 | if b not in bs: 31 | bs.append(b) 32 | cs.append(2**8+n) 33 | n += 1 34 | cs = [chr(n) for n in cs] 35 | return dict(zip(bs, cs)) 36 | 37 | 38 | def get_pairs(word): 39 | """Return set of symbol pairs in a word. 40 | Word is represented as tuple of symbols (symbols being variable-length strings). 41 | """ 42 | pairs = set() 43 | prev_char = word[0] 44 | for char in word[1:]: 45 | pairs.add((prev_char, char)) 46 | prev_char = char 47 | return pairs 48 | 49 | 50 | def basic_clean(text): 51 | text = ftfy.fix_text(text) 52 | text = html.unescape(html.unescape(text)) 53 | return text.strip() 54 | 55 | 56 | def whitespace_clean(text): 57 | text = re.sub(r'\s+', ' ', text) 58 | text = text.strip() 59 | return text 60 | 61 | 62 | class SimpleTokenizer(object): 63 | def __init__(self, bpe_path: str = default_bpe()): 64 | self.byte_encoder = bytes_to_unicode() 65 | self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} 66 | merges = gzip.open(bpe_path).read().decode("utf-8").split('\n') 67 | merges = merges[1:49152-256-2+1] 68 | merges = [tuple(merge.split()) for merge in merges] 69 | vocab = list(bytes_to_unicode().values()) 70 | vocab = vocab + [v+'' for v in vocab] 71 | for merge in merges: 72 | vocab.append(''.join(merge)) 73 | vocab.extend(['<|startoftext|>', '<|endoftext|>']) 74 | self.encoder = dict(zip(vocab, range(len(vocab)))) 75 | self.decoder = {v: k for k, v in self.encoder.items()} 76 | self.bpe_ranks = dict(zip(merges, range(len(merges)))) 77 | self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'} 78 | self.pat = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE) 79 | 80 | def bpe(self, token): 81 | if token in self.cache: 82 | return self.cache[token] 83 | word = tuple(token[:-1]) + ( token[-1] + '',) 84 | pairs = get_pairs(word) 85 | 86 | if not pairs: 87 | return token+'' 88 | 89 | while True: 90 | bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf'))) 91 | if bigram not in self.bpe_ranks: 92 | break 93 | first, second = bigram 94 | new_word = [] 95 | i = 0 96 | while i < len(word): 97 | try: 98 | j = word.index(first, i) 99 | new_word.extend(word[i:j]) 100 | i = j 101 | except: 102 | new_word.extend(word[i:]) 103 | break 104 | 105 | if word[i] == first and i < len(word)-1 and word[i+1] == second: 106 | new_word.append(first+second) 107 | i += 2 108 | else: 109 | new_word.append(word[i]) 110 | i += 1 111 | new_word = tuple(new_word) 112 | word = new_word 113 | if len(word) == 1: 114 | break 115 | else: 116 | pairs = get_pairs(word) 117 | word = ' '.join(word) 118 | self.cache[token] = word 119 | return word 120 | 121 | def encode(self, text): 122 | bpe_tokens = [] 123 | text = whitespace_clean(basic_clean(text)).lower() 124 | for token in re.findall(self.pat, text): 125 | token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) 126 | bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' ')) 
127 | return bpe_tokens 128 | 129 | def decode(self, tokens): 130 | text = ''.join([self.decoder[token] for token in tokens]) 131 | text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('', ' ') 132 | return text 133 | -------------------------------------------------------------------------------- /CLIP-ViL/data/mscoco/README.md: -------------------------------------------------------------------------------- 1 | # Put raw COCO (train2014m, val2014, test2015) images here 2 | -------------------------------------------------------------------------------- /CLIP-ViL/scripts/gqa_adapters.sh: -------------------------------------------------------------------------------- 1 | # The name of this experiment. 2 | name=$2 3 | 4 | # Save logs and models under snap/vqa; make backup. 5 | output=$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # export PYTHONPATH=$PYTHONPATH:/local/harold/ubert/clip_vlp/CLIP 11 | 12 | # See Readme.md for option details. 13 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 14 | unbuffer python -m torch.distributed.launch --master_port=$3 --nproc_per_node=$4 src/tasks/gqa.py \ 15 | --distributed \ 16 | --train train,valid --valid testdev \ 17 | --tqdm --output $output \ 18 | --input_raw_images \ 19 | --use_clip \ 20 | --numWorkers 10 \ 21 | --batchSize 2 --optim bert --lr 1e-5 --epochs 10 \ 22 | --llayers 12 --xlayers 0 --rlayers 0 \ 23 | --visualbert_style \ 24 | --vqa_style_transform \ 25 | --loadLXMERTQA snap/pretrained/CLIP_VL_RN50x4 \ 26 | --fp16 \ 27 | --add_zero_padding \ 28 | --gradient_accumulation_steps 8 \ 29 | --warmup_ratio 0.05 \ 30 | --report_step 400 \ 31 | --use_separate_optimizer_for_visual \ 32 | --sgd_lr 0.001 \ 33 | --sgd_momentum 0.0 \ 34 | --schedule 3 \ 35 | --use_positional_embedding \ 36 | --pos_num 25 \ 37 | --clip_model_name RN50x4 \ 38 | --loss_scale 500 \ 39 | --use_adapter \ 40 | --reduction_factor 4 \ 41 | ${@:5} | tee $output/log.log 42 | 43 | 44 | # bash scripts/gqa_2.sh 2 snap/gqa/adapter4 9599 1 --gradient_accumulation_steps 8 --batchSize 32 --lr 1e-5 --warmup_ratio 0.05 --report_step 400 --use_separate_optimizer_for_visual --sgd_lr 0.001 --sgd_momentum 0.0 --epoch 5 --schedule 3 --use_positional_embedding --pos_num 25 --clip_model_name RN50x4 --loss_scale 500 45 | 46 | # bash run/finetune/gqa.bash 4,5,6,7 snap/gqa/final_e20_small_lr 9595 4 --gradient_accumulation_steps 8 --batchSize 8 --lr 5e-5 --warmup_ratio 0.05 --report_step 400 --loadLXMERTQA /local/harold/ubert/clip_vlp/lxmert/snap/pretrain/clip_full_20_no_qa_50x4_new_continue_from_9/Epoch11 --use_separate_optimizer_for_visual --sgd_lr 0.001 --sgd_momentum 0.0 --epoch 5 --schedule 3 --use_positional_embedding --pos_num 25 --clip_model_name RN50x4 --loss_scale 500 47 | 48 | # bash run/finetune/gqa.bash 3,4,5,6 snap/gqa/freeze_50x4 9595 4 --gradient_accumulation_steps 8 --batchSize 8 --lr 5e-5 --warmup_ratio 0.05 --report_step 400 --use_separate_optimizer_for_visual --sgd_lr 0.001 --sgd_momentum 0.0 --epoch 5 --schedule 3 --use_positional_embedding --pos_num 25 --clip_model_name RN50x4 --loss_scale 500 --freeze_clip 49 | 50 | # bash run/finetune/gqa.bash 5 snap/gqa/test 9595 1 --gradient_accumulation_steps 8 --batchSize 8 --lr 5e-5 --warmup_ratio 0.05 --report_step 400 --use_separate_optimizer_for_visual --sgd_lr 0.001 --sgd_momentum 0.0 --epoch 5 --schedule 3 --use_positional_embedding --pos_num 25 --clip_model_name RN50x4 --loss_scale 500 --test submit 51 | 52 | # bash 
run/finetune/gqa.bash 4,5,6,7 snap/gqa/final_e20_RN50_large_lr 9595 4 --gradient_accumulation_steps 8 --batchSize 8 --lr 5e-5 --warmup_ratio 0.05 --report_step 400 --loadLXMERTQA /local/harold/ubert/clip_vlp/lxmert/snap/pretrain/clip_full_20_no_qa_continue_from_17/Epoch11 --use_separate_optimizer_for_visual --sgd_lr 0.001 --sgd_momentum 0.0 --epoch 5 --schedule 3 --use_positional_embedding --pos_num 25 --loss_scale 500 --lr 5e-5 53 | 54 | 55 | # # bash run/finetune/gqa.bash 0 snap/gqa/test_rn50 9545 1 --gradient_accumulation_steps 1 --batchSize 8 --lr 5e-5 --warmup_ratio 0.05 --report_step 400 --freeze_clip --epoch 5 --schedule 3 --use_positional_embedding --pos_num 25 --loss_scale 500 --lr 3e-5 --test submit --load snap/gqa/final_e20_RN50/BEST 56 | 57 | # # bash run/finetune/gqa.bash 3,4,5,6 snap/gqa/scratch_50x4_FU_TRUE 9595 4 --gradient_accumulation_steps 1 --batchSize 8 --lr 5e-5 --warmup_ratio 0.05 --report_step 400 --freeze_clip --epoch 5 --schedule 3 --use_positional_embedding --pos_num 25 --loss_scale 500 --lr 3e-5 --clip_model_name RN50x4 -------------------------------------------------------------------------------- /CLIP-ViL/scripts/gqa_baseline.sh: -------------------------------------------------------------------------------- 1 | # The name of this experiment. 2 | name=$2 3 | 4 | # Save logs and models under snap/vqa; make backup. 5 | output=$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # export PYTHONPATH=$PYTHONPATH:/local/harold/ubert/clip_vlp/CLIP 11 | 12 | # See Readme.md for option details. 13 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 14 | unbuffer python -m torch.distributed.launch --master_port=$3 --nproc_per_node=$4 src/tasks/gqa.py \ 15 | --distributed \ 16 | --train train,valid --valid testdev \ 17 | --tqdm --output $output \ 18 | --input_raw_images \ 19 | --use_clip \ 20 | --numWorkers 10 \ 21 | --batchSize 2 --optim bert --lr 1e-5 --epochs 10 \ 22 | --llayers 12 --xlayers 0 --rlayers 0 \ 23 | --loadLXMERTQA snap/pretrained/CLIP_VL_RN50x4 \ 24 | --visualbert_style \ 25 | --vqa_style_transform \ 26 | --fp16 \ 27 | --add_zero_padding \ 28 | --gradient_accumulation_steps 8 \ 29 | --warmup_ratio 0.05 \ 30 | --report_step 400 \ 31 | --use_separate_optimizer_for_visual \ 32 | --sgd_lr 0.001 \ 33 | --sgd_momentum 0.0 \ 34 | --schedule 3 \ 35 | --use_positional_embedding \ 36 | --pos_num 25 \ 37 | --clip_model_name RN50x4 \ 38 | --loss_scale 500 \ 39 | ${@:5} | tee $output/log.log 40 | 41 | 42 | # bash scripts/gqa.sh 0 snap/gqa/full 9595 1 --gradient_accumulation_steps 8 --batchSize 32 --lr 5e-5 --warmup_ratio 0.05 --report_step 400 --loadLXMERTQA snap/pretrained/CLIP_VL_RN50x4_LXRT --use_separate_optimizer_for_visual --sgd_lr 0.001 --sgd_momentum 0.0 --epoch 5 --schedule 3 --use_positional_embedding --pos_num 25 --clip_model_name RN50x4 --loss_scale 500 43 | 44 | # bash run/finetune/gqa.bash 4,5,6,7 snap/gqa/final_e20_small_lr 9595 4 --gradient_accumulation_steps 8 --batchSize 8 --lr 5e-5 --warmup_ratio 0.05 --report_step 400 --loadLXMERTQA /local/harold/ubert/clip_vlp/lxmert/snap/pretrain/clip_full_20_no_qa_50x4_new_continue_from_9/Epoch11 --use_separate_optimizer_for_visual --sgd_lr 0.001 --sgd_momentum 0.0 --epoch 5 --schedule 3 --use_positional_embedding --pos_num 25 --clip_model_name RN50x4 --loss_scale 500 45 | 46 | # bash run/finetune/gqa.bash 3,4,5,6 snap/gqa/freeze_50x4 9595 4 --gradient_accumulation_steps 8 --batchSize 8 --lr 5e-5 --warmup_ratio 0.05 --report_step 400 
--use_separate_optimizer_for_visual --sgd_lr 0.001 --sgd_momentum 0.0 --epoch 5 --schedule 3 --use_positional_embedding --pos_num 25 --clip_model_name RN50x4 --loss_scale 500 --freeze_clip 47 | 48 | # bash run/finetune/gqa.bash 5 snap/gqa/test 9595 1 --gradient_accumulation_steps 8 --batchSize 8 --lr 5e-5 --warmup_ratio 0.05 --report_step 400 --use_separate_optimizer_for_visual --sgd_lr 0.001 --sgd_momentum 0.0 --epoch 5 --schedule 3 --use_positional_embedding --pos_num 25 --clip_model_name RN50x4 --loss_scale 500 --test submit 49 | 50 | # bash run/finetune/gqa.bash 4,5,6,7 snap/gqa/final_e20_RN50_large_lr 9595 4 --gradient_accumulation_steps 8 --batchSize 8 --lr 5e-5 --warmup_ratio 0.05 --report_step 400 --loadLXMERTQA /local/harold/ubert/clip_vlp/lxmert/snap/pretrain/clip_full_20_no_qa_continue_from_17/Epoch11 --use_separate_optimizer_for_visual --sgd_lr 0.001 --sgd_momentum 0.0 --epoch 5 --schedule 3 --use_positional_embedding --pos_num 25 --loss_scale 500 --lr 5e-5 51 | 52 | 53 | # # bash run/finetune/gqa.bash 0 snap/gqa/test_rn50 9545 1 --gradient_accumulation_steps 1 --batchSize 8 --lr 5e-5 --warmup_ratio 0.05 --report_step 400 --freeze_clip --epoch 5 --schedule 3 --use_positional_embedding --pos_num 25 --loss_scale 500 --lr 3e-5 --test submit --load snap/gqa/final_e20_RN50/BEST 54 | 55 | # # bash run/finetune/gqa.bash 3,4,5,6 snap/gqa/scratch_50x4_FU_TRUE 9595 4 --gradient_accumulation_steps 1 --batchSize 8 --lr 5e-5 --warmup_ratio 0.05 --report_step 400 --freeze_clip --epoch 5 --schedule 3 --use_positional_embedding --pos_num 25 --loss_scale 500 --lr 3e-5 --clip_model_name RN50x4 -------------------------------------------------------------------------------- /CLIP-ViL/scripts/pretrain.bash: -------------------------------------------------------------------------------- 1 | # The name of this experiment. 2 | name=$2 3 | 4 | # Save logs and models under snap/vqa; make backup. 5 | output=snap/pretrain/$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # See Readme.md for option details. 11 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 12 | unbuffer python -m torch.distributed.launch --master_port=$3 --nproc_per_node=$4 src/pretrain/lxmert_pretrain.py \ 13 | --taskMaskLM --taskMatched \ 14 | --visualLosses obj,attr,feat \ 15 | --wordMaskRate 0.15 \ 16 | --train mscoco_train,mscoco_nominival,vgnococo --valid mscoco_minival \ 17 | --batchSize 256 --optim bert --lr 1e-4 --epochs 20 \ 18 | --tqdm \ 19 | --llayers 12 --xlayers 0 --rlayers 0 \ 20 | --visualbert_style \ 21 | --input_raw_images \ 22 | --vqa_style_transform \ 23 | --objMaskRate 0.0 \ 24 | --numWorkers 0\ 25 | --clip_model_name RN50\ 26 | --use_clip \ 27 | --distributed \ 28 | --output $output\ 29 | ${@:5} | tee $output/log.log 30 | -------------------------------------------------------------------------------- /CLIP-ViL/scripts/snli-ve_adapters.sh: -------------------------------------------------------------------------------- 1 | # The name of this experiment. 2 | name=$2 3 | 4 | # Save logs and models under snap/vqa; make backup. 5 | output=$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # export PYTHONPATH=$PYTHONPATH:/local/harold/ubert/clip_vlp/CLIP 11 | 12 | # See Readme.md for option details. 
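# Usage sketch (argument order inferred from this script; the output path below is
# illustrative, not a path shipped with the repo):
#   bash scripts/snli-ve_adapters.sh <gpu_ids> <output_dir> <master_port> <num_gpus> [extra flags]
#   e.g. bash scripts/snli-ve_adapters.sh 0 snap/snli-ve/adapters 9595 1
# Here $1 sets CUDA_VISIBLE_DEVICES, $2 names the output directory, $3 is the
# torch.distributed master port, $4 is --nproc_per_node, and ${@:5} is forwarded to
# src/tasks/snli.py.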
13 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 14 | unbuffer python -m torch.distributed.launch --master_port=$3 --nproc_per_node=$4 src/tasks/snli.py \ 15 | --distributed \ 16 | --train train --valid valid \ 17 | --tqdm --output $output \ 18 | --input_raw_images \ 19 | --use_clip \ 20 | --numWorkers 10 \ 21 | --batchSize 2 --optim bert --lr 1e-5 --epochs 10 \ 22 | --llayers 12 --xlayers 0 --rlayers 0 \ 23 | --visualbert_style \ 24 | --vqa_style_transform \ 25 | --clip_model_name RN50x4 \ 26 | --loadLXMERT snap/pretrained/CLIP_VL_RN50x4 \ 27 | --fp16 \ 28 | --use_adapter \ 29 | --reduction_factor 4 \ 30 | --add_zero_padding \ 31 | --gradient_accumulation_steps 8 \ 32 | --report_step 400 \ 33 | --warmup_ratio 0.05 \ 34 | --use_separate_optimizer_for_visual \ 35 | --sgd_lr 0.001 \ 36 | --sgd_momentum 0.0 \ 37 | --schedule 1 \ 38 | --use_positional_embedding \ 39 | --pos_num 25 \ 40 | --clip_model_name RN50x4 \ 41 | ${@:5} | tee $output/log.log 42 | 43 | 44 | 45 | #bash run/finetune/snli_ve.bash 5 snap/snli/test 9595 1 --gradient_accumulation_steps 1 --batchSize 12 --lr 5e-5 --freeze_clip --loss_scale 500 --warmup_ratio 0.05 46 | 47 | # bash run/finetune/snli_ve.bash 4,5,6,7 snap/snli/final_e20_schedule_2 9595 4 --gradient_accumulation_steps 8 --batchSize 8 --lr 5e-5 --warmup_ratio 0.05 --report_step 400 --loadLXMERT /local/harold/ubert/clip_vlp/lxmert/snap/pretrain/clip_full_20_no_qa_50x4_new_continue_from_9/Epoch11 --use_separate_optimizer_for_visual --sgd_lr 0.001 --sgd_momentum 0.0 --epoch 2 --schedule 1 --use_positional_embedding --pos_num 25 --clip_model_name RN50x4 48 | 49 | # bash run/finetune/snli_ve.bash 4,5,6,7 snap/snli/final_e20_RN50_schedule_2 9595 4 --gradient_accumulation_steps 8 --batchSize 8 --lr 5e-5 --warmup_ratio 0.05 --report_step 400 --loadLXMERT /local/harold/ubert/clip_vlp/lxmert/snap/pretrain/clip_full_20_no_qa_continue_from_17/Epoch11 --use_separate_optimizer_for_visual --sgd_lr 0.001 --sgd_momentum 0.0 --epoch 2 --schedule 1 --use_positional_embedding --pos_num 25 50 | 51 | # bash run/finetune/snli_ve.bash 5 snap/snli/test 9595 1 --gradient_accumulation_steps 1 --batchSize 8 --lr 5e-5 --warmup_ratio 0.05 --report_step 400 --freeze_clip --use_positional_embedding --pos_num 25 -------------------------------------------------------------------------------- /CLIP-ViL/scripts/snli-ve_baseline.sh: -------------------------------------------------------------------------------- 1 | # The name of this experiment. 2 | name=$2 3 | 4 | # Save logs and models under snap/vqa; make backup. 5 | output=$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # export PYTHONPATH=$PYTHONPATH:/local/harold/ubert/clip_vlp/CLIP 11 | 12 | # See Readme.md for option details. 
13 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 14 | unbuffer python -m torch.distributed.launch --master_port=$3 --nproc_per_node=$4 src/tasks/snli.py \ 15 | --distributed \ 16 | --train train --valid valid \ 17 | --tqdm --output $output \ 18 | --input_raw_images \ 19 | --use_clip \ 20 | --numWorkers 10 \ 21 | --batchSize 32 --optim bert --lr 5e-5 --epochs 2 \ 22 | --llayers 12 --xlayers 0 --rlayers 0 \ 23 | --visualbert_style \ 24 | --vqa_style_transform \ 25 | --clip_model_name RN50x4 \ 26 | --load snap/snli-ve/full_finetuning/BEST \ 27 | --fp16 \ 28 | --add_zero_padding \ 29 | --gradient_accumulation_steps 8 \ 30 | --report_step 400 \ 31 | --warmup_ratio 0.05 \ 32 | --use_separate_optimizer_for_visual \ 33 | --sgd_lr 0.001 \ 34 | --sgd_momentum 0.0 \ 35 | --schedule 1 \ 36 | --use_positional_embedding \ 37 | --pos_num 25 \ 38 | --clip_model_name RN50x4 \ 39 | ${@:5} | tee $output/log.log 40 | 41 | 42 | 43 | 44 | #bash run/finetune/snli_ve.bash 5 snap/snli/test 9595 1 --gradient_accumulation_steps 1 --batchSize 12 --lr 5e-5 --freeze_clip --loss_scale 500 --warmup_ratio 0.05 45 | 46 | # bash run/finetune/snli_ve.bash 4,5,6,7 snap/snli/final_e20_schedule_2 9595 4 --gradient_accumulation_steps 8 --batchSize 8 --lr 5e-5 --warmup_ratio 0.05 --report_step 400 --loadLXMERT /local/harold/ubert/clip_vlp/lxmert/snap/pretrain/clip_full_20_no_qa_50x4_new_continue_from_9/Epoch11 --use_separate_optimizer_for_visual --sgd_lr 0.001 --sgd_momentum 0.0 --epoch 2 --schedule 1 --use_positional_embedding --pos_num 25 --clip_model_name RN50x4 47 | 48 | # bash run/finetune/snli_ve.bash 4,5,6,7 snap/snli/final_e20_RN50_schedule_2 9595 4 --gradient_accumulation_steps 8 --batchSize 8 --lr 5e-5 --warmup_ratio 0.05 --report_step 400 --loadLXMERT /local/harold/ubert/clip_vlp/lxmert/snap/pretrain/clip_full_20_no_qa_continue_from_17/Epoch11 --use_separate_optimizer_for_visual --sgd_lr 0.001 --sgd_momentum 0.0 --epoch 2 --schedule 1 --use_positional_embedding --pos_num 25 49 | 50 | # bash run/finetune/snli_ve.bash 5 snap/snli/test 9595 1 --gradient_accumulation_steps 1 --batchSize 8 --lr 5e-5 --warmup_ratio 0.05 --report_step 400 --freeze_clip --use_positional_embedding --pos_num 25 -------------------------------------------------------------------------------- /CLIP-ViL/scripts/vqa_adapters.sh: -------------------------------------------------------------------------------- 1 | # The name of this experiment. 2 | name=$2 3 | 4 | # Save logs and models under snap/vqa; make backup. 5 | output=$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # export PYTHONPATH=$PYTHONPATH:/local/harold/ubert/clip_vlp/CLIP 11 | 12 | # See Readme.md for option details. 
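# Usage sketch (argument order inferred from this script; the output path below is
# illustrative): $1 -> CUDA_VISIBLE_DEVICES, $2 -> output directory, $3 -> master port,
# $4 -> --nproc_per_node, ${@:5} -> extra flags forwarded to src/tasks/vqa.py.
#   e.g. bash scripts/vqa_adapters.sh 0,1 snap/vqa/adapters 9595 2
# Compared with vqa_baseline.sh, this script adds --use_adapter with --reduction_factor 4
# (reduction_factor divides the hidden size to get the adapter bottleneck width; see the
# Adapter class in src/lxrt/adapters/adapter_modeling.py) and uses --lr 5e-4 instead of 5e-5.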
13 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 14 | unbuffer python -m torch.distributed.launch --master_port=$3 --nproc_per_node=$4 src/tasks/vqa.py \ 15 | --distributed \ 16 | --train train,nominival --valid minival \ 17 | --tqdm --output $output \ 18 | --input_raw_images \ 19 | --use_clip \ 20 | --numWorkers 10 \ 21 | --batchSize 32 --optim bert --lr 5e-4 --epochs 5 \ 22 | --llayers 12 --xlayers 0 --rlayers 0 \ 23 | --visualbert_style \ 24 | --vqa_style_transform \ 25 | --clip_model_name RN50x4 \ 26 | --add_zero_padding \ 27 | --gradient_accumulation_steps 8 \ 28 | --loss_scale 500 \ 29 | --warmup_ratio 0.05 \ 30 | --report_step 400 \ 31 | --use_separate_optimizer_for_visual \ 32 | --sgd_lr 0.0001 \ 33 | --sgd_momentum 0.0 \ 34 | --schedule 2 \ 35 | --use_positional_embedding \ 36 | --pos_num 25 \ 37 | --fp16 \ 38 | --use_adapter \ 39 | --reduction_factor 4 \ 40 | --clip_model_name RN50x4 \ 41 | --loadLXMERTQA snap/pretrained/CLIP_VL_RN50x4 \ 42 | ${@:5} | tee $output/log.log 43 | 44 | 45 | # CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 46 | # unbuffer python -m torch.distributed.launch --master_port=$3 --nproc_per_node=$4 src/tasks/vqa.py \ 47 | # --distributed \ 48 | # --train train,nominival --valid minival \ 49 | # --test test \ 50 | # --tqdm --output $output \ 51 | # --input_raw_images \ 52 | # --use_clip \ 53 | # --numWorkers 10 \ 54 | # --batchSize 32 --optim bert --lr 4e-5 --epochs 5 \ 55 | # --llayers 12 --xlayers 0 --rlayers 0 \ 56 | # --visualbert_style \ 57 | # --vqa_style_transform \ 58 | # --clip_model_name RN50x4 \ 59 | # --add_zero_padding \ 60 | # --gradient_accumulation_steps 8 \ 61 | # --loss_scale 500 \ 62 | # --warmup_ratio 0.05 \ 63 | # --report_step 400 \ 64 | # --use_separate_optimizer_for_visual \ 65 | # --sgd_lr 0.001 \ 66 | # --sgd_momentum 0.0 \ 67 | # --schedule 2 \ 68 | # --use_positional_embedding \ 69 | # --pos_num 25 \ 70 | # --fp16 \ 71 | # --use_adapter \ 72 | # --reduction_factor 4 \ 73 | # --clip_model_name RN50x4 \ 74 | # --load snap/vqa/vqa_clip_rn50x4_LMadapter4_5e-4/BEST 75 | # ${@:5} | tee $output/log.log 76 | -------------------------------------------------------------------------------- /CLIP-ViL/scripts/vqa_baseline.sh: -------------------------------------------------------------------------------- 1 | # The name of this experiment. 2 | name=$2 3 | 4 | # Save logs and models under snap/vqa; make backup. 5 | output=$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # export PYTHONPATH=$PYTHONPATH:/local/harold/ubert/clip_vlp/CLIP 11 | 12 | # See Readme.md for option details. 
13 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 14 | unbuffer python -m torch.distributed.launch --master_port=$3 --nproc_per_node=$4 src/tasks/vqa.py \ 15 | --distributed \ 16 | --train train,nominival --valid minival \ 17 | --tqdm --output $output \ 18 | --input_raw_images \ 19 | --use_clip \ 20 | --numWorkers 10 \ 21 | --batchSize 32 --optim bert --lr 5e-5 --epochs 5 \ 22 | --llayers 12 --xlayers 0 --rlayers 0 \ 23 | --visualbert_style \ 24 | --vqa_style_transform \ 25 | --clip_model_name RN50x4 \ 26 | --add_zero_padding \ 27 | --gradient_accumulation_steps 8 \ 28 | --loss_scale 500 \ 29 | --warmup_ratio 0.05 \ 30 | --report_step 400 \ 31 | --use_separate_optimizer_for_visual \ 32 | --sgd_lr 0.001 \ 33 | --sgd_momentum 0.0 \ 34 | --schedule 2 \ 35 | --use_positional_embedding \ 36 | --pos_num 25 \ 37 | --fp16 \ 38 | --clip_model_name RN50x4 \ 39 | --loadLXMERTQA snap/pretrained/CLIP_VL_RN50x4 40 | ${@:5} | tee $output/log.log 41 | 42 | 43 | # CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 44 | # unbuffer python -m torch.distributed.launch --master_port=$3 --nproc_per_node=$4 src/tasks/vqa.py \ 45 | # --distributed \ 46 | # --train train,nominival --valid minival \ 47 | # --test test \ 48 | # --tqdm --output $output \ 49 | # --input_raw_images \ 50 | # --use_clip \ 51 | # --numWorkers 10 \ 52 | # --batchSize 32 --optim bert --lr 5e-5 --epochs 5 \ 53 | # --llayers 12 --xlayers 0 --rlayers 0 \ 54 | # --visualbert_style \ 55 | # --vqa_style_transform \ 56 | # --clip_model_name RN50x4 \ 57 | # --add_zero_padding \ 58 | # --gradient_accumulation_steps 8 \ 59 | # --loss_scale 500 \ 60 | # --warmup_ratio 0.05 \ 61 | # --report_step 400 \ 62 | # --use_separate_optimizer_for_visual \ 63 | # --sgd_lr 0.001 \ 64 | # --sgd_momentum 0.0 \ 65 | # --schedule 2 \ 66 | # --use_positional_embedding \ 67 | # --pos_num 25 \ 68 | # --fp16 \ 69 | # --clip_model_name RN50x4 \ 70 | # --load snap/vqa/vqa_clip_rn50x4/BEST \ 71 | # ${@:5} | tee $output/log.log 72 | -------------------------------------------------------------------------------- /CLIP-ViL/src/lxrt/adapters/__init__.py: -------------------------------------------------------------------------------- 1 | # The codes are borrowed from https://github.com/rabeehk/compacter 2 | 3 | from .config import MetaAdapterConfig, AdapterConfig, CompactorConfig, LRAdapterConfig 4 | from .adapter_modeling import Adapter, HyperComplexAdapter, OutputAdapter 5 | from .adapter_controller import AdapterController, AdapterLayer, MetaLayersAdapterController, OutputParallelAdapterLayer 6 | from .adapter_hypernetwork import AdapterLayersHyperNetController, AdapterLayersOneHyperNetController 7 | from .adapter_utils import TaskEmbeddingController -------------------------------------------------------------------------------- /CLIP-ViL/src/lxrt/adapters/adapter_configuration.py: -------------------------------------------------------------------------------- 1 | """Implements the adapters and other parameter-efficient finetuning methods' configurations.""" 2 | 3 | from collections import OrderedDict 4 | from dataclasses import dataclass 5 | 6 | import torch.nn as nn 7 | 8 | @dataclass 9 | class AdapterConfig(object): 10 | """Implements the adapter configuration proposed by Houlsby et. al, 2019 11 | in https://arxiv.org/abs/1902.00751. 
12 | We additionally pass all the configuration of parameter-efficient finetuning 13 | methods with this config.""" 14 | add_layer_norm_before_adapter: bool = False 15 | add_layer_norm_after_adapter: bool = True 16 | non_linearity: str = "swish" 17 | task_reduction_factor: int = 16 18 | add_adapter_in_feed_forward = True 19 | add_adapter_in_self_attention = True 20 | hidden_dim = 128 21 | task_adapter_layers_encoder = None 22 | task_adapter_layers_decoder = None 23 | task_adapter_in_decoder = True 24 | intrinsic_dim = 100 25 | normalize_intrinsic_projections = False 26 | # This can be either random, or fastfood. 27 | intrinsic_projection = "random" 28 | 29 | # Hypercomplex adapters parameters 30 | hypercomplex_adapters = False 31 | hypercomplex_division = 8 32 | learn_phm = True 33 | hypercomplex_nonlinearity="glorot-uniform" 34 | shared_phm_rule = False 35 | factorized_phm = False 36 | shared_W_phm = False 37 | factorized_phm_rule = False 38 | phm_c_init = "normal" 39 | phm_rank = 1 40 | phm_init_range=0.01 41 | 42 | # prefix-tuning parameters. 43 | prefix_dim = 100 44 | init_prefix_from_vocab = False 45 | kronecker_prod = False 46 | 47 | # BitFit configuration. 48 | bitfit = False 49 | 50 | # Low-rank adapters. 51 | low_rank_adapters = False 52 | low_rank_w_init = "glorot-uniform" 53 | low_rank_rank = 1 54 | 55 | 56 | ADAPTER_CONFIG_MAPPING = OrderedDict( 57 | [("adapter", AdapterConfig)]) 58 | 59 | 60 | class AutoAdapterConfig(nn.Module): 61 | """Generic Adapter config class to instantiate different adapter configs.""" 62 | 63 | @classmethod 64 | def get(cls, config_name: str): 65 | if config_name in ADAPTER_CONFIG_MAPPING: 66 | return ADAPTER_CONFIG_MAPPING[config_name]() 67 | raise ValueError( 68 | "Unrecognized adapter config type identifier: {}. Should contain one of {}" 69 | .format(config_name, ", ".join(ADAPTER_CONFIG_MAPPING.keys()))) 70 | -------------------------------------------------------------------------------- /CLIP-ViL/src/lxrt/adapters/adapter_modeling.py: -------------------------------------------------------------------------------- 1 | """Implements an Adapter, Low-rank adapters and Hyper-adapter Layers.""" 2 | import torch 3 | import torch.nn as nn 4 | from .adapter_utils import Activations 5 | 6 | from .hypercomplex.layers import PHMLinear 7 | from .low_rank_layer import LowRankLinear 8 | 9 | 10 | class LowRankAdapter(nn.Module): 11 | """This is the low-rank adapter, in which each adapter is composed of two rank-one matrices. 
12 | """ 13 | def __init__(self, config): 14 | super().__init__() 15 | self.config = config 16 | self.input_dim = config.input_dim 17 | self.down_sample_size = self.input_dim // config.reduction_factor 18 | self.activation = Activations(config.non_linearity.lower()) 19 | self.down_sampler = LowRankLinear(self.input_dim, self.down_sample_size, 20 | w_init=config.low_rank_w_init, 21 | rank=config.low_rank_rank) 22 | self.up_sampler = LowRankLinear(self.down_sample_size, self.input_dim, 23 | w_init=config.low_rank_w_init, 24 | rank=config.low_rank_rank) 25 | 26 | def forward(self, x): 27 | z = self.down_sampler(x) 28 | z = self.activation(z) 29 | output = self.up_sampler(z) 30 | return output 31 | 32 | 33 | class Adapter(nn.Module): 34 | """Conventional Adapter layer, in which the weights of up and down sampler modules 35 | are parameters and are optimized.""" 36 | 37 | def __init__(self, config): 38 | super().__init__() 39 | self.config = config 40 | self.input_dim = config.d_model 41 | reduction_factor = config.reduction_factor 42 | self.down_sample_size = self.input_dim // reduction_factor 43 | self.activation = Activations(config.non_linearity.lower()) 44 | self.down_sampler = nn.Linear(self.input_dim, self.down_sample_size) 45 | self.up_sampler = nn.Linear(self.down_sample_size, self.input_dim) 46 | 47 | if config.use_gate: 48 | self.gate = nn.Parameter(torch.zeros(1)) 49 | else: 50 | self.gate = None 51 | 52 | def forward(self, x): 53 | z = self.down_sampler(x) 54 | z = self.activation(z) 55 | output = self.up_sampler(z) 56 | 57 | if self.gate is not None: 58 | output = self.gate * output 59 | 60 | return output 61 | 62 | 63 | class OutputAdapter(nn.Module): 64 | """Conventional Adapter layer, in which the weights of up and down sampler modules 65 | are parameters and are optimized.""" 66 | 67 | def __init__(self, config, output_dim): 68 | super().__init__() 69 | self.config = config 70 | self.input_dim = config.d_model 71 | reduction_factor = 16 72 | self.down_sample_size = self.input_dim // reduction_factor 73 | self.activation = Activations(config.non_linearity.lower()) 74 | self.down_sampler = nn.Linear(self.input_dim, self.down_sample_size) 75 | self.up_sampler = nn.Linear(self.down_sample_size, output_dim) 76 | 77 | def forward(self, x): 78 | z = self.down_sampler(x) 79 | z = self.activation(z) 80 | output = self.up_sampler(z) 81 | return output 82 | 83 | def resize_up_sampler(self, resized_size): 84 | self.up_sampler = nn.Linear(self.down_sample_size, resized_size) 85 | 86 | 87 | class HyperComplexAdapter(nn.Module): 88 | """Hypercomplex Adapter layer, in which the weights of up and down sampler modules 89 | are parameters are 1/n times of the conventional adapter layers, where n is 90 | hypercomplex division number.""" 91 | 92 | def __init__(self, config): 93 | super().__init__() 94 | self.config = config 95 | self.input_dim = config.input_dim 96 | self.down_sample_size = self.input_dim // config.reduction_factor 97 | self.activation = Activations(config.non_linearity.lower()) 98 | self.down_sampler = PHMLinear(in_features=self.input_dim, 99 | out_features=self.down_sample_size, 100 | bias=True, 101 | c_init=config.phm_c_init, 102 | phm_dim=config.hypercomplex_division, 103 | learn_phm=config.learn_phm, 104 | w_init=config.hypercomplex_nonlinearity, 105 | shared_phm_rule=config.shared_phm_rule, 106 | factorized_phm=config.factorized_phm, 107 | shared_W_phm=config.shared_W_phm, 108 | factorized_phm_rule=config.factorized_phm_rule, 109 | phm_rank=config.phm_rank, 110 | 
phm_init_range=config.phm_init_range, 111 | kronecker_prod=config.kronecker_prod) 112 | self.up_sampler = PHMLinear(in_features=self.down_sample_size, 113 | out_features=self.input_dim, 114 | bias=True, 115 | c_init=config.phm_c_init, 116 | phm_dim=config.hypercomplex_division, 117 | learn_phm=config.learn_phm, 118 | w_init=config.hypercomplex_nonlinearity, 119 | shared_phm_rule=config.shared_phm_rule, 120 | factorized_phm=config.factorized_phm, 121 | shared_W_phm=config.shared_W_phm, 122 | factorized_phm_rule=config.factorized_phm_rule, 123 | phm_rank=config.phm_rank, 124 | phm_init_range=config.phm_init_range, 125 | kronecker_prod=config.kronecker_prod) 126 | 127 | def forward(self, x): 128 | z = self.down_sampler(x) 129 | z = self.activation(z) 130 | return self.up_sampler(z) -------------------------------------------------------------------------------- /CLIP-ViL/src/lxrt/adapters/adapter_outputs.py: -------------------------------------------------------------------------------- 1 | """Defines the output class for the adapter layers' parameters.""" 2 | import torch 3 | from dataclasses import dataclass 4 | 5 | 6 | @dataclass 7 | class SamplerOutput: 8 | """Base class for the base and weights of each adapter.""" 9 | weight: torch.FloatTensor = None 10 | bias: torch.FloatTensor = None 11 | 12 | 13 | @dataclass 14 | class LayerNormOutput: 15 | """Base class for the base and weights of the conditional 16 | layer norms.""" 17 | weight: torch.FloatTensor = None 18 | bias: torch.FloatTensor = None 19 | 20 | 21 | @dataclass 22 | class AdapterOutput: 23 | """Base class for each adapter weights""" 24 | up: SamplerOutput = None 25 | down: SamplerOutput = None 26 | pre_norm: LayerNormOutput = None 27 | post_norm: LayerNormOutput = None 28 | 29 | 30 | @dataclass 31 | class AdapterT5BlockOutput: 32 | """ 33 | Base class for adapter layer's outputs. 
34 | """ 35 | feed_forward: AdapterOutput = None 36 | self_attention: AdapterOutput = None 37 | cross_attention: AdapterOutput = None -------------------------------------------------------------------------------- /CLIP-ViL/src/lxrt/adapters/adapter_utils.py: -------------------------------------------------------------------------------- 1 | """Implementation of different utility functions for adapter layers.""" 2 | import torch 3 | import torch.nn as nn 4 | from transformers.activations import get_activation 5 | 6 | 7 | class Activations(nn.Module): 8 | def __init__(self, activation_type): 9 | super().__init__() 10 | self.f = get_activation(activation_type) 11 | 12 | def forward(self, x): 13 | return self.f(x) 14 | 15 | 16 | def init_linear_layer(linear_layer, std=1e-2): 17 | """Initializes the given linear module as explained in adapter paper.""" 18 | nn.init.normal_(linear_layer.weight, std=std) 19 | nn.init.zeros_(linear_layer.bias) 20 | 21 | 22 | def linear_layer(input_dim, output_dim, std=1e-2): 23 | """Generates a linear module and initializes it.""" 24 | linear = nn.Linear(input_dim, output_dim) 25 | init_linear_layer(linear, std=std) 26 | return linear 27 | 28 | 29 | class TaskHyperNet(nn.Module): 30 | """This module generates the task-embeddings from the initial feeded task embeddings.""" 31 | 32 | def __init__(self, config, input_dim): 33 | super(TaskHyperNet, self).__init__() 34 | self.task_hidden_dim = config.task_hidden_dim 35 | self.projected_task_embedding_dim = config.projected_task_embedding_dim 36 | self.task_embeding_generator = nn.Sequential( 37 | linear_layer(input_dim, self.task_hidden_dim), 38 | nn.ReLU(), 39 | linear_layer(self.task_hidden_dim, self.projected_task_embedding_dim)) 40 | 41 | def forward(self, task_embedding): 42 | task_embedding = task_embedding.view(-1) 43 | return self.task_embeding_generator(task_embedding).view(-1) 44 | 45 | 46 | class LayerNormHyperNet(nn.Module): 47 | """This module generates the weight and bias for the task conditioned layer norm.""" 48 | 49 | def __init__(self, config): 50 | super(LayerNormHyperNet, self).__init__() 51 | self.task_embedding_dim = config.projected_task_embedding_dim \ 52 | if config.train_task_embeddings else config.task_embedding_dim 53 | self.weight_generator = linear_layer(self.task_embedding_dim, config.input_dim) 54 | self.bias_generator = linear_layer(self.task_embedding_dim, config.input_dim) 55 | 56 | def forward(self, input): 57 | return self.weight_generator(input), self.bias_generator(input) 58 | 59 | 60 | class TaskEmbeddingController(nn.Module): 61 | """Main module controlling task embeddings.""" 62 | 63 | def __init__(self, config): 64 | super(TaskEmbeddingController, self).__init__() 65 | # self.device = config.device 66 | self.task_embedding_dim = config.task_embedding_dim 67 | self.tasks = config.tasks 68 | self.task_to_task_embeddings = {task: task for task in self.tasks} 69 | if config.task_to_embeddings is not None: 70 | self.task_to_task_embeddings = config.task_to_embeddings 71 | self.tasks = self.task_to_task_embeddings.values() 72 | self.set_task_embeddings(self.tasks) 73 | self.train_task_embeddings = config.train_task_embeddings 74 | if self.train_task_embeddings: 75 | self.task_hyper_net = TaskHyperNet(config) 76 | 77 | def get_task(self, task): 78 | return self.task_to_task_embeddings[task] 79 | 80 | def set_task_embeddings(self, tasks): 81 | self.task_to_embeddings = nn.ParameterDict(dict()) 82 | for task in tasks: 83 | task_embedding = 
torch.Tensor(torch.randn(self.task_embedding_dim)) 84 | self.task_to_embeddings[task] = nn.Parameter(task_embedding) 85 | 86 | def forward(self, task): 87 | task_mapped = self.get_task(task) 88 | task_embedding = self.task_to_embeddings[task_mapped] 89 | if self.train_task_embeddings: 90 | return self.task_hyper_net(task_embedding) 91 | return task_embedding 92 | -------------------------------------------------------------------------------- /CLIP-ViL/src/lxrt/adapters/config.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | 4 | @dataclass 5 | class AdapterConfig(object): 6 | """Implements the adapter configuration proposed by Houlsby et. al, 2019 7 | in https://arxiv.org/abs/1902.00751.""" 8 | add_layer_norm_before_adapter: bool = False 9 | add_layer_norm_after_adapter: bool = False 10 | non_linearity: str = "gelu_new" 11 | reduction_factor: int = 16 12 | weight_init_range = 1e-2 13 | # Whether to use conditional layer norms for adapters. 14 | conditional_layer_norm = False 15 | hidden_dim = 128 16 | # Whether to add adapter blocks, this is used in case we need 17 | # to tune only layer norms. 18 | train_adapters_blocks = True 19 | 20 | task_adapter_layers_encoder = None 21 | task_adapter_layers_decoder = None 22 | task_adapter_in_decoder = True 23 | intrinsic_dim = 100 24 | normalize_intrinsic_projections = False 25 | # This can be either random, or fastfood. 26 | intrinsic_projection = "random" 27 | 28 | # Hypercomplex adapters parameters 29 | hypercomplex_adapters = False 30 | hypercomplex_division = 8 31 | learn_phm = True 32 | hypercomplex_nonlinearity="glorot-uniform" 33 | shared_phm_rule = False 34 | factorized_phm = False 35 | shared_W_phm = False 36 | factorized_phm_rule = False 37 | phm_c_init = "normal" 38 | phm_rank = 1 39 | phm_init_range=0.01 40 | 41 | # prefix-tuning parameters. 42 | prefix_dim = 100 43 | init_prefix_from_vocab = False 44 | kronecker_prod = False 45 | 46 | # BitFit configuration. 47 | bitfit = False 48 | 49 | # Low-rank adapters. 50 | low_rank_adapters = False 51 | low_rank_w_init = "glorot-uniform" 52 | low_rank_rank = 1 53 | 54 | # whether using single adapter for all tasks 55 | use_single_adapter = True 56 | 57 | 58 | class MetaAdapterConfig(AdapterConfig): 59 | """Implements Meta adapter in which a hyper-network generates the parameters of 60 | adapter layers. In this case we have a task embeddings which is feed to the 61 | hyper-network to allow it generate the weights for the adapter layers.""" 62 | task_embedding_dim = 512 63 | task_embedding_dir = None 64 | hidden_dim = 128 65 | train_task_embeddings = False 66 | non_linearity: str = "gelu_new" 67 | projected_task_embedding_dim = 64 68 | task_hidden_dim = 128 69 | parametric_task_embedding = False 70 | # If Specified, uses one hypernet to generates the adapters weights. 71 | unique_hyper_net = True 72 | unique_hyper_net_layer_norm = True 73 | # We consider only one hyper-net for all the blocks of transformer. 74 | efficient_unique_hyper_net = False 75 | task_to_embeddings=None 76 | 77 | 78 | @dataclass 79 | class CompactorConfig(object): 80 | add_layer_norm_before_adapter: bool = False 81 | add_layer_norm_after_adapter: bool = False 82 | non_linearity: str = "gelu_new" 83 | reduction_factor: int = 16 84 | weight_init_range = 1e-2 85 | # Whether to use conditional layer norms for adapters. 86 | hidden_dim = 128 87 | # Whether to add adapter blocks, this is used in case we need 88 | # to tune only layer norms. 
89 | task_adapter_layers_encoder = None 90 | task_adapter_layers_decoder = None 91 | task_adapter_in_decoder = True 92 | intrinsic_dim = 100 93 | normalize_intrinsic_projections = False 94 | # This can be either random, or fastfood. 95 | intrinsic_projection = "random" 96 | 97 | # Hypercomplex adapters parameters 98 | hypercomplex_adapters = True 99 | hypercomplex_division = 4 100 | train_task_adapters = True 101 | learn_phm = True 102 | hypercomplex_nonlinearity="glorot-uniform" 103 | shared_phm_rule = True 104 | factorized_phm = True 105 | shared_W_phm = False 106 | factorized_phm_rule = False 107 | phm_c_init = "normal" 108 | phm_rank = 1 109 | phm_init_range=0.0001 110 | 111 | # prefix-tuning parameters. 112 | prefix_dim = 100 113 | init_prefix_from_vocab = False 114 | kronecker_prod = False 115 | 116 | # BitFit configuration. 117 | bitfit = False 118 | 119 | # Low-rank adapters. 120 | low_rank_adapters = False 121 | low_rank_w_init = "glorot-uniform" 122 | low_rank_rank = 1 123 | 124 | # whether using single adapter for all tasks 125 | use_single_adapter = False 126 | 127 | 128 | @dataclass 129 | class LRAdapterConfig(object): 130 | add_layer_norm_before_adapter: bool = False 131 | add_layer_norm_after_adapter: bool = False 132 | non_linearity: str = "gelu_new" 133 | reduction_factor: int = 16 134 | weight_init_range = 1e-2 135 | # Whether to use conditional layer norms for adapters. 136 | hidden_dim = 128 137 | # Whether to add adapter blocks, this is used in case we need 138 | # to tune only layer norms. 139 | task_adapter_layers_encoder = None 140 | task_adapter_layers_decoder = None 141 | task_adapter_in_decoder = True 142 | intrinsic_dim = 100 143 | normalize_intrinsic_projections = False 144 | # This can be either random, or fastfood. 145 | intrinsic_projection = "random" 146 | 147 | # Hypercomplex adapters parameters 148 | hypercomplex_adapters = False 149 | hypercomplex_division = 4 150 | train_task_adapters = True 151 | learn_phm = True 152 | hypercomplex_nonlinearity="glorot-uniform" 153 | shared_phm_rule = True 154 | factorized_phm = True 155 | shared_W_phm = False 156 | factorized_phm_rule = False 157 | phm_c_init = "normal" 158 | phm_rank = 1 159 | phm_init_range=0.0001 160 | 161 | # prefix-tuning parameters. 162 | prefix_dim = 100 163 | init_prefix_from_vocab = False 164 | kronecker_prod = False 165 | 166 | # BitFit configuration. 167 | bitfit = False 168 | 169 | # Low-rank adapters. 
170 | low_rank_adapters = True 171 | low_rank_w_init = "glorot-uniform" 172 | low_rank_rank = 1 173 | 174 | # whether using single adapter for all tasks 175 | use_single_adapter = False -------------------------------------------------------------------------------- /CLIP-ViL/src/lxrt/adapters/hypercomplex/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ylsung/VL_adapter/545fcbbdbbaec4c442de35567f6ae477ff4e8265/CLIP-ViL/src/lxrt/adapters/hypercomplex/__init__.py -------------------------------------------------------------------------------- /CLIP-ViL/src/lxrt/adapters/hypercomplex/inits.py: -------------------------------------------------------------------------------- 1 | # The codes are from https://github.com/bayer-science-for-a-better-life/phc-gnn 2 | import torch 3 | import math 4 | 5 | 6 | def glorot_normal(tensor: torch.Tensor): 7 | return torch.nn.init.xavier_normal_(tensor, gain=math.sqrt(2)) 8 | 9 | def glorot_uniform(tensor: torch.Tensor): 10 | return torch.nn.init.xavier_uniform_(tensor, gain=math.sqrt(2)) -------------------------------------------------------------------------------- /CLIP-ViL/src/lxrt/adapters/hypercomplex/kronecker.py: -------------------------------------------------------------------------------- 1 | # The codes are from https://github.com/bayer-science-for-a-better-life/phc-gnn 2 | import torch 3 | 4 | # TODO: change this with torch.kron 5 | """A part of the pylabyk library: numpytorch.py at https://github.com/yulkang/pylabyk""" 6 | def kronecker_product(a, b): 7 | """ 8 | Kronecker product of matrices a and b with leading batch dimensions. 9 | Batch dimensions are broadcast. The number of them mush 10 | :type a: torch.Tensor 11 | :type b: torch.Tensor 12 | :rtype: torch.Tensor 13 | """ 14 | #return torch.stack([torch.kron(ai, bi) for ai, bi in zip(a,b)], dim=0) 15 | siz1 = torch.Size(torch.tensor(a.shape[-2:]) * torch.tensor(b.shape[-2:])) 16 | res = a.unsqueeze(-1).unsqueeze(-3) * b.unsqueeze(-2).unsqueeze(-4) 17 | siz0 = res.shape[:-4] 18 | out = res.reshape(siz0 + siz1) 19 | return out 20 | 21 | 22 | def kronecker_product_einsum_batched(A: torch.Tensor, B: torch.Tensor): 23 | """ 24 | Batched Version of Kronecker Products 25 | :param A: has shape (b, a, c) 26 | :param B: has shape (b, k, p) 27 | :return: (b, ak, cp) 28 | """ 29 | assert A.dim() == 3 and B.dim() == 3 30 | res = torch.einsum('bac,bkp->bakcp', A, B).view(A.size(0), 31 | A.size(1)*B.size(1), 32 | A.size(2)*B.size(2)) 33 | return res -------------------------------------------------------------------------------- /CLIP-ViL/src/lxrt/adapters/low_rank_layer.py: -------------------------------------------------------------------------------- 1 | """This script implements a low-rank linear layer.""" 2 | import torch 3 | import torch.nn as nn 4 | 5 | from .hypercomplex.inits import glorot_uniform, glorot_normal 6 | 7 | class LowRankLinear(torch.nn.Module): 8 | def __init__(self, input_dim: int, output_dim: int, rank: int = 1, 9 | bias: bool = True, w_init: str = "glorot-uniform"): 10 | super(LowRankLinear, self).__init__() 11 | self.input_dim = input_dim 12 | self.output_dim = output_dim 13 | self.rank = rank 14 | self.bias = bias 15 | self.w_init = w_init 16 | self.W_left = nn.Parameter(torch.Tensor(size=(input_dim, rank)), requires_grad=True) 17 | self.W_right = nn.Parameter(torch.Tensor(size=(rank, output_dim)), requires_grad=True) 18 | if bias: 19 | self.b = nn.Parameter(torch.Tensor(output_dim)) 20 | 
self.reset_parameters() 21 | 22 | def reset_parameters(self): 23 | if self.bias: 24 | self.b.data = torch.zeros_like(self.b.data) 25 | if self.w_init == "glorot-uniform": 26 | self.W_left.data = glorot_uniform(self.W_left.data) 27 | self.W_right.data = glorot_uniform(self.W_right.data) 28 | elif self.w_init == "glorot-normal": 29 | self.W_left.data = glorot_normal(self.W_left.data) 30 | self.W_right.data = glorot_normal(self.W_right.data) 31 | else: 32 | raise ValueError 33 | 34 | def forward(self, x: torch.Tensor) -> torch.Tensor: 35 | W = self.W_left.matmul(self.W_right) 36 | output = torch.matmul(input=x, other=W) 37 | if self.bias: 38 | output += self.b 39 | return output 40 | -------------------------------------------------------------------------------- /CLIP-ViL/src/lxrt/visual_transformers.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import json 3 | import logging 4 | import math 5 | import os 6 | import shutil 7 | import tarfile 8 | import tempfile 9 | import sys 10 | from io import open 11 | import torch.nn.functional as F 12 | 13 | import torch 14 | from torch import nn 15 | from torch.nn import CrossEntropyLoss, SmoothL1Loss 16 | import numpy as np 17 | def resize_pos_embed(posemb, posemb_new): 18 | # Rescale the grid of position embeddings when loading from state_dict. Adapted from 19 | # https://github.com/google-research/vision_transformer/blob/00883dd691c63a6830751563748663526e811cee/vit_jax/checkpoint.py#L224 20 | ntok_new = posemb_new.shape[1] 21 | if True: 22 | posemb_tok, posemb_grid = posemb[:, :1], posemb[0, 1:] 23 | ntok_new -= 1 24 | else: 25 | posemb_tok, posemb_grid = posemb[:, :0], posemb[0] 26 | gs_old = int(math.sqrt(len(posemb_grid))) 27 | gs_new = int(math.sqrt(ntok_new)) 28 | #_logger.info('Position embedding grid-size from %s to %s', gs_old, gs_new) 29 | posemb_grid = posemb_grid.reshape(1, gs_old, gs_old, -1).permute(0, 3, 1, 2) 30 | posemb_grid = F.interpolate(posemb_grid, size=(gs_new, gs_new), mode='bilinear') 31 | posemb_grid = posemb_grid.permute(0, 2, 3, 1).reshape(1, gs_new * gs_new, -1) 32 | posemb = torch.cat([posemb_tok, posemb_grid], dim=1) 33 | return posemb 34 | 35 | def initialize_clip(VISUAL_CONFIG, num_patches = 240, adapter_config=None): 36 | import clip 37 | clip_model, preprocess = clip.load(VISUAL_CONFIG.clip_model_name, jit=False, adapter_config=adapter_config) 38 | if VISUAL_CONFIG.clip_model_name == "ViT-B/32" and VISUAL_CONFIG.reset_pos_embedding: 39 | 40 | #from timm.models.vision_transformer import resize_pos_embed 41 | pos_embed = nn.Parameter(torch.zeros(num_patches + 1, 768).float()) 42 | pos_embed.weight = resize_pos_embed(clip_model.visual.positional_embedding.unsqueeze(0), pos_embed.unsqueeze(0)) 43 | clip_model.visual.positional_embedding = pos_embed 44 | # model.visual.positional_embedding = model.visual.positional_embedding.to("cuda") 45 | #print(model.visual.positional_embedding.device) 46 | # pass 47 | if VISUAL_CONFIG.freeze_clip: 48 | for parameter in clip_model.parameters(): 49 | parameter.requires_grad = False 50 | return clip_model 51 | 52 | def initialize_vit(VISUAL_CONFIG, model_type = "ViT-B_32", pretrained_dir = "data/ViT-B_32.npz", img_size = (384, 640), num_patches = 240): 53 | from vit.models.modeling import VisionTransformer, CONFIGS 54 | config = CONFIGS[model_type] 55 | model = VisionTransformer(config, img_size = 224, zero_head=True, num_classes=1) 56 | model.load_from(np.load(pretrained_dir)) 57 | 58 | pos_embed = 
nn.Parameter(torch.zeros(num_patches + 1, 768).float()) 59 | pos_embed.weight = resize_pos_embed(model.transformer.embeddings.position_embeddings, pos_embed.unsqueeze(0)) 60 | model.transformer.embeddings.position_embeddings = pos_embed 61 | if VISUAL_CONFIG.freeze_clip: 62 | for parameter in model.parameters(): 63 | parameter.requires_grad = False 64 | return model 65 | 66 | def initialize_optimizer(visual_model, lr, momentum, weight_decay): 67 | optimizer = torch.optim.SGD(visual_model.parameters(), lr, 68 | momentum=momentum, 69 | weight_decay=weight_decay) 70 | return optimizer 71 | 72 | def adjust_learning_rate(optimizer, epoch, args): 73 | """Decay the learning rate based on schedule""" 74 | lr = args.sgd_lr 75 | 76 | for milestone in args.schedule: 77 | lr *= 0.1 if epoch >= milestone else 1. 78 | for param_group in optimizer.param_groups: 79 | param_group['lr'] = lr 80 | 81 | from torch.optim import Optimizer 82 | 83 | class FusedOptimizer(Optimizer): 84 | def __init__(self, optimizers): 85 | self.optimizers = optimizers 86 | param_groups = [] 87 | for optimizer in self.optimizers: 88 | param_groups += optimizer.param_groups 89 | #super(FusedOptimizer, self).__init__([], {}) 90 | self.param_groups = param_groups 91 | 92 | def step(self): 93 | for optimizer in self.optimizers: 94 | optimizer.step() 95 | -------------------------------------------------------------------------------- /CLIP-ViL/src/pretrain/qa_answer_table.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyleft 2019 project LXRT. 3 | 4 | import json 5 | import torch 6 | 7 | 8 | class AnswerTable: 9 | ANS_CONVERT = { 10 | "a man": "man", 11 | "the man": "man", 12 | "a woman": "woman", 13 | "the woman": "woman", 14 | 'one': '1', 15 | 'two': '2', 16 | 'three': '3', 17 | 'four': '4', 18 | 'five': '5', 19 | 'six': '6', 20 | 'seven': '7', 21 | 'eight': '8', 22 | 'nine': '9', 23 | 'ten': '10', 24 | 'grey': 'gray', 25 | } 26 | 27 | def __init__(self, dsets=None): 28 | self.all_ans = json.load(open("data/lxmert/all_ans.json")) 29 | if dsets is not None: 30 | dsets = set(dsets) 31 | # If the answer is used in the dsets 32 | self.anss = [ans['ans'] for ans in self.all_ans if 33 | len(set(ans['dsets']) & dsets) > 0] 34 | else: 35 | self.anss = [ans['ans'] for ans in self.all_ans] 36 | self.ans_set = set(self.anss) 37 | 38 | self._id2ans_map = self.anss 39 | self._ans2id_map = {ans: ans_id for ans_id, ans in enumerate(self.anss)} 40 | 41 | assert len(self._id2ans_map) == len(self._ans2id_map) 42 | for ans_id, ans in enumerate(self._id2ans_map): 43 | assert self._ans2id_map[ans] == ans_id 44 | 45 | def convert_ans(self, ans): 46 | if len(ans) == 0: 47 | return "" 48 | ans = ans.lower() 49 | if ans[-1] == '.': 50 | ans = ans[:-1].strip() 51 | if ans.startswith("a "): 52 | ans = ans[2:].strip() 53 | if ans.startswith("an "): 54 | ans = ans[3:].strip() 55 | if ans.startswith("the "): 56 | ans = ans[4:].strip() 57 | if ans in self.ANS_CONVERT: 58 | ans = self.ANS_CONVERT[ans] 59 | return ans 60 | 61 | def ans2id(self, ans): 62 | return self._ans2id_map[ans] 63 | 64 | def id2ans(self, ans_id): 65 | return self._id2ans_map[ans_id] 66 | 67 | def ans2id_map(self): 68 | return self._ans2id_map.copy() 69 | 70 | def id2ans_map(self): 71 | return self._id2ans_map.copy() 72 | 73 | def used(self, ans): 74 | return ans in self.ans_set 75 | 76 | def all_answers(self): 77 | return self.anss.copy() 78 | 79 | @property 80 | def num_answers(self): 81 | return len(self.anss) 82 | 83 | from 
tools.load_stagte_dict import load_state_dict_flexible, load_state_dict_flexible_with_fp16 84 | def load_lxmert_qa(path, model, label2ans): 85 | """ 86 | Load model weights from LXMERT pre-training. 87 | The answers in the fine-tuned QA task (indicated by label2ans) 88 | would also be properly initialized with LXMERT pre-trained 89 | QA heads. 90 | 91 | :param path: Path to LXMERT snapshot. 92 | :param model: LXRT model instance. 93 | :param label2ans: The label2ans dict of fine-tuned QA datasets, like 94 | {0: 'cat', 1: 'dog', ...} 95 | :return: 96 | """ 97 | print("Load QA pre-trained LXMERT from %s " % path) 98 | loaded_state_dict = torch.load("%s_LXRT.pth" % path, "cpu") 99 | model_state_dict = model.state_dict() 100 | 101 | # Handle Multi-GPU pre-training --> Single GPU fine-tuning 102 | for key in list(loaded_state_dict.keys()): 103 | loaded_state_dict[key.replace("module.", '')] = loaded_state_dict.pop(key) 104 | 105 | # Isolate bert model 106 | bert_state_dict = {} 107 | for key, value in loaded_state_dict.items(): 108 | if key.startswith('bert.'): 109 | bert_state_dict[key] = value 110 | 111 | # Isolate answer head 112 | answer_state_dict = {} 113 | for key, value in loaded_state_dict.items(): 114 | if key.startswith("answer_head."): 115 | answer_state_dict[key.replace('answer_head.', '')] = value 116 | 117 | # Do surgery on answer state dict 118 | ans_weight = answer_state_dict['logit_fc.3.weight'] 119 | ans_bias = answer_state_dict['logit_fc.3.bias'] 120 | import copy 121 | new_answer_weight = copy.deepcopy(model_state_dict['logit_fc.3.weight']) 122 | new_answer_bias = copy.deepcopy(model_state_dict['logit_fc.3.bias']) 123 | answer_table = AnswerTable() 124 | loaded = 0 125 | unload = 0 126 | if type(label2ans) is list: 127 | label2ans = {label: ans for label, ans in enumerate(label2ans)} 128 | for label, ans in label2ans.items(): 129 | new_ans = answer_table.convert_ans(ans) 130 | if answer_table.used(new_ans): 131 | ans_id_9500 = answer_table.ans2id(new_ans) 132 | new_answer_weight[label] = ans_weight[ans_id_9500] 133 | new_answer_bias[label] = ans_bias[ans_id_9500] 134 | loaded += 1 135 | else: 136 | new_answer_weight[label] = 0. 137 | new_answer_bias[label] = 0. 138 | unload += 1 139 | print("Loaded %d answers from LXRTQA pre-training and %d not" % (loaded, unload)) 140 | print() 141 | answer_state_dict['logit_fc.3.weight'] = new_answer_weight 142 | answer_state_dict['logit_fc.3.bias'] = new_answer_bias 143 | 144 | # Load Bert Weights 145 | bert_model_keys = set(model.lxrt_encoder.model.state_dict().keys()) 146 | bert_loaded_keys = set(bert_state_dict.keys()) 147 | # assert len(bert_model_keys - bert_loaded_keys) == 0 148 | load_state_dict_flexible_with_fp16(model.lxrt_encoder.model, bert_state_dict) 149 | #model.lxrt_encoder.model.load_state_dict(bert_state_dict, strict=False) 150 | 151 | # Load Answer Logic FC Weights 152 | model_keys = set(model.state_dict().keys()) 153 | ans_loaded_keys = set(answer_state_dict.keys()) 154 | # assert len(ans_loaded_keys - model_keys) == 0 155 | 156 | #model.load_state_dict(answer_state_dict, strict=False) 157 | load_state_dict_flexible_with_fp16(model, answer_state_dict) 158 | 159 | 160 | 161 | -------------------------------------------------------------------------------- /CLIP-ViL/src/tasks/gqa_model.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyleft 2019 project LXRT. 
3 | 4 | import torch.nn as nn 5 | 6 | from param import args 7 | from lxrt.entry import LXRTEncoder 8 | from lxrt.modeling import BertLayerNorm, GeLU 9 | 10 | # Max length including and 11 | MAX_GQA_LENGTH = 20 12 | 13 | 14 | class GQAModel(nn.Module): 15 | def __init__(self, num_answers): 16 | super().__init__() 17 | self.lxrt_encoder = LXRTEncoder( 18 | args, 19 | max_seq_length=MAX_GQA_LENGTH 20 | ) 21 | hid_dim = self.lxrt_encoder.dim 22 | self.logit_fc = nn.Sequential( 23 | nn.Linear(hid_dim, hid_dim * 2), 24 | GeLU(), 25 | BertLayerNorm(hid_dim * 2, eps=1e-12), 26 | nn.Linear(hid_dim * 2, num_answers) 27 | ) 28 | self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights) 29 | self.task = "vqa" 30 | 31 | def forward(self, feat, pos, sent): 32 | """ 33 | b -- batch_size, o -- object_number, f -- visual_feature_size 34 | 35 | :param feat: (b, o, f) 36 | :param pos: (b, o, 4) 37 | :param sent: (b,) Type -- list of string 38 | :param leng: (b,) Type -- int numpy array 39 | :return: (b, num_answer) The logit of each answers. 40 | """ 41 | x = self.lxrt_encoder(sent, (feat, pos), task=self.task) 42 | logit = self.logit_fc(x) 43 | 44 | return logit 45 | 46 | 47 | -------------------------------------------------------------------------------- /CLIP-ViL/src/tasks/vqa_model.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyleft 2019 project LXRT. 3 | 4 | import torch.nn as nn 5 | 6 | from param import args 7 | from lxrt.entry import LXRTEncoder 8 | from lxrt.modeling import BertLayerNorm, GeLU 9 | 10 | # Max length including and 11 | MAX_VQA_LENGTH = 20 12 | 13 | 14 | class VQAModel(nn.Module): 15 | def __init__(self, num_answers): 16 | super().__init__() 17 | 18 | # Build LXRT encoder 19 | self.lxrt_encoder = LXRTEncoder( 20 | args, 21 | max_seq_length=MAX_VQA_LENGTH 22 | ) 23 | hid_dim = self.lxrt_encoder.dim 24 | 25 | # VQA Answer heads 26 | self.logit_fc = nn.Sequential( 27 | nn.Linear(hid_dim, hid_dim * 2), 28 | GeLU(), 29 | BertLayerNorm(hid_dim * 2, eps=1e-12), 30 | nn.Linear(hid_dim * 2, num_answers) 31 | ) 32 | self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights) 33 | 34 | self.task = "vqa" 35 | 36 | def forward(self, feat, pos, sent): 37 | """ 38 | b -- batch_size, o -- object_number, f -- visual_feature_size 39 | 40 | :param feat: (b, o, f) 41 | :param pos: (b, o, 4) 42 | :param sent: (b,) Type -- list of string 43 | :param leng: (b,) Type -- int numpy array 44 | :return: (b, num_answer) The logit of each answers. 
45 | """ 46 | # print(len(sent), feat.shape, pos.shape) 47 | x = self.lxrt_encoder(sent, (feat, pos), task=self.task) 48 | logit = self.logit_fc(x) 49 | 50 | return logit 51 | 52 | 53 | if __name__ == "__main__": 54 | model = VQAModel(4000) -------------------------------------------------------------------------------- /CLIP-ViL/src/tools/lmdb_dataset.py: -------------------------------------------------------------------------------- 1 | from collections import defaultdict 2 | class TrainingMeter(): 3 | def __init__(self): 4 | self.counter_dict = defaultdict(float) 5 | self.true_dict = defaultdict(float) 6 | 7 | def update(self, loss_dict): 8 | for key, item in loss_dict.items(): 9 | self.counter_dict[key] += 1 10 | self.true_dict[key] += item 11 | 12 | def report(self, logger = None): 13 | keys = list(self.counter_dict.keys()) 14 | keys.sort() 15 | for key in keys: 16 | if logger is None: 17 | print(" {} : {:.7}".format(key, self.true_dict[key] / self.counter_dict[key])) 18 | else: 19 | logger.info(" {} : {:.7}".format(key, self.true_dict[key] / self.counter_dict[key])) 20 | 21 | def clean(self): 22 | self.counter_dict = defaultdict(float) 23 | self.true_dict = defaultdict(float) 24 | 25 | 26 | from lz4.frame import compress, decompress 27 | from collections import defaultdict 28 | from contextlib import contextmanager 29 | import io 30 | import json 31 | from os.path import exists 32 | import msgpack 33 | import msgpack_numpy 34 | import collections 35 | import lmdb 36 | msgpack_numpy.patch() 37 | 38 | class TxtLmdb(object): 39 | def __init__(self, db_dir, readonly=True, readahead=False): 40 | self.readonly = readonly 41 | if readonly: 42 | # training 43 | self.env = lmdb.open(db_dir, 44 | readonly=True, create=False, 45 | readahead=readahead) 46 | self.txn = self.env.begin(buffers=True) 47 | self.write_cnt = None 48 | else: 49 | # prepro 50 | self.env = lmdb.open(db_dir, readonly=False, create=True, 51 | map_size=4 * 1024**4) 52 | self.txn = self.env.begin(write=True) 53 | self.write_cnt = 0 54 | 55 | def __del__(self): 56 | if self.write_cnt: 57 | self.txn.commit() 58 | self.env.close() 59 | 60 | def __getitem__(self, key): 61 | return msgpack.loads(decompress(self.txn.get(key.encode('utf-8'))), 62 | raw=False) 63 | 64 | def __setitem__(self, key, value): 65 | # NOTE: not thread safe 66 | if self.readonly: 67 | raise ValueError('readonly text DB') 68 | ret = self.txn.put(key.encode('utf-8'), 69 | compress(msgpack.dumps(value, use_bin_type=True))) 70 | self.write_cnt += 1 71 | if self.write_cnt % 1000 == 0: 72 | self.txn.commit() 73 | self.txn = self.env.begin(write=True) 74 | self.write_cnt = 0 75 | return ret 76 | -------------------------------------------------------------------------------- /CLIP-ViL/src/tools/load_stagte_dict.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | def load_state_dict_flexible(model, state_dict): 4 | try: 5 | model.load_state_dict(state_dict) 6 | except: 7 | print("Full loading failed!! 
Try partial loading!!") 8 | 9 | own_state = model.state_dict() 10 | 11 | for name, param in state_dict.items(): 12 | if name not in own_state: 13 | print("Skipped: " + name) 14 | continue 15 | if isinstance(param, torch.nn.Parameter): 16 | # backwards compatibility for serialized parameters 17 | param = param.data 18 | try: 19 | own_state[name].copy_(param) 20 | print("Successfully loaded: "+name) 21 | except: 22 | print("Part load failed: " + name) 23 | 24 | def load_state_dict_flexible_with_fp16(model, state_dict): 25 | try: 26 | model.load_state_dict(state_dict) 27 | except: 28 | print("Full loading failed!! Try partial loading!!") 29 | 30 | own_state = model.state_dict() 31 | 32 | for name, param in state_dict.items(): 33 | if name not in own_state: 34 | print("Skipped: " + name) 35 | continue 36 | if isinstance(param, torch.nn.Parameter): 37 | # backwards compatibility for serialized parameters 38 | param = param.data 39 | try: 40 | #print("Name {}, original_type: {}, load type".format(name, own_state[name].dtype, param.dtype)) 41 | param = param.to(own_state[name].device) 42 | own_state[name].copy_(param) 43 | print("Successfully loaded: "+name) 44 | except: 45 | print("Part load failed: " + name) -------------------------------------------------------------------------------- /CLIP-ViL/src/tools/resize_images.py: -------------------------------------------------------------------------------- 1 | folder = "/local/harold/ubert/clip_vlp/lxmert/data/mscoco/val2014/" 2 | root = "/local/harold/ubert/clip_vlp/lxmert/data/mscoco/val_" 3 | out_folder = "/local/harold/ubert/clip_vlp/lxmert/data/mscoco/val2014_small.lmdb" 4 | import torch 5 | import os 6 | import json 7 | from PIL import Image 8 | 9 | from tqdm import tqdm 10 | from vlm.vok_utilis import TxtLmdb 11 | import numpy as np 12 | def vokenize_and_cache_dataset(output_path, dataset, vokenizer, tokenizer): 13 | ## Let's use lmdb 14 | 15 | 16 | data_loader = DataLoader(dataset, shuffle=False, batch_size=1) 17 | for index, batch in enumerate(tqdm(data_loader)): 18 | top_scores, top_idxs, input_tokens, top_paths = vokenize_batch(batch, tokenizer, vokenizer) 19 | 20 | top_paths = top_paths[0] 21 | top_idxs = top_idxs[0].cpu().numpy().tolist() 22 | input_tokens = input_tokens[0] 23 | top_scores = top_scores[0].cpu().numpy().tolist() 24 | lmdb_dataset[str(index)] = { 25 | "top_paths": top_paths, 26 | "top_idxs": top_idxs, 27 | "input_tokens": input_tokens, 28 | "top_scores": top_scores 29 | } 30 | 31 | del lmdb_dataset 32 | 33 | from torchvision.transforms import Compose, CenterCrop, ToTensor, Normalize, ColorJitter 34 | from vision_helpers import Resize, PadToGivenSize 35 | 36 | min_size = 384 37 | max_size = 640 38 | flip_horizontal_prob = 0.0 39 | flip_vertical_prob = 0.0 40 | brightness = 0.0 41 | contrast = 0.0 42 | saturation = 0.0 43 | hue = 0.0 44 | color_jitter = ColorJitter( 45 | brightness=brightness, 46 | contrast=contrast, 47 | saturation=saturation, 48 | hue=hue, 49 | ) 50 | transform = Compose( 51 | [ 52 | Resize(min_size, max_size) 53 | #lambda image: image.convert("RGB"), 54 | #Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)), 55 | ] 56 | ) 57 | import copy 58 | import os 59 | import random 60 | 61 | import h5py 62 | import torch 63 | from torch.utils.data import DataLoader, Dataset 64 | from torch.nn.utils.rnn import pad_sequence 65 | from tqdm import tqdm 66 | 67 | #class ToyDataset(Dataset): 68 | # def __init__(self, ) 69 | all_image_files = [] 70 | for _, dirs, files in 
os.walk(folder, topdown=False): 71 | for image_file in tqdm(files): 72 | if image_file.endswith("jpg"): 73 | all_image_files.append(image_file) 74 | #with open(root+"image_ids.json", "w") as f: 75 | # json.dump(all_image_files, f) 76 | 77 | #with open("/local/harold/vqa/google_concetual/image_ids.json") as f: 78 | # all_image_files = json.load(f) 79 | 80 | from PIL import Image 81 | import io 82 | 83 | def image_to_byte_array(image): 84 | imgByteArr = io.BytesIO() 85 | image.save(imgByteArr, format="JPEG") 86 | imgByteArr = imgByteArr.getvalue() 87 | return imgByteArr 88 | 89 | def byte_array_to_image(byte): 90 | imgByteArr = io.BytesIO(byte) 91 | imgByteArr.seek(0) 92 | return Image.open(imgByteArr) 93 | 94 | from tqdm import tqdm 95 | lmdb_dataset = TxtLmdb(out_folder, readonly=False) 96 | valid_images = {} 97 | skipped = 0 98 | for image in tqdm(all_image_files): 99 | try: 100 | feats = transform(Image.open(os.path.join(folder, image))) # Raw image as a tensor: 3 x 224 x 224 101 | lmdb_dataset[image] = image_to_byte_array(feats) 102 | valid_images[image] = feats.size 103 | except KeyboardInterrupt: 104 | del lmdb_dataset 105 | assert (0) 106 | except: 107 | skipped += 1 108 | if skipped % 100 == 0: 109 | print("{} skipped.".format(skipped)) 110 | pass 111 | 112 | with open(root + "image_size.json", "w") as f: 113 | json.dump(valid_images, f) 114 | 115 | ''' 116 | all_image_files = [] 117 | for root, dirs, files in os.walk(folder, topdown=False): 118 | for image_file in files: 119 | if image_file.endswith("jpg"): 120 | all_image_files.append(image_file) 121 | with open("/local/harold/vqa/google_concetual/image_ids.json", "w") as f: 122 | json.dump(all_image_files, f)''' -------------------------------------------------------------------------------- /CLIP-ViL/src/utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyleft 2019 Project LXRT 3 | 4 | import sys 5 | import csv 6 | import base64 7 | import time 8 | 9 | import numpy as np 10 | from collections import defaultdict 11 | class TrainingMeter(): 12 | def __init__(self): 13 | self.counter_dict = defaultdict(float) 14 | self.true_dict = defaultdict(float) 15 | 16 | def update(self, loss_dict): 17 | for key, item in loss_dict.items(): 18 | self.counter_dict[key] += 1 19 | self.true_dict[key] += item 20 | 21 | def report(self, logger = None): 22 | keys = list(self.counter_dict.keys()) 23 | keys.sort() 24 | for key in keys: 25 | if logger is None: 26 | print(" {} : {:.7}".format(key, self.true_dict[key] / self.counter_dict[key])) 27 | else: 28 | logger.info(" {} : {:.7}".format(key, self.true_dict[key] / self.counter_dict[key])) 29 | 30 | def clean(self): 31 | self.counter_dict = defaultdict(float) 32 | self.true_dict = defaultdict(float) 33 | 34 | 35 | csv.field_size_limit(sys.maxsize) 36 | FIELDNAMES = ["img_id", "img_h", "img_w", "objects_id", "objects_conf", 37 | "attrs_id", "attrs_conf", "num_boxes", "boxes", "features"] 38 | 39 | 40 | def load_obj_tsv(fname, topk=None): 41 | """Load object features from tsv file. 42 | 43 | :param fname: The path to the tsv file. 44 | :param topk: Only load features for top K images (lines) in the tsv file. 45 | Will load all the features if topk is either -1 or None. 46 | :return: A list of image object features where each feature is a dict. 47 | See FILENAMES above for the keys in the feature dict. 
48 | """ 49 | data = [] 50 | start_time = time.time() 51 | print("Start to load Faster-RCNN detected objects from %s" % fname) 52 | with open(fname) as f: 53 | reader = csv.DictReader(f, FIELDNAMES, delimiter="\t") 54 | for i, item in enumerate(reader): 55 | 56 | for key in ['img_h', 'img_w', 'num_boxes']: 57 | item[key] = int(item[key]) 58 | 59 | boxes = item['num_boxes'] 60 | decode_config = [ 61 | ('objects_id', (boxes, ), np.int64), 62 | ('objects_conf', (boxes, ), np.float32), 63 | ('attrs_id', (boxes, ), np.int64), 64 | ('attrs_conf', (boxes, ), np.float32), 65 | ('boxes', (boxes, 4), np.float32), 66 | ('features', (boxes, -1), np.float32), 67 | ] 68 | for key, shape, dtype in decode_config: 69 | item[key] = np.frombuffer(base64.b64decode(item[key]), dtype=dtype) 70 | item[key] = item[key].reshape(shape) 71 | item[key].setflags(write=False) 72 | 73 | data.append(item) 74 | if topk is not None and len(data) == topk: 75 | break 76 | elapsed_time = time.time() - start_time 77 | print("Loaded %d images in file %s in %d seconds." % (len(data), fname, elapsed_time)) 78 | return data 79 | 80 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 YI-LIN SUNG 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # VL-Adapter 2 | 3 | * Authors: [Yi-Lin Sung](https://ylsung.github.io/), [Jaemin Cho](https://j-min.io/), [Mohit Bansal](https://www.cs.unc.edu/~mbansal/) 4 | * Paper: ["VL-Adapter: Parameter-Efficient Transfer Learning for Vision-and-Language Tasks"](https://arxiv.org/abs/2112.06825) (CVPR 2022) 5 | 6 | We evaluate VL-adapter in a unified multi-task 7 | setup on both image-text and video-text benchmarks. For the image-text tasks, we use four diverse V&L datasets: VQAv2, GQA, NLVR2, and MSCOCO image captioning. For video-text tasks, we use TVQA, How2QA, TVC, and YC2C. 8 | 9 | Our results demonstrate that training the adapter with the weight-sharing technique (4.18% of total parameters for image-text tasks and 3.39% for video-text tasks) can match 10 | the performance of fine-tuning the entire model. 
11 | 12 | ![](assets/vl_adapter_teaser.png) 13 | 14 | ** Note ** 15 | Please go into CLIP-ViL folder and follow the README there for running the experiments of adapters on CLIP-ViL. This README is for adapters on VL-Bart. 16 | 17 | 18 | ## Installation 19 | 20 | ``` 21 | # Create python environment (optional) 22 | conda create -n vlt5 python=3.8 23 | source activate vlt5 24 | 25 | # Install python dependencies 26 | pip install -r requirements.txt 27 | 28 | # Download T5/BART backbone checkpoint 29 | python download_backbones.py 30 | 31 | # For MSCOCO captioning evaluation (optional; for captioning only) 32 | python -c "import language_evaluation; language_evaluation.download('coco')" 33 | ``` 34 | 35 | ## Code structure 36 | ```bash 37 | # Store images, features, and annotations 38 | ./datasets 39 | COCO/ 40 | images/ 41 | clip_featuers/ 42 | VG/ 43 | images/ 44 | clip_features/ 45 | GQA/ 46 | images/ 47 | clip_features/ 48 | nlvr/ 49 | images/ 50 | clip_features/ 51 | vqa/ 52 | lxmert/ 53 | 54 | video/ 55 | ann/ 56 | vis_features 57 | 58 | # Train VL-T5 with adapters 59 | ./VL-T5/ 60 | src/ 61 | modeling_t5.py modeling_bart.py <= VL-T5/VL-BART model classes 62 | pretrain.py, pretrain_data.py, pretrain_model.py <= pretraining 63 | vqa.py, vqa_data.py vqa_model.py ... <= fine-tuning on downstream tasks (ex. VQA, GQA, NLVR2) 64 | multitask.py, multitask_data.py multiask_model.py <= multitask learning on 7 downstream tasks 65 | param.py <= (argparse) configuration 66 | tokenization.py <= custom tokenizer 67 | utils.py, dist_utils.py <= utility functions 68 | snap/ <= store weight checkpoints 69 | scripts/ <= bash scripts for pretraining and finetuning 70 | ``` 71 | 72 | ## Data 73 | 74 | ### Image-text dataset 75 | Please go to [link](https://drive.google.com/file/d/1O_RU1iFh_sbItZCTkOHUrbVIQQ_89Djj/view?usp=sharing) to download the processed CLIP features. We suggest to use [gdrive](https://github.com/prasmussen/gdrive) to download it. Unzip the downloaded file and arrange the folders following the format which is shown in the "Code Structure." 76 | 77 | If you would like to use dgrive to download the data, please try the following command 78 | 79 | ``` 80 | gdrive download 1O_RU1iFh_sbItZCTkOHUrbVIQQ_89Djj 81 | ``` 82 | 83 | ### Extract your own CLIP features 84 | Please refer to `feature_extraction` for more details. 85 | 86 | ### Video-text dataset 87 | Please go to [VALUE](https://github.com/VALUE-Leaderboard/DataRelease) to download the ViT processed data. 88 | 89 | ## Run different approaches 90 | The following scripts can run every approach with the best hyper-parameters. 
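In every script, the first positional argument is the number of GPUs (it is passed to `torch.distributed.launch` as `--nproc_per_node`), and any arguments after it are forwarded to the training script via `${@:2}`. For example, assuming argparse keeps the last occurrence of a repeated flag so the forwarded value overrides the script default:

```bash
# Run the single-adapter image experiment on 2 GPUs and override the epoch count.
bash scripts/image/single_adapter.sh 2 --epochs 10
```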
91 | 92 | ### Image dataset 93 | 94 | ```bash 95 | # Full fine-tuning 96 | cd VL-T5/ 97 | bash scripts/image/full_finetuning.sh 1 98 | 99 | # Single Adapter 100 | cd VL-T5/ 101 | bash scripts/image/single_adapter.sh 1 102 | 103 | # Multiple Adapters 104 | cd VL-T5/ 105 | bash scripts/image/multiple_adapters.sh 1 106 | 107 | # Hyperformer 108 | cd VL-T5/ 109 | bash scripts/image/hyperformer.sh 1 110 | 111 | # Single Compacter 112 | cd VL-T5/ 113 | bash scripts/image/single_compacter.sh 1 114 | 115 | # Multiple Compacters 116 | cd VL-T5/ 117 | bash scripts/image/multiple_compacters.sh 1 118 | 119 | # Single LoRA 120 | cd VL-T5/ 121 | bash scripts/image/single_lora.sh 1 122 | 123 | # Multiple LoRA 124 | cd VL-T5/ 125 | bash scripts/image/multiple_lora.sh 1 126 | 127 | # Single Prompt 128 | cd VL-T5/ 129 | bash scripts/image/single_prompt.sh 1 130 | 131 | # Multiple Prompts 132 | cd VL-T5/ 133 | bash scripts/image/multiple_prompts.sh 1 134 | ``` 135 | 136 | ### Video dataset 137 | 138 | ```bash 139 | # Full fine-tuning 140 | cd VL-T5/ 141 | bash scripts/video/full_finetuning.sh 1 142 | 143 | # Single Adapter 144 | cd VL-T5/ 145 | bash scripts/video/single_adapter.sh 1 146 | 147 | # Single LoRA 148 | cd VL-T5/ 149 | bash scripts/video/single_lora.sh 1 150 | 151 | # Single Prompt 152 | cd VL-T5/ 153 | bash scripts/video/single_prompt.sh 1 154 | 155 | ``` 156 | 157 | 158 | ## Acknowledgement 159 | 160 | This repo is adapted from [VLT5](https://github.com/j-min/VL-T5). I also borrow some codes from [CLIP](https://github.com/openai/CLIP), [CLIP-ViL](https://github.com/clip-vil/CLIP-ViL), [Compacter](https://github.com/ylsung/compacter), [Hyperformer](https://github.com/rabeehk/hyperformer) and [Prefix-tuning](https://github.com/XiangLi1999/PrefixTuning). 161 | 162 | 163 | ## Reference 164 | 165 | Please cite our paper if you use our models in your project. 166 | 167 | ```bibtex 168 | @inproceedings{sung2022vladapter, 169 | title = {VL-Adapter: Parameter-Efficient Transfer Learning for Vision-and-Language Tasks}, 170 | author = {Yi-Lin Sung, Jaemin Cho, Mohit Bansal}, 171 | booktitle = {CVPR}, 172 | year = {2022} 173 | } 174 | ``` -------------------------------------------------------------------------------- /VL-T5/inference/README.md: -------------------------------------------------------------------------------- 1 | Utility scripts for inference on custom images. 2 | The Faster R-CNN inference scripts are adapted from [Huggingface transformers LXMERT example](https://github.com/huggingface/transformers/blob/master/examples/research_projects/lxmert/). 
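Based on the `getopt` options in `extracting_data.py` below (`-i/--inputdir`, `-o/--outfile`, `-b/--batch_size`, `-s/--subset_list`), a typical invocation might look like the following; the paths are placeholders, and the output file must not already exist:

```bash
python extracting_data.py -i ./custom_images -o ./datasets/custom_frcnn.arrow -b 2
```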
-------------------------------------------------------------------------------- /VL-T5/inference/extracting_data.py: -------------------------------------------------------------------------------- 1 | import getopt 2 | import json 3 | import os 4 | 5 | # import numpy as np 6 | import sys 7 | from collections import OrderedDict 8 | 9 | import datasets 10 | import numpy as np 11 | import torch 12 | 13 | from modeling_frcnn import GeneralizedRCNN 14 | from processing_image import Preprocess 15 | from utils import Config 16 | 17 | 18 | """ 19 | USAGE: 20 | ``python extracting_data.py -i -o .datasets `` 21 | """ 22 | 23 | 24 | TEST = False 25 | CONFIG = Config.from_pretrained("unc-nlp/frcnn-vg-finetuned") 26 | DEFAULT_SCHEMA = datasets.Features( 27 | OrderedDict( 28 | { 29 | "attr_ids": datasets.Sequence(length=CONFIG.MAX_DETECTIONS, feature=datasets.Value("float32")), 30 | "attr_probs": datasets.Sequence(length=CONFIG.MAX_DETECTIONS, feature=datasets.Value("float32")), 31 | "boxes": datasets.Array2D((CONFIG.MAX_DETECTIONS, 4), dtype="float32"), 32 | "img_id": datasets.Value("int32"), 33 | "obj_ids": datasets.Sequence(length=CONFIG.MAX_DETECTIONS, feature=datasets.Value("float32")), 34 | "obj_probs": datasets.Sequence(length=CONFIG.MAX_DETECTIONS, feature=datasets.Value("float32")), 35 | "roi_features": datasets.Array2D((CONFIG.MAX_DETECTIONS, 2048), dtype="float32"), 36 | "sizes": datasets.Sequence(length=2, feature=datasets.Value("float32")), 37 | "preds_per_image": datasets.Value(dtype="int32"), 38 | } 39 | ) 40 | ) 41 | 42 | 43 | class Extract: 44 | def __init__(self, argv=sys.argv[1:]): 45 | inputdir = None 46 | outputfile = None 47 | subset_list = None 48 | batch_size = 1 49 | opts, args = getopt.getopt(argv, "i:o:b:s", ["inputdir=", "outfile=", "batch_size=", "subset_list="]) 50 | for opt, arg in opts: 51 | if opt in ("-i", "--inputdir"): 52 | inputdir = arg 53 | elif opt in ("-o", "--outfile"): 54 | outputfile = arg 55 | elif opt in ("-b", "--batch_size"): 56 | batch_size = int(arg) 57 | elif opt in ("-s", "--subset_list"): 58 | subset_list = arg 59 | 60 | assert inputdir is not None # and os.path.isdir(inputdir), f"{inputdir}" 61 | assert outputfile is not None and not os.path.isfile(outputfile), f"{outputfile}" 62 | if subset_list is not None: 63 | with open(os.path.realpath(subset_list)) as f: 64 | self.subset_list = set(map(lambda x: self._vqa_file_split()[0], tryload(f))) 65 | else: 66 | self.subset_list = None 67 | 68 | self.config = CONFIG 69 | if torch.cuda.is_available(): 70 | self.config.model.device = "cuda" 71 | self.inputdir = os.path.realpath(inputdir) 72 | self.outputfile = os.path.realpath(outputfile) 73 | self.preprocess = Preprocess(self.config) 74 | self.model = GeneralizedRCNN.from_pretrained("unc-nlp/frcnn-vg-finetuned", config=self.config) 75 | self.batch = batch_size if batch_size != 0 else 1 76 | self.schema = DEFAULT_SCHEMA 77 | 78 | def _vqa_file_split(self, file): 79 | img_id = int(file.split(".")[0].split("_")[-1]) 80 | filepath = os.path.join(self.inputdir, file) 81 | return (img_id, filepath) 82 | 83 | @property 84 | def file_generator(self): 85 | batch = [] 86 | for i, file in enumerate(os.listdir(self.inputdir)): 87 | if self.subset_list is not None and i not in self.subset_list: 88 | continue 89 | batch.append(self._vqa_file_split(file)) 90 | if len(batch) == self.batch: 91 | temp = batch 92 | batch = [] 93 | yield list(map(list, zip(*temp))) 94 | 95 | for i in range(1): 96 | yield list(map(list, zip(*batch))) 97 | 98 | def __call__(self): 99 | # make 
writer 100 | if not TEST: 101 | writer = datasets.ArrowWriter(features=self.schema, path=self.outputfile) 102 | # do file generator 103 | for i, (img_ids, filepaths) in enumerate(self.file_generator): 104 | images, sizes, scales_yx = self.preprocess(filepaths) 105 | output_dict = self.model( 106 | images, 107 | sizes, 108 | scales_yx=scales_yx, 109 | padding="max_detections", 110 | max_detections=self.config.MAX_DETECTIONS, 111 | pad_value=0, 112 | return_tensors="np", 113 | location="cpu", 114 | ) 115 | output_dict["boxes"] = output_dict.pop("normalized_boxes") 116 | if not TEST: 117 | output_dict["img_id"] = np.array(img_ids) 118 | batch = self.schema.encode_batch(output_dict) 119 | writer.write_batch(batch) 120 | if TEST: 121 | break 122 | # finalizer the writer 123 | if not TEST: 124 | num_examples, num_bytes = writer.finalize() 125 | print(f"Success! You wrote {num_examples} entry(s) and {num_bytes >> 20} mb") 126 | 127 | 128 | def tryload(stream): 129 | try: 130 | data = json.load(stream) 131 | try: 132 | data = list(data.keys()) 133 | except Exception: 134 | data = [d["img_id"] for d in data] 135 | except Exception: 136 | try: 137 | data = eval(stream.read()) 138 | except Exception: 139 | data = stream.read().split("\n") 140 | return data 141 | 142 | 143 | if __name__ == "__main__": 144 | extract = Extract(sys.argv[1:]) 145 | extract() 146 | if not TEST: 147 | dataset = datasets.Dataset.from_file(extract.outputfile) 148 | # wala! 149 | # print(np.array(dataset[0:2]["roi_features"]).shape) 150 | -------------------------------------------------------------------------------- /VL-T5/requirements.txt: -------------------------------------------------------------------------------- 1 | ftfy 2 | timm -------------------------------------------------------------------------------- /VL-T5/scripts/image/full_finetuning.sh: -------------------------------------------------------------------------------- 1 | task=multitask 2 | 3 | # or bart 4 | model="bart" 5 | 6 | echo $model 7 | 8 | if [ $model == "t5" ] 9 | then 10 | folder_prefix="VLT5" 11 | backbone="t5-base" 12 | batch_size=300 13 | elif [ $model == "bart" ] 14 | then 15 | folder_prefix="VLBart" 16 | backbone="facebook/bart-base" 17 | batch_size=500 18 | fi 19 | 20 | echo $folder_prefix 21 | echo $backbone 22 | 23 | feature=RN101 24 | 25 | lr=1e-4 26 | name=4tasks_hard_${feature}_LMfull_bs${batch_size}_image224_lr${lr} 27 | output=snap/${folder_prefix}_${task}/$name 28 | 29 | TOKENIZERS_PARALLELISM=True PYTHONPATH=$PYTHONPATH:./src \ 30 | python -m torch.distributed.launch \ 31 | --nproc_per_node=$1 \ 32 | --master_port=26757 \ 33 | src/${task}.py \ 34 | --distributed --multiGPU \ 35 | --optim adamw \ 36 | --warmup_ratio 0.1 \ 37 | --clip_grad_norm 5 \ 38 | --lr ${lr} \ 39 | --epochs 20 \ 40 | --num_workers 4 \ 41 | --backbone ${backbone} \ 42 | --output $output ${@:2} \ 43 | --num_beams 5 \ 44 | --batch_size ${batch_size} \ 45 | --valid_batch_size ${batch_size} \ 46 | --unfreeze_language_model \ 47 | --tasks "vqa,gqa,nlvr,caption" \ 48 | --feature ${feature} --n_boxes 36 --downsample \ 49 | --image_size "(224,224)" \ 50 | --run_name $name 51 | -------------------------------------------------------------------------------- /VL-T5/scripts/image/hyperformer.sh: -------------------------------------------------------------------------------- 1 | task=multitask 2 | 3 | # or bart 4 | model="bart" 5 | 6 | echo $model 7 | 8 | if [ $model == "t5" ] 9 | then 10 | folder_prefix="VLT5" 11 | backbone="t5-base" 12 | batch_size=300 13 | elif [ 
$model == "bart" ] 14 | then 15 | folder_prefix="VLBart" 16 | backbone="facebook/bart-base" 17 | batch_size=500 18 | fi 19 | 20 | echo $folder_prefix 21 | echo $backbone 22 | 23 | feature=RN101 24 | 25 | lr=1e-3 26 | 27 | projected_task_embedding_dim=8 28 | 29 | name=4tasks_hard_${feature}_LMhyperformer${projected_task_embedding_dim}+r8+ln_bs${batch_size}_image224_lr${lr} 30 | output=snap/${folder_prefix}_${task}/$name 31 | 32 | TOKENIZERS_PARALLELISM=True PYTHONPATH=$PYTHONPATH:./src \ 33 | python -m torch.distributed.launch \ 34 | --nproc_per_node=$1 \ 35 | --master_port=26763 \ 36 | src/${task}.py \ 37 | --distributed --multiGPU \ 38 | --optim adamw \ 39 | --warmup_ratio 0.1 \ 40 | --clip_grad_norm 5 \ 41 | --lr ${lr} \ 42 | --epochs 20 \ 43 | --num_workers 4 \ 44 | --backbone ${backbone} \ 45 | --output $output ${@:2} \ 46 | --num_beams 5 \ 47 | --batch_size ${batch_size} \ 48 | --valid_batch_size ${batch_size} \ 49 | --use_hyperformer \ 50 | --unique_hyper_net \ 51 | --unfreeze_layer_norms \ 52 | --projected_task_embedding_dim ${projected_task_embedding_dim} \ 53 | --reduction_factor 8 \ 54 | --tasks "vqa,gqa,nlvr,caption" \ 55 | --feature ${feature} --n_boxes 36 --downsample \ 56 | --image_size "(224,224)" \ 57 | --run_name $name -------------------------------------------------------------------------------- /VL-T5/scripts/image/multiple_adapters.sh: -------------------------------------------------------------------------------- 1 | task=multitask 2 | 3 | # or bart 4 | model="bart" 5 | 6 | echo $model 7 | 8 | if [ $model == "t5" ] 9 | then 10 | folder_prefix="VLT5" 11 | backbone="t5-base" 12 | batch_size=300 13 | elif [ $model == "bart" ] 14 | then 15 | folder_prefix="VLBart" 16 | backbone="facebook/bart-base" 17 | batch_size=500 18 | fi 19 | 20 | echo $folder_prefix 21 | echo $backbone 22 | 23 | feature=RN101 24 | 25 | lr=3e-4 26 | name=4tasks_hard_${feature}_LMadapter+r8+ln_bs${batch_size}_image224_lr${lr} 27 | output=snap/${folder_prefix}_${task}/$name 28 | 29 | TOKENIZERS_PARALLELISM=True PYTHONPATH=$PYTHONPATH:./src \ 30 | python -m torch.distributed.launch \ 31 | --nproc_per_node=$1 \ 32 | --master_port=26757 \ 33 | src/${task}.py \ 34 | --distributed --multiGPU \ 35 | --optim adamw \ 36 | --warmup_ratio 0.1 \ 37 | --clip_grad_norm 5 \ 38 | --lr ${lr} \ 39 | --epochs 20 \ 40 | --num_workers 4 \ 41 | --backbone ${backbone} \ 42 | --output $output ${@:2} \ 43 | --num_beams 5 \ 44 | --batch_size ${batch_size} \ 45 | --valid_batch_size ${batch_size} \ 46 | --use_adapter \ 47 | --unfreeze_layer_norms \ 48 | --reduction_factor 8 \ 49 | --tasks "vqa,gqa,nlvr,caption" \ 50 | --feature ${feature} --n_boxes 36 --downsample \ 51 | --image_size "(224,224)" \ 52 | --run_name $name 53 | -------------------------------------------------------------------------------- /VL-T5/scripts/image/multiple_compacters.sh: -------------------------------------------------------------------------------- 1 | task=multitask 2 | 3 | # or bart 4 | model="bart" 5 | 6 | echo $model 7 | 8 | if [ $model == "t5" ] 9 | then 10 | folder_prefix="VLT5" 11 | backbone="t5-base" 12 | batch_size=300 13 | elif [ $model == "bart" ] 14 | then 15 | folder_prefix="VLBart" 16 | backbone="facebook/bart-base" 17 | batch_size=500 18 | fi 19 | 20 | echo $folder_prefix 21 | echo $backbone 22 | 23 | feature=RN101 24 | 25 | lr=1e-3 26 | 27 | hypercomplex_division=2 28 | 29 | name=4tasks_hard_${feature}_LMcompacter+hdiv${hypercomplex_division}+noshare+nofac+ln+prompt_bs${batch_size}_image224_lr${lr} 30 | 
output=snap/${folder_prefix}_${task}/$name 31 | 32 | TOKENIZERS_PARALLELISM=True PYTHONPATH=$PYTHONPATH:./src \ 33 | python -m torch.distributed.launch \ 34 | --nproc_per_node=$1 \ 35 | --master_port=26764 \ 36 | src/${task}.py \ 37 | --distributed --multiGPU \ 38 | --optim adamw \ 39 | --warmup_ratio 0.1 \ 40 | --clip_grad_norm 5 \ 41 | --lr ${lr} \ 42 | --epochs 20 \ 43 | --num_workers 4 \ 44 | --backbone ${backbone} \ 45 | --output $output ${@:2} \ 46 | --num_beams 5 \ 47 | --batch_size ${batch_size} \ 48 | --valid_batch_size ${batch_size} \ 49 | --use_compacter \ 50 | --shared_phm_rule False \ 51 | --factorized_phm False \ 52 | --unfreeze_layer_norms \ 53 | --hypercomplex_division ${hypercomplex_division} \ 54 | --reduction_factor 8 \ 55 | --tasks "vqa,gqa,nlvr,caption" \ 56 | --feature ${feature} --n_boxes 36 --downsample \ 57 | --image_size "(224,224)" \ 58 | --run_name $name 59 | -------------------------------------------------------------------------------- /VL-T5/scripts/image/multiple_lora.sh: -------------------------------------------------------------------------------- 1 | task=multitask 2 | 3 | # or bart 4 | model="bart" 5 | 6 | echo $model 7 | 8 | if [ $model == "t5" ] 9 | then 10 | folder_prefix="VLT5" 11 | backbone="t5-base" 12 | batch_size=300 13 | elif [ $model == "bart" ] 14 | then 15 | folder_prefix="VLBart" 16 | backbone="facebook/bart-base" 17 | batch_size=500 18 | fi 19 | 20 | echo $folder_prefix 21 | echo $backbone 22 | 23 | feature=RN101 24 | 25 | lr=1e-3 26 | 27 | lora_dim=128 28 | 29 | name=${feature}_LMmultilora${lora_dim}+lr${lr}_bs${batch_size}_image224 30 | output=snap/${folder_prefix}_${task}/$name 31 | 32 | TOKENIZERS_PARALLELISM=True PYTHONPATH=$PYTHONPATH:./src \ 33 | python -m torch.distributed.launch \ 34 | --nproc_per_node=$1 \ 35 | --master_port=26786 \ 36 | src/${task}.py \ 37 | --distributed --multiGPU \ 38 | --optim adamw \ 39 | --warmup_ratio 0.1 \ 40 | --clip_grad_norm 5 \ 41 | --lr ${lr} \ 42 | --epochs 20 \ 43 | --num_workers 4 \ 44 | --backbone ${backbone} \ 45 | --output $output ${@:2} \ 46 | --num_beams 5 \ 47 | --use_lora \ 48 | --lora_dim ${lora_dim} \ 49 | --batch_size ${batch_size} \ 50 | --valid_batch_size ${batch_size} \ 51 | --tasks "vqa,gqa,nlvr,caption" \ 52 | --feature ${feature} --n_boxes 36 --downsample \ 53 | --image_size "(224,224)" \ 54 | --run_name $name 55 | -------------------------------------------------------------------------------- /VL-T5/scripts/image/multiple_prompts.sh: -------------------------------------------------------------------------------- 1 | task=multitask 2 | 3 | # or bart 4 | model="bart" 5 | 6 | echo $model 7 | 8 | if [ $model == "t5" ] 9 | then 10 | folder_prefix="VLT5" 11 | backbone="t5-base" 12 | batch_size=300 13 | elif [ $model == "bart" ] 14 | then 15 | folder_prefix="VLBart" 16 | backbone="facebook/bart-base" 17 | batch_size=500 18 | fi 19 | 20 | echo $folder_prefix 21 | echo $backbone 22 | 23 | feature=RN101 24 | 25 | lr=1e-3 26 | name=4tasks_hard_${feature}_LMprompt40_bs${batch_size}_image224_lr${lr} 27 | output=snap/${folder_prefix}_${task}/$name 28 | 29 | TOKENIZERS_PARALLELISM=True PYTHONPATH=$PYTHONPATH:./src \ 30 | python -m torch.distributed.launch \ 31 | --nproc_per_node=$1 \ 32 | --master_port=26757 \ 33 | src/${task}.py \ 34 | --distributed --multiGPU \ 35 | --optim adamw \ 36 | --warmup_ratio 0.1 \ 37 | --clip_grad_norm 5 \ 38 | --lr ${lr} \ 39 | --epochs 20 \ 40 | --num_workers 4 \ 41 | --backbone ${backbone} \ 42 | --output $output ${@:2} \ 43 | --num_beams 5 \ 44 | 
--batch_size ${batch_size} \ 45 | --valid_batch_size ${batch_size} \ 46 | --encoder_prompt_len 40 \ 47 | --mid_dim 800 \ 48 | --tasks "vqa,gqa,nlvr,caption" \ 49 | --feature ${feature} --n_boxes 36 --downsample \ 50 | --image_size "(224,224)" \ 51 | --run_name $name 52 | -------------------------------------------------------------------------------- /VL-T5/scripts/image/single_adapter.sh: -------------------------------------------------------------------------------- 1 | task=multitask 2 | 3 | # or bart 4 | model="bart" 5 | 6 | echo $model 7 | 8 | if [ $model == "t5" ] 9 | then 10 | folder_prefix="VLT5" 11 | backbone="t5-base" 12 | batch_size=300 13 | elif [ $model == "bart" ] 14 | then 15 | folder_prefix="VLBart" 16 | backbone="facebook/bart-base" 17 | batch_size=500 18 | fi 19 | 20 | echo $folder_prefix 21 | echo $backbone 22 | 23 | feature=RN101 24 | 25 | lr=1e-3 26 | name=4tasks_hard_${feature}_LMOneadapter+r8+ln_bs${batch_size}_image224_lr${lr} 27 | output=snap/${folder_prefix}_${task}/$name 28 | 29 | TOKENIZERS_PARALLELISM=True PYTHONPATH=$PYTHONPATH:./src \ 30 | python -m torch.distributed.launch \ 31 | --nproc_per_node=$1 \ 32 | --master_port=26757 \ 33 | src/${task}.py \ 34 | --distributed --multiGPU \ 35 | --optim adamw \ 36 | --warmup_ratio 0.1 \ 37 | --clip_grad_norm 5 \ 38 | --lr ${lr} \ 39 | --epochs 20 \ 40 | --num_workers 4 \ 41 | --backbone ${backbone} \ 42 | --output $output ${@:2} \ 43 | --num_beams 5 \ 44 | --batch_size ${batch_size} \ 45 | --valid_batch_size ${batch_size} \ 46 | --use_adapter \ 47 | --unfreeze_layer_norms \ 48 | --reduction_factor 8 \ 49 | --use_single_adapter \ 50 | --use_tasks_prompts \ 51 | --tasks "vqa,gqa,nlvr,caption" \ 52 | --feature ${feature} --n_boxes 36 --downsample \ 53 | --image_size "(224,224)" \ 54 | --run_name $name 55 | -------------------------------------------------------------------------------- /VL-T5/scripts/image/single_compacter.sh: -------------------------------------------------------------------------------- 1 | task=multitask 2 | 3 | # or bart 4 | model="bart" 5 | 6 | echo $model 7 | 8 | if [ $model == "t5" ] 9 | then 10 | folder_prefix="VLT5" 11 | backbone="t5-base" 12 | batch_size=300 13 | elif [ $model == "bart" ] 14 | then 15 | folder_prefix="VLBart" 16 | backbone="facebook/bart-base" 17 | batch_size=500 18 | fi 19 | 20 | echo $folder_prefix 21 | echo $backbone 22 | 23 | feature=RN101 24 | 25 | lr=1e-3 26 | 27 | hypercomplex_division=2 28 | 29 | name=4tasks_hard_${feature}_LMOnecompacter+hdiv${hypercomplex_division}+noshare+nofac+ln+prompt_bs${batch_size}_image224_lr${lr} 30 | output=snap/${folder_prefix}_${task}/$name 31 | 32 | TOKENIZERS_PARALLELISM=True PYTHONPATH=$PYTHONPATH:./src \ 33 | python -m torch.distributed.launch \ 34 | --nproc_per_node=$1 \ 35 | --master_port=26764 \ 36 | src/${task}.py \ 37 | --distributed --multiGPU \ 38 | --optim adamw \ 39 | --warmup_ratio 0.1 \ 40 | --clip_grad_norm 5 \ 41 | --lr ${lr} \ 42 | --epochs 20 \ 43 | --num_workers 4 \ 44 | --backbone ${backbone} \ 45 | --output $output ${@:2} \ 46 | --num_beams 5 \ 47 | --batch_size ${batch_size} \ 48 | --valid_batch_size ${batch_size} \ 49 | --use_compacter \ 50 | --shared_phm_rule False \ 51 | --factorized_phm False \ 52 | --unfreeze_layer_norms \ 53 | --use_single_adapter \ 54 | --use_tasks_prompts \ 55 | --hypercomplex_division ${hypercomplex_division} \ 56 | --reduction_factor 8 \ 57 | --tasks "vqa,gqa,nlvr,caption" \ 58 | --feature ${feature} --n_boxes 36 --downsample \ 59 | --image_size "(224,224)" \ 60 | --run_name $name 61 | 
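The compacter scripts above (`--use_compacter` with `--hypercomplex_division 2`) replace each adapter's dense projections with PHM layers, whose weight matrices are sums of Kronecker products (see `src/adapters/hypercomplex/kronecker.py`). A minimal sketch of that construction, not the repository's actual layer code:

```python
import torch

def phm_weight(A, B):
    # A: (n, n, n) "rule" factors, B: (n, d_in // n, d_out // n) weight blocks,
    # with n = hypercomplex_division. The full (d_in, d_out) weight is the sum
    # of the n Kronecker products A[i] (x) B[i], so far fewer parameters are stored.
    n = A.size(0)
    return sum(torch.kron(A[i], B[i]) for i in range(n))
```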
-------------------------------------------------------------------------------- /VL-T5/scripts/image/single_lora.sh: -------------------------------------------------------------------------------- 1 | task=multitask 2 | 3 | # or bart 4 | model="bart" 5 | 6 | echo $model 7 | 8 | if [ $model == "t5" ] 9 | then 10 | folder_prefix="VLT5" 11 | backbone="t5-base" 12 | batch_size=300 13 | elif [ $model == "bart" ] 14 | then 15 | folder_prefix="VLBart" 16 | backbone="facebook/bart-base" 17 | batch_size=500 18 | fi 19 | 20 | echo $folder_prefix 21 | echo $backbone 22 | 23 | feature=RN101 24 | 25 | lr=1e-3 26 | 27 | lora_dim=128 28 | 29 | name=${feature}_LMsinglelora${lora_dim}+lr${lr}_bs${batch_size}_image224 30 | output=snap/${folder_prefix}_${task}/$name 31 | 32 | TOKENIZERS_PARALLELISM=True PYTHONPATH=$PYTHONPATH:./src \ 33 | python -m torch.distributed.launch \ 34 | --nproc_per_node=$1 \ 35 | --master_port=26786 \ 36 | src/${task}.py \ 37 | --distributed --multiGPU \ 38 | --optim adamw \ 39 | --warmup_ratio 0.1 \ 40 | --clip_grad_norm 5 \ 41 | --lr ${lr} \ 42 | --epochs 20 \ 43 | --num_workers 4 \ 44 | --backbone ${backbone} \ 45 | --output $output ${@:2} \ 46 | --num_beams 5 \ 47 | --use_lora \ 48 | --lora_dim ${lora_dim} \ 49 | --use_single_lora \ 50 | --use_tasks_prompts \ 51 | --batch_size ${batch_size} \ 52 | --valid_batch_size ${batch_size} \ 53 | --tasks "vqa,gqa,nlvr,caption" \ 54 | --feature ${feature} --n_boxes 36 --downsample \ 55 | --image_size "(224,224)" \ 56 | --run_name $name 57 | -------------------------------------------------------------------------------- /VL-T5/scripts/image/single_prompt.sh: -------------------------------------------------------------------------------- 1 | task=multitask 2 | 3 | # or bart 4 | model="bart" 5 | 6 | echo $model 7 | 8 | if [ $model == "t5" ] 9 | then 10 | folder_prefix="VLT5" 11 | backbone="t5-base" 12 | batch_size=300 13 | elif [ $model == "bart" ] 14 | then 15 | folder_prefix="VLBart" 16 | backbone="facebook/bart-base" 17 | batch_size=500 18 | fi 19 | 20 | echo $folder_prefix 21 | echo $backbone 22 | 23 | feature=RN101 24 | 25 | lr=1e-3 26 | name=4tasks_hard_${feature}_LMOneprompt40_bs${batch_size}_image224_lr${lr} 27 | output=snap/${folder_prefix}_${task}/$name 28 | 29 | TOKENIZERS_PARALLELISM=True PYTHONPATH=$PYTHONPATH:./src \ 30 | python -m torch.distributed.launch \ 31 | --nproc_per_node=$1 \ 32 | --master_port=26757 \ 33 | src/${task}.py \ 34 | --distributed --multiGPU \ 35 | --optim adamw \ 36 | --warmup_ratio 0.1 \ 37 | --clip_grad_norm 5 \ 38 | --lr ${lr} \ 39 | --epochs 20 \ 40 | --num_workers 4 \ 41 | --backbone ${backbone} \ 42 | --output $output ${@:2} \ 43 | --num_beams 5 \ 44 | --batch_size ${batch_size} \ 45 | --valid_batch_size ${batch_size} \ 46 | --encoder_prompt_len 40 \ 47 | --mid_dim 800 \ 48 | --use_single_prompt \ 49 | --use_tasks_prompts \ 50 | --tasks "vqa,gqa,nlvr,caption" \ 51 | --feature ${feature} --n_boxes 36 --downsample \ 52 | --image_size "(224,224)" \ 53 | --run_name $name 54 | -------------------------------------------------------------------------------- /VL-T5/scripts/video/full_finetuning.sh: -------------------------------------------------------------------------------- 1 | task=multitask_video 2 | 3 | # or bart 4 | model="bart" 5 | 6 | echo $model 7 | 8 | if [ $model == "t5" ] 9 | then 10 | folder_prefix="VLT5" 11 | backbone="t5-base" 12 | batch_size=30 13 | elif [ $model == "bart" ] 14 | then 15 | folder_prefix="VLBart" 16 | backbone="facebook/bart-base" 17 | batch_size=50 18 | fi 19 | 20 | 
echo $folder_prefix 21 | echo $backbone 22 | 23 | feature=ViT 24 | 25 | lr=3e-5 26 | 27 | name=${feature}_LMfull_bs${batch_size}_image224_lr${lr}_subs_epoch7 28 | output=snap/${folder_prefix}_${task}/$name 29 | 30 | TOKENIZERS_PARALLELISM=True PYTHONPATH=$PYTHONPATH:./src \ 31 | python -m torch.distributed.launch \ 32 | --nproc_per_node=$1 \ 33 | --master_port=26791 \ 34 | src/${task}.py \ 35 | --distributed --multiGPU \ 36 | --optim adamw \ 37 | --warmup_ratio 0.1 \ 38 | --clip_grad_norm 5 \ 39 | --lr ${lr} \ 40 | --epochs 7 \ 41 | --num_workers 4 \ 42 | --backbone ${backbone} \ 43 | --output $output ${@:2} \ 44 | --num_beams 5 \ 45 | --unfreeze_language_model \ 46 | --batch_size ${batch_size} \ 47 | --valid_batch_size ${batch_size} \ 48 | --use_tasks_prompts \ 49 | --tasks "tvqa,how2qa,tvc,yc2c" \ 50 | --feature ${feature} --n_boxes 64 --downsample \ 51 | --image_size "(224,224)" \ 52 | --run_name $name 53 | -------------------------------------------------------------------------------- /VL-T5/scripts/video/single_adapter.sh: -------------------------------------------------------------------------------- 1 | task=multitask_video 2 | 3 | # or bart 4 | model="bart" 5 | 6 | echo $model 7 | 8 | if [ $model == "t5" ] 9 | then 10 | folder_prefix="VLT5" 11 | backbone="t5-base" 12 | batch_size=30 13 | elif [ $model == "bart" ] 14 | then 15 | folder_prefix="VLBart" 16 | backbone="facebook/bart-base" 17 | batch_size=50 18 | fi 19 | 20 | echo $folder_prefix 21 | echo $backbone 22 | 23 | feature=ViT 24 | 25 | lr=3e-4 26 | 27 | name=${feature}_LMadapter_bs${batch_size}_image224_lr${lr}_subs_epoch7 28 | output=snap/${folder_prefix}_${task}/$name 29 | 30 | TOKENIZERS_PARALLELISM=True PYTHONPATH=$PYTHONPATH:./src \ 31 | python -m torch.distributed.launch \ 32 | --nproc_per_node=$1 \ 33 | --master_port=26792 \ 34 | src/${task}.py \ 35 | --distributed --multiGPU \ 36 | --optim adamw \ 37 | --warmup_ratio 0.1 \ 38 | --clip_grad_norm 5 \ 39 | --lr ${lr} \ 40 | --epochs 7 \ 41 | --num_workers 4 \ 42 | --backbone ${backbone} \ 43 | --output $output ${@:2} \ 44 | --num_beams 5 \ 45 | --use_adapter \ 46 | --use_single_adapter \ 47 | --unfreeze_layer_norms \ 48 | --reduction_factor 8 \ 49 | --batch_size ${batch_size} \ 50 | --valid_batch_size ${batch_size} \ 51 | --use_tasks_prompts \ 52 | --tasks "tvqa,how2qa,tvc,yc2c" \ 53 | --feature ${feature} --n_boxes 64 --downsample \ 54 | --image_size "(224,224)" \ 55 | --run_name $name 56 | -------------------------------------------------------------------------------- /VL-T5/scripts/video/single_lora.sh: -------------------------------------------------------------------------------- 1 | task=multitask_video 2 | 3 | # or bart 4 | model="bart" 5 | 6 | echo $model 7 | 8 | if [ $model == "t5" ] 9 | then 10 | folder_prefix="VLT5" 11 | backbone="t5-base" 12 | batch_size=300 13 | elif [ $model == "bart" ] 14 | then 15 | folder_prefix="VLBart" 16 | backbone="facebook/bart-base" 17 | batch_size=50 18 | fi 19 | 20 | echo $folder_prefix 21 | echo $backbone 22 | 23 | feature=ViT 24 | 25 | lr=3e-4 26 | 27 | name=${feature}_LMlora_bs${batch_size}_image224_lr${lr}_subs_epoch7 28 | output=snap/${folder_prefix}_${task}/$name 29 | 30 | TOKENIZERS_PARALLELISM=True PYTHONPATH=$PYTHONPATH:./src \ 31 | python -m torch.distributed.launch \ 32 | --nproc_per_node=$1 \ 33 | --master_port=26799 \ 34 | src/${task}.py \ 35 | --distributed --multiGPU \ 36 | --optim adamw \ 37 | --warmup_ratio 0.1 \ 38 | --clip_grad_norm 5 \ 39 | --lr ${lr} \ 40 | --epochs 7 \ 41 | --num_workers 4 \ 42 | 
--backbone ${backbone} \ 43 | --output $output ${@:2} \ 44 | --num_beams 5 \ 45 | --use_lora \ 46 | --use_single_lora \ 47 | --lora_dim 128 \ 48 | --batch_size ${batch_size} \ 49 | --valid_batch_size ${batch_size} \ 50 | --use_tasks_prompts \ 51 | --tasks "tvqa,how2qa,tvc,yc2c" \ 52 | --feature ${feature} --n_boxes 64 --downsample \ 53 | --image_size "(224,224)" \ 54 | --run_name $name 55 | -------------------------------------------------------------------------------- /VL-T5/scripts/video/single_prompt.sh: -------------------------------------------------------------------------------- 1 | task=multitask_video 2 | 3 | # or bart 4 | model="bart" 5 | 6 | echo $model 7 | 8 | if [ $model == "t5" ] 9 | then 10 | folder_prefix="VLT5" 11 | backbone="t5-base" 12 | batch_size=300 13 | elif [ $model == "bart" ] 14 | then 15 | folder_prefix="VLBart" 16 | backbone="facebook/bart-base" 17 | batch_size=50 18 | fi 19 | 20 | echo $folder_prefix 21 | echo $backbone 22 | 23 | feature=ViT 24 | 25 | lr=3e-4 26 | 27 | name=${feature}_LMprompt_bs${batch_size}_image224_lr${lr}_subs_epoch7 28 | output=snap/${folder_prefix}_${task}/$name 29 | 30 | TOKENIZERS_PARALLELISM=True PYTHONPATH=$PYTHONPATH:./src \ 31 | python -m torch.distributed.launch \ 32 | --nproc_per_node=$1 \ 33 | --master_port=26798 \ 34 | src/${task}.py \ 35 | --distributed --multiGPU \ 36 | --optim adamw \ 37 | --warmup_ratio 0.1 \ 38 | --clip_grad_norm 5 \ 39 | --lr ${lr} \ 40 | --epochs 7 \ 41 | --num_workers 4 \ 42 | --backbone ${backbone} \ 43 | --output $output ${@:2} \ 44 | --num_beams 5 \ 45 | --batch_size ${batch_size} \ 46 | --valid_batch_size ${batch_size} \ 47 | --encoder_prompt_len 40 \ 48 | --mid_dim 800 \ 49 | --use_single_prompt \ 50 | --use_tasks_prompts \ 51 | --tasks "tvqa,how2qa,tvc,yc2c" \ 52 | --feature ${feature} --n_boxes 64 --downsample \ 53 | --image_size "(224,224)" \ 54 | --run_name $name 55 | -------------------------------------------------------------------------------- /VL-T5/src/adapters/__init__.py: -------------------------------------------------------------------------------- 1 | # The codes are borrowed from https://github.com/rabeehk/compacter 2 | 3 | from .config import MetaAdapterConfig, AdapterConfig, CompactorConfig, LRAdapterConfig 4 | from .adapter_modeling import Adapter, HyperComplexAdapter, OutputAdapter 5 | from .adapter_controller import AdapterController, AdapterLayer, MetaLayersAdapterController, OutputParallelAdapterLayer 6 | from .adapter_hypernetwork import AdapterLayersHyperNetController, AdapterLayersOneHyperNetController 7 | from .adapter_utils import TaskEmbeddingController -------------------------------------------------------------------------------- /VL-T5/src/adapters/adapter_configuration.py: -------------------------------------------------------------------------------- 1 | """Implements the adapters and other parameter-efficient finetuning methods' configurations.""" 2 | 3 | from collections import OrderedDict 4 | from dataclasses import dataclass 5 | 6 | import torch.nn as nn 7 | 8 | @dataclass 9 | class AdapterConfig(object): 10 | """Implements the adapter configuration proposed by Houlsby et. al, 2019 11 | in https://arxiv.org/abs/1902.00751. 
12 | We additionally pass all the configuration of parameter-efficient finetuning 13 | methods with this config.""" 14 | add_layer_norm_before_adapter: bool = False 15 | add_layer_norm_after_adapter: bool = True 16 | non_linearity: str = "swish" 17 | task_reduction_factor: int = 16 18 | add_adapter_in_feed_forward = True 19 | add_adapter_in_self_attention = True 20 | hidden_dim = 128 21 | task_adapter_layers_encoder = None 22 | task_adapter_layers_decoder = None 23 | task_adapter_in_decoder = True 24 | intrinsic_dim = 100 25 | normalize_intrinsic_projections = False 26 | # This can be either random, or fastfood. 27 | intrinsic_projection = "random" 28 | 29 | # Hypercomplex adapters parameters 30 | hypercomplex_adapters = False 31 | hypercomplex_division = 8 32 | learn_phm = True 33 | hypercomplex_nonlinearity="glorot-uniform" 34 | shared_phm_rule = False 35 | factorized_phm = False 36 | shared_W_phm = False 37 | factorized_phm_rule = False 38 | phm_c_init = "normal" 39 | phm_rank = 1 40 | phm_init_range=0.01 41 | 42 | # prefix-tuning parameters. 43 | prefix_dim = 100 44 | init_prefix_from_vocab = False 45 | kronecker_prod = False 46 | 47 | # BitFit configuration. 48 | bitfit = False 49 | 50 | # Low-rank adapters. 51 | low_rank_adapters = False 52 | low_rank_w_init = "glorot-uniform" 53 | low_rank_rank = 1 54 | 55 | 56 | ADAPTER_CONFIG_MAPPING = OrderedDict( 57 | [("adapter", AdapterConfig)]) 58 | 59 | 60 | class AutoAdapterConfig(nn.Module): 61 | """Generic Adapter config class to instantiate different adapter configs.""" 62 | 63 | @classmethod 64 | def get(cls, config_name: str): 65 | if config_name in ADAPTER_CONFIG_MAPPING: 66 | return ADAPTER_CONFIG_MAPPING[config_name]() 67 | raise ValueError( 68 | "Unrecognized adapter config type identifier: {}. Should contain one of {}" 69 | .format(config_name, ", ".join(ADAPTER_CONFIG_MAPPING.keys()))) 70 | -------------------------------------------------------------------------------- /VL-T5/src/adapters/adapter_modeling.py: -------------------------------------------------------------------------------- 1 | """Implements an Adapter, Low-rank adapters and Hyper-adapter Layers.""" 2 | import torch.nn as nn 3 | from .adapter_utils import Activations 4 | 5 | from .hypercomplex.layers import PHMLinear 6 | from .low_rank_layer import LowRankLinear 7 | 8 | 9 | class LowRankAdapter(nn.Module): 10 | """This is the low-rank adapter, in which each adapter is composed of two rank-one matrices. 
11 | """ 12 | def __init__(self, config): 13 | super().__init__() 14 | self.config = config 15 | self.input_dim = config.input_dim 16 | self.down_sample_size = self.input_dim // config.reduction_factor 17 | self.activation = Activations(config.non_linearity.lower()) 18 | self.down_sampler = LowRankLinear(self.input_dim, self.down_sample_size, 19 | w_init=config.low_rank_w_init, 20 | rank=config.low_rank_rank) 21 | self.up_sampler = LowRankLinear(self.down_sample_size, self.input_dim, 22 | w_init=config.low_rank_w_init, 23 | rank=config.low_rank_rank) 24 | 25 | self.track_z = config.track_z 26 | 27 | def forward(self, x): 28 | z = self.down_sampler(x) 29 | z = self.activation(z) 30 | if self.track_z: 31 | self.z = z 32 | output = self.up_sampler(z) 33 | return output 34 | 35 | 36 | class Adapter(nn.Module): 37 | """Conventional Adapter layer, in which the weights of up and down sampler modules 38 | are parameters and are optimized.""" 39 | 40 | def __init__(self, config): 41 | super().__init__() 42 | self.config = config 43 | self.input_dim = config.d_model 44 | reduction_factor = config.reduction_factor 45 | self.down_sample_size = self.input_dim // reduction_factor 46 | self.activation = Activations(config.non_linearity.lower()) 47 | self.down_sampler = nn.Linear(self.input_dim, self.down_sample_size) 48 | self.up_sampler = nn.Linear(self.down_sample_size, self.input_dim) 49 | 50 | self.track_z = config.track_z 51 | 52 | def forward(self, x): 53 | z = self.down_sampler(x) 54 | z = self.activation(z) 55 | if self.track_z: 56 | self.z = z 57 | output = self.up_sampler(z) 58 | return output 59 | 60 | 61 | class OutputAdapter(nn.Module): 62 | """Conventional Adapter layer, in which the weights of up and down sampler modules 63 | are parameters and are optimized.""" 64 | 65 | def __init__(self, config, output_dim): 66 | super().__init__() 67 | self.config = config 68 | self.input_dim = config.d_model 69 | reduction_factor = 16 70 | self.down_sample_size = self.input_dim // reduction_factor 71 | self.activation = Activations(config.non_linearity.lower()) 72 | self.down_sampler = nn.Linear(self.input_dim, self.down_sample_size) 73 | self.up_sampler = nn.Linear(self.down_sample_size, output_dim) 74 | 75 | def forward(self, x): 76 | z = self.down_sampler(x) 77 | z = self.activation(z) 78 | output = self.up_sampler(z) 79 | return output 80 | 81 | def resize_up_sampler(self, resized_size): 82 | self.up_sampler = nn.Linear(self.down_sample_size, resized_size) 83 | 84 | 85 | class HyperComplexAdapter(nn.Module): 86 | """Hypercomplex Adapter layer, in which the weights of up and down sampler modules 87 | are parameters are 1/n times of the conventional adapter layers, where n is 88 | hypercomplex division number.""" 89 | 90 | def __init__(self, config): 91 | super().__init__() 92 | self.config = config 93 | self.input_dim = config.input_dim 94 | self.down_sample_size = self.input_dim // config.reduction_factor 95 | self.activation = Activations(config.non_linearity.lower()) 96 | self.down_sampler = PHMLinear(in_features=self.input_dim, 97 | out_features=self.down_sample_size, 98 | bias=True, 99 | c_init=config.phm_c_init, 100 | phm_dim=config.hypercomplex_division, 101 | learn_phm=config.learn_phm, 102 | w_init=config.hypercomplex_nonlinearity, 103 | shared_phm_rule=config.shared_phm_rule, 104 | factorized_phm=config.factorized_phm, 105 | shared_W_phm=config.shared_W_phm, 106 | factorized_phm_rule=config.factorized_phm_rule, 107 | phm_rank=config.phm_rank, 108 | phm_init_range=config.phm_init_range, 
109 | kronecker_prod=config.kronecker_prod) 110 | self.up_sampler = PHMLinear(in_features=self.down_sample_size, 111 | out_features=self.input_dim, 112 | bias=True, 113 | c_init=config.phm_c_init, 114 | phm_dim=config.hypercomplex_division, 115 | learn_phm=config.learn_phm, 116 | w_init=config.hypercomplex_nonlinearity, 117 | shared_phm_rule=config.shared_phm_rule, 118 | factorized_phm=config.factorized_phm, 119 | shared_W_phm=config.shared_W_phm, 120 | factorized_phm_rule=config.factorized_phm_rule, 121 | phm_rank=config.phm_rank, 122 | phm_init_range=config.phm_init_range, 123 | kronecker_prod=config.kronecker_prod) 124 | 125 | self.track_z = config.track_z 126 | 127 | def forward(self, x): 128 | z = self.down_sampler(x) 129 | z = self.activation(z) 130 | if self.track_z: 131 | self.z = z 132 | return self.up_sampler(z) -------------------------------------------------------------------------------- /VL-T5/src/adapters/adapter_outputs.py: -------------------------------------------------------------------------------- 1 | """Defines the output class for the adapter layers' parameters.""" 2 | import torch 3 | from dataclasses import dataclass 4 | 5 | 6 | @dataclass 7 | class SamplerOutput: 8 | """Base class for the base and weights of each adapter.""" 9 | weight: torch.FloatTensor = None 10 | bias: torch.FloatTensor = None 11 | 12 | 13 | @dataclass 14 | class LayerNormOutput: 15 | """Base class for the base and weights of the conditional 16 | layer norms.""" 17 | weight: torch.FloatTensor = None 18 | bias: torch.FloatTensor = None 19 | 20 | 21 | @dataclass 22 | class AdapterOutput: 23 | """Base class for each adapter weights""" 24 | up: SamplerOutput = None 25 | down: SamplerOutput = None 26 | pre_norm: LayerNormOutput = None 27 | post_norm: LayerNormOutput = None 28 | 29 | 30 | @dataclass 31 | class AdapterT5BlockOutput: 32 | """ 33 | Base class for adapter layer's outputs. 
34 | """ 35 | feed_forward: AdapterOutput = None 36 | self_attention: AdapterOutput = None 37 | cross_attention: AdapterOutput = None -------------------------------------------------------------------------------- /VL-T5/src/adapters/adapter_utils.py: -------------------------------------------------------------------------------- 1 | """Implementation of different utility functions for adapter layers.""" 2 | import torch 3 | import torch.nn as nn 4 | from transformers.activations import get_activation 5 | 6 | 7 | class Activations(nn.Module): 8 | def __init__(self, activation_type): 9 | super().__init__() 10 | self.f = get_activation(activation_type) 11 | 12 | def forward(self, x): 13 | return self.f(x) 14 | 15 | 16 | def init_linear_layer(linear_layer, std=1e-2): 17 | """Initializes the given linear module as explained in adapter paper.""" 18 | nn.init.normal_(linear_layer.weight, std=std) 19 | nn.init.zeros_(linear_layer.bias) 20 | 21 | 22 | def linear_layer(input_dim, output_dim, std=1e-2): 23 | """Generates a linear module and initializes it.""" 24 | linear = nn.Linear(input_dim, output_dim) 25 | init_linear_layer(linear, std=std) 26 | return linear 27 | 28 | 29 | class TaskHyperNet(nn.Module): 30 | """This module generates the task-embeddings from the initial feeded task embeddings.""" 31 | 32 | def __init__(self, config, input_dim): 33 | super(TaskHyperNet, self).__init__() 34 | self.task_hidden_dim = config.task_hidden_dim 35 | self.projected_task_embedding_dim = config.projected_task_embedding_dim 36 | self.task_embeding_generator = nn.Sequential( 37 | linear_layer(input_dim, self.task_hidden_dim), 38 | nn.ReLU(), 39 | linear_layer(self.task_hidden_dim, self.projected_task_embedding_dim)) 40 | 41 | def forward(self, task_embedding): 42 | task_embedding = task_embedding.view(-1) 43 | return self.task_embeding_generator(task_embedding).view(-1) 44 | 45 | 46 | class LayerNormHyperNet(nn.Module): 47 | """This module generates the weight and bias for the task conditioned layer norm.""" 48 | 49 | def __init__(self, config): 50 | super(LayerNormHyperNet, self).__init__() 51 | self.task_embedding_dim = config.projected_task_embedding_dim \ 52 | if config.train_task_embeddings else config.task_embedding_dim 53 | self.weight_generator = linear_layer(self.task_embedding_dim, config.input_dim) 54 | self.bias_generator = linear_layer(self.task_embedding_dim, config.input_dim) 55 | 56 | def forward(self, input): 57 | return self.weight_generator(input), self.bias_generator(input) 58 | 59 | 60 | class TaskEmbeddingController(nn.Module): 61 | """Main module controlling task embeddings.""" 62 | 63 | def __init__(self, config): 64 | super(TaskEmbeddingController, self).__init__() 65 | # self.device = config.device 66 | self.task_embedding_dim = config.task_embedding_dim 67 | self.tasks = config.tasks 68 | self.task_to_task_embeddings = {task: task for task in self.tasks} 69 | if config.task_to_embeddings is not None: 70 | self.task_to_task_embeddings = config.task_to_embeddings 71 | self.tasks = self.task_to_task_embeddings.values() 72 | self.set_task_embeddings(self.tasks) 73 | self.train_task_embeddings = config.train_task_embeddings 74 | if self.train_task_embeddings: 75 | self.task_hyper_net = TaskHyperNet(config) 76 | 77 | def get_task(self, task): 78 | return self.task_to_task_embeddings[task] 79 | 80 | def set_task_embeddings(self, tasks): 81 | self.task_to_embeddings = nn.ParameterDict(dict()) 82 | for task in tasks: 83 | task_embedding = torch.Tensor(torch.randn(self.task_embedding_dim)) 
84 | self.task_to_embeddings[task] = nn.Parameter(task_embedding) 85 | 86 | def forward(self, task): 87 | task_mapped = self.get_task(task) 88 | task_embedding = self.task_to_embeddings[task_mapped] 89 | if self.train_task_embeddings: 90 | return self.task_hyper_net(task_embedding) 91 | return task_embedding 92 | -------------------------------------------------------------------------------- /VL-T5/src/adapters/config.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | 4 | @dataclass 5 | class AdapterConfig(object): 6 | """Implements the adapter configuration proposed by Houlsby et. al, 2019 7 | in https://arxiv.org/abs/1902.00751.""" 8 | add_layer_norm_before_adapter: bool = False 9 | add_layer_norm_after_adapter: bool = False 10 | non_linearity: str = "gelu_new" 11 | reduction_factor: int = 16 12 | weight_init_range = 1e-2 13 | # Whether to use conditional layer norms for adapters. 14 | conditional_layer_norm = False 15 | hidden_dim = 128 16 | # Whether to add adapter blocks, this is used in case we need 17 | # to tune only layer norms. 18 | train_adapters_blocks = True 19 | 20 | task_adapter_layers_encoder = None 21 | task_adapter_layers_decoder = None 22 | task_adapter_in_decoder = True 23 | intrinsic_dim = 100 24 | normalize_intrinsic_projections = False 25 | # This can be either random, or fastfood. 26 | intrinsic_projection = "random" 27 | 28 | # Hypercomplex adapters parameters 29 | hypercomplex_adapters = False 30 | hypercomplex_division = 8 31 | learn_phm = True 32 | hypercomplex_nonlinearity="glorot-uniform" 33 | shared_phm_rule = False 34 | factorized_phm = False 35 | shared_W_phm = False 36 | factorized_phm_rule = False 37 | phm_c_init = "normal" 38 | phm_rank = 1 39 | phm_init_range=0.01 40 | 41 | # prefix-tuning parameters. 42 | prefix_dim = 100 43 | init_prefix_from_vocab = False 44 | kronecker_prod = False 45 | 46 | # BitFit configuration. 47 | bitfit = False 48 | 49 | # Low-rank adapters. 50 | low_rank_adapters = False 51 | low_rank_w_init = "glorot-uniform" 52 | low_rank_rank = 1 53 | 54 | # whether using single adapter for all tasks 55 | use_single_adapter = False 56 | 57 | 58 | class MetaAdapterConfig(AdapterConfig): 59 | """Implements Meta adapter in which a hyper-network generates the parameters of 60 | adapter layers. In this case we have a task embeddings which is feed to the 61 | hyper-network to allow it generate the weights for the adapter layers.""" 62 | task_embedding_dim = 512 63 | task_embedding_dir = None 64 | hidden_dim = 128 65 | train_task_embeddings = False 66 | non_linearity: str = "gelu_new" 67 | projected_task_embedding_dim = 64 68 | task_hidden_dim = 128 69 | parametric_task_embedding = False 70 | # If Specified, uses one hypernet to generates the adapters weights. 71 | unique_hyper_net = True 72 | unique_hyper_net_layer_norm = True 73 | # We consider only one hyper-net for all the blocks of transformer. 74 | efficient_unique_hyper_net = False 75 | task_to_embeddings=None 76 | 77 | 78 | @dataclass 79 | class CompactorConfig(object): 80 | add_layer_norm_before_adapter: bool = False 81 | add_layer_norm_after_adapter: bool = False 82 | non_linearity: str = "gelu_new" 83 | reduction_factor: int = 16 84 | weight_init_range = 1e-2 85 | # Whether to use conditional layer norms for adapters. 86 | hidden_dim = 128 87 | # Whether to add adapter blocks, this is used in case we need 88 | # to tune only layer norms. 
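# The defaults further down make this the compacter setting: hypercomplex
# (PHM) adapters enabled with hypercomplex_division=4, shared_phm_rule=True
# and factorized_phm=True. The training scripts can presumably override these
# (scripts/image/single_compacter.sh passes --hypercomplex_division 2 and
# turns sharing/factorization off).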
89 | task_adapter_layers_encoder = None 90 | task_adapter_layers_decoder = None 91 | task_adapter_in_decoder = True 92 | intrinsic_dim = 100 93 | normalize_intrinsic_projections = False 94 | # This can be either random, or fastfood. 95 | intrinsic_projection = "random" 96 | 97 | # Hypercomplex adapters parameters 98 | hypercomplex_adapters = True 99 | hypercomplex_division = 4 100 | train_task_adapters = True 101 | learn_phm = True 102 | hypercomplex_nonlinearity="glorot-uniform" 103 | shared_phm_rule = True 104 | factorized_phm = True 105 | shared_W_phm = False 106 | factorized_phm_rule = False 107 | phm_c_init = "normal" 108 | phm_rank = 1 109 | phm_init_range=0.0001 110 | 111 | # prefix-tuning parameters. 112 | prefix_dim = 100 113 | init_prefix_from_vocab = False 114 | kronecker_prod = False 115 | 116 | # BitFit configuration. 117 | bitfit = False 118 | 119 | # Low-rank adapters. 120 | low_rank_adapters = False 121 | low_rank_w_init = "glorot-uniform" 122 | low_rank_rank = 1 123 | 124 | # whether using single adapter for all tasks 125 | use_single_adapter = False 126 | 127 | 128 | @dataclass 129 | class LRAdapterConfig(object): 130 | add_layer_norm_before_adapter: bool = False 131 | add_layer_norm_after_adapter: bool = False 132 | non_linearity: str = "gelu_new" 133 | reduction_factor: int = 16 134 | weight_init_range = 1e-2 135 | # Whether to use conditional layer norms for adapters. 136 | hidden_dim = 128 137 | # Whether to add adapter blocks, this is used in case we need 138 | # to tune only layer norms. 139 | task_adapter_layers_encoder = None 140 | task_adapter_layers_decoder = None 141 | task_adapter_in_decoder = True 142 | intrinsic_dim = 100 143 | normalize_intrinsic_projections = False 144 | # This can be either random, or fastfood. 145 | intrinsic_projection = "random" 146 | 147 | # Hypercomplex adapters parameters 148 | hypercomplex_adapters = False 149 | hypercomplex_division = 4 150 | train_task_adapters = True 151 | learn_phm = True 152 | hypercomplex_nonlinearity="glorot-uniform" 153 | shared_phm_rule = True 154 | factorized_phm = True 155 | shared_W_phm = False 156 | factorized_phm_rule = False 157 | phm_c_init = "normal" 158 | phm_rank = 1 159 | phm_init_range=0.0001 160 | 161 | # prefix-tuning parameters. 162 | prefix_dim = 100 163 | init_prefix_from_vocab = False 164 | kronecker_prod = False 165 | 166 | # BitFit configuration. 167 | bitfit = False 168 | 169 | # Low-rank adapters. 
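# Unlike CompactorConfig above, this config keeps hypercomplex adapters off
# and enables the low-rank variant below, so each adapter's down/up
# projections are parameterized by LowRankLinear (a rank-`low_rank_rank`
# product of two matrices, see low_rank_layer.py) instead of full or PHM
# layers.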
170 | low_rank_adapters = True 171 | low_rank_w_init = "glorot-uniform" 172 | low_rank_rank = 1 173 | 174 | # whether using single adapter for all tasks 175 | use_single_adapter = False -------------------------------------------------------------------------------- /VL-T5/src/adapters/hypercomplex/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ylsung/VL_adapter/545fcbbdbbaec4c442de35567f6ae477ff4e8265/VL-T5/src/adapters/hypercomplex/__init__.py -------------------------------------------------------------------------------- /VL-T5/src/adapters/hypercomplex/inits.py: -------------------------------------------------------------------------------- 1 | # The codes are from https://github.com/bayer-science-for-a-better-life/phc-gnn 2 | import torch 3 | import math 4 | 5 | 6 | def glorot_normal(tensor: torch.Tensor): 7 | return torch.nn.init.xavier_normal_(tensor, gain=math.sqrt(2)) 8 | 9 | def glorot_uniform(tensor: torch.Tensor): 10 | return torch.nn.init.xavier_uniform_(tensor, gain=math.sqrt(2)) -------------------------------------------------------------------------------- /VL-T5/src/adapters/hypercomplex/kronecker.py: -------------------------------------------------------------------------------- 1 | # The codes are from https://github.com/bayer-science-for-a-better-life/phc-gnn 2 | import torch 3 | 4 | # TODO: change this with torch.kron 5 | """A part of the pylabyk library: numpytorch.py at https://github.com/yulkang/pylabyk""" 6 | def kronecker_product(a, b): 7 | """ 8 | Kronecker product of matrices a and b with leading batch dimensions. 9 | Batch dimensions are broadcast. The number of them mush 10 | :type a: torch.Tensor 11 | :type b: torch.Tensor 12 | :rtype: torch.Tensor 13 | """ 14 | #return torch.stack([torch.kron(ai, bi) for ai, bi in zip(a,b)], dim=0) 15 | siz1 = torch.Size(torch.tensor(a.shape[-2:]) * torch.tensor(b.shape[-2:])) 16 | res = a.unsqueeze(-1).unsqueeze(-3) * b.unsqueeze(-2).unsqueeze(-4) 17 | siz0 = res.shape[:-4] 18 | out = res.reshape(siz0 + siz1) 19 | return out 20 | 21 | 22 | def kronecker_product_einsum_batched(A: torch.Tensor, B: torch.Tensor): 23 | """ 24 | Batched Version of Kronecker Products 25 | :param A: has shape (b, a, c) 26 | :param B: has shape (b, k, p) 27 | :return: (b, ak, cp) 28 | """ 29 | assert A.dim() == 3 and B.dim() == 3 30 | res = torch.einsum('bac,bkp->bakcp', A, B).view(A.size(0), 31 | A.size(1)*B.size(1), 32 | A.size(2)*B.size(2)) 33 | return res -------------------------------------------------------------------------------- /VL-T5/src/adapters/low_rank_layer.py: -------------------------------------------------------------------------------- 1 | """This script implements a low-rank linear layer.""" 2 | import torch 3 | import torch.nn as nn 4 | 5 | from .hypercomplex.inits import glorot_uniform, glorot_normal 6 | 7 | class LowRankLinear(torch.nn.Module): 8 | def __init__(self, input_dim: int, output_dim: int, rank: int = 1, 9 | bias: bool = True, w_init: str = "glorot-uniform"): 10 | super(LowRankLinear, self).__init__() 11 | self.input_dim = input_dim 12 | self.output_dim = output_dim 13 | self.rank = rank 14 | self.bias = bias 15 | self.w_init = w_init 16 | self.W_left = nn.Parameter(torch.Tensor(size=(input_dim, rank)), requires_grad=True) 17 | self.W_right = nn.Parameter(torch.Tensor(size=(rank, output_dim)), requires_grad=True) 18 | if bias: 19 | self.b = nn.Parameter(torch.Tensor(output_dim)) 20 | self.reset_parameters() 21 | 22 | def 
reset_parameters(self): 23 | if self.bias: 24 | self.b.data = torch.zeros_like(self.b.data) 25 | if self.w_init == "glorot-uniform": 26 | self.W_left.data = glorot_uniform(self.W_left.data) 27 | self.W_right.data = glorot_uniform(self.W_right.data) 28 | elif self.w_init == "glorot-normal": 29 | self.W_left.data = glorot_normal(self.W_left.data) 30 | self.W_right.data = glorot_normal(self.W_right.data) 31 | else: 32 | raise ValueError 33 | 34 | def forward(self, x: torch.Tensor) -> torch.Tensor: 35 | W = self.W_left.matmul(self.W_right) 36 | output = torch.matmul(input=x, other=W) 37 | if self.bias: 38 | output += self.b 39 | return output 40 | -------------------------------------------------------------------------------- /VL-T5/src/caption_model.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | import torch.nn as nn 4 | 5 | 6 | from modeling_t5 import VLT5 7 | class VLT5COCOCaption(VLT5): 8 | def __init__(self, config): 9 | super().__init__(config) 10 | 11 | def train_step(self, batch): 12 | device = next(self.parameters()).device 13 | 14 | batch = self.vis_forward(batch, device) 15 | task = batch["task"] 16 | vis_feats = batch['vis_feats'].to(device) 17 | input_ids = batch['input_ids'].to(device) 18 | vis_pos = batch['boxes'].to(device) 19 | 20 | lm_labels = batch["target_ids"].to(device) 21 | 22 | reduce_loss = True 23 | output = self( 24 | input_ids=input_ids, 25 | vis_inputs=(vis_feats, vis_pos), 26 | labels=lm_labels, 27 | reduce_loss=reduce_loss, 28 | task=task, 29 | ) 30 | 31 | lm_mask = lm_labels != -100 32 | B, L = lm_labels.size() 33 | 34 | loss = output['loss'] 35 | 36 | result = { 37 | 'loss': loss 38 | } 39 | return result 40 | 41 | def test_step(self, batch, **kwargs): 42 | device = next(self.parameters()).device 43 | 44 | batch = self.vis_forward(batch, device) 45 | task = batch["task"] 46 | vis_feats = batch['vis_feats'].to(device) 47 | input_ids = batch['input_ids'].to(device) 48 | vis_pos = batch['boxes'].to(device) 49 | 50 | output = self.generate( 51 | input_ids=input_ids, 52 | vis_inputs=(vis_feats, vis_pos), 53 | task=task, 54 | **kwargs, 55 | ) 56 | 57 | generated_sents = self.tokenizer.batch_decode(output, skip_special_tokens=True) 58 | 59 | result = {} 60 | result['pred'] = generated_sents 61 | 62 | return result 63 | 64 | 65 | from modeling_bart import VLBart 66 | class VLBartCOCOCaption(VLBart): 67 | def __init__(self, config): 68 | super().__init__(config) 69 | 70 | def train_step(self, batch): 71 | device = next(self.parameters()).device 72 | 73 | batch = self.vis_forward(batch, device) 74 | task = batch["task"] 75 | vis_feats = batch['vis_feats'].to(device) 76 | input_ids = batch['input_ids'].to(device) 77 | vis_pos = batch['boxes'].to(device) 78 | 79 | lm_labels = batch["target_ids"].to(device) 80 | 81 | reduce_loss = True 82 | output = self( 83 | input_ids=input_ids, 84 | vis_inputs=(vis_feats, vis_pos), 85 | labels=lm_labels, 86 | reduce_loss=reduce_loss, 87 | task=task, 88 | ) 89 | 90 | lm_mask = lm_labels != -100 91 | B, L = lm_labels.size() 92 | 93 | loss = output['loss'] 94 | 95 | result = { 96 | 'loss': loss 97 | } 98 | return result 99 | 100 | def test_step(self, batch, **kwargs): 101 | device = next(self.parameters()).device 102 | 103 | batch = self.vis_forward(batch, device) 104 | task = batch["task"] 105 | vis_feats = batch['vis_feats'].to(device) 106 | input_ids = batch['input_ids'].to(device) 107 | vis_pos = batch['boxes'].to(device) 108 | 109 | output = self.generate( 110 | 
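# decoding options such as num_beams or max_length arrive through **kwargs and
# are passed straight to generate(), alongside the visual inputs and task name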
input_ids=input_ids, 111 | vis_inputs=(vis_feats, vis_pos), 112 | task=task, 113 | **kwargs 114 | ) 115 | 116 | generated_sents = self.tokenizer.batch_decode(output, skip_special_tokens=True) 117 | 118 | result = {} 119 | result['pred'] = generated_sents 120 | 121 | return result -------------------------------------------------------------------------------- /VL-T5/src/clip/__init__.py: -------------------------------------------------------------------------------- 1 | from .clip import * 2 | -------------------------------------------------------------------------------- /VL-T5/src/clip/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ylsung/VL_adapter/545fcbbdbbaec4c442de35567f6ae477ff4e8265/VL-T5/src/clip/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /VL-T5/src/clip/simple_tokenizer.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import html 3 | import os 4 | from functools import lru_cache 5 | 6 | import ftfy 7 | import regex as re 8 | 9 | 10 | @lru_cache() 11 | def default_bpe(): 12 | return os.path.join(os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz") 13 | 14 | 15 | @lru_cache() 16 | def bytes_to_unicode(): 17 | """ 18 | Returns list of utf-8 byte and a corresponding list of unicode strings. 19 | The reversible bpe codes work on unicode strings. 20 | This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. 21 | When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. 22 | This is a signficant percentage of your normal, say, 32K bpe vocab. 23 | To avoid that, we want lookup tables between utf-8 bytes and unicode strings. 24 | And avoids mapping to whitespace/control characters the bpe code barfs on. 25 | """ 26 | bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) 27 | cs = bs[:] 28 | n = 0 29 | for b in range(2**8): 30 | if b not in bs: 31 | bs.append(b) 32 | cs.append(2**8+n) 33 | n += 1 34 | cs = [chr(n) for n in cs] 35 | return dict(zip(bs, cs)) 36 | 37 | 38 | def get_pairs(word): 39 | """Return set of symbol pairs in a word. 40 | Word is represented as tuple of symbols (symbols being variable-length strings). 
41 | """ 42 | pairs = set() 43 | prev_char = word[0] 44 | for char in word[1:]: 45 | pairs.add((prev_char, char)) 46 | prev_char = char 47 | return pairs 48 | 49 | 50 | def basic_clean(text): 51 | text = ftfy.fix_text(text) 52 | text = html.unescape(html.unescape(text)) 53 | return text.strip() 54 | 55 | 56 | def whitespace_clean(text): 57 | text = re.sub(r'\s+', ' ', text) 58 | text = text.strip() 59 | return text 60 | 61 | 62 | class SimpleTokenizer(object): 63 | def __init__(self, bpe_path: str = default_bpe()): 64 | self.byte_encoder = bytes_to_unicode() 65 | self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} 66 | merges = gzip.open(bpe_path).read().decode("utf-8").split('\n') 67 | merges = merges[1:49152-256-2+1] 68 | merges = [tuple(merge.split()) for merge in merges] 69 | vocab = list(bytes_to_unicode().values()) 70 | vocab = vocab + [v+'' for v in vocab] 71 | for merge in merges: 72 | vocab.append(''.join(merge)) 73 | vocab.extend(['<|startoftext|>', '<|endoftext|>']) 74 | self.encoder = dict(zip(vocab, range(len(vocab)))) 75 | self.decoder = {v: k for k, v in self.encoder.items()} 76 | self.bpe_ranks = dict(zip(merges, range(len(merges)))) 77 | self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'} 78 | self.pat = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE) 79 | 80 | def bpe(self, token): 81 | if token in self.cache: 82 | return self.cache[token] 83 | word = tuple(token[:-1]) + ( token[-1] + '',) 84 | pairs = get_pairs(word) 85 | 86 | if not pairs: 87 | return token+'' 88 | 89 | while True: 90 | bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf'))) 91 | if bigram not in self.bpe_ranks: 92 | break 93 | first, second = bigram 94 | new_word = [] 95 | i = 0 96 | while i < len(word): 97 | try: 98 | j = word.index(first, i) 99 | new_word.extend(word[i:j]) 100 | i = j 101 | except: 102 | new_word.extend(word[i:]) 103 | break 104 | 105 | if word[i] == first and i < len(word)-1 and word[i+1] == second: 106 | new_word.append(first+second) 107 | i += 2 108 | else: 109 | new_word.append(word[i]) 110 | i += 1 111 | new_word = tuple(new_word) 112 | word = new_word 113 | if len(word) == 1: 114 | break 115 | else: 116 | pairs = get_pairs(word) 117 | word = ' '.join(word) 118 | self.cache[token] = word 119 | return word 120 | 121 | def encode(self, text): 122 | bpe_tokens = [] 123 | text = whitespace_clean(basic_clean(text)).lower() 124 | for token in re.findall(self.pat, text): 125 | token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) 126 | bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' ')) 127 | return bpe_tokens 128 | 129 | def decode(self, tokens): 130 | text = ''.join([self.decoder[token] for token in tokens]) 131 | text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('', ' ') 132 | return text 133 | -------------------------------------------------------------------------------- /VL-T5/src/gqa_model.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | import torch.nn as nn 4 | import numpy as np 5 | 6 | 7 | from modeling_t5 import VLT5 8 | class VLT5GQA(VLT5): 9 | def __init__(self, config): 10 | super().__init__(config) 11 | 12 | def train_step(self, batch): 13 | 14 | device = next(self.parameters()).device 15 | 16 | batch = self.vis_forward(batch, device) 17 | task = batch["task"] 18 | 
vis_feats = batch['vis_feats'].to(device) 19 | input_ids = batch['input_ids'].to(device) 20 | vis_pos = batch['boxes'].to(device) 21 | 22 | lm_labels = batch["target_ids"].to(device) 23 | 24 | output = self( 25 | input_ids=input_ids, 26 | vis_inputs=(vis_feats, vis_pos), 27 | labels=lm_labels, 28 | return_dict=True, 29 | task=task, 30 | ) 31 | assert 'loss' in output 32 | 33 | lm_mask = lm_labels != -100 34 | B, L = lm_labels.size() 35 | 36 | loss = output['loss'] 37 | 38 | loss = loss.view(B, L) * lm_mask 39 | 40 | loss = loss.sum(dim=1) / lm_mask.sum(dim=1).clamp(min=1) # B 41 | 42 | loss = loss.mean() 43 | 44 | result = { 45 | 'loss': loss 46 | } 47 | return result 48 | 49 | def test_step(self, batch, **kwargs): 50 | device = next(self.parameters()).device 51 | 52 | batch = self.vis_forward(batch, device) 53 | task = batch["task"] 54 | vis_feats = batch['vis_feats'].to(device) 55 | input_ids = batch['input_ids'].to(device) 56 | vis_pos = batch['boxes'].to(device) 57 | 58 | 59 | output = self.generate( 60 | input_ids=input_ids, 61 | vis_inputs=(vis_feats, vis_pos), 62 | task=task, 63 | **kwargs 64 | ) 65 | 66 | generated_sents = self.tokenizer.batch_decode(output, skip_special_tokens=True) 67 | 68 | result = {} 69 | result['pred_ans'] = generated_sents 70 | 71 | return result 72 | 73 | 74 | from modeling_bart import VLBart 75 | class VLBartGQA(VLBart): 76 | def __init__(self, config): 77 | super().__init__(config) 78 | 79 | def train_step(self, batch): 80 | 81 | device = next(self.parameters()).device 82 | 83 | batch = self.vis_forward(batch, device) 84 | task = batch["task"] 85 | vis_feats = batch['vis_feats'].to(device) 86 | input_ids = batch['input_ids'].to(device) 87 | vis_pos = batch['boxes'].to(device) 88 | 89 | lm_labels = batch["target_ids"].to(device) 90 | 91 | output = self( 92 | input_ids=input_ids, 93 | vis_inputs=(vis_feats, vis_pos), 94 | labels=lm_labels, 95 | return_dict=True, 96 | task=task, 97 | ) 98 | assert 'loss' in output 99 | 100 | lm_mask = lm_labels != -100 101 | B, L = lm_labels.size() 102 | 103 | loss = output['loss'] 104 | 105 | loss = loss.view(B, L) * lm_mask 106 | 107 | loss = loss.sum(dim=1) / lm_mask.sum(dim=1).clamp(min=1) # B 108 | 109 | loss = loss.mean() 110 | 111 | result = { 112 | 'loss': loss 113 | } 114 | return result 115 | 116 | def test_step(self, batch, **kwargs): 117 | device = next(self.parameters()).device 118 | 119 | batch = self.vis_forward(batch, device) 120 | task = batch["task"] 121 | vis_feats = batch['vis_feats'].to(device) 122 | input_ids = batch['input_ids'].to(device) 123 | vis_pos = batch['boxes'].to(device) 124 | 125 | 126 | output = self.generate( 127 | input_ids=input_ids, 128 | vis_inputs=(vis_feats, vis_pos), 129 | task=task, 130 | **kwargs 131 | ) 132 | 133 | generated_sents = self.tokenizer.batch_decode(output, skip_special_tokens=True) 134 | 135 | result = {} 136 | result['pred_ans'] = generated_sents 137 | 138 | return result 139 | -------------------------------------------------------------------------------- /VL-T5/src/lora/__init__.py: -------------------------------------------------------------------------------- 1 | # The codes in the folder are copied from https://github.com/microsoft/LoRA/tree/aa68d8a021c7ba08973e35fdfdc76338fdbfad57/loralib 2 | 3 | name = "lora" 4 | 5 | from .layers import * 6 | from .utils import * 7 | from .config import * 8 | from .controller import LoRALinearController -------------------------------------------------------------------------------- /VL-T5/src/lora/config.py: 
-------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | 4 | @dataclass 5 | class LoraConfig(object): 6 | lora_dim = 4 7 | lora_alpha = 32 8 | lora_dropout = 0.1 9 | -------------------------------------------------------------------------------- /VL-T5/src/lora/controller.py: -------------------------------------------------------------------------------- 1 | """Implements Adapter Controller, a module that keeps multiple 2 | layers of Adapters, and controls which adapter layer to use.""" 3 | import os 4 | import math 5 | import torch 6 | import torch.nn as nn 7 | import torch.nn.functional as F 8 | from .layers import LoRALayer 9 | 10 | 11 | class LoRALinearController(nn.Linear, LoRALayer): 12 | """Implements Adapter controller module which controls the logics of 13 | putting adapter layers within transformer's layers.""" 14 | 15 | def __init__( 16 | self, 17 | in_features: int, 18 | out_features: int, 19 | fan_in_fan_out: bool = False, # Set this to True if the layer to replace stores weight like (fan_in, fan_out) 20 | config=None, 21 | **kwargs 22 | ): 23 | nn.Linear.__init__(self, in_features, out_features, **kwargs) 24 | 25 | self.tasks = config.tasks 26 | self.use_single_lora = config.use_single_lora 27 | 28 | r = config.lora_dim 29 | lora_alpha = config.lora_alpha 30 | lora_dropout = config.lora_dropout 31 | 32 | LoRALayer.__init__(self, r=r, lora_alpha=lora_alpha, lora_dropout=lora_dropout, 33 | merge_weights=True) 34 | 35 | self.fan_in_fan_out = fan_in_fan_out 36 | self.lora_As = nn.ParameterDict(dict()) 37 | self.lora_Bs = nn.ParameterDict(dict()) 38 | # Actual trainable parameters 39 | if r > 0: 40 | self.lora_As, self.lora_Bs = self.construct_lora_weights(self.tasks) 41 | self.scaling = self.lora_alpha / self.r 42 | # Freezing the pre-trained weight matrix 43 | self.weight.requires_grad = False 44 | self.reset_parameters() 45 | if fan_in_fan_out: 46 | self.weight.data = self.weight.data.T 47 | 48 | def reset_parameters(self): 49 | nn.Linear.reset_parameters(self) 50 | if hasattr(self, 'lora_As'): 51 | # initialize A the same way as the default for nn.Linear and B to zero 52 | for task in self.tasks: 53 | nn.init.kaiming_uniform_(self.lora_As[task], a=math.sqrt(5)) 54 | nn.init.zeros_(self.lora_Bs[task]) 55 | 56 | def forward(self, x, task): 57 | def T(w): 58 | return w.T if self.fan_in_fan_out else w 59 | 60 | result = F.linear(x, T(self.weight), bias=self.bias) 61 | 62 | lora_A = self.lora_As[task] 63 | lora_B = self.lora_Bs[task] 64 | 65 | if self.training: 66 | result += (self.lora_dropout(x) @ lora_A.T @ lora_B.T) * self.scaling 67 | else: 68 | result += (x @ lora_A.T @ lora_B.T) * self.scaling 69 | 70 | return result 71 | 72 | def get_task(self, task): 73 | return task 74 | 75 | def construct_lora_weights(self, tasks): 76 | if self.use_single_lora: 77 | lora_A = nn.Parameter(self.weight.new_zeros((self.r, self.in_features))) 78 | lora_B = nn.Parameter(self.weight.new_zeros((self.out_features, self.r))) 79 | for task in tasks: 80 | self.lora_As[task] = lora_A 81 | self.lora_Bs[task] = lora_B 82 | else: 83 | for task in tasks: 84 | self.lora_As[task] = nn.Parameter(self.weight.new_zeros((self.r, self.in_features))) 85 | self.lora_Bs[task] = nn.Parameter(self.weight.new_zeros((self.out_features, self.r))) 86 | 87 | return self.lora_As, self.lora_Bs 88 | -------------------------------------------------------------------------------- /VL-T5/src/lora/utils.py: 
-------------------------------------------------------------------------------- 1 | # ------------------------------------------------------------------------------------------ 2 | # Copyright (c) Microsoft Corporation. All rights reserved. 3 | # Licensed under the MIT License (MIT). See LICENSE in the repo root for license information. 4 | # ------------------------------------------------------------------------------------------ 5 | import torch 6 | import torch.nn as nn 7 | 8 | from typing import Dict 9 | 10 | from .layers import LoRALayer 11 | 12 | 13 | def mark_only_lora_as_trainable(model: nn.Module, bias: str = 'none') -> None: 14 | for n, p in model.named_parameters(): 15 | if 'lora_' not in n: 16 | p.requires_grad = False 17 | if bias == 'none': 18 | return 19 | elif bias == 'all': 20 | for n, p in model.named_parameters(): 21 | if 'bias' in n: 22 | p.requires_grad = True 23 | elif bias == 'lora_only': 24 | for m in model.modules(): 25 | if isinstance(m, LoRALayer) and \ 26 | hasattr(m, 'bias') and \ 27 | m.bias is not None: 28 | m.bias.requires_grad = True 29 | else: 30 | raise NotImplementedError 31 | 32 | 33 | def lora_state_dict(model: nn.Module, bias: str = 'none') -> Dict[str, torch.Tensor]: 34 | my_state_dict = model.state_dict() 35 | if bias == 'none': 36 | return {k: my_state_dict[k] for k in my_state_dict if 'lora_' in k} 37 | elif bias == 'all': 38 | return {k: my_state_dict[k] for k in my_state_dict if 'lora_' in k or 'bias' in k} 39 | elif bias == 'lora_only': 40 | to_return = {} 41 | for k in my_state_dict: 42 | if 'lora_' in k: 43 | to_return[k] = my_state_dict[k] 44 | bias_name = k.split('lora_')[0]+'bias' 45 | if bias_name in my_state_dict: 46 | to_return[bias_name] = my_state_dict[bias_name] 47 | return to_return 48 | else: 49 | raise NotImplementedError -------------------------------------------------------------------------------- /VL-T5/src/mmt_model.py: -------------------------------------------------------------------------------- 1 | 2 | import torch 3 | import torch.nn as nn 4 | 5 | 6 | from modeling_t5 import VLT5 7 | class VLT5MMT(VLT5): 8 | def __init__(self, config): 9 | super().__init__(config) 10 | 11 | def train_step(self, batch): 12 | 13 | device = next(self.parameters()).device 14 | vis_feats = batch['vis_feats'].to(device) 15 | input_ids = batch['input_ids'].to(device) 16 | vis_pos = batch['boxes'].to(device) 17 | 18 | vis_attention_mask = batch['vis_attention_mask'].to(device) 19 | 20 | lm_labels = batch["target_ids"].to(device) 21 | 22 | output = self( 23 | input_ids=input_ids, 24 | vis_inputs=(vis_feats, vis_pos), 25 | vis_attention_mask=vis_attention_mask, 26 | labels=lm_labels, 27 | reduce_loss=True, 28 | return_dict=True 29 | ) 30 | 31 | loss = output['loss'] 32 | 33 | result = { 34 | 'loss': loss 35 | } 36 | return result 37 | 38 | def test_step(self, batch, **kwargs): 39 | device = next(self.parameters()).device 40 | vis_feats = batch['vis_feats'].to(device) 41 | input_ids = batch['input_ids'].to(device) 42 | vis_pos = batch['boxes'].to(device) 43 | 44 | vis_attention_mask = batch['vis_attention_mask'].to(device) 45 | 46 | output = self.generate( 47 | input_ids=input_ids, 48 | vis_inputs=(vis_feats, vis_pos), 49 | vis_attention_mask=vis_attention_mask, 50 | **kwargs 51 | ) 52 | 53 | generated_sents = self.tokenizer.batch_decode(output, skip_special_tokens=True) 54 | 55 | result = {} 56 | result['pred'] = generated_sents 57 | 58 | return result 59 | 60 | 61 | from modeling_bart import VLBart 62 | class VLBartMMT(VLBart): 63 | def 
__init__(self, config): 64 | super().__init__(config) 65 | 66 | def train_step(self, batch): 67 | 68 | device = next(self.parameters()).device 69 | vis_feats = batch['vis_feats'].to(device) 70 | input_ids = batch['input_ids'].to(device) 71 | vis_pos = batch['boxes'].to(device) 72 | 73 | vis_attention_mask = batch['vis_attention_mask'].to(device) 74 | 75 | lm_labels = batch["target_ids"].to(device) 76 | 77 | output = self( 78 | input_ids=input_ids, 79 | vis_inputs=(vis_feats, vis_pos), 80 | vis_attention_mask=vis_attention_mask, 81 | labels=lm_labels, 82 | reduce_loss=True, 83 | return_dict=True 84 | ) 85 | 86 | loss = output['loss'] 87 | 88 | result = { 89 | 'loss': loss 90 | } 91 | return result 92 | 93 | def test_step(self, batch, **kwargs): 94 | device = next(self.parameters()).device 95 | vis_feats = batch['vis_feats'].to(device) 96 | input_ids = batch['input_ids'].to(device) 97 | vis_pos = batch['boxes'].to(device) 98 | 99 | vis_attention_mask = batch['vis_attention_mask'].to(device) 100 | 101 | output = self.generate( 102 | input_ids=input_ids, 103 | vis_inputs=(vis_feats, vis_pos), 104 | vis_attention_mask=vis_attention_mask, 105 | **kwargs 106 | ) 107 | 108 | generated_sents = self.tokenizer.batch_decode(output, skip_special_tokens=True) 109 | 110 | result = {} 111 | result['pred'] = generated_sents 112 | 113 | return result 114 | -------------------------------------------------------------------------------- /VL-T5/src/multitask_data.py: -------------------------------------------------------------------------------- 1 | import more_itertools 2 | from typing import Any, Dict, Iterable, Union, List, Mapping 3 | import vqa_data 4 | import refcoco_data 5 | import itertools 6 | import random 7 | 8 | class MultitaskLoader(object): 9 | def __init__(self, loaders, shuffle=True, drop_last=False, sampling='roundrobin', n_batches=None, verbose=True): 10 | self.loaders = loaders 11 | self.verbose = verbose 12 | # self.loader_lens = [len(loader) for loader in self.loaders] 13 | self.task2len = {loader.task: len(loader) for loader in self.loaders} 14 | if self.verbose: 15 | print('Task2len:', self.task2len) 16 | self.task2loader = {loader.task: loader for loader in self.loaders} 17 | # print('loader lens:', self.loader_lens) 18 | 19 | self.shuffle = shuffle 20 | self.drop_last = drop_last 21 | self.sampling = sampling 22 | self.epoch_tasks = None 23 | self.n_batches = n_batches 24 | self.set_epoch(0) 25 | # print('loader indices:', self.loader_indices) 26 | 27 | def __iter__(self): 28 | self.task2iter = {loader.task: iter(loader) for loader in self.loaders} 29 | # self.loader_iters = [iter(loader) for loader in self.loaders] 30 | 31 | return self 32 | 33 | def set_epoch(self, epoch): 34 | for loader in self.loaders: 35 | loader.sampler.set_epoch(epoch) 36 | 37 | if self.sampling == 'roundrobin': 38 | epoch_tasks = [] 39 | for task, loader in self.task2loader.items(): 40 | n_batches = len(loader) 41 | epoch_tasks.extend([task]*n_batches) 42 | elif self.sampling == 'balanced': 43 | if self.n_batches is None: 44 | n_batches = sum(self.task2len.values()) // len(self.loaders) 45 | else: 46 | n_batches = self.n_batches 47 | if self.verbose: 48 | print('# batches:', n_batches) 49 | epoch_tasks = [] 50 | for task, loader in self.task2loader.items(): 51 | epoch_tasks.extend([task]*n_batches) 52 | 53 | if self.shuffle: 54 | random.Random(epoch).shuffle(epoch_tasks) 55 | self.epoch_tasks = epoch_tasks 56 | if self.verbose: 57 | print('# epoch_tasks:', len(self.epoch_tasks)) 58 | 59 | def __next__(self): 
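# pops the next task scheduled for this epoch and draws one batch from that
# task's iterator; StopIteration ends the epoch once the schedule is exhausted.
# A minimal usage sketch, assuming each wrapped loader exposes `.task` and a
# DistributedSampler-style `.sampler.set_epoch()` (vqa_loader / gqa_loader are
# placeholder names):
#   loader = MultitaskLoader([vqa_loader, gqa_loader], sampling='roundrobin')
#   for epoch in range(num_epochs):
#       loader.set_epoch(epoch)
#       for batch in loader:
#           ...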
60 | if len(self.epoch_tasks) > 0: 61 | task = self.epoch_tasks.pop() 62 | loader_iter = self.task2iter[task] 63 | return next(loader_iter) 64 | else: 65 | raise StopIteration 66 | 67 | def __len__(self): 68 | return len(self.epoch_tasks) 69 | 70 | 71 | 72 | def _chunked_iterator(i: Iterable, chunk_size: int, drop_last: bool): 73 | chunks = more_itertools.chunked(i, chunk_size) 74 | if drop_last: 75 | return (chunk for chunk in chunks if len(chunk) == chunk_size) 76 | else: 77 | return chunks 78 | -------------------------------------------------------------------------------- /VL-T5/src/my_deepspeed.py: -------------------------------------------------------------------------------- 1 | # from transformers.deepspeed import HfDeepSpeedConfig 2 | import json 3 | 4 | def deepspeed_init(trainer, resume_from_checkpoint=None): 5 | """ 6 | Init DeepSpeed, after updating the DeepSpeed configuration with any relevant Trainer's args. 7 | If ``resume_from_checkpoint`` was passed then an attempt to resume from a previously saved checkpoint will be made. 8 | Args: 9 | trainer: Trainer object 10 | num_training_steps: per single gpu 11 | resume_from_checkpoint: path to a checkpoint if to resume from after normal DeepSpeedEngine load 12 | Returns: model, optimizer, lr_scheduler 13 | """ 14 | import deepspeed 15 | from deepspeed.utils import logger as ds_logger 16 | 17 | model = trainer.model 18 | args = trainer.args 19 | 20 | optimizer = trainer.optim 21 | lr_scheduler = trainer.lr_scheduler 22 | 23 | with open(args.deepspeed, "r") as f: 24 | ds_config = json.load(f) 25 | 26 | if args.fp16: 27 | ds_config["fp16"] = {"enabled": True, "loss_scale": 0} 28 | 29 | ds_config["gradient_clipping"] = args.clip_grad_norm 30 | ds_config["train_micro_batch_size_per_gpu"] = args.batch_size 31 | ds_config["zero_allow_untested_optimizer"] = True 32 | 33 | # hf_deepspeed_config = args.hf_deepspeed_config 34 | # hf_deepspeed_config.trainer_config_finalize(args, model, num_training_steps) 35 | 36 | # resume config update - some bits like `model` and `num_training_steps` only become available during train 37 | # config = HfDeepSpeedConfig(args.deepspeed) 38 | config = ds_config 39 | 40 | # keep for quick debug: 41 | # from pprint import pprint; pprint(config) 42 | 43 | # set the Deepspeed log level consistent with the trainer 44 | # ds_logger.setLevel(args.get_process_log_level()) 45 | 46 | model_parameters = filter(lambda p: p.requires_grad, model.parameters()) 47 | 48 | model, optimizer, _, lr_scheduler = deepspeed.initialize( 49 | model=model, 50 | model_parameters=model_parameters, 51 | config_params=config, 52 | optimizer=optimizer, 53 | lr_scheduler=lr_scheduler, 54 | ) 55 | 56 | if resume_from_checkpoint is not None: 57 | 58 | # it's possible that the user is trying to resume from model_path, which doesn't necessarily 59 | # contain a deepspeed checkpoint. e.g. examples just check if the dir exists and assume it's 60 | # a resume from a checkpoint and not just a local pretrained weight. 
So we check here if the 61 | # path contains what looks like a deepspeed checkpoint 62 | import glob 63 | 64 | deepspeed_checkpoint_dirs = sorted(glob.glob(f"{resume_from_checkpoint}/global_step*")) 65 | 66 | if len(deepspeed_checkpoint_dirs) > 0: 67 | logger.info(f"Attempting to resume from {resume_from_checkpoint}") 68 | # this magically updates self.optimizer and self.lr_scheduler 69 | load_path, _ = model.load_checkpoint( 70 | resume_from_checkpoint, load_optimizer_states=True, load_lr_scheduler_states=True 71 | ) 72 | if load_path is None: 73 | raise ValueError(f"[deepspeed] failed to resume from checkpoint {resume_from_checkpoint}") 74 | else: 75 | logger.info(f"{resume_from_checkpoint} doesn't have deepspeed checkpoints, doing nothing") 76 | 77 | return model, optimizer, lr_scheduler 78 | -------------------------------------------------------------------------------- /VL-T5/src/my_transformers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/ylsung/VL_adapter/545fcbbdbbaec4c442de35567f6ae477ff4e8265/VL-T5/src/my_transformers/__init__.py -------------------------------------------------------------------------------- /VL-T5/src/prompt/__init__.py: -------------------------------------------------------------------------------- 1 | from .config import EncoderPromptConfig, DecoderPromptConfig 2 | from .prompt_controller import PromptController -------------------------------------------------------------------------------- /VL-T5/src/prompt/config.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | 3 | 4 | @dataclass 5 | class EncoderPromptConfig(object): 6 | seq_len = 0 7 | input_dim = 768 8 | mid_dim = 768 9 | use_input_prompt = True 10 | use_single_prompt = False 11 | 12 | @dataclass 13 | class DecoderPromptConfig(object): 14 | seq_len = 0 15 | input_dim = 768 16 | mid_dim = 768 17 | use_input_prompt = True 18 | use_single_prompt = False -------------------------------------------------------------------------------- /VL-T5/src/prompt/prompt_controller.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from .prompt_modeling import InputPrompts 4 | 5 | 6 | class PromptController(nn.Module): 7 | """Implements Adapter controller module which controls the logics of 8 | putting adapter layers within transformer's layers.""" 9 | 10 | def __init__(self, config): 11 | super().__init__() 12 | self.config = config 13 | self.prompts = nn.ModuleDict(dict()) 14 | self.tasks = config.tasks 15 | self.use_input_prompt = config.use_input_prompt 16 | self.use_single_prompt = config.use_single_prompt 17 | self.prompts = self.construct_prompts(self.tasks) 18 | 19 | def get_task(self, task): 20 | return task 21 | 22 | def construct_prompts(self, tasks): 23 | """ 24 | Constructs adapter layers and adds them to a dictionary for the given 25 | tasks. 26 | Args: 27 | tasks: A list of string containing the task names. 
28 | """ 29 | 30 | if self.use_single_prompt: 31 | if self.use_input_prompt: 32 | prompt = InputPrompts(self.config) 33 | 34 | for task in tasks: 35 | self.prompts[task] = prompt 36 | 37 | else: 38 | for task in tasks: 39 | if self.use_input_prompt: 40 | prompt = InputPrompts(self.config) 41 | 42 | self.prompts[task] = prompt 43 | 44 | return self.prompts 45 | 46 | def convert_to_list(self, tasks): 47 | if isinstance(tasks, list): 48 | return tasks 49 | return [tasks] 50 | 51 | def get_prompt(self, task): 52 | """Given a task returns its corresponding adapter layer. 53 | Args: 54 | task: Input task name. 55 | Returns: 56 | Adapter layer corresponding to the given task. 57 | """ 58 | return self.prompts[task] 59 | 60 | def forward(self, bsz, device, task): 61 | """ 62 | Retrieves the adapter layer corresponding to the given 63 | task. It freezes the adapter layers for all the other tasks 64 | and call the selected adapter layer. 65 | Args: 66 | task: the name of the current task. 67 | inputs: the inputs to feed in in the adapter layer. 68 | Returns: 69 | outputs of the adapter layer. 70 | """ 71 | task = self.get_task(task) 72 | # Enables the adapter layer for the given task. 73 | prompt_module = self.get_prompt(task) 74 | 75 | trainable_prompt = prompt_module.get_prompt(bsz, device) 76 | 77 | return trainable_prompt 78 | 79 | -------------------------------------------------------------------------------- /VL-T5/src/prompt/prompt_modeling.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | class InputPrompts(nn.Module): 5 | def __init__(self, config): 6 | super().__init__() 7 | 8 | self.prompt_len = config.prompt_len 9 | self.input_dim = config.input_dim 10 | self.mid_dim = config.mid_dim 11 | 12 | self.prefix_tokens = torch.arange(self.prompt_len).long() 13 | self.prefix_embedding = nn.Sequential( 14 | nn.Embedding(self.prompt_len, self.input_dim), 15 | nn.Linear(self.input_dim, self.mid_dim), 16 | nn.Tanh(), 17 | nn.Linear(self.mid_dim, self.input_dim), 18 | ) 19 | 20 | def get_prompt(self, bsz, device): 21 | input_tokens = self.prefix_tokens.unsqueeze(0).expand(bsz, -1).to(device) # (B, L) 22 | prefix_prompt = self.prefix_embedding(input_tokens) # (B, L, d_model * n_heads * n_layer) 23 | 24 | return prefix_prompt -------------------------------------------------------------------------------- /VL-T5/src/utils.py: -------------------------------------------------------------------------------- 1 | import re 2 | import numpy as np 3 | import torch 4 | import torch.distributed as dist 5 | import collections 6 | import logging 7 | 8 | def get_area(pos): 9 | """ 10 | Args 11 | pos: [B, N, 4] 12 | (x1, x2, y1, y2) 13 | 14 | Return 15 | area : [B, N] 16 | """ 17 | # [B, N] 18 | height = pos[:, :, 3] - pos[:, :, 2] 19 | width = pos[:, :, 1] - pos[:, :, 0] 20 | area = height * width 21 | return area 22 | 23 | def get_relative_distance(pos): 24 | """ 25 | Args 26 | pos: [B, N, 4] 27 | (x1, x2, y1, y2) 28 | 29 | Return 30 | out : [B, N, N, 4] 31 | """ 32 | # B, N = pos.size()[:-1] 33 | 34 | # [B, N, N, 4] 35 | relative_distance = pos.unsqueeze(1) - pos.unsqueeze(2) 36 | 37 | return relative_distance 38 | 39 | 40 | class LossMeter(object): 41 | def __init__(self, maxlen=100): 42 | """Computes and stores the running average""" 43 | self.vals = collections.deque([], maxlen=maxlen) 44 | 45 | def __len__(self): 46 | return len(self.vals) 47 | 48 | def update(self, new_val): 49 | self.vals.append(new_val) 50 | 51 | 
@property 52 | def val(self): 53 | return sum(self.vals) / len(self.vals) 54 | 55 | def __repr__(self): 56 | return str(self.val) 57 | 58 | 59 | def count_parameters(model): 60 | return sum(p.numel() for p in model.parameters() if p.requires_grad) 61 | 62 | 63 | def load_state_dict(state_dict_path, loc='cpu'): 64 | state_dict = torch.load(state_dict_path, map_location=loc) 65 | # Change Multi GPU to single GPU 66 | original_keys = list(state_dict.keys()) 67 | for key in original_keys: 68 | if key.startswith("module."): 69 | new_key = key[len("module."):] 70 | state_dict[new_key] = state_dict.pop(key) 71 | return state_dict 72 | 73 | 74 | def set_global_logging_level(level=logging.ERROR, prefices=[""]): 75 | """ 76 | Override logging levels of different modules based on their name as a prefix. 77 | It needs to be invoked after the modules have been loaded so that their loggers have been initialized. 78 | 79 | Args: 80 | - level: desired level. e.g. logging.INFO. Optional. Default is logging.ERROR 81 | - prefices: list of one or more str prefices to match (e.g. ["transformers", "torch"]). Optional. 82 | Default is `[""]` to match all active loggers. 83 | The match is a case-sensitive `module_name.startswith(prefix)` 84 | """ 85 | prefix_re = re.compile(fr'^(?:{ "|".join(prefices) })') 86 | for name in logging.root.manager.loggerDict: 87 | if re.match(prefix_re, name): 88 | logging.getLogger(name).setLevel(level) 89 | 90 | 91 | def get_iou(anchors, gt_boxes): 92 | """ 93 | anchors: (N, 4) torch floattensor 94 | gt_boxes: (K, 4) torch floattensor 95 | overlaps: (N, K) ndarray of overlap between boxes and query_boxes 96 | """ 97 | N = anchors.size(0) 98 | 99 | if gt_boxes.size() == (4,): 100 | gt_boxes = gt_boxes.view(1, 4) 101 | K = gt_boxes.size(0) 102 | 103 | gt_boxes_area = ( 104 | (gt_boxes[:, 2] - gt_boxes[:, 0] + 1) * 105 | (gt_boxes[:, 3] - gt_boxes[:, 1] + 1) 106 | ).view(1, K) 107 | 108 | anchors_area = ( 109 | (anchors[:, 2] - anchors[:, 0] + 1) * 110 | (anchors[:, 3] - anchors[:, 1] + 1) 111 | ).view(N, 1) 112 | 113 | boxes = anchors.view(N, 1, 4).expand(N, K, 4) 114 | query_boxes = gt_boxes.view(1, K, 4).expand(N, K, 4) 115 | 116 | iw = ( 117 | torch.min(boxes[:, :, 2], query_boxes[:, :, 2]) 118 | - torch.max(boxes[:, :, 0], query_boxes[:, :, 0]) 119 | + 1 120 | ) 121 | iw[iw < 0] = 0 122 | 123 | ih = ( 124 | torch.min(boxes[:, :, 3], query_boxes[:, :, 3]) 125 | - torch.max(boxes[:, :, 1], query_boxes[:, :, 1]) 126 | + 1 127 | ) 128 | ih[ih < 0] = 0 129 | 130 | ua = anchors_area + gt_boxes_area - (iw * ih) 131 | overlaps = iw * ih / ua 132 | 133 | return overlaps 134 | 135 | 136 | def xywh_to_xyxy(boxes): 137 | """Convert [x y w h] box format to [x1 y1 x2 y2] format.""" 138 | return np.hstack((boxes[:, 0:2], boxes[:, 0:2] + boxes[:, 2:4] - 1)) 139 | 140 | 141 | from torch.optim import Optimizer 142 | 143 | class FusedOptimizer(Optimizer): 144 | def __init__(self, optimizers): 145 | self.optimizers = optimizers 146 | param_groups = [] 147 | for optimizer in self.optimizers: 148 | param_groups += optimizer.param_groups 149 | #super(FusedOptimizer, self).__init__([], {}) 150 | self.param_groups = param_groups 151 | 152 | def step(self): 153 | for optimizer in self.optimizers: 154 | optimizer.step() 155 | -------------------------------------------------------------------------------- /assets/vl_adapter_teaser.png: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/ylsung/VL_adapter/545fcbbdbbaec4c442de35567f6ae477ff4e8265/assets/vl_adapter_teaser.png -------------------------------------------------------------------------------- /download_backbones.py: -------------------------------------------------------------------------------- 1 | 2 | from transformers import T5ForConditionalGeneration, T5Tokenizer 3 | from transformers import BartForConditionalGeneration, BartTokenizer 4 | 5 | if __name__ == '__main__': 6 | 7 | 8 | print('Downloading checkpoints if not cached') 9 | print('T5-base') 10 | model = T5ForConditionalGeneration.from_pretrained('t5-base') 11 | tokenizer = T5Tokenizer.from_pretrained('t5-base') 12 | print('BART-base') 13 | tokenizer = BartTokenizer.from_pretrained("facebook/bart-base") 14 | model = BartForConditionalGeneration.from_pretrained("facebook/bart-base") 15 | print('Done!') 16 | 17 | -------------------------------------------------------------------------------- /feature_extraction/README.md: -------------------------------------------------------------------------------- 1 | # Feature extraction 2 | 3 | 4 | ## Feature extraction using CLIP 5 | The commands to process COCO images: 6 | ```bash 7 | model_type=$1 # one of [RN50, RN101, RN50x4, ViT-B/32, vit_base_patch32_224_in21k]. The code uses RN101. 8 | GPU=$2 9 | 10 | train_image_root=[The directory that stores the training images] 11 | val_image_root=[The directory that stores the validation images] 12 | test_image_root=[The directory that stores the test images] 13 | 14 | output_dir=[A folder that stores all clip_features] 15 | 16 | echo Use ${model_type} to extract features 17 | 18 | CUDA_VISIBLE_DEVICES=$2 python coco_CLIP.py --model_type ${model_type} --images_root ${train_image_root} --output_dir ${output_dir} 19 | CUDA_VISIBLE_DEVICES=$2 python coco_CLIP.py --model_type ${model_type} --images_root ${val_image_root} --output_dir ${output_dir} 20 | CUDA_VISIBLE_DEVICES=$2 python coco_CLIP.py --model_type ${model_type} --images_root ${test_image_root} --output_dir ${output_dir} 21 | ``` 22 | 23 | --- 24 | The following describes feature extraction using other vision encoders. 25 | 26 | 27 | 28 | We use [Hao Tan's Detectron2 implementation of 'Bottom-up feature extractor'](https://github.com/airsplay/py-bottom-up-attention), which is compatible with [the original Caffe implementation](https://github.com/peteanderson80/bottom-up-attention). 29 | 30 | Following LXMERT, we use the feature extractor which outputs 36 boxes per image. 31 | We store features in hdf5 format (a minimal reading sketch is shown after the installation note below). 32 | 33 | 34 | ## Download features 35 | 36 | Download the `datasets` folder from [Google Drive](https://drive.google.com/drive/folders/1MBBhlkP83VMKS2Qe0SmFfzkHhMpIG5wf?usp=sharing). 37 | 38 | 39 | ## Install feature extractor (optional) 40 | 41 | Please follow [the original installation guide](https://github.com/airsplay/py-bottom-up-attention#installation).
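For reference, here is a minimal sketch of reading the extracted features back with `h5py`. The file name below is a placeholder, and the keys follow what `tsv_to_h5.py` writes (`features`, `boxes`, `obj_id`, `obj_conf`, `attr_id`, `attr_conf`, `img_w`, `img_h`); adjust the path to your own output.

```python
import h5py

# Open one of the converted feature files, e.g. the COCO train split.
with h5py.File('train2014_obj36.h5', 'r') as f:
    img_id = next(iter(f.keys()))        # each top-level group is one image id
    grp = f[img_id]
    feats = grp['features'][()]          # (num_boxes, 2048) region features
    boxes = grp['boxes'][()]             # (num_boxes, 4) box coordinates
    print(img_id, feats.shape, boxes.shape,
          int(grp['img_w'][()]), int(grp['img_h'][()]))
```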
42 | 43 | ## Manually extract & convert features (optional) 44 | 45 | * `_proposal.py`: extract features from 36 detected boxes 46 | * `_gt.py`: extract features from ground truth boxes 47 | * `_mattnet.py`: extract features from box predictions shared from [MattNet](https://github.com/lichengunc/MAttNet#pre-computed-detectionsmasks) 48 | 49 | ```bash 50 | # Pretrain/VQA: Download LXMERT's COCO features (tsv) and convert to hdf5 51 | wget https://nlp.cs.unc.edu/data/lxmert_data/mscoco_imgfeat/train2014_obj36.zip 52 | wget https://nlp.cs.unc.edu/data/lxmert_data/mscoco_imgfeat/val2014_obj36.zip 53 | python tsv_to_h5.py --tsv_path train2014_obj36.tsv --h5_path train2014_obj36.h5 54 | python tsv_to_h5.py --tsv_path val2014_obj36.tsv --h5_path val2014_obj36.h5 55 | # Get resplit_val_obj36.h5 from val2014_obj36.h5 56 | python coco_val_compact.py 57 | 58 | # Pretrain(VG)/GQA: Download LXMERT's VG features (tsv) and convert to hdf5 59 | wget https://nlp.cs.unc.edu/data/lxmert_data/vg_gqa_imgfeat/vg_gqa_obj36.zip 60 | python tsv_to_h5.py --tsv_path vg_gqa_obj36.tsv --h5_path vg_gqa_obj36.h5 61 | 62 | # RefCOCOg 63 | python refcocog_gt.py --split train 64 | python refcocog_mattnet.py --split val 65 | python refcocog_mattnet.py --split test 66 | 67 | # NLVR2: Download LXMERT's COCO features (tsv) and convert to hdf5 68 | wget https://nlp.cs.unc.edu/data/lxmert_data/nlvr2_imgfeat/train_obj36.zip 69 | wget https://nlp.cs.unc.edu/data/lxmert_data/nlvr2_imgfeat/valid_obj36.zip 70 | wget https://nlp.cs.unc.edu/data/lxmert_data/nlvr2_imgfeat/test_obj36.zip 71 | python tsv_to_h5.py --tsv_path train_obj36.tsv --h5_path train_obj36.h5 72 | python tsv_to_h5.py --tsv_path valid_obj36.tsv --h5_path valid_obj36.h5 73 | python tsv_to_h5.py --tsv_path test_obj36.tsv --h5_path test_obj36.h5 74 | 75 | # Multi30K 76 | # Download images following https://github.com/multi30k/dataset 77 | python flickr30k_proposal.py --split trainval 78 | python flickr30k_proposal.py --split test2017 79 | python flickr30k_proposal.py --split test2018 80 | ``` -------------------------------------------------------------------------------- /feature_extraction/coco_gt.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | # from detectron2_proposal_maxnms import collate_fn, extract, NUM_OBJECTS, DIM 4 | from detectron2_given_box_maxnms import extract, DIM 5 | from torch.utils.data import Dataset, DataLoader 6 | import cv2 7 | from tqdm import tqdm 8 | from pathlib import Path 9 | import argparse 10 | 11 | from pycocotools.coco import COCO 12 | import json 13 | import numpy as np 14 | 15 | 16 | class COCODataset(Dataset): 17 | def __init__(self, image_dir, box_ann_path, split='val2014'): 18 | self.image_dir = image_dir 19 | 20 | box_ann_path = str(box_ann_path) 21 | 22 | self.coco = COCO(box_ann_path) 23 | 24 | self.split = split 25 | with open(box_ann_path) as f: 26 | box_ann = json.load(f) 27 | id2name = {} 28 | for cat2name in box_ann['categories']: 29 | id2name[cat2name['id']] = cat2name['name'] 30 | self.id2name = id2name 31 | 32 | img_ids = [] 33 | boxes = [] 34 | captions = [] 35 | for img_id, anns in self.coco.imgToAnns.items(): 36 | img_ids.append(img_id) 37 | 38 | boxes.append([ann['bbox'] for ann in anns]) 39 | captions.append([self.id2name[ann['category_id']] for ann in anns]) 40 | 41 | assert len(img_ids) == len(boxes) 42 | assert len(img_ids) == len(captions) 43 | 44 | self.img_ids = img_ids 45 | self.boxes = boxes 46 | self.captions = captions 47 | 48 | def
__len__(self): 49 | return len(self.coco.imgToAnns) 50 | 51 | def __getitem__(self, idx): 52 | 53 | image_id = self.img_ids[idx] 54 | 55 | image_name = f'COCO_{self.split}_{str(image_id).zfill(12)}' 56 | 57 | image_path = self.image_dir.joinpath(f'{image_name}.jpg') 58 | 59 | image_id = image_path.stem 60 | 61 | img = cv2.imread(str(image_path)) 62 | 63 | H, W, _ = img.shape 64 | 65 | boxes = [] 66 | for box in self.boxes[idx]: 67 | x, y, width, height = box 68 | x1 = x 69 | y1 = y 70 | x2 = x + width 71 | y2 = y + height 72 | boxes.append([x1, y1, x2, y2]) 73 | 74 | assert len(boxes) > 0 75 | 76 | boxes = np.array(boxes) 77 | 78 | captions = self.captions[idx] 79 | 80 | return { 81 | 'img_id': image_name, 82 | 'img': img, 83 | 'boxes': boxes, 84 | 'captions': captions 85 | } 86 | 87 | 88 | def collate_fn(batch): 89 | img_ids = [] 90 | imgs = [] 91 | boxes = [] 92 | captions = [] 93 | 94 | for i, entry in enumerate(batch): 95 | img_ids.append(entry['img_id']) 96 | imgs.append(entry['img']) 97 | boxes.append(entry['boxes']) 98 | captions.append(entry['captions']) 99 | 100 | batch_out = {} 101 | batch_out['img_ids'] = img_ids 102 | batch_out['imgs'] = imgs 103 | 104 | batch_out['boxes'] = boxes 105 | 106 | batch_out['captions'] = captions 107 | 108 | return batch_out 109 | 110 | 111 | if __name__ == "__main__": 112 | 113 | parser = argparse.ArgumentParser() 114 | parser.add_argument('--batchsize', default=1, type=int, help='batch_size') 115 | parser.add_argument('--cocoroot', type=str, default='/ssd-playpen/home/jmincho/workspace/datasets/COCO/') 116 | parser.add_argument('--split', type=str, default='valid', choices=['train', 'valid', 'test']) 117 | 118 | args = parser.parse_args() 119 | 120 | SPLIT2DIR = { 121 | 'train': 'train2014', 122 | 'valid': 'val2014', 123 | 'test': 'test2015', 124 | } 125 | 126 | coco_dir = Path(args.cocoroot).resolve() 127 | coco_img_dir = coco_dir.joinpath('images') 128 | coco_img_split_dir = coco_img_dir.joinpath(SPLIT2DIR[args.split]) 129 | box_ann_path = coco_dir.joinpath('annotations').joinpath(f'instances_{SPLIT2DIR[args.split]}.json') 130 | 131 | dataset_name = 'COCO' 132 | 133 | out_dir = coco_dir.joinpath('features') 134 | if not out_dir.exists(): 135 | out_dir.mkdir() 136 | 137 | print('Load images from', coco_img_split_dir) 138 | print('# Images:', len(list(coco_img_split_dir.iterdir()))) 139 | 140 | dataset = COCODataset(coco_img_split_dir, box_ann_path, SPLIT2DIR[args.split]) 141 | print('# Annotated Images:', len(dataset)) 142 | 143 | dataloader = DataLoader(dataset, batch_size=args.batchsize, 144 | shuffle=False, collate_fn=collate_fn, num_workers=4) 145 | 146 | output_fname = out_dir.joinpath(f'{SPLIT2DIR[args.split]}_GT.h5') 147 | print('features will be saved at', output_fname) 148 | 149 | desc = f'{dataset_name}_{SPLIT2DIR[args.split]}_{DIM}' 150 | 151 | extract(output_fname, dataloader, desc) 152 | -------------------------------------------------------------------------------- /feature_extraction/coco_proposal.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | from detectron2_proposal_maxnms import collate_fn, extract, NUM_OBJECTS, DIM 4 | from torch.utils.data import Dataset, DataLoader 5 | import cv2 6 | from tqdm import tqdm 7 | from pathlib import Path 8 | import argparse 9 | 10 | 11 | class COCODataset(Dataset): 12 | def __init__(self, image_dir): 13 | self.image_dir = image_dir 14 | self.image_path_list = list(tqdm(image_dir.iterdir())) 15 | self.n_images = 
len(self.image_path_list) 16 | 17 | # self.transform = image_transform 18 | 19 | def __len__(self): 20 | return self.n_images 21 | 22 | def __getitem__(self, idx): 23 | image_path = self.image_path_list[idx] 24 | image_id = image_path.stem 25 | 26 | img = cv2.imread(str(image_path)) 27 | 28 | return { 29 | 'img_id': image_id, 30 | 'img': img 31 | } 32 | 33 | if __name__ == "__main__": 34 | 35 | parser = argparse.ArgumentParser() 36 | parser.add_argument('--batchsize', default=1, type=int, help='batch_size') 37 | parser.add_argument('--cocoroot', type=str, default='/ssd-playpen/home/jmincho/workspace/datasets/COCO/') 38 | parser.add_argument('--split', type=str, default='valid', choices=['train', 'valid', 'test']) 39 | 40 | args = parser.parse_args() 41 | 42 | SPLIT2DIR = { 43 | 'train': 'train2014', 44 | 'valid': 'val2014', 45 | 'test': 'test2015', 46 | } 47 | 48 | coco_dir = Path(args.cocoroot).resolve() 49 | coco_img_dir = coco_dir.joinpath('images') 50 | coco_img_split_dir = coco_img_dir.joinpath(SPLIT2DIR[args.split]) 51 | 52 | dataset_name = 'COCO' 53 | 54 | out_dir = coco_dir.joinpath('features') 55 | if not out_dir.exists(): 56 | out_dir.mkdir() 57 | 58 | print('Load images from', coco_img_split_dir) 59 | print('# Images:', len(list(coco_img_split_dir.iterdir()))) 60 | 61 | dataset = COCODataset(coco_img_split_dir) 62 | 63 | dataloader = DataLoader(dataset, batch_size=args.batchsize, 64 | shuffle=False, collate_fn=collate_fn, num_workers=4) 65 | 66 | output_fname = out_dir.joinpath(f'{args.split}_boxes{NUM_OBJECTS}.h5') 67 | print('features will be saved at', output_fname) 68 | 69 | desc = f'{dataset_name}_{args.split}_{(NUM_OBJECTS, DIM)}' 70 | 71 | extract(output_fname, dataloader, desc) 72 | -------------------------------------------------------------------------------- /feature_extraction/coco_val_compact.py: -------------------------------------------------------------------------------- 1 | import h5py 2 | from tqdm import tqdm 3 | import json 4 | import pathlib 5 | import argparse 6 | 7 | if __name__ == '__main__': 8 | 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--batchsize', default=1, type=int, help='batch_size') 11 | parser.add_argument('--data_dir', type=str, 12 | default='.') 13 | 14 | args = parser.parse_args() 15 | 16 | data_dir = pathlib.Path(args.data_dir).resolve() 17 | coco_dir = data_dir.joinpath('COCO') 18 | 19 | with open(data_dir.joinpath('lxmert/mscoco_resplit_val.json'))as f: 20 | val_data = json.load(f) 21 | 22 | print(len(val_data)) 23 | 24 | source_f = h5py.File(coco_dir.joinpath('features/val2014_obj36.h5'), 'r') 25 | target_f = h5py.File(coco_dir.joinpath('features/resplit_val_obj36.h5'), 'w') 26 | 27 | img_id = val_data[0]['img_id'] 28 | 29 | keys = list(source_f[img_id].keys()) 30 | 31 | for datum in tqdm(val_data, ncols=50): 32 | img_id = datum['img_id'] 33 | 34 | grp = target_f.create_group(str(img_id)) 35 | for k in keys: 36 | grp[k] = source_f[f'{img_id}/{k}'][()] 37 | -------------------------------------------------------------------------------- /feature_extraction/flickr30k_proposal.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | from detectron2_proposal_maxnms import collate_fn, extract, NUM_OBJECTS, DIM 4 | from torch.utils.data import Dataset, DataLoader 5 | import cv2 6 | from tqdm import tqdm 7 | from pathlib import Path 8 | import argparse 9 | 10 | 11 | class Flickr30KDataset(Dataset): 12 | def __init__(self, image_dir): 13 | self.image_dir = image_dir 
14 | self.image_path_list = list(tqdm(image_dir.iterdir())) 15 | self.n_images = len(self.image_path_list) 16 | 17 | # self.transform = image_transform 18 | 19 | def __len__(self): 20 | return self.n_images 21 | 22 | def __getitem__(self, idx): 23 | image_path = self.image_path_list[idx] 24 | image_id = image_path.stem 25 | 26 | img = cv2.imread(str(image_path)) 27 | 28 | return { 29 | 'img_id': image_id, 30 | 'img': img 31 | } 32 | 33 | if __name__ == "__main__": 34 | 35 | parser = argparse.ArgumentParser() 36 | parser.add_argument('--batchsize', default=1, type=int, help='batch_size') 37 | parser.add_argument('--flickrroot', type=str, 38 | default='/ssd-playpen/home/jmincho/workspace/datasets/flickr30k/') 39 | parser.add_argument('--split', type=str, default=None, choices=['trainval', 'test2017', 'test2018']) 40 | 41 | args = parser.parse_args() 42 | 43 | SPLIT2DIR = { 44 | 'trainval': 'flickr30k_images', 45 | 'test2017': 'test_2017_flickr_images', 46 | 'test2018': 'test_2018_flickr_images', 47 | } 48 | 49 | flickr_dir = Path(args.flickrroot).resolve() 50 | flickr_img_dir = flickr_dir.joinpath('flickr30k_images/').joinpath(SPLIT2DIR[args.split]) 51 | 52 | dataset_name = 'Flickr30K' 53 | 54 | out_dir = flickr_dir.joinpath('features') 55 | if not out_dir.exists(): 56 | out_dir.mkdir() 57 | 58 | print('Load images from', flickr_img_dir) 59 | print('# Images:', len(list(flickr_img_dir.iterdir()))) 60 | 61 | dataset = Flickr30KDataset(flickr_img_dir) 62 | 63 | dataloader = DataLoader(dataset, batch_size=args.batchsize, 64 | shuffle=False, collate_fn=collate_fn, num_workers=4) 65 | 66 | output_fname = out_dir.joinpath(f'{args.split}_boxes{NUM_OBJECTS}.h5') 67 | print('features will be saved at', output_fname) 68 | 69 | desc = f'{dataset_name}_{args.split}_{(NUM_OBJECTS, DIM)}' 70 | 71 | extract(output_fname, dataloader, desc) 72 | -------------------------------------------------------------------------------- /feature_extraction/process.sh: -------------------------------------------------------------------------------- 1 | # wget https://nlp.cs.unc.edu/data/lxmert_data/mscoco_imgfeat/train2014_obj36.zip 2 | # wget https://nlp.cs.unc.edu/data/lxmert_data/mscoco_imgfeat/val2014_obj36.zip 3 | unzip train2014_obj36.zip -d . 4 | unzip val2014_obj36.zip -d . 5 | python tsv_to_h5.py --tsv_path train2014_obj36.tsv --h5_path train2014_obj36.h5 6 | python tsv_to_h5.py --tsv_path val2014_obj36.tsv --h5_path val2014_obj36.h5 7 | # Get resplit_val_obj36.h5 from val2014_obj36.h5 8 | python coco_val_compact.py 9 | 10 | # Pretrain(VG)/GQA: Download LXMERT's VG features (tsv) and convert to hdf5 11 | # wget https://nlp.cs.unc.edu/data/lxmert_data/vg_gqa_imgfeat/vg_gqa_obj36.zip 12 | unzip vg_gqa_obj36.zip -d . 13 | python tsv_to_h5.py --tsv_path vg_gqa_obj36.tsv --h5_path vg_gqa_obj36.h5 14 | 15 | # RefCOCOg 16 | python refcocog_gt.py --split train 17 | python refcocog_mattnet.py --split val 18 | python refcocog_mattnet.py --split test 19 | 20 | # NLVR2: Download LXMERT's COCO features (tsv) and convert to hdf5 21 | # wget https://nlp.cs.unc.edu/data/lxmert_data/nlvr2_imgfeat/train_obj36.zip 22 | # wget https://nlp.cs.unc.edu/data/lxmert_data/nlvr2_imgfeat/valid_obj36.zip 23 | # wget https://nlp.cs.unc.edu/data/lxmert_data/nlvr2_imgfeat/test_obj36.zip 24 | unzip train_obj36.zip -d . 25 | unzip valid_obj36.zip -d . 26 | unzip test_obj36.zip -d .
27 | 28 | python tsv_to_h5.py --tsv_path train_obj36.tsv --h5_path train_obj36.h5 29 | python tsv_to_h5.py --tsv_path valid_obj36.tsv --h5_path valid_obj36.h5 30 | python tsv_to_h5.py --tsv_path test_obj36.tsv --h5_path test_obj36.h5 31 | 32 | # Multi30K 33 | # Download images following https://github.com/multi30k/dataset 34 | python flickr30k_proposal.py --split trainval 35 | python flickr30k_proposal.py --split test2017 36 | python flickr30k_proposal.py --split test2018 37 | -------------------------------------------------------------------------------- /feature_extraction/refcocog_gt.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | from pathlib import Path 4 | import argparse 5 | import json 6 | 7 | import cv2 8 | import numpy as np 9 | from tqdm import tqdm 10 | from torch.utils.data import Dataset, DataLoader 11 | 12 | from detectron2_given_box_maxnms import extract, DIM 13 | 14 | from pycocotools.coco import COCO 15 | 16 | 17 | class RefCOCODataset(Dataset): 18 | def __init__(self, refcoco_dir, refcoco_images_dir, coco_dir, split='val'): 19 | 20 | self.image_dir = refcoco_images_dir 21 | 22 | # coco_train_annFile = coco_dir.joinpath('annotations/instances_train2014.json') 23 | # self.coco = COCO(coco_train_annFile) 24 | 25 | assert split in ['train', 'val', 'test'] 26 | 27 | workspace_dir = Path(__file__).resolve().parent.parent 28 | refcoco_util_dir = workspace_dir.joinpath('refcoco_utils') 29 | import sys 30 | sys.path.append(str(refcoco_util_dir)) 31 | from refer import REFER 32 | self.refer = REFER('refcocog', 'umd') 33 | 34 | ref_ids = self.refer.getRefIds(split=split) 35 | 36 | id2dets = {} 37 | img_ids = [] 38 | image_fns = [] 39 | for ref_id in ref_ids: 40 | ref = self.refer.Refs[ref_id] 41 | img_id = ref['image_id'] 42 | 43 | if img_id not in img_ids: 44 | img_ids.append(img_id) 45 | 46 | fn_ann = ref['file_name'] 47 | 48 | # COCO_train2014_000000419645_398406.jpg 49 | # COCO_train2014_000000419645.jpg 50 | 51 | suffix = fn_ann.split('.')[-1] 52 | 53 | fname = '_'.join(fn_ann.split('_')[:-1]) + '.' 
+ suffix 54 | 55 | image_fns.append(fname) 56 | 57 | detections = self.refer.imgToAnns[img_id] 58 | 59 | id2dets[img_id] = detections 60 | 61 | self.image_ids = img_ids 62 | self.image_fns = image_fns 63 | self.id2dets = id2dets 64 | 65 | def __len__(self): 66 | return len(self.image_ids) 67 | 68 | def __getitem__(self, idx): 69 | 70 | image_id = self.image_ids[idx] 71 | image_fn = self.image_fns[idx] 72 | image_path = self.image_dir.joinpath(image_fn) 73 | 74 | assert Path(image_path).exists(), image_path 75 | 76 | img = cv2.imread(str(image_path)) 77 | 78 | H, W, C = img.shape 79 | 80 | dets = self.id2dets[image_id] 81 | # cat_names = [det['category_name'] for det in dets] 82 | 83 | boxes = [] 84 | for i, region in enumerate([det['bbox'] for det in dets]): 85 | # (x1, y1, x2, y2) 86 | x, y, w, h = region[:4] 87 | x1, y1, x2, y2 = x, y, x+w, y+h 88 | 89 | # x1, y1, x2, y2 = region[:4] 90 | 91 | assert x2 <= W, (image_id, i, region) 92 | assert y2 <= H, (image_id, i, region) 93 | 94 | box = [x1, y1, x2, y2] 95 | boxes.append(box) 96 | 97 | boxes = np.array(boxes) 98 | 99 | return { 100 | 'img_id': str(image_id), 101 | 'img_fn': image_fn, 102 | 'img': img, 103 | 'boxes': boxes, 104 | # 'captions': cat_names 105 | } 106 | 107 | def collate_fn(batch): 108 | img_ids = [] 109 | imgs = [] 110 | 111 | boxes = [] 112 | 113 | captions = [] 114 | 115 | for i, entry in enumerate(batch): 116 | img_ids.append(entry['img_id']) 117 | imgs.append(entry['img']) 118 | boxes.append(entry['boxes']) 119 | # captions.append(entry['captions']) 120 | 121 | batch_out = {} 122 | batch_out['img_ids'] = img_ids 123 | batch_out['imgs'] = imgs 124 | 125 | batch_out['boxes'] = boxes 126 | 127 | # batch_out['captions'] = captions 128 | 129 | return batch_out 130 | 131 | 132 | if __name__ == "__main__": 133 | 134 | parser = argparse.ArgumentParser() 135 | parser.add_argument('--batchsize', default=1, type=int, help='batch_size') 136 | parser.add_argument('--refcocoroot', type=str, default='RefCOCO/') 137 | parser.add_argument('--cocoroot', type=str, default='COCO/') 138 | parser.add_argument('--split', type=str, default='val', choices=['train', 'val', 'test']) 139 | 140 | args = parser.parse_args() 141 | 142 | refcoco_dir = Path(args.refcocoroot).resolve() 143 | refcocog_dir = refcoco_dir.joinpath('refcocog') 144 | coco_dir = Path(args.cocoroot).resolve() 145 | refcoco_images_dir = coco_dir.joinpath('images/train2014') 146 | dataset_name = 'RefCOCOg' 147 | 148 | out_dir = refcocog_dir.joinpath('features') 149 | if not out_dir.exists(): 150 | out_dir.mkdir() 151 | 152 | dataset = RefCOCODataset(refcoco_dir, refcoco_images_dir, coco_dir, args.split) 153 | print('# Images:', len(dataset)) 154 | 155 | dataloader = DataLoader(dataset, batch_size=args.batchsize, 156 | shuffle=False, collate_fn=collate_fn, num_workers=4) 157 | 158 | output_fname = out_dir.joinpath(f'{args.split}_boxes_GT.h5') 159 | print('features will be saved at', output_fname) 160 | 161 | desc = f'{dataset_name}_given_boxes_({DIM})' 162 | 163 | extract(output_fname, dataloader, desc) 164 | -------------------------------------------------------------------------------- /feature_extraction/refcocog_mattnet.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | from pathlib import Path 4 | import argparse 5 | import json 6 | 7 | import cv2 8 | import numpy as np 9 | from tqdm import tqdm 10 | from torch.utils.data import Dataset, DataLoader 11 | 12 | from detectron2_given_box_maxnms import extract, DIM 13 
| 14 | class RefCOCODataset(Dataset): 15 | def __init__(self, refcoco_dir, refcoco_images_dir, split='val'): 16 | 17 | self.image_dir = refcoco_images_dir 18 | 19 | mattnet_maskrcnn_detections_path = refcoco_dir.joinpath('detections/refcocog_umd/res101_coco_minus_refer_notime_dets.json') 20 | with open(mattnet_maskrcnn_detections_path) as f: 21 | mattnet_maskrcnn_detections = json.load(f) 22 | 23 | id2dets = {} 24 | for det in mattnet_maskrcnn_detections: 25 | image_id = det['image_id'] 26 | if image_id not in id2dets: 27 | id2dets[image_id] = [] 28 | id2dets[image_id].append(det) 29 | self.id2dets = id2dets 30 | 31 | print('Load mattnet detections from', mattnet_maskrcnn_detections_path) 32 | 33 | assert split in ['train', 'val', 'test'] 34 | 35 | workspace_dir = Path(__file__).resolve().parent.parent 36 | refcoco_util_dir = workspace_dir.joinpath('refcoco_utils') 37 | import sys 38 | sys.path.append(str(refcoco_util_dir)) 39 | from refer import REFER 40 | self.refer = REFER('refcocog', 'umd') 41 | 42 | ref_ids = self.refer.getRefIds(split=split) 43 | img_ids = [] 44 | image_fns = [] 45 | for ref_id in ref_ids: 46 | ref = self.refer.Refs[ref_id] 47 | img_id = ref['image_id'] 48 | 49 | if img_id not in img_ids: 50 | img_ids.append(img_id) 51 | 52 | fn_ann = ref['file_name'] 53 | 54 | # COCO_train2014_000000419645_398406.jpg 55 | # COCO_train2014_000000419645.jpg 56 | 57 | suffix = fn_ann.split('.')[-1] 58 | 59 | fname = '_'.join(fn_ann.split('_')[:-1]) + '.' + suffix 60 | 61 | image_fns.append(fname) 62 | 63 | self.image_ids = img_ids 64 | self.image_fns = image_fns 65 | 66 | def __len__(self): 67 | return len(self.image_ids) 68 | 69 | def __getitem__(self, idx): 70 | 71 | image_id = self.image_ids[idx] 72 | image_fn = self.image_fns[idx] 73 | image_path = self.image_dir.joinpath(image_fn) 74 | 75 | assert Path(image_path).exists(), image_path 76 | 77 | img = cv2.imread(str(image_path)) 78 | 79 | H, W, C = img.shape 80 | 81 | dets = self.id2dets[image_id] 82 | cat_names = [det['category_name'] for det in dets] 83 | 84 | boxes = [] 85 | for i, region in enumerate([det['box'] for det in dets]): 86 | # (x1, y1, x2, y2) 87 | x,y,w,h = region[:4] 88 | 89 | x1, y1, x2, y2 = x, y, x+w, y+h 90 | 91 | assert x2 <= W, (image_id, i, region) 92 | assert y2 <= H, (image_id, i, region) 93 | 94 | box = [x1, y1, x2, y2] 95 | boxes.append(box) 96 | 97 | boxes = np.array(boxes) 98 | 99 | return { 100 | 'img_id': str(image_id), 101 | 'img_fn': image_fn, 102 | 'img': img, 103 | 'boxes': boxes, 104 | 'captions': cat_names 105 | } 106 | 107 | 108 | def collate_fn(batch): 109 | img_ids = [] 110 | imgs = [] 111 | 112 | boxes = [] 113 | 114 | captions = [] 115 | 116 | for i, entry in enumerate(batch): 117 | img_ids.append(entry['img_id']) 118 | imgs.append(entry['img']) 119 | boxes.append(entry['boxes']) 120 | captions.append(entry['captions']) 121 | 122 | batch_out = {} 123 | batch_out['img_ids'] = img_ids 124 | batch_out['imgs'] = imgs 125 | 126 | batch_out['boxes'] = boxes 127 | 128 | batch_out['captions'] = captions 129 | 130 | return batch_out 131 | 132 | 133 | if __name__ == "__main__": 134 | 135 | parser = argparse.ArgumentParser() 136 | parser.add_argument('--batchsize', default=1, type=int, help='batch_size') 137 | parser.add_argument('--refcocoroot', type=str, default='/ssd-playpen/home/jmincho/workspace/datasets/RefCOCO/') 138 | parser.add_argument('--cocoroot', type=str, default='/ssd-playpen/home/jmincho/workspace/datasets/COCO/') 139 | parser.add_argument('--split', type=str, default='val', 
choices=['train', 'val', 'test']) 140 | 141 | args = parser.parse_args() 142 | 143 | refcoco_dir = Path(args.refcocoroot).resolve() 144 | refcocog_dir = refcoco_dir.joinpath('refcocog') 145 | coco_dir = Path(args.cocoroot).resolve() 146 | refcoco_images_dir = coco_dir.joinpath('images/train2014') 147 | dataset_name = 'RefCOCOg' 148 | 149 | out_dir = refcocog_dir.joinpath('features') 150 | if not out_dir.exists(): 151 | out_dir.mkdir() 152 | 153 | dataset = RefCOCODataset(refcoco_dir, refcoco_images_dir, args.split) 154 | print('# Images:', len(dataset)) 155 | 156 | dataloader = DataLoader(dataset, batch_size=args.batchsize, 157 | shuffle=False, collate_fn=collate_fn, num_workers=4) 158 | 159 | output_fname = out_dir.joinpath(f'{args.split}_boxes_mattnet.h5') 160 | print('features will be saved at', output_fname) 161 | 162 | desc = f'{dataset_name}_given_boxes_({DIM})' 163 | 164 | extract(output_fname, dataloader, desc) 165 | -------------------------------------------------------------------------------- /feature_extraction/tsv_to_h5.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyleft 2019 Project LXRT 3 | 4 | import sys 5 | import csv 6 | import base64 7 | import time 8 | from tqdm import tqdm 9 | import numpy as np 10 | import h5py 11 | import argparse 12 | 13 | csv.field_size_limit(sys.maxsize) 14 | FIELDNAMES = ["img_id", "img_h", "img_w", "objects_id", "objects_conf", 15 | "attrs_id", "attrs_conf", "num_boxes", "boxes", "features"] 16 | 17 | 18 | def load_obj_tsv(fname, topk=None): 19 | """Load object features from tsv file. 20 | :param fname: The path to the tsv file. 21 | :param topk: Only load features for top K images (lines) in the tsv file. 22 | Will load all the features if topk is either -1 or None. 23 | :return: A list of image object features where each feature is a dict. 24 | See FIELDNAMES above for the keys in the feature dict. 25 | """ 26 | data = [] 27 | start_time = time.time() 28 | print("Start to load Faster-RCNN detected objects from %s" % fname) 29 | with open(fname) as f: 30 | reader = csv.DictReader(f, FIELDNAMES, delimiter="\t") 31 | for i, item in tqdm(enumerate(reader), ncols=150): 32 | 33 | for key in ['img_h', 'img_w', 'num_boxes']: 34 | item[key] = int(item[key]) 35 | 36 | boxes = item['num_boxes'] 37 | decode_config = [ 38 | ('objects_id', (boxes, ), np.int64), 39 | ('objects_conf', (boxes, ), np.float32), 40 | ('attrs_id', (boxes, ), np.int64), 41 | ('attrs_conf', (boxes, ), np.float32), 42 | ('boxes', (boxes, 4), np.float32), 43 | ('features', (boxes, -1), np.float32), 44 | ] 45 | for key, shape, dtype in decode_config: 46 | item[key] = np.frombuffer( 47 | base64.b64decode(item[key]), dtype=dtype) 48 | item[key] = item[key].reshape(shape) 49 | item[key].setflags(write=False) 50 | 51 | data.append(item) 52 | if topk is not None and len(data) == topk: 53 | break 54 | elapsed_time = time.time() - start_time 55 | print("Loaded %d images in file %s in %d seconds."
% 56 | (len(data), fname, elapsed_time)) 57 | return data 58 | 59 | if __name__ == '__main__': 60 | 61 | parser = argparse.ArgumentParser() 62 | parser.add_argument('--tsv_path', type=str, 63 | default='val2014_obj36.tsv') 64 | parser.add_argument('--h5_path', type=str, 65 | default='val2014_obj36.h5') 66 | 67 | args = parser.parse_args() 68 | dim = 2048 69 | 70 | print('Load ', args.tsv_path) 71 | data = load_obj_tsv(args.tsv_path) 72 | print('# data:', len(data)) 73 | 74 | output_fname = args.h5_path 75 | print('features will be saved at', output_fname) 76 | 77 | with h5py.File(output_fname, 'w') as f: 78 | for i, datum in tqdm(enumerate(data), 79 | ncols=150,): 80 | 81 | img_id = datum['img_id'] 82 | 83 | num_boxes = datum['num_boxes'] 84 | 85 | grp = f.create_group(img_id) 86 | grp['features'] = datum['features'].reshape(num_boxes, 2048) 87 | grp['obj_id'] = datum['objects_id'] 88 | grp['obj_conf'] = datum['objects_conf'] 89 | grp['attr_id'] = datum['attrs_id'] 90 | grp['attr_conf'] = datum['attrs_conf'] 91 | grp['boxes'] = datum['boxes'] 92 | grp['img_w'] = datum['img_w'] 93 | grp['img_h'] = datum['img_h'] 94 | -------------------------------------------------------------------------------- /feature_extraction/vcr_gt.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | from pathlib import Path 4 | import argparse 5 | import json 6 | 7 | import cv2 8 | import numpy as np 9 | from tqdm import tqdm 10 | from torch.utils.data import Dataset, DataLoader 11 | 12 | from detectron2_given_box_maxnms import extract, DIM 13 | 14 | class VCRDataset(Dataset): 15 | def __init__(self, vcr_dir, vcr_images_dir, split='val'): 16 | 17 | self.image_dir = vcr_images_dir 18 | ann_path = vcr_dir.joinpath(f'{split}.jsonl') 19 | 20 | with open(ann_path, 'r') as f: 21 | _items = [json.loads(s) for s in f] 22 | print('Load images from', ann_path) 23 | 24 | image_ids = [] 25 | image_paths = [] 26 | items = [] 27 | for item in _items: 28 | if item['img_id'] not in image_ids: 29 | items.append(item) 30 | image_ids.append(item['img_id']) 31 | image_paths.append(item['img_fn']) 32 | 33 | self.items = items 34 | self.n_images = len(items) 35 | 36 | def __len__(self): 37 | return self.n_images 38 | 39 | def __getitem__(self, idx): 40 | 41 | item = self.items[idx] 42 | image_path = item['img_fn'] 43 | image_id = item['img_id'] 44 | 45 | image_path = self.image_dir.joinpath(image_path) 46 | 47 | assert Path(image_path).exists() 48 | 49 | img = cv2.imread(str(image_path)) 50 | 51 | metadata_path = self.image_dir.joinpath(item['metadata_fn']) 52 | with open(metadata_path) as f: 53 | metadata = json.load(f) 54 | boxes = [] 55 | regions = metadata['boxes'] 56 | 57 | for i, region in enumerate(regions): 58 | # (x1, y1, x2, y2) 59 | x1, y1, x2, y2 = region[:4] 60 | 61 | # assert x2 <= W, (image_id, i, region) 62 | # assert y2 <= H, (image_id, i, region) 63 | 64 | box = [x1, y1, x2, y2] 65 | boxes.append(box) 66 | 67 | 68 | boxes = np.array(boxes) 69 | 70 | return { 71 | 'img_id': image_id, 72 | 'img': img, 73 | 'boxes': boxes, 74 | 'captions': metadata['names'] 75 | } 76 | 77 | 78 | def collate_fn(batch): 79 | img_ids = [] 80 | imgs = [] 81 | 82 | boxes = [] 83 | 84 | captions = [] 85 | 86 | for i, entry in enumerate(batch): 87 | img_ids.append(entry['img_id']) 88 | imgs.append(entry['img']) 89 | boxes.append(entry['boxes']) 90 | captions.append(entry['captions']) 91 | 92 | batch_out = {} 93 | batch_out['img_ids'] = img_ids 94 | batch_out['imgs'] = imgs 95 | 96 | 
batch_out['boxes'] = boxes 97 | 98 | batch_out['captions'] = captions 99 | 100 | return batch_out 101 | 102 | 103 | if __name__ == "__main__": 104 | 105 | parser = argparse.ArgumentParser() 106 | parser.add_argument('--batchsize', default=1, type=int, help='batch_size') 107 | parser.add_argument('--vcrroot', type=str, default='/ssd-playpen/home/jmincho/workspace/datasets/VCR/') 108 | parser.add_argument('--split', type=str, default='val', choices=['train', 'val', 'test']) 109 | 110 | args = parser.parse_args() 111 | 112 | vcr_dir = Path(args.vcrroot).resolve() 113 | vcr_images_dir = vcr_dir.joinpath('vcr1images') 114 | dataset_name = 'VCR' 115 | 116 | out_dir = vcr_dir.joinpath('features') 117 | if not out_dir.exists(): 118 | out_dir.mkdir() 119 | 120 | dataset = VCRDataset(vcr_dir, vcr_images_dir, args.split) 121 | print('# Images:', len(dataset)) 122 | 123 | dataloader = DataLoader(dataset, batch_size=args.batchsize, 124 | shuffle=False, collate_fn=collate_fn, num_workers=4) 125 | 126 | output_fname = out_dir.joinpath(f'{args.split}_boxes_GT.h5') 127 | print('features will be saved at', output_fname) 128 | 129 | desc = f'{dataset_name}_given_boxes_({DIM})' 130 | 131 | extract(output_fname, dataloader, desc) 132 | -------------------------------------------------------------------------------- /feature_extraction/vcr_proposal.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | 3 | from detectron2_proposal_maxnms import collate_fn, extract, NUM_OBJECTS, DIM 4 | from torch.utils.data import Dataset, DataLoader 5 | import h5py 6 | import torch 7 | import cv2 8 | from tqdm import tqdm 9 | from pathlib import Path 10 | import argparse 11 | import json 12 | 13 | 14 | class VCRDataset(Dataset): 15 | def __init__(self, vcr_dir, vcr_images_dir, split='val'): 16 | 17 | self.image_dir = vcr_images_dir 18 | ann_path = vcr_dir.joinpath(f'{split}.jsonl') 19 | 20 | with open(ann_path, 'r') as f: 21 | _items = [json.loads(s) for s in f] 22 | print('Load images from', ann_path) 23 | 24 | image_ids = [] 25 | image_paths = [] 26 | items = [] 27 | for item in _items: 28 | if item['img_id'] not in image_ids: 29 | items.append(item) 30 | image_ids.append(item['img_id']) 31 | image_paths.append(item['img_fn']) 32 | 33 | self.items = items 34 | self.n_images = len(items) 35 | 36 | def __len__(self): 37 | return self.n_images 38 | 39 | def __getitem__(self, idx): 40 | 41 | item = self.items[idx] 42 | image_path = item['img_fn'] 43 | image_id = item['img_id'] 44 | 45 | image_path = self.image_dir.joinpath(image_path) 46 | 47 | assert Path(image_path).exists() 48 | 49 | img = cv2.imread(str(image_path)) 50 | 51 | return { 52 | 'img_id': image_id, 53 | 'img': img 54 | } 55 | 56 | if __name__ == "__main__": 57 | 58 | parser = argparse.ArgumentParser() 59 | parser.add_argument('--batchsize', default=1, type=int, help='batch_size') 60 | parser.add_argument('--vcrroot', type=str, default='/ssd-playpen/home/jmincho/workspace/datasets/VCR/') 61 | parser.add_argument('--split', type=str, default='val', choices=['train', 'val', 'test']) 62 | 63 | args = parser.parse_args() 64 | 65 | vcr_dir = Path(args.vcrroot).resolve() 66 | vcr_images_dir = vcr_dir.joinpath('vcr1images') 67 | dataset_name = 'VCR' 68 | 69 | out_dir = vcr_dir.joinpath('features') 70 | if not out_dir.exists(): 71 | out_dir.mkdir() 72 | 73 | # print('Load images from', coco_img_split_dir) 74 | 75 | dataset = VCRDataset(vcr_dir, vcr_images_dir, args.split) 76 | print('# Images:', len(dataset)) 77 | 78 | 
dataloader = DataLoader(dataset, batch_size=args.batchsize, 79 | shuffle=False, collate_fn=collate_fn, num_workers=4) 80 | 81 | output_fname = out_dir.joinpath(f'{args.split}_boxes{NUM_OBJECTS}.h5') 82 | print('features will be saved at', output_fname) 83 | 84 | desc = f'{dataset_name}_{args.split}_{(NUM_OBJECTS, DIM)}' 85 | 86 | extract(output_fname, dataloader, desc) 87 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=1.6.0 2 | transformers==4.2.1 3 | sentencepiece 4 | h5py 5 | wandb 6 | tqdm 7 | numpy 8 | pandas 9 | matplotlib 10 | ftfy 11 | timm 12 | pyyaml 13 | sacrebleu 14 | git+git://github.com/j-min/language-evaluation@master 15 | wget --------------------------------------------------------------------------------