├── images ├── .none ├── main_fig.png ├── teaser.pdf └── teaser.png ├── data_files └── .gitignore ├── src ├── lxrt │ ├── .gitignore │ ├── vit_explore_code.py │ ├── PositionalEncoding.py │ ├── entry_spatial.py │ ├── entry.py │ ├── optimization.py │ └── file_utils.py ├── pretrain │ ├── .gitignore │ └── qa_answer_table.py ├── tasks │ ├── .gitignore │ ├── .DS_Store │ ├── gqa_model.py │ ├── refcocoplus_model.py │ ├── vqa_model.py │ ├── mscoco_retrieval_model.py │ ├── nlvr2_model.py │ ├── refcocog_model.py │ ├── refcoco_model.py │ ├── nlvr2_data.py │ ├── vqa_data.py │ ├── nlvr2.py │ ├── gqa_data_patches.py │ ├── gqa_data.py │ ├── vqahat_data.py │ ├── refcocoplus_data.py │ ├── vqa.py │ ├── refcocog_data.py │ ├── mscoco_retrieval_data.py │ ├── refcocoplus.py │ └── refcocog.py ├── param.py └── utils.py ├── run ├── README.md ├── gqa_test.bash ├── gqa_finetune_caps.bash ├── pretrain_2stage_fulldata_no_init_self.bash ├── pretrain_2stage_fulldata_no_init_patches.bash ├── pretrain_2stage_fulldata_no_init_selfcross.bash ├── pretrain_2stage_fulldata_no_init_self_patches.bash ├── 2stage_fulldata_no_init_16_caps.bash ├── 2stage_fulldata_no_init_24_caps.bash ├── pretrain_2stage_fulldata_no_init_64_caps.bash ├── pretrain_2stage_fulldata_no_init_48_caps.bash ├── pretrain_2stage_fulldata_vit_bert_init.bash ├── pretrain_2stage_fulldata_no_init_selfcross_patches.bash ├── pretrain_2stage_fulldata_vit_bert_init_self.bash ├── pretrain_2stage_fulldata_vit_no_bert_init.bash ├── pretrain_2stage_fulldata_vit_bert_init_cross_patches.bash ├── pretrain_2stage_fulldata_vit_bert_init_selfcross.bash ├── pretrain_2stage_fulldata_vit_no_bert_init_self.bash ├── pretrain_2stage_fulldata_vit_bert_16_caps.bash ├── pretrain_2stage_fulldata_vit_bert_24_caps.bash ├── pretrain_2stage_fulldata_vit_bert_init_self_patches.bash ├── pretrain_2stage_fulldata_vit_no_bert_init_patches.bash ├── pretrain_2stage_fulldata_vit_no_bert_init_selfcross.bash ├── pretrain_2stage_fulldata_vit_bert_init_selfcross_patches.bash ├── pretrain_2stage_fulldata_vit_no_bert_init_self_patches.bash └── pretrain_2stage_fulldata_vit_no_bert_init_selfcross_patches.bash ├── teaser.pdf ├── main_fig.png ├── LICENSE └── README.md /images/.none: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /data_files/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /src/lxrt/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /src/pretrain/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /src/tasks/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /run/README.md: -------------------------------------------------------------------------------- 1 | To do: 2 | 3 | Add pretraining instructions in detail here. 
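Until then, here is a minimal usage sketch based on how the scripts in this folder read their arguments (the experiment name `gqa_caps_run1` below is only a placeholder):

```bash
# Two-stage pretraining: the first argument selects GPUs via CUDA_VISIBLE_DEVICES;
# any further arguments are forwarded to lxmert_pretrain.py through ${@:2}.
bash run/pretrain_2stage_fulldata_no_init_self.bash 0,1,2,3

# GQA finetuning from a pretrained snapshot: $1 = GPU ids, $2 = experiment name
# (outputs go to snap/gqa/<name>); extra arguments are forwarded through ${@:3}.
bash run/gqa_finetune_caps.bash 0 gqa_caps_run1
```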
4 | -------------------------------------------------------------------------------- /teaser.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aurooj/WSG-VQA-VLTransformers/HEAD/teaser.pdf -------------------------------------------------------------------------------- /main_fig.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aurooj/WSG-VQA-VLTransformers/HEAD/main_fig.png -------------------------------------------------------------------------------- /images/main_fig.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aurooj/WSG-VQA-VLTransformers/HEAD/images/main_fig.png -------------------------------------------------------------------------------- /images/teaser.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aurooj/WSG-VQA-VLTransformers/HEAD/images/teaser.pdf -------------------------------------------------------------------------------- /images/teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aurooj/WSG-VQA-VLTransformers/HEAD/images/teaser.png -------------------------------------------------------------------------------- /src/tasks/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aurooj/WSG-VQA-VLTransformers/HEAD/src/tasks/.DS_Store -------------------------------------------------------------------------------- /src/lxrt/vit_explore_code.py: -------------------------------------------------------------------------------- 1 | import timm 2 | 3 | m = timm.create_model('mobilenetv3_large_100', pretrained=True) 4 | m.eval() 5 | from pprint import pprint 6 | model_names = timm.list_models(pretrained=True) 7 | pprint(model_names) -------------------------------------------------------------------------------- /run/gqa_test.bash: -------------------------------------------------------------------------------- 1 | # The name of this experiment. 2 | name=$2 3 | 4 | # Save logs and models under snap/gqa; make backup. 5 | output=snap/gqa/$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # See Readme.md for option details. 11 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 12 | python src/tasks/gqa.py \ 13 | --train train --valid "" \ 14 | --llayers 5 --xlayers 2 --rlayers 5 --outputAttn --skipConnection \ 15 | --tqdm --output $output ${@:3} 16 | -------------------------------------------------------------------------------- /run/gqa_finetune_caps.bash: -------------------------------------------------------------------------------- 1 | # The name of this experiment. 2 | name=$2 3 | 4 | # Save logs and models under snap/gqa; make backup. 5 | output=snap/gqa/$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # See Readme.md for option details. 
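# Usage sketch (inferred from the variables above; not an official command line):
#   bash run/gqa_finetune_caps.bash <gpu_ids> <experiment_name> [extra args]
# $1 sets CUDA_VISIBLE_DEVICES, $2 names the output directory snap/gqa/$name,
# and anything after the first two arguments is forwarded to gqa.py via ${@:3}.
# --loadLXMERT below points to an existing stage-2 pretraining snapshot.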
11 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 12 | python src/tasks/gqa.py \ 13 | --train train --valid valid \ 14 | --llayers 5 --xlayers 2 --rlayers 5 --NUM_PRIM_CAPS 32 --NUM_VIS_CAPS 32 --skipConnection --crossAttn\ 15 | --loadLXMERT snap/pretrain/mm_capsules_pretrain_552_stage2_fixed_continued/BEST_EVAL_LOSS \ 16 | --batchSize 32 --optim bert --lr 1e-5 --epochs 20\ 17 | --tqdm --output $output ${@:3} 18 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 aukhan 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/tasks/gqa_model.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyleft 2019 project LXRT. 3 | 4 | import torch.nn as nn 5 | 6 | from src.param import args 7 | from src.lxrt.entry import LXRTEncoder 8 | from src.lxrt.modeling_capsbert import BertLayerNorm, GeLU 9 | 10 | # Max length including and 11 | MAX_GQA_LENGTH = 20 12 | 13 | 14 | class GQAModel(nn.Module): 15 | def __init__(self, num_answers): 16 | super().__init__() 17 | self.lxrt_encoder = LXRTEncoder( 18 | args, 19 | max_seq_length=MAX_GQA_LENGTH 20 | ) 21 | hid_dim = self.lxrt_encoder.dim 22 | self.logit_fc = nn.Sequential( 23 | nn.Linear(hid_dim, hid_dim * 2), 24 | GeLU(), 25 | BertLayerNorm(hid_dim * 2, eps=1e-12), 26 | nn.Linear(hid_dim * 2, num_answers) 27 | ) 28 | self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights) 29 | self.args = args 30 | 31 | def forward(self, feat, pos, sent): 32 | """ 33 | b -- batch_size, o -- object_number, f -- visual_feature_size 34 | 35 | :param feat: (b, o, f) 36 | :param pos: (b, o, 4) 37 | :param sent: (b,) Type -- list of string 38 | :param leng: (b,) Type -- int numpy array 39 | :return: (b, num_answer) The logit of each answers. 40 | """ 41 | 42 | _, x, attn_probs = self.lxrt_encoder(sent, (feat, pos)) 43 | logit = self.logit_fc(x) 44 | 45 | return logit, attn_probs 46 | 47 | 48 | -------------------------------------------------------------------------------- /src/tasks/refcocoplus_model.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyleft 2019 project LXRT. 
3 | 4 | import torch.nn as nn 5 | 6 | from src.param import args 7 | from src.lxrt.entry import LXRTEncoder 8 | from src.lxrt.modeling_capsbert import BertLayerNorm, GeLU 9 | 10 | # Max length including and 11 | MAX_GQA_LENGTH = 20 12 | 13 | 14 | class RefCOCOplusModel(nn.Module): 15 | def __init__(self): 16 | super().__init__() 17 | self.lxrt_encoder = LXRTEncoder( 18 | args, 19 | max_seq_length=MAX_GQA_LENGTH 20 | ) 21 | # hid_dim = self.lxrt_encoder.dim 22 | # self.logit_fc = nn.Sequential( 23 | # nn.Linear(hid_dim, hid_dim * 2), 24 | # GeLU(), 25 | # BertLayerNorm(hid_dim * 2, eps=1e-12), 26 | # nn.Linear(hid_dim * 2, num_answers) 27 | # ) 28 | # self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights) 29 | self.args = args 30 | 31 | def forward(self, feat, pos, sent): 32 | """ 33 | b -- batch_size, o -- object_number, f -- visual_feature_size 34 | 35 | :param feat: (b, o, f) 36 | :param pos: (b, o, 4) 37 | :param sent: (b,) Type -- list of string 38 | :param leng: (b,) Type -- int numpy array 39 | :return: (b, num_answer) The logit of each answers. 40 | """ 41 | 42 | x, attn_probs = self.lxrt_encoder(sent, (feat, pos)) 43 | # logit = self.logit_fc(x) 44 | 45 | return attn_probs 46 | 47 | 48 | -------------------------------------------------------------------------------- /src/tasks/vqa_model.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyleft 2019 project LXRT. 3 | 4 | import torch.nn as nn 5 | 6 | from param import args 7 | from lxrt.entry import LXRTEncoder 8 | from lxrt.modeling import BertLayerNorm, GeLU 9 | 10 | # Max length including and 11 | MAX_VQA_LENGTH = 20 12 | 13 | 14 | class VQAModel(nn.Module): 15 | def __init__(self, num_answers): 16 | super().__init__() 17 | 18 | # Build LXRT encoder 19 | self.lxrt_encoder = LXRTEncoder( 20 | args, 21 | max_seq_length=MAX_VQA_LENGTH 22 | ) 23 | hid_dim = self.lxrt_encoder.dim 24 | 25 | # VQA Answer heads 26 | self.logit_fc = nn.Sequential( 27 | nn.Linear(hid_dim, hid_dim * 2), 28 | GeLU(), 29 | BertLayerNorm(hid_dim * 2, eps=1e-12), 30 | nn.Linear(hid_dim * 2, num_answers) 31 | ) 32 | self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights) 33 | 34 | def forward(self, feat, pos, sent): 35 | """ 36 | b -- batch_size, o -- object_number, f -- visual_feature_size 37 | 38 | :param feat: (b, o, f) 39 | :param pos: (b, o, 4) 40 | :param sent: (b,) Type -- list of string 41 | :param leng: (b,) Type -- int numpy array 42 | :return: (b, num_answer) The logit of each answers. 43 | """ 44 | x = self.lxrt_encoder(sent, (feat, pos)) 45 | logit = self.logit_fc(x) 46 | 47 | return logit 48 | 49 | 50 | -------------------------------------------------------------------------------- /src/tasks/mscoco_retrieval_model.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyleft 2019 project LXRT. 
3 | 4 | import torch.nn as nn 5 | 6 | from src.param import args 7 | from src.lxrt.entry import LXRTEncoder 8 | from src.lxrt.modeling_capsbert import BertLayerNorm, GeLU 9 | 10 | # Max length including and 11 | MAX_GQA_LENGTH = 20 12 | 13 | 14 | class MSCOCOModel(nn.Module): 15 | def __init__(self): 16 | super().__init__() 17 | self.lxrt_encoder = LXRTEncoder( 18 | args, 19 | max_seq_length=MAX_GQA_LENGTH, 20 | mode='lxr' 21 | ) 22 | # hid_dim = self.lxrt_encoder.dim 23 | # self.logit_fc = nn.Sequential( 24 | # nn.Linear(hid_dim, hid_dim * 2), 25 | # GeLU(), 26 | # BertLayerNorm(hid_dim * 2, eps=1e-12), 27 | # nn.Linear(hid_dim * 2, num_answers) 28 | # ) 29 | # self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights) 30 | self.args = args 31 | 32 | def forward(self, feat, pos, sent): 33 | """ 34 | b -- batch_size, o -- object_number, f -- visual_feature_size 35 | 36 | :param feat: (b, o, f) 37 | :param pos: (b, o, 4) 38 | :param sent: (b,) Type -- list of string 39 | :param leng: (b,) Type -- int numpy array 40 | :return: (b, num_answer) The logit of each answers. 41 | """ 42 | 43 | feats, x, attn_probs = self.lxrt_encoder(sent, (feat, pos)) 44 | # logit = self.logit_fc(x) 45 | 46 | return feats, x, attn_probs 47 | 48 | 49 | -------------------------------------------------------------------------------- /src/lxrt/PositionalEncoding.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class FixedPositionalEncoding(nn.Module): 6 | def __init__(self, embedding_dim, max_length=5000): 7 | super(FixedPositionalEncoding, self).__init__() 8 | 9 | pe = torch.zeros(max_length, embedding_dim) 10 | position = torch.arange(0, max_length, dtype=torch.float).unsqueeze(1) 11 | div_term = torch.exp( 12 | torch.arange(0, embedding_dim, 2).float() 13 | * (-torch.log(torch.tensor(10000.0)) / embedding_dim) 14 | ) 15 | pe[:, 0::2] = torch.sin(position * div_term) 16 | pe[:, 1::2] = torch.cos(position * div_term) 17 | pe = pe.unsqueeze(0).transpose(0, 1) 18 | self.register_buffer('pe', pe) 19 | 20 | def forward(self, x): 21 | x = x + self.pe[: x.size(0), :] 22 | return x 23 | 24 | 25 | class LearnedPositionalEncoding(nn.Module): 26 | def __init__(self, max_position_embeddings, embedding_dim, seq_length): 27 | super(LearnedPositionalEncoding, self).__init__() 28 | self.pe = nn.Embedding(max_position_embeddings, embedding_dim) 29 | self.seq_length = seq_length 30 | 31 | self.register_buffer( 32 | "position_ids", 33 | torch.arange(max_position_embeddings).expand((1, -1)), 34 | ) 35 | 36 | def forward(self, x, position_ids=None): 37 | if position_ids is None: 38 | position_ids = self.position_ids[:, : self.seq_length] 39 | 40 | position_embeddings = self.pe(position_ids) 41 | return x + position_embeddings 42 | -------------------------------------------------------------------------------- /run/pretrain_2stage_fulldata_no_init_self.bash: -------------------------------------------------------------------------------- 1 | # The name of experiment 2 | name=lxmert_pretrain_itm_mlm_qa_552_grid_x_self_allqa_no_init_stage_1 3 | 4 | # Create dirs and make backup 5 | output=snap/pretrain/$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # Pre-training 11 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 12 | python src/pretrain/lxmert_pretrain.py \ 13 | --taskMatched --taskMaskLM --taskQA \ 14 | --train mscoco_train,vgnococo --valid mscoco_minival \ 15 | --llayers 5 --rlayers 
5 --xlayers 0\ 16 | --fromScratch --skipConnection --crossAttnType no_cross\ 17 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 20 \ 18 | --tqdm --output $output ${@:2} 19 | 20 | 21 | 22 | # The name of experiment 23 | name=lxmert_pretrain_itm_mlm_qa_552_grid_x_self_allqa_no_init_stage_2 24 | 25 | # Create dirs and make backup 26 | output=snap/pretrain/$name 27 | mkdir -p $output/src 28 | cp -r src/* $output/src/ 29 | cp $0 $output/run.bash 30 | 31 | # Pre-training 32 | #batch size reduced due to additional cross attn layer (large model) 33 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 34 | python src/pretrain/lxmert_pretrain.py \ 35 | --taskMatched --taskMaskLM --taskQA \ 36 | --train mscoco_train,vgnococo --valid mscoco_minival \ 37 | --llayers 5 --rlayers 5 --xlayers 2\ 38 | --loadLXMERT snap/pretrain/lxmert_pretrain_itm_mlm_qa_552_grid_x_self_allqa_no_init_stage_1/BEST_EVAL_LOSS \ 39 | --skipConnection --crossAttn --crossAttnType self --freezeWeights\ 40 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 15 \ 41 | --tqdm --output $output ${@:2} 42 | 43 | 44 | -------------------------------------------------------------------------------- /run/pretrain_2stage_fulldata_no_init_patches.bash: -------------------------------------------------------------------------------- 1 | # The name of experiment 2 | name=lxmert_pretrain_itm_mlm_qa_552_patches_x_allqa_no_init_stage_1 3 | 4 | # Create dirs and make backup 5 | output=snap/pretrain/$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # Pre-training 11 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 12 | python src/pretrain/lxmert_pretrain.py \ 13 | --taskMatched --taskMaskLM --taskQA \ 14 | --train mscoco_train,vgnococo --valid mscoco_minival \ 15 | --llayers 5 --rlayers 5 --xlayers 0\ 16 | --fromScratch --skipConnection --crossAttnType no_cross\ 17 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 20 \ 18 | --patches --tqdm --output $output ${@:2} 19 | 20 | 21 | 22 | # The name of experiment 23 | name=lxmert_pretrain_itm_mlm_qa_552_patches_x_allqa_no_init_stage_2 24 | 25 | # Create dirs and make backup 26 | output=snap/pretrain/$name 27 | mkdir -p $output/src 28 | cp -r src/* $output/src/ 29 | cp $0 $output/run.bash 30 | 31 | # Pre-training 32 | #batch size reduced due to additional cross attn layer (large model) 33 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 34 | python src/pretrain/lxmert_pretrain.py \ 35 | --taskMatched --taskMaskLM --taskQA \ 36 | --train mscoco_train,vgnococo --valid mscoco_minival \ 37 | --llayers 5 --rlayers 5 --xlayers 2\ 38 | --loadLXMERT snap/pretrain/lxmert_pretrain_itm_mlm_qa_552_patches_x_allqa_no_init_stage_1/BEST_EVAL_LOSS \ 39 | --skipConnection --crossAttn --crossAttnType cross --freezeWeights\ 40 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 15 \ 41 | --patches --tqdm --output $output ${@:2} 42 | 43 | 44 | -------------------------------------------------------------------------------- /run/pretrain_2stage_fulldata_no_init_selfcross.bash: -------------------------------------------------------------------------------- 1 | # The name of experiment 2 | name=lxmert_pretrain_itm_mlm_qa_552_grid_x_selfcross_allqa_no_init_stage_1 3 | 4 | # Create dirs and make backup 5 | output=snap/pretrain/$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # Pre-training 11 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 12 | python src/pretrain/lxmert_pretrain.py \ 13 | --taskMatched
--taskMaskLM --taskQA \ 14 | --train mscoco_train,vgnococo --valid mscoco_minival \ 15 | --llayers 5 --rlayers 5 --xlayers 0\ 16 | --fromScratch --skipConnection --crossAttnType no_cross\ 17 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 20 \ 18 | --tqdm --output $output ${@:2} 19 | 20 | 21 | 22 | # The name of experiment 23 | name=lxmert_pretrain_itm_mlm_qa_552_grid_x_selfcross_allqa_no_init_stage_2 24 | 25 | # Create dirs and make backup 26 | output=snap/pretrain/$name 27 | mkdir -p $output/src 28 | cp -r src/* $output/src/ 29 | cp $0 $output/run.bash 30 | 31 | # Pre-training 32 | #batch size reduced due to additional cross attn layer (large model) 33 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 34 | python src/pretrain/lxmert_pretrain.py \ 35 | --taskMatched --taskMaskLM --taskQA \ 36 | --train mscoco_train,vgnococo --valid mscoco_minival \ 37 | --llayers 5 --rlayers 5 --xlayers 2 \ 38 | --loadLXMERT snap/pretrain/lxmert_pretrain_itm_mlm_qa_552_grid_x_selfcross_allqa_no_init_stage_1/BEST_EVAL_LOSS \ 39 | --skipConnection --crossAttn --crossAttnType cross_self --freezeWeights\ 40 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 15 \ 41 | --tqdm --output $output ${@:2} 42 | 43 | 44 | -------------------------------------------------------------------------------- /run/pretrain_2stage_fulldata_no_init_self_patches.bash: -------------------------------------------------------------------------------- 1 | # The name of experiment 2 | name=lxmert_pretrain_itm_mlm_qa_552_patches_x_self_allqa_no_init_stage_1 3 | 4 | # Create dirs and make backup 5 | output=snap/pretrain/$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # Pre-training 11 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 12 | python src/pretrain/lxmert_pretrain.py \ 13 | --taskMatched --taskMaskLM --taskQA \ 14 | --train mscoco_train,vgnococo --valid mscoco_minival \ 15 | --llayers 5 --rlayers 5 --xlayers 0\ 16 | --fromScratch --skipConnection --crossAttnType no_cross\ 17 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 20 \ 18 | --patches --tqdm --output $output ${@:2} 19 | 20 | 21 | 22 | # The name of experiment 23 | name=lxmert_pretrain_itm_mlm_qa_552_patches_x_self_allqa_no_init_stage_2 24 | 25 | # Create dirs and make backup 26 | output=snap/pretrain/$name 27 | mkdir -p $output/src 28 | cp -r src/* $output/src/ 29 | cp $0 $output/run.bash 30 | 31 | # Pre-training 32 | #batch size reduced due to additional cross attn layer (large model) 33 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 34 | python src/pretrain/lxmert_pretrain.py \ 35 | --taskMatched --taskMaskLM --taskQA \ 36 | --train mscoco_train,vgnococo --valid mscoco_minival \ 37 | --llayers 5 --rlayers 5 --xlayers 2\ 38 | --loadLXMERT snap/pretrain/lxmert_pretrain_itm_mlm_qa_552_patches_x_self_allqa_no_init_stage_1/BEST_EVAL_LOSS \ 39 | --skipConnection --crossAttn --crossAttnType self --freezeWeights\ 40 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 15 \ 41 | --patches --tqdm --output $output ${@:2} 42 | 43 | 44 | -------------------------------------------------------------------------------- /run/2stage_fulldata_no_init_16_caps.bash: -------------------------------------------------------------------------------- 1 | # The name of experiment 2 | name=16caps_itm_mlm_qa_552_grid_x_allqa_no_init_stage_1 3 | 4 | # Create dirs and make backup 5 | output=snap/pretrain/$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # Pre-training 11 | 
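# The sibling no-init capsule scripts (16/24/48/64 caps) appear to differ only in the
# --NUM_PRIM_CAPS / --NUM_VIS_CAPS values passed below; the two-stage recipe is otherwise identical.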
CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 12 | python src/pretrain/lxmert_pretrain.py \ 13 | --taskMatched --taskMaskLM --taskQA \ 14 | --train mscoco_train,vgnococo --valid mscoco_minival \ 15 | --llayers 5 --rlayers 5 --xlayers 0\ 16 | --fromScratch --skipConnection --crossAttnType no_cross\ 17 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 20 \ 18 | --NUM_PRIM_CAPS 16 --NUM_VIS_CAPS 16 \ 19 | --tqdm --output $output ${@:2} 20 | 21 | 22 | 23 | # The name of experiment 24 | name=16caps_itm_mlm_qa_552_grid_x_allqa_no_init_stage_2 25 | 26 | # Create dirs and make backup 27 | output=snap/pretrain/$name 28 | mkdir -p $output/src 29 | cp -r src/* $output/src/ 30 | cp $0 $output/run.bash 31 | 32 | # Pre-training 33 | #batch size reduced due to additional cross attn layer (large model) 34 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 35 | python src/pretrain/lxmert_pretrain.py \ 36 | --taskMatched --taskMaskLM --taskQA \ 37 | --train mscoco_train,vgnococo --valid mscoco_minival \ 38 | --llayers 5 --rlayers 5 --xlayers 2 \ 39 | --loadLXMERT snap/pretrain/16caps_itm_mlm_qa_552_grid_x_allqa_no_init_stage_1/BEST_EVAL_LOSS \ 40 | --skipConnection --crossAttn --crossAttnType cross --freezeWeights\ 41 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 15 \ 42 | --NUM_PRIM_CAPS 16 --NUM_VIS_CAPS 16 \ 43 | --tqdm --output $output ${@:2} 44 | 45 | 46 | -------------------------------------------------------------------------------- /run/2stage_fulldata_no_init_24_caps.bash: -------------------------------------------------------------------------------- 1 | # The name of experiment 2 | name=24caps_itm_mlm_qa_552_grid_x_allqa_no_init_stage_1 3 | 4 | # Create dirs and make backup 5 | output=snap/pretrain/$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # Pre-training 11 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 12 | python src/pretrain/lxmert_pretrain.py \ 13 | --taskMatched --taskMaskLM --taskQA \ 14 | --train mscoco_train,vgnococo --valid mscoco_minival \ 15 | --llayers 5 --rlayers 5 --xlayers 0\ 16 | --fromScratch --skipConnection --crossAttnType no_cross\ 17 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 20 \ 18 | --NUM_PRIM_CAPS 24 --NUM_VIS_CAPS 24 \ 19 | --tqdm --output $output ${@:2} 20 | 21 | 22 | 23 | # The name of experiment 24 | name=24caps_itm_mlm_qa_552_grid_x_allqa_no_init_stage_2 25 | 26 | # Create dirs and make backup 27 | output=snap/pretrain/$name 28 | mkdir -p $output/src 29 | cp -r src/* $output/src/ 30 | cp $0 $output/run.bash 31 | 32 | # Pre-training 33 | #batch size reduced due to additional cross attn layer (large model) 34 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 35 | python src/pretrain/lxmert_pretrain.py \ 36 | --taskMatched --taskMaskLM --taskQA \ 37 | --train mscoco_train,vgnococo --valid mscoco_minival \ 38 | --llayers 5 --rlayers 5 --xlayers 2 \ 39 | --loadLXMERT snap/pretrain/24caps_itm_mlm_qa_552_grid_x_allqa_no_init_stage_1/BEST_EVAL_LOSS \ 40 | --skipConnection --crossAttn --crossAttnType cross --freezeWeights\ 41 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 15 \ 42 | --NUM_PRIM_CAPS 24 --NUM_VIS_CAPS 24 \ 43 | --tqdm --output $output ${@:2} 44 | 45 | 46 | -------------------------------------------------------------------------------- /run/pretrain_2stage_fulldata_no_init_64_caps.bash: -------------------------------------------------------------------------------- 1 | # The name of experiment 2 | name=64caps_itm_mlm_qa_552_grid_x_allqa_no_init_stage_1 3 | 
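# Stage 1 below pretrains the language and vision streams with no cross-modal layers (--xlayers 0);
# stage 2 adds two cross-attention layers (--xlayers 2) and warm-starts from the stage-1
# checkpoint via --loadLXMERT, with --freezeWeights presumably keeping the stage-1 weights fixed.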
4 | # Create dirs and make backup 5 | output=snap/pretrain/$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # Pre-training 11 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 12 | python src/pretrain/lxmert_pretrain.py \ 13 | --taskMatched --taskMaskLM --taskQA \ 14 | --train mscoco_train,vgnococo --valid mscoco_minival \ 15 | --llayers 5 --rlayers 5 --xlayers 0\ 16 | --fromScratch --skipConnection --crossAttnType no_cross\ 17 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 20 \ 18 | --NUM_PRIM_CAPS 64 --NUM_VIS_CAPS 64 \ 19 | --tqdm --output $output ${@:2} 20 | 21 | 22 | 23 | # The name of experiment 24 | name=64caps_itm_mlm_qa_552_grid_x_allqa_no_init_stage_2 25 | 26 | # Create dirs and make backup 27 | output=snap/pretrain/$name 28 | mkdir -p $output/src 29 | cp -r src/* $output/src/ 30 | cp $0 $output/run.bash 31 | 32 | # Pre-training 33 | #batch size reduced due to additional cross attn layer (large model) 34 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 35 | python src/pretrain/lxmert_pretrain.py \ 36 | --taskMatched --taskMaskLM --taskQA \ 37 | --train mscoco_train,vgnococo --valid mscoco_minival \ 38 | --llayers 5 --rlayers 5 --xlayers 2\ 39 | --loadLXMERT snap/pretrain/64caps_itm_mlm_qa_552_grid_x_allqa_no_init_stage_1/BEST_EVAL_LOSS \ 40 | --skipConnection --crossAttn --crossAttnType cross --freezeWeights\ 41 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 15 \ 42 | --NUM_PRIM_CAPS 64 --NUM_VIS_CAPS 64 \ 43 | --tqdm --output $output ${@:2} 44 | 45 | 46 | -------------------------------------------------------------------------------- /run/pretrain_2stage_fulldata_no_init_48_caps.bash: -------------------------------------------------------------------------------- 1 | # The name of experiment 2 | name=48caps_itm_mlm_qa_552_grid_x_allqa_no_init_stage_1 3 | 4 | # Create dirs and make backup 5 | output=snap/pretrain/$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # Pre-training 11 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 12 | python src/pretrain/lxmert_pretrain.py \ 13 | --taskMatched --taskMaskLM --taskQA \ 14 | --train mscoco_train,vgnococo --valid mscoco_minival \ 15 | --llayers 5 --rlayers 5 --xlayers 0\ 16 | --fromScratch --skipConnection --crossAttnType no_cross\ 17 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 20 \ 18 | --NUM_PRIM_CAPS 48 --NUM_VIS_CAPS 48 \ 19 | --tqdm --output $output ${@:2} 20 | 21 | 22 | 23 | # The name of experiment 24 | name=48caps_itm_mlm_qa_552_grid_x_allqa_no_init_stage_2 25 | 26 | # Create dirs and make backup 27 | output=snap/pretrain/$name 28 | mkdir -p $output/src 29 | cp -r src/* $output/src/ 30 | cp $0 $output/run.bash 31 | 32 | # Pre-training 33 | #batch size reduced due to additional cross attn layer (large model) 34 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 35 | python src/pretrain/lxmert_pretrain.py \ 36 | --taskMatched --taskMaskLM --taskQA \ 37 | --train mscoco_train,vgnococo --valid mscoco_minival \ 38 | --llayers 5 --rlayers 5 --xlayers 2 \ 39 | --loadLXMERT snap/pretrain/48caps_itm_mlm_qa_552_grid_x_allqa_no_init_stage_1/BEST_EVAL_LOSS \ 40 | --skipConnection --crossAttn --crossAttnType cross --freezeWeights\ 41 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 15 \ 42 | --NUM_PRIM_CAPS 48 --NUM_VIS_CAPS 48 \ 43 | --tqdm --output $output ${@:2} 44 | 45 | 46 | -------------------------------------------------------------------------------- 
/run/pretrain_2stage_fulldata_vit_bert_init.bash: -------------------------------------------------------------------------------- 1 | # The name of experiment 2 | name=lxmert_pretrain_itm_mlm_qa_552_grid_x_vit_bert_wts_allqa_stage_1 3 | 4 | # Create dirs and make backup 5 | output=snap/pretrain/$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # Pre-training 11 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 12 | python src/pretrain/lxmert_pretrain.py \ 13 | --taskMatched --taskMaskLM --taskQA \ 14 | --train mscoco_train,vgnococo --valid mscoco_minival \ 15 | --llayers 5 --rlayers 5 --xlayers 0\ 16 | --skipConnection --crossAttnType no_cross\ 17 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 10 \ 18 | --vitInit --startIndex 7 \ 19 | --tqdm --output $output ${@:2} 20 | 21 | 22 | 23 | # The name of experiment 24 | name=lxmert_pretrain_itm_mlm_qa_552_grid_x_vit_bert_wts_allqa_stage_2 25 | 26 | # Create dirs and make backup 27 | output=snap/pretrain/$name 28 | mkdir -p $output/src 29 | cp -r src/* $output/src/ 30 | cp $0 $output/run.bash 31 | 32 | # Pre-training 33 | #batch size reduced due to additional cross attn layer (large model) 34 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 35 | python src/pretrain/lxmert_pretrain.py \ 36 | --taskMatched --taskMaskLM --taskQA \ 37 | --train mscoco_train,vgnococo --valid mscoco_minival \ 38 | --llayers 5 --rlayers 5 --xlayers 2\ 39 | --loadLXMERT snap/pretrain/lxmert_pretrain_itm_mlm_qa_552_grid_x_vit_bert_wts_allqa_stage_1/BEST_EVAL_LOSS \ 40 | --skipConnection --crossAttn --crossAttnType cross --freezeWeights\ 41 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 10 \ 42 | --vitInit --startIndex 7 \ 43 | --tqdm --output $output ${@:2} 44 | 45 | 46 | -------------------------------------------------------------------------------- /run/pretrain_2stage_fulldata_no_init_selfcross_patches.bash: -------------------------------------------------------------------------------- 1 | # The name of experiment 2 | name=lxmert_pretrain_itm_mlm_qa_552_patches_x_selfcross_allqa_no_init_stage_1 3 | 4 | # Create dirs and make backup 5 | output=snap/pretrain/$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # Pre-training 11 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 12 | python src/pretrain/lxmert_pretrain.py \ 13 | --taskMatched --taskMaskLM --taskQA \ 14 | --train mscoco_train,vgnococo --valid mscoco_minival \ 15 | --llayers 5 --rlayers 5 --xlayers 0\ 16 | --fromScratch --skipConnection --crossAttnType no_cross\ 17 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 20 \ 18 | --patches --tqdm --output $output ${@:2} 19 | 20 | 21 | 22 | # The name of experiment 23 | name=lxmert_pretrain_itm_mlm_qa_552_patches_x_selfcross_allqa_no_init_stage_2 24 | 25 | # Create dirs and make backup 26 | output=snap/pretrain/$name 27 | mkdir -p $output/src 28 | cp -r src/* $output/src/ 29 | cp $0 $output/run.bash 30 | 31 | # Pre-training 32 | #batch size reduced due to additional cross attn layer (large model) 33 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 34 | python src/pretrain/lxmert_pretrain.py \ 35 | --taskMatched --taskMaskLM --taskQA \ 36 | --train mscoco_train,vgnococo --valid mscoco_minival \ 37 | --llayers 5 --rlayers 5 --xlayers 2 \ 38 | --loadLXMERT snap/pretrain/lxmert_pretrain_itm_mlm_qa_552_patches_x_selfcross_allqa_no_init_stage_1/BEST_EVAL_LOSS \ 39 | --skipConnection --crossAttn --crossAttnType cross_self --freezeWeights\ 40 
| --batchSize 1024 --optim bert --lr 1e-4 --epochs 15 \ 41 | --patches --tqdm --output $output ${@:2} 42 | 43 | 44 | -------------------------------------------------------------------------------- /run/pretrain_2stage_fulldata_vit_bert_init_self.bash: -------------------------------------------------------------------------------- 1 | # The name of experiment 2 | name=lxmert_pretrain_itm_mlm_qa_552_grid_x_self_vit7_bert_wts_allqa_stage_1 3 | 4 | # Create dirs and make backup 5 | output=snap/pretrain/$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # Pre-training 11 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 12 | python src/pretrain/lxmert_pretrain.py \ 13 | --taskMatched --taskMaskLM --taskQA \ 14 | --train mscoco_train,vgnococo --valid mscoco_minival \ 15 | --llayers 5 --rlayers 5 --xlayers 0\ 16 | --skipConnection --crossAttnType no_cross\ 17 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 10 \ 18 | --vitInit --startIndex 7 \ 19 | --tqdm --output $output ${@:2} 20 | 21 | 22 | 23 | # The name of experiment 24 | name=lxmert_pretrain_itm_mlm_qa_552_grid_x_self_vit7_bert_wts_allqa_stage_2 25 | 26 | # Create dirs and make backup 27 | output=snap/pretrain/$name 28 | mkdir -p $output/src 29 | cp -r src/* $output/src/ 30 | cp $0 $output/run.bash 31 | 32 | # Pre-training 33 | #batch size reduced due to additional cross attn layer (large model) 34 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 35 | python src/pretrain/lxmert_pretrain.py \ 36 | --taskMatched --taskMaskLM --taskQA \ 37 | --train mscoco_train,vgnococo --valid mscoco_minival \ 38 | --llayers 5 --rlayers 5 --xlayers 2\ 39 | --loadLXMERT snap/pretrain/lxmert_pretrain_itm_mlm_qa_552_grid_x_self_vit7_bert_wts_allqa_stage_1/BEST_EVAL_LOSS \ 40 | --skipConnection --crossAttn --crossAttnType self --freezeWeights\ 41 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 10 \ 42 | --vitInit --startIndex 7 \ 43 | --tqdm --output $output ${@:2} 44 | 45 | 46 | -------------------------------------------------------------------------------- /run/pretrain_2stage_fulldata_vit_no_bert_init.bash: -------------------------------------------------------------------------------- 1 | # The name of experiment 2 | name=lxmert_pretrain_itm_mlm_qa_552_grid_x_allqa_vit7_no_bert_init_stage_1 3 | 4 | # Create dirs and make backup 5 | output=snap/pretrain/$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # Pre-training 11 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 12 | python src/pretrain/lxmert_pretrain.py \ 13 | --taskMatched --taskMaskLM --taskQA \ 14 | --train mscoco_train,vgnococo --valid mscoco_minival \ 15 | --llayers 5 --rlayers 5 --xlayers 0\ 16 | --fromScratch --skipConnection --crossAttnType no_cross\ 17 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 10 \ 18 | --vitInit --startIndex 7 \ 19 | --tqdm --output $output ${@:2} 20 | 21 | 22 | 23 | # The name of experiment 24 | name=lxmert_pretrain_itm_mlm_qa_552_grid_x_allqa_vit7_no_bert_init_stage_2 25 | 26 | # Create dirs and make backup 27 | output=snap/pretrain/$name 28 | mkdir -p $output/src 29 | cp -r src/* $output/src/ 30 | cp $0 $output/run.bash 31 | 32 | # Pre-training 33 | #batch size reduced due to additional cross attn layer (large model) 34 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 35 | python src/pretrain/lxmert_pretrain.py \ 36 | --taskMatched --taskMaskLM --taskQA \ 37 | --train mscoco_train,vgnococo --valid mscoco_minival \ 38 | --llayers 5 
--rlayers 5 --xlayers 2\ 39 | --loadLXMERT snap/pretrain/lxmert_pretrain_itm_mlm_qa_552_grid_x_allqa_vit7_no_bert_init_stage_1/BEST_EVAL_LOSS \ 40 | --skipConnection --crossAttn --crossAttnType cross --freezeWeights\ 41 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 10 \ 42 | --vitInit --startIndex 7 \ 43 | --tqdm --output $output ${@:2} 44 | 45 | 46 | -------------------------------------------------------------------------------- /run/pretrain_2stage_fulldata_vit_bert_init_cross_patches.bash: -------------------------------------------------------------------------------- 1 | # The name of experiment 2 | name=lxmert_pretrain_itm_mlm_qa_552_patches_x_vit7_bert_wts_allqa_stage_1 3 | 4 | # Create dirs and make backup 5 | output=snap/pretrain/$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # Pre-training 11 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 12 | python src/pretrain/lxmert_pretrain.py \ 13 | --taskMatched --taskMaskLM --taskQA \ 14 | --train mscoco_train,vgnococo --valid mscoco_minival \ 15 | --llayers 5 --rlayers 5 --xlayers 0\ 16 | --skipConnection --crossAttnType no_cross\ 17 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 10 \ 18 | --vitInit --startIndex 7 \ 19 | --patches --tqdm --output $output ${@:2} 20 | 21 | 22 | 23 | # The name of experiment 24 | name=lxmert_pretrain_itm_mlm_qa_552_patches_x_vit7_bert_wts_allqa_stage_2 25 | 26 | # Create dirs and make backup 27 | output=snap/pretrain/$name 28 | mkdir -p $output/src 29 | cp -r src/* $output/src/ 30 | cp $0 $output/run.bash 31 | 32 | # Pre-training 33 | #batch size reduced due to additional cross attn layer (large model) 34 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 35 | python src/pretrain/lxmert_pretrain.py \ 36 | --taskMatched --taskMaskLM --taskQA \ 37 | --train mscoco_train,vgnococo --valid mscoco_minival \ 38 | --llayers 5 --rlayers 5 --xlayers 2 \ 39 | --loadLXMERT snap/pretrain/lxmert_pretrain_itm_mlm_qa_552_patches_x_vit7_bert_wts_allqa_stage_1/BEST_EVAL_LOSS \ 40 | --skipConnection --crossAttn --crossAttnType cross --freezeWeights\ 41 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 10 \ 42 | --vitInit --startIndex 7 \ 43 | --patches --tqdm --output $output ${@:2} 44 | 45 | 46 | -------------------------------------------------------------------------------- /run/pretrain_2stage_fulldata_vit_bert_init_selfcross.bash: -------------------------------------------------------------------------------- 1 | # The name of experiment 2 | name=lxmert_pretrain_itm_mlm_qa_552_grid_x_selfcross_vit7_bert_wts_allqa_stage_1 3 | 4 | # Create dirs and make backup 5 | output=snap/pretrain/$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # Pre-training 11 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 12 | python src/pretrain/lxmert_pretrain.py \ 13 | --taskMatched --taskMaskLM --taskQA \ 14 | --train mscoco_train,vgnococo --valid mscoco_minival \ 15 | --llayers 5 --rlayers 5 --xlayers 0\ 16 | --skipConnection --crossAttnType no_cross\ 17 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 10 \ 18 | --vitInit --startIndex 7 \ 19 | --tqdm --output $output ${@:2} 20 | 21 | 22 | 23 | # The name of experiment 24 | name=lxmert_pretrain_itm_mlm_qa_552_grid_x_selfcross_vit0_bert_wts_allqa_stage_2 25 | 26 | # Create dirs and make backup 27 | output=snap/pretrain/$name 28 | mkdir -p $output/src 29 | cp -r src/* $output/src/ 30 | cp $0 $output/run.bash 31 | 32 | # Pre-training 33 | #batch size reduced due 
to additional cross attn layer (large model) 34 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 35 | python src/pretrain/lxmert_pretrain.py \ 36 | --taskMatched --taskMaskLM --taskQA \ 37 | --train mscoco_train,vgnococo --valid mscoco_minival \ 38 | --llayers 5 --rlayers 5 --xlayers 2\ 39 | --loadLXMERT snap/pretrain/lxmert_pretrain_itm_mlm_qa_552_grid_x_selfcross_vit7_bert_wts_allqa_stage_1/BEST_EVAL_LOSS \ 40 | --skipConnection --crossAttn --crossAttnType cross_self --freezeWeights\ 41 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 10 \ 42 | --vitInit --startIndex 7 \ 43 | --tqdm --output $output ${@:2} 44 | 45 | 46 | -------------------------------------------------------------------------------- /run/pretrain_2stage_fulldata_vit_no_bert_init_self.bash: -------------------------------------------------------------------------------- 1 | # The name of experiment 2 | name=lxmert_pretrain_itm_mlm_qa_552_grid_x_self_allqa_vit7_no_bert_init_stage_1 3 | 4 | # Create dirs and make backup 5 | output=snap/pretrain/$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # Pre-training 11 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 12 | python src/pretrain/lxmert_pretrain.py \ 13 | --taskMatched --taskMaskLM --taskQA \ 14 | --train mscoco_train,vgnococo --valid mscoco_minival \ 15 | --llayers 5 --rlayers 5 --xlayers 0\ 16 | --fromScratch --skipConnection --crossAttnType no_cross\ 17 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 10 \ 18 | --vitInit --startIndex 7 \ 19 | --tqdm --output $output ${@:2} 20 | 21 | 22 | 23 | # The name of experiment 24 | name=lxmert_pretrain_itm_mlm_qa_552_grid_x_self_allqa_vit7_no_bert_init_stage_2 25 | 26 | # Create dirs and make backup 27 | output=snap/pretrain/$name 28 | mkdir -p $output/src 29 | cp -r src/* $output/src/ 30 | cp $0 $output/run.bash 31 | 32 | # Pre-training 33 | #batch size reduced due to additional cross attn layer (large model) 34 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 35 | python src/pretrain/lxmert_pretrain.py \ 36 | --taskMatched --taskMaskLM --taskQA \ 37 | --train mscoco_train,vgnococo --valid mscoco_minival \ 38 | --llayers 5 --rlayers 5 --xlayers 2\ 39 | --loadLXMERT snap/pretrain/lxmert_pretrain_itm_mlm_qa_552_grid_x_self_allqa_vit7_no_bert_init_stage_1/BEST_EVAL_LOSS \ 40 | --skipConnection --crossAttn --crossAttnType self --freezeWeights\ 41 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 10 \ 42 | --vitInit --startIndex 7 \ 43 | --tqdm --output $output ${@:2} 44 | 45 | 46 | -------------------------------------------------------------------------------- /run/pretrain_2stage_fulldata_vit_bert_16_caps.bash: -------------------------------------------------------------------------------- 1 | # The name of experiment 2 | name=16caps_itm_mlm_qa_552_grid_x_allqa_vit_bert_stage_1 3 | 4 | # Create dirs and make backup 5 | output=snap/pretrain/$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # Pre-training 11 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 12 | python src/pretrain/lxmert_pretrain.py \ 13 | --taskMatched --taskMaskLM --taskQA \ 14 | --train mscoco_train,vgnococo --valid mscoco_minival \ 15 | --llayers 5 --rlayers 5 --xlayers 0\ 16 | --skipConnection --crossAttnType no_cross\ 17 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 20 \ 18 | --NUM_PRIM_CAPS 16 --NUM_VIS_CAPS 16 \ 19 | --vitInit --startIndex 7 \ 20 | --tqdm --output $output ${@:2} 21 | 22 | 23 | 24 | # The name of experiment
25 | name=16caps_itm_mlm_qa_552_grid_x_allqa_vit_bert_stage_2 26 | 27 | # Create dirs and make backup 28 | output=snap/pretrain/$name 29 | mkdir -p $output/src 30 | cp -r src/* $output/src/ 31 | cp $0 $output/run.bash 32 | 33 | # Pre-training 34 | #batch size reduced due to additional cross attn layer (large model) 35 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 36 | python src/pretrain/lxmert_pretrain.py \ 37 | --taskMatched --taskMaskLM --taskQA \ 38 | --train mscoco_train,vgnococo --valid mscoco_minival \ 39 | --llayers 5 --rlayers 5 --xlayers 2 \ 40 | --loadLXMERT snap/pretrain/16caps_itm_mlm_qa_552_grid_x_allqa_vit_bert_stage_1/BEST_EVAL_LOSS \ 41 | --skipConnection --crossAttn --crossAttnType cross --freezeWeights\ 42 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 15 \ 43 | --NUM_PRIM_CAPS 16 --NUM_VIS_CAPS 16 \ 44 | --vitInit --startIndex 7 \ 45 | --tqdm --output $output ${@:2} 46 | 47 | 48 | -------------------------------------------------------------------------------- /run/pretrain_2stage_fulldata_vit_bert_24_caps.bash: -------------------------------------------------------------------------------- 1 | # The name of experiment 2 | name=24caps_itm_mlm_qa_552_grid_x_allqa_vit_bert_stage_1 3 | 4 | # Create dirs and make backup 5 | output=snap/pretrain/$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # Pre-training 11 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 12 | python src/pretrain/lxmert_pretrain.py \ 13 | --taskMatched --taskMaskLM --taskQA \ 14 | --train mscoco_train,vgnococo --valid mscoco_minival \ 15 | --llayers 5 --rlayers 5 --xlayers 0\ 16 | --skipConnection --crossAttnType no_cross\ 17 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 20 \ 18 | --NUM_PRIM_CAPS 24 --NUM_VIS_CAPS 24 \ 19 | --vitInit --startIndex 7 \ 20 | --tqdm --output $output ${@:2} 21 | 22 | 23 | 24 | # The name of experiment 25 | name=24caps_itm_mlm_qa_552_grid_x_allqa_vit_bert_stage_2 26 | 27 | # Create dirs and make backup 28 | output=snap/pretrain/$name 29 | mkdir -p $output/src 30 | cp -r src/* $output/src/ 31 | cp $0 $output/run.bash 32 | 33 | # Pre-training 34 | #batch size reduced due to additional cross attn layer (large model) 35 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 36 | python src/pretrain/lxmert_pretrain.py \ 37 | --taskMatched --taskMaskLM --taskQA \ 38 | --train mscoco_train,vgnococo --valid mscoco_minival \ 39 | --llayers 5 --rlayers 5 --xlayers 2 \ 40 | --loadLXMERT snap/pretrain/24caps_itm_mlm_qa_552_grid_x_allqa_vit_bert_stage_1/BEST_EVAL_LOSS \ 41 | --skipConnection --crossAttn --crossAttnType cross --freezeWeights\ 42 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 15 \ 43 | --NUM_PRIM_CAPS 24 --NUM_VIS_CAPS 24 \ 44 | --vitInit --startIndex 7 \ 45 | --tqdm --output $output ${@:2} 46 | 47 | 48 | -------------------------------------------------------------------------------- /run/pretrain_2stage_fulldata_vit_bert_init_self_patches.bash: -------------------------------------------------------------------------------- 1 | # The name of experiment 2 | name=lxmert_pretrain_itm_mlm_qa_552_patches_x_self_vit7_bert_wts_allqa_stage_1 3 | 4 | # Create dirs and make backup 5 | output=snap/pretrain/$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # Pre-training 11 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 12 | python src/pretrain/lxmert_pretrain.py \ 13 | --taskMatched --taskMaskLM --taskQA \ 14 | --train mscoco_train,vgnococo --valid 
mscoco_minival \ 15 | --llayers 5 --rlayers 5 --xlayers 0\ 16 | --skipConnection --crossAttnType no_cross\ 17 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 10 \ 18 | --vitInit --startIndex 7 \ 19 | --patches --tqdm --output $output ${@:2} 20 | 21 | 22 | 23 | # The name of experiment 24 | name=lxmert_pretrain_itm_mlm_qa_552_patches_x_self_vit7_bert_wts_allqa_stage_2 25 | 26 | # Create dirs and make backup 27 | output=snap/pretrain/$name 28 | mkdir -p $output/src 29 | cp -r src/* $output/src/ 30 | cp $0 $output/run.bash 31 | 32 | # Pre-training 33 | #batch size reduced due to additional cross attn layer (large model) 34 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 35 | python src/pretrain/lxmert_pretrain.py \ 36 | --taskMatched --taskMaskLM --taskQA \ 37 | --train mscoco_train,vgnococo --valid mscoco_minival \ 38 | --llayers 5 --rlayers 5 --xlayers 2\ 39 | --loadLXMERT snap/pretrain/lxmert_pretrain_itm_mlm_qa_552_patches_x_self_vit7_bert_wts_allqa_stage_1/BEST_EVAL_LOSS \ 40 | --skipConnection --crossAttn --crossAttnType self --freezeWeights\ 41 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 10 \ 42 | --vitInit --startIndex 7 \ 43 | --patches --tqdm --output $output ${@:2} 44 | 45 | 46 | -------------------------------------------------------------------------------- /run/pretrain_2stage_fulldata_vit_no_bert_init_patches.bash: -------------------------------------------------------------------------------- 1 | # The name of experiment 2 | name=lxmert_pretrain_itm_mlm_qa_552_patches_x_allqa_vit7_no_bert_init_stage_1 3 | 4 | # Create dirs and make backup 5 | output=snap/pretrain/$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # Pre-training 11 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 12 | python src/pretrain/lxmert_pretrain.py \ 13 | --taskMatched --taskMaskLM --taskQA \ 14 | --train mscoco_train,vgnococo --valid mscoco_minival \ 15 | --llayers 5 --rlayers 5 --xlayers 0\ 16 | --fromScratch --skipConnection --crossAttnType no_cross\ 17 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 10 \ 18 | --vitInit --startIndex 7 \ 19 | --patches --tqdm --output $output ${@:2} 20 | 21 | 22 | 23 | # The name of experiment 24 | name=lxmert_pretrain_itm_mlm_qa_552_patches_x_allqa_vit7_no_bert_init_stage_2 25 | 26 | # Create dirs and make backup 27 | output=snap/pretrain/$name 28 | mkdir -p $output/src 29 | cp -r src/* $output/src/ 30 | cp $0 $output/run.bash 31 | 32 | # Pre-training 33 | #batch size reduced due to additional cross attn layer (large model) 34 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 35 | python src/pretrain/lxmert_pretrain.py \ 36 | --taskMatched --taskMaskLM --taskQA \ 37 | --train mscoco_train,vgnococo --valid mscoco_minival \ 38 | --llayers 5 --rlayers 5 --xlayers 2\ 39 | --loadLXMERT snap/pretrain/lxmert_pretrain_itm_mlm_qa_552_patches_x_allqa_vit7_no_bert_init_stage_1/BEST_EVAL_LOSS \ 40 | --skipConnection --crossAttn --crossAttnType cross --freezeWeights\ 41 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 10 \ 42 | --vitInit --startIndex 7 \ 43 | --patches --tqdm --output $output ${@:2} 44 | 45 | 46 | -------------------------------------------------------------------------------- /run/pretrain_2stage_fulldata_vit_no_bert_init_selfcross.bash: -------------------------------------------------------------------------------- 1 | # The name of experiment 2 | name=lxmert_pretrain_itm_mlm_qa_552_grid_x_selfcross_allqa_vit7_no_bert_init_stage_1 3 | 4 | # Create dirs and make backup 5 | 
output=snap/pretrain/$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # Pre-training 11 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 12 | python src/pretrain/lxmert_pretrain.py \ 13 | --taskMatched --taskMaskLM --taskQA \ 14 | --train mscoco_train,vgnococo --valid mscoco_minival \ 15 | --llayers 5 --rlayers 5 --xlayers 0\ 16 | --fromScratch --skipConnection --crossAttnType no_cross\ 17 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 10 \ 18 | --vitInit --startIndex 7 \ 19 | --tqdm --output $output ${@:2} 20 | 21 | 22 | 23 | # The name of experiment 24 | name=lxmert_pretrain_itm_mlm_qa_552_grid_x_selfcross_allqa_vit7_no_bert_init_stage_2 25 | 26 | # Create dirs and make backup 27 | output=snap/pretrain/$name 28 | mkdir -p $output/src 29 | cp -r src/* $output/src/ 30 | cp $0 $output/run.bash 31 | 32 | # Pre-training 33 | #batch size reduced due to additional cross attn layer (large model) 34 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 35 | python src/pretrain/lxmert_pretrain.py \ 36 | --taskMatched --taskMaskLM --taskQA \ 37 | --train mscoco_train,vgnococo --valid mscoco_minival \ 38 | --llayers 5 --rlayers 5 --xlayers 2 \ 39 | --loadLXMERT snap/pretrain/lxmert_pretrain_itm_mlm_qa_552_grid_x_selfcross_allqa_vit7_no_bert_init_stage_1/BEST_EVAL_LOSS \ 40 | --skipConnection --crossAttn --crossAttnType cross_self --freezeWeights\ 41 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 10 \ 42 | --vitInit --startIndex 7 \ 43 | --tqdm --output $output ${@:2} 44 | 45 | 46 | -------------------------------------------------------------------------------- /run/pretrain_2stage_fulldata_vit_bert_init_selfcross_patches.bash: -------------------------------------------------------------------------------- 1 | # The name of experiment 2 | name=lxmert_pretrain_itm_mlm_qa_552_patches_x_selfcross_vit7_bert_wts_allqa_stage_1 3 | 4 | # Create dirs and make backup 5 | output=snap/pretrain/$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # Pre-training 11 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 12 | python src/pretrain/lxmert_pretrain.py \ 13 | --taskMatched --taskMaskLM --taskQA \ 14 | --train mscoco_train,vgnococo --valid mscoco_minival \ 15 | --llayers 5 --rlayers 5 --xlayers 0\ 16 | --skipConnection --crossAttnType no_cross\ 17 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 10 \ 18 | --vitInit --startIndex 7 \ 19 | --patches --tqdm --output $output ${@:2} 20 | 21 | 22 | 23 | # The name of experiment 24 | name=lxmert_pretrain_itm_mlm_qa_552_patches_x_selfcross_vit7_bert_wts_allqa_stage_2 25 | 26 | # Create dirs and make backup 27 | output=snap/pretrain/$name 28 | mkdir -p $output/src 29 | cp -r src/* $output/src/ 30 | cp $0 $output/run.bash 31 | 32 | # Pre-training 33 | #batch size reduced due to additional cross attn layer (large model) 34 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 35 | python src/pretrain/lxmert_pretrain.py \ 36 | --taskMatched --taskMaskLM --taskQA \ 37 | --train mscoco_train,vgnococo --valid mscoco_minival \ 38 | --llayers 5 --rlayers 5 --xlayers 2\ 39 | --loadLXMERT snap/pretrain/lxmert_pretrain_itm_mlm_qa_552_patches_x_selfcross_vit7_bert_wts_allqa_stage_1/BEST_EVAL_LOSS \ 40 | --skipConnection --crossAttn --crossAttnType cross_self --freezeWeights\ 41 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 10 \ 42 | --vitInit --startIndex 7 \ 43 | --patches --tqdm --output $output ${@:2} 44 | 45 | 46 | 
-------------------------------------------------------------------------------- /run/pretrain_2stage_fulldata_vit_no_bert_init_self_patches.bash: -------------------------------------------------------------------------------- 1 | # The name of experiment 2 | name=lxmert_pretrain_itm_mlm_qa_552_patches_x_self_allqa_vit7_no_bert_init_stage_1 3 | 4 | # Create dirs and make backup 5 | output=snap/pretrain/$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # Pre-training 11 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 12 | python src/pretrain/lxmert_pretrain.py \ 13 | --taskMatched --taskMaskLM --taskQA \ 14 | --train mscoco_train,vgnococo --valid mscoco_minival \ 15 | --llayers 5 --rlayers 5 --xlayers 0\ 16 | --fromScratch --skipConnection --crossAttnType no_cross\ 17 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 10 \ 18 | --vitInit --startIndex 7 \ 19 | --patches --tqdm --output $output ${@:2} 20 | 21 | 22 | 23 | # The name of experiment 24 | name=lxmert_pretrain_itm_mlm_qa_552_patches_x_self_allqa_vit7_no_bert_init_stage_2 25 | 26 | # Create dirs and make backup 27 | output=snap/pretrain/$name 28 | mkdir -p $output/src 29 | cp -r src/* $output/src/ 30 | cp $0 $output/run.bash 31 | 32 | # Pre-training 33 | #batch size reduced due to additional cross attn layer (large model) 34 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 35 | python src/pretrain/lxmert_pretrain.py \ 36 | --taskMatched --taskMaskLM --taskQA \ 37 | --train mscoco_train,vgnococo --valid mscoco_minival \ 38 | --llayers 5 --rlayers 5 --xlayers 2\ 39 | --loadLXMERT snap/pretrain/lxmert_pretrain_itm_mlm_qa_552_patches_x_self_allqa_vit7_no_bert_init_stage_1/BEST_EVAL_LOSS \ 40 | --skipConnection --crossAttn --crossAttnType self --freezeWeights\ 41 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 10 \ 42 | --vitInit --startIndex 7 \ 43 | --patches --tqdm --output $output ${@:2} 44 | 45 | 46 | -------------------------------------------------------------------------------- /run/pretrain_2stage_fulldata_vit_no_bert_init_selfcross_patches.bash: -------------------------------------------------------------------------------- 1 | # The name of experiment 2 | name=lxmert_pretrain_itm_mlm_qa_552_patches_x_selfcross_allqa_vit7_no_bert_init_stage_1 3 | 4 | # Create dirs and make backup 5 | output=snap/pretrain/$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # Pre-training 11 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 12 | python src/pretrain/lxmert_pretrain.py \ 13 | --taskMatched --taskMaskLM --taskQA \ 14 | --train mscoco_train,vgnococo --valid mscoco_minival \ 15 | --llayers 5 --rlayers 5 --xlayers 0\ 16 | --fromScratch --skipConnection --crossAttnType no_cross\ 17 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 10 \ 18 | --vitInit --startIndex 7 \ 19 | --patches --tqdm --output $output ${@:2} 20 | 21 | 22 | 23 | # The name of experiment 24 | name=lxmert_pretrain_itm_mlm_qa_552_patches_x_selfcross_allqa_vit7_no_bert_init_stage_2 25 | 26 | # Create dirs and make backup 27 | output=snap/pretrain/$name 28 | mkdir -p $output/src 29 | cp -r src/* $output/src/ 30 | cp $0 $output/run.bash 31 | 32 | # Pre-training 33 | #batch size reduced due to additional cross attn layer (large model) 34 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 35 | python src/pretrain/lxmert_pretrain.py \ 36 | --taskMatched --taskMaskLM --taskQA \ 37 | --train mscoco_train,vgnococo --valid mscoco_minival \ 38 | --llayers 5 
--rlayers 5 --xlayers 2 \ 39 | --loadLXMERT snap/pretrain/lxmert_pretrain_itm_mlm_qa_552_patches_x_selfcross_allqa_vit7_no_bert_init_stage_1/BEST_EVAL_LOSS \ 40 | --skipConnection --crossAttn --crossAttnType cross_self --freezeWeights\ 41 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 10 \ 42 | --vitInit --startIndex 7 \ 43 | --patches --tqdm --output $output ${@:2} 44 | 45 | 46 | -------------------------------------------------------------------------------- /src/tasks/nlvr2_model.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyleft 2019 project LXRT. 3 | 4 | import torch.nn as nn 5 | from lxrt.modeling import GeLU, BertLayerNorm 6 | from lxrt.entry import LXRTEncoder 7 | from param import args 8 | 9 | 10 | class NLVR2Model(nn.Module): 11 | def __init__(self): 12 | super().__init__() 13 | self.lxrt_encoder = LXRTEncoder( 14 | args, 15 | max_seq_length=20 16 | ) 17 | self.hid_dim = hid_dim = self.lxrt_encoder.dim 18 | self.logit_fc = nn.Sequential( 19 | nn.Linear(hid_dim * 2, hid_dim * 2), 20 | GeLU(), 21 | BertLayerNorm(hid_dim * 2, eps=1e-12), 22 | nn.Linear(hid_dim * 2, 2) 23 | ) 24 | self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights) 25 | 26 | def forward(self, feat, pos, sent): 27 | """ 28 | :param feat: b, 2, o, f 29 | :param pos: b, 2, o, 4 30 | :param sent: b, (string) 31 | :param leng: b, (numpy, int) 32 | :return: 33 | """ 34 | # Pairing images and sentences: 35 | # The input of NLVR2 is two images and one sentence. In batch level, they are saved as 36 | # [ [img0_0, img0_1], [img1_0, img1_1], ...] and [sent0, sent1, ...] 37 | # Here, we flat them to 38 | # feat/pos = [ img0_0, img0_1, img1_0, img1_1, ...] 39 | # sent = [ sent0, sent0, sent1, sent1, ...] 40 | sent = sum(zip(sent, sent), ()) 41 | batch_size, img_num, obj_num, feat_size = feat.size() 42 | assert img_num == 2 and obj_num == 36 and feat_size == 2048 43 | feat = feat.view(batch_size * 2, obj_num, feat_size) 44 | pos = pos.view(batch_size * 2, obj_num, 4) 45 | 46 | # Extract feature --> Concat 47 | x = self.lxrt_encoder(sent, (feat, pos)) 48 | x = x.view(-1, self.hid_dim*2) 49 | 50 | # Compute logit of answers 51 | logit = self.logit_fc(x) 52 | 53 | return logit 54 | 55 | 56 | -------------------------------------------------------------------------------- /src/tasks/refcocog_model.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyleft 2019 project LXRT. 
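# RefCOCOg grounding head on top of the LXRT encoder: with train_paradigm='full' an MLP
# regresses 4 bounding-box values from the encoder output, while 'weak' uses a 2-way
# image-text matching classifier; forward() returns the logits together with the encoder
# attention probabilities.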
3 | 4 | import torch.nn as nn 5 | 6 | from src.param import args 7 | from src.lxrt.entry import LXRTEncoder 8 | from src.lxrt.modeling_capsbert import BertLayerNorm, GeLU, MLP 9 | 10 | # Max length including and 11 | MAX_GQA_LENGTH = 20 12 | 13 | 14 | class RefCOCOgModel(nn.Module): 15 | def __init__(self, train_paradigm='full'): 16 | super().__init__() 17 | # train_paradigm has two options: 'full', 'weak' 18 | # 'full' for bounding box supervision 19 | # 'weak' for image-text pair supervision 20 | self.train_paradigm = train_paradigm 21 | self.lxrt_encoder = LXRTEncoder( 22 | args, 23 | max_seq_length=MAX_GQA_LENGTH 24 | ) 25 | hid_dim = self.lxrt_encoder.dim 26 | if self.train_paradigm == 'full': 27 | # train with bounding box labels 28 | self.logit_fc = MLP(hid_dim, hid_dim, 4, 3) 29 | elif self.train_paradigm == 'weak': 30 | # weak supervision, only use image-text labels 31 | self.logit_fc = nn.Sequential( 32 | nn.Linear(hid_dim * 2, hid_dim * 2), 33 | GeLU(), 34 | BertLayerNorm(hid_dim * 2, eps=1e-12), 35 | nn.Linear(hid_dim * 2, 2) 36 | ) 37 | else: 38 | raise NotImplementedError() 39 | 40 | self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights) 41 | self.args = args 42 | 43 | def forward(self, feat, pos, sent): 44 | """ 45 | b -- batch_size, o -- object_number, f -- visual_feature_size 46 | 47 | :param feat: (b, o, f) 48 | :param pos: (b, o, 4) 49 | :param sent: (b,) Type -- list of string 50 | :param leng: (b,) Type -- int numpy array 51 | :return: (b, num_answer) The logit of each answers. 52 | """ 53 | 54 | _, x, attn_probs = self.lxrt_encoder(sent, (feat, pos)) 55 | logit = self.logit_fc(x) 56 | assert logit.size()[-1] == 2 if self.train_paradigm == 'weak' else \ 57 | logit.size()[-1] == 4 and self.train_paradigm == 'full' 58 | 59 | return logit, attn_probs 60 | 61 | 62 | -------------------------------------------------------------------------------- /src/tasks/refcoco_model.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyleft 2019 project LXRT. 
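# RefCOCO counterpart of RefCOCOgModel: in the 'full' setting it swaps the plain MLP for
# BertReferExpHead, which predicts a 4-value box plus a second 9-dimensional output, so
# forward() returns logits, box parameters (None under 'weak' supervision), and attention
# probabilities.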
3 | 4 | import torch.nn as nn 5 | 6 | from src.param import args 7 | from src.lxrt.entry import LXRTEncoder 8 | from src.lxrt.modeling_capsbert import BertLayerNorm, GeLU, MLP, BertReferExpHead 9 | 10 | # Max length including and 11 | MAX_GQA_LENGTH = 20 12 | 13 | 14 | class RefCOCOModel(nn.Module): 15 | def __init__(self, train_paradigm='full'): 16 | super().__init__() 17 | #train_paradigm has two options: 'full', 'weak' 18 | # 'full' for bounding box supervision 19 | # 'weak' for image-text pair supervision 20 | self.train_paradigm = train_paradigm 21 | self.lxrt_encoder = LXRTEncoder( 22 | args, 23 | max_seq_length=MAX_GQA_LENGTH, 24 | # cross_attn_type=args.cross_attn_type 25 | ) 26 | hid_dim = self.lxrt_encoder.dim 27 | if self.train_paradigm == 'full': 28 | #train with bounding box labels 29 | # self.logit_fc = MLP(hid_dim, hid_dim, 4, 3) 30 | self.logit_fc = BertReferExpHead(hidden_size=hid_dim, out_dim1=4, out_dim2=9) 31 | elif self.train_paradigm == 'weak': 32 | # weak supervision, only use image-text labels 33 | self.logit_fc = nn.Sequential( 34 | nn.Linear(hid_dim * 2, hid_dim * 2), 35 | GeLU(), 36 | BertLayerNorm(hid_dim * 2, eps=1e-12), 37 | nn.Linear(hid_dim * 2, 2) 38 | ) 39 | else: 40 | raise NotImplementedError() 41 | 42 | self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights) 43 | self.args = args 44 | 45 | def forward(self, feat, pos, sent): 46 | """ 47 | b -- batch_size, o -- object_number, f -- visual_feature_size 48 | 49 | :param feat: (b, o, f) 50 | :param pos: (b, o, 4) 51 | :param sent: (b,) Type -- list of string 52 | :param leng: (b,) Type -- int numpy array 53 | :return: (b, num_answer) The logit of each answers. 54 | """ 55 | 56 | feat_seq, x, attn_probs = self.lxrt_encoder(sent, (feat, pos)) 57 | # x = feat_seq[1][:,0] #taking first token from visual features sequence 58 | if self.train_paradigm == "full": 59 | logits, box_params = self.logit_fc(x) 60 | else: 61 | logits = self.logit_fc(x) 62 | box_params = None 63 | assert logits.size()[-1] == 2 if self.train_paradigm == 'weak' else logits.size()[-1] == 4 and self.train_paradigm == 'full' 64 | 65 | return logits, box_params, attn_probs 66 | 67 | 68 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Weakly Supervised Grounding for VQA in Vision-Language Transformers [ECCV 2022] 2 | 3 | [Aisha Urooj Khan](https://aishaurooj.wixsite.com/aishaurooj), [Hilde Kuehne](https://hildekuehne.github.io/), [Chuang Gan](https://people.csail.mit.edu/ganchuang/), [Niels Da Vitoria Lobo](https://www.crcv.ucf.edu/person/niels-lobo/), [Mubarak Shah](https://www.crcv.ucf.edu/person/mubarak-shah/) 4 | 5 | [`Website`]() | [`arXiv`]() | [`BibTeX`](#citation) 6 | 7 | Official Pytorch implementation and pre-trained models for Weakly Supervised Grounding for VQA in Vision-Language Transformers (coming soon). 8 | 9 | ## Abstract 10 | Transformers for visual-language representation learning have been getting a lot of interest and shown tremendous performance on visual question answering (VQA) and grounding. But most systems that show good performance of those tasks still rely on pre-trained object detectors during training, which limits their applicability to the object classes available for those detectors. 11 | To mitigate this limitation, the following paper focuses on the problem of weakly supervised grounding in context of visual question answering in transformers. 
The approach leverages capsules by grouping each visual token in the visual encoder and uses activations from language self-attention layers as a text-guided selection module to mask those capsules before they are forwarded to the next layer. 12 | We evaluate our approach on the challenging GQA as well as VQA-HAT dataset for VQA grounding. 13 | Our experiments show that: while removing the information of masked objects from standard transformer architectures leads to a significant drop in performance, the integration of capsules significantly improves the grounding ability of such systems and provides new state-of-the-art results compared to other approaches in the field. 14 | 15 |

16 | ![main-fig](images/main_fig.png)
17 |
18 |
19 |
20 | ##### (a) Proposed Architecture, (b) Proposed Capsule Encoding layer, (c) Proposed Capsule Layer

21 | 22 | ### Qualitative Results 23 | ![gqa-qualitative](images/teaser.png) 24 | 25 | ### Code 26 | This code is built upon the code base of [LXMERT](https://github.com/airsplay/lxmert). Thanks to [Hao Tan](https://scholar.google.com/citations?user=OV1Y3FUAAAAJ&hl=en) for providing excellent code for their model. 27 | 28 | #### Datasets 29 | For pretraining, we used MSCOCO and VG for image-caption pairs, and Viz7W, VQA v2.0, and GQA for question-image pairs. We followed the instructions provided by [LXMERT](https://github.com/airsplay/lxmert) to prepare the data, except for a few changes: 30 | 1. We removed the GQA validation set from the pretraining data, as we use it for grounding evaluation. 31 | 2. We validate our pretraining on the mscoco-minival split. 32 | 33 | #### Pretraining 34 | To pretrain the backbone, use the following command: 35 | 36 | ``` 37 | bash run/pretrain_2stage_fulldata_no_init_16_caps.bash 38 | ``` 39 | As with the other scripts under `run/`, the first argument is expected to select the GPU(s) via `CUDA_VISIBLE_DEVICES`, and any remaining arguments are forwarded to the underlying training script. 40 | #### Finetuning on downstream tasks 41 | 42 | ##### GQA 43 | See ``` run/gqa_finetune_caps.bash ``` for finetuning on the GQA dataset. 44 | 45 | ##### VQA-HAT 46 | Finetuning on VQA-HAT is similar to how we finetune the model on GQA. I will keep adding more concrete details in the next few days. 47 | 48 | ### Citation 49 | If this work is useful for your research, please cite our paper. 50 | 51 | ```bibtex 52 | @InProceedings{10.1007/978-3-031-19833-5_38, 53 | author="Khan, Aisha Urooj 54 | and Kuehne, Hilde 55 | and Gan, Chuang 56 | and Lobo, Niels Da Vitoria 57 | and Shah, Mubarak", 58 | editor="Avidan, Shai 59 | and Brostow, Gabriel 60 | and Ciss{\'e}, Moustapha 61 | and Farinella, Giovanni Maria 62 | and Hassner, Tal", 63 | title="Weakly Supervised Grounding for VQA in Vision-Language Transformers", 64 | booktitle="Computer Vision -- ECCV 2022", 65 | year="2022", 66 | publisher="Springer Nature Switzerland", 67 | address="Cham", 68 | pages="652--670", 69 | isbn="978-3-031-19833-5" 70 | } 71 | ``` 72 | 73 | ### Questions? 74 | Please contact 'aishaurooj@gmail.com' 75 | 76 | 77 | 78 | -------------------------------------------------------------------------------- /src/tasks/nlvr2_data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyleft 2019 project LXRT. 3 | 4 | import json 5 | 6 | import numpy as np 7 | from torch.utils.data import Dataset 8 | 9 | from param import args 10 | from utils import load_obj_tsv 11 | 12 | # Load part of the dataset for fast checking. 13 | # Notice that here is the number of images instead of the number of data, 14 | # which means all related data to the images would be used. 15 | TINY_IMG_NUM = 512 16 | FAST_IMG_NUM = 5000 17 | 18 | 19 | class NLVR2Dataset: 20 | """ 21 | An NLVR2 data example in json file: 22 | { 23 | "identifier": "train-10171-0-0", 24 | "img0": "train-10171-0-img0", 25 | "img1": "train-10171-0-img1", 26 | "label": 0, 27 | "sent": "An image shows one leather pencil case, displayed open with writing implements tucked inside. 28 | ", 29 | "uid": "nlvr2_train_0" 30 | } 31 | """ 32 | def __init__(self, splits: str): 33 | self.name = splits 34 | self.splits = splits.split(',') 35 | 36 | # Loading datasets to data 37 | self.data = [] 38 | for split in self.splits: 39 | self.data.extend(json.load(open("data/nlvr2/%s.json" % split))) 40 | print("Load %d data from split(s) %s."
% (len(self.data), self.name)) 41 | 42 | # List to dict (for evaluation and others) 43 | self.id2datum = { 44 | datum['uid']: datum 45 | for datum in self.data 46 | } 47 | 48 | def __len__(self): 49 | return len(self.data) 50 | 51 | 52 | """ 53 | An example in obj36 tsv: 54 | FIELDNAMES = ["img_id", "img_h", "img_w", "objects_id", "objects_conf", 55 | "attrs_id", "attrs_conf", "num_boxes", "boxes", "features"] 56 | FIELDNAMES would be keys in the dict returned by load_obj_tsv. 57 | """ 58 | class NLVR2TorchDataset(Dataset): 59 | def __init__(self, dataset: NLVR2Dataset): 60 | super().__init__() 61 | self.raw_dataset = dataset 62 | 63 | if args.tiny: 64 | topk = TINY_IMG_NUM 65 | elif args.fast: 66 | topk = FAST_IMG_NUM 67 | else: 68 | topk = -1 69 | 70 | # Loading detection features to img_data 71 | img_data = [] 72 | if 'train' in dataset.splits: 73 | img_data.extend(load_obj_tsv('data/nlvr2_imgfeat/train_obj36.tsv', topk=topk)) 74 | if 'valid' in dataset.splits: 75 | img_data.extend(load_obj_tsv('data/nlvr2_imgfeat/valid_obj36.tsv', topk=topk)) 76 | if 'test' in dataset.name: 77 | img_data.extend(load_obj_tsv('data/nlvr2_imgfeat/test_obj36.tsv', topk=topk)) 78 | self.imgid2img = {} 79 | for img_datum in img_data: 80 | self.imgid2img[img_datum['img_id']] = img_datum 81 | 82 | # Filter out the dataset 83 | self.data = [] 84 | for datum in self.raw_dataset.data: 85 | if datum['img0'] in self.imgid2img and datum['img1'] in self.imgid2img: 86 | self.data.append(datum) 87 | print("Use %d data in torch dataset" % (len(self.data))) 88 | print() 89 | 90 | def __len__(self): 91 | return len(self.data) 92 | 93 | def __getitem__(self, item: int): 94 | datum = self.data[item] 95 | 96 | ques_id = datum['uid'] 97 | ques = datum['sent'] 98 | 99 | # Get image info 100 | boxes2 = [] 101 | feats2 = [] 102 | for key in ['img0', 'img1']: 103 | img_id = datum[key] 104 | img_info = self.imgid2img[img_id] 105 | boxes = img_info['boxes'].copy() 106 | feats = img_info['features'].copy() 107 | assert len(boxes) == len(feats) 108 | 109 | # Normalize the boxes (to 0 ~ 1) 110 | img_h, img_w = img_info['img_h'], img_info['img_w'] 111 | boxes[..., (0, 2)] /= img_w 112 | boxes[..., (1, 3)] /= img_h 113 | np.testing.assert_array_less(boxes, 1+1e-5) 114 | np.testing.assert_array_less(-boxes, 0+1e-5) 115 | 116 | boxes2.append(boxes) 117 | feats2.append(feats) 118 | feats = np.stack(feats2) 119 | boxes = np.stack(boxes2) 120 | 121 | # Create target 122 | if 'label' in datum: 123 | label = datum['label'] 124 | return ques_id, feats, boxes, ques, label 125 | else: 126 | return ques_id, feats, boxes, ques 127 | 128 | 129 | class NLVR2Evaluator: 130 | def __init__(self, dataset: NLVR2Dataset): 131 | self.dataset = dataset 132 | 133 | def evaluate(self, quesid2ans: dict): 134 | score = 0. 135 | for quesid, ans in quesid2ans.items(): 136 | datum = self.dataset.id2datum[quesid] 137 | label = datum['label'] 138 | if ans == label: 139 | score += 1 140 | return score / len(quesid2ans) 141 | 142 | def dump_result(self, quesid2ans: dict, path): 143 | """ 144 | Dump result to a CSV file, which is compatible with NLVR2 evaluation system. 145 | NLVR2 CSV file requirement: 146 | Each line contains: identifier, answer 147 | 148 | :param quesid2ans: nlvr2 uid to ans (either "True" or "False") 149 | :param path: The desired path of saved file. 
150 | :return: 151 | """ 152 | with open(path, 'w') as f: 153 | for uid, ans in quesid2ans.items(): 154 | idt = self.dataset.id2datum[uid]["identifier"] 155 | ans = 'True' if ans == 1 else 'False' 156 | f.write("%s,%s\n" % (idt, ans)) 157 | 158 | -------------------------------------------------------------------------------- /src/pretrain/qa_answer_table.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyleft 2019 project LXRT. 3 | 4 | import json 5 | import torch 6 | 7 | 8 | class AnswerTable: 9 | ANS_CONVERT = { 10 | "a man": "man", 11 | "the man": "man", 12 | "a woman": "woman", 13 | "the woman": "woman", 14 | 'one': '1', 15 | 'two': '2', 16 | 'three': '3', 17 | 'four': '4', 18 | 'five': '5', 19 | 'six': '6', 20 | 'seven': '7', 21 | 'eight': '8', 22 | 'nine': '9', 23 | 'ten': '10', 24 | 'grey': 'gray', 25 | } 26 | 27 | def __init__(self, dsets=None): 28 | self.all_ans = json.load(open("data/lxmert/all_ans.json")) 29 | if dsets is not None: 30 | dsets = set(dsets) 31 | # If the answer is used in the dsets 32 | self.anss = [ans['ans'] for ans in self.all_ans if 33 | len(set(ans['dsets']) & dsets) > 0] 34 | else: 35 | self.anss = [ans['ans'] for ans in self.all_ans] 36 | self.ans_set = set(self.anss) 37 | 38 | self._id2ans_map = self.anss 39 | self._ans2id_map = {ans: ans_id for ans_id, ans in enumerate(self.anss)} 40 | 41 | assert len(self._id2ans_map) == len(self._ans2id_map) 42 | for ans_id, ans in enumerate(self._id2ans_map): 43 | assert self._ans2id_map[ans] == ans_id 44 | 45 | def convert_ans(self, ans): 46 | if len(ans) == 0: 47 | return "" 48 | ans = ans.lower() 49 | if ans[-1] == '.': 50 | ans = ans[:-1].strip() 51 | if ans.startswith("a "): 52 | ans = ans[2:].strip() 53 | if ans.startswith("an "): 54 | ans = ans[3:].strip() 55 | if ans.startswith("the "): 56 | ans = ans[4:].strip() 57 | if ans in self.ANS_CONVERT: 58 | ans = self.ANS_CONVERT[ans] 59 | return ans 60 | 61 | def ans2id(self, ans): 62 | return self._ans2id_map[ans] 63 | 64 | def id2ans(self, ans_id): 65 | return self._id2ans_map[ans_id] 66 | 67 | def ans2id_map(self): 68 | return self._ans2id_map.copy() 69 | 70 | def id2ans_map(self): 71 | return self._id2ans_map.copy() 72 | 73 | def used(self, ans): 74 | return ans in self.ans_set 75 | 76 | def all_answers(self): 77 | return self.anss.copy() 78 | 79 | @property 80 | def num_answers(self): 81 | return len(self.anss) 82 | 83 | 84 | def load_lxmert_qa(path, model, label2ans): 85 | """ 86 | Load model weights from LXMERT pre-training. 87 | The answers in the fine-tuned QA task (indicated by label2ans) 88 | would also be properly initialized with LXMERT pre-trained 89 | QA heads. 90 | 91 | :param path: Path to LXMERT snapshot. 92 | :param model: LXRT model instance. 
93 | :param label2ans: The label2ans dict of fine-tuned QA datasets, like 94 | {0: 'cat', 1: 'dog', ...} 95 | :return: 96 | """ 97 | print("Load QA pre-trained LXMERT from %s " % path) 98 | loaded_state_dict = torch.load("%s_LXRT.pth" % path) 99 | model_state_dict = model.state_dict() 100 | 101 | # Handle Multi-GPU pre-training --> Single GPU fine-tuning 102 | for key in list(loaded_state_dict.keys()): 103 | loaded_state_dict[key.replace("module.", '')] = loaded_state_dict.pop(key) 104 | 105 | # Isolate bert model 106 | bert_state_dict = {} 107 | for key, value in loaded_state_dict.items(): 108 | if key.startswith('bert.'): 109 | bert_state_dict[key] = value 110 | 111 | # Isolate answer head 112 | answer_state_dict = {} 113 | for key, value in loaded_state_dict.items(): 114 | if key.startswith("answer_head."): 115 | answer_state_dict[key.replace('answer_head.', '')] = value 116 | 117 | # Do surgery on answer state dict 118 | ans_weight = answer_state_dict['logit_fc.3.weight'] 119 | ans_bias = answer_state_dict['logit_fc.3.bias'] 120 | import copy 121 | new_answer_weight = copy.deepcopy(model_state_dict['logit_fc.3.weight']) 122 | new_answer_bias = copy.deepcopy(model_state_dict['logit_fc.3.bias']) 123 | answer_table = AnswerTable() 124 | loaded = 0 125 | unload = 0 126 | if type(label2ans) is list: 127 | label2ans = {label: ans for label, ans in enumerate(label2ans)} 128 | for label, ans in label2ans.items(): 129 | new_ans = answer_table.convert_ans(ans) 130 | if answer_table.used(new_ans): 131 | ans_id_9500 = answer_table.ans2id(new_ans) 132 | new_answer_weight[label] = ans_weight[ans_id_9500] 133 | new_answer_bias[label] = ans_bias[ans_id_9500] 134 | loaded += 1 135 | else: 136 | new_answer_weight[label] = 0. 137 | new_answer_bias[label] = 0. 138 | unload += 1 139 | print("Loaded %d answers from LXRTQA pre-training and %d not" % (loaded, unload)) 140 | print() 141 | answer_state_dict['logit_fc.3.weight'] = new_answer_weight 142 | answer_state_dict['logit_fc.3.bias'] = new_answer_bias 143 | 144 | # Load Bert Weights 145 | bert_model_keys = set(model.lxrt_encoder.model.state_dict().keys()) 146 | bert_loaded_keys = set(bert_state_dict.keys()) 147 | assert len(bert_model_keys - bert_loaded_keys) == 0 148 | model.lxrt_encoder.model.load_state_dict(bert_state_dict, strict=False) 149 | 150 | # Load Answer Logic FC Weights 151 | model_keys = set(model.state_dict().keys()) 152 | ans_loaded_keys = set(answer_state_dict.keys()) 153 | assert len(ans_loaded_keys - model_keys) == 0 154 | 155 | model.load_state_dict(answer_state_dict, strict=False) 156 | 157 | 158 | 159 | -------------------------------------------------------------------------------- /src/lxrt/entry_spatial.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 project LXRT. 3 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 4 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | import os 19 | 20 | import torch 21 | import torch.nn as nn 22 | 23 | from src.lxrt.tokenization import BertTokenizer 24 | from src.lxrt.modeling_spatial import LXRTFeatureExtraction as VisualBertForLXRFeature, VISUAL_CONFIG 25 | 26 | 27 | class InputFeatures(object): 28 | """A single set of features of data.""" 29 | 30 | def __init__(self, input_ids, input_mask, segment_ids): 31 | self.input_ids = input_ids 32 | self.input_mask = input_mask 33 | self.segment_ids = segment_ids 34 | 35 | 36 | def convert_sents_to_features(sents, max_seq_length, tokenizer): 37 | """Loads a data file into a list of `InputBatch`s.""" 38 | 39 | features = [] 40 | for (i, sent) in enumerate(sents): 41 | tokens_a = tokenizer.tokenize(sent.strip()) 42 | 43 | # Account for [CLS] and [SEP] with "- 2" 44 | if len(tokens_a) > max_seq_length - 2: 45 | tokens_a = tokens_a[:(max_seq_length - 2)] 46 | 47 | # Keep segment id which allows loading BERT-weights. 48 | tokens = ["[CLS]"] + tokens_a + ["[SEP]"] 49 | segment_ids = [0] * len(tokens) 50 | 51 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 52 | 53 | # The mask has 1 for real tokens and 0 for padding tokens. Only real 54 | # tokens are attended to. 55 | input_mask = [1] * len(input_ids) 56 | 57 | # Zero-pad up to the sequence length. 58 | padding = [0] * (max_seq_length - len(input_ids)) 59 | input_ids += padding 60 | input_mask += padding 61 | segment_ids += padding 62 | 63 | assert len(input_ids) == max_seq_length 64 | assert len(input_mask) == max_seq_length 65 | assert len(segment_ids) == max_seq_length 66 | 67 | features.append( 68 | InputFeatures(input_ids=input_ids, 69 | input_mask=input_mask, 70 | segment_ids=segment_ids)) 71 | return features 72 | 73 | 74 | def set_visual_config(args): 75 | VISUAL_CONFIG.l_layers = args.llayers 76 | VISUAL_CONFIG.x_layers = args.xlayers 77 | VISUAL_CONFIG.r_layers = args.rlayers 78 | 79 | #capsules config 80 | VISUAL_CONFIG.num_prim_caps = args.NUM_PRIM_CAPS 81 | VISUAL_CONFIG.num_vis_caps = args.NUM_VIS_CAPS 82 | VISUAL_CONFIG.pose_matrix_dim = args.POSE_DIM 83 | VISUAL_CONFIG.hw = args.HW 84 | VISUAL_CONFIG.caps_dim = args.NUM_VIS_CAPS * (args.POSE_DIM*args.POSE_DIM+1) 85 | 86 | print(VISUAL_CONFIG.num_prim_caps) 87 | 88 | 89 | class LXRTEncoder(nn.Module): 90 | def __init__(self, args, max_seq_length, mode='x'): 91 | super().__init__() 92 | self.max_seq_length = max_seq_length 93 | set_visual_config(args) 94 | self.args = args 95 | 96 | # Using the bert tokenizer 97 | self.tokenizer = BertTokenizer.from_pretrained( 98 | "bert-base-uncased", 99 | do_lower_case=True 100 | ) 101 | 102 | # Build LXRT Model 103 | self.model = VisualBertForLXRFeature.from_pretrained( 104 | "bert-base-uncased", 105 | mode=mode, 106 | skip_connection=args.skip_connection, 107 | shared_weights=args.shared_weights, 108 | ) 109 | 110 | if args.from_scratch: 111 | print("initializing all the weights") 112 | self.model.apply(self.model.init_bert_weights) 113 | 114 | def multi_gpu(self): 115 | self.model = nn.DataParallel(self.model) 116 | 117 | @property 118 | def dim(self): 119 | return 768 120 | 121 | def forward(self, sents, feats, visual_attention_mask=None): 122 | train_features = convert_sents_to_features( 123 | sents, self.max_seq_length, self.tokenizer) 124 | 125 | input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long).cuda() 126 | input_mask = torch.tensor([f.input_mask for f in 
train_features], dtype=torch.long).cuda() 127 | segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long).cuda() 128 | 129 | # print(feats[0].shape) 130 | output, attn_probs = self.model(input_ids, segment_ids, input_mask, 131 | visual_feats=feats, 132 | visual_attention_mask=visual_attention_mask, output_all_attention_masks=self.args.output_attention) 133 | return output, attn_probs 134 | 135 | def save(self, path): 136 | torch.save(self.model.state_dict(), 137 | os.path.join("%s_LXRT.pth" % path)) 138 | 139 | def load(self, path): 140 | # Load state_dict from snapshot file 141 | print("Load LXMERT pre-trained model from %s" % path) 142 | state_dict = torch.load("%s_LXRT.pth" % path) 143 | new_state_dict = {} 144 | for key, value in state_dict.items(): 145 | if key.startswith("module."): 146 | new_state_dict[key[len("module."):]] = value 147 | else: 148 | new_state_dict[key] = value 149 | state_dict = new_state_dict 150 | 151 | # Print out the differences of pre-trained and model weights. 152 | load_keys = set(state_dict.keys()) 153 | model_keys = set(self.model.state_dict().keys()) 154 | print() 155 | print("Weights in loaded but not in model:") 156 | for key in sorted(load_keys.difference(model_keys)): 157 | print(key) 158 | print() 159 | print("Weights in model but not in loaded:") 160 | for key in sorted(model_keys.difference(load_keys)): 161 | print(key) 162 | print() 163 | 164 | # Load weights to model 165 | self.model.load_state_dict(state_dict, strict=False) 166 | 167 | 168 | 169 | 170 | -------------------------------------------------------------------------------- /src/tasks/vqa_data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyleft 2019 project LXRT. 3 | 4 | import json 5 | import os 6 | import pickle 7 | 8 | import numpy as np 9 | import torch 10 | from torch.utils.data import Dataset 11 | 12 | from param import args 13 | from utils import load_obj_tsv 14 | 15 | # Load part of the dataset for fast checking. 16 | # Notice that here is the number of images instead of the number of data, 17 | # which means all related data to the images would be used. 18 | TINY_IMG_NUM = 512 19 | FAST_IMG_NUM = 5000 20 | 21 | # The path to data and image features. 22 | VQA_DATA_ROOT = 'data/vqa/' 23 | MSCOCO_IMGFEAT_ROOT = 'data/mscoco_imgfeat/' 24 | SPLIT2NAME = { 25 | 'train': 'train2014', 26 | 'valid': 'val2014', 27 | 'minival': 'val2014', 28 | 'nominival': 'val2014', 29 | 'test': 'test2015', 30 | } 31 | 32 | 33 | class VQADataset: 34 | """ 35 | A VQA data example in json file: 36 | { 37 | "answer_type": "other", 38 | "img_id": "COCO_train2014_000000458752", 39 | "label": { 40 | "net": 1 41 | }, 42 | "question_id": 458752000, 43 | "question_type": "what is this", 44 | "sent": "What is this photo taken looking through?" 45 | } 46 | """ 47 | def __init__(self, splits: str): 48 | self.name = splits 49 | self.splits = splits.split(',') 50 | 51 | # Loading datasets 52 | self.data = [] 53 | for split in self.splits: 54 | self.data.extend(json.load(open("data/vqa/%s.json" % split))) 55 | print("Load %d data from split(s) %s." 
% (len(self.data), self.name)) 56 | 57 | # Convert list to dict (for evaluation) 58 | self.id2datum = { 59 | datum['question_id']: datum 60 | for datum in self.data 61 | } 62 | 63 | # Answers 64 | self.ans2label = json.load(open("data/vqa/trainval_ans2label.json")) 65 | self.label2ans = json.load(open("data/vqa/trainval_label2ans.json")) 66 | assert len(self.ans2label) == len(self.label2ans) 67 | 68 | @property 69 | def num_answers(self): 70 | return len(self.ans2label) 71 | 72 | def __len__(self): 73 | return len(self.data) 74 | 75 | 76 | """ 77 | An example in obj36 tsv: 78 | FIELDNAMES = ["img_id", "img_h", "img_w", "objects_id", "objects_conf", 79 | "attrs_id", "attrs_conf", "num_boxes", "boxes", "features"] 80 | FIELDNAMES would be keys in the dict returned by load_obj_tsv. 81 | """ 82 | class VQATorchDataset(Dataset): 83 | def __init__(self, dataset: VQADataset): 84 | super().__init__() 85 | self.raw_dataset = dataset 86 | 87 | if args.tiny: 88 | topk = TINY_IMG_NUM 89 | elif args.fast: 90 | topk = FAST_IMG_NUM 91 | else: 92 | topk = None 93 | 94 | # Loading detection features to img_data 95 | img_data = [] 96 | for split in dataset.splits: 97 | # Minival is 5K images in MS COCO, which is used in evaluating VQA/LXMERT-pre-training. 98 | # It is saved as the top 5K features in val2014_***.tsv 99 | load_topk = 5000 if (split == 'minival' and topk is None) else topk 100 | img_data.extend(load_obj_tsv( 101 | os.path.join(MSCOCO_IMGFEAT_ROOT, '%s_obj36.tsv' % (SPLIT2NAME[split])), 102 | topk=load_topk)) 103 | 104 | # Convert img list to dict 105 | self.imgid2img = {} 106 | for img_datum in img_data: 107 | self.imgid2img[img_datum['img_id']] = img_datum 108 | 109 | # Only kept the data with loaded image features 110 | self.data = [] 111 | for datum in self.raw_dataset.data: 112 | if datum['img_id'] in self.imgid2img: 113 | self.data.append(datum) 114 | print("Use %d data in torch dataset" % (len(self.data))) 115 | print() 116 | 117 | def __len__(self): 118 | return len(self.data) 119 | 120 | def __getitem__(self, item: int): 121 | datum = self.data[item] 122 | 123 | img_id = datum['img_id'] 124 | ques_id = datum['question_id'] 125 | ques = datum['sent'] 126 | 127 | # Get image info 128 | img_info = self.imgid2img[img_id] 129 | obj_num = img_info['num_boxes'] 130 | feats = img_info['features'].copy() 131 | boxes = img_info['boxes'].copy() 132 | assert obj_num == len(boxes) == len(feats) 133 | 134 | # Normalize the boxes (to 0 ~ 1) 135 | img_h, img_w = img_info['img_h'], img_info['img_w'] 136 | boxes = boxes.copy() 137 | boxes[:, (0, 2)] /= img_w 138 | boxes[:, (1, 3)] /= img_h 139 | np.testing.assert_array_less(boxes, 1+1e-5) 140 | np.testing.assert_array_less(-boxes, 0+1e-5) 141 | 142 | # Provide label (target) 143 | if 'label' in datum: 144 | label = datum['label'] 145 | target = torch.zeros(self.raw_dataset.num_answers) 146 | for ans, score in label.items(): 147 | target[self.raw_dataset.ans2label[ans]] = score 148 | return ques_id, feats, boxes, ques, target 149 | else: 150 | return ques_id, feats, boxes, ques 151 | 152 | 153 | class VQAEvaluator: 154 | def __init__(self, dataset: VQADataset): 155 | self.dataset = dataset 156 | 157 | def evaluate(self, quesid2ans: dict): 158 | score = 0. 
159 | for quesid, ans in quesid2ans.items(): 160 | datum = self.dataset.id2datum[quesid] 161 | label = datum['label'] 162 | if ans in label: 163 | score += label[ans] 164 | return score / len(quesid2ans) 165 | 166 | def dump_result(self, quesid2ans: dict, path): 167 | """ 168 | Dump results to a json file, which could be submitted to the VQA online evaluation. 169 | VQA json file submission requirement: 170 | results = [result] 171 | result = { 172 | "question_id": int, 173 | "answer": str 174 | } 175 | 176 | :param quesid2ans: dict of quesid --> ans 177 | :param path: The desired path of saved file. 178 | """ 179 | with open(path, 'w') as f: 180 | result = [] 181 | for ques_id, ans in quesid2ans.items(): 182 | result.append({ 183 | 'question_id': ques_id, 184 | 'answer': ans 185 | }) 186 | json.dump(result, f, indent=4, sort_keys=True) 187 | 188 | 189 | -------------------------------------------------------------------------------- /src/tasks/nlvr2.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyleft 2019 project LXRT. 3 | 4 | import os 5 | import collections 6 | 7 | from tqdm import tqdm 8 | import torch 9 | import torch.nn as nn 10 | from torch.utils.data.dataloader import DataLoader 11 | 12 | from param import args 13 | from tasks.nlvr2_model import NLVR2Model 14 | from tasks.nlvr2_data import NLVR2Dataset, NLVR2TorchDataset, NLVR2Evaluator 15 | 16 | DataTuple = collections.namedtuple("DataTuple", 'dataset loader evaluator') 17 | 18 | 19 | def get_tuple(splits: str, bs:int, shuffle=False, drop_last=False) -> DataTuple: 20 | dset = NLVR2Dataset(splits) 21 | tset = NLVR2TorchDataset(dset) 22 | evaluator = NLVR2Evaluator(dset) 23 | data_loader = DataLoader( 24 | tset, batch_size=bs, 25 | shuffle=shuffle, num_workers=args.num_workers, 26 | drop_last=drop_last, pin_memory=True 27 | ) 28 | 29 | return DataTuple(dataset=dset, loader=data_loader, evaluator=evaluator) 30 | 31 | 32 | class NLVR2: 33 | def __init__(self): 34 | self.train_tuple = get_tuple( 35 | args.train, bs=args.batch_size, shuffle=True, drop_last=True 36 | ) 37 | if args.valid != "": 38 | valid_bsize = 2048 if args.multiGPU else 512 39 | self.valid_tuple = get_tuple( 40 | args.valid, bs=valid_bsize, 41 | shuffle=False, drop_last=False 42 | ) 43 | else: 44 | self.valid_tuple = None 45 | 46 | self.model = NLVR2Model() 47 | 48 | # Load pre-trained weights 49 | if args.load_lxmert is not None: 50 | self.model.lxrt_encoder.load(args.load_lxmert) 51 | 52 | # GPU options 53 | if args.multiGPU: 54 | self.model.lxrt_encoder.multi_gpu() 55 | self.model = self.model.cuda() 56 | 57 | # Losses and optimizer 58 | self.mce_loss = nn.CrossEntropyLoss(ignore_index=-1) 59 | if 'bert' in args.optim: 60 | batch_per_epoch = len(self.train_tuple.loader) 61 | t_total = int(batch_per_epoch * args.epochs) 62 | print("Total Iters: %d" % t_total) 63 | from lxrt.optimization import BertAdam 64 | self.optim = BertAdam(list(self.model.parameters()), 65 | lr=args.lr, 66 | warmup=0.1, 67 | t_total=t_total) 68 | else: 69 | self.optim = args.optimizer(list(self.model.parameters()), args.lr) 70 | 71 | self.output = args.output 72 | os.makedirs(self.output, exist_ok=True) 73 | 74 | def train(self, train_tuple, eval_tuple): 75 | dset, loader, evaluator = train_tuple 76 | iter_wrapper = (lambda x: tqdm(x, total=len(loader))) if args.tqdm else (lambda x: x) 77 | 78 | best_valid = 0. 
79 | for epoch in range(args.epochs): 80 | quesid2ans = {} 81 | for i, (ques_id, feats, boxes, sent, label) in iter_wrapper(enumerate(loader)): 82 | self.model.train() 83 | 84 | self.optim.zero_grad() 85 | feats, boxes, label = feats.cuda(), boxes.cuda(), label.cuda() 86 | logit = self.model(feats, boxes, sent) 87 | 88 | loss = self.mce_loss(logit, label) 89 | 90 | loss.backward() 91 | nn.utils.clip_grad_norm_(self.model.parameters(), 5.) 92 | self.optim.step() 93 | 94 | score, predict = logit.max(1) 95 | for qid, l in zip(ques_id, predict.cpu().numpy()): 96 | quesid2ans[qid] = l 97 | 98 | log_str = "\nEpoch %d: Train %0.2f\n" % (epoch, evaluator.evaluate(quesid2ans) * 100.) 99 | 100 | if self.valid_tuple is not None: # Do Validation 101 | valid_score = self.evaluate(eval_tuple) 102 | if valid_score > best_valid: 103 | best_valid = valid_score 104 | self.save("BEST") 105 | 106 | log_str += "Epoch %d: Valid %0.2f\n" % (epoch, valid_score * 100.) + \ 107 | "Epoch %d: Best %0.2f\n" % (epoch, best_valid * 100.) 108 | 109 | print(log_str, end='') 110 | 111 | with open(self.output + "/log.log", 'a') as f: 112 | f.write(log_str) 113 | f.flush() 114 | 115 | self.save("LAST") 116 | 117 | def predict(self, eval_tuple: DataTuple, dump=None): 118 | self.model.eval() 119 | dset, loader, evaluator = eval_tuple 120 | quesid2ans = {} 121 | for i, datum_tuple in enumerate(loader): 122 | ques_id, feats, boxes, sent = datum_tuple[:4] # avoid handling target 123 | with torch.no_grad(): 124 | feats, boxes = feats.cuda(), boxes.cuda() 125 | logit = self.model(feats, boxes, sent) 126 | score, predict = logit.max(1) 127 | for qid, l in zip(ques_id, predict.cpu().numpy()): 128 | quesid2ans[qid] = l 129 | if dump is not None: 130 | evaluator.dump_result(quesid2ans, dump) 131 | return quesid2ans 132 | 133 | def evaluate(self, eval_tuple: DataTuple, dump=None): 134 | dset, loader, evaluator = eval_tuple 135 | quesid2ans = self.predict(eval_tuple, dump) 136 | return evaluator.evaluate(quesid2ans) 137 | 138 | def save(self, name): 139 | torch.save(self.model.state_dict(), 140 | os.path.join(self.output, "%s.pth" % name)) 141 | 142 | def load(self, path): 143 | print("Load model from %s" % path) 144 | state_dict = torch.load("%s.pth" % path) 145 | self.model.load_state_dict(state_dict) 146 | 147 | 148 | if __name__ == "__main__": 149 | # Build Class 150 | nlvr2 = NLVR2() 151 | 152 | # Load Model 153 | if args.load is not None: 154 | nlvr2.load(args.load) 155 | 156 | # Test or Train 157 | if args.test is not None: 158 | args.fast = args.tiny = False # Always loading all data in test 159 | if 'hidden' in args.test: 160 | nlvr2.predict( 161 | get_tuple(args.test, bs=args.batch_size, 162 | shuffle=False, drop_last=False), 163 | dump=os.path.join(args.output, 'hidden_predict.csv') 164 | ) 165 | elif 'test' in args.test or 'valid' in args.test: 166 | result = nlvr2.evaluate( 167 | get_tuple(args.test, bs=args.batch_size, 168 | shuffle=False, drop_last=False), 169 | dump=os.path.join(args.output, '%s_predict.csv' % args.test) 170 | ) 171 | print(result) 172 | else: 173 | assert False, "No such test option for %s" % args.test 174 | else: 175 | print('Splits in Train data:', nlvr2.train_tuple.dataset.splits) 176 | if nlvr2.valid_tuple is not None: 177 | print('Splits in Valid data:', nlvr2.valid_tuple.dataset.splits) 178 | else: 179 | print("DO NOT USE VALIDATION") 180 | nlvr2.train(nlvr2.train_tuple, nlvr2.valid_tuple) 181 | 182 | 183 | -------------------------------------------------------------------------------- 
/src/tasks/gqa_data_patches.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyleft 2019 project LXRT. 3 | 4 | import json 5 | 6 | import numpy as np 7 | import torch 8 | from torch.utils.data import Dataset 9 | 10 | from src.param import args 11 | from src.utils import load_obj_tsv, load_patches 12 | 13 | # Load part of the dataset for fast checking. 14 | # Notice that here is the number of images instead of the number of data, 15 | # which means all related data to the images would be used. 16 | TINY_IMG_NUM = 512 17 | FAST_IMG_NUM = 5000 18 | 19 | 20 | class GQADataset: 21 | """ 22 | A GQA data example in json file: 23 | { 24 | "img_id": "2375429", 25 | "label": { 26 | "pipe": 1.0 27 | }, 28 | "question_id": "07333408", 29 | "sent": "What is on the white wall?" 30 | } 31 | """ 32 | def __init__(self, splits: str): 33 | self.name = splits 34 | self.splits = splits.split(',') 35 | 36 | # Loading datasets to data 37 | self.data = [] 38 | for split in self.splits: 39 | self.data.extend(json.load(open("data/gqa/%s.json" % split))) 40 | print("Load %d data from split(s) %s." % (len(self.data), self.name)) 41 | 42 | # List to dict (for evaluation and others) 43 | self.id2datum = { 44 | datum['question_id']: datum 45 | for datum in self.data 46 | } 47 | 48 | # Answers 49 | self.ans2label = json.load(open("data/gqa/trainval_ans2label.json")) 50 | self.label2ans = json.load(open("data/gqa/trainval_label2ans.json")) 51 | assert len(self.ans2label) == len(self.label2ans) 52 | for ans, label in self.ans2label.items(): 53 | assert self.label2ans[label] == ans 54 | 55 | @property 56 | def num_answers(self): 57 | return len(self.ans2label) 58 | 59 | def __len__(self): 60 | return len(self.data) 61 | 62 | 63 | class GQABufferLoader(): 64 | def __init__(self): 65 | self.key2data = {} 66 | 67 | def load_data(self, name, number): 68 | if name == 'testdev': 69 | path = "data/gqa/testdev_patches_32x32.hdf5" 70 | elif name == 'valid': 71 | path = "data/gqa/valid_patches_32x32.hdf5" 72 | else: 73 | path = "data/gqa/train_patches_32x32.hdf5" 74 | key = "%s_%d" % (path, number) 75 | if key not in self.key2data: 76 | self.key2data[key] = load_patches( 77 | path, 78 | dataset='gqa', 79 | topk=number 80 | ) 81 | return self.key2data[key] 82 | 83 | 84 | gqa_buffer_loader = GQABufferLoader() 85 | 86 | 87 | """ 88 | Example in obj tsv: 89 | FIELDNAMES = ["img_id", "img_h", "img_w", "objects_id", "objects_conf", 90 | "attrs_id", "attrs_conf", "num_boxes", "boxes", "features"] 91 | """ 92 | class GQATorchDataset(Dataset): 93 | def __init__(self, dataset: GQADataset): 94 | super().__init__() 95 | self.raw_dataset = dataset 96 | 97 | if args.tiny: 98 | topk = TINY_IMG_NUM 99 | elif args.fast: 100 | topk = FAST_IMG_NUM 101 | else: 102 | topk = -1 103 | 104 | # Loading detection features to img_data 105 | # Since images in train and valid both come from Visual Genome, 106 | # buffer the image loading to save memory. 
107 | img_data = [] 108 | if 'testdev' in dataset.splits or 'testdev_all' in dataset.splits: # Always loading all the data in testdev 109 | img_data.extend(gqa_buffer_loader.load_data('testdev', -1)) 110 | elif 'valid' in dataset.splits: 111 | img_data.extend(gqa_buffer_loader.load_data('valid', -1)) 112 | else: 113 | img_data.extend(gqa_buffer_loader.load_data('train', topk)) 114 | self.imgid2img = {} 115 | for img_datum in img_data: 116 | self.imgid2img[img_datum['img_id']] = img_datum 117 | 118 | # Only kept the data with loaded image features 119 | self.data = [] 120 | for datum in self.raw_dataset.data: 121 | if datum['img_id'] in self.imgid2img: 122 | self.data.append(datum) 123 | print("Use %d data in torch dataset" % (len(self.data))) 124 | print() 125 | 126 | def __len__(self): 127 | return len(self.data) 128 | 129 | def __getitem__(self, item: int): 130 | datum = self.data[item] 131 | 132 | img_id = datum['img_id'] 133 | ques_id = datum['question_id'] 134 | ques = datum['sent'] 135 | h,w = 7,7 136 | # Get image info 137 | img_info = self.imgid2img[img_id] 138 | obj_num = img_info['num_boxes'] 139 | # boxes = img_info['boxes'].copy() 140 | feats = img_info['features'].copy() 141 | # assert len(boxes) == len(feats) == obj_num 142 | boxes = np.ones(h*w + 1, dtype=np.float32) # assuming feats of shape [d, h, w] 143 | # Normalize the boxes (to 0 ~ 1) 144 | # img_h, img_w = img_info['img_h'], img_info['img_w'] 145 | # boxes = boxes.copy() 146 | # boxes[:, (0, 2)] /= img_w 147 | # boxes[:, (1, 3)] /= img_h 148 | # np.testing.assert_array_less(boxes, 1+1e-5) 149 | # np.testing.assert_array_less(-boxes, 0+1e-5) 150 | 151 | # Create target 152 | if 'label' in datum: 153 | label = datum['label'] 154 | target = torch.zeros(self.raw_dataset.num_answers) 155 | for ans, score in label.items(): 156 | if ans in self.raw_dataset.ans2label: 157 | target[self.raw_dataset.ans2label[ans]] = score 158 | return ques_id, feats, boxes, ques, target 159 | else: 160 | return ques_id, feats, boxes, ques 161 | 162 | 163 | class GQAEvaluator: 164 | def __init__(self, dataset: GQADataset): 165 | self.dataset = dataset 166 | 167 | def evaluate(self, quesid2ans: dict): 168 | score = 0. 169 | for quesid, ans in quesid2ans.items(): 170 | datum = self.dataset.id2datum[quesid] 171 | label = datum['label'] 172 | if ans in label: 173 | score += label[ans] 174 | return score / len(quesid2ans) 175 | 176 | def save_json(self, data, file_path): 177 | with open(file_path, "w") as f: 178 | json.dump(data, f) 179 | 180 | def dump_result(self, quesid2ans: dict, path): 181 | """ 182 | Dump the result to a GQA-challenge submittable json file. 183 | GQA json file submission requirement: 184 | results = [result] 185 | result = { 186 | "questionId": str, # Note: it's a actually an int number but the server requires an str. 187 | "prediction": str 188 | } 189 | 190 | :param quesid2ans: A dict mapping question id to its predicted answer. 191 | :param path: The file path to save the json file. 192 | :return: 193 | """ 194 | with open(path, 'w') as f: 195 | result = [] 196 | for ques_id, ans in quesid2ans.items(): 197 | result.append({ 198 | 'questionId': ques_id, 199 | 'prediction': ans 200 | }) 201 | json.dump(result, f, indent=4, sort_keys=True) 202 | 203 | 204 | -------------------------------------------------------------------------------- /src/tasks/gqa_data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyleft 2019 project LXRT. 
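# GQA dataset variant that reads grid ('spatial') features from gqa_spatial.h5 instead of
# the 36-object detector TSVs; since there are no region proposals, 'boxes' is a dummy
# vector of ones with one entry per spatial location plus one extra (presumably global)
# position.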
3 | 4 | import json 5 | 6 | import numpy as np 7 | import torch 8 | from torch.utils.data import Dataset 9 | 10 | from src.param import args 11 | from src.utils import load_obj_tsv, load_spatial_gqa 12 | 13 | # Load part of the dataset for fast checking. 14 | # Notice that here is the number of images instead of the number of data, 15 | # which means all related data to the images would be used. 16 | TINY_IMG_NUM = 512 17 | FAST_IMG_NUM = 5000 18 | 19 | 20 | class GQADataset: 21 | """ 22 | A GQA data example in json file: 23 | { 24 | "img_id": "2375429", 25 | "label": { 26 | "pipe": 1.0 27 | }, 28 | "question_id": "07333408", 29 | "sent": "What is on the white wall?" 30 | } 31 | """ 32 | def __init__(self, splits: str): 33 | self.name = splits 34 | self.splits = splits.split(',') 35 | 36 | # Loading datasets to data 37 | self.data = [] 38 | for split in self.splits: 39 | self.data.extend(json.load(open("../../data/gqa/%s.json" % split))) 40 | print("Load %d data from split(s) %s." % (len(self.data), self.name)) 41 | 42 | # List to dict (for evaluation and others) 43 | self.id2datum = { 44 | datum['question_id']: datum 45 | for datum in self.data 46 | } 47 | 48 | # Answers 49 | self.ans2label = json.load(open("../../data/gqa/trainval_ans2label.json")) 50 | self.label2ans = json.load(open("../../data/gqa/trainval_label2ans.json")) 51 | assert len(self.ans2label) == len(self.label2ans) 52 | for ans, label in self.ans2label.items(): 53 | assert self.label2ans[label] == ans 54 | 55 | @property 56 | def num_answers(self): 57 | return len(self.ans2label) 58 | 59 | def __len__(self): 60 | return len(self.data) 61 | 62 | 63 | class GQABufferLoader(): 64 | def __init__(self): 65 | self.key2data = {} 66 | 67 | def load_data(self, name, number): 68 | if name == 'testdev': 69 | # path = "data/vg_gqa_imgfeat/gqa_testdev_obj36.tsv" 70 | path = "../../data/gqa/gqa_spatial.h5" 71 | else: 72 | # path = "data/vg_gqa_imgfeat/vg_gqa_obj36.tsv" 73 | path = "../../data/gqa/gqa_spatial.h5" 74 | key = "%s_%d" % (path, number) 75 | if key not in self.key2data: 76 | self.key2data[key] = load_spatial_gqa( 77 | path, 78 | topk=number 79 | ) 80 | return self.key2data[key] 81 | 82 | 83 | gqa_buffer_loader = GQABufferLoader() 84 | 85 | 86 | """ 87 | Example in obj tsv: 88 | FIELDNAMES = ["img_id", "img_h", "img_w", "objects_id", "objects_conf", 89 | "attrs_id", "attrs_conf", "num_boxes", "boxes", "features"] 90 | """ 91 | class GQATorchDataset(Dataset): 92 | def __init__(self, dataset: GQADataset): 93 | super().__init__() 94 | self.raw_dataset = dataset 95 | # self.img_info_data = json.load('data/gqa/gqa_spatial_merged_info.json') 96 | 97 | if args.tiny: 98 | topk = TINY_IMG_NUM 99 | elif args.fast: 100 | topk = FAST_IMG_NUM 101 | else: 102 | topk = -1 103 | 104 | # Loading detection features to img_data 105 | # Since images in train and valid both come from Visual Genome, 106 | # buffer the image loading to save memory. 
107 | img_data = [] 108 | if 'testdev' in dataset.splits or 'testdev_all' in dataset.splits: # Always loading all the data in testdev 109 | img_data.extend(gqa_buffer_loader.load_data('testdev', -1)) 110 | else: 111 | img_data.extend(gqa_buffer_loader.load_data('train', topk)) 112 | self.imgid2img = {} 113 | for img_datum in img_data: 114 | self.imgid2img[img_datum['img_id']] = img_datum 115 | 116 | # Only kept the data with loaded image features 117 | self.data = [] 118 | for datum in self.raw_dataset.data: 119 | if datum['img_id'] in self.imgid2img: 120 | self.data.append(datum) 121 | print("Use %d data in torch dataset" % (len(self.data))) 122 | print() 123 | 124 | def __len__(self): 125 | return len(self.data) 126 | 127 | def __getitem__(self, item: int): 128 | datum = self.data[item] 129 | 130 | img_id = datum['img_id'] 131 | ques_id = datum['question_id'] 132 | ques = datum['sent'] 133 | 134 | # Get image info 135 | img_info = self.imgid2img[img_id] 136 | obj_num = img_info['num_boxes'] 137 | # boxes = img_info['boxes'].copy() 138 | 139 | feats = img_info['features'].copy() 140 | ##Aisha change: 141 | 142 | boxes = np.ones(feats.shape[1]*feats.shape[2]+1, dtype=np.float32) #assuming feats of shape [d, h, w] 143 | # assert len(boxes) == len(feats) == obj_num 144 | 145 | # Normalize the boxes (to 0 ~ 1) 146 | # img_h, img_w = img_info['img_h'], img_info['img_w'] 147 | # boxes = boxes.copy() 148 | # boxes[:, (0, 2)] /= img_w 149 | # boxes[:, (1, 3)] /= img_h 150 | # np.testing.assert_array_less(boxes, 1+1e-5) 151 | # np.testing.assert_array_less(-boxes, 0+1e-5) 152 | 153 | # Create target 154 | if 'label' in datum: 155 | label = datum['label'] 156 | target = torch.zeros(self.raw_dataset.num_answers) 157 | for ans, score in label.items(): 158 | if ans in self.raw_dataset.ans2label: 159 | target[self.raw_dataset.ans2label[ans]] = score 160 | return ques_id, feats, boxes, ques, target 161 | else: 162 | return ques_id, feats, boxes, ques 163 | 164 | 165 | class GQAEvaluator: 166 | def __init__(self, dataset: GQADataset): 167 | self.dataset = dataset 168 | 169 | def evaluate(self, quesid2ans: dict): 170 | score = 0. 171 | for quesid, ans in quesid2ans.items(): 172 | datum = self.dataset.id2datum[quesid] 173 | label = datum['label'] 174 | if ans in label: 175 | score += label[ans] 176 | return score / len(quesid2ans) 177 | 178 | def save_json(self, data, file_path): 179 | with open(file_path, "w") as f: 180 | json.dump(data, f) 181 | 182 | def dump_result(self, quesid2ans: dict, path): 183 | """ 184 | Dump the result to a GQA-challenge submittable json file. 185 | GQA json file submission requirement: 186 | results = [result] 187 | result = { 188 | "questionId": str, # Note: it's a actually an int number but the server requires an str. 189 | "prediction": str 190 | } 191 | 192 | :param quesid2ans: A dict mapping question id to its predicted answer. 193 | :param path: The file path to save the json file. 194 | :return: 195 | """ 196 | with open(path, 'w') as f: 197 | result = [] 198 | for ques_id, ans in quesid2ans.items(): 199 | result.append({ 200 | 'questionId': ques_id, 201 | 'prediction': ans 202 | }) 203 | json.dump(result, f, indent=4, sort_keys=True) 204 | 205 | 206 | -------------------------------------------------------------------------------- /src/tasks/vqahat_data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyleft 2019 project LXRT. 
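# VQA-HAT data pipeline: the class names are kept from the GQA grid-feature version above,
# but questions are read from data/VQA_HAT/*.json, features come from the MS COCO
# train/valid HDF5 files, and numeric image ids are remapped to COCO_val2014_* keys
# before lookup.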
3 | 4 | import json 5 | 6 | import numpy as np 7 | import torch 8 | from torch.utils.data import Dataset 9 | 10 | from src.param import args 11 | from src.utils import load_spatial_data 12 | 13 | # Load part of the dataset for fast checking. 14 | # Notice that here is the number of images instead of the number of data, 15 | # which means all related data to the images would be used. 16 | TINY_IMG_NUM = 512 17 | FAST_IMG_NUM = 5000 18 | 19 | 20 | class GQADataset: 21 | """ 22 | A GQA data example in json file: 23 | { 24 | "img_id": "2375429", 25 | "label": { 26 | "pipe": 1.0 27 | }, 28 | "question_id": "07333408", 29 | "sent": "What is on the white wall?" 30 | } 31 | """ 32 | def __init__(self, splits: str): 33 | self.name = splits 34 | self.splits = splits.split(',') 35 | 36 | # Loading datasets to data 37 | self.data = [] 38 | for split in self.splits: 39 | self.data.extend(json.load(open("../../data/VQA_HAT/%s.json" % split))) 40 | print("Load %d data from split(s) %s." % (len(self.data), self.name)) 41 | 42 | # List to dict (for evaluation and others) 43 | self.id2datum = { 44 | datum['question_id']: datum 45 | for datum in self.data 46 | } 47 | 48 | # Answers 49 | self.ans2label = json.load(open("../../data/VQA_HAT/trainval_ans2label.json")) 50 | self.label2ans = json.load(open("../../data/VQA_HAT/trainval_label2ans.json")) 51 | assert len(self.ans2label) == len(self.label2ans) 52 | for ans, label in self.ans2label.items(): 53 | assert self.label2ans[label] == ans 54 | 55 | @property 56 | def num_answers(self): 57 | return len(self.ans2label) 58 | 59 | def __len__(self): 60 | return len(self.data) 61 | 62 | 63 | class GQABufferLoader(): 64 | def __init__(self): 65 | self.key2data = {} 66 | 67 | def load_data(self, name, number): 68 | if name == 'testdev': 69 | # path = "data/vg_gqa_imgfeat/gqa_testdev_obj36.tsv" 70 | path = "../../data/mscoco_imgfeat/valid_features.hdf5" 71 | elif name == 'valid': 72 | # path = "data/vg_gqa_imgfeat/vg_gqa_obj36.tsv" 73 | path = "../../data/mscoco_imgfeat/valid_features.hdf5" 74 | else: 75 | path = "../../data/mscoco_imgfeat/train_features.hdf5" 76 | key = "%s_%d" % (path, number) 77 | print(key) 78 | if key not in self.key2data: 79 | self.key2data[key] = load_spatial_data( 80 | path, 81 | topk=number 82 | ) 83 | return self.key2data[key] 84 | 85 | 86 | gqa_buffer_loader = GQABufferLoader() 87 | 88 | 89 | """ 90 | Example in obj tsv: 91 | FIELDNAMES = ["img_id", "img_h", "img_w", "objects_id", "objects_conf", 92 | "attrs_id", "attrs_conf", "num_boxes", "boxes", "features"] 93 | """ 94 | class GQATorchDataset(Dataset): 95 | def __init__(self, dataset: GQADataset): 96 | super().__init__() 97 | self.raw_dataset = dataset 98 | # self.img_info_data = json.load('data/gqa/gqa_spatial_merged_info.json') 99 | 100 | if args.tiny: 101 | topk = TINY_IMG_NUM 102 | elif args.fast: 103 | topk = FAST_IMG_NUM 104 | else: 105 | topk = -1 106 | 107 | # Loading detection features to img_data 108 | # Since images in train and valid both come from Visual Genome, 109 | # buffer the image loading to save memory. 
110 | img_data = [] 111 | if 'testdev' in dataset.splits or 'testdev_all' in dataset.splits: # Always loading all the data in testdev 112 | img_data.extend(gqa_buffer_loader.load_data('testdev', -1)) 113 | elif 'valid' in dataset.splits: 114 | img_data.extend(gqa_buffer_loader.load_data('valid', -1)) 115 | else: 116 | img_data.extend(gqa_buffer_loader.load_data('train', topk)) 117 | self.imgid2img = {} 118 | for img_datum in img_data: 119 | self.imgid2img[img_datum['img_id']] = img_datum 120 | 121 | # Only kept the data with loaded image features 122 | self.data = [] 123 | for datum in self.raw_dataset.data: 124 | datum_img_id = "COCO_val2014_{0:012d}".format(datum['image_id']) 125 | if datum_img_id in self.imgid2img: 126 | datum['image_id'] = datum_img_id 127 | self.data.append(datum) 128 | print("Use %d data in torch dataset" % (len(self.data))) 129 | print() 130 | 131 | def __len__(self): 132 | return len(self.data) 133 | 134 | def __getitem__(self, item: int): 135 | datum = self.data[item] 136 | 137 | img_id = datum['image_id'] 138 | ques_id = datum['question_id'] 139 | ques = datum['question'] 140 | 141 | # Get image info 142 | img_info = self.imgid2img[img_id] 143 | obj_num = img_info['num_boxes'] 144 | # boxes = img_info['boxes'].copy() 145 | 146 | feats = img_info['features'].copy() 147 | ##Aisha change: 148 | 149 | boxes = np.ones(feats.shape[1]*feats.shape[2]+1, dtype=np.float32) #assuming feats of shape [d, h, w] 150 | # assert len(boxes) == len(feats) == obj_num 151 | 152 | # Normalize the boxes (to 0 ~ 1) 153 | # img_h, img_w = img_info['img_h'], img_info['img_w'] 154 | # boxes = boxes.copy() 155 | # boxes[:, (0, 2)] /= img_w 156 | # boxes[:, (1, 3)] /= img_h 157 | # np.testing.assert_array_less(boxes, 1+1e-5) 158 | # np.testing.assert_array_less(-boxes, 0+1e-5) 159 | 160 | # Create target 161 | if 'label' in datum: 162 | label = datum['label'] 163 | target = torch.zeros(self.raw_dataset.num_answers) 164 | for ans, score in label.items(): 165 | if ans in self.raw_dataset.ans2label: 166 | target[self.raw_dataset.ans2label[ans]] = score 167 | return ques_id, feats, boxes, ques, target 168 | else: 169 | return ques_id, feats, boxes, ques 170 | 171 | 172 | class GQAEvaluator: 173 | def __init__(self, dataset: GQADataset): 174 | self.dataset = dataset 175 | 176 | def evaluate(self, quesid2ans: dict): 177 | score = 0. 178 | for quesid, ans in quesid2ans.items(): 179 | datum = self.dataset.id2datum[quesid] 180 | label = datum['label'] 181 | if ans in label: 182 | score += label[ans] 183 | return score / len(quesid2ans) 184 | 185 | def save_json(self, data, file_path): 186 | with open(file_path, "w") as f: 187 | json.dump(data, f) 188 | 189 | def dump_result(self, quesid2ans: dict, path): 190 | """ 191 | Dump the result to a GQA-challenge submittable json file. 192 | GQA json file submission requirement: 193 | results = [result] 194 | result = { 195 | "questionId": str, # Note: it's a actually an int number but the server requires an str. 196 | "prediction": str 197 | } 198 | 199 | :param quesid2ans: A dict mapping question id to its predicted answer. 200 | :param path: The file path to save the json file. 
201 | :return: 202 | """ 203 | with open(path, 'w') as f: 204 | result = [] 205 | for ques_id, ans in quesid2ans.items(): 206 | result.append({ 207 | 'questionId': ques_id, 208 | 'prediction': ans 209 | }) 210 | json.dump(result, f, indent=4, sort_keys=True) 211 | 212 | 213 | -------------------------------------------------------------------------------- /src/param.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyleft 2019 project LXRT. 3 | 4 | import argparse 5 | import random 6 | 7 | import numpy as np 8 | import torch 9 | 10 | 11 | def get_optimizer(optim): 12 | # Bind the optimizer 13 | if optim == 'rms': 14 | print("Optimizer: Using RMSProp") 15 | optimizer = torch.optim.RMSprop 16 | elif optim == 'adam': 17 | print("Optimizer: Using Adam") 18 | optimizer = torch.optim.Adam 19 | elif optim == 'adamax': 20 | print("Optimizer: Using Adamax") 21 | optimizer = torch.optim.Adamax 22 | elif optim == 'sgd': 23 | print("Optimizer: sgd") 24 | optimizer = torch.optim.SGD 25 | elif 'bert' in optim: 26 | optimizer = 'bert' # The bert optimizer will be bind later. 27 | else: 28 | assert False, "Please add your optimizer %s in the list." % optim 29 | 30 | return optimizer 31 | 32 | 33 | def parse_args(): 34 | parser = argparse.ArgumentParser() 35 | 36 | # Data Splits 37 | parser.add_argument("--train", default='train') 38 | parser.add_argument("--valid", default='valid') 39 | parser.add_argument("--test", default=None) 40 | 41 | # Training Hyper-parameters 42 | parser.add_argument('--batchSize', dest='batch_size', type=int, default=256) 43 | parser.add_argument('--optim', default='bert') 44 | parser.add_argument('--lr', type=float, default=1e-4) 45 | parser.add_argument('--epochs', type=int, default=10) 46 | parser.add_argument('--dropout', type=float, default=0.1) 47 | parser.add_argument('--margin', type=float, default=0.1) 48 | parser.add_argument('--seed', type=int, default=9595, help='random seed') 49 | 50 | # Debugging 51 | parser.add_argument('--output', type=str, default='snap/test') 52 | parser.add_argument("--fast", action='store_const', default=False, const=True) 53 | parser.add_argument("--tiny", action='store_const', default=False, const=True) 54 | parser.add_argument("--tqdm", action='store_const', default=False, const=True) 55 | 56 | # Model Loading 57 | parser.add_argument('--load', type=str, default=None, 58 | help='Load the model (usually the fine-tuned model).') 59 | parser.add_argument('--loadLXMERT', dest='load_lxmert', type=str, default=None, 60 | help='Load the pre-trained LXMERT model.') 61 | parser.add_argument('--loadLXMERTQA', dest='load_lxmert_qa', type=str, default=None, 62 | help='Load the pre-trained LXMERT model with QA answer head.') 63 | parser.add_argument("--fromScratch", dest='from_scratch', action='store_const', default=False, const=True, 64 | help='If none of the --load, --loadLXMERT, --loadLXMERTQA is set, ' 65 | 'the model would be trained from scratch. If --fromScratch is' 66 | ' not specified, the model would load BERT-pre-trained weights by' 67 | ' default. 
') 68 | 69 | parser.add_argument("--vitInit", dest='vit_init', action='store_const', default=False, const=True, 70 | help='If --vitInit specified, rlayers will be initialized from vit weights ' 71 | 'starting from layer index specified with --startIndex') 72 | 73 | # Optimization 74 | parser.add_argument("--mceLoss", dest='mce_loss', action='store_const', default=False, const=True) 75 | 76 | # LXRT Model Config 77 | # Note: LXRT = L, X, R (three encoders), Transformer 78 | parser.add_argument("--llayers", default=9, type=int, help='Number of Language layers') 79 | parser.add_argument("--xlayers", default=5, type=int, help='Number of CROSS-modality layers.') 80 | parser.add_argument("--rlayers", default=5, type=int, help='Number of object Relationship layers.') 81 | parser.add_argument("--startIndex", dest='start_index', default=7, type=int, 82 | help='Specify the layer index to start loading vit weights from.') 83 | parser.add_argument("--skipConnection", dest='skip_connection', action='store_const', default=False, const=True) 84 | parser.add_argument("--sharedWeights", dest='shared_weights', action='store_const', default=False, const=True) 85 | parser.add_argument("--normInputs", dest='norm_inputs', action='store_const', default=False, const=True) 86 | parser.add_argument("--crossAttn", dest='cross_attn', action='store_const', default=False, const=True) 87 | parser.add_argument("--crossAttnType", dest='cross_attn_type', default="cross", type=str, 88 | choices=["cross", "self", 'cross_self', 'no_cross', 'old'], 89 | help='Types of cross-modality attention') 90 | parser.add_argument("--patches", dest='patches', action='store_const', default=False, const=True) 91 | parser.add_argument("--attnRouting", dest='attn_routing', action='store_const', default=False, const=True) 92 | parser.add_argument("--freezeWeights", dest='freeze_weights', action='store_const', default=False, const=True) 93 | parser.add_argument("--noCaps", dest='no_caps', action='store_const', default=False, const=True) 94 | parser.add_argument("--NUM_PRIM_CAPS", default=32, type=int, help='Number of primary capsules.') 95 | parser.add_argument("--NUM_VIS_CAPS", default=32, type=int, help='Number of visual capsules.') 96 | parser.add_argument("--POSE_DIM", default=4, type=int, help='Pose matrix size. 
Default is 4.') 97 | parser.add_argument("--HW", default=7, type=int, help='Spatial feature map size.') 98 | 99 | # LXRT evaluation 100 | parser.add_argument("--outputAttn", dest='output_attention', action='store_const', default=False, const=True) 101 | parser.add_argument("--numBlock", dest='num_block', default=-1, type=int) 102 | parser.add_argument("--gradCAM", dest='gradcam', action='store_const', default=False, const=True) 103 | # LXMERT Pre-training Config 104 | parser.add_argument("--taskMatched", dest='task_matched', action='store_const', default=False, const=True) 105 | parser.add_argument("--taskMaskLM", dest='task_mask_lm', action='store_const', default=False, const=True) 106 | parser.add_argument("--taskObjPredict", dest='task_obj_predict', action='store_const', default=False, const=True) 107 | parser.add_argument("--taskQA", dest='task_qa', action='store_const', default=False, const=True) 108 | parser.add_argument("--taskGrounding", dest='task_grounding', action='store_const', default=False, const=True) 109 | parser.add_argument("--taskContrastive", dest='task_contrastive', action='store_const', default=False, const=True) 110 | parser.add_argument("--visualLosses", dest='visual_losses', default='obj,attr,feat', type=str) 111 | parser.add_argument("--qaSets", dest='qa_sets', default=None, type=str) 112 | parser.add_argument("--excludeSet", dest='exclude_set', default='', type=str) 113 | parser.add_argument("--wordMaskRate", dest='word_mask_rate', default=0.15, type=float) 114 | parser.add_argument("--objMaskRate", dest='obj_mask_rate', default=0.15, type=float) 115 | 116 | # LXMERT Finetuning Config 117 | parser.add_argument('--trainParadigm', 118 | dest='train_paradigm', 119 | default='full', 120 | const='full', 121 | nargs='?', 122 | choices=['full', 'weak', 'zero'], 123 | help='training paradigm for refer expression task') 124 | # Training configuration 125 | parser.add_argument("--multiGPU", action='store_const', default=False, const=True) 126 | parser.add_argument("--numWorkers", dest='num_workers', default=0) 127 | 128 | # Parse the arguments. 129 | args = parser.parse_args() 130 | 131 | # Bind optimizer class. 132 | args.optimizer = get_optimizer(args.optim) 133 | 134 | # Set seeds 135 | torch.manual_seed(args.seed) 136 | random.seed(args.seed) 137 | np.random.seed(args.seed) 138 | 139 | return args 140 | 141 | 142 | args = parse_args() 143 | -------------------------------------------------------------------------------- /src/tasks/refcocoplus_data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyleft 2019 project LXRT. 3 | 4 | import json 5 | 6 | import numpy as np 7 | import torch 8 | from torch.utils.data import Dataset 9 | 10 | from src.param import args 11 | from src.utils import load_obj_tsv, load_spatial_data 12 | 13 | # Load part of the dataset for fast checking. 14 | # Notice that here is the number of images instead of the number of data, 15 | # which means all related data to the images would be used. 
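# These caps are what the --tiny / --fast flags defined in src/param.py select below;
# with neither flag set, topk stays -1 and load_spatial_data loads every image feature.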
16 | TINY_IMG_NUM = 512 17 | FAST_IMG_NUM = 5000 18 | 19 | 20 | class RefCOCOplusDataset: 21 | """ 22 | A GQA data example in json file: 23 | { 24 | "caption": caption, 25 | "sent_id": sent_id, 26 | "image_id": image_id, 27 | "refBox": refBox, 28 | "ref_id": ref_id, --> unique id assigned to each data sample 29 | } 30 | """ 31 | def __init__(self, splits: str): 32 | self.name = splits 33 | self.splits = splits.split(',') 34 | 35 | # Loading datasets to data 36 | self.data = [] 37 | for split in self.splits: 38 | self.data.extend(json.load(open("/data/Grounded-RL2021/lxmert/data/refcoco+/annotations_%s.json" % split))) 39 | print("Load %d data from split(s) %s." % (len(self.data), self.name)) 40 | 41 | 42 | # List to dict (for evaluation and others) 43 | self.id2datum = { 44 | datum['sent_id']: datum 45 | for datum in self.data 46 | } 47 | 48 | # Answers 49 | # self.ans2label = json.load(open("data/refcoco/trainval_ans2label.json")) 50 | # self.label2ans = json.load(open("data/refcoco/trainval_label2ans.json")) 51 | # assert len(self.ans2label) == len(self.label2ans) 52 | # for ans, label in self.ans2label.items(): 53 | # assert self.label2ans[label] == ans 54 | 55 | @property 56 | # def num_answers(self): 57 | # return len(self.ans2label) 58 | 59 | def __len__(self): 60 | return len(self.data) 61 | 62 | 63 | class RefCOCOplusBufferLoader(): 64 | def __init__(self): 65 | self.key2data = {} 66 | 67 | def load_data(self, name, number): 68 | # if name == 'testdev': 69 | # # path = "data/vg_gqa_imgfeat/gqa_testdev_obj36.tsv" 70 | # path = "data/refcoco/refcoco_testdev_spatial.h5" 71 | # else: 72 | # # path = "data/vg_gqa_imgfeat/vg_gqa_obj36.tsv" 73 | # path = "data/refcoco/refcoco_testdev_spatial.h5" 74 | path = "/data/Grounded-RL2021/lxmert/data/refcoco+/{}_features.hdf5".format(name) 75 | key = "%s_%d" % (path, number) 76 | if key not in self.key2data: 77 | self.key2data[key] = load_spatial_data( 78 | path, 79 | topk=number 80 | ) 81 | return self.key2data[key] 82 | 83 | 84 | gqa_buffer_loader = RefCOCOplusBufferLoader() 85 | 86 | 87 | """ 88 | Example in obj tsv: 89 | FIELDNAMES = ["img_id", "img_h", "img_w", "objects_id", "objects_conf", 90 | "attrs_id", "attrs_conf", "num_boxes", "boxes", "features"] 91 | """ 92 | class RefCOCOplusTorchDataset(Dataset): 93 | def __init__(self, dataset: RefCOCOplusDataset): 94 | super().__init__() 95 | self.raw_dataset = dataset 96 | # self.img_info_data = json.load('data/gqa/gqa_spatial_merged_info.json') 97 | 98 | if args.tiny: 99 | topk = TINY_IMG_NUM 100 | elif args.fast: 101 | topk = FAST_IMG_NUM 102 | else: 103 | topk = -1 104 | 105 | # Loading detection features to img_data 106 | # Since images in train and valid both come from Visual Genome, 107 | # buffer the image loading to save memory. 
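        # Each img_datum yielded by the buffer loader is expected to expose at least
        # 'image_id', 'num_boxes', 'img_h', 'img_w' and a 'features' array of shape
        # [d, h, w]; only these keys are read below (inferred from usage, not from the
        # hdf5 schema itself).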
108 | img_data = [] 109 | if 'test' in dataset.splits or 'test' in dataset.splits: # Always loading all the data in testdev 110 | img_data.extend(gqa_buffer_loader.load_data('test', -1)) 111 | elif 'valid' in dataset.splits or 'valid' in dataset.splits: # Always loading all the data in testdev 112 | img_data.extend(gqa_buffer_loader.load_data('valid', -1)) 113 | else: 114 | img_data.extend(gqa_buffer_loader.load_data('train', topk)) 115 | self.imgid2img = {} 116 | for img_datum in img_data: 117 | self.imgid2img[img_datum['image_id']] = img_datum 118 | 119 | # Only kept the data with loaded image features 120 | self.data = [] 121 | for datum in self.raw_dataset.data: 122 | if datum['image_id'] in self.imgid2img: 123 | self.data.append(datum) 124 | print("Use %d data in torch dataset" % (len(self.data))) 125 | print() 126 | 127 | def __len__(self): 128 | return len(self.data) 129 | 130 | def __getitem__(self, item: int): 131 | datum = self.data[item] 132 | 133 | img_id = datum['image_id'] 134 | sent_id = datum['sent_id'] 135 | sent = datum['caption'] 136 | 137 | # Get image info 138 | img_info = self.imgid2img[img_id] 139 | obj_num = img_info['num_boxes'] 140 | # boxes = img_info['boxes'].copy() 141 | 142 | feats = img_info['features'].copy() 143 | ##Aisha change: 144 | 145 | boxes = np.ones(feats.shape[1]*feats.shape[2], dtype=np.float32) #assuming feats of shape [d, h, w] 146 | # assert len(boxes) == len(feats) == obj_num 147 | 148 | target_box = datum['refBox'] 149 | # Normalize the boxes (to 0 ~ 1) 150 | img_h, img_w = img_info['img_h'], img_info['img_w'] 151 | target_box = target_box.copy() 152 | # target_box[:, (0, 2)] /= img_w 153 | # target_box[:, (1, 3)] /= img_h 154 | target_box[0] /= img_w 155 | target_box[2] /= img_w 156 | target_box[1] /= img_h 157 | target_box[3] /= img_h 158 | np.testing.assert_array_less(np.array(target_box), 1+1e-5) 159 | np.testing.assert_array_less(-np.array(target_box), 0+1e-5) 160 | 161 | # Create target 162 | # if 'label' in datum: 163 | # label = datum['label'] 164 | # target = torch.zeros(self.raw_dataset.num_answers) 165 | # for ans, score in label.items(): 166 | # if ans in self.raw_dataset.ans2label: 167 | # target[self.raw_dataset.ans2label[ans]] = score 168 | # return ref_id, feats, target_box, sent, target 169 | # else: 170 | return sent_id, feats, boxes, sent, target_box 171 | 172 | 173 | class RefCOCOplusEvaluator: 174 | def __init__(self, dataset: RefCOCOplusDataset): 175 | self.dataset = dataset 176 | 177 | def evaluate(self, sentid2box: dict): 178 | score = 0. 179 | for sentid, box in sentid2box.items(): 180 | datum = self.dataset.id2datum[sentid] 181 | label = datum['refBox'] 182 | if box in label: 183 | score += label[box] 184 | return score / len(sentid2box) 185 | 186 | def save_json(self, data, file_path): 187 | with open(file_path, "w") as f: 188 | json.dump(data, f) 189 | 190 | def dump_result(self, quesid2ans: dict, path): 191 | """ 192 | Dump the result to a GQA-challenge submittable json file. 193 | GQA json file submission requirement: 194 | results = [result] 195 | result = { 196 | "questionId": str, # Note: it's a actually an int number but the server requires an str. 197 | "prediction": str 198 | } 199 | 200 | :param quesid2ans: A dict mapping question id to its predicted answer. 201 | :param path: The file path to save the json file. 
202 | :return: 203 | """ 204 | with open(path, 'w') as f: 205 | result = [] 206 | for ques_id, ans in quesid2ans.items(): 207 | result.append({ 208 | 'questionId': ques_id, 209 | 'prediction': ans 210 | }) 211 | json.dump(result, f, indent=4, sort_keys=True) 212 | 213 | 214 | -------------------------------------------------------------------------------- /src/lxrt/entry.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 project LXRT. 3 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 4 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | import os 19 | 20 | import torch 21 | import torch.nn as nn 22 | 23 | from src.lxrt.tokenization import BertTokenizer 24 | from src.lxrt.modeling_capsbert import LXRTFeatureExtraction as VisualBertForLXRFeature, VISUAL_CONFIG 25 | 26 | 27 | class InputFeatures(object): 28 | """A single set of features of data.""" 29 | 30 | def __init__(self, input_ids, input_mask, segment_ids): 31 | self.input_ids = input_ids 32 | self.input_mask = input_mask 33 | self.segment_ids = segment_ids 34 | 35 | 36 | def convert_sents_to_features(sents, max_seq_length, tokenizer): 37 | """Loads a data file into a list of `InputBatch`s.""" 38 | 39 | features = [] 40 | for (i, sent) in enumerate(sents): 41 | tokens_a = tokenizer.tokenize(sent.strip()) 42 | 43 | # Account for [CLS] and [SEP] with "- 2" 44 | if len(tokens_a) > max_seq_length - 2: 45 | tokens_a = tokens_a[:(max_seq_length - 2)] 46 | 47 | # Keep segment id which allows loading BERT-weights. 48 | tokens = ["[CLS]"] + tokens_a + ["[SEP]"] 49 | segment_ids = [0] * len(tokens) 50 | 51 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 52 | 53 | # The mask has 1 for real tokens and 0 for padding tokens. Only real 54 | # tokens are attended to. 55 | input_mask = [1] * len(input_ids) 56 | 57 | # Zero-pad up to the sequence length. 
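        # For example, with max_seq_length = 8 and sent = "man on a horse" (a made-up
        # sentence, assuming each word maps to a single wordpiece):
        #   tokens      -> ['[CLS]', 'man', 'on', 'a', 'horse', '[SEP]']
        #   input_mask  -> [1, 1, 1, 1, 1, 1, 0, 0]
        #   segment_ids -> [0, 0, 0, 0, 0, 0, 0, 0]
        # with input_ids zero-padded from 6 entries to 8 by the lines below.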
58 | padding = [0] * (max_seq_length - len(input_ids)) 59 | input_ids += padding 60 | input_mask += padding 61 | segment_ids += padding 62 | 63 | assert len(input_ids) == max_seq_length 64 | assert len(input_mask) == max_seq_length 65 | assert len(segment_ids) == max_seq_length 66 | 67 | features.append( 68 | InputFeatures(input_ids=input_ids, 69 | input_mask=input_mask, 70 | segment_ids=segment_ids)) 71 | return features 72 | 73 | 74 | def set_visual_config(args): 75 | VISUAL_CONFIG.l_layers = args.llayers 76 | VISUAL_CONFIG.x_layers = args.xlayers 77 | VISUAL_CONFIG.r_layers = args.rlayers 78 | 79 | #capsules config 80 | VISUAL_CONFIG.num_prim_caps = args.NUM_PRIM_CAPS 81 | VISUAL_CONFIG.num_vis_caps = args.NUM_VIS_CAPS 82 | VISUAL_CONFIG.pose_matrix_dim = args.POSE_DIM 83 | VISUAL_CONFIG.hw = args.HW 84 | VISUAL_CONFIG.caps_dim = args.NUM_VIS_CAPS * (args.POSE_DIM*args.POSE_DIM+1) 85 | VISUAL_CONFIG.is_attn_routing = args.attn_routing 86 | print(VISUAL_CONFIG.num_prim_caps) 87 | 88 | 89 | class LXRTEncoder(nn.Module): 90 | def __init__(self, args, max_seq_length, mode='x'): 91 | super().__init__() 92 | self.max_seq_length = max_seq_length 93 | set_visual_config(args) 94 | self.args = args 95 | self.mode = mode 96 | if torch.cuda.is_available(): 97 | self.device = 'cuda' 98 | else: 99 | self.device = 'cpu' 100 | 101 | # Using the bert tokenizer 102 | self.tokenizer = BertTokenizer.from_pretrained( 103 | "bert-base-uncased", 104 | do_lower_case=True 105 | ) 106 | cross_attn_type = args.cross_attn_type if hasattr(args, 'cross_attn_type') else 'old' 107 | 108 | # Build LXRT Model 109 | self.model = VisualBertForLXRFeature.from_pretrained( 110 | "bert-base-uncased", 111 | mode=mode, 112 | skip_connection=args.skip_connection, 113 | shared_weights=args.shared_weights, 114 | cross_attn = args.cross_attn, 115 | cross_attn_type=cross_attn_type, 116 | freeze_weights = args.freeze_weights, 117 | patches=args.patches, 118 | margin=args.margin, 119 | vit_init=args.vit_init, 120 | start_index=args.start_index, 121 | no_caps = args.no_caps 122 | ) 123 | 124 | if args.from_scratch: 125 | print("initializing all the weights") 126 | self.model.apply(self.model.init_bert_weights) 127 | 128 | # GPU Options 129 | if torch.cuda.is_available(): 130 | self.model = self.model.cuda() 131 | if args.multiGPU: 132 | self.model = nn.DataParallel(self.model) 133 | 134 | def multi_gpu(self): 135 | self.model = nn.DataParallel(self.model) 136 | 137 | @property 138 | def dim(self): 139 | return 768 140 | 141 | def forward(self, sents, feats, visual_attention_mask=None): 142 | train_features = convert_sents_to_features( 143 | sents, self.max_seq_length, self.tokenizer) 144 | 145 | input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long).to(self.device) 146 | input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long).to(self.device) 147 | segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long).to(self.device) 148 | 149 | # print(feats[0].shape) 150 | if self.mode == 'lxr': 151 | feat, output, attn_probs = self.model(input_ids, segment_ids, input_mask, 152 | visual_feats=feats, 153 | visual_attention_mask=visual_attention_mask, 154 | output_all_attention_masks=self.args.output_attention) 155 | else: 156 | feat = None 157 | output, attn_probs = self.model(input_ids, segment_ids, input_mask, 158 | visual_feats=feats, 159 | visual_attention_mask=visual_attention_mask, output_all_attention_masks=self.args.output_attention) 160 | return feat, 
output, attn_probs 161 | 162 | def save(self, path): 163 | torch.save(self.model.state_dict(), 164 | os.path.join("%s_LXRT.pth" % path)) 165 | 166 | def load(self, path): 167 | # Load state_dict from snapshot file 168 | print("Load LXMERT pre-trained model from %s" % path) 169 | state_dict = torch.load("%s_LXRT.pth" % path, map_location=torch.device(self.device)) 170 | new_state_dict = {} 171 | for key, value in state_dict.items(): 172 | if key.startswith("module."): 173 | new_state_dict[key[len("module."):]] = value 174 | 175 | else: 176 | new_state_dict[key] = value 177 | if key.startswith("lxrt_encoder.model."): 178 | new_state_dict[key[len("lxrt_encoder.model."):]] = value 179 | # else: 180 | # new_state_dict[key] = value 181 | state_dict = new_state_dict 182 | 183 | # Print out the differences of pre-trained and model weights. 184 | load_keys = set(state_dict.keys()) 185 | model_keys = set(self.model.state_dict().keys()) 186 | print() 187 | print("Weights in loaded but not in model:") 188 | for key in sorted(load_keys.difference(model_keys)): 189 | print(key) 190 | print() 191 | print("Weights in model but not in loaded:") 192 | for key in sorted(model_keys.difference(load_keys)): 193 | print(key) 194 | print() 195 | 196 | # Load weights to model 197 | self.model.load_state_dict(state_dict, strict=False) 198 | 199 | 200 | 201 | 202 | -------------------------------------------------------------------------------- /src/lxrt/optimization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 project LXRT 3 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """PyTorch optimization for BERT model.""" 17 | 18 | import math 19 | import torch 20 | from torch.optim import Optimizer 21 | from torch.optim.optimizer import required 22 | import logging 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | def warmup_cosine(x, warmup=0.002): 27 | if x < warmup: 28 | return x/warmup 29 | return 0.5 * (1.0 + torch.cos(math.pi * x)) 30 | 31 | def warmup_constant(x, warmup=0.002): 32 | """ Linearly increases learning rate over `warmup`*`t_total` (as provided to BertAdam) training steps. 33 | Learning rate is 1. afterwards. """ 34 | if x < warmup: 35 | return x/warmup 36 | return 1.0 37 | 38 | def warmup_linear(x, warmup=0.002): 39 | """ Specifies a triangular learning rate schedule where peak is reached at `warmup`*`t_total`-th (as provided to BertAdam) training step. 40 | After `t_total`-th training step, learning rate is zero. """ 41 | if x < warmup: 42 | return x/warmup 43 | return max((x-1.)/(warmup-1.), 0) 44 | 45 | SCHEDULES = { 46 | 'warmup_cosine': warmup_cosine, 47 | 'warmup_constant': warmup_constant, 48 | 'warmup_linear': warmup_linear, 49 | } 50 | 51 | 52 | class BertAdam(Optimizer): 53 | """Implements BERT version of Adam algorithm with weight decay fix. 
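    The scheduled learning rate at step `t` is `lr * schedule(t / t_total, warmup)`;
    with the default 'warmup_linear' schedule it ramps linearly from 0 to `lr` over
    the first `warmup * t_total` steps, then decays linearly to 0 at `t_total`
    (see warmup_linear above). With t_total = -1 the raw `lr` is used throughout.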
54 | Params: 55 | lr: learning rate 56 | warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1 57 | t_total: total number of training steps for the learning 58 | rate schedule, -1 means constant learning rate. Default: -1 59 | schedule: schedule to use for the warmup (see above). Default: 'warmup_linear' 60 | b1: Adams b1. Default: 0.9 61 | b2: Adams b2. Default: 0.999 62 | e: Adams epsilon. Default: 1e-6 63 | weight_decay: Weight decay. Default: 0.01 64 | max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0 65 | """ 66 | def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_linear', 67 | b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01, 68 | max_grad_norm=1.0): 69 | if lr is not required and lr < 0.0: 70 | raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) 71 | if schedule not in SCHEDULES: 72 | raise ValueError("Invalid schedule parameter: {}".format(schedule)) 73 | if not 0.0 <= warmup < 1.0 and not warmup == -1: 74 | raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup)) 75 | if not 0.0 <= b1 < 1.0: 76 | raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1)) 77 | if not 0.0 <= b2 < 1.0: 78 | raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2)) 79 | if not e >= 0.0: 80 | raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e)) 81 | defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total, 82 | b1=b1, b2=b2, e=e, weight_decay=weight_decay, 83 | max_grad_norm=max_grad_norm) 84 | super(BertAdam, self).__init__(params, defaults) 85 | 86 | def get_lr(self): 87 | lr = [] 88 | for group in self.param_groups: 89 | for p in group['params']: 90 | state = self.state[p] 91 | if len(state) == 0: 92 | return [0] 93 | if group['t_total'] != -1: 94 | schedule_fct = SCHEDULES[group['schedule']] 95 | lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup']) 96 | else: 97 | lr_scheduled = group['lr'] 98 | lr.append(lr_scheduled) 99 | return lr 100 | 101 | def step(self, closure=None): 102 | """Performs a single optimization step. 103 | 104 | Arguments: 105 | closure (callable, optional): A closure that reevaluates the model 106 | and returns the loss. 107 | """ 108 | loss = None 109 | if closure is not None: 110 | loss = closure() 111 | 112 | warned_for_t_total = False 113 | 114 | for group in self.param_groups: 115 | for p in group['params']: 116 | if p.grad is None: 117 | continue 118 | grad = p.grad.data 119 | if grad.is_sparse: 120 | raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') 121 | 122 | state = self.state[p] 123 | 124 | # State initialization 125 | if len(state) == 0: 126 | state['step'] = 0 127 | # Exponential moving average of gradient values 128 | state['next_m'] = torch.zeros_like(p.data) 129 | # Exponential moving average of squared gradient values 130 | state['next_v'] = torch.zeros_like(p.data) 131 | 132 | next_m, next_v = state['next_m'], state['next_v'] 133 | beta1, beta2 = group['b1'], group['b2'] 134 | 135 | # LXRT: grad is clipped outside. 
136 | # Add grad clipping 137 | # if group['max_grad_norm'] > 0: 138 | # clip_grad_norm_(p, group['max_grad_norm']) 139 | 140 | # Decay the first and second moment running average coefficient 141 | # In-place operations to update the averages at the same time 142 | next_m.mul_(beta1).add_(1 - beta1, grad) 143 | next_v.mul_(beta2).addcmul_(1 - beta2, grad, grad) 144 | update = next_m / (next_v.sqrt() + group['e']) 145 | 146 | # Just adding the square of the weights to the loss function is *not* 147 | # the correct way of using L2 regularization/weight decay with Adam, 148 | # since that will interact with the m and v parameters in strange ways. 149 | # 150 | # Instead we want to decay the weights in a manner that doesn't interact 151 | # with the m/v parameters. This is equivalent to adding the square 152 | # of the weights to the loss with plain (non-momentum) SGD. 153 | if group['weight_decay'] > 0.0: 154 | update += group['weight_decay'] * p.data 155 | 156 | if group['t_total'] != -1: 157 | schedule_fct = SCHEDULES[group['schedule']] 158 | progress = state['step']/group['t_total'] 159 | lr_scheduled = group['lr'] * schedule_fct(progress, group['warmup']) 160 | # warning for exceeding t_total (only active with warmup_linear 161 | if group['schedule'] == "warmup_linear" and progress > 1. and not warned_for_t_total: 162 | logger.warning( 163 | "Training beyond specified 't_total' steps with schedule '{}'. Learning rate set to {}. " 164 | "Please set 't_total' of {} correctly.".format(group['schedule'], lr_scheduled, self.__class__.__name__)) 165 | warned_for_t_total = True 166 | # end warning 167 | else: 168 | lr_scheduled = group['lr'] 169 | 170 | update_with_lr = lr_scheduled * update 171 | p.data.add_(-update_with_lr) 172 | 173 | state['step'] += 1 174 | 175 | # step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1 176 | # No bias correction 177 | # bias_correction1 = 1 - beta1 ** state['step'] 178 | # bias_correction2 = 1 - beta2 ** state['step'] 179 | 180 | return loss 181 | -------------------------------------------------------------------------------- /src/tasks/vqa.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyleft 2019 project LXRT. 
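# Fine-tuning / evaluation entry point for VQA: builds train/valid DataTuples, wraps
# the LXRT encoder in VQAModel, and optimizes a BCE loss over soft answer scores.
# A typical launch mirrors the bash wrappers under run/ (flag values are illustrative):
#   PYTHONPATH=$PYTHONPATH:./src python src/tasks/vqa.py \
#       --train train --valid valid --llayers 9 --xlayers 5 --rlayers 5 \
#       --tqdm --output snap/vqa/my_exp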
3 | 4 | import os 5 | import collections 6 | 7 | import torch 8 | import torch.nn as nn 9 | from torch.utils.data.dataloader import DataLoader 10 | from tqdm import tqdm 11 | 12 | from param import args 13 | from pretrain.qa_answer_table import load_lxmert_qa 14 | from tasks.vqa_model import VQAModel 15 | from tasks.vqa_data import VQADataset, VQATorchDataset, VQAEvaluator 16 | 17 | DataTuple = collections.namedtuple("DataTuple", 'dataset loader evaluator') 18 | 19 | 20 | def get_data_tuple(splits: str, bs:int, shuffle=False, drop_last=False) -> DataTuple: 21 | dset = VQADataset(splits) 22 | tset = VQATorchDataset(dset) 23 | evaluator = VQAEvaluator(dset) 24 | data_loader = DataLoader( 25 | tset, batch_size=bs, 26 | shuffle=shuffle, num_workers=args.num_workers, 27 | drop_last=drop_last, pin_memory=True 28 | ) 29 | 30 | return DataTuple(dataset=dset, loader=data_loader, evaluator=evaluator) 31 | 32 | 33 | class VQA: 34 | def __init__(self): 35 | # Datasets 36 | self.train_tuple = get_data_tuple( 37 | args.train, bs=args.batch_size, shuffle=True, drop_last=True 38 | ) 39 | if args.valid != "": 40 | self.valid_tuple = get_data_tuple( 41 | args.valid, bs=1024, 42 | shuffle=False, drop_last=False 43 | ) 44 | else: 45 | self.valid_tuple = None 46 | 47 | # Model 48 | self.model = VQAModel(self.train_tuple.dataset.num_answers) 49 | 50 | # Load pre-trained weights 51 | if args.load_lxmert is not None: 52 | self.model.lxrt_encoder.load(args.load_lxmert) 53 | if args.load_lxmert_qa is not None: 54 | load_lxmert_qa(args.load_lxmert_qa, self.model, 55 | label2ans=self.train_tuple.dataset.label2ans) 56 | 57 | # GPU options 58 | self.model = self.model.cuda() 59 | if args.multiGPU: 60 | self.model.lxrt_encoder.multi_gpu() 61 | 62 | # Loss and Optimizer 63 | self.bce_loss = nn.BCEWithLogitsLoss() 64 | if 'bert' in args.optim: 65 | batch_per_epoch = len(self.train_tuple.loader) 66 | t_total = int(batch_per_epoch * args.epochs) 67 | print("BertAdam Total Iters: %d" % t_total) 68 | from lxrt.optimization import BertAdam 69 | self.optim = BertAdam(list(self.model.parameters()), 70 | lr=args.lr, 71 | warmup=0.1, 72 | t_total=t_total) 73 | else: 74 | self.optim = args.optimizer(self.model.parameters(), args.lr) 75 | 76 | # Output Directory 77 | self.output = args.output 78 | os.makedirs(self.output, exist_ok=True) 79 | 80 | def train(self, train_tuple, eval_tuple): 81 | dset, loader, evaluator = train_tuple 82 | iter_wrapper = (lambda x: tqdm(x, total=len(loader))) if args.tqdm else (lambda x: x) 83 | 84 | best_valid = 0. 85 | for epoch in range(args.epochs): 86 | quesid2ans = {} 87 | for i, (ques_id, feats, boxes, sent, target) in iter_wrapper(enumerate(loader)): 88 | 89 | self.model.train() 90 | self.optim.zero_grad() 91 | 92 | feats, boxes, target = feats.cuda(), boxes.cuda(), target.cuda() 93 | logit = self.model(feats, boxes, sent) 94 | assert logit.dim() == target.dim() == 2 95 | loss = self.bce_loss(logit, target) 96 | loss = loss * logit.size(1) 97 | 98 | loss.backward() 99 | nn.utils.clip_grad_norm_(self.model.parameters(), 5.) 100 | self.optim.step() 101 | 102 | score, label = logit.max(1) 103 | for qid, l in zip(ques_id, label.cpu().numpy()): 104 | ans = dset.label2ans[l] 105 | quesid2ans[qid.item()] = ans 106 | 107 | log_str = "\nEpoch %d: Train %0.2f\n" % (epoch, evaluator.evaluate(quesid2ans) * 100.) 
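            # After each epoch: log training accuracy, then (below) evaluate on the
            # held-out split if one was given and snapshot the best-scoring weights as
            # "BEST", in addition to the "LAST" checkpoint saved once training ends.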
108 | 109 | if self.valid_tuple is not None: # Do Validation 110 | valid_score = self.evaluate(eval_tuple) 111 | if valid_score > best_valid: 112 | best_valid = valid_score 113 | self.save("BEST") 114 | 115 | log_str += "Epoch %d: Valid %0.2f\n" % (epoch, valid_score * 100.) + \ 116 | "Epoch %d: Best %0.2f\n" % (epoch, best_valid * 100.) 117 | 118 | print(log_str, end='') 119 | 120 | with open(self.output + "/log.log", 'a') as f: 121 | f.write(log_str) 122 | f.flush() 123 | 124 | self.save("LAST") 125 | 126 | def predict(self, eval_tuple: DataTuple, dump=None): 127 | """ 128 | Predict the answers to questions in a data split. 129 | 130 | :param eval_tuple: The data tuple to be evaluated. 131 | :param dump: The path of saved file to dump results. 132 | :return: A dict of question_id to answer. 133 | """ 134 | self.model.eval() 135 | dset, loader, evaluator = eval_tuple 136 | quesid2ans = {} 137 | for i, datum_tuple in enumerate(loader): 138 | ques_id, feats, boxes, sent = datum_tuple[:4] # Avoid seeing ground truth 139 | with torch.no_grad(): 140 | feats, boxes = feats.cuda(), boxes.cuda() 141 | logit = self.model(feats, boxes, sent) 142 | score, label = logit.max(1) 143 | for qid, l in zip(ques_id, label.cpu().numpy()): 144 | ans = dset.label2ans[l] 145 | quesid2ans[qid.item()] = ans 146 | if dump is not None: 147 | evaluator.dump_result(quesid2ans, dump) 148 | return quesid2ans 149 | 150 | def evaluate(self, eval_tuple: DataTuple, dump=None): 151 | """Evaluate all data in data_tuple.""" 152 | quesid2ans = self.predict(eval_tuple, dump) 153 | return eval_tuple.evaluator.evaluate(quesid2ans) 154 | 155 | @staticmethod 156 | def oracle_score(data_tuple): 157 | dset, loader, evaluator = data_tuple 158 | quesid2ans = {} 159 | for i, (ques_id, feats, boxes, sent, target) in enumerate(loader): 160 | _, label = target.max(1) 161 | for qid, l in zip(ques_id, label.cpu().numpy()): 162 | ans = dset.label2ans[l] 163 | quesid2ans[qid.item()] = ans 164 | return evaluator.evaluate(quesid2ans) 165 | 166 | def save(self, name): 167 | torch.save(self.model.state_dict(), 168 | os.path.join(self.output, "%s.pth" % name)) 169 | 170 | def load(self, path): 171 | print("Load model from %s" % path) 172 | state_dict = torch.load("%s.pth" % path) 173 | self.model.load_state_dict(state_dict) 174 | 175 | 176 | if __name__ == "__main__": 177 | # Build Class 178 | vqa = VQA() 179 | 180 | # Load VQA model weights 181 | # Note: It is different from loading LXMERT pre-trained weights. 182 | if args.load is not None: 183 | vqa.load(args.load) 184 | 185 | # Test or Train 186 | if args.test is not None: 187 | args.fast = args.tiny = False # Always loading all data in test 188 | if 'test' in args.test: 189 | vqa.predict( 190 | get_data_tuple(args.test, bs=950, 191 | shuffle=False, drop_last=False), 192 | dump=os.path.join(args.output, 'test_predict.json') 193 | ) 194 | elif 'val' in args.test: 195 | # Since part of valididation data are used in pre-training/fine-tuning, 196 | # only validate on the minival set. 
197 | result = vqa.evaluate( 198 | get_data_tuple('minival', bs=950, 199 | shuffle=False, drop_last=False), 200 | dump=os.path.join(args.output, 'minival_predict.json') 201 | ) 202 | print(result) 203 | else: 204 | assert False, "No such test option for %s" % args.test 205 | else: 206 | print('Splits in Train data:', vqa.train_tuple.dataset.splits) 207 | if vqa.valid_tuple is not None: 208 | print('Splits in Valid data:', vqa.valid_tuple.dataset.splits) 209 | print("Valid Oracle: %0.2f" % (vqa.oracle_score(vqa.valid_tuple) * 100)) 210 | else: 211 | print("DO NOT USE VALIDATION") 212 | vqa.train(vqa.train_tuple, vqa.valid_tuple) 213 | 214 | 215 | -------------------------------------------------------------------------------- /src/tasks/refcocog_data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyleft 2019 project LXRT. 3 | 4 | import json 5 | import random 6 | 7 | import numpy as np 8 | import torch 9 | from torch.utils.data import Dataset 10 | 11 | from src import eval_utils 12 | from src.param import args 13 | from src.utils import load_obj_tsv, load_spatial_data 14 | 15 | # Load part of the dataset for fast checking. 16 | # Notice that here is the number of images instead of the number of data, 17 | # which means all related data to the images would be used. 18 | TINY_IMG_NUM = 512 19 | FAST_IMG_NUM = 5000 20 | 21 | 22 | class RefCOCOgDataset: 23 | """ 24 | A GQA data example in json file: 25 | { 26 | "caption": caption, 27 | "sent_id": sent_id, 28 | "image_id": image_id, 29 | "refBox": refBox, 30 | "ref_id": ref_id, --> unique id assigned to each data sample 31 | } 32 | """ 33 | def __init__(self, splits: str): 34 | self.name = splits 35 | self.splits = splits.split(',') 36 | 37 | # Loading datasets to data 38 | self.data = [] 39 | for split in self.splits: 40 | self.data.extend(json.load(open("../../data/refcocog/annotations_%s.json" % split))) 41 | print("Load %d data from split(s) %s." 
% (len(self.data), self.name)) 42 | 43 | 44 | # List to dict (for evaluation and others) 45 | self.id2datum = { 46 | datum['sent_id']: datum 47 | for datum in self.data 48 | } 49 | 50 | # Answers 51 | # self.ans2label = json.load(open("data/refcoco/trainval_ans2label.json")) 52 | # self.label2ans = json.load(open("data/refcoco/trainval_label2ans.json")) 53 | # assert len(self.ans2label) == len(self.label2ans) 54 | # for ans, label in self.ans2label.items(): 55 | # assert self.label2ans[label] == ans 56 | 57 | @property 58 | # def num_answers(self): 59 | # return len(self.ans2label) 60 | 61 | def __len__(self): 62 | return len(self.data) 63 | 64 | 65 | class RefCOCOgBufferLoader(): 66 | def __init__(self): 67 | self.key2data = {} 68 | 69 | def load_data(self, name, number): 70 | # if name == 'testdev': 71 | # # path = "data/vg_gqa_imgfeat/gqa_testdev_obj36.tsv" 72 | # path = "data/refcoco/refcoco_testdev_spatial.h5" 73 | # else: 74 | # # path = "data/vg_gqa_imgfeat/vg_gqa_obj36.tsv" 75 | # path = "data/refcoco/refcoco_testdev_spatial.h5" 76 | path = "../../data/refcocog/{}_features.hdf5".format(name) 77 | key = "%s_%d" % (path, number) 78 | if key not in self.key2data: 79 | self.key2data[key] = load_spatial_data( 80 | path, 81 | topk=number 82 | ) 83 | return self.key2data[key] 84 | 85 | 86 | refcocog_buffer_loader = RefCOCOgBufferLoader() 87 | 88 | 89 | """ 90 | Example in obj tsv: 91 | FIELDNAMES = ["img_id", "img_h", "img_w", "objects_id", "objects_conf", 92 | "attrs_id", "attrs_conf", "num_boxes", "boxes", "features"] 93 | """ 94 | class RefCOCOgTorchDataset(Dataset): 95 | def __init__(self, dataset: RefCOCOgDataset): 96 | super().__init__() 97 | self.weakly_supervise = args.train_paradigm == 'weak' 98 | self.raw_dataset = dataset 99 | # self.img_info_data = json.load('data/gqa/gqa_spatial_merged_info.json') 100 | 101 | if args.tiny: 102 | topk = TINY_IMG_NUM 103 | elif args.fast: 104 | topk = FAST_IMG_NUM 105 | else: 106 | topk = -1 107 | 108 | # Loading detection features to img_data 109 | # Since images in train and valid both come from Visual Genome, 110 | # buffer the image loading to save memory. 111 | img_data = [] 112 | if 'test' in dataset.splits or 'test' in dataset.splits: # Always loading all the data in testdev 113 | img_data.extend(refcocog_buffer_loader.load_data('test', -1)) 114 | elif 'valid' in dataset.splits or 'valid' in dataset.splits: # Always loading all the data in testdev 115 | img_data.extend(refcocog_buffer_loader.load_data('valid', -1)) 116 | else: 117 | img_data.extend(refcocog_buffer_loader.load_data('train', topk)) 118 | self.imgid2img = {} 119 | for img_datum in img_data: 120 | self.imgid2img[img_datum['image_id']] = img_datum 121 | 122 | # Only kept the data with loaded image features 123 | self.data = [] 124 | for datum in self.raw_dataset.data: 125 | if datum['image_id'] in self.imgid2img: 126 | self.data.append(datum) 127 | print("Use %d data in torch dataset" % (len(self.data))) 128 | print() 129 | 130 | def __len__(self): 131 | return len(self.data) 132 | 133 | def __getitem__(self, item: int): 134 | datum = self.data[item] 135 | 136 | img_id = datum['image_id'] 137 | sent_id = datum['sent_id'] 138 | sent = datum['caption'] 139 | 140 | # If weakly supervision, replace the sentence with an sentence 141 | # corresponding to other image. 
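        # Sampling sketch for the weak setting: with probability 0.5 the caption is
        # swapped for one drawn from a different image and is_matched is set to 0, so a
        # downstream image-text matching loss can stand in for box supervision.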
142 |         is_matched = 1
143 |         if self.weakly_supervise:
144 |             if random.random() < 0.5:
145 |                 is_matched = 0
146 |                 other_datum = self.data[random.randint(0, len(self.data) - 1)]
147 |                 while other_datum['image_id'] == img_id:
148 |                     other_datum = self.data[random.randint(0, len(self.data) - 1)]
149 |                 sent = other_datum['caption']
150 | 
151 |         # Get image info
152 |         img_info = self.imgid2img[img_id]
153 |         obj_num = img_info['num_boxes']
154 |         # boxes = img_info['boxes'].copy()
155 | 
156 |         feats = img_info['features'].copy()
157 |         ##Aisha change:
158 | 
159 |         boxes = np.ones(feats.shape[1]*feats.shape[2]+1, dtype=np.float32) #assuming feats of shape [d, h, w]
160 |         # assert len(boxes) == len(feats) == obj_num
161 | 
162 |         target_box = datum['refBox']
163 |         # Normalize the boxes (to 0 ~ 1)
164 |         img_h, img_w = img_info['img_h'], img_info['img_w']
165 |         target_box = target_box.copy()
166 |         # target_box[:, (0, 2)] /= img_w
167 |         # target_box[:, (1, 3)] /= img_h
168 |         target_box[0] /= img_w
169 |         target_box[2] /= img_w
170 |         target_box[1] /= img_h
171 |         target_box[3] /= img_h
172 |         np.testing.assert_array_less(np.array(target_box), 1+1e-5)
173 |         np.testing.assert_array_less(-np.array(target_box), 0+1e-5)
174 | 
175 |         # Create target
176 |         # if 'label' in datum:
177 |         #     label = datum['label']
178 |         #     target = torch.zeros(self.raw_dataset.num_answers)
179 |         #     for ans, score in label.items():
180 |         #         if ans in self.raw_dataset.ans2label:
181 |         #             target[self.raw_dataset.ans2label[ans]] = score
182 |         #     return ref_id, feats, target_box, sent, target
183 |         # else:
184 |         return sent_id, feats, boxes, sent, torch.tensor(target_box), is_matched
185 | 
186 | 
187 | class RefCOCOgEvaluator:
188 |     def __init__(self, dataset: RefCOCOgDataset):
189 |         self.dataset = dataset
190 | 
191 |     def evaluate(self, sentid2box: dict):
192 |         sid2iou = {}
193 | 
194 |         for sentid, pred_box in sentid2box.items():
195 |             datum = self.dataset.id2datum[sentid.item()]
196 |             gt_box = datum['refBox']
197 |             miou, accu = eval_utils.trans_vg_eval_val(torch.as_tensor(pred_box), torch.as_tensor(gt_box))
198 |             sid2iou[sentid] = miou.detach().cpu().numpy()
199 | 
200 |         accu = self.iou_acc(sid2iou)
201 |         return accu.float() / len(sentid2box)
202 | 
203 |     def iou_acc(self, sid2iou):
204 |         accu = torch.sum(torch.FloatTensor(list(sid2iou.values())) >= 0.5)
205 |         return accu
206 | 
207 |     def save_json(self, data, file_path):
208 |         with open(file_path, "w") as f:
209 |             json.dump(data, f)
210 | 
211 |     def dump_result(self, sentid2box: dict, path):
212 |         """
213 |         Dump the result to a GQA-challenge submittable json file.
214 |         GQA json file submission requirement:
215 |             results = [result]
216 |             result = {
217 |                 "questionId": str,   # Note: it's actually an int, but the server requires a str.
218 |                 "prediction": str
219 |             }
220 | 
221 |         :param sentid2box: A dict mapping sentence id to its predicted box.
222 |         :param path: The file path to save the json file.
223 | :return: 224 | """ 225 | with open(path, 'w') as f: 226 | result = [] 227 | for sent_id, box in sentid2box.items(): 228 | result.append({ 229 | 'questionId': sent_id, 230 | 'prediction': box 231 | }) 232 | json.dump(result, f, indent=4, sort_keys=True) 233 | 234 | 235 | -------------------------------------------------------------------------------- /src/utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyleft 2019 Project LXRT 3 | import base64 4 | import csv 5 | import json 6 | import os 7 | import sys 8 | import time 9 | 10 | import h5py 11 | import numpy as np 12 | 13 | csv.field_size_limit(sys.maxsize) 14 | FIELDNAMES = ["img_id", "img_h", "img_w", "objects_id", "objects_conf", 15 | "attrs_id", "attrs_conf", "num_boxes", "boxes", "features"] 16 | 17 | 18 | def load_json(file_path): 19 | with open(file_path, "r") as f: 20 | return json.load(f) 21 | 22 | 23 | def save_json(data, file_path): 24 | with open(file_path, "w") as f: 25 | json.dump(data, f) 26 | 27 | 28 | def load_obj_tsv(fname, topk=None): 29 | """Load object features from tsv file. 30 | 31 | :param fname: The path to the tsv file. 32 | :param topk: Only load features for top K images (lines) in the tsv file. 33 | Will load all the features if topk is either -1 or None. 34 | :return: A list of image object features where each feature is a dict. 35 | See FILENAMES above for the keys in the feature dict. 36 | """ 37 | data = [] 38 | start_time = time.time() 39 | print("Start to load Faster-RCNN detected objects from %s" % fname) 40 | 41 | with open(fname) as f: 42 | reader = csv.DictReader(f, FIELDNAMES, delimiter="\t") 43 | for i, item in enumerate(reader): 44 | 45 | for key in ['img_h', 'img_w', 'num_boxes']: 46 | if item[key] == '': 47 | item[key] = 0 48 | item[key] = int(item[key]) 49 | 50 | boxes = item['num_boxes'] 51 | decode_config = [ 52 | ('objects_id', (boxes,), np.int64), 53 | ('objects_conf', (boxes,), np.float32), 54 | ('attrs_id', (boxes,), np.int64), 55 | ('attrs_conf', (boxes,), np.float32), 56 | ('boxes', (boxes, 4), np.float32), 57 | ('features', (7, 7, 1024), np.float64), 58 | ] 59 | for key, shape, dtype in decode_config: 60 | if key == 'features': 61 | decoded_item = base64.b64decode(item[key]) 62 | item[key] = np.frombuffer(decoded_item) # todo: replace dummy data with orig features 63 | else: 64 | item[key] = np.frombuffer(base64.b64decode(item[key]), dtype=dtype) 65 | item[key] = item[key].reshape(shape) 66 | item[key].setflags(write=False) 67 | 68 | data.append(item) 69 | if topk is not None and len(data) == topk: 70 | break 71 | elapsed_time = time.time() - start_time 72 | print("Loaded %d images in file %s in %d seconds." % (len(data), fname, elapsed_time)) 73 | return data 74 | 75 | ###### additional functions, Author: Aisha Urooj ####### 76 | def load_spatial_data(fname, topk=None): 77 | """Load object features from tsv file. 78 | 79 | :param fname: The path to the tsv file. 80 | :param topk: Only load features for top K images (lines) in the tsv file. 81 | Will load all the features if topk is either -1 or None. 82 | :return: A list of image object features where each feature is a dict. 83 | See FILENAMES above for the keys in the feature dict. 
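    In practice `fname` points at an hdf5 file of the form `<dir>/{split}_features.hdf5`;
    the function also reads a companion `<dir>/img_id2idx_{split}.json` mapping and pulls
    each image's grid features out of the hdf5 `data` dataset (see the body below), so,
    unlike load_obj_tsv above, no tsv file is read here.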
84 | """ 85 | data = [] 86 | fparts = fname.split('/') 87 | print(fparts[:-1]) 88 | fpath = os.path.join(*fparts[:-1]) 89 | fn = fparts[-1] 90 | split = fn.split('_')[0] 91 | mapping_fn = os.path.join(fpath, 'img_id2idx_{}.json'.format(split)) 92 | start_time = time.time() 93 | print("Reading %s file" % mapping_fn) 94 | img_id2idx_dict = load_json(mapping_fn) 95 | print("Start to load ResNet152 features from %s" % fname) 96 | 97 | h = h5py.File(os.path.join(fpath, '{}_features.hdf5'.format(split)), 'r') 98 | img_features = h['data'] 99 | 100 | for img_id, item in img_id2idx_dict.items(): 101 | item["features"] = img_features[item["i"]] 102 | item["img_id"] = img_id 103 | 104 | for key in ['objects_id', 'objects_conf', 'attrs_id', 'attrs_conf', 'boxes', 'features']: 105 | if item[key] is not None: 106 | item[key].setflags(write=False) 107 | else: 108 | if key == 'boxes': 109 | item[key] = np.zeros((1, 4)) 110 | else: 111 | item[key] = np.array([0, 0]) 112 | 113 | data.append(item) 114 | 115 | if topk is not None and len(data) == topk: 116 | break 117 | 118 | elapsed_time = time.time() - start_time 119 | print("Loaded %d images in file %s in %d seconds." % (len(data), fname, elapsed_time)) 120 | 121 | return data 122 | 123 | 124 | def load_spatial_gqa(fname, topk=None): 125 | """Load object features from tsv file. 126 | 127 | :param fname: The path to the tsv file. 128 | :param topk: Only load features for top K images (lines) in the tsv file. 129 | Will load all the features if topk is either -1 or None. 130 | :return: A list of image object features where each feature is a dict. 131 | See FILENAMES above for the keys in the feature dict. 132 | """ 133 | # todo: adopt function to read gqa data 134 | data = [] 135 | fparts = fname.split('/') 136 | print(fparts[:-1]) 137 | fpath = os.path.join(*fparts[:-1]) 138 | fn = fparts[-1] 139 | split = fn.split('_')[0] 140 | mapping_fn = os.path.join(fpath, 'gqa_spatial_merged_info.json') 141 | start_time = time.time() 142 | print("Reading %s file" % mapping_fn) 143 | img_id2idx_dict = load_json(mapping_fn) 144 | print("Start to load ResNet101 features from %s" % fname) 145 | 146 | h = h5py.File(os.path.join(fpath, 'gqa_spatial.h5'), 'r') 147 | img_features = h['features'] 148 | 149 | for img_id, item in img_id2idx_dict.items(): 150 | item["features"] = img_features[item["index"]] 151 | item["img_id"] = img_id 152 | 153 | item['objects_id'] = None 154 | item['objects_conf'] = None 155 | item['attrs_id'] = None 156 | item['attrs_conf'] = None 157 | item['boxes'] = None 158 | item['num_boxes'] = 0 159 | 160 | for key in ['objects_id', 'objects_conf', 'attrs_id', 'attrs_conf', 'boxes', 'features']: 161 | if item[key] is not None: 162 | item[key].setflags(write=False) 163 | else: 164 | if key == 'boxes': 165 | item[key] = np.zeros((1, 4)) 166 | else: 167 | item[key] = np.array([0, 0]) 168 | 169 | data.append(item) 170 | 171 | if topk is not None and len(data) == topk: 172 | break 173 | elapsed_time = time.time() - start_time 174 | print("Loaded %d images in file %s in %d seconds." % (len(data), fname, elapsed_time)) 175 | return data 176 | 177 | 178 | def load_patches(fname, dataset='', topk=None): 179 | """Load object features from tsv file. 180 | 181 | :param fname: The path to the tsv file. 182 | :param topk: Only load features for top K images (lines) in the tsv file. 183 | Will load all the features if topk is either -1 or None. 184 | :return: A list of image object features where each feature is a dict. 
185 | See FILENAMES above for the keys in the feature dict. 186 | """ 187 | # todo: adopt function to read gqa data 188 | assert dataset != '' 189 | data = [] 190 | fparts = fname.split('/') 191 | print(fparts[:-1]) 192 | fpath = os.path.join(*fparts[:-1]) 193 | fn = fparts[-1] 194 | split = fn.split('_')[0] 195 | mapping_fn = os.path.join(fpath, 'img_id2idx_{dataset}_{split}_32x32.json'.format(dataset=dataset, 196 | split=split)) 197 | start_time = time.time() 198 | print("Reading %s file" % mapping_fn) 199 | img_id2idx_dict = load_json(mapping_fn) 200 | print("Start to load image patches from %s" % fname) 201 | 202 | h = h5py.File(os.path.join(fpath, '{split}_patches_32x32.hdf5'.format(split=split)), 'r') 203 | img_features = h['data'] 204 | 205 | for img_id, item in img_id2idx_dict.items(): 206 | item["features"] = img_features[item["i"]] 207 | item["img_id"] = img_id 208 | 209 | item['objects_id'] = None 210 | item['objects_conf'] = None 211 | item['attrs_id'] = None 212 | item['attrs_conf'] = None 213 | item['boxes'] = None 214 | item['num_boxes'] = 0 215 | 216 | for key in ['objects_id', 'objects_conf', 'attrs_id', 'attrs_conf', 'boxes', 'features']: 217 | if item[key] is not None: 218 | item[key].setflags(write=False) 219 | else: 220 | if key == 'boxes': 221 | item[key] = np.zeros((1, 4)) 222 | else: 223 | item[key] = np.array([0, 0]) 224 | 225 | data.append(item) 226 | if topk is not None and len(data) == topk: 227 | break 228 | 229 | elapsed_time = time.time() - start_time 230 | print("Loaded %d images in file %s in %d seconds." % (len(data), fname, elapsed_time)) 231 | return data 232 | -------------------------------------------------------------------------------- /src/lxrt/file_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utilities for working with the local dataset cache. 3 | This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp 4 | Copyright by the AllenNLP authors. 5 | """ 6 | import json 7 | import logging 8 | import os 9 | import shutil 10 | import sys 11 | import tempfile 12 | from functools import wraps 13 | from hashlib import sha256 14 | from io import open 15 | 16 | import boto3 17 | import requests 18 | from botocore.exceptions import ClientError 19 | from tqdm import tqdm 20 | 21 | try: 22 | from urllib.parse import urlparse 23 | except ImportError: 24 | from urlparse import urlparse 25 | 26 | try: 27 | from pathlib import Path 28 | PYTORCH_PRETRAINED_BERT_CACHE = Path(os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', 29 | Path.home() / '.pytorch_pretrained_bert')) 30 | except (AttributeError, ImportError): 31 | PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', 32 | os.path.join(os.path.expanduser("~"), '.pytorch_pretrained_bert')) 33 | 34 | logger = logging.getLogger(__name__) # pylint: disable=invalid-name 35 | 36 | 37 | def url_to_filename(url, etag=None): 38 | """ 39 | Convert `url` into a hashed filename in a repeatable way. 40 | If `etag` is specified, append its hash to the url's, delimited 41 | by a period. 42 | """ 43 | url_bytes = url.encode('utf-8') 44 | url_hash = sha256(url_bytes) 45 | filename = url_hash.hexdigest() 46 | 47 | if etag: 48 | etag_bytes = etag.encode('utf-8') 49 | etag_hash = sha256(etag_bytes) 50 | filename += '.' + etag_hash.hexdigest() 51 | 52 | return filename 53 | 54 | 55 | def filename_to_url(filename, cache_dir=None): 56 | """ 57 | Return the url and etag (which may be ``None``) stored for `filename`. 
58 | Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist. 59 | """ 60 | if cache_dir is None: 61 | cache_dir = PYTORCH_PRETRAINED_BERT_CACHE 62 | if sys.version_info[0] == 3 and isinstance(cache_dir, Path): 63 | cache_dir = str(cache_dir) 64 | 65 | cache_path = os.path.join(cache_dir, filename) 66 | if not os.path.exists(cache_path): 67 | raise EnvironmentError("file {} not found".format(cache_path)) 68 | 69 | meta_path = cache_path + '.json' 70 | if not os.path.exists(meta_path): 71 | raise EnvironmentError("file {} not found".format(meta_path)) 72 | 73 | with open(meta_path, encoding="utf-8") as meta_file: 74 | metadata = json.load(meta_file) 75 | url = metadata['url'] 76 | etag = metadata['etag'] 77 | 78 | return url, etag 79 | 80 | 81 | def cached_path(url_or_filename, cache_dir=None): 82 | """ 83 | Given something that might be a URL (or might be a local path), 84 | determine which. If it's a URL, download the file and cache it, and 85 | return the path to the cached file. If it's already a local path, 86 | make sure the file exists and then return the path. 87 | """ 88 | if cache_dir is None: 89 | cache_dir = PYTORCH_PRETRAINED_BERT_CACHE 90 | if sys.version_info[0] == 3 and isinstance(url_or_filename, Path): 91 | url_or_filename = str(url_or_filename) 92 | if sys.version_info[0] == 3 and isinstance(cache_dir, Path): 93 | cache_dir = str(cache_dir) 94 | 95 | parsed = urlparse(url_or_filename) 96 | 97 | if parsed.scheme in ('http', 'https', 's3'): 98 | # URL, so get it from the cache (downloading if necessary) 99 | return get_from_cache(url_or_filename, cache_dir) 100 | elif os.path.exists(url_or_filename): 101 | # File, and it exists. 102 | return url_or_filename 103 | elif parsed.scheme == '': 104 | # File, but it doesn't exist. 105 | raise EnvironmentError("file {} not found".format(url_or_filename)) 106 | else: 107 | # Something unknown 108 | raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename)) 109 | 110 | 111 | def split_s3_path(url): 112 | """Split a full s3 path into the bucket name and path.""" 113 | parsed = urlparse(url) 114 | if not parsed.netloc or not parsed.path: 115 | raise ValueError("bad s3 path {}".format(url)) 116 | bucket_name = parsed.netloc 117 | s3_path = parsed.path 118 | # Remove '/' at beginning of path. 119 | if s3_path.startswith("/"): 120 | s3_path = s3_path[1:] 121 | return bucket_name, s3_path 122 | 123 | 124 | def s3_request(func): 125 | """ 126 | Wrapper function for s3 requests in order to create more helpful error 127 | messages. 
128 | """ 129 | 130 | @wraps(func) 131 | def wrapper(url, *args, **kwargs): 132 | try: 133 | return func(url, *args, **kwargs) 134 | except ClientError as exc: 135 | if int(exc.response["Error"]["Code"]) == 404: 136 | raise EnvironmentError("file {} not found".format(url)) 137 | else: 138 | raise 139 | 140 | return wrapper 141 | 142 | 143 | @s3_request 144 | def s3_etag(url): 145 | """Check ETag on S3 object.""" 146 | s3_resource = boto3.resource("s3") 147 | bucket_name, s3_path = split_s3_path(url) 148 | s3_object = s3_resource.Object(bucket_name, s3_path) 149 | return s3_object.e_tag 150 | 151 | 152 | @s3_request 153 | def s3_get(url, temp_file): 154 | """Pull a file directly from S3.""" 155 | s3_resource = boto3.resource("s3") 156 | bucket_name, s3_path = split_s3_path(url) 157 | s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file) 158 | 159 | 160 | def http_get(url, temp_file): 161 | req = requests.get(url, stream=True) 162 | content_length = req.headers.get('Content-Length') 163 | total = int(content_length) if content_length is not None else None 164 | progress = tqdm(unit="B", total=total) 165 | for chunk in req.iter_content(chunk_size=1024): 166 | if chunk: # filter out keep-alive new chunks 167 | progress.update(len(chunk)) 168 | temp_file.write(chunk) 169 | progress.close() 170 | 171 | 172 | def get_from_cache(url, cache_dir=None): 173 | """ 174 | Given a URL, look for the corresponding dataset in the local cache. 175 | If it's not there, download it. Then return the path to the cached file. 176 | """ 177 | if cache_dir is None: 178 | cache_dir = PYTORCH_PRETRAINED_BERT_CACHE 179 | if sys.version_info[0] == 3 and isinstance(cache_dir, Path): 180 | cache_dir = str(cache_dir) 181 | 182 | if not os.path.exists(cache_dir): 183 | os.makedirs(cache_dir) 184 | 185 | # Get eTag to add to filename, if it exists. 186 | if url.startswith("s3://"): 187 | etag = s3_etag(url) 188 | else: 189 | response = requests.head(url, allow_redirects=True) 190 | if response.status_code != 200: 191 | raise IOError("HEAD request failed for url {} with status code {}" 192 | .format(url, response.status_code)) 193 | etag = response.headers.get("ETag") 194 | 195 | filename = url_to_filename(url, etag) 196 | 197 | # get cache path to put the file 198 | cache_path = os.path.join(cache_dir, filename) 199 | 200 | if not os.path.exists(cache_path): 201 | # Download to temporary file, then copy to cache dir once finished. 202 | # Otherwise you get corrupt cache entries if the download gets interrupted. 
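        # The download below goes to a NamedTemporaryFile first and is copied into
        # cache_path only after it finishes, so an interrupted transfer never leaves a
        # truncated file under the cache key. (Writing the temp file inside cache_dir and
        # finishing with os.replace() would also make the final step atomic; the explicit
        # copy is simply what this helper does.)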
203 | with tempfile.NamedTemporaryFile() as temp_file: 204 | logger.info("%s not found in cache, downloading to %s", url, temp_file.name) 205 | 206 | # GET file object 207 | if url.startswith("s3://"): 208 | s3_get(url, temp_file) 209 | else: 210 | http_get(url, temp_file) 211 | 212 | # we are copying the file before closing it, so flush to avoid truncation 213 | temp_file.flush() 214 | # shutil.copyfileobj() starts at the current position, so go to the start 215 | temp_file.seek(0) 216 | 217 | logger.info("copying %s to cache at %s", temp_file.name, cache_path) 218 | with open(cache_path, 'wb') as cache_file: 219 | shutil.copyfileobj(temp_file, cache_file) 220 | 221 | logger.info("creating metadata file for %s", cache_path) 222 | meta = {'url': url, 'etag': etag} 223 | meta_path = cache_path + '.json' 224 | with open(meta_path, 'w', encoding="utf-8") as meta_file: 225 | json.dump(meta, meta_file) 226 | 227 | logger.info("removing temp file %s", temp_file.name) 228 | 229 | return cache_path 230 | 231 | 232 | def read_set_from_file(filename): 233 | ''' 234 | Extract a de-duped collection (set) of text from a file. 235 | Expected file format is one item per line. 236 | ''' 237 | collection = set() 238 | with open(filename, 'r', encoding='utf-8') as file_: 239 | for line in file_: 240 | collection.add(line.rstrip()) 241 | return collection 242 | 243 | 244 | def get_file_extension(path, dot=True, lower=True): 245 | ext = os.path.splitext(path)[1] 246 | ext = ext if dot else ext[1:] 247 | return ext.lower() if lower else ext 248 | -------------------------------------------------------------------------------- /src/tasks/mscoco_retrieval_data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyleft 2019 project LXRT. 3 | 4 | import json 5 | 6 | import numpy as np 7 | import torch 8 | from torch.utils.data import Dataset 9 | 10 | from src.param import args 11 | from src.utils import load_obj_tsv, load_spatial_data 12 | 13 | # Load part of the dataset for fast checking. 14 | # Notice that here is the number of images instead of the number of data, 15 | # which means all related data to the images would be used. 16 | TINY_IMG_NUM = 512 17 | FAST_IMG_NUM = 5000 18 | 19 | def make_uid(img_id, dset, sent_idx): 20 | return "%s_%s_%03d" % (img_id, dset, sent_idx), 21 | 22 | class MSCOCODataset: 23 | """ 24 | A GQA data example in json file: 25 | { 26 | "caption": caption, 27 | "sent_id": sent_id, 28 | "image_id": image_id, 29 | "refBox": refBox, 30 | "ref_id": ref_id, --> unique id assigned to each data sample 31 | } 32 | """ 33 | def __init__(self, splits: str): 34 | self.name = splits 35 | self.splits = splits.split(',') 36 | 37 | # Loading datasets to data 38 | self.data = [] 39 | for split in self.splits: 40 | if split == 'train': 41 | self.data.extend( 42 | json.load(open("/media/data/data/data/lxmert/mscoco_%s.json" % split))) 43 | else: 44 | self.data.extend(json.load(open("/media/data/data/data/lxmert/mscoco_karpathy_retrieval_%s.json" % split))) 45 | print("Load %d data from split(s) %s." 
% (len(self.data), self.name))
 46 | 
 47 |         data_flattened = self.flatten_data()
 48 |         self.data = data_flattened
 49 |         # List to dict (for evaluation and others)
 50 |         self.id2datum = {
 51 |             datum['uid']: datum
 52 |             for datum in self.data
 53 |         }
 54 | 
 55 |         # Answers
 56 |         # self.ans2label = json.load(open("data/refcoco/trainval_ans2label.json"))
 57 |         # self.label2ans = json.load(open("data/refcoco/trainval_label2ans.json"))
 58 |         # assert len(self.ans2label) == len(self.label2ans)
 59 |         # for ans, label in self.ans2label.items():
 60 |         #     assert self.label2ans[label] == ans
 61 | 
 62 |     def flatten_data(self):
 63 |         data_flattened = []
 64 |         for datum in self.data:
 65 |             sentf = datum['sentf']
 66 |             for sents_cat, sents in sentf.items():
 67 |                 if sents_cat == 'mscoco':
 68 |                     # print(sents_cat)
 69 |                     if sents_cat in datum['labelf']:
 70 |                         labels = datum['labelf'][sents_cat]
 71 |                     else:
 72 |                         labels = None
 73 |                     for sent_idx, sent in enumerate(sents):
 74 |                         new_datum = {
 75 |                             'uid': make_uid(datum['img_id'], sents_cat, sent_idx),
 76 |                             'img_id': datum['img_id'],
 77 |                             'sent_id': sent_idx,
 78 |                             'sent': sent
 79 |                         }
 80 |                         if labels is not None:
 81 |                             new_datum['label'] = labels[sent_idx]
 82 |                         data_flattened.append(new_datum)
 83 |                     break
 84 |         print("Flattened into %d image-sentence pairs." % (len(data_flattened)))
 85 |         return data_flattened
 86 | 
 87 |     # @property  # kept commented: an active decorator here would wrongly attach to __len__
 88 |     # def num_answers(self):
 89 |     #     return len(self.ans2label)
 90 | 
 91 |     def __len__(self):
 92 |         return len(self.data)
 93 | 
 94 | 
 95 | class MSCOCOBufferLoader():
 96 |     def __init__(self):
 97 |         self.key2data = {}
 98 | 
 99 |     def load_data(self, name, number):
100 |         # if name == 'testdev':
101 |         #     # path = "data/vg_gqa_imgfeat/gqa_testdev_obj36.tsv"
102 |         #     path = "data/refcoco/refcoco_testdev_spatial.h5"
103 |         # else:
104 |         #     # path = "data/vg_gqa_imgfeat/vg_gqa_obj36.tsv"
105 |         #     path = "data/refcoco/refcoco_testdev_spatial.h5"
106 |         path = "/media/data/data/data/mscoco_imgfeat/{}_features.hdf5".format(name)
107 |         key = "%s_%d" % (path, number)
108 |         if key not in self.key2data:
109 |             self.key2data[key] = load_spatial_data(
110 |                 path,
111 |                 topk=number
112 |             )
113 |         return self.key2data[key]
114 | 
115 | 
116 | mscoco_buffer_loader = MSCOCOBufferLoader()
117 | 
118 | 
119 | """
120 | Example in obj tsv:
121 | FIELDNAMES = ["img_id", "img_h", "img_w", "objects_id", "objects_conf",
122 |               "attrs_id", "attrs_conf", "num_boxes", "boxes", "features"]
123 | """
124 | class MSCOCOTorchDataset(Dataset):
125 |     def __init__(self, dataset: MSCOCODataset):
126 |         super().__init__()
127 |         self.raw_dataset = dataset
128 |         # self.img_info_data = json.load('data/gqa/gqa_spatial_merged_info.json')
129 | 
130 |         if args.tiny:
131 |             topk = TINY_IMG_NUM
132 |         elif args.fast:
133 |             topk = FAST_IMG_NUM
134 |         else:
135 |             topk = -1
136 | 
137 |         # Loading detection features to img_data
138 |         # Since images in train and valid both come from MS COCO,
139 |         # buffer the image loading to save memory.
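        # The module-level buffer loader below caches one feature dict per (path, topk)
        # key, so re-instantiating the dataset reuses already-loaded HDF5 features.
        # test/valid splits are always loaded in full; only the train split honours the
        # topk cap chosen above via --tiny / --fast.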
140 | img_data = [] 141 | if 'test' in dataset.splits or 'test' in dataset.splits: # Always loading all the data in testdev 142 | img_data.extend(mscoco_buffer_loader.load_data('test', -1)) 143 | elif 'valid' in dataset.splits or 'valid' in dataset.splits: # Always loading all the data in testdev 144 | img_data.extend(mscoco_buffer_loader.load_data('valid', -1)) 145 | else: 146 | img_data.extend(mscoco_buffer_loader.load_data('train', topk)) 147 | self.imgid2img = {} 148 | for img_datum in img_data: 149 | self.imgid2img[img_datum['img_id']] = img_datum 150 | 151 | # Only kept the data with loaded image features 152 | self.data = [] 153 | for datum in self.raw_dataset.data: 154 | if datum['img_id'] in self.imgid2img: 155 | self.data.append(datum) 156 | print("Use %d data in torch dataset" % (len(self.data))) 157 | print() 158 | 159 | def __len__(self): 160 | return len(self.data) 161 | 162 | def __getitem__(self, item: int): 163 | datum = self.data[item] 164 | 165 | img_id = datum['img_id'] 166 | sent_id = datum['uid'] 167 | sent = datum['sent'] 168 | 169 | # Get image info 170 | img_info = self.imgid2img[img_id] 171 | obj_num = img_info['num_boxes'] 172 | # boxes = img_info['boxes'].copy() 173 | 174 | feats = img_info['features'].copy() 175 | ##Aisha change: 176 | 177 | boxes = np.ones(feats.shape[1]*feats.shape[2]+1, dtype=np.float32) #assuming feats of shape [d, h, w] 178 | # assert len(boxes) == len(feats) == obj_num 179 | 180 | # target_box = datum['refBox'] 181 | # # Normalize the boxes (to 0 ~ 1) 182 | # img_h, img_w = img_info['img_h'], img_info['img_w'] 183 | # target_box = target_box.copy() 184 | # # target_box[:, (0, 2)] /= img_w 185 | # # target_box[:, (1, 3)] /= img_h 186 | # target_box[0] /= img_w 187 | # target_box[2] /= img_w 188 | # target_box[1] /= img_h 189 | # target_box[3] /= img_h 190 | # np.testing.assert_array_less(np.array(target_box), 1+1e-5) 191 | # np.testing.assert_array_less(-np.array(target_box), 0+1e-5) 192 | 193 | # Create target 194 | # if 'label' in datum: 195 | # label = datum['label'] 196 | # target = torch.zeros(self.raw_dataset.num_answers) 197 | # for ans, score in label.items(): 198 | # if ans in self.raw_dataset.ans2label: 199 | # target[self.raw_dataset.ans2label[ans]] = score 200 | # return ref_id, feats, target_box, sent, target 201 | # else: 202 | return sent_id, feats, boxes, sent 203 | 204 | 205 | class MSCOCOEvaluator: 206 | def __init__(self, dataset: MSCOCODataset): 207 | self.dataset = dataset 208 | 209 | def evaluate(self, sentid2box: dict): 210 | score = 0. 211 | for sentid, box in sentid2box.items(): 212 | datum = self.dataset.id2datum[sentid] 213 | label = datum['refBox'] 214 | if box in label: 215 | score += label[box] 216 | return score / len(sentid2box) 217 | 218 | def save_json(self, data, file_path): 219 | with open(file_path, "w") as f: 220 | json.dump(data, f) 221 | 222 | def dump_result(self, quesid2ans: dict, path): 223 | """ 224 | Dump the result to a GQA-challenge submittable json file. 225 | GQA json file submission requirement: 226 | results = [result] 227 | result = { 228 | "questionId": str, # Note: it's a actually an int number but the server requires an str. 229 | "prediction": str 230 | } 231 | 232 | :param quesid2ans: A dict mapping question id to its predicted answer. 233 | :param path: The file path to save the json file. 
234 | :return: 235 | """ 236 | with open(path, 'w') as f: 237 | result = [] 238 | for ques_id, ans in quesid2ans.items(): 239 | result.append({ 240 | 'questionId': ques_id, 241 | 'prediction': ans 242 | }) 243 | json.dump(result, f, indent=4, sort_keys=True) 244 | 245 | 246 | -------------------------------------------------------------------------------- /src/tasks/refcocoplus.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyleft 2019 project LXRT. 3 | 4 | import os 5 | import collections 6 | 7 | import gc 8 | import torch 9 | from tqdm import tqdm 10 | import torch.nn as nn 11 | from torch.utils.data.dataloader import DataLoader 12 | 13 | from src.param import args 14 | from src.pretrain.qa_answer_table import load_lxmert_qa 15 | from src.tasks.refcocoplus_model import RefCOCOplusModel 16 | from src.tasks.refcocoplus_data import RefCOCOplusDataset, RefCOCOplusTorchDataset, RefCOCOplusEvaluator 17 | 18 | print(args) 19 | DataTuple = collections.namedtuple("DataTuple", 'dataset loader evaluator') 20 | 21 | 22 | def get_tuple(splits: str, bs:int, shuffle=False, drop_last=False) -> DataTuple: 23 | dset = RefCOCOplusDataset(splits) 24 | tset = RefCOCOplusTorchDataset(dset) 25 | evaluator = RefCOCOplusEvaluator(dset) 26 | data_loader = DataLoader( 27 | tset, batch_size=bs, 28 | shuffle=shuffle, num_workers=args.num_workers, 29 | drop_last=drop_last, pin_memory=True 30 | ) 31 | 32 | return DataTuple(dataset=dset, loader=data_loader, evaluator=evaluator) 33 | 34 | 35 | class RefCOCOplus: 36 | def __init__(self): 37 | self.train_tuple = get_tuple( 38 | args.train, bs=args.batch_size, shuffle=True, drop_last=True 39 | ) 40 | if args.valid != "": 41 | valid_bsize = args.batch_size #2048 if args.multiGPU else args.batch_size#512 42 | self.valid_tuple = get_tuple( 43 | args.valid, bs=valid_bsize, 44 | shuffle=False, drop_last=False 45 | ) 46 | else: 47 | self.valid_tuple = None 48 | 49 | self.model = RefCOCOplusModel() 50 | 51 | # Load pre-trained weights 52 | if args.load_lxmert is not None: 53 | self.model.lxrt_encoder.load(args.load_lxmert) 54 | if args.load_lxmert_qa is not None: 55 | load_lxmert_qa(args.load_lxmert_qa, self.model, 56 | label2ans=self.train_tuple.dataset.label2ans) 57 | 58 | # GPU options 59 | self.model = self.model.cuda() 60 | if args.multiGPU: 61 | self.model.lxrt_encoder.multi_gpu() 62 | 63 | # Losses and optimizer 64 | self.bce_loss = nn.BCEWithLogitsLoss() 65 | self.mce_loss = nn.CrossEntropyLoss(ignore_index=-1) 66 | if 'bert' in args.optim: 67 | batch_per_epoch = len(self.train_tuple.loader) 68 | t_total = int(batch_per_epoch * args.epochs) 69 | print("Total Iters: %d" % t_total) 70 | from src.lxrt.optimization import BertAdam 71 | self.optim = BertAdam(list(self.model.parameters()), 72 | lr=args.lr, 73 | warmup=0.1, 74 | t_total=t_total) 75 | else: 76 | self.optim = args.optimizer(list(self.model.parameters()), args.lr) 77 | 78 | self.output = args.output 79 | 80 | os.makedirs(self.output, exist_ok=True) 81 | 82 | def train(self, train_tuple, eval_tuple): 83 | dset, loader, evaluator = train_tuple 84 | iter_wrapper = (lambda x: tqdm(x, total=len(loader))) if args.tqdm else (lambda x: x) 85 | 86 | best_valid = 0. 
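        # The epoch loop below optimises BCEWithLogitsLoss over the soft VQA-style answer
        # targets (scaled by the number of answer classes), or CrossEntropyLoss on the
        # argmax target when --mce_loss is set; gradients are clipped to norm 5 and the
        # model is checkpointed as BEST whenever the validation score improves.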
87 | for epoch in range(args.epochs): 88 | # log_str = '' 89 | quesid2ans = {} 90 | for i, (ques_id, feats, boxes, sent, target) in iter_wrapper(enumerate(loader)): 91 | 92 | self.model.train() 93 | self.optim.zero_grad() 94 | 95 | feats, boxes, target = feats.cuda(), boxes.cuda(), target.cuda() 96 | logit, attn_probs = self.model(feats, boxes, sent) 97 | assert logit.dim() == target.dim() == 2 98 | if args.mce_loss: 99 | max_value, target = target.max(1) 100 | loss = self.mce_loss(logit, target) * logit.size(1) 101 | else: 102 | loss = self.bce_loss(logit, target) 103 | loss = loss * logit.size(1) 104 | 105 | loss.backward() 106 | nn.utils.clip_grad_norm_(self.model.parameters(), 5.) 107 | self.optim.step() 108 | 109 | score, label = logit.max(1) 110 | for qid, l in zip(ques_id, label.cpu().numpy()): 111 | ans = dset.label2ans[l] 112 | quesid2ans[qid] = ans 113 | 114 | # del logit, attn_probs 115 | # gc.collect() 116 | 117 | log_str = "\nEpoch %d: Train %0.2f\n" % (epoch, evaluator.evaluate(quesid2ans) * 100.) 118 | 119 | # to handle GPU OOM error 120 | torch.cuda.empty_cache() 121 | 122 | if self.valid_tuple is not None: # Do Validation 123 | valid_score = self.evaluate(eval_tuple) 124 | if valid_score > best_valid: 125 | best_valid = valid_score 126 | self.save("BEST") 127 | 128 | log_str += "Epoch %d: Valid %0.2f\n" % (epoch, valid_score * 100.) + \ 129 | "Epoch %d: Best %0.2f\n" % (epoch, best_valid * 100.) 130 | 131 | print(log_str, end='') 132 | 133 | with open(self.output + "/log.log", 'a') as f: 134 | f.write(log_str) 135 | f.flush() 136 | 137 | self.save("LAST") 138 | 139 | def predict(self, eval_tuple: DataTuple, dump=None): 140 | self.model.eval() 141 | dset, loader, evaluator = eval_tuple 142 | quesid2ans = {} 143 | results = [] 144 | for i, datum_tuple in enumerate(loader): 145 | ques_id, feats, boxes, sent = datum_tuple[:4] # avoid handling target 146 | attention = [] 147 | 148 | with torch.no_grad(): 149 | feats, boxes = feats.cuda(), boxes.cuda() 150 | attn_probs = self.model(feats, boxes, sent) 151 | # print(attn_probs) 152 | if self.model.args.output_attention: 153 | last_layer_att_score = torch.squeeze(attn_probs[1][-1]['attn'][:, :, 0, :]) # batch_size, att_head, target_num_feat, source_num_feat -> use all att head and CLS as target 154 | # print(last_layer_att_score.shape) 155 | last_layer_att_score = last_layer_att_score.cpu().numpy().tolist() 156 | else: 157 | last_layer_att_score = [] 158 | 159 | # score, label = logit.max(1) 160 | for qid in ques_id: 161 | # ans = dset.label2ans[l] 162 | # quesid2ans[qid] = ans 163 | results.append( 164 | { 165 | "questionId": qid.tolist(), 166 | # "prediction": ans, 167 | "attention": last_layer_att_score 168 | } 169 | ) 170 | 171 | # del logit, attn_probs, datum_tuple 172 | # gc.collect() 173 | 174 | evaluator.save_json(results, '/data/Grounded-RL2021/lxmert/snap/refcoco+ /attentions.json') 175 | 176 | # if dump is not None: 177 | # evaluator.dump_result(quesid2ans, dump) 178 | # return quesid2ans 179 | 180 | def evaluate(self, eval_tuple: DataTuple, dump=None): 181 | dset, loader, evaluator = eval_tuple 182 | self.predict(eval_tuple, dump) 183 | # return evaluator.evaluate(quesid2ans) 184 | 185 | @staticmethod 186 | def oracle_score(data_tuple): 187 | dset, loader, evaluator = data_tuple 188 | quesid2ans = {} 189 | for i, (ques_id, feats, boxes, sent, target) in enumerate(loader): 190 | _, label = target.max(1) 191 | for qid, l in zip(ques_id, label.cpu().numpy()): 192 | ans = dset.label2ans[l] 193 | quesid2ans[qid] = ans 194 | 
return evaluator.evaluate(quesid2ans) 195 | 196 | def save(self, name): 197 | torch.save(self.model.state_dict(), 198 | os.path.join(self.output, "%s.pth" % name)) 199 | 200 | def load(self, path): 201 | print("Load model from %s" % path) 202 | state_dict = torch.load("%s.pth" % path) 203 | for key in list(state_dict.keys()): 204 | if '.module' in key: 205 | state_dict[key.replace('.module', '')] = state_dict.pop(key) 206 | self.model.load_state_dict(state_dict, strict=False) 207 | 208 | 209 | if __name__ == "__main__": 210 | torch.backends.cudnn.benchmark = True 211 | torch.backends.cudnn.enabled = True 212 | # Build Class 213 | refcoco = RefCOCOplus() 214 | 215 | 216 | # Load Model 217 | if args.load is not None: 218 | refcoco.load(args.load) 219 | 220 | # Test or Train 221 | if args.test is not None: 222 | args.fast = args.tiny = False # Always loading all data in test 223 | if 'submit' in args.test: 224 | refcoco.predict( 225 | get_tuple(args.test, bs=args.batch_size, 226 | shuffle=False, drop_last=False), 227 | dump=os.path.join(args.output, 'submit_predict.json') 228 | ) 229 | if 'test' in args.test: 230 | result = refcoco.evaluate( 231 | get_tuple('test', bs=args.batch_size, 232 | shuffle=False, drop_last=False), 233 | dump=os.path.join(args.output, 'test_predict.json') 234 | ) 235 | print(result) 236 | if 'valid' in args.test: 237 | result = refcoco.evaluate( 238 | get_tuple('valid', bs=args.batch_size, 239 | shuffle=False, drop_last=False), 240 | dump=os.path.join(args.output, 'valid_predict.json') 241 | ) 242 | print(result) 243 | else: 244 | # print("Train Oracle: %0.2f" % (gqa.oracle_score(gqa.train_tuple) * 100)) 245 | print('Splits in Train data:', refcoco.train_tuple.dataset.splits) 246 | if refcoco.valid_tuple is not None: 247 | print('Splits in Valid data:', refcoco.valid_tuple.dataset.splits) 248 | print("Valid Oracle: %0.2f" % (refcoco.oracle_score(refcoco.valid_tuple) * 100)) 249 | else: 250 | print("DO NOT USE VALIDATION") 251 | refcoco.train(refcoco.train_tuple, refcoco.valid_tuple) 252 | 253 | 254 | -------------------------------------------------------------------------------- /src/tasks/refcocog.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyleft 2019 project LXRT. 
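# Editor's sketch (not part of the original file): how the cross-attention that
# predict() stores below can be turned into a per-patch grounding map. The
# [batch, heads, target_tokens, source_tokens] shape and the use of row 0 (CLS) follow
# the inline comment in predict(); the 32x32 grid size is an assumption borrowed from
# the *_patches_32x32.hdf5 naming used elsewhere in this repo.
def _cls_attention_map_sketch():
    import torch
    batch, heads, n_txt, grid = 2, 12, 20, 32
    attn = torch.rand(batch, heads, n_txt, grid * grid)   # synthetic attention weights
    cls_to_visual = attn[:, :, 0, :]                       # CLS query row, every head
    grounding = cls_to_visual.mean(dim=1)                  # average heads -> [batch, patches]
    return grounding.view(batch, grid, grid)               # reshape to the patch grid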
3 | 4 | import os 5 | import collections 6 | 7 | import gc 8 | import torch 9 | from tqdm import tqdm 10 | import torch.nn as nn 11 | from torch.utils.data.dataloader import DataLoader 12 | 13 | from src import eval_utils 14 | from src.param import args 15 | from src.pretrain.qa_answer_table import load_lxmert_qa 16 | from src.tasks.refcocog_model import RefCOCOgModel 17 | from src.tasks.refcocog_data import RefCOCOgDataset, RefCOCOgTorchDataset, RefCOCOgEvaluator 18 | 19 | print(args) 20 | DataTuple = collections.namedtuple("DataTuple", 'dataset loader evaluator') 21 | 22 | 23 | def get_tuple(splits: str, bs:int, shuffle=False, drop_last=False) -> DataTuple: 24 | dset = RefCOCOgDataset(splits) 25 | tset = RefCOCOgTorchDataset(dset) 26 | evaluator = RefCOCOgEvaluator(dset) 27 | data_loader = DataLoader( 28 | tset, batch_size=bs, 29 | shuffle=shuffle, num_workers=args.num_workers, 30 | drop_last=drop_last, pin_memory=True 31 | ) 32 | 33 | return DataTuple(dataset=dset, loader=data_loader, evaluator=evaluator) 34 | 35 | 36 | class RefCOCOg: 37 | def __init__(self): 38 | self.weakly_supervise = args.train_paradigm == 'weak' 39 | self.train_tuple = get_tuple( 40 | args.train, bs=args.batch_size, shuffle=True, drop_last=True 41 | ) 42 | if args.valid != "": 43 | valid_bsize = args.batch_size #2048 if args.multiGPU else args.batch_size#512 44 | self.valid_tuple = get_tuple( 45 | args.valid, bs=valid_bsize, 46 | shuffle=False, drop_last=False 47 | ) 48 | else: 49 | self.valid_tuple = None 50 | 51 | self.model = RefCOCOgModel(train_paradigm=args.train_paradigm) 52 | 53 | # Load pre-trained weights 54 | if args.load_lxmert is not None: 55 | self.model.lxrt_encoder.load(args.load_lxmert) 56 | if args.load_lxmert_qa is not None: 57 | load_lxmert_qa(args.load_lxmert_qa, self.model, 58 | label2ans=self.train_tuple.dataset.label2ans) 59 | 60 | # GPU options 61 | if torch.cuda.is_available(): 62 | self.device = 'cuda' 63 | else: 64 | self.device = 'cpu' 65 | self.model = self.model.to(self.device) 66 | if args.multiGPU and self.device == 'cuda': 67 | self.model.lxrt_encoder.multi_gpu() 68 | 69 | # Losses and optimizer 70 | self.bce_loss = nn.BCEWithLogitsLoss() 71 | self.mce_loss = nn.CrossEntropyLoss(ignore_index=-1) 72 | self.l1_loss = nn.L1Loss(reduction='none') 73 | 74 | if 'bert' in args.optim: 75 | batch_per_epoch = len(self.train_tuple.loader) 76 | t_total = int(batch_per_epoch * args.epochs) 77 | print("Total Iters: %d" % t_total) 78 | from src.lxrt.optimization import BertAdam 79 | self.optim = BertAdam(list(self.model.parameters()), 80 | lr=args.lr, 81 | warmup=0.1, 82 | t_total=t_total) 83 | else: 84 | self.optim = args.optimizer(list(self.model.parameters()), args.lr) 85 | 86 | self.output = args.output 87 | 88 | os.makedirs(self.output, exist_ok=True) 89 | 90 | def train(self, train_tuple, eval_tuple): 91 | dset, loader, evaluator = train_tuple 92 | iter_wrapper = (lambda x: tqdm(x, total=len(loader))) if args.tqdm else (lambda x: x) 93 | 94 | best_valid = 0. 
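        # The epoch loop below switches on the training paradigm: in the weakly
        # supervised setting only the image-text matching BCE loss is optimised (no box
        # coordinates are used), while the fully supervised path regresses the referred
        # box with an L1 loss averaged over the batch. Both paths clip gradients to
        # norm 5 and checkpoint BEST on the validation score.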
95 | for epoch in range(args.epochs): 96 | # log_str = '' 97 | sentid2pbox = {} 98 | for i, (sent_id, feats, boxes, sent, target, is_matched) in iter_wrapper(enumerate(loader)): 99 | 100 | self.model.train() 101 | self.optim.zero_grad() 102 | 103 | feats, boxes, target, is_matched = feats.to(self.device), boxes.to(self.device), \ 104 | target.to(self.device), is_matched.to(self.device) 105 | 106 | logit, attn_probs = self.model(feats, boxes, sent) 107 | assert logit.dim() == target.dim() == 2 or logit.dim() == target.dim() == 4 108 | 109 | if self.weakly_supervise: 110 | loss = self.bce_loss(logit, is_matched) 111 | else: 112 | loss = self.l1_loss(logit, target) 113 | loss = loss.sum() / logit.shape[0] 114 | 115 | loss.backward() 116 | nn.utils.clip_grad_norm_(self.model.parameters(), 5.) 117 | self.optim.step() 118 | 119 | if self.weakly_supervise: 120 | pass 121 | else: 122 | miou, accu = eval_utils.trans_vg_eval_val(logit, target) 123 | print('Epoch: {epoch}, Iteration: {iter}, loss: {loss:.6f}, miou: {miou:.4f}, Accuracy: {acc:.4f}'.format( 124 | epoch=epoch, 125 | iter=i, 126 | loss=loss.item(), 127 | miou=miou.detach().mean().cpu().numpy(), 128 | acc=accu 129 | )) 130 | #todo: fix evaluation code for ref expression task 131 | pred_boxes = eval_utils.get_pred_boxes(logit) 132 | # score, label = logit.max(1) 133 | for sid, pbox in zip(sent_id, pred_boxes.cpu().detach().numpy()): 134 | sentid2pbox[sid] = pbox 135 | 136 | log_str = "\nEpoch %d: Train %0.2f\n" % (epoch, evaluator.evaluate(sentid2pbox) * 100.) 137 | 138 | # to handle GPU OOM error 139 | torch.cuda.empty_cache() 140 | 141 | if self.valid_tuple is not None: # Do Validation 142 | valid_score = self.evaluate(eval_tuple) 143 | if valid_score > best_valid: 144 | best_valid = valid_score 145 | self.save("BEST") 146 | 147 | log_str += "Epoch %d: Valid %0.2f\n" % (epoch, valid_score * 100.) + \ 148 | "Epoch %d: Best %0.2f\n" % (epoch, best_valid * 100.) 
149 | 
150 |             print(log_str, end='')
151 | 
152 |             with open(self.output + "/log.log", 'a') as f:
153 |                 f.write(log_str)
154 |                 f.flush()
155 | 
156 |         self.save("LAST")
157 | 
158 |     def predict(self, eval_tuple: DataTuple, dump=None):
159 |         self.model.eval()
160 |         dset, loader, evaluator = eval_tuple
161 |         sentid2ans = {}
162 |         results = []
163 |         for i, datum_tuple in enumerate(loader):
164 |             sent_id, feats, boxes, sent = datum_tuple[:4]  # avoid handling target
165 |             attention = []
166 | 
167 |             with torch.no_grad():
168 |                 feats, boxes = feats.to(self.device), boxes.to(self.device)
169 |                 logits, attn_probs = self.model(feats, boxes, sent)
170 |                 # print(attn_probs)
171 |                 if self.model.args.output_attention:
172 |                     last_layer_att_score = torch.squeeze(attn_probs[1][-1]['attn'][:, :, 0, :])  # batch_size, att_head, target_num_feat, source_num_feat -> use all att heads and CLS as target
173 |                     # print(last_layer_att_score.shape)
174 |                     last_layer_att_score = last_layer_att_score.cpu().numpy().tolist()
175 |                 else:
176 |                     last_layer_att_score = []
177 | 
178 |             pred_boxes = eval_utils.get_pred_boxes(logits)
179 | 
180 |             # One predicted box per sentence id, converted to lists so the json dump is serializable.
181 |             for sid, pbox in zip(sent_id, pred_boxes.cpu().numpy()):
182 |                 sentid2ans[sid] = pbox
183 |                 results.append(
184 |                     {
185 |                         "questionId": sid.tolist(),
186 |                         "prediction": pbox.tolist(),
187 |                         "attention": last_layer_att_score
188 |                     }
189 |                 )
190 | 
191 |             # del logit, attn_probs, datum_tuple
192 |             # gc.collect()
193 | 
194 |         evaluator.save_json(results, '/data/Grounded-RL2021/lxmert/snap/refcocog/attentions.json')
195 | 
196 |         if dump is not None:
197 |             evaluator.dump_result(sentid2ans, dump)
198 |         return sentid2ans
199 | 
200 |     def evaluate(self, eval_tuple: DataTuple, dump=None):
201 |         dset, loader, evaluator = eval_tuple
202 |         sentid2box = self.predict(eval_tuple, dump)
203 |         return evaluator.evaluate(sentid2box)
204 | 
205 |     @staticmethod
206 |     def oracle_score(data_tuple):
207 |         dset, loader, evaluator = data_tuple
208 |         sentid2box = {}
209 |         for i, (ques_id, feats, boxes, sent, target_box, is_matched) in enumerate(loader):
210 |             # target_ = torch.stack(target_box, dim=0).permute(1, 0)
211 |             miou, acc = eval_utils.trans_vg_eval_val(target_box, target_box, oracle=True)
212 |             # _, label = target_box.max(1)
213 |             for sid, iou in zip(ques_id, miou.cpu().numpy()):
214 |                 # ans = dset.label2ans[l]
215 |                 sentid2box[sid] = iou
216 |         return evaluator.iou_acc(sentid2box) / len(dset)
217 | 
218 |     def save(self, name):
219 |         torch.save(self.model.state_dict(),
220 |                    os.path.join(self.output, "%s.pth" % name))
221 | 
222 |     def load(self, path):
223 |         print("Load model from %s" % path)
224 |         state_dict = torch.load("%s.pth" % path)
225 |         for key in list(state_dict.keys()):
226 |             if '.module' in key:
227 |                 state_dict[key.replace('.module', '')] = state_dict.pop(key)
228 |         self.model.load_state_dict(state_dict, strict=False)
229 | 
230 | 
231 | if __name__ == "__main__":
232 |     torch.backends.cudnn.benchmark = True
233 |     torch.backends.cudnn.enabled = True
234 |     # Build Class
235 |     refcocog = RefCOCOg()
236 | 
237 |     # Load Model
238 |     if args.load is not None:
239 |         refcocog.load(args.load)
240 | 
241 |     # Test or Train
242 |     if args.test is not None:
243 |         args.fast = args.tiny = False       # Always loading all data in test
244 |         if 'submit' in args.test:
245 |             refcocog.predict(
246 |                 get_tuple(args.test, bs=args.batch_size,
247 |                           shuffle=False, drop_last=False),
248 |                 dump=os.path.join(args.output, 'submit_predict.json')
249 |             )
250 |         if 'test' in args.test:
251 |             result = refcocog.evaluate(
252 |                 get_tuple('test',
bs=args.batch_size, 253 | shuffle=False, drop_last=False), 254 | dump=os.path.join(args.output, 'test_predict.json') 255 | ) 256 | print(result) 257 | if 'valid' in args.test: 258 | result = refcocog.evaluate( 259 | get_tuple('valid', bs=args.batch_size, 260 | shuffle=False, drop_last=False), 261 | dump=os.path.join(args.output, 'valid_predict.json') 262 | ) 263 | print(result) 264 | else: 265 | # print("Train Oracle: %0.2f" % (gqa.oracle_score(gqa.train_tuple) * 100)) 266 | print('Splits in Train data:', refcocog.train_tuple.dataset.splits) 267 | if refcocog.valid_tuple is not None: 268 | print('Splits in Valid data:', refcocog.valid_tuple.dataset.splits) 269 | print("Valid Oracle: %0.2f" % (refcocog.oracle_score(refcocog.valid_tuple) * 100)) 270 | else: 271 | print("DO NOT USE VALIDATION") 272 | refcocog.train(refcocog.train_tuple, refcocog.valid_tuple) 273 | 274 | 275 | --------------------------------------------------------------------------------
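Editor's note: eval_utils (used by refcocog.py above for get_pred_boxes and
trans_vg_eval_val) is not included in this dump. The snippet below is only a minimal
sketch of the kind of IoU-based mean-IoU / accuracy metric such a helper typically
computes, assuming predicted and target boxes are [N, 4] tensors in (x1, y1, x2, y2)
format; the repository's actual implementation may differ.

import torch

def box_iou_sketch(pred, target):
    # Element-wise IoU between corresponding boxes in (x1, y1, x2, y2) format.
    lt = torch.max(pred[:, :2], target[:, :2])          # intersection top-left
    rb = torch.min(pred[:, 2:], target[:, 2:])          # intersection bottom-right
    wh = (rb - lt).clamp(min=0)                         # intersection width/height
    inter = wh[:, 0] * wh[:, 1]
    area_p = (pred[:, 2] - pred[:, 0]).clamp(min=0) * (pred[:, 3] - pred[:, 1]).clamp(min=0)
    area_t = (target[:, 2] - target[:, 0]).clamp(min=0) * (target[:, 3] - target[:, 1]).clamp(min=0)
    return inter / (area_p + area_t - inter + 1e-6)

def miou_and_accuracy_sketch(pred, target, iou_thresh=0.5):
    # Mean IoU plus the fraction of boxes whose IoU clears the threshold.
    iou = box_iou_sketch(pred, target)
    return iou.mean(), (iou > iou_thresh).float().mean()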