├── images ├── .none ├── main_fig.png ├── teaser.pdf └── teaser.png ├── data_files └── .gitignore ├── src ├── lxrt │ ├── .gitignore │ ├── vit_explore_code.py │ ├── PositionalEncoding.py │ ├── entry_spatial.py │ ├── entry.py │ ├── optimization.py │ └── file_utils.py ├── pretrain │ ├── .gitignore │ └── qa_answer_table.py ├── tasks │ ├── .gitignore │ ├── .DS_Store │ ├── gqa_model.py │ ├── refcocoplus_model.py │ ├── vqa_model.py │ ├── mscoco_retrieval_model.py │ ├── nlvr2_model.py │ ├── refcocog_model.py │ ├── refcoco_model.py │ ├── nlvr2_data.py │ ├── vqa_data.py │ ├── nlvr2.py │ ├── gqa_data_patches.py │ ├── gqa_data.py │ ├── vqahat_data.py │ ├── refcocoplus_data.py │ ├── vqa.py │ ├── refcocog_data.py │ ├── mscoco_retrieval_data.py │ ├── refcocoplus.py │ └── refcocog.py ├── param.py └── utils.py ├── run ├── README.md ├── gqa_test.bash ├── gqa_finetune_caps.bash ├── pretrain_2stage_fulldata_no_init_self.bash ├── pretrain_2stage_fulldata_no_init_patches.bash ├── pretrain_2stage_fulldata_no_init_selfcross.bash ├── pretrain_2stage_fulldata_no_init_self_patches.bash ├── 2stage_fulldata_no_init_16_caps.bash ├── 2stage_fulldata_no_init_24_caps.bash ├── pretrain_2stage_fulldata_no_init_64_caps.bash ├── pretrain_2stage_fulldata_no_init_48_caps.bash ├── pretrain_2stage_fulldata_vit_bert_init.bash ├── pretrain_2stage_fulldata_no_init_selfcross_patches.bash ├── pretrain_2stage_fulldata_vit_bert_init_self.bash ├── pretrain_2stage_fulldata_vit_no_bert_init.bash ├── pretrain_2stage_fulldata_vit_bert_init_cross_patches.bash ├── pretrain_2stage_fulldata_vit_bert_init_selfcross.bash ├── pretrain_2stage_fulldata_vit_no_bert_init_self.bash ├── pretrain_2stage_fulldata_vit_bert_16_caps.bash ├── pretrain_2stage_fulldata_vit_bert_24_caps.bash ├── pretrain_2stage_fulldata_vit_bert_init_self_patches.bash ├── pretrain_2stage_fulldata_vit_no_bert_init_patches.bash ├── pretrain_2stage_fulldata_vit_no_bert_init_selfcross.bash ├── pretrain_2stage_fulldata_vit_bert_init_selfcross_patches.bash ├── pretrain_2stage_fulldata_vit_no_bert_init_self_patches.bash └── pretrain_2stage_fulldata_vit_no_bert_init_selfcross_patches.bash ├── teaser.pdf ├── main_fig.png ├── LICENSE └── README.md /images/.none: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /data_files/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /src/lxrt/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /src/pretrain/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /src/tasks/.gitignore: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /run/README.md: -------------------------------------------------------------------------------- 1 | To do: 2 | 3 | Add pretraining instructions in detail here. 
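Until then, here is a minimal usage sketch based on how the scripts in this folder read their arguments (the experiment name `gqa_caps_run1` below is only a placeholder):

```bash
# Two-stage pretraining: the first argument selects GPUs via CUDA_VISIBLE_DEVICES;
# any further arguments are forwarded to lxmert_pretrain.py through ${@:2}.
bash run/pretrain_2stage_fulldata_no_init_self.bash 0,1,2,3

# GQA finetuning from a pretrained snapshot: $1 = GPU ids, $2 = experiment name
# (outputs go to snap/gqa/<name>); extra arguments are forwarded through ${@:3}.
bash run/gqa_finetune_caps.bash 0 gqa_caps_run1
```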
4 | -------------------------------------------------------------------------------- /teaser.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aurooj/WSG-VQA-VLTransformers/HEAD/teaser.pdf -------------------------------------------------------------------------------- /main_fig.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aurooj/WSG-VQA-VLTransformers/HEAD/main_fig.png -------------------------------------------------------------------------------- /images/main_fig.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aurooj/WSG-VQA-VLTransformers/HEAD/images/main_fig.png -------------------------------------------------------------------------------- /images/teaser.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aurooj/WSG-VQA-VLTransformers/HEAD/images/teaser.pdf -------------------------------------------------------------------------------- /images/teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aurooj/WSG-VQA-VLTransformers/HEAD/images/teaser.png -------------------------------------------------------------------------------- /src/tasks/.DS_Store: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/aurooj/WSG-VQA-VLTransformers/HEAD/src/tasks/.DS_Store -------------------------------------------------------------------------------- /src/lxrt/vit_explore_code.py: -------------------------------------------------------------------------------- 1 | import timm 2 | 3 | m = timm.create_model('mobilenetv3_large_100', pretrained=True) 4 | m.eval() 5 | from pprint import pprint 6 | model_names = timm.list_models(pretrained=True) 7 | pprint(model_names) -------------------------------------------------------------------------------- /run/gqa_test.bash: -------------------------------------------------------------------------------- 1 | # The name of this experiment. 2 | name=$2 3 | 4 | # Save logs and models under snap/gqa; make backup. 5 | output=snap/gqa/$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # See Readme.md for option details. 11 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 12 | python src/tasks/gqa.py \ 13 | --train train --valid "" \ 14 | --llayers 5 --xlayers 2 --rlayers 5 --outputAttn --skipConnection \ 15 | --tqdm --output $output ${@:3} 16 | -------------------------------------------------------------------------------- /run/gqa_finetune_caps.bash: -------------------------------------------------------------------------------- 1 | # The name of this experiment. 2 | name=$2 3 | 4 | # Save logs and models under snap/gqa; make backup. 5 | output=snap/gqa/$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # See Readme.md for option details. 
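# Usage sketch (inferred from the variables above; not an official command line):
#   bash run/gqa_finetune_caps.bash <gpu_ids> <experiment_name> [extra args]
# $1 sets CUDA_VISIBLE_DEVICES, $2 names the output directory snap/gqa/$name,
# and anything after the first two arguments is forwarded to gqa.py via ${@:3}.
# --loadLXMERT below points to an existing stage-2 pretraining snapshot.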
11 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 12 | python src/tasks/gqa.py \ 13 | --train train --valid valid \ 14 | --llayers 5 --xlayers 2 --rlayers 5 --NUM_PRIM_CAPS 32 --NUM_VIS_CAPS 32 --skipConnection --crossAttn\ 15 | --loadLXMERT snap/pretrain/mm_capsules_pretrain_552_stage2_fixed_continued/BEST_EVAL_LOSS \ 16 | --batchSize 32 --optim bert --lr 1e-5 --epochs 20\ 17 | --tqdm --output $output ${@:3} 18 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2022 aukhan 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /src/tasks/gqa_model.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyleft 2019 project LXRT. 3 | 4 | import torch.nn as nn 5 | 6 | from src.param import args 7 | from src.lxrt.entry import LXRTEncoder 8 | from src.lxrt.modeling_capsbert import BertLayerNorm, GeLU 9 | 10 | # Max length including and 11 | MAX_GQA_LENGTH = 20 12 | 13 | 14 | class GQAModel(nn.Module): 15 | def __init__(self, num_answers): 16 | super().__init__() 17 | self.lxrt_encoder = LXRTEncoder( 18 | args, 19 | max_seq_length=MAX_GQA_LENGTH 20 | ) 21 | hid_dim = self.lxrt_encoder.dim 22 | self.logit_fc = nn.Sequential( 23 | nn.Linear(hid_dim, hid_dim * 2), 24 | GeLU(), 25 | BertLayerNorm(hid_dim * 2, eps=1e-12), 26 | nn.Linear(hid_dim * 2, num_answers) 27 | ) 28 | self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights) 29 | self.args = args 30 | 31 | def forward(self, feat, pos, sent): 32 | """ 33 | b -- batch_size, o -- object_number, f -- visual_feature_size 34 | 35 | :param feat: (b, o, f) 36 | :param pos: (b, o, 4) 37 | :param sent: (b,) Type -- list of string 38 | :param leng: (b,) Type -- int numpy array 39 | :return: (b, num_answer) The logit of each answers. 40 | """ 41 | 42 | _, x, attn_probs = self.lxrt_encoder(sent, (feat, pos)) 43 | logit = self.logit_fc(x) 44 | 45 | return logit, attn_probs 46 | 47 | 48 | -------------------------------------------------------------------------------- /src/tasks/refcocoplus_model.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyleft 2019 project LXRT. 
3 | 4 | import torch.nn as nn 5 | 6 | from src.param import args 7 | from src.lxrt.entry import LXRTEncoder 8 | from src.lxrt.modeling_capsbert import BertLayerNorm, GeLU 9 | 10 | # Max length including and 11 | MAX_GQA_LENGTH = 20 12 | 13 | 14 | class RefCOCOplusModel(nn.Module): 15 | def __init__(self): 16 | super().__init__() 17 | self.lxrt_encoder = LXRTEncoder( 18 | args, 19 | max_seq_length=MAX_GQA_LENGTH 20 | ) 21 | # hid_dim = self.lxrt_encoder.dim 22 | # self.logit_fc = nn.Sequential( 23 | # nn.Linear(hid_dim, hid_dim * 2), 24 | # GeLU(), 25 | # BertLayerNorm(hid_dim * 2, eps=1e-12), 26 | # nn.Linear(hid_dim * 2, num_answers) 27 | # ) 28 | # self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights) 29 | self.args = args 30 | 31 | def forward(self, feat, pos, sent): 32 | """ 33 | b -- batch_size, o -- object_number, f -- visual_feature_size 34 | 35 | :param feat: (b, o, f) 36 | :param pos: (b, o, 4) 37 | :param sent: (b,) Type -- list of string 38 | :param leng: (b,) Type -- int numpy array 39 | :return: (b, num_answer) The logit of each answers. 40 | """ 41 | 42 | x, attn_probs = self.lxrt_encoder(sent, (feat, pos)) 43 | # logit = self.logit_fc(x) 44 | 45 | return attn_probs 46 | 47 | 48 | -------------------------------------------------------------------------------- /src/tasks/vqa_model.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyleft 2019 project LXRT. 3 | 4 | import torch.nn as nn 5 | 6 | from param import args 7 | from lxrt.entry import LXRTEncoder 8 | from lxrt.modeling import BertLayerNorm, GeLU 9 | 10 | # Max length including and 11 | MAX_VQA_LENGTH = 20 12 | 13 | 14 | class VQAModel(nn.Module): 15 | def __init__(self, num_answers): 16 | super().__init__() 17 | 18 | # Build LXRT encoder 19 | self.lxrt_encoder = LXRTEncoder( 20 | args, 21 | max_seq_length=MAX_VQA_LENGTH 22 | ) 23 | hid_dim = self.lxrt_encoder.dim 24 | 25 | # VQA Answer heads 26 | self.logit_fc = nn.Sequential( 27 | nn.Linear(hid_dim, hid_dim * 2), 28 | GeLU(), 29 | BertLayerNorm(hid_dim * 2, eps=1e-12), 30 | nn.Linear(hid_dim * 2, num_answers) 31 | ) 32 | self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights) 33 | 34 | def forward(self, feat, pos, sent): 35 | """ 36 | b -- batch_size, o -- object_number, f -- visual_feature_size 37 | 38 | :param feat: (b, o, f) 39 | :param pos: (b, o, 4) 40 | :param sent: (b,) Type -- list of string 41 | :param leng: (b,) Type -- int numpy array 42 | :return: (b, num_answer) The logit of each answers. 43 | """ 44 | x = self.lxrt_encoder(sent, (feat, pos)) 45 | logit = self.logit_fc(x) 46 | 47 | return logit 48 | 49 | 50 | -------------------------------------------------------------------------------- /src/tasks/mscoco_retrieval_model.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyleft 2019 project LXRT. 
3 | 4 | import torch.nn as nn 5 | 6 | from src.param import args 7 | from src.lxrt.entry import LXRTEncoder 8 | from src.lxrt.modeling_capsbert import BertLayerNorm, GeLU 9 | 10 | # Max length including and 11 | MAX_GQA_LENGTH = 20 12 | 13 | 14 | class MSCOCOModel(nn.Module): 15 | def __init__(self): 16 | super().__init__() 17 | self.lxrt_encoder = LXRTEncoder( 18 | args, 19 | max_seq_length=MAX_GQA_LENGTH, 20 | mode='lxr' 21 | ) 22 | # hid_dim = self.lxrt_encoder.dim 23 | # self.logit_fc = nn.Sequential( 24 | # nn.Linear(hid_dim, hid_dim * 2), 25 | # GeLU(), 26 | # BertLayerNorm(hid_dim * 2, eps=1e-12), 27 | # nn.Linear(hid_dim * 2, num_answers) 28 | # ) 29 | # self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights) 30 | self.args = args 31 | 32 | def forward(self, feat, pos, sent): 33 | """ 34 | b -- batch_size, o -- object_number, f -- visual_feature_size 35 | 36 | :param feat: (b, o, f) 37 | :param pos: (b, o, 4) 38 | :param sent: (b,) Type -- list of string 39 | :param leng: (b,) Type -- int numpy array 40 | :return: (b, num_answer) The logit of each answers. 41 | """ 42 | 43 | feats, x, attn_probs = self.lxrt_encoder(sent, (feat, pos)) 44 | # logit = self.logit_fc(x) 45 | 46 | return feats, x, attn_probs 47 | 48 | 49 | -------------------------------------------------------------------------------- /src/lxrt/PositionalEncoding.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | 5 | class FixedPositionalEncoding(nn.Module): 6 | def __init__(self, embedding_dim, max_length=5000): 7 | super(FixedPositionalEncoding, self).__init__() 8 | 9 | pe = torch.zeros(max_length, embedding_dim) 10 | position = torch.arange(0, max_length, dtype=torch.float).unsqueeze(1) 11 | div_term = torch.exp( 12 | torch.arange(0, embedding_dim, 2).float() 13 | * (-torch.log(torch.tensor(10000.0)) / embedding_dim) 14 | ) 15 | pe[:, 0::2] = torch.sin(position * div_term) 16 | pe[:, 1::2] = torch.cos(position * div_term) 17 | pe = pe.unsqueeze(0).transpose(0, 1) 18 | self.register_buffer('pe', pe) 19 | 20 | def forward(self, x): 21 | x = x + self.pe[: x.size(0), :] 22 | return x 23 | 24 | 25 | class LearnedPositionalEncoding(nn.Module): 26 | def __init__(self, max_position_embeddings, embedding_dim, seq_length): 27 | super(LearnedPositionalEncoding, self).__init__() 28 | self.pe = nn.Embedding(max_position_embeddings, embedding_dim) 29 | self.seq_length = seq_length 30 | 31 | self.register_buffer( 32 | "position_ids", 33 | torch.arange(max_position_embeddings).expand((1, -1)), 34 | ) 35 | 36 | def forward(self, x, position_ids=None): 37 | if position_ids is None: 38 | position_ids = self.position_ids[:, : self.seq_length] 39 | 40 | position_embeddings = self.pe(position_ids) 41 | return x + position_embeddings 42 | -------------------------------------------------------------------------------- /run/pretrain_2stage_fulldata_no_init_self.bash: -------------------------------------------------------------------------------- 1 | # The name of experiment 2 | name=lxmert_pretrain_itm_mlm_qa_552_grid_x_self_allqa_no_init_stage_1 3 | 4 | # Create dirs and make backup 5 | output=snap/pretrain/$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # Pre-training 11 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 12 | python src/pretrain/lxmert_pretrain.py \ 13 | --taskMatched --taskMaskLM --taskQA \ 14 | --train mscoco_train,vgnococo --valid mscoco_minival \ 15 | --llayers 5 --rlayers 
5 --xlayers 0\ 16 | --fromScratch --skipConnection --crossAttnType no_cross\ 17 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 20 \ 18 | --tqdm --output $output ${@:2} 19 | 20 | 21 | 22 | # The name of experiment 23 | name=lxmert_pretrain_itm_mlm_qa_552_grid_x_self_allqa_no_init_stage_2 24 | 25 | # Create dirs and make backup 26 | output=snap/pretrain/$name 27 | mkdir -p $output/src 28 | cp -r src/* $output/src/ 29 | cp $0 $output/run.bash 30 | 31 | # Pre-training 32 | #batch size reduced due to additional cross attn layer (large model) 33 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 34 | python src/pretrain/lxmert_pretrain.py \ 35 | --taskMatched --taskMaskLM --taskQA \ 36 | --train mscoco_train,vgnococo --valid mscoco_minival \ 37 | --llayers 5 --rlayers 5 --xlayers 2\ 38 | --loadLXMERT snap/pretrain/lxmert_pretrain_itm_mlm_qa_552_grid_x_self_allqa_no_init_stage_1/BEST_EVAL_LOSS \ 39 | --skipConnection --crossAttn --crossAttnType self --freezeWeights\ 40 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 15 \ 41 | --tqdm --output $output ${@:2} 42 | 43 | 44 | -------------------------------------------------------------------------------- /run/pretrain_2stage_fulldata_no_init_patches.bash: -------------------------------------------------------------------------------- 1 | # The name of experiment 2 | name=lxmert_pretrain_itm_mlm_qa_552_patches_x_allqa_no_init_stage_1 3 | 4 | # Create dirs and make backup 5 | output=snap/pretrain/$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # Pre-training 11 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 12 | python src/pretrain/lxmert_pretrain.py \ 13 | --taskMatched --taskMaskLM --taskQA \ 14 | --train mscoco_train,vgnococo --valid mscoco_minival \ 15 | --llayers 5 --rlayers 5 --xlayers 0\ 16 | --fromScratch --skipConnection --crossAttnType no_cross\ 17 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 20 \ 18 | --patches --tqdm --output $output ${@:2} 19 | 20 | 21 | 22 | # The name of experiment 23 | name=lxmert_pretrain_itm_mlm_qa_552_patches_x_allqa_no_init_stage_2 24 | 25 | # Create dirs and make backup 26 | output=snap/pretrain/$name 27 | mkdir -p $output/src 28 | cp -r src/* $output/src/ 29 | cp $0 $output/run.bash 30 | 31 | # Pre-training 32 | #batch size reduced due to additional cross attn layer (large model) 33 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 34 | python src/pretrain/lxmert_pretrain.py \ 35 | --taskMatched --taskMaskLM --taskQA \ 36 | --train mscoco_train,vgnococo --valid mscoco_minival \ 37 | --llayers 5 --rlayers 5 --xlayers 2\ 38 | --loadLXMERT snap/pretrain/lxmert_pretrain_itm_mlm_qa_552_patches_x_allqa_no_init_stage_1/BEST_EVAL_LOSS \ 39 | --skipConnection --crossAttn --crossAttnType cross --freezeWeights\ 40 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 15 \ 41 | --patches --tqdm --output $output ${@:2} 42 | 43 | 44 | -------------------------------------------------------------------------------- /run/pretrain_2stage_fulldata_no_init_selfcross.bash: -------------------------------------------------------------------------------- 1 | # The name of experiment 2 | name=lxmert_pretrain_itm_mlm_qa_552_grid_x_selfcross_allqa_no_init_stage_1 3 | 4 | # Create dirs and make backup 5 | output=snap/pretrain/$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # Pre-training 11 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 12 | python src/pretrain/lxmert_pretrain.py \ 13 | --taskMatched
--taskMaskLM --taskQA \ 14 | --train mscoco_train,vgnococo --valid mscoco_minival \ 15 | --llayers 5 --rlayers 5 --xlayers 0\ 16 | --fromScratch --skipConnection --crossAttnType no_cross\ 17 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 20 \ 18 | --tqdm --output $output ${@:2} 19 | 20 | 21 | 22 | # The name of experiment 23 | name=lxmert_pretrain_itm_mlm_qa_552_grid_x_selfcross_allqa_no_init_stage_2 24 | 25 | # Create dirs and make backup 26 | output=snap/pretrain/$name 27 | mkdir -p $output/src 28 | cp -r src/* $output/src/ 29 | cp $0 $output/run.bash 30 | 31 | # Pre-training 32 | #batch size reduced due to additional cross attn layer (large model) 33 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 34 | python src/pretrain/lxmert_pretrain.py \ 35 | --taskMatched --taskMaskLM --taskQA \ 36 | --train mscoco_train,vgnococo --valid mscoco_minival \ 37 | --llayers 5 --rlayers 5 --xlayers 2 \ 38 | --loadLXMERT snap/pretrain/lxmert_pretrain_itm_mlm_qa_552_grid_x_selfcross_allqa_no_init_stage_1/BEST_EVAL_LOSS \ 39 | --skipConnection --crossAttn --crossAttnType cross_self --freezeWeights\ 40 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 15 \ 41 | --tqdm --output $output ${@:2} 42 | 43 | 44 | -------------------------------------------------------------------------------- /run/pretrain_2stage_fulldata_no_init_self_patches.bash: -------------------------------------------------------------------------------- 1 | # The name of experiment 2 | name=lxmert_pretrain_itm_mlm_qa_552_patches_x_self_allqa_no_init_stage_1 3 | 4 | # Create dirs and make backup 5 | output=snap/pretrain/$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # Pre-training 11 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 12 | python src/pretrain/lxmert_pretrain.py \ 13 | --taskMatched --taskMaskLM --taskQA \ 14 | --train mscoco_train,vgnococo --valid mscoco_minival \ 15 | --llayers 5 --rlayers 5 --xlayers 0\ 16 | --fromScratch --skipConnection --crossAttnType no_cross\ 17 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 20 \ 18 | --patches --tqdm --output $output ${@:2} 19 | 20 | 21 | 22 | # The name of experiment 23 | name=lxmert_pretrain_itm_mlm_qa_552_patches_x_self_allqa_no_init_stage_2 24 | 25 | # Create dirs and make backup 26 | output=snap/pretrain/$name 27 | mkdir -p $output/src 28 | cp -r src/* $output/src/ 29 | cp $0 $output/run.bash 30 | 31 | # Pre-training 32 | #batch size reduced due to additional cross attn layer (large model) 33 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 34 | python src/pretrain/lxmert_pretrain.py \ 35 | --taskMatched --taskMaskLM --taskQA \ 36 | --train mscoco_train,vgnococo --valid mscoco_minival \ 37 | --llayers 5 --rlayers 5 --xlayers 2\ 38 | --loadLXMERT snap/pretrain/lxmert_pretrain_itm_mlm_qa_552_patches_x_self_allqa_no_init_stage_1/BEST_EVAL_LOSS \ 39 | --skipConnection --crossAttn --crossAttnType self --freezeWeights\ 40 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 15 \ 41 | --patches --tqdm --output $output ${@:2} 42 | 43 | 44 | -------------------------------------------------------------------------------- /run/2stage_fulldata_no_init_16_caps.bash: -------------------------------------------------------------------------------- 1 | # The name of experiment 2 | name=16caps_itm_mlm_qa_552_grid_x_allqa_no_init_stage_1 3 | 4 | # Create dirs and make backup 5 | output=snap/pretrain/$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # Pre-training 11 | 
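# The sibling no-init capsule scripts (16/24/48/64 caps) appear to differ only in the
# --NUM_PRIM_CAPS / --NUM_VIS_CAPS values passed below; the two-stage recipe is otherwise identical.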
CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 12 | python src/pretrain/lxmert_pretrain.py \ 13 | --taskMatched --taskMaskLM --taskQA \ 14 | --train mscoco_train,vgnococo --valid mscoco_minival \ 15 | --llayers 5 --rlayers 5 --xlayers 0\ 16 | --fromScratch --skipConnection --crossAttnType no_cross\ 17 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 20 \ 18 | --NUM_PRIM_CAPS 16 --NUM_VIS_CAPS 16 \ 19 | --tqdm --output $output ${@:2} 20 | 21 | 22 | 23 | # The name of experiment 24 | name=16caps_itm_mlm_qa_552_grid_x_allqa_no_init_stage_2 25 | 26 | # Create dirs and make backup 27 | output=snap/pretrain/$name 28 | mkdir -p $output/src 29 | cp -r src/* $output/src/ 30 | cp $0 $output/run.bash 31 | 32 | # Pre-training 33 | #batch size reduced due to additional cross attn layer (large model) 34 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 35 | python src/pretrain/lxmert_pretrain.py \ 36 | --taskMatched --taskMaskLM --taskQA \ 37 | --train mscoco_train,vgnococo --valid mscoco_minival \ 38 | --llayers 5 --rlayers 5 --xlayers 2 \ 39 | --loadLXMERT snap/pretrain/16caps_itm_mlm_qa_552_grid_x_allqa_no_init_stage_1/BEST_EVAL_LOSS \ 40 | --skipConnection --crossAttn --crossAttnType cross --freezeWeights\ 41 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 15 \ 42 | --NUM_PRIM_CAPS 16 --NUM_VIS_CAPS 16 \ 43 | --tqdm --output $output ${@:2} 44 | 45 | 46 | -------------------------------------------------------------------------------- /run/2stage_fulldata_no_init_24_caps.bash: -------------------------------------------------------------------------------- 1 | # The name of experiment 2 | name=24caps_itm_mlm_qa_552_grid_x_allqa_no_init_stage_1 3 | 4 | # Create dirs and make backup 5 | output=snap/pretrain/$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # Pre-training 11 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 12 | python src/pretrain/lxmert_pretrain.py \ 13 | --taskMatched --taskMaskLM --taskQA \ 14 | --train mscoco_train,vgnococo --valid mscoco_minival \ 15 | --llayers 5 --rlayers 5 --xlayers 0\ 16 | --fromScratch --skipConnection --crossAttnType no_cross\ 17 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 20 \ 18 | --NUM_PRIM_CAPS 24 --NUM_VIS_CAPS 24 \ 19 | --tqdm --output $output ${@:2} 20 | 21 | 22 | 23 | # The name of experiment 24 | name=24caps_itm_mlm_qa_552_grid_x_allqa_no_init_stage_2 25 | 26 | # Create dirs and make backup 27 | output=snap/pretrain/$name 28 | mkdir -p $output/src 29 | cp -r src/* $output/src/ 30 | cp $0 $output/run.bash 31 | 32 | # Pre-training 33 | #batch size reduced due to additional cross attn layer (large model) 34 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 35 | python src/pretrain/lxmert_pretrain.py \ 36 | --taskMatched --taskMaskLM --taskQA \ 37 | --train mscoco_train,vgnococo --valid mscoco_minival \ 38 | --llayers 5 --rlayers 5 --xlayers 2 \ 39 | --loadLXMERT snap/pretrain/24caps_itm_mlm_qa_552_grid_x_allqa_no_init_stage_1/BEST_EVAL_LOSS \ 40 | --skipConnection --crossAttn --crossAttnType cross --freezeWeights\ 41 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 15 \ 42 | --NUM_PRIM_CAPS 24 --NUM_VIS_CAPS 24 \ 43 | --tqdm --output $output ${@:2} 44 | 45 | 46 | -------------------------------------------------------------------------------- /run/pretrain_2stage_fulldata_no_init_64_caps.bash: -------------------------------------------------------------------------------- 1 | # The name of experiment 2 | name=64caps_itm_mlm_qa_552_grid_x_allqa_no_init_stage_1 3 | 
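# Stage 1 below pretrains the language and vision streams with no cross-modal layers (--xlayers 0);
# stage 2 adds two cross-attention layers (--xlayers 2) and warm-starts from the stage-1
# checkpoint via --loadLXMERT, with --freezeWeights presumably keeping the stage-1 weights fixed.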
4 | # Create dirs and make backup 5 | output=snap/pretrain/$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # Pre-training 11 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 12 | python src/pretrain/lxmert_pretrain.py \ 13 | --taskMatched --taskMaskLM --taskQA \ 14 | --train mscoco_train,vgnococo --valid mscoco_minival \ 15 | --llayers 5 --rlayers 5 --xlayers 0\ 16 | --fromScratch --skipConnection --crossAttnType no_cross\ 17 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 20 \ 18 | --NUM_PRIM_CAPS 64 --NUM_VIS_CAPS 64 \ 19 | --tqdm --output $output ${@:2} 20 | 21 | 22 | 23 | # The name of experiment 24 | name=64caps_itm_mlm_qa_552_grid_x_allqa_no_init_stage_2 25 | 26 | # Create dirs and make backup 27 | output=snap/pretrain/$name 28 | mkdir -p $output/src 29 | cp -r src/* $output/src/ 30 | cp $0 $output/run.bash 31 | 32 | # Pre-training 33 | #batch size reduced due to additional cross attn layer (large model) 34 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 35 | python src/pretrain/lxmert_pretrain.py \ 36 | --taskMatched --taskMaskLM --taskQA \ 37 | --train mscoco_train,vgnococo --valid mscoco_minival \ 38 | --llayers 5 --rlayers 5 --xlayers 2\ 39 | --loadLXMERT snap/pretrain/64caps_itm_mlm_qa_552_grid_x_allqa_no_init_stage_1/BEST_EVAL_LOSS \ 40 | --skipConnection --crossAttn --crossAttnType cross --freezeWeights\ 41 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 15 \ 42 | --NUM_PRIM_CAPS 64 --NUM_VIS_CAPS 64 \ 43 | --tqdm --output $output ${@:2} 44 | 45 | 46 | -------------------------------------------------------------------------------- /run/pretrain_2stage_fulldata_no_init_48_caps.bash: -------------------------------------------------------------------------------- 1 | # The name of experiment 2 | name=48caps_itm_mlm_qa_552_grid_x_allqa_no_init_stage_1 3 | 4 | # Create dirs and make backup 5 | output=snap/pretrain/$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # Pre-training 11 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 12 | python src/pretrain/lxmert_pretrain.py \ 13 | --taskMatched --taskMaskLM --taskQA \ 14 | --train mscoco_train,vgnococo --valid mscoco_minival \ 15 | --llayers 5 --rlayers 5 --xlayers 0\ 16 | --fromScratch --skipConnection --crossAttnType no_cross\ 17 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 20 \ 18 | --NUM_PRIM_CAPS 48 --NUM_VIS_CAPS 48 \ 19 | --tqdm --output $output ${@:2} 20 | 21 | 22 | 23 | # The name of experiment 24 | name=48caps_itm_mlm_qa_552_grid_x_allqa_no_init_stage_2 25 | 26 | # Create dirs and make backup 27 | output=snap/pretrain/$name 28 | mkdir -p $output/src 29 | cp -r src/* $output/src/ 30 | cp $0 $output/run.bash 31 | 32 | # Pre-training 33 | #batch size reduced due to additional cross attn layer (large model) 34 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 35 | python src/pretrain/lxmert_pretrain.py \ 36 | --taskMatched --taskMaskLM --taskQA \ 37 | --train mscoco_train,vgnococo --valid mscoco_minival \ 38 | --llayers 5 --rlayers 5 --xlayers 2 \ 39 | --loadLXMERT snap/pretrain/48caps_itm_mlm_qa_552_grid_x_allqa_no_init_stage_1/BEST_EVAL_LOSS \ 40 | --skipConnection --crossAttn --crossAttnType cross --freezeWeights\ 41 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 15 \ 42 | --NUM_PRIM_CAPS 48 --NUM_VIS_CAPS 48 \ 43 | --tqdm --output $output ${@:2} 44 | 45 | 46 | -------------------------------------------------------------------------------- 
/run/pretrain_2stage_fulldata_vit_bert_init.bash: -------------------------------------------------------------------------------- 1 | # The name of experiment 2 | name=lxmert_pretrain_itm_mlm_qa_552_grid_x_vit_bert_wts_allqa_stage_1 3 | 4 | # Create dirs and make backup 5 | output=snap/pretrain/$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # Pre-training 11 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 12 | python src/pretrain/lxmert_pretrain.py \ 13 | --taskMatched --taskMaskLM --taskQA \ 14 | --train mscoco_train,vgnococo --valid mscoco_minival \ 15 | --llayers 5 --rlayers 5 --xlayers 0\ 16 | --skipConnection --crossAttnType no_cross\ 17 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 10 \ 18 | --vitInit --startIndex 7 \ 19 | --tqdm --output $output ${@:2} 20 | 21 | 22 | 23 | # The name of experiment 24 | name=lxmert_pretrain_itm_mlm_qa_552_grid_x_vit_bert_wts_allqa_stage_2 25 | 26 | # Create dirs and make backup 27 | output=snap/pretrain/$name 28 | mkdir -p $output/src 29 | cp -r src/* $output/src/ 30 | cp $0 $output/run.bash 31 | 32 | # Pre-training 33 | #batch size reduced due to additional cross attn layer (large model) 34 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 35 | python src/pretrain/lxmert_pretrain.py \ 36 | --taskMatched --taskMaskLM --taskQA \ 37 | --train mscoco_train,vgnococo --valid mscoco_minival \ 38 | --llayers 5 --rlayers 5 --xlayers 2\ 39 | --loadLXMERT snap/pretrain/lxmert_pretrain_itm_mlm_qa_552_grid_x_vit_bert_wts_allqa_stage_1/BEST_EVAL_LOSS \ 40 | --skipConnection --crossAttn --crossAttnType cross --freezeWeights\ 41 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 10 \ 42 | --vitInit --startIndex 7 \ 43 | --tqdm --output $output ${@:2} 44 | 45 | 46 | -------------------------------------------------------------------------------- /run/pretrain_2stage_fulldata_no_init_selfcross_patches.bash: -------------------------------------------------------------------------------- 1 | # The name of experiment 2 | name=lxmert_pretrain_itm_mlm_qa_552_patches_x_selfcross_allqa_no_init_stage_1 3 | 4 | # Create dirs and make backup 5 | output=snap/pretrain/$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # Pre-training 11 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 12 | python src/pretrain/lxmert_pretrain.py \ 13 | --taskMatched --taskMaskLM --taskQA \ 14 | --train mscoco_train,vgnococo --valid mscoco_minival \ 15 | --llayers 5 --rlayers 5 --xlayers 0\ 16 | --fromScratch --skipConnection --crossAttnType no_cross\ 17 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 20 \ 18 | --patches --tqdm --output $output ${@:2} 19 | 20 | 21 | 22 | # The name of experiment 23 | name=lxmert_pretrain_itm_mlm_qa_552_patches_x_selfcross_allqa_no_init_stage_2 24 | 25 | # Create dirs and make backup 26 | output=snap/pretrain/$name 27 | mkdir -p $output/src 28 | cp -r src/* $output/src/ 29 | cp $0 $output/run.bash 30 | 31 | # Pre-training 32 | #batch size reduced due to additional cross attn layer (large model) 33 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 34 | python src/pretrain/lxmert_pretrain.py \ 35 | --taskMatched --taskMaskLM --taskQA \ 36 | --train mscoco_train,vgnococo --valid mscoco_minival \ 37 | --llayers 5 --rlayers 5 --xlayers 2 \ 38 | --loadLXMERT snap/pretrain/lxmert_pretrain_itm_mlm_qa_552_patches_x_selfcross_allqa_no_init_stage_1/BEST_EVAL_LOSS \ 39 | --skipConnection --crossAttn --crossAttnType cross_self --freezeWeights\ 40 
| --batchSize 1024 --optim bert --lr 1e-4 --epochs 15 \ 41 | --patches --tqdm --output $output ${@:2} 42 | 43 | 44 | -------------------------------------------------------------------------------- /run/pretrain_2stage_fulldata_vit_bert_init_self.bash: -------------------------------------------------------------------------------- 1 | # The name of experiment 2 | name=lxmert_pretrain_itm_mlm_qa_552_grid_x_self_vit7_bert_wts_allqa_stage_1 3 | 4 | # Create dirs and make backup 5 | output=snap/pretrain/$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # Pre-training 11 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 12 | python src/pretrain/lxmert_pretrain.py \ 13 | --taskMatched --taskMaskLM --taskQA \ 14 | --train mscoco_train,vgnococo --valid mscoco_minival \ 15 | --llayers 5 --rlayers 5 --xlayers 0\ 16 | --skipConnection --crossAttnType no_cross\ 17 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 10 \ 18 | --vitInit --startIndex 7 \ 19 | --tqdm --output $output ${@:2} 20 | 21 | 22 | 23 | # The name of experiment 24 | name=lxmert_pretrain_itm_mlm_qa_552_grid_x_self_vit7_bert_wts_allqa_stage_2 25 | 26 | # Create dirs and make backup 27 | output=snap/pretrain/$name 28 | mkdir -p $output/src 29 | cp -r src/* $output/src/ 30 | cp $0 $output/run.bash 31 | 32 | # Pre-training 33 | #batch size reduced due to additional cross attn layer (large model) 34 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 35 | python src/pretrain/lxmert_pretrain.py \ 36 | --taskMatched --taskMaskLM --taskQA \ 37 | --train mscoco_train,vgnococo --valid mscoco_minival \ 38 | --llayers 5 --rlayers 5 --xlayers 2\ 39 | --loadLXMERT snap/pretrain/lxmert_pretrain_itm_mlm_qa_552_grid_x_self_vit7_bert_wts_allqa_stage_1/BEST_EVAL_LOSS \ 40 | --skipConnection --crossAttn --crossAttnType self --freezeWeights\ 41 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 10 \ 42 | --vitInit --startIndex 7 \ 43 | --tqdm --output $output ${@:2} 44 | 45 | 46 | -------------------------------------------------------------------------------- /run/pretrain_2stage_fulldata_vit_no_bert_init.bash: -------------------------------------------------------------------------------- 1 | # The name of experiment 2 | name=lxmert_pretrain_itm_mlm_qa_552_grid_x_allqa_vit7_no_bert_init_stage_1 3 | 4 | # Create dirs and make backup 5 | output=snap/pretrain/$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # Pre-training 11 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 12 | python src/pretrain/lxmert_pretrain.py \ 13 | --taskMatched --taskMaskLM --taskQA \ 14 | --train mscoco_train,vgnococo --valid mscoco_minival \ 15 | --llayers 5 --rlayers 5 --xlayers 0\ 16 | --fromScratch --skipConnection --crossAttnType no_cross\ 17 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 10 \ 18 | --vitInit --startIndex 7 \ 19 | --tqdm --output $output ${@:2} 20 | 21 | 22 | 23 | # The name of experiment 24 | name=lxmert_pretrain_itm_mlm_qa_552_grid_x_allqa_vit7_no_bert_init_stage_2 25 | 26 | # Create dirs and make backup 27 | output=snap/pretrain/$name 28 | mkdir -p $output/src 29 | cp -r src/* $output/src/ 30 | cp $0 $output/run.bash 31 | 32 | # Pre-training 33 | #batch size reduced due to additional cross attn layer (large model) 34 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 35 | python src/pretrain/lxmert_pretrain.py \ 36 | --taskMatched --taskMaskLM --taskQA \ 37 | --train mscoco_train,vgnococo --valid mscoco_minival \ 38 | --llayers 5 
--rlayers 5 --xlayers 2\ 39 | --loadLXMERT snap/pretrain/lxmert_pretrain_itm_mlm_qa_552_grid_x_allqa_vit7_no_bert_init_stage_1/BEST_EVAL_LOSS \ 40 | --skipConnection --crossAttn --crossAttnType cross --freezeWeights\ 41 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 10 \ 42 | --vitInit --startIndex 7 \ 43 | --tqdm --output $output ${@:2} 44 | 45 | 46 | -------------------------------------------------------------------------------- /run/pretrain_2stage_fulldata_vit_bert_init_cross_patches.bash: -------------------------------------------------------------------------------- 1 | # The name of experiment 2 | name=lxmert_pretrain_itm_mlm_qa_552_patches_x_vit7_bert_wts_allqa_stage_1 3 | 4 | # Create dirs and make backup 5 | output=snap/pretrain/$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # Pre-training 11 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 12 | python src/pretrain/lxmert_pretrain.py \ 13 | --taskMatched --taskMaskLM --taskQA \ 14 | --train mscoco_train,vgnococo --valid mscoco_minival \ 15 | --llayers 5 --rlayers 5 --xlayers 0\ 16 | --skipConnection --crossAttnType no_cross\ 17 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 10 \ 18 | --vitInit --startIndex 7 \ 19 | --patches --tqdm --output $output ${@:2} 20 | 21 | 22 | 23 | # The name of experiment 24 | name=lxmert_pretrain_itm_mlm_qa_552_patches_x_vit7_bert_wts_allqa_stage_2 25 | 26 | # Create dirs and make backup 27 | output=snap/pretrain/$name 28 | mkdir -p $output/src 29 | cp -r src/* $output/src/ 30 | cp $0 $output/run.bash 31 | 32 | # Pre-training 33 | #batch size reduced due to additional cross attn layer (large model) 34 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 35 | python src/pretrain/lxmert_pretrain.py \ 36 | --taskMatched --taskMaskLM --taskQA \ 37 | --train mscoco_train,vgnococo --valid mscoco_minival \ 38 | --llayers 5 --rlayers 5 --xlayers 2 \ 39 | --loadLXMERT snap/pretrain/lxmert_pretrain_itm_mlm_qa_552_patches_x_vit7_bert_wts_allqa_stage_1/BEST_EVAL_LOSS \ 40 | --skipConnection --crossAttn --crossAttnType cross --freezeWeights\ 41 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 10 \ 42 | --vitInit --startIndex 7 \ 43 | --patches --tqdm --output $output ${@:2} 44 | 45 | 46 | -------------------------------------------------------------------------------- /run/pretrain_2stage_fulldata_vit_bert_init_selfcross.bash: -------------------------------------------------------------------------------- 1 | # The name of experiment 2 | name=lxmert_pretrain_itm_mlm_qa_552_grid_x_selfcross_vit7_bert_wts_allqa_stage_1 3 | 4 | # Create dirs and make backup 5 | output=snap/pretrain/$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # Pre-training 11 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 12 | python src/pretrain/lxmert_pretrain.py \ 13 | --taskMatched --taskMaskLM --taskQA \ 14 | --train mscoco_train,vgnococo --valid mscoco_minival \ 15 | --llayers 5 --rlayers 5 --xlayers 0\ 16 | --skipConnection --crossAttnType no_cross\ 17 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 10 \ 18 | --vitInit --startIndex 7 \ 19 | --tqdm --output $output ${@:2} 20 | 21 | 22 | 23 | # The name of experiment 24 | name=lxmert_pretrain_itm_mlm_qa_552_grid_x_selfcross_vit0_bert_wts_allqa_stage_2 25 | 26 | # Create dirs and make backup 27 | output=snap/pretrain/$name 28 | mkdir -p $output/src 29 | cp -r src/* $output/src/ 30 | cp $0 $output/run.bash 31 | 32 | # Pre-training 33 | #batch size reduced due 
to additional cross attn layer (large model) 34 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 35 | python src/pretrain/lxmert_pretrain.py \ 36 | --taskMatched --taskMaskLM --taskQA \ 37 | --train mscoco_train,vgnococo --valid mscoco_minival \ 38 | --llayers 5 --rlayers 5 --xlayers 2\ 39 | --loadLXMERT snap/pretrain/lxmert_pretrain_itm_mlm_qa_552_grid_x_selfcross_vit7_bert_wts_allqa_stage_1/BEST_EVAL_LOSS \ 40 | --skipConnection --crossAttn --crossAttnType cross_self --freezeWeights\ 41 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 10 \ 42 | --vitInit --startIndex 7 \ 43 | --tqdm --output $output ${@:2} 44 | 45 | 46 | -------------------------------------------------------------------------------- /run/pretrain_2stage_fulldata_vit_no_bert_init_self.bash: -------------------------------------------------------------------------------- 1 | # The name of experiment 2 | name=lxmert_pretrain_itm_mlm_qa_552_grid_x_self_allqa_vit7_no_bert_init_stage_1 3 | 4 | # Create dirs and make backup 5 | output=snap/pretrain/$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # Pre-training 11 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 12 | python src/pretrain/lxmert_pretrain.py \ 13 | --taskMatched --taskMaskLM --taskQA \ 14 | --train mscoco_train,vgnococo --valid mscoco_minival \ 15 | --llayers 5 --rlayers 5 --xlayers 0\ 16 | --fromScratch --skipConnection --crossAttnType no_cross\ 17 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 10 \ 18 | --vitInit --startIndex 7 \ 19 | --tqdm --output $output ${@:2} 20 | 21 | 22 | 23 | # The name of experiment 24 | name=lxmert_pretrain_itm_mlm_qa_552_grid_x_self_allqa_vit7_no_bert_init_stage_2 25 | 26 | # Create dirs and make backup 27 | output=snap/pretrain/$name 28 | mkdir -p $output/src 29 | cp -r src/* $output/src/ 30 | cp $0 $output/run.bash 31 | 32 | # Pre-training 33 | #batch size reduced due to additional cross attn layer (large model) 34 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 35 | python src/pretrain/lxmert_pretrain.py \ 36 | --taskMatched --taskMaskLM --taskQA \ 37 | --train mscoco_train,vgnococo --valid mscoco_minival \ 38 | --llayers 5 --rlayers 5 --xlayers 2\ 39 | --loadLXMERT snap/pretrain/lxmert_pretrain_itm_mlm_qa_552_grid_x_self_allqa_vit7_no_bert_init_stage_1/BEST_EVAL_LOSS \ 40 | --skipConnection --crossAttn --crossAttnType self --freezeWeights\ 41 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 10 \ 42 | --vitInit --startIndex 7 \ 43 | --tqdm --output $output ${@:2} 44 | 45 | 46 | -------------------------------------------------------------------------------- /run/pretrain_2stage_fulldata_vit_bert_16_caps.bash: -------------------------------------------------------------------------------- 1 | # The name of experiment 2 | name=16caps_itm_mlm_qa_552_grid_x_allqa_vit_bert_stage_1 3 | 4 | # Create dirs and make backup 5 | output=snap/pretrain/$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # Pre-training 11 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 12 | python src/pretrain/lxmert_pretrain.py \ 13 | --taskMatched --taskMaskLM --taskQA \ 14 | --train mscoco_train,vgnococo --valid mscoco_minival \ 15 | --llayers 5 --rlayers 5 --xlayers 0\ 16 | --skipConnection --crossAttnType no_cross\ 17 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 20 \ 18 | --NUM_PRIM_CAPS 16 --NUM_VIS_CAPS 16 \ 19 | --vitInit --startIndex 7 \ 20 | --tqdm --output $output ${@:2} 21 | 22 | 23 | 24 | # The name of experiment
25 | name=16caps_itm_mlm_qa_552_grid_x_allqa_vit_bert_stage_2 26 | 27 | # Create dirs and make backup 28 | output=snap/pretrain/$name 29 | mkdir -p $output/src 30 | cp -r src/* $output/src/ 31 | cp $0 $output/run.bash 32 | 33 | # Pre-training 34 | #batch size reduced due to additional cross attn layer (large model) 35 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 36 | python src/pretrain/lxmert_pretrain.py \ 37 | --taskMatched --taskMaskLM --taskQA \ 38 | --train mscoco_train,vgnococo --valid mscoco_minival \ 39 | --llayers 5 --rlayers 5 --xlayers 2 \ 40 | --loadLXMERT snap/pretrain/16caps_itm_mlm_qa_552_grid_x_allqa_vit_bert_stage_1/BEST_EVAL_LOSS \ 41 | --skipConnection --crossAttn --crossAttnType cross --freezeWeights\ 42 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 15 \ 43 | --NUM_PRIM_CAPS 16 --NUM_VIS_CAPS 16 \ 44 | --vitInit --startIndex 7 \ 45 | --tqdm --output $output ${@:2} 46 | 47 | 48 | -------------------------------------------------------------------------------- /run/pretrain_2stage_fulldata_vit_bert_24_caps.bash: -------------------------------------------------------------------------------- 1 | # The name of experiment 2 | name=24caps_itm_mlm_qa_552_grid_x_allqa_vit_bert_stage_1 3 | 4 | # Create dirs and make backup 5 | output=snap/pretrain/$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # Pre-training 11 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 12 | python src/pretrain/lxmert_pretrain.py \ 13 | --taskMatched --taskMaskLM --taskQA \ 14 | --train mscoco_train,vgnococo --valid mscoco_minival \ 15 | --llayers 5 --rlayers 5 --xlayers 0\ 16 | --skipConnection --crossAttnType no_cross\ 17 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 20 \ 18 | --NUM_PRIM_CAPS 24 --NUM_VIS_CAPS 24 \ 19 | --vitInit --startIndex 7 \ 20 | --tqdm --output $output ${@:2} 21 | 22 | 23 | 24 | # The name of experiment 25 | name=24caps_itm_mlm_qa_552_grid_x_allqa_vit_bert_stage_2 26 | 27 | # Create dirs and make backup 28 | output=snap/pretrain/$name 29 | mkdir -p $output/src 30 | cp -r src/* $output/src/ 31 | cp $0 $output/run.bash 32 | 33 | # Pre-training 34 | #batch size reduced due to additional cross attn layer (large model) 35 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 36 | python src/pretrain/lxmert_pretrain.py \ 37 | --taskMatched --taskMaskLM --taskQA \ 38 | --train mscoco_train,vgnococo --valid mscoco_minival \ 39 | --llayers 5 --rlayers 5 --xlayers 2 \ 40 | --loadLXMERT snap/pretrain/24caps_itm_mlm_qa_552_grid_x_allqa_vit_bert_stage_1/BEST_EVAL_LOSS \ 41 | --skipConnection --crossAttn --crossAttnType cross --freezeWeights\ 42 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 15 \ 43 | --NUM_PRIM_CAPS 24 --NUM_VIS_CAPS 24 \ 44 | --vitInit --startIndex 7 \ 45 | --tqdm --output $output ${@:2} 46 | 47 | 48 | -------------------------------------------------------------------------------- /run/pretrain_2stage_fulldata_vit_bert_init_self_patches.bash: -------------------------------------------------------------------------------- 1 | # The name of experiment 2 | name=lxmert_pretrain_itm_mlm_qa_552_patches_x_self_vit7_bert_wts_allqa_stage_1 3 | 4 | # Create dirs and make backup 5 | output=snap/pretrain/$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # Pre-training 11 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 12 | python src/pretrain/lxmert_pretrain.py \ 13 | --taskMatched --taskMaskLM --taskQA \ 14 | --train mscoco_train,vgnococo --valid 
mscoco_minival \ 15 | --llayers 5 --rlayers 5 --xlayers 0\ 16 | --skipConnection --crossAttnType no_cross\ 17 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 10 \ 18 | --vitInit --startIndex 7 \ 19 | --patches --tqdm --output $output ${@:2} 20 | 21 | 22 | 23 | # The name of experiment 24 | name=lxmert_pretrain_itm_mlm_qa_552_patches_x_self_vit7_bert_wts_allqa_stage_2 25 | 26 | # Create dirs and make backup 27 | output=snap/pretrain/$name 28 | mkdir -p $output/src 29 | cp -r src/* $output/src/ 30 | cp $0 $output/run.bash 31 | 32 | # Pre-training 33 | #batch size reduced due to additional cross attn layer (large model) 34 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 35 | python src/pretrain/lxmert_pretrain.py \ 36 | --taskMatched --taskMaskLM --taskQA \ 37 | --train mscoco_train,vgnococo --valid mscoco_minival \ 38 | --llayers 5 --rlayers 5 --xlayers 2\ 39 | --loadLXMERT snap/pretrain/lxmert_pretrain_itm_mlm_qa_552_patches_x_self_vit7_bert_wts_allqa_stage_1/BEST_EVAL_LOSS \ 40 | --skipConnection --crossAttn --crossAttnType self --freezeWeights\ 41 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 10 \ 42 | --vitInit --startIndex 7 \ 43 | --patches --tqdm --output $output ${@:2} 44 | 45 | 46 | -------------------------------------------------------------------------------- /run/pretrain_2stage_fulldata_vit_no_bert_init_patches.bash: -------------------------------------------------------------------------------- 1 | # The name of experiment 2 | name=lxmert_pretrain_itm_mlm_qa_552_patches_x_allqa_vit7_no_bert_init_stage_1 3 | 4 | # Create dirs and make backup 5 | output=snap/pretrain/$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # Pre-training 11 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 12 | python src/pretrain/lxmert_pretrain.py \ 13 | --taskMatched --taskMaskLM --taskQA \ 14 | --train mscoco_train,vgnococo --valid mscoco_minival \ 15 | --llayers 5 --rlayers 5 --xlayers 0\ 16 | --fromScratch --skipConnection --crossAttnType no_cross\ 17 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 10 \ 18 | --vitInit --startIndex 7 \ 19 | --patches --tqdm --output $output ${@:2} 20 | 21 | 22 | 23 | # The name of experiment 24 | name=lxmert_pretrain_itm_mlm_qa_552_patches_x_allqa_vit7_no_bert_init_stage_2 25 | 26 | # Create dirs and make backup 27 | output=snap/pretrain/$name 28 | mkdir -p $output/src 29 | cp -r src/* $output/src/ 30 | cp $0 $output/run.bash 31 | 32 | # Pre-training 33 | #batch size reduced due to additional cross attn layer (large model) 34 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 35 | python src/pretrain/lxmert_pretrain.py \ 36 | --taskMatched --taskMaskLM --taskQA \ 37 | --train mscoco_train,vgnococo --valid mscoco_minival \ 38 | --llayers 5 --rlayers 5 --xlayers 2\ 39 | --loadLXMERT snap/pretrain/lxmert_pretrain_itm_mlm_qa_552_patches_x_allqa_vit7_no_bert_init_stage_1/BEST_EVAL_LOSS \ 40 | --skipConnection --crossAttn --crossAttnType cross --freezeWeights\ 41 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 10 \ 42 | --vitInit --startIndex 7 \ 43 | --patches --tqdm --output $output ${@:2} 44 | 45 | 46 | -------------------------------------------------------------------------------- /run/pretrain_2stage_fulldata_vit_no_bert_init_selfcross.bash: -------------------------------------------------------------------------------- 1 | # The name of experiment 2 | name=lxmert_pretrain_itm_mlm_qa_552_grid_x_selfcross_allqa_vit7_no_bert_init_stage_1 3 | 4 | # Create dirs and make backup 5 | 
output=snap/pretrain/$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # Pre-training 11 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 12 | python src/pretrain/lxmert_pretrain.py \ 13 | --taskMatched --taskMaskLM --taskQA \ 14 | --train mscoco_train,vgnococo --valid mscoco_minival \ 15 | --llayers 5 --rlayers 5 --xlayers 0\ 16 | --fromScratch --skipConnection --crossAttnType no_cross\ 17 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 10 \ 18 | --vitInit --startIndex 7 \ 19 | --tqdm --output $output ${@:2} 20 | 21 | 22 | 23 | # The name of experiment 24 | name=lxmert_pretrain_itm_mlm_qa_552_grid_x_selfcross_allqa_vit7_no_bert_init_stage_2 25 | 26 | # Create dirs and make backup 27 | output=snap/pretrain/$name 28 | mkdir -p $output/src 29 | cp -r src/* $output/src/ 30 | cp $0 $output/run.bash 31 | 32 | # Pre-training 33 | #batch size reduced due to additional cross attn layer (large model) 34 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 35 | python src/pretrain/lxmert_pretrain.py \ 36 | --taskMatched --taskMaskLM --taskQA \ 37 | --train mscoco_train,vgnococo --valid mscoco_minival \ 38 | --llayers 5 --rlayers 5 --xlayers 2 \ 39 | --loadLXMERT snap/pretrain/lxmert_pretrain_itm_mlm_qa_552_grid_x_selfcross_allqa_vit7_no_bert_init_stage_1/BEST_EVAL_LOSS \ 40 | --skipConnection --crossAttn --crossAttnType cross_self --freezeWeights\ 41 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 10 \ 42 | --vitInit --startIndex 7 \ 43 | --tqdm --output $output ${@:2} 44 | 45 | 46 | -------------------------------------------------------------------------------- /run/pretrain_2stage_fulldata_vit_bert_init_selfcross_patches.bash: -------------------------------------------------------------------------------- 1 | # The name of experiment 2 | name=lxmert_pretrain_itm_mlm_qa_552_patches_x_selfcross_vit7_bert_wts_allqa_stage_1 3 | 4 | # Create dirs and make backup 5 | output=snap/pretrain/$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # Pre-training 11 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 12 | python src/pretrain/lxmert_pretrain.py \ 13 | --taskMatched --taskMaskLM --taskQA \ 14 | --train mscoco_train,vgnococo --valid mscoco_minival \ 15 | --llayers 5 --rlayers 5 --xlayers 0\ 16 | --skipConnection --crossAttnType no_cross\ 17 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 10 \ 18 | --vitInit --startIndex 7 \ 19 | --patches --tqdm --output $output ${@:2} 20 | 21 | 22 | 23 | # The name of experiment 24 | name=lxmert_pretrain_itm_mlm_qa_552_patches_x_selfcross_vit7_bert_wts_allqa_stage_2 25 | 26 | # Create dirs and make backup 27 | output=snap/pretrain/$name 28 | mkdir -p $output/src 29 | cp -r src/* $output/src/ 30 | cp $0 $output/run.bash 31 | 32 | # Pre-training 33 | #batch size reduced due to additional cross attn layer (large model) 34 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 35 | python src/pretrain/lxmert_pretrain.py \ 36 | --taskMatched --taskMaskLM --taskQA \ 37 | --train mscoco_train,vgnococo --valid mscoco_minival \ 38 | --llayers 5 --rlayers 5 --xlayers 2\ 39 | --loadLXMERT snap/pretrain/lxmert_pretrain_itm_mlm_qa_552_patches_x_selfcross_vit7_bert_wts_allqa_stage_1/BEST_EVAL_LOSS \ 40 | --skipConnection --crossAttn --crossAttnType cross_self --freezeWeights\ 41 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 10 \ 42 | --vitInit --startIndex 7 \ 43 | --patches --tqdm --output $output ${@:2} 44 | 45 | 46 | 
-------------------------------------------------------------------------------- /run/pretrain_2stage_fulldata_vit_no_bert_init_self_patches.bash: -------------------------------------------------------------------------------- 1 | # The name of experiment 2 | name=lxmert_pretrain_itm_mlm_qa_552_patches_x_self_allqa_vit7_no_bert_init_stage_1 3 | 4 | # Create dirs and make backup 5 | output=snap/pretrain/$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # Pre-training 11 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 12 | python src/pretrain/lxmert_pretrain.py \ 13 | --taskMatched --taskMaskLM --taskQA \ 14 | --train mscoco_train,vgnococo --valid mscoco_minival \ 15 | --llayers 5 --rlayers 5 --xlayers 0\ 16 | --fromScratch --skipConnection --crossAttnType no_cross\ 17 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 10 \ 18 | --vitInit --startIndex 7 \ 19 | --patches --tqdm --output $output ${@:2} 20 | 21 | 22 | 23 | # The name of experiment 24 | name=lxmert_pretrain_itm_mlm_qa_552_patches_x_self_allqa_vit7_no_bert_init_stage_2 25 | 26 | # Create dirs and make backup 27 | output=snap/pretrain/$name 28 | mkdir -p $output/src 29 | cp -r src/* $output/src/ 30 | cp $0 $output/run.bash 31 | 32 | # Pre-training 33 | #batch size reduced due to additional cross attn layer (large model) 34 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 35 | python src/pretrain/lxmert_pretrain.py \ 36 | --taskMatched --taskMaskLM --taskQA \ 37 | --train mscoco_train,vgnococo --valid mscoco_minival \ 38 | --llayers 5 --rlayers 5 --xlayers 2\ 39 | --loadLXMERT snap/pretrain/lxmert_pretrain_itm_mlm_qa_552_patches_x_self_allqa_vit7_no_bert_init_stage_1/BEST_EVAL_LOSS \ 40 | --skipConnection --crossAttn --crossAttnType self --freezeWeights\ 41 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 10 \ 42 | --vitInit --startIndex 7 \ 43 | --patches --tqdm --output $output ${@:2} 44 | 45 | 46 | -------------------------------------------------------------------------------- /run/pretrain_2stage_fulldata_vit_no_bert_init_selfcross_patches.bash: -------------------------------------------------------------------------------- 1 | # The name of experiment 2 | name=lxmert_pretrain_itm_mlm_qa_552_patches_x_selfcross_allqa_vit7_no_bert_init_stage_1 3 | 4 | # Create dirs and make backup 5 | output=snap/pretrain/$name 6 | mkdir -p $output/src 7 | cp -r src/* $output/src/ 8 | cp $0 $output/run.bash 9 | 10 | # Pre-training 11 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 12 | python src/pretrain/lxmert_pretrain.py \ 13 | --taskMatched --taskMaskLM --taskQA \ 14 | --train mscoco_train,vgnococo --valid mscoco_minival \ 15 | --llayers 5 --rlayers 5 --xlayers 0\ 16 | --fromScratch --skipConnection --crossAttnType no_cross\ 17 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 10 \ 18 | --vitInit --startIndex 7 \ 19 | --patches --tqdm --output $output ${@:2} 20 | 21 | 22 | 23 | # The name of experiment 24 | name=lxmert_pretrain_itm_mlm_qa_552_patches_x_selfcross_allqa_vit7_no_bert_init_stage_2 25 | 26 | # Create dirs and make backup 27 | output=snap/pretrain/$name 28 | mkdir -p $output/src 29 | cp -r src/* $output/src/ 30 | cp $0 $output/run.bash 31 | 32 | # Pre-training 33 | #batch size reduced due to additional cross attn layer (large model) 34 | CUDA_VISIBLE_DEVICES=$1 PYTHONPATH=$PYTHONPATH:./src \ 35 | python src/pretrain/lxmert_pretrain.py \ 36 | --taskMatched --taskMaskLM --taskQA \ 37 | --train mscoco_train,vgnococo --valid mscoco_minival \ 38 | --llayers 5 
--rlayers 5 --xlayers 2 \ 39 | --loadLXMERT snap/pretrain/lxmert_pretrain_itm_mlm_qa_552_patches_x_selfcross_allqa_vit7_no_bert_init_stage_1/BEST_EVAL_LOSS \ 40 | --skipConnection --crossAttn --crossAttnType cross_self --freezeWeights\ 41 | --batchSize 1024 --optim bert --lr 1e-4 --epochs 10 \ 42 | --vitInit --startIndex 7 \ 43 | --patches --tqdm --output $output ${@:2} 44 | 45 | 46 | -------------------------------------------------------------------------------- /src/tasks/nlvr2_model.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyleft 2019 project LXRT. 3 | 4 | import torch.nn as nn 5 | from lxrt.modeling import GeLU, BertLayerNorm 6 | from lxrt.entry import LXRTEncoder 7 | from param import args 8 | 9 | 10 | class NLVR2Model(nn.Module): 11 | def __init__(self): 12 | super().__init__() 13 | self.lxrt_encoder = LXRTEncoder( 14 | args, 15 | max_seq_length=20 16 | ) 17 | self.hid_dim = hid_dim = self.lxrt_encoder.dim 18 | self.logit_fc = nn.Sequential( 19 | nn.Linear(hid_dim * 2, hid_dim * 2), 20 | GeLU(), 21 | BertLayerNorm(hid_dim * 2, eps=1e-12), 22 | nn.Linear(hid_dim * 2, 2) 23 | ) 24 | self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights) 25 | 26 | def forward(self, feat, pos, sent): 27 | """ 28 | :param feat: b, 2, o, f 29 | :param pos: b, 2, o, 4 30 | :param sent: b, (string) 31 | :param leng: b, (numpy, int) 32 | :return: 33 | """ 34 | # Pairing images and sentences: 35 | # The input of NLVR2 is two images and one sentence. In batch level, they are saved as 36 | # [ [img0_0, img0_1], [img1_0, img1_1], ...] and [sent0, sent1, ...] 37 | # Here, we flat them to 38 | # feat/pos = [ img0_0, img0_1, img1_0, img1_1, ...] 39 | # sent = [ sent0, sent0, sent1, sent1, ...] 40 | sent = sum(zip(sent, sent), ()) 41 | batch_size, img_num, obj_num, feat_size = feat.size() 42 | assert img_num == 2 and obj_num == 36 and feat_size == 2048 43 | feat = feat.view(batch_size * 2, obj_num, feat_size) 44 | pos = pos.view(batch_size * 2, obj_num, 4) 45 | 46 | # Extract feature --> Concat 47 | x = self.lxrt_encoder(sent, (feat, pos)) 48 | x = x.view(-1, self.hid_dim*2) 49 | 50 | # Compute logit of answers 51 | logit = self.logit_fc(x) 52 | 53 | return logit 54 | 55 | 56 | -------------------------------------------------------------------------------- /src/tasks/refcocog_model.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyleft 2019 project LXRT. 
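# RefCOCOg grounding head on top of the LXRT encoder: with train_paradigm='full' an MLP
# regresses 4 bounding-box values from the encoder output, while 'weak' uses a 2-way
# image-text matching classifier; forward() returns the logits together with the encoder
# attention probabilities.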
3 | 4 | import torch.nn as nn 5 | 6 | from src.param import args 7 | from src.lxrt.entry import LXRTEncoder 8 | from src.lxrt.modeling_capsbert import BertLayerNorm, GeLU, MLP 9 | 10 | # Max length including and 11 | MAX_GQA_LENGTH = 20 12 | 13 | 14 | class RefCOCOgModel(nn.Module): 15 | def __init__(self, train_paradigm='full'): 16 | super().__init__() 17 | # train_paradigm has two options: 'full', 'weak' 18 | # 'full' for bounding box supervision 19 | # 'weak' for image-text pair supervision 20 | self.train_paradigm = train_paradigm 21 | self.lxrt_encoder = LXRTEncoder( 22 | args, 23 | max_seq_length=MAX_GQA_LENGTH 24 | ) 25 | hid_dim = self.lxrt_encoder.dim 26 | if self.train_paradigm == 'full': 27 | # train with bounding box labels 28 | self.logit_fc = MLP(hid_dim, hid_dim, 4, 3) 29 | elif self.train_paradigm == 'weak': 30 | # weak supervision, only use image-text labels 31 | self.logit_fc = nn.Sequential( 32 | nn.Linear(hid_dim * 2, hid_dim * 2), 33 | GeLU(), 34 | BertLayerNorm(hid_dim * 2, eps=1e-12), 35 | nn.Linear(hid_dim * 2, 2) 36 | ) 37 | else: 38 | raise NotImplementedError() 39 | 40 | self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights) 41 | self.args = args 42 | 43 | def forward(self, feat, pos, sent): 44 | """ 45 | b -- batch_size, o -- object_number, f -- visual_feature_size 46 | 47 | :param feat: (b, o, f) 48 | :param pos: (b, o, 4) 49 | :param sent: (b,) Type -- list of string 50 | :param leng: (b,) Type -- int numpy array 51 | :return: (b, num_answer) The logit of each answers. 52 | """ 53 | 54 | _, x, attn_probs = self.lxrt_encoder(sent, (feat, pos)) 55 | logit = self.logit_fc(x) 56 | assert logit.size()[-1] == 2 if self.train_paradigm == 'weak' else \ 57 | logit.size()[-1] == 4 and self.train_paradigm == 'full' 58 | 59 | return logit, attn_probs 60 | 61 | 62 | -------------------------------------------------------------------------------- /src/tasks/refcoco_model.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyleft 2019 project LXRT. 
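# RefCOCO counterpart of RefCOCOgModel: in the 'full' setting it swaps the plain MLP for
# BertReferExpHead, which predicts a 4-value box plus a second 9-dimensional output, so
# forward() returns logits, box parameters (None under 'weak' supervision), and attention
# probabilities.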
3 | 4 | import torch.nn as nn 5 | 6 | from src.param import args 7 | from src.lxrt.entry import LXRTEncoder 8 | from src.lxrt.modeling_capsbert import BertLayerNorm, GeLU, MLP, BertReferExpHead 9 | 10 | # Max length including and 11 | MAX_GQA_LENGTH = 20 12 | 13 | 14 | class RefCOCOModel(nn.Module): 15 | def __init__(self, train_paradigm='full'): 16 | super().__init__() 17 | #train_paradigm has two options: 'full', 'weak' 18 | # 'full' for bounding box supervision 19 | # 'weak' for image-text pair supervision 20 | self.train_paradigm = train_paradigm 21 | self.lxrt_encoder = LXRTEncoder( 22 | args, 23 | max_seq_length=MAX_GQA_LENGTH, 24 | # cross_attn_type=args.cross_attn_type 25 | ) 26 | hid_dim = self.lxrt_encoder.dim 27 | if self.train_paradigm == 'full': 28 | #train with bounding box labels 29 | # self.logit_fc = MLP(hid_dim, hid_dim, 4, 3) 30 | self.logit_fc = BertReferExpHead(hidden_size=hid_dim, out_dim1=4, out_dim2=9) 31 | elif self.train_paradigm == 'weak': 32 | # weak supervision, only use image-text labels 33 | self.logit_fc = nn.Sequential( 34 | nn.Linear(hid_dim * 2, hid_dim * 2), 35 | GeLU(), 36 | BertLayerNorm(hid_dim * 2, eps=1e-12), 37 | nn.Linear(hid_dim * 2, 2) 38 | ) 39 | else: 40 | raise NotImplementedError() 41 | 42 | self.logit_fc.apply(self.lxrt_encoder.model.init_bert_weights) 43 | self.args = args 44 | 45 | def forward(self, feat, pos, sent): 46 | """ 47 | b -- batch_size, o -- object_number, f -- visual_feature_size 48 | 49 | :param feat: (b, o, f) 50 | :param pos: (b, o, 4) 51 | :param sent: (b,) Type -- list of string 52 | :param leng: (b,) Type -- int numpy array 53 | :return: (b, num_answer) The logit of each answers. 54 | """ 55 | 56 | feat_seq, x, attn_probs = self.lxrt_encoder(sent, (feat, pos)) 57 | # x = feat_seq[1][:,0] #taking first token from visual features sequence 58 | if self.train_paradigm == "full": 59 | logits, box_params = self.logit_fc(x) 60 | else: 61 | logits = self.logit_fc(x) 62 | box_params = None 63 | assert logits.size()[-1] == 2 if self.train_paradigm == 'weak' else logits.size()[-1] == 4 and self.train_paradigm == 'full' 64 | 65 | return logits, box_params, attn_probs 66 | 67 | 68 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Weakly Supervised Grounding for VQA in Vision-Language Transformers [ECCV 2022] 2 | 3 | [Aisha Urooj Khan](https://aishaurooj.wixsite.com/aishaurooj), [Hilde Kuehne](https://hildekuehne.github.io/), [Chuang Gan](https://people.csail.mit.edu/ganchuang/), [Niels Da Vitoria Lobo](https://www.crcv.ucf.edu/person/niels-lobo/), [Mubarak Shah](https://www.crcv.ucf.edu/person/mubarak-shah/) 4 | 5 | [`Website`]() | [`arXiv`]() | [`BibTeX`](#citation) 6 | 7 | Official Pytorch implementation and pre-trained models for Weakly Supervised Grounding for VQA in Vision-Language Transformers (coming soon). 8 | 9 | ## Abstract 10 | Transformers for visual-language representation learning have been getting a lot of interest and shown tremendous performance on visual question answering (VQA) and grounding. But most systems that show good performance of those tasks still rely on pre-trained object detectors during training, which limits their applicability to the object classes available for those detectors. 11 | To mitigate this limitation, the following paper focuses on the problem of weakly supervised grounding in context of visual question answering in transformers. 
The approach leverages capsules by grouping each visual token in the visual encoder and uses activations from language self-attention layers as a text-guided selection module to mask those capsules before they are forwarded to the next layer. 12 | We evaluate our approach on the challenging GQA as well as VQA-HAT dataset for VQA grounding. 13 | Our experiments show that: while removing the information of masked objects from standard transformer architectures leads to a significant drop in performance, the integration of capsules significantly improves the grounding ability of such systems and provides new state-of-the-art results compared to other approaches in the field. 14 | 15 |

16 | ![main-fig](images/main_fig.png)
17 |
18 |
19 |
20 | ##### (a) Proposed Architecture, (b) Proposed Capsule Encoding layer, (c) Proposed Capsule Layer

21 | 22 | ### Qualitative Results 23 | ![gqa-qualitative](images/teaser.png) 24 | 25 | ### Code 26 | This code is built upon the code base of [LXMERT](https://github.com/airsplay/lxmert). Thanks to [Hao Tan](https://scholar.google.com/citations?user=OV1Y3FUAAAAJ&hl=en) for providing excellent code for their model. 27 | 28 | #### Datasets 29 | For pretraining, we used MSCOCO and VG for image-caption pairs, and Viz7W, VQA v2.0, and GQA for question-image pairs. We followed the instructions provided by [LXMERT](https://github.com/airsplay/lxmert) to prepare the data, except for a few changes: 30 | 1. We removed the GQA validation set from the pretraining data, as we use it for grounding evaluation. 31 | 2. We validate our pretraining on the mscoco-minival split. 32 | 33 | #### Pretraining 34 | To pretrain the backbone, use the following command: 35 | 36 | ``` 37 | bash run/pretrain_2stage_fulldata_no_init_16_caps.bash 38 | ``` 39 | As with the other scripts under `run/`, the first argument is expected to select the GPU(s) via `CUDA_VISIBLE_DEVICES`, and any remaining arguments are forwarded to the underlying training script. 40 | #### Finetuning on downstream tasks 41 | 42 | ##### GQA 43 | See ``` run/gqa_finetune_caps.bash ``` for finetuning on the GQA dataset. 44 | 45 | ##### VQA-HAT 46 | Finetuning on VQA-HAT is similar to how we finetune the model on GQA. I will keep adding more concrete details in the next few days. 47 | 48 | ### Citation 49 | If this work is useful for your research, please cite our paper. 50 | 51 | ```bibtex 52 | @InProceedings{10.1007/978-3-031-19833-5_38, 53 | author="Khan, Aisha Urooj 54 | and Kuehne, Hilde 55 | and Gan, Chuang 56 | and Lobo, Niels Da Vitoria 57 | and Shah, Mubarak", 58 | editor="Avidan, Shai 59 | and Brostow, Gabriel 60 | and Ciss{\'e}, Moustapha 61 | and Farinella, Giovanni Maria 62 | and Hassner, Tal", 63 | title="Weakly Supervised Grounding for VQA in Vision-Language Transformers", 64 | booktitle="Computer Vision -- ECCV 2022", 65 | year="2022", 66 | publisher="Springer Nature Switzerland", 67 | address="Cham", 68 | pages="652--670", 69 | isbn="978-3-031-19833-5" 70 | } 71 | ``` 72 | 73 | ### Questions? 74 | Please contact 'aishaurooj@gmail.com' 75 | 76 | 77 | 78 | -------------------------------------------------------------------------------- /src/tasks/nlvr2_data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyleft 2019 project LXRT. 3 | 4 | import json 5 | 6 | import numpy as np 7 | from torch.utils.data import Dataset 8 | 9 | from param import args 10 | from utils import load_obj_tsv 11 | 12 | # Load part of the dataset for fast checking. 13 | # Notice that here is the number of images instead of the number of data, 14 | # which means all related data to the images would be used. 15 | TINY_IMG_NUM = 512 16 | FAST_IMG_NUM = 5000 17 | 18 | 19 | class NLVR2Dataset: 20 | """ 21 | An NLVR2 data example in json file: 22 | { 23 | "identifier": "train-10171-0-0", 24 | "img0": "train-10171-0-img0", 25 | "img1": "train-10171-0-img1", 26 | "label": 0, 27 | "sent": "An image shows one leather pencil case, displayed open with writing implements tucked inside. 28 | ", 29 | "uid": "nlvr2_train_0" 30 | } 31 | """ 32 | def __init__(self, splits: str): 33 | self.name = splits 34 | self.splits = splits.split(',') 35 | 36 | # Loading datasets to data 37 | self.data = [] 38 | for split in self.splits: 39 | self.data.extend(json.load(open("data/nlvr2/%s.json" % split))) 40 | print("Load %d data from split(s) %s."
% (len(self.data), self.name)) 41 | 42 | # List to dict (for evaluation and others) 43 | self.id2datum = { 44 | datum['uid']: datum 45 | for datum in self.data 46 | } 47 | 48 | def __len__(self): 49 | return len(self.data) 50 | 51 | 52 | """ 53 | An example in obj36 tsv: 54 | FIELDNAMES = ["img_id", "img_h", "img_w", "objects_id", "objects_conf", 55 | "attrs_id", "attrs_conf", "num_boxes", "boxes", "features"] 56 | FIELDNAMES would be keys in the dict returned by load_obj_tsv. 57 | """ 58 | class NLVR2TorchDataset(Dataset): 59 | def __init__(self, dataset: NLVR2Dataset): 60 | super().__init__() 61 | self.raw_dataset = dataset 62 | 63 | if args.tiny: 64 | topk = TINY_IMG_NUM 65 | elif args.fast: 66 | topk = FAST_IMG_NUM 67 | else: 68 | topk = -1 69 | 70 | # Loading detection features to img_data 71 | img_data = [] 72 | if 'train' in dataset.splits: 73 | img_data.extend(load_obj_tsv('data/nlvr2_imgfeat/train_obj36.tsv', topk=topk)) 74 | if 'valid' in dataset.splits: 75 | img_data.extend(load_obj_tsv('data/nlvr2_imgfeat/valid_obj36.tsv', topk=topk)) 76 | if 'test' in dataset.name: 77 | img_data.extend(load_obj_tsv('data/nlvr2_imgfeat/test_obj36.tsv', topk=topk)) 78 | self.imgid2img = {} 79 | for img_datum in img_data: 80 | self.imgid2img[img_datum['img_id']] = img_datum 81 | 82 | # Filter out the dataset 83 | self.data = [] 84 | for datum in self.raw_dataset.data: 85 | if datum['img0'] in self.imgid2img and datum['img1'] in self.imgid2img: 86 | self.data.append(datum) 87 | print("Use %d data in torch dataset" % (len(self.data))) 88 | print() 89 | 90 | def __len__(self): 91 | return len(self.data) 92 | 93 | def __getitem__(self, item: int): 94 | datum = self.data[item] 95 | 96 | ques_id = datum['uid'] 97 | ques = datum['sent'] 98 | 99 | # Get image info 100 | boxes2 = [] 101 | feats2 = [] 102 | for key in ['img0', 'img1']: 103 | img_id = datum[key] 104 | img_info = self.imgid2img[img_id] 105 | boxes = img_info['boxes'].copy() 106 | feats = img_info['features'].copy() 107 | assert len(boxes) == len(feats) 108 | 109 | # Normalize the boxes (to 0 ~ 1) 110 | img_h, img_w = img_info['img_h'], img_info['img_w'] 111 | boxes[..., (0, 2)] /= img_w 112 | boxes[..., (1, 3)] /= img_h 113 | np.testing.assert_array_less(boxes, 1+1e-5) 114 | np.testing.assert_array_less(-boxes, 0+1e-5) 115 | 116 | boxes2.append(boxes) 117 | feats2.append(feats) 118 | feats = np.stack(feats2) 119 | boxes = np.stack(boxes2) 120 | 121 | # Create target 122 | if 'label' in datum: 123 | label = datum['label'] 124 | return ques_id, feats, boxes, ques, label 125 | else: 126 | return ques_id, feats, boxes, ques 127 | 128 | 129 | class NLVR2Evaluator: 130 | def __init__(self, dataset: NLVR2Dataset): 131 | self.dataset = dataset 132 | 133 | def evaluate(self, quesid2ans: dict): 134 | score = 0. 135 | for quesid, ans in quesid2ans.items(): 136 | datum = self.dataset.id2datum[quesid] 137 | label = datum['label'] 138 | if ans == label: 139 | score += 1 140 | return score / len(quesid2ans) 141 | 142 | def dump_result(self, quesid2ans: dict, path): 143 | """ 144 | Dump result to a CSV file, which is compatible with NLVR2 evaluation system. 145 | NLVR2 CSV file requirement: 146 | Each line contains: identifier, answer 147 | 148 | :param quesid2ans: nlvr2 uid to ans (either "True" or "False") 149 | :param path: The desired path of saved file. 
150 | :return: 151 | """ 152 | with open(path, 'w') as f: 153 | for uid, ans in quesid2ans.items(): 154 | idt = self.dataset.id2datum[uid]["identifier"] 155 | ans = 'True' if ans == 1 else 'False' 156 | f.write("%s,%s\n" % (idt, ans)) 157 | 158 | -------------------------------------------------------------------------------- /src/pretrain/qa_answer_table.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyleft 2019 project LXRT. 3 | 4 | import json 5 | import torch 6 | 7 | 8 | class AnswerTable: 9 | ANS_CONVERT = { 10 | "a man": "man", 11 | "the man": "man", 12 | "a woman": "woman", 13 | "the woman": "woman", 14 | 'one': '1', 15 | 'two': '2', 16 | 'three': '3', 17 | 'four': '4', 18 | 'five': '5', 19 | 'six': '6', 20 | 'seven': '7', 21 | 'eight': '8', 22 | 'nine': '9', 23 | 'ten': '10', 24 | 'grey': 'gray', 25 | } 26 | 27 | def __init__(self, dsets=None): 28 | self.all_ans = json.load(open("data/lxmert/all_ans.json")) 29 | if dsets is not None: 30 | dsets = set(dsets) 31 | # If the answer is used in the dsets 32 | self.anss = [ans['ans'] for ans in self.all_ans if 33 | len(set(ans['dsets']) & dsets) > 0] 34 | else: 35 | self.anss = [ans['ans'] for ans in self.all_ans] 36 | self.ans_set = set(self.anss) 37 | 38 | self._id2ans_map = self.anss 39 | self._ans2id_map = {ans: ans_id for ans_id, ans in enumerate(self.anss)} 40 | 41 | assert len(self._id2ans_map) == len(self._ans2id_map) 42 | for ans_id, ans in enumerate(self._id2ans_map): 43 | assert self._ans2id_map[ans] == ans_id 44 | 45 | def convert_ans(self, ans): 46 | if len(ans) == 0: 47 | return "" 48 | ans = ans.lower() 49 | if ans[-1] == '.': 50 | ans = ans[:-1].strip() 51 | if ans.startswith("a "): 52 | ans = ans[2:].strip() 53 | if ans.startswith("an "): 54 | ans = ans[3:].strip() 55 | if ans.startswith("the "): 56 | ans = ans[4:].strip() 57 | if ans in self.ANS_CONVERT: 58 | ans = self.ANS_CONVERT[ans] 59 | return ans 60 | 61 | def ans2id(self, ans): 62 | return self._ans2id_map[ans] 63 | 64 | def id2ans(self, ans_id): 65 | return self._id2ans_map[ans_id] 66 | 67 | def ans2id_map(self): 68 | return self._ans2id_map.copy() 69 | 70 | def id2ans_map(self): 71 | return self._id2ans_map.copy() 72 | 73 | def used(self, ans): 74 | return ans in self.ans_set 75 | 76 | def all_answers(self): 77 | return self.anss.copy() 78 | 79 | @property 80 | def num_answers(self): 81 | return len(self.anss) 82 | 83 | 84 | def load_lxmert_qa(path, model, label2ans): 85 | """ 86 | Load model weights from LXMERT pre-training. 87 | The answers in the fine-tuned QA task (indicated by label2ans) 88 | would also be properly initialized with LXMERT pre-trained 89 | QA heads. 90 | 91 | :param path: Path to LXMERT snapshot. 92 | :param model: LXRT model instance. 
93 | :param label2ans: The label2ans dict of fine-tuned QA datasets, like 94 | {0: 'cat', 1: 'dog', ...} 95 | :return: 96 | """ 97 | print("Load QA pre-trained LXMERT from %s " % path) 98 | loaded_state_dict = torch.load("%s_LXRT.pth" % path) 99 | model_state_dict = model.state_dict() 100 | 101 | # Handle Multi-GPU pre-training --> Single GPU fine-tuning 102 | for key in list(loaded_state_dict.keys()): 103 | loaded_state_dict[key.replace("module.", '')] = loaded_state_dict.pop(key) 104 | 105 | # Isolate bert model 106 | bert_state_dict = {} 107 | for key, value in loaded_state_dict.items(): 108 | if key.startswith('bert.'): 109 | bert_state_dict[key] = value 110 | 111 | # Isolate answer head 112 | answer_state_dict = {} 113 | for key, value in loaded_state_dict.items(): 114 | if key.startswith("answer_head."): 115 | answer_state_dict[key.replace('answer_head.', '')] = value 116 | 117 | # Do surgery on answer state dict 118 | ans_weight = answer_state_dict['logit_fc.3.weight'] 119 | ans_bias = answer_state_dict['logit_fc.3.bias'] 120 | import copy 121 | new_answer_weight = copy.deepcopy(model_state_dict['logit_fc.3.weight']) 122 | new_answer_bias = copy.deepcopy(model_state_dict['logit_fc.3.bias']) 123 | answer_table = AnswerTable() 124 | loaded = 0 125 | unload = 0 126 | if type(label2ans) is list: 127 | label2ans = {label: ans for label, ans in enumerate(label2ans)} 128 | for label, ans in label2ans.items(): 129 | new_ans = answer_table.convert_ans(ans) 130 | if answer_table.used(new_ans): 131 | ans_id_9500 = answer_table.ans2id(new_ans) 132 | new_answer_weight[label] = ans_weight[ans_id_9500] 133 | new_answer_bias[label] = ans_bias[ans_id_9500] 134 | loaded += 1 135 | else: 136 | new_answer_weight[label] = 0. 137 | new_answer_bias[label] = 0. 138 | unload += 1 139 | print("Loaded %d answers from LXRTQA pre-training and %d not" % (loaded, unload)) 140 | print() 141 | answer_state_dict['logit_fc.3.weight'] = new_answer_weight 142 | answer_state_dict['logit_fc.3.bias'] = new_answer_bias 143 | 144 | # Load Bert Weights 145 | bert_model_keys = set(model.lxrt_encoder.model.state_dict().keys()) 146 | bert_loaded_keys = set(bert_state_dict.keys()) 147 | assert len(bert_model_keys - bert_loaded_keys) == 0 148 | model.lxrt_encoder.model.load_state_dict(bert_state_dict, strict=False) 149 | 150 | # Load Answer Logic FC Weights 151 | model_keys = set(model.state_dict().keys()) 152 | ans_loaded_keys = set(answer_state_dict.keys()) 153 | assert len(ans_loaded_keys - model_keys) == 0 154 | 155 | model.load_state_dict(answer_state_dict, strict=False) 156 | 157 | 158 | 159 | -------------------------------------------------------------------------------- /src/lxrt/entry_spatial.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 project LXRT. 3 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 4 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | import os 19 | 20 | import torch 21 | import torch.nn as nn 22 | 23 | from src.lxrt.tokenization import BertTokenizer 24 | from src.lxrt.modeling_spatial import LXRTFeatureExtraction as VisualBertForLXRFeature, VISUAL_CONFIG 25 | 26 | 27 | class InputFeatures(object): 28 | """A single set of features of data.""" 29 | 30 | def __init__(self, input_ids, input_mask, segment_ids): 31 | self.input_ids = input_ids 32 | self.input_mask = input_mask 33 | self.segment_ids = segment_ids 34 | 35 | 36 | def convert_sents_to_features(sents, max_seq_length, tokenizer): 37 | """Loads a data file into a list of `InputBatch`s.""" 38 | 39 | features = [] 40 | for (i, sent) in enumerate(sents): 41 | tokens_a = tokenizer.tokenize(sent.strip()) 42 | 43 | # Account for [CLS] and [SEP] with "- 2" 44 | if len(tokens_a) > max_seq_length - 2: 45 | tokens_a = tokens_a[:(max_seq_length - 2)] 46 | 47 | # Keep segment id which allows loading BERT-weights. 48 | tokens = ["[CLS]"] + tokens_a + ["[SEP]"] 49 | segment_ids = [0] * len(tokens) 50 | 51 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 52 | 53 | # The mask has 1 for real tokens and 0 for padding tokens. Only real 54 | # tokens are attended to. 55 | input_mask = [1] * len(input_ids) 56 | 57 | # Zero-pad up to the sequence length. 58 | padding = [0] * (max_seq_length - len(input_ids)) 59 | input_ids += padding 60 | input_mask += padding 61 | segment_ids += padding 62 | 63 | assert len(input_ids) == max_seq_length 64 | assert len(input_mask) == max_seq_length 65 | assert len(segment_ids) == max_seq_length 66 | 67 | features.append( 68 | InputFeatures(input_ids=input_ids, 69 | input_mask=input_mask, 70 | segment_ids=segment_ids)) 71 | return features 72 | 73 | 74 | def set_visual_config(args): 75 | VISUAL_CONFIG.l_layers = args.llayers 76 | VISUAL_CONFIG.x_layers = args.xlayers 77 | VISUAL_CONFIG.r_layers = args.rlayers 78 | 79 | #capsules config 80 | VISUAL_CONFIG.num_prim_caps = args.NUM_PRIM_CAPS 81 | VISUAL_CONFIG.num_vis_caps = args.NUM_VIS_CAPS 82 | VISUAL_CONFIG.pose_matrix_dim = args.POSE_DIM 83 | VISUAL_CONFIG.hw = args.HW 84 | VISUAL_CONFIG.caps_dim = args.NUM_VIS_CAPS * (args.POSE_DIM*args.POSE_DIM+1) 85 | 86 | print(VISUAL_CONFIG.num_prim_caps) 87 | 88 | 89 | class LXRTEncoder(nn.Module): 90 | def __init__(self, args, max_seq_length, mode='x'): 91 | super().__init__() 92 | self.max_seq_length = max_seq_length 93 | set_visual_config(args) 94 | self.args = args 95 | 96 | # Using the bert tokenizer 97 | self.tokenizer = BertTokenizer.from_pretrained( 98 | "bert-base-uncased", 99 | do_lower_case=True 100 | ) 101 | 102 | # Build LXRT Model 103 | self.model = VisualBertForLXRFeature.from_pretrained( 104 | "bert-base-uncased", 105 | mode=mode, 106 | skip_connection=args.skip_connection, 107 | shared_weights=args.shared_weights, 108 | ) 109 | 110 | if args.from_scratch: 111 | print("initializing all the weights") 112 | self.model.apply(self.model.init_bert_weights) 113 | 114 | def multi_gpu(self): 115 | self.model = nn.DataParallel(self.model) 116 | 117 | @property 118 | def dim(self): 119 | return 768 120 | 121 | def forward(self, sents, feats, visual_attention_mask=None): 122 | train_features = convert_sents_to_features( 123 | sents, self.max_seq_length, self.tokenizer) 124 | 125 | input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long).cuda() 126 | input_mask = torch.tensor([f.input_mask for f in 
train_features], dtype=torch.long).cuda() 127 | segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long).cuda() 128 | 129 | # print(feats[0].shape) 130 | output, attn_probs = self.model(input_ids, segment_ids, input_mask, 131 | visual_feats=feats, 132 | visual_attention_mask=visual_attention_mask, output_all_attention_masks=self.args.output_attention) 133 | return output, attn_probs 134 | 135 | def save(self, path): 136 | torch.save(self.model.state_dict(), 137 | os.path.join("%s_LXRT.pth" % path)) 138 | 139 | def load(self, path): 140 | # Load state_dict from snapshot file 141 | print("Load LXMERT pre-trained model from %s" % path) 142 | state_dict = torch.load("%s_LXRT.pth" % path) 143 | new_state_dict = {} 144 | for key, value in state_dict.items(): 145 | if key.startswith("module."): 146 | new_state_dict[key[len("module."):]] = value 147 | else: 148 | new_state_dict[key] = value 149 | state_dict = new_state_dict 150 | 151 | # Print out the differences of pre-trained and model weights. 152 | load_keys = set(state_dict.keys()) 153 | model_keys = set(self.model.state_dict().keys()) 154 | print() 155 | print("Weights in loaded but not in model:") 156 | for key in sorted(load_keys.difference(model_keys)): 157 | print(key) 158 | print() 159 | print("Weights in model but not in loaded:") 160 | for key in sorted(model_keys.difference(load_keys)): 161 | print(key) 162 | print() 163 | 164 | # Load weights to model 165 | self.model.load_state_dict(state_dict, strict=False) 166 | 167 | 168 | 169 | 170 | -------------------------------------------------------------------------------- /src/tasks/vqa_data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyleft 2019 project LXRT. 3 | 4 | import json 5 | import os 6 | import pickle 7 | 8 | import numpy as np 9 | import torch 10 | from torch.utils.data import Dataset 11 | 12 | from param import args 13 | from utils import load_obj_tsv 14 | 15 | # Load part of the dataset for fast checking. 16 | # Notice that here is the number of images instead of the number of data, 17 | # which means all related data to the images would be used. 18 | TINY_IMG_NUM = 512 19 | FAST_IMG_NUM = 5000 20 | 21 | # The path to data and image features. 22 | VQA_DATA_ROOT = 'data/vqa/' 23 | MSCOCO_IMGFEAT_ROOT = 'data/mscoco_imgfeat/' 24 | SPLIT2NAME = { 25 | 'train': 'train2014', 26 | 'valid': 'val2014', 27 | 'minival': 'val2014', 28 | 'nominival': 'val2014', 29 | 'test': 'test2015', 30 | } 31 | 32 | 33 | class VQADataset: 34 | """ 35 | A VQA data example in json file: 36 | { 37 | "answer_type": "other", 38 | "img_id": "COCO_train2014_000000458752", 39 | "label": { 40 | "net": 1 41 | }, 42 | "question_id": 458752000, 43 | "question_type": "what is this", 44 | "sent": "What is this photo taken looking through?" 45 | } 46 | """ 47 | def __init__(self, splits: str): 48 | self.name = splits 49 | self.splits = splits.split(',') 50 | 51 | # Loading datasets 52 | self.data = [] 53 | for split in self.splits: 54 | self.data.extend(json.load(open("data/vqa/%s.json" % split))) 55 | print("Load %d data from split(s) %s." 
% (len(self.data), self.name)) 56 | 57 | # Convert list to dict (for evaluation) 58 | self.id2datum = { 59 | datum['question_id']: datum 60 | for datum in self.data 61 | } 62 | 63 | # Answers 64 | self.ans2label = json.load(open("data/vqa/trainval_ans2label.json")) 65 | self.label2ans = json.load(open("data/vqa/trainval_label2ans.json")) 66 | assert len(self.ans2label) == len(self.label2ans) 67 | 68 | @property 69 | def num_answers(self): 70 | return len(self.ans2label) 71 | 72 | def __len__(self): 73 | return len(self.data) 74 | 75 | 76 | """ 77 | An example in obj36 tsv: 78 | FIELDNAMES = ["img_id", "img_h", "img_w", "objects_id", "objects_conf", 79 | "attrs_id", "attrs_conf", "num_boxes", "boxes", "features"] 80 | FIELDNAMES would be keys in the dict returned by load_obj_tsv. 81 | """ 82 | class VQATorchDataset(Dataset): 83 | def __init__(self, dataset: VQADataset): 84 | super().__init__() 85 | self.raw_dataset = dataset 86 | 87 | if args.tiny: 88 | topk = TINY_IMG_NUM 89 | elif args.fast: 90 | topk = FAST_IMG_NUM 91 | else: 92 | topk = None 93 | 94 | # Loading detection features to img_data 95 | img_data = [] 96 | for split in dataset.splits: 97 | # Minival is 5K images in MS COCO, which is used in evaluating VQA/LXMERT-pre-training. 98 | # It is saved as the top 5K features in val2014_***.tsv 99 | load_topk = 5000 if (split == 'minival' and topk is None) else topk 100 | img_data.extend(load_obj_tsv( 101 | os.path.join(MSCOCO_IMGFEAT_ROOT, '%s_obj36.tsv' % (SPLIT2NAME[split])), 102 | topk=load_topk)) 103 | 104 | # Convert img list to dict 105 | self.imgid2img = {} 106 | for img_datum in img_data: 107 | self.imgid2img[img_datum['img_id']] = img_datum 108 | 109 | # Only kept the data with loaded image features 110 | self.data = [] 111 | for datum in self.raw_dataset.data: 112 | if datum['img_id'] in self.imgid2img: 113 | self.data.append(datum) 114 | print("Use %d data in torch dataset" % (len(self.data))) 115 | print() 116 | 117 | def __len__(self): 118 | return len(self.data) 119 | 120 | def __getitem__(self, item: int): 121 | datum = self.data[item] 122 | 123 | img_id = datum['img_id'] 124 | ques_id = datum['question_id'] 125 | ques = datum['sent'] 126 | 127 | # Get image info 128 | img_info = self.imgid2img[img_id] 129 | obj_num = img_info['num_boxes'] 130 | feats = img_info['features'].copy() 131 | boxes = img_info['boxes'].copy() 132 | assert obj_num == len(boxes) == len(feats) 133 | 134 | # Normalize the boxes (to 0 ~ 1) 135 | img_h, img_w = img_info['img_h'], img_info['img_w'] 136 | boxes = boxes.copy() 137 | boxes[:, (0, 2)] /= img_w 138 | boxes[:, (1, 3)] /= img_h 139 | np.testing.assert_array_less(boxes, 1+1e-5) 140 | np.testing.assert_array_less(-boxes, 0+1e-5) 141 | 142 | # Provide label (target) 143 | if 'label' in datum: 144 | label = datum['label'] 145 | target = torch.zeros(self.raw_dataset.num_answers) 146 | for ans, score in label.items(): 147 | target[self.raw_dataset.ans2label[ans]] = score 148 | return ques_id, feats, boxes, ques, target 149 | else: 150 | return ques_id, feats, boxes, ques 151 | 152 | 153 | class VQAEvaluator: 154 | def __init__(self, dataset: VQADataset): 155 | self.dataset = dataset 156 | 157 | def evaluate(self, quesid2ans: dict): 158 | score = 0. 
159 | for quesid, ans in quesid2ans.items(): 160 | datum = self.dataset.id2datum[quesid] 161 | label = datum['label'] 162 | if ans in label: 163 | score += label[ans] 164 | return score / len(quesid2ans) 165 | 166 | def dump_result(self, quesid2ans: dict, path): 167 | """ 168 | Dump results to a json file, which could be submitted to the VQA online evaluation. 169 | VQA json file submission requirement: 170 | results = [result] 171 | result = { 172 | "question_id": int, 173 | "answer": str 174 | } 175 | 176 | :param quesid2ans: dict of quesid --> ans 177 | :param path: The desired path of saved file. 178 | """ 179 | with open(path, 'w') as f: 180 | result = [] 181 | for ques_id, ans in quesid2ans.items(): 182 | result.append({ 183 | 'question_id': ques_id, 184 | 'answer': ans 185 | }) 186 | json.dump(result, f, indent=4, sort_keys=True) 187 | 188 | 189 | -------------------------------------------------------------------------------- /src/tasks/nlvr2.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyleft 2019 project LXRT. 3 | 4 | import os 5 | import collections 6 | 7 | from tqdm import tqdm 8 | import torch 9 | import torch.nn as nn 10 | from torch.utils.data.dataloader import DataLoader 11 | 12 | from param import args 13 | from tasks.nlvr2_model import NLVR2Model 14 | from tasks.nlvr2_data import NLVR2Dataset, NLVR2TorchDataset, NLVR2Evaluator 15 | 16 | DataTuple = collections.namedtuple("DataTuple", 'dataset loader evaluator') 17 | 18 | 19 | def get_tuple(splits: str, bs:int, shuffle=False, drop_last=False) -> DataTuple: 20 | dset = NLVR2Dataset(splits) 21 | tset = NLVR2TorchDataset(dset) 22 | evaluator = NLVR2Evaluator(dset) 23 | data_loader = DataLoader( 24 | tset, batch_size=bs, 25 | shuffle=shuffle, num_workers=args.num_workers, 26 | drop_last=drop_last, pin_memory=True 27 | ) 28 | 29 | return DataTuple(dataset=dset, loader=data_loader, evaluator=evaluator) 30 | 31 | 32 | class NLVR2: 33 | def __init__(self): 34 | self.train_tuple = get_tuple( 35 | args.train, bs=args.batch_size, shuffle=True, drop_last=True 36 | ) 37 | if args.valid != "": 38 | valid_bsize = 2048 if args.multiGPU else 512 39 | self.valid_tuple = get_tuple( 40 | args.valid, bs=valid_bsize, 41 | shuffle=False, drop_last=False 42 | ) 43 | else: 44 | self.valid_tuple = None 45 | 46 | self.model = NLVR2Model() 47 | 48 | # Load pre-trained weights 49 | if args.load_lxmert is not None: 50 | self.model.lxrt_encoder.load(args.load_lxmert) 51 | 52 | # GPU options 53 | if args.multiGPU: 54 | self.model.lxrt_encoder.multi_gpu() 55 | self.model = self.model.cuda() 56 | 57 | # Losses and optimizer 58 | self.mce_loss = nn.CrossEntropyLoss(ignore_index=-1) 59 | if 'bert' in args.optim: 60 | batch_per_epoch = len(self.train_tuple.loader) 61 | t_total = int(batch_per_epoch * args.epochs) 62 | print("Total Iters: %d" % t_total) 63 | from lxrt.optimization import BertAdam 64 | self.optim = BertAdam(list(self.model.parameters()), 65 | lr=args.lr, 66 | warmup=0.1, 67 | t_total=t_total) 68 | else: 69 | self.optim = args.optimizer(list(self.model.parameters()), args.lr) 70 | 71 | self.output = args.output 72 | os.makedirs(self.output, exist_ok=True) 73 | 74 | def train(self, train_tuple, eval_tuple): 75 | dset, loader, evaluator = train_tuple 76 | iter_wrapper = (lambda x: tqdm(x, total=len(loader))) if args.tqdm else (lambda x: x) 77 | 78 | best_valid = 0. 
79 | for epoch in range(args.epochs): 80 | quesid2ans = {} 81 | for i, (ques_id, feats, boxes, sent, label) in iter_wrapper(enumerate(loader)): 82 | self.model.train() 83 | 84 | self.optim.zero_grad() 85 | feats, boxes, label = feats.cuda(), boxes.cuda(), label.cuda() 86 | logit = self.model(feats, boxes, sent) 87 | 88 | loss = self.mce_loss(logit, label) 89 | 90 | loss.backward() 91 | nn.utils.clip_grad_norm_(self.model.parameters(), 5.) 92 | self.optim.step() 93 | 94 | score, predict = logit.max(1) 95 | for qid, l in zip(ques_id, predict.cpu().numpy()): 96 | quesid2ans[qid] = l 97 | 98 | log_str = "\nEpoch %d: Train %0.2f\n" % (epoch, evaluator.evaluate(quesid2ans) * 100.) 99 | 100 | if self.valid_tuple is not None: # Do Validation 101 | valid_score = self.evaluate(eval_tuple) 102 | if valid_score > best_valid: 103 | best_valid = valid_score 104 | self.save("BEST") 105 | 106 | log_str += "Epoch %d: Valid %0.2f\n" % (epoch, valid_score * 100.) + \ 107 | "Epoch %d: Best %0.2f\n" % (epoch, best_valid * 100.) 108 | 109 | print(log_str, end='') 110 | 111 | with open(self.output + "/log.log", 'a') as f: 112 | f.write(log_str) 113 | f.flush() 114 | 115 | self.save("LAST") 116 | 117 | def predict(self, eval_tuple: DataTuple, dump=None): 118 | self.model.eval() 119 | dset, loader, evaluator = eval_tuple 120 | quesid2ans = {} 121 | for i, datum_tuple in enumerate(loader): 122 | ques_id, feats, boxes, sent = datum_tuple[:4] # avoid handling target 123 | with torch.no_grad(): 124 | feats, boxes = feats.cuda(), boxes.cuda() 125 | logit = self.model(feats, boxes, sent) 126 | score, predict = logit.max(1) 127 | for qid, l in zip(ques_id, predict.cpu().numpy()): 128 | quesid2ans[qid] = l 129 | if dump is not None: 130 | evaluator.dump_result(quesid2ans, dump) 131 | return quesid2ans 132 | 133 | def evaluate(self, eval_tuple: DataTuple, dump=None): 134 | dset, loader, evaluator = eval_tuple 135 | quesid2ans = self.predict(eval_tuple, dump) 136 | return evaluator.evaluate(quesid2ans) 137 | 138 | def save(self, name): 139 | torch.save(self.model.state_dict(), 140 | os.path.join(self.output, "%s.pth" % name)) 141 | 142 | def load(self, path): 143 | print("Load model from %s" % path) 144 | state_dict = torch.load("%s.pth" % path) 145 | self.model.load_state_dict(state_dict) 146 | 147 | 148 | if __name__ == "__main__": 149 | # Build Class 150 | nlvr2 = NLVR2() 151 | 152 | # Load Model 153 | if args.load is not None: 154 | nlvr2.load(args.load) 155 | 156 | # Test or Train 157 | if args.test is not None: 158 | args.fast = args.tiny = False # Always loading all data in test 159 | if 'hidden' in args.test: 160 | nlvr2.predict( 161 | get_tuple(args.test, bs=args.batch_size, 162 | shuffle=False, drop_last=False), 163 | dump=os.path.join(args.output, 'hidden_predict.csv') 164 | ) 165 | elif 'test' in args.test or 'valid' in args.test: 166 | result = nlvr2.evaluate( 167 | get_tuple(args.test, bs=args.batch_size, 168 | shuffle=False, drop_last=False), 169 | dump=os.path.join(args.output, '%s_predict.csv' % args.test) 170 | ) 171 | print(result) 172 | else: 173 | assert False, "No such test option for %s" % args.test 174 | else: 175 | print('Splits in Train data:', nlvr2.train_tuple.dataset.splits) 176 | if nlvr2.valid_tuple is not None: 177 | print('Splits in Valid data:', nlvr2.valid_tuple.dataset.splits) 178 | else: 179 | print("DO NOT USE VALIDATION") 180 | nlvr2.train(nlvr2.train_tuple, nlvr2.valid_tuple) 181 | 182 | 183 | -------------------------------------------------------------------------------- 
/src/tasks/gqa_data_patches.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyleft 2019 project LXRT. 3 | 4 | import json 5 | 6 | import numpy as np 7 | import torch 8 | from torch.utils.data import Dataset 9 | 10 | from src.param import args 11 | from src.utils import load_obj_tsv, load_patches 12 | 13 | # Load part of the dataset for fast checking. 14 | # Notice that here is the number of images instead of the number of data, 15 | # which means all related data to the images would be used. 16 | TINY_IMG_NUM = 512 17 | FAST_IMG_NUM = 5000 18 | 19 | 20 | class GQADataset: 21 | """ 22 | A GQA data example in json file: 23 | { 24 | "img_id": "2375429", 25 | "label": { 26 | "pipe": 1.0 27 | }, 28 | "question_id": "07333408", 29 | "sent": "What is on the white wall?" 30 | } 31 | """ 32 | def __init__(self, splits: str): 33 | self.name = splits 34 | self.splits = splits.split(',') 35 | 36 | # Loading datasets to data 37 | self.data = [] 38 | for split in self.splits: 39 | self.data.extend(json.load(open("data/gqa/%s.json" % split))) 40 | print("Load %d data from split(s) %s." % (len(self.data), self.name)) 41 | 42 | # List to dict (for evaluation and others) 43 | self.id2datum = { 44 | datum['question_id']: datum 45 | for datum in self.data 46 | } 47 | 48 | # Answers 49 | self.ans2label = json.load(open("data/gqa/trainval_ans2label.json")) 50 | self.label2ans = json.load(open("data/gqa/trainval_label2ans.json")) 51 | assert len(self.ans2label) == len(self.label2ans) 52 | for ans, label in self.ans2label.items(): 53 | assert self.label2ans[label] == ans 54 | 55 | @property 56 | def num_answers(self): 57 | return len(self.ans2label) 58 | 59 | def __len__(self): 60 | return len(self.data) 61 | 62 | 63 | class GQABufferLoader(): 64 | def __init__(self): 65 | self.key2data = {} 66 | 67 | def load_data(self, name, number): 68 | if name == 'testdev': 69 | path = "data/gqa/testdev_patches_32x32.hdf5" 70 | elif name == 'valid': 71 | path = "data/gqa/valid_patches_32x32.hdf5" 72 | else: 73 | path = "data/gqa/train_patches_32x32.hdf5" 74 | key = "%s_%d" % (path, number) 75 | if key not in self.key2data: 76 | self.key2data[key] = load_patches( 77 | path, 78 | dataset='gqa', 79 | topk=number 80 | ) 81 | return self.key2data[key] 82 | 83 | 84 | gqa_buffer_loader = GQABufferLoader() 85 | 86 | 87 | """ 88 | Example in obj tsv: 89 | FIELDNAMES = ["img_id", "img_h", "img_w", "objects_id", "objects_conf", 90 | "attrs_id", "attrs_conf", "num_boxes", "boxes", "features"] 91 | """ 92 | class GQATorchDataset(Dataset): 93 | def __init__(self, dataset: GQADataset): 94 | super().__init__() 95 | self.raw_dataset = dataset 96 | 97 | if args.tiny: 98 | topk = TINY_IMG_NUM 99 | elif args.fast: 100 | topk = FAST_IMG_NUM 101 | else: 102 | topk = -1 103 | 104 | # Loading detection features to img_data 105 | # Since images in train and valid both come from Visual Genome, 106 | # buffer the image loading to save memory. 
107 | img_data = [] 108 | if 'testdev' in dataset.splits or 'testdev_all' in dataset.splits: # Always loading all the data in testdev 109 | img_data.extend(gqa_buffer_loader.load_data('testdev', -1)) 110 | elif 'valid' in dataset.splits: 111 | img_data.extend(gqa_buffer_loader.load_data('valid', -1)) 112 | else: 113 | img_data.extend(gqa_buffer_loader.load_data('train', topk)) 114 | self.imgid2img = {} 115 | for img_datum in img_data: 116 | self.imgid2img[img_datum['img_id']] = img_datum 117 | 118 | # Only kept the data with loaded image features 119 | self.data = [] 120 | for datum in self.raw_dataset.data: 121 | if datum['img_id'] in self.imgid2img: 122 | self.data.append(datum) 123 | print("Use %d data in torch dataset" % (len(self.data))) 124 | print() 125 | 126 | def __len__(self): 127 | return len(self.data) 128 | 129 | def __getitem__(self, item: int): 130 | datum = self.data[item] 131 | 132 | img_id = datum['img_id'] 133 | ques_id = datum['question_id'] 134 | ques = datum['sent'] 135 | h,w = 7,7 136 | # Get image info 137 | img_info = self.imgid2img[img_id] 138 | obj_num = img_info['num_boxes'] 139 | # boxes = img_info['boxes'].copy() 140 | feats = img_info['features'].copy() 141 | # assert len(boxes) == len(feats) == obj_num 142 | boxes = np.ones(h*w + 1, dtype=np.float32) # assuming feats of shape [d, h, w] 143 | # Normalize the boxes (to 0 ~ 1) 144 | # img_h, img_w = img_info['img_h'], img_info['img_w'] 145 | # boxes = boxes.copy() 146 | # boxes[:, (0, 2)] /= img_w 147 | # boxes[:, (1, 3)] /= img_h 148 | # np.testing.assert_array_less(boxes, 1+1e-5) 149 | # np.testing.assert_array_less(-boxes, 0+1e-5) 150 | 151 | # Create target 152 | if 'label' in datum: 153 | label = datum['label'] 154 | target = torch.zeros(self.raw_dataset.num_answers) 155 | for ans, score in label.items(): 156 | if ans in self.raw_dataset.ans2label: 157 | target[self.raw_dataset.ans2label[ans]] = score 158 | return ques_id, feats, boxes, ques, target 159 | else: 160 | return ques_id, feats, boxes, ques 161 | 162 | 163 | class GQAEvaluator: 164 | def __init__(self, dataset: GQADataset): 165 | self.dataset = dataset 166 | 167 | def evaluate(self, quesid2ans: dict): 168 | score = 0. 169 | for quesid, ans in quesid2ans.items(): 170 | datum = self.dataset.id2datum[quesid] 171 | label = datum['label'] 172 | if ans in label: 173 | score += label[ans] 174 | return score / len(quesid2ans) 175 | 176 | def save_json(self, data, file_path): 177 | with open(file_path, "w") as f: 178 | json.dump(data, f) 179 | 180 | def dump_result(self, quesid2ans: dict, path): 181 | """ 182 | Dump the result to a GQA-challenge submittable json file. 183 | GQA json file submission requirement: 184 | results = [result] 185 | result = { 186 | "questionId": str, # Note: it's a actually an int number but the server requires an str. 187 | "prediction": str 188 | } 189 | 190 | :param quesid2ans: A dict mapping question id to its predicted answer. 191 | :param path: The file path to save the json file. 192 | :return: 193 | """ 194 | with open(path, 'w') as f: 195 | result = [] 196 | for ques_id, ans in quesid2ans.items(): 197 | result.append({ 198 | 'questionId': ques_id, 199 | 'prediction': ans 200 | }) 201 | json.dump(result, f, indent=4, sort_keys=True) 202 | 203 | 204 | -------------------------------------------------------------------------------- /src/tasks/gqa_data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyleft 2019 project LXRT. 
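# GQA dataset variant that reads grid ('spatial') features from gqa_spatial.h5 instead of
# the 36-object detector TSVs; since there are no region proposals, 'boxes' is a dummy
# vector of ones with one entry per spatial location plus one extra (presumably global)
# position.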
3 | 4 | import json 5 | 6 | import numpy as np 7 | import torch 8 | from torch.utils.data import Dataset 9 | 10 | from src.param import args 11 | from src.utils import load_obj_tsv, load_spatial_gqa 12 | 13 | # Load part of the dataset for fast checking. 14 | # Notice that here is the number of images instead of the number of data, 15 | # which means all related data to the images would be used. 16 | TINY_IMG_NUM = 512 17 | FAST_IMG_NUM = 5000 18 | 19 | 20 | class GQADataset: 21 | """ 22 | A GQA data example in json file: 23 | { 24 | "img_id": "2375429", 25 | "label": { 26 | "pipe": 1.0 27 | }, 28 | "question_id": "07333408", 29 | "sent": "What is on the white wall?" 30 | } 31 | """ 32 | def __init__(self, splits: str): 33 | self.name = splits 34 | self.splits = splits.split(',') 35 | 36 | # Loading datasets to data 37 | self.data = [] 38 | for split in self.splits: 39 | self.data.extend(json.load(open("../../data/gqa/%s.json" % split))) 40 | print("Load %d data from split(s) %s." % (len(self.data), self.name)) 41 | 42 | # List to dict (for evaluation and others) 43 | self.id2datum = { 44 | datum['question_id']: datum 45 | for datum in self.data 46 | } 47 | 48 | # Answers 49 | self.ans2label = json.load(open("../../data/gqa/trainval_ans2label.json")) 50 | self.label2ans = json.load(open("../../data/gqa/trainval_label2ans.json")) 51 | assert len(self.ans2label) == len(self.label2ans) 52 | for ans, label in self.ans2label.items(): 53 | assert self.label2ans[label] == ans 54 | 55 | @property 56 | def num_answers(self): 57 | return len(self.ans2label) 58 | 59 | def __len__(self): 60 | return len(self.data) 61 | 62 | 63 | class GQABufferLoader(): 64 | def __init__(self): 65 | self.key2data = {} 66 | 67 | def load_data(self, name, number): 68 | if name == 'testdev': 69 | # path = "data/vg_gqa_imgfeat/gqa_testdev_obj36.tsv" 70 | path = "../../data/gqa/gqa_spatial.h5" 71 | else: 72 | # path = "data/vg_gqa_imgfeat/vg_gqa_obj36.tsv" 73 | path = "../../data/gqa/gqa_spatial.h5" 74 | key = "%s_%d" % (path, number) 75 | if key not in self.key2data: 76 | self.key2data[key] = load_spatial_gqa( 77 | path, 78 | topk=number 79 | ) 80 | return self.key2data[key] 81 | 82 | 83 | gqa_buffer_loader = GQABufferLoader() 84 | 85 | 86 | """ 87 | Example in obj tsv: 88 | FIELDNAMES = ["img_id", "img_h", "img_w", "objects_id", "objects_conf", 89 | "attrs_id", "attrs_conf", "num_boxes", "boxes", "features"] 90 | """ 91 | class GQATorchDataset(Dataset): 92 | def __init__(self, dataset: GQADataset): 93 | super().__init__() 94 | self.raw_dataset = dataset 95 | # self.img_info_data = json.load('data/gqa/gqa_spatial_merged_info.json') 96 | 97 | if args.tiny: 98 | topk = TINY_IMG_NUM 99 | elif args.fast: 100 | topk = FAST_IMG_NUM 101 | else: 102 | topk = -1 103 | 104 | # Loading detection features to img_data 105 | # Since images in train and valid both come from Visual Genome, 106 | # buffer the image loading to save memory. 
107 | img_data = [] 108 | if 'testdev' in dataset.splits or 'testdev_all' in dataset.splits: # Always loading all the data in testdev 109 | img_data.extend(gqa_buffer_loader.load_data('testdev', -1)) 110 | else: 111 | img_data.extend(gqa_buffer_loader.load_data('train', topk)) 112 | self.imgid2img = {} 113 | for img_datum in img_data: 114 | self.imgid2img[img_datum['img_id']] = img_datum 115 | 116 | # Only kept the data with loaded image features 117 | self.data = [] 118 | for datum in self.raw_dataset.data: 119 | if datum['img_id'] in self.imgid2img: 120 | self.data.append(datum) 121 | print("Use %d data in torch dataset" % (len(self.data))) 122 | print() 123 | 124 | def __len__(self): 125 | return len(self.data) 126 | 127 | def __getitem__(self, item: int): 128 | datum = self.data[item] 129 | 130 | img_id = datum['img_id'] 131 | ques_id = datum['question_id'] 132 | ques = datum['sent'] 133 | 134 | # Get image info 135 | img_info = self.imgid2img[img_id] 136 | obj_num = img_info['num_boxes'] 137 | # boxes = img_info['boxes'].copy() 138 | 139 | feats = img_info['features'].copy() 140 | ##Aisha change: 141 | 142 | boxes = np.ones(feats.shape[1]*feats.shape[2]+1, dtype=np.float32) #assuming feats of shape [d, h, w] 143 | # assert len(boxes) == len(feats) == obj_num 144 | 145 | # Normalize the boxes (to 0 ~ 1) 146 | # img_h, img_w = img_info['img_h'], img_info['img_w'] 147 | # boxes = boxes.copy() 148 | # boxes[:, (0, 2)] /= img_w 149 | # boxes[:, (1, 3)] /= img_h 150 | # np.testing.assert_array_less(boxes, 1+1e-5) 151 | # np.testing.assert_array_less(-boxes, 0+1e-5) 152 | 153 | # Create target 154 | if 'label' in datum: 155 | label = datum['label'] 156 | target = torch.zeros(self.raw_dataset.num_answers) 157 | for ans, score in label.items(): 158 | if ans in self.raw_dataset.ans2label: 159 | target[self.raw_dataset.ans2label[ans]] = score 160 | return ques_id, feats, boxes, ques, target 161 | else: 162 | return ques_id, feats, boxes, ques 163 | 164 | 165 | class GQAEvaluator: 166 | def __init__(self, dataset: GQADataset): 167 | self.dataset = dataset 168 | 169 | def evaluate(self, quesid2ans: dict): 170 | score = 0. 171 | for quesid, ans in quesid2ans.items(): 172 | datum = self.dataset.id2datum[quesid] 173 | label = datum['label'] 174 | if ans in label: 175 | score += label[ans] 176 | return score / len(quesid2ans) 177 | 178 | def save_json(self, data, file_path): 179 | with open(file_path, "w") as f: 180 | json.dump(data, f) 181 | 182 | def dump_result(self, quesid2ans: dict, path): 183 | """ 184 | Dump the result to a GQA-challenge submittable json file. 185 | GQA json file submission requirement: 186 | results = [result] 187 | result = { 188 | "questionId": str, # Note: it's a actually an int number but the server requires an str. 189 | "prediction": str 190 | } 191 | 192 | :param quesid2ans: A dict mapping question id to its predicted answer. 193 | :param path: The file path to save the json file. 194 | :return: 195 | """ 196 | with open(path, 'w') as f: 197 | result = [] 198 | for ques_id, ans in quesid2ans.items(): 199 | result.append({ 200 | 'questionId': ques_id, 201 | 'prediction': ans 202 | }) 203 | json.dump(result, f, indent=4, sort_keys=True) 204 | 205 | 206 | -------------------------------------------------------------------------------- /src/tasks/vqahat_data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyleft 2019 project LXRT. 
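# VQA-HAT data pipeline: the class names are kept from the GQA grid-feature version above,
# but questions are read from data/VQA_HAT/*.json, features come from the MS COCO
# train/valid HDF5 files, and numeric image ids are remapped to COCO_val2014_* keys
# before lookup.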
3 | 4 | import json 5 | 6 | import numpy as np 7 | import torch 8 | from torch.utils.data import Dataset 9 | 10 | from src.param import args 11 | from src.utils import load_spatial_data 12 | 13 | # Load part of the dataset for fast checking. 14 | # Notice that here is the number of images instead of the number of data, 15 | # which means all related data to the images would be used. 16 | TINY_IMG_NUM = 512 17 | FAST_IMG_NUM = 5000 18 | 19 | 20 | class GQADataset: 21 | """ 22 | A GQA data example in json file: 23 | { 24 | "img_id": "2375429", 25 | "label": { 26 | "pipe": 1.0 27 | }, 28 | "question_id": "07333408", 29 | "sent": "What is on the white wall?" 30 | } 31 | """ 32 | def __init__(self, splits: str): 33 | self.name = splits 34 | self.splits = splits.split(',') 35 | 36 | # Loading datasets to data 37 | self.data = [] 38 | for split in self.splits: 39 | self.data.extend(json.load(open("../../data/VQA_HAT/%s.json" % split))) 40 | print("Load %d data from split(s) %s." % (len(self.data), self.name)) 41 | 42 | # List to dict (for evaluation and others) 43 | self.id2datum = { 44 | datum['question_id']: datum 45 | for datum in self.data 46 | } 47 | 48 | # Answers 49 | self.ans2label = json.load(open("../../data/VQA_HAT/trainval_ans2label.json")) 50 | self.label2ans = json.load(open("../../data/VQA_HAT/trainval_label2ans.json")) 51 | assert len(self.ans2label) == len(self.label2ans) 52 | for ans, label in self.ans2label.items(): 53 | assert self.label2ans[label] == ans 54 | 55 | @property 56 | def num_answers(self): 57 | return len(self.ans2label) 58 | 59 | def __len__(self): 60 | return len(self.data) 61 | 62 | 63 | class GQABufferLoader(): 64 | def __init__(self): 65 | self.key2data = {} 66 | 67 | def load_data(self, name, number): 68 | if name == 'testdev': 69 | # path = "data/vg_gqa_imgfeat/gqa_testdev_obj36.tsv" 70 | path = "../../data/mscoco_imgfeat/valid_features.hdf5" 71 | elif name == 'valid': 72 | # path = "data/vg_gqa_imgfeat/vg_gqa_obj36.tsv" 73 | path = "../../data/mscoco_imgfeat/valid_features.hdf5" 74 | else: 75 | path = "../../data/mscoco_imgfeat/train_features.hdf5" 76 | key = "%s_%d" % (path, number) 77 | print(key) 78 | if key not in self.key2data: 79 | self.key2data[key] = load_spatial_data( 80 | path, 81 | topk=number 82 | ) 83 | return self.key2data[key] 84 | 85 | 86 | gqa_buffer_loader = GQABufferLoader() 87 | 88 | 89 | """ 90 | Example in obj tsv: 91 | FIELDNAMES = ["img_id", "img_h", "img_w", "objects_id", "objects_conf", 92 | "attrs_id", "attrs_conf", "num_boxes", "boxes", "features"] 93 | """ 94 | class GQATorchDataset(Dataset): 95 | def __init__(self, dataset: GQADataset): 96 | super().__init__() 97 | self.raw_dataset = dataset 98 | # self.img_info_data = json.load('data/gqa/gqa_spatial_merged_info.json') 99 | 100 | if args.tiny: 101 | topk = TINY_IMG_NUM 102 | elif args.fast: 103 | topk = FAST_IMG_NUM 104 | else: 105 | topk = -1 106 | 107 | # Loading detection features to img_data 108 | # Since images in train and valid both come from Visual Genome, 109 | # buffer the image loading to save memory. 
110 | img_data = [] 111 | if 'testdev' in dataset.splits or 'testdev_all' in dataset.splits: # Always loading all the data in testdev 112 | img_data.extend(gqa_buffer_loader.load_data('testdev', -1)) 113 | elif 'valid' in dataset.splits: 114 | img_data.extend(gqa_buffer_loader.load_data('valid', -1)) 115 | else: 116 | img_data.extend(gqa_buffer_loader.load_data('train', topk)) 117 | self.imgid2img = {} 118 | for img_datum in img_data: 119 | self.imgid2img[img_datum['img_id']] = img_datum 120 | 121 | # Only kept the data with loaded image features 122 | self.data = [] 123 | for datum in self.raw_dataset.data: 124 | datum_img_id = "COCO_val2014_{0:012d}".format(datum['image_id']) 125 | if datum_img_id in self.imgid2img: 126 | datum['image_id'] = datum_img_id 127 | self.data.append(datum) 128 | print("Use %d data in torch dataset" % (len(self.data))) 129 | print() 130 | 131 | def __len__(self): 132 | return len(self.data) 133 | 134 | def __getitem__(self, item: int): 135 | datum = self.data[item] 136 | 137 | img_id = datum['image_id'] 138 | ques_id = datum['question_id'] 139 | ques = datum['question'] 140 | 141 | # Get image info 142 | img_info = self.imgid2img[img_id] 143 | obj_num = img_info['num_boxes'] 144 | # boxes = img_info['boxes'].copy() 145 | 146 | feats = img_info['features'].copy() 147 | ##Aisha change: 148 | 149 | boxes = np.ones(feats.shape[1]*feats.shape[2]+1, dtype=np.float32) #assuming feats of shape [d, h, w] 150 | # assert len(boxes) == len(feats) == obj_num 151 | 152 | # Normalize the boxes (to 0 ~ 1) 153 | # img_h, img_w = img_info['img_h'], img_info['img_w'] 154 | # boxes = boxes.copy() 155 | # boxes[:, (0, 2)] /= img_w 156 | # boxes[:, (1, 3)] /= img_h 157 | # np.testing.assert_array_less(boxes, 1+1e-5) 158 | # np.testing.assert_array_less(-boxes, 0+1e-5) 159 | 160 | # Create target 161 | if 'label' in datum: 162 | label = datum['label'] 163 | target = torch.zeros(self.raw_dataset.num_answers) 164 | for ans, score in label.items(): 165 | if ans in self.raw_dataset.ans2label: 166 | target[self.raw_dataset.ans2label[ans]] = score 167 | return ques_id, feats, boxes, ques, target 168 | else: 169 | return ques_id, feats, boxes, ques 170 | 171 | 172 | class GQAEvaluator: 173 | def __init__(self, dataset: GQADataset): 174 | self.dataset = dataset 175 | 176 | def evaluate(self, quesid2ans: dict): 177 | score = 0. 178 | for quesid, ans in quesid2ans.items(): 179 | datum = self.dataset.id2datum[quesid] 180 | label = datum['label'] 181 | if ans in label: 182 | score += label[ans] 183 | return score / len(quesid2ans) 184 | 185 | def save_json(self, data, file_path): 186 | with open(file_path, "w") as f: 187 | json.dump(data, f) 188 | 189 | def dump_result(self, quesid2ans: dict, path): 190 | """ 191 | Dump the result to a GQA-challenge submittable json file. 192 | GQA json file submission requirement: 193 | results = [result] 194 | result = { 195 | "questionId": str, # Note: it's a actually an int number but the server requires an str. 196 | "prediction": str 197 | } 198 | 199 | :param quesid2ans: A dict mapping question id to its predicted answer. 200 | :param path: The file path to save the json file. 
201 | :return: 202 | """ 203 | with open(path, 'w') as f: 204 | result = [] 205 | for ques_id, ans in quesid2ans.items(): 206 | result.append({ 207 | 'questionId': ques_id, 208 | 'prediction': ans 209 | }) 210 | json.dump(result, f, indent=4, sort_keys=True) 211 | 212 | 213 | -------------------------------------------------------------------------------- /src/param.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyleft 2019 project LXRT. 3 | 4 | import argparse 5 | import random 6 | 7 | import numpy as np 8 | import torch 9 | 10 | 11 | def get_optimizer(optim): 12 | # Bind the optimizer 13 | if optim == 'rms': 14 | print("Optimizer: Using RMSProp") 15 | optimizer = torch.optim.RMSprop 16 | elif optim == 'adam': 17 | print("Optimizer: Using Adam") 18 | optimizer = torch.optim.Adam 19 | elif optim == 'adamax': 20 | print("Optimizer: Using Adamax") 21 | optimizer = torch.optim.Adamax 22 | elif optim == 'sgd': 23 | print("Optimizer: sgd") 24 | optimizer = torch.optim.SGD 25 | elif 'bert' in optim: 26 | optimizer = 'bert' # The bert optimizer will be bind later. 27 | else: 28 | assert False, "Please add your optimizer %s in the list." % optim 29 | 30 | return optimizer 31 | 32 | 33 | def parse_args(): 34 | parser = argparse.ArgumentParser() 35 | 36 | # Data Splits 37 | parser.add_argument("--train", default='train') 38 | parser.add_argument("--valid", default='valid') 39 | parser.add_argument("--test", default=None) 40 | 41 | # Training Hyper-parameters 42 | parser.add_argument('--batchSize', dest='batch_size', type=int, default=256) 43 | parser.add_argument('--optim', default='bert') 44 | parser.add_argument('--lr', type=float, default=1e-4) 45 | parser.add_argument('--epochs', type=int, default=10) 46 | parser.add_argument('--dropout', type=float, default=0.1) 47 | parser.add_argument('--margin', type=float, default=0.1) 48 | parser.add_argument('--seed', type=int, default=9595, help='random seed') 49 | 50 | # Debugging 51 | parser.add_argument('--output', type=str, default='snap/test') 52 | parser.add_argument("--fast", action='store_const', default=False, const=True) 53 | parser.add_argument("--tiny", action='store_const', default=False, const=True) 54 | parser.add_argument("--tqdm", action='store_const', default=False, const=True) 55 | 56 | # Model Loading 57 | parser.add_argument('--load', type=str, default=None, 58 | help='Load the model (usually the fine-tuned model).') 59 | parser.add_argument('--loadLXMERT', dest='load_lxmert', type=str, default=None, 60 | help='Load the pre-trained LXMERT model.') 61 | parser.add_argument('--loadLXMERTQA', dest='load_lxmert_qa', type=str, default=None, 62 | help='Load the pre-trained LXMERT model with QA answer head.') 63 | parser.add_argument("--fromScratch", dest='from_scratch', action='store_const', default=False, const=True, 64 | help='If none of the --load, --loadLXMERT, --loadLXMERTQA is set, ' 65 | 'the model would be trained from scratch. If --fromScratch is' 66 | ' not specified, the model would load BERT-pre-trained weights by' 67 | ' default. 
') 68 | 69 | parser.add_argument("--vitInit", dest='vit_init', action='store_const', default=False, const=True, 70 | help='If --vitInit specified, rlayers will be initialized from vit weights ' 71 | 'starting from layer index specified with --startIndex') 72 | 73 | # Optimization 74 | parser.add_argument("--mceLoss", dest='mce_loss', action='store_const', default=False, const=True) 75 | 76 | # LXRT Model Config 77 | # Note: LXRT = L, X, R (three encoders), Transformer 78 | parser.add_argument("--llayers", default=9, type=int, help='Number of Language layers') 79 | parser.add_argument("--xlayers", default=5, type=int, help='Number of CROSS-modality layers.') 80 | parser.add_argument("--rlayers", default=5, type=int, help='Number of object Relationship layers.') 81 | parser.add_argument("--startIndex", dest='start_index', default=7, type=int, 82 | help='Specify the layer index to start loading vit weights from.') 83 | parser.add_argument("--skipConnection", dest='skip_connection', action='store_const', default=False, const=True) 84 | parser.add_argument("--sharedWeights", dest='shared_weights', action='store_const', default=False, const=True) 85 | parser.add_argument("--normInputs", dest='norm_inputs', action='store_const', default=False, const=True) 86 | parser.add_argument("--crossAttn", dest='cross_attn', action='store_const', default=False, const=True) 87 | parser.add_argument("--crossAttnType", dest='cross_attn_type', default="cross", type=str, 88 | choices=["cross", "self", 'cross_self', 'no_cross', 'old'], 89 | help='Types of cross-modality attention') 90 | parser.add_argument("--patches", dest='patches', action='store_const', default=False, const=True) 91 | parser.add_argument("--attnRouting", dest='attn_routing', action='store_const', default=False, const=True) 92 | parser.add_argument("--freezeWeights", dest='freeze_weights', action='store_const', default=False, const=True) 93 | parser.add_argument("--noCaps", dest='no_caps', action='store_const', default=False, const=True) 94 | parser.add_argument("--NUM_PRIM_CAPS", default=32, type=int, help='Number of primary capsules.') 95 | parser.add_argument("--NUM_VIS_CAPS", default=32, type=int, help='Number of visual capsules.') 96 | parser.add_argument("--POSE_DIM", default=4, type=int, help='Pose matrix size. 
Default is 4.') 97 | parser.add_argument("--HW", default=7, type=int, help='Spatial feature map size.') 98 | 99 | # LXRT evaluation 100 | parser.add_argument("--outputAttn", dest='output_attention', action='store_const', default=False, const=True) 101 | parser.add_argument("--numBlock", dest='num_block', default=-1, type=int) 102 | parser.add_argument("--gradCAM", dest='gradcam', action='store_const', default=False, const=True) 103 | # LXMERT Pre-training Config 104 | parser.add_argument("--taskMatched", dest='task_matched', action='store_const', default=False, const=True) 105 | parser.add_argument("--taskMaskLM", dest='task_mask_lm', action='store_const', default=False, const=True) 106 | parser.add_argument("--taskObjPredict", dest='task_obj_predict', action='store_const', default=False, const=True) 107 | parser.add_argument("--taskQA", dest='task_qa', action='store_const', default=False, const=True) 108 | parser.add_argument("--taskGrounding", dest='task_grounding', action='store_const', default=False, const=True) 109 | parser.add_argument("--taskContrastive", dest='task_contrastive', action='store_const', default=False, const=True) 110 | parser.add_argument("--visualLosses", dest='visual_losses', default='obj,attr,feat', type=str) 111 | parser.add_argument("--qaSets", dest='qa_sets', default=None, type=str) 112 | parser.add_argument("--excludeSet", dest='exclude_set', default='', type=str) 113 | parser.add_argument("--wordMaskRate", dest='word_mask_rate', default=0.15, type=float) 114 | parser.add_argument("--objMaskRate", dest='obj_mask_rate', default=0.15, type=float) 115 | 116 | # LXMERT Finetuning Config 117 | parser.add_argument('--trainParadigm', 118 | dest='train_paradigm', 119 | default='full', 120 | const='full', 121 | nargs='?', 122 | choices=['full', 'weak', 'zero'], 123 | help='training paradigm for refer expression task') 124 | # Training configuration 125 | parser.add_argument("--multiGPU", action='store_const', default=False, const=True) 126 | parser.add_argument("--numWorkers", dest='num_workers', default=0) 127 | 128 | # Parse the arguments. 129 | args = parser.parse_args() 130 | 131 | # Bind optimizer class. 132 | args.optimizer = get_optimizer(args.optim) 133 | 134 | # Set seeds 135 | torch.manual_seed(args.seed) 136 | random.seed(args.seed) 137 | np.random.seed(args.seed) 138 | 139 | return args 140 | 141 | 142 | args = parse_args() 143 | -------------------------------------------------------------------------------- /src/tasks/refcocoplus_data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyleft 2019 project LXRT. 3 | 4 | import json 5 | 6 | import numpy as np 7 | import torch 8 | from torch.utils.data import Dataset 9 | 10 | from src.param import args 11 | from src.utils import load_obj_tsv, load_spatial_data 12 | 13 | # Load part of the dataset for fast checking. 14 | # Notice that here is the number of images instead of the number of data, 15 | # which means all related data to the images would be used. 
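# These caps are what the --tiny / --fast flags defined in src/param.py select below;
# with neither flag set, topk stays -1 and load_spatial_data loads every image feature.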
16 | TINY_IMG_NUM = 512 17 | FAST_IMG_NUM = 5000 18 | 19 | 20 | class RefCOCOplusDataset: 21 | """ 22 | A GQA data example in json file: 23 | { 24 | "caption": caption, 25 | "sent_id": sent_id, 26 | "image_id": image_id, 27 | "refBox": refBox, 28 | "ref_id": ref_id, --> unique id assigned to each data sample 29 | } 30 | """ 31 | def __init__(self, splits: str): 32 | self.name = splits 33 | self.splits = splits.split(',') 34 | 35 | # Loading datasets to data 36 | self.data = [] 37 | for split in self.splits: 38 | self.data.extend(json.load(open("/data/Grounded-RL2021/lxmert/data/refcoco+/annotations_%s.json" % split))) 39 | print("Load %d data from split(s) %s." % (len(self.data), self.name)) 40 | 41 | 42 | # List to dict (for evaluation and others) 43 | self.id2datum = { 44 | datum['sent_id']: datum 45 | for datum in self.data 46 | } 47 | 48 | # Answers 49 | # self.ans2label = json.load(open("data/refcoco/trainval_ans2label.json")) 50 | # self.label2ans = json.load(open("data/refcoco/trainval_label2ans.json")) 51 | # assert len(self.ans2label) == len(self.label2ans) 52 | # for ans, label in self.ans2label.items(): 53 | # assert self.label2ans[label] == ans 54 | 55 | @property 56 | # def num_answers(self): 57 | # return len(self.ans2label) 58 | 59 | def __len__(self): 60 | return len(self.data) 61 | 62 | 63 | class RefCOCOplusBufferLoader(): 64 | def __init__(self): 65 | self.key2data = {} 66 | 67 | def load_data(self, name, number): 68 | # if name == 'testdev': 69 | # # path = "data/vg_gqa_imgfeat/gqa_testdev_obj36.tsv" 70 | # path = "data/refcoco/refcoco_testdev_spatial.h5" 71 | # else: 72 | # # path = "data/vg_gqa_imgfeat/vg_gqa_obj36.tsv" 73 | # path = "data/refcoco/refcoco_testdev_spatial.h5" 74 | path = "/data/Grounded-RL2021/lxmert/data/refcoco+/{}_features.hdf5".format(name) 75 | key = "%s_%d" % (path, number) 76 | if key not in self.key2data: 77 | self.key2data[key] = load_spatial_data( 78 | path, 79 | topk=number 80 | ) 81 | return self.key2data[key] 82 | 83 | 84 | gqa_buffer_loader = RefCOCOplusBufferLoader() 85 | 86 | 87 | """ 88 | Example in obj tsv: 89 | FIELDNAMES = ["img_id", "img_h", "img_w", "objects_id", "objects_conf", 90 | "attrs_id", "attrs_conf", "num_boxes", "boxes", "features"] 91 | """ 92 | class RefCOCOplusTorchDataset(Dataset): 93 | def __init__(self, dataset: RefCOCOplusDataset): 94 | super().__init__() 95 | self.raw_dataset = dataset 96 | # self.img_info_data = json.load('data/gqa/gqa_spatial_merged_info.json') 97 | 98 | if args.tiny: 99 | topk = TINY_IMG_NUM 100 | elif args.fast: 101 | topk = FAST_IMG_NUM 102 | else: 103 | topk = -1 104 | 105 | # Loading detection features to img_data 106 | # Since images in train and valid both come from Visual Genome, 107 | # buffer the image loading to save memory. 
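        # Each img_datum yielded by the buffer loader is expected to expose at least
        # 'image_id', 'num_boxes', 'img_h', 'img_w' and a 'features' array of shape
        # [d, h, w]; only these keys are read below (inferred from usage, not from the
        # hdf5 schema itself).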
108 | img_data = [] 109 | if 'test' in dataset.splits or 'test' in dataset.splits: # Always loading all the data in testdev 110 | img_data.extend(gqa_buffer_loader.load_data('test', -1)) 111 | elif 'valid' in dataset.splits or 'valid' in dataset.splits: # Always loading all the data in testdev 112 | img_data.extend(gqa_buffer_loader.load_data('valid', -1)) 113 | else: 114 | img_data.extend(gqa_buffer_loader.load_data('train', topk)) 115 | self.imgid2img = {} 116 | for img_datum in img_data: 117 | self.imgid2img[img_datum['image_id']] = img_datum 118 | 119 | # Only kept the data with loaded image features 120 | self.data = [] 121 | for datum in self.raw_dataset.data: 122 | if datum['image_id'] in self.imgid2img: 123 | self.data.append(datum) 124 | print("Use %d data in torch dataset" % (len(self.data))) 125 | print() 126 | 127 | def __len__(self): 128 | return len(self.data) 129 | 130 | def __getitem__(self, item: int): 131 | datum = self.data[item] 132 | 133 | img_id = datum['image_id'] 134 | sent_id = datum['sent_id'] 135 | sent = datum['caption'] 136 | 137 | # Get image info 138 | img_info = self.imgid2img[img_id] 139 | obj_num = img_info['num_boxes'] 140 | # boxes = img_info['boxes'].copy() 141 | 142 | feats = img_info['features'].copy() 143 | ##Aisha change: 144 | 145 | boxes = np.ones(feats.shape[1]*feats.shape[2], dtype=np.float32) #assuming feats of shape [d, h, w] 146 | # assert len(boxes) == len(feats) == obj_num 147 | 148 | target_box = datum['refBox'] 149 | # Normalize the boxes (to 0 ~ 1) 150 | img_h, img_w = img_info['img_h'], img_info['img_w'] 151 | target_box = target_box.copy() 152 | # target_box[:, (0, 2)] /= img_w 153 | # target_box[:, (1, 3)] /= img_h 154 | target_box[0] /= img_w 155 | target_box[2] /= img_w 156 | target_box[1] /= img_h 157 | target_box[3] /= img_h 158 | np.testing.assert_array_less(np.array(target_box), 1+1e-5) 159 | np.testing.assert_array_less(-np.array(target_box), 0+1e-5) 160 | 161 | # Create target 162 | # if 'label' in datum: 163 | # label = datum['label'] 164 | # target = torch.zeros(self.raw_dataset.num_answers) 165 | # for ans, score in label.items(): 166 | # if ans in self.raw_dataset.ans2label: 167 | # target[self.raw_dataset.ans2label[ans]] = score 168 | # return ref_id, feats, target_box, sent, target 169 | # else: 170 | return sent_id, feats, boxes, sent, target_box 171 | 172 | 173 | class RefCOCOplusEvaluator: 174 | def __init__(self, dataset: RefCOCOplusDataset): 175 | self.dataset = dataset 176 | 177 | def evaluate(self, sentid2box: dict): 178 | score = 0. 179 | for sentid, box in sentid2box.items(): 180 | datum = self.dataset.id2datum[sentid] 181 | label = datum['refBox'] 182 | if box in label: 183 | score += label[box] 184 | return score / len(sentid2box) 185 | 186 | def save_json(self, data, file_path): 187 | with open(file_path, "w") as f: 188 | json.dump(data, f) 189 | 190 | def dump_result(self, quesid2ans: dict, path): 191 | """ 192 | Dump the result to a GQA-challenge submittable json file. 193 | GQA json file submission requirement: 194 | results = [result] 195 | result = { 196 | "questionId": str, # Note: it's a actually an int number but the server requires an str. 197 | "prediction": str 198 | } 199 | 200 | :param quesid2ans: A dict mapping question id to its predicted answer. 201 | :param path: The file path to save the json file. 
202 | :return: 203 | """ 204 | with open(path, 'w') as f: 205 | result = [] 206 | for ques_id, ans in quesid2ans.items(): 207 | result.append({ 208 | 'questionId': ques_id, 209 | 'prediction': ans 210 | }) 211 | json.dump(result, f, indent=4, sort_keys=True) 212 | 213 | 214 | -------------------------------------------------------------------------------- /src/lxrt/entry.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 project LXRT. 3 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 4 | # Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. 5 | # 6 | # Licensed under the Apache License, Version 2.0 (the "License"); 7 | # you may not use this file except in compliance with the License. 8 | # You may obtain a copy of the License at 9 | # 10 | # http://www.apache.org/licenses/LICENSE-2.0 11 | # 12 | # Unless required by applicable law or agreed to in writing, software 13 | # distributed under the License is distributed on an "AS IS" BASIS, 14 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 15 | # See the License for the specific language governing permissions and 16 | # limitations under the License. 17 | 18 | import os 19 | 20 | import torch 21 | import torch.nn as nn 22 | 23 | from src.lxrt.tokenization import BertTokenizer 24 | from src.lxrt.modeling_capsbert import LXRTFeatureExtraction as VisualBertForLXRFeature, VISUAL_CONFIG 25 | 26 | 27 | class InputFeatures(object): 28 | """A single set of features of data.""" 29 | 30 | def __init__(self, input_ids, input_mask, segment_ids): 31 | self.input_ids = input_ids 32 | self.input_mask = input_mask 33 | self.segment_ids = segment_ids 34 | 35 | 36 | def convert_sents_to_features(sents, max_seq_length, tokenizer): 37 | """Loads a data file into a list of `InputBatch`s.""" 38 | 39 | features = [] 40 | for (i, sent) in enumerate(sents): 41 | tokens_a = tokenizer.tokenize(sent.strip()) 42 | 43 | # Account for [CLS] and [SEP] with "- 2" 44 | if len(tokens_a) > max_seq_length - 2: 45 | tokens_a = tokens_a[:(max_seq_length - 2)] 46 | 47 | # Keep segment id which allows loading BERT-weights. 48 | tokens = ["[CLS]"] + tokens_a + ["[SEP]"] 49 | segment_ids = [0] * len(tokens) 50 | 51 | input_ids = tokenizer.convert_tokens_to_ids(tokens) 52 | 53 | # The mask has 1 for real tokens and 0 for padding tokens. Only real 54 | # tokens are attended to. 55 | input_mask = [1] * len(input_ids) 56 | 57 | # Zero-pad up to the sequence length. 
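        # For example, with max_seq_length = 8 and sent = "man on a horse" (a made-up
        # sentence, assuming each word maps to a single wordpiece):
        #   tokens      -> ['[CLS]', 'man', 'on', 'a', 'horse', '[SEP]']
        #   input_mask  -> [1, 1, 1, 1, 1, 1, 0, 0]
        #   segment_ids -> [0, 0, 0, 0, 0, 0, 0, 0]
        # with input_ids zero-padded from 6 entries to 8 by the lines below.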
58 | padding = [0] * (max_seq_length - len(input_ids)) 59 | input_ids += padding 60 | input_mask += padding 61 | segment_ids += padding 62 | 63 | assert len(input_ids) == max_seq_length 64 | assert len(input_mask) == max_seq_length 65 | assert len(segment_ids) == max_seq_length 66 | 67 | features.append( 68 | InputFeatures(input_ids=input_ids, 69 | input_mask=input_mask, 70 | segment_ids=segment_ids)) 71 | return features 72 | 73 | 74 | def set_visual_config(args): 75 | VISUAL_CONFIG.l_layers = args.llayers 76 | VISUAL_CONFIG.x_layers = args.xlayers 77 | VISUAL_CONFIG.r_layers = args.rlayers 78 | 79 | #capsules config 80 | VISUAL_CONFIG.num_prim_caps = args.NUM_PRIM_CAPS 81 | VISUAL_CONFIG.num_vis_caps = args.NUM_VIS_CAPS 82 | VISUAL_CONFIG.pose_matrix_dim = args.POSE_DIM 83 | VISUAL_CONFIG.hw = args.HW 84 | VISUAL_CONFIG.caps_dim = args.NUM_VIS_CAPS * (args.POSE_DIM*args.POSE_DIM+1) 85 | VISUAL_CONFIG.is_attn_routing = args.attn_routing 86 | print(VISUAL_CONFIG.num_prim_caps) 87 | 88 | 89 | class LXRTEncoder(nn.Module): 90 | def __init__(self, args, max_seq_length, mode='x'): 91 | super().__init__() 92 | self.max_seq_length = max_seq_length 93 | set_visual_config(args) 94 | self.args = args 95 | self.mode = mode 96 | if torch.cuda.is_available(): 97 | self.device = 'cuda' 98 | else: 99 | self.device = 'cpu' 100 | 101 | # Using the bert tokenizer 102 | self.tokenizer = BertTokenizer.from_pretrained( 103 | "bert-base-uncased", 104 | do_lower_case=True 105 | ) 106 | cross_attn_type = args.cross_attn_type if hasattr(args, 'cross_attn_type') else 'old' 107 | 108 | # Build LXRT Model 109 | self.model = VisualBertForLXRFeature.from_pretrained( 110 | "bert-base-uncased", 111 | mode=mode, 112 | skip_connection=args.skip_connection, 113 | shared_weights=args.shared_weights, 114 | cross_attn = args.cross_attn, 115 | cross_attn_type=cross_attn_type, 116 | freeze_weights = args.freeze_weights, 117 | patches=args.patches, 118 | margin=args.margin, 119 | vit_init=args.vit_init, 120 | start_index=args.start_index, 121 | no_caps = args.no_caps 122 | ) 123 | 124 | if args.from_scratch: 125 | print("initializing all the weights") 126 | self.model.apply(self.model.init_bert_weights) 127 | 128 | # GPU Options 129 | if torch.cuda.is_available(): 130 | self.model = self.model.cuda() 131 | if args.multiGPU: 132 | self.model = nn.DataParallel(self.model) 133 | 134 | def multi_gpu(self): 135 | self.model = nn.DataParallel(self.model) 136 | 137 | @property 138 | def dim(self): 139 | return 768 140 | 141 | def forward(self, sents, feats, visual_attention_mask=None): 142 | train_features = convert_sents_to_features( 143 | sents, self.max_seq_length, self.tokenizer) 144 | 145 | input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long).to(self.device) 146 | input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long).to(self.device) 147 | segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long).to(self.device) 148 | 149 | # print(feats[0].shape) 150 | if self.mode == 'lxr': 151 | feat, output, attn_probs = self.model(input_ids, segment_ids, input_mask, 152 | visual_feats=feats, 153 | visual_attention_mask=visual_attention_mask, 154 | output_all_attention_masks=self.args.output_attention) 155 | else: 156 | feat = None 157 | output, attn_probs = self.model(input_ids, segment_ids, input_mask, 158 | visual_feats=feats, 159 | visual_attention_mask=visual_attention_mask, output_all_attention_masks=self.args.output_attention) 160 | return feat, 
output, attn_probs 161 | 162 | def save(self, path): 163 | torch.save(self.model.state_dict(), 164 | os.path.join("%s_LXRT.pth" % path)) 165 | 166 | def load(self, path): 167 | # Load state_dict from snapshot file 168 | print("Load LXMERT pre-trained model from %s" % path) 169 | state_dict = torch.load("%s_LXRT.pth" % path, map_location=torch.device(self.device)) 170 | new_state_dict = {} 171 | for key, value in state_dict.items(): 172 | if key.startswith("module."): 173 | new_state_dict[key[len("module."):]] = value 174 | 175 | else: 176 | new_state_dict[key] = value 177 | if key.startswith("lxrt_encoder.model."): 178 | new_state_dict[key[len("lxrt_encoder.model."):]] = value 179 | # else: 180 | # new_state_dict[key] = value 181 | state_dict = new_state_dict 182 | 183 | # Print out the differences of pre-trained and model weights. 184 | load_keys = set(state_dict.keys()) 185 | model_keys = set(self.model.state_dict().keys()) 186 | print() 187 | print("Weights in loaded but not in model:") 188 | for key in sorted(load_keys.difference(model_keys)): 189 | print(key) 190 | print() 191 | print("Weights in model but not in loaded:") 192 | for key in sorted(model_keys.difference(load_keys)): 193 | print(key) 194 | print() 195 | 196 | # Load weights to model 197 | self.model.load_state_dict(state_dict, strict=False) 198 | 199 | 200 | 201 | 202 | -------------------------------------------------------------------------------- /src/lxrt/optimization.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2019 project LXRT 3 | # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | """PyTorch optimization for BERT model.""" 17 | 18 | import math 19 | import torch 20 | from torch.optim import Optimizer 21 | from torch.optim.optimizer import required 22 | import logging 23 | 24 | logger = logging.getLogger(__name__) 25 | 26 | def warmup_cosine(x, warmup=0.002): 27 | if x < warmup: 28 | return x/warmup 29 | return 0.5 * (1.0 + torch.cos(math.pi * x)) 30 | 31 | def warmup_constant(x, warmup=0.002): 32 | """ Linearly increases learning rate over `warmup`*`t_total` (as provided to BertAdam) training steps. 33 | Learning rate is 1. afterwards. """ 34 | if x < warmup: 35 | return x/warmup 36 | return 1.0 37 | 38 | def warmup_linear(x, warmup=0.002): 39 | """ Specifies a triangular learning rate schedule where peak is reached at `warmup`*`t_total`-th (as provided to BertAdam) training step. 40 | After `t_total`-th training step, learning rate is zero. """ 41 | if x < warmup: 42 | return x/warmup 43 | return max((x-1.)/(warmup-1.), 0) 44 | 45 | SCHEDULES = { 46 | 'warmup_cosine': warmup_cosine, 47 | 'warmup_constant': warmup_constant, 48 | 'warmup_linear': warmup_linear, 49 | } 50 | 51 | 52 | class BertAdam(Optimizer): 53 | """Implements BERT version of Adam algorithm with weight decay fix. 
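    The scheduled learning rate at step `t` is `lr * schedule(t / t_total, warmup)`;
    with the default 'warmup_linear' schedule it ramps linearly from 0 to `lr` over
    the first `warmup * t_total` steps, then decays linearly to 0 at `t_total`
    (see warmup_linear above). With t_total = -1 the raw `lr` is used throughout.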
54 | Params: 55 | lr: learning rate 56 | warmup: portion of t_total for the warmup, -1 means no warmup. Default: -1 57 | t_total: total number of training steps for the learning 58 | rate schedule, -1 means constant learning rate. Default: -1 59 | schedule: schedule to use for the warmup (see above). Default: 'warmup_linear' 60 | b1: Adams b1. Default: 0.9 61 | b2: Adams b2. Default: 0.999 62 | e: Adams epsilon. Default: 1e-6 63 | weight_decay: Weight decay. Default: 0.01 64 | max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0 65 | """ 66 | def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_linear', 67 | b1=0.9, b2=0.999, e=1e-6, weight_decay=0.01, 68 | max_grad_norm=1.0): 69 | if lr is not required and lr < 0.0: 70 | raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr)) 71 | if schedule not in SCHEDULES: 72 | raise ValueError("Invalid schedule parameter: {}".format(schedule)) 73 | if not 0.0 <= warmup < 1.0 and not warmup == -1: 74 | raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup)) 75 | if not 0.0 <= b1 < 1.0: 76 | raise ValueError("Invalid b1 parameter: {} - should be in [0.0, 1.0[".format(b1)) 77 | if not 0.0 <= b2 < 1.0: 78 | raise ValueError("Invalid b2 parameter: {} - should be in [0.0, 1.0[".format(b2)) 79 | if not e >= 0.0: 80 | raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e)) 81 | defaults = dict(lr=lr, schedule=schedule, warmup=warmup, t_total=t_total, 82 | b1=b1, b2=b2, e=e, weight_decay=weight_decay, 83 | max_grad_norm=max_grad_norm) 84 | super(BertAdam, self).__init__(params, defaults) 85 | 86 | def get_lr(self): 87 | lr = [] 88 | for group in self.param_groups: 89 | for p in group['params']: 90 | state = self.state[p] 91 | if len(state) == 0: 92 | return [0] 93 | if group['t_total'] != -1: 94 | schedule_fct = SCHEDULES[group['schedule']] 95 | lr_scheduled = group['lr'] * schedule_fct(state['step']/group['t_total'], group['warmup']) 96 | else: 97 | lr_scheduled = group['lr'] 98 | lr.append(lr_scheduled) 99 | return lr 100 | 101 | def step(self, closure=None): 102 | """Performs a single optimization step. 103 | 104 | Arguments: 105 | closure (callable, optional): A closure that reevaluates the model 106 | and returns the loss. 107 | """ 108 | loss = None 109 | if closure is not None: 110 | loss = closure() 111 | 112 | warned_for_t_total = False 113 | 114 | for group in self.param_groups: 115 | for p in group['params']: 116 | if p.grad is None: 117 | continue 118 | grad = p.grad.data 119 | if grad.is_sparse: 120 | raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead') 121 | 122 | state = self.state[p] 123 | 124 | # State initialization 125 | if len(state) == 0: 126 | state['step'] = 0 127 | # Exponential moving average of gradient values 128 | state['next_m'] = torch.zeros_like(p.data) 129 | # Exponential moving average of squared gradient values 130 | state['next_v'] = torch.zeros_like(p.data) 131 | 132 | next_m, next_v = state['next_m'], state['next_v'] 133 | beta1, beta2 = group['b1'], group['b2'] 134 | 135 | # LXRT: grad is clipped outside. 
136 | # Add grad clipping 137 | # if group['max_grad_norm'] > 0: 138 | # clip_grad_norm_(p, group['max_grad_norm']) 139 | 140 | # Decay the first and second moment running average coefficient 141 | # In-place operations to update the averages at the same time 142 | next_m.mul_(beta1).add_(1 - beta1, grad) 143 | next_v.mul_(beta2).addcmul_(1 - beta2, grad, grad) 144 | update = next_m / (next_v.sqrt() + group['e']) 145 | 146 | # Just adding the square of the weights to the loss function is *not* 147 | # the correct way of using L2 regularization/weight decay with Adam, 148 | # since that will interact with the m and v parameters in strange ways. 149 | # 150 | # Instead we want to decay the weights in a manner that doesn't interact 151 | # with the m/v parameters. This is equivalent to adding the square 152 | # of the weights to the loss with plain (non-momentum) SGD. 153 | if group['weight_decay'] > 0.0: 154 | update += group['weight_decay'] * p.data 155 | 156 | if group['t_total'] != -1: 157 | schedule_fct = SCHEDULES[group['schedule']] 158 | progress = state['step']/group['t_total'] 159 | lr_scheduled = group['lr'] * schedule_fct(progress, group['warmup']) 160 | # warning for exceeding t_total (only active with warmup_linear 161 | if group['schedule'] == "warmup_linear" and progress > 1. and not warned_for_t_total: 162 | logger.warning( 163 | "Training beyond specified 't_total' steps with schedule '{}'. Learning rate set to {}. " 164 | "Please set 't_total' of {} correctly.".format(group['schedule'], lr_scheduled, self.__class__.__name__)) 165 | warned_for_t_total = True 166 | # end warning 167 | else: 168 | lr_scheduled = group['lr'] 169 | 170 | update_with_lr = lr_scheduled * update 171 | p.data.add_(-update_with_lr) 172 | 173 | state['step'] += 1 174 | 175 | # step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1 176 | # No bias correction 177 | # bias_correction1 = 1 - beta1 ** state['step'] 178 | # bias_correction2 = 1 - beta2 ** state['step'] 179 | 180 | return loss 181 | -------------------------------------------------------------------------------- /src/tasks/vqa.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyleft 2019 project LXRT. 
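# Fine-tuning / evaluation entry point for VQA: builds train/valid DataTuples, wraps
# the LXRT encoder in VQAModel, and optimizes a BCE loss over soft answer scores.
# A typical launch mirrors the bash wrappers under run/ (flag values are illustrative):
#   PYTHONPATH=$PYTHONPATH:./src python src/tasks/vqa.py \
#       --train train --valid valid --llayers 9 --xlayers 5 --rlayers 5 \
#       --tqdm --output snap/vqa/my_exp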
3 | 4 | import os 5 | import collections 6 | 7 | import torch 8 | import torch.nn as nn 9 | from torch.utils.data.dataloader import DataLoader 10 | from tqdm import tqdm 11 | 12 | from param import args 13 | from pretrain.qa_answer_table import load_lxmert_qa 14 | from tasks.vqa_model import VQAModel 15 | from tasks.vqa_data import VQADataset, VQATorchDataset, VQAEvaluator 16 | 17 | DataTuple = collections.namedtuple("DataTuple", 'dataset loader evaluator') 18 | 19 | 20 | def get_data_tuple(splits: str, bs:int, shuffle=False, drop_last=False) -> DataTuple: 21 | dset = VQADataset(splits) 22 | tset = VQATorchDataset(dset) 23 | evaluator = VQAEvaluator(dset) 24 | data_loader = DataLoader( 25 | tset, batch_size=bs, 26 | shuffle=shuffle, num_workers=args.num_workers, 27 | drop_last=drop_last, pin_memory=True 28 | ) 29 | 30 | return DataTuple(dataset=dset, loader=data_loader, evaluator=evaluator) 31 | 32 | 33 | class VQA: 34 | def __init__(self): 35 | # Datasets 36 | self.train_tuple = get_data_tuple( 37 | args.train, bs=args.batch_size, shuffle=True, drop_last=True 38 | ) 39 | if args.valid != "": 40 | self.valid_tuple = get_data_tuple( 41 | args.valid, bs=1024, 42 | shuffle=False, drop_last=False 43 | ) 44 | else: 45 | self.valid_tuple = None 46 | 47 | # Model 48 | self.model = VQAModel(self.train_tuple.dataset.num_answers) 49 | 50 | # Load pre-trained weights 51 | if args.load_lxmert is not None: 52 | self.model.lxrt_encoder.load(args.load_lxmert) 53 | if args.load_lxmert_qa is not None: 54 | load_lxmert_qa(args.load_lxmert_qa, self.model, 55 | label2ans=self.train_tuple.dataset.label2ans) 56 | 57 | # GPU options 58 | self.model = self.model.cuda() 59 | if args.multiGPU: 60 | self.model.lxrt_encoder.multi_gpu() 61 | 62 | # Loss and Optimizer 63 | self.bce_loss = nn.BCEWithLogitsLoss() 64 | if 'bert' in args.optim: 65 | batch_per_epoch = len(self.train_tuple.loader) 66 | t_total = int(batch_per_epoch * args.epochs) 67 | print("BertAdam Total Iters: %d" % t_total) 68 | from lxrt.optimization import BertAdam 69 | self.optim = BertAdam(list(self.model.parameters()), 70 | lr=args.lr, 71 | warmup=0.1, 72 | t_total=t_total) 73 | else: 74 | self.optim = args.optimizer(self.model.parameters(), args.lr) 75 | 76 | # Output Directory 77 | self.output = args.output 78 | os.makedirs(self.output, exist_ok=True) 79 | 80 | def train(self, train_tuple, eval_tuple): 81 | dset, loader, evaluator = train_tuple 82 | iter_wrapper = (lambda x: tqdm(x, total=len(loader))) if args.tqdm else (lambda x: x) 83 | 84 | best_valid = 0. 85 | for epoch in range(args.epochs): 86 | quesid2ans = {} 87 | for i, (ques_id, feats, boxes, sent, target) in iter_wrapper(enumerate(loader)): 88 | 89 | self.model.train() 90 | self.optim.zero_grad() 91 | 92 | feats, boxes, target = feats.cuda(), boxes.cuda(), target.cuda() 93 | logit = self.model(feats, boxes, sent) 94 | assert logit.dim() == target.dim() == 2 95 | loss = self.bce_loss(logit, target) 96 | loss = loss * logit.size(1) 97 | 98 | loss.backward() 99 | nn.utils.clip_grad_norm_(self.model.parameters(), 5.) 100 | self.optim.step() 101 | 102 | score, label = logit.max(1) 103 | for qid, l in zip(ques_id, label.cpu().numpy()): 104 | ans = dset.label2ans[l] 105 | quesid2ans[qid.item()] = ans 106 | 107 | log_str = "\nEpoch %d: Train %0.2f\n" % (epoch, evaluator.evaluate(quesid2ans) * 100.) 
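            # After each epoch: log training accuracy, then (below) evaluate on the
            # held-out split if one was given and snapshot the best-scoring weights as
            # "BEST", in addition to the "LAST" checkpoint saved once training ends.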
108 | 109 | if self.valid_tuple is not None: # Do Validation 110 | valid_score = self.evaluate(eval_tuple) 111 | if valid_score > best_valid: 112 | best_valid = valid_score 113 | self.save("BEST") 114 | 115 | log_str += "Epoch %d: Valid %0.2f\n" % (epoch, valid_score * 100.) + \ 116 | "Epoch %d: Best %0.2f\n" % (epoch, best_valid * 100.) 117 | 118 | print(log_str, end='') 119 | 120 | with open(self.output + "/log.log", 'a') as f: 121 | f.write(log_str) 122 | f.flush() 123 | 124 | self.save("LAST") 125 | 126 | def predict(self, eval_tuple: DataTuple, dump=None): 127 | """ 128 | Predict the answers to questions in a data split. 129 | 130 | :param eval_tuple: The data tuple to be evaluated. 131 | :param dump: The path of saved file to dump results. 132 | :return: A dict of question_id to answer. 133 | """ 134 | self.model.eval() 135 | dset, loader, evaluator = eval_tuple 136 | quesid2ans = {} 137 | for i, datum_tuple in enumerate(loader): 138 | ques_id, feats, boxes, sent = datum_tuple[:4] # Avoid seeing ground truth 139 | with torch.no_grad(): 140 | feats, boxes = feats.cuda(), boxes.cuda() 141 | logit = self.model(feats, boxes, sent) 142 | score, label = logit.max(1) 143 | for qid, l in zip(ques_id, label.cpu().numpy()): 144 | ans = dset.label2ans[l] 145 | quesid2ans[qid.item()] = ans 146 | if dump is not None: 147 | evaluator.dump_result(quesid2ans, dump) 148 | return quesid2ans 149 | 150 | def evaluate(self, eval_tuple: DataTuple, dump=None): 151 | """Evaluate all data in data_tuple.""" 152 | quesid2ans = self.predict(eval_tuple, dump) 153 | return eval_tuple.evaluator.evaluate(quesid2ans) 154 | 155 | @staticmethod 156 | def oracle_score(data_tuple): 157 | dset, loader, evaluator = data_tuple 158 | quesid2ans = {} 159 | for i, (ques_id, feats, boxes, sent, target) in enumerate(loader): 160 | _, label = target.max(1) 161 | for qid, l in zip(ques_id, label.cpu().numpy()): 162 | ans = dset.label2ans[l] 163 | quesid2ans[qid.item()] = ans 164 | return evaluator.evaluate(quesid2ans) 165 | 166 | def save(self, name): 167 | torch.save(self.model.state_dict(), 168 | os.path.join(self.output, "%s.pth" % name)) 169 | 170 | def load(self, path): 171 | print("Load model from %s" % path) 172 | state_dict = torch.load("%s.pth" % path) 173 | self.model.load_state_dict(state_dict) 174 | 175 | 176 | if __name__ == "__main__": 177 | # Build Class 178 | vqa = VQA() 179 | 180 | # Load VQA model weights 181 | # Note: It is different from loading LXMERT pre-trained weights. 182 | if args.load is not None: 183 | vqa.load(args.load) 184 | 185 | # Test or Train 186 | if args.test is not None: 187 | args.fast = args.tiny = False # Always loading all data in test 188 | if 'test' in args.test: 189 | vqa.predict( 190 | get_data_tuple(args.test, bs=950, 191 | shuffle=False, drop_last=False), 192 | dump=os.path.join(args.output, 'test_predict.json') 193 | ) 194 | elif 'val' in args.test: 195 | # Since part of valididation data are used in pre-training/fine-tuning, 196 | # only validate on the minival set. 
197 | result = vqa.evaluate( 198 | get_data_tuple('minival', bs=950, 199 | shuffle=False, drop_last=False), 200 | dump=os.path.join(args.output, 'minival_predict.json') 201 | ) 202 | print(result) 203 | else: 204 | assert False, "No such test option for %s" % args.test 205 | else: 206 | print('Splits in Train data:', vqa.train_tuple.dataset.splits) 207 | if vqa.valid_tuple is not None: 208 | print('Splits in Valid data:', vqa.valid_tuple.dataset.splits) 209 | print("Valid Oracle: %0.2f" % (vqa.oracle_score(vqa.valid_tuple) * 100)) 210 | else: 211 | print("DO NOT USE VALIDATION") 212 | vqa.train(vqa.train_tuple, vqa.valid_tuple) 213 | 214 | 215 | -------------------------------------------------------------------------------- /src/tasks/refcocog_data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyleft 2019 project LXRT. 3 | 4 | import json 5 | import random 6 | 7 | import numpy as np 8 | import torch 9 | from torch.utils.data import Dataset 10 | 11 | from src import eval_utils 12 | from src.param import args 13 | from src.utils import load_obj_tsv, load_spatial_data 14 | 15 | # Load part of the dataset for fast checking. 16 | # Notice that here is the number of images instead of the number of data, 17 | # which means all related data to the images would be used. 18 | TINY_IMG_NUM = 512 19 | FAST_IMG_NUM = 5000 20 | 21 | 22 | class RefCOCOgDataset: 23 | """ 24 | A GQA data example in json file: 25 | { 26 | "caption": caption, 27 | "sent_id": sent_id, 28 | "image_id": image_id, 29 | "refBox": refBox, 30 | "ref_id": ref_id, --> unique id assigned to each data sample 31 | } 32 | """ 33 | def __init__(self, splits: str): 34 | self.name = splits 35 | self.splits = splits.split(',') 36 | 37 | # Loading datasets to data 38 | self.data = [] 39 | for split in self.splits: 40 | self.data.extend(json.load(open("../../data/refcocog/annotations_%s.json" % split))) 41 | print("Load %d data from split(s) %s." 
% (len(self.data), self.name)) 42 | 43 | 44 | # List to dict (for evaluation and others) 45 | self.id2datum = { 46 | datum['sent_id']: datum 47 | for datum in self.data 48 | } 49 | 50 | # Answers 51 | # self.ans2label = json.load(open("data/refcoco/trainval_ans2label.json")) 52 | # self.label2ans = json.load(open("data/refcoco/trainval_label2ans.json")) 53 | # assert len(self.ans2label) == len(self.label2ans) 54 | # for ans, label in self.ans2label.items(): 55 | # assert self.label2ans[label] == ans 56 | 57 | @property 58 | # def num_answers(self): 59 | # return len(self.ans2label) 60 | 61 | def __len__(self): 62 | return len(self.data) 63 | 64 | 65 | class RefCOCOgBufferLoader(): 66 | def __init__(self): 67 | self.key2data = {} 68 | 69 | def load_data(self, name, number): 70 | # if name == 'testdev': 71 | # # path = "data/vg_gqa_imgfeat/gqa_testdev_obj36.tsv" 72 | # path = "data/refcoco/refcoco_testdev_spatial.h5" 73 | # else: 74 | # # path = "data/vg_gqa_imgfeat/vg_gqa_obj36.tsv" 75 | # path = "data/refcoco/refcoco_testdev_spatial.h5" 76 | path = "../../data/refcocog/{}_features.hdf5".format(name) 77 | key = "%s_%d" % (path, number) 78 | if key not in self.key2data: 79 | self.key2data[key] = load_spatial_data( 80 | path, 81 | topk=number 82 | ) 83 | return self.key2data[key] 84 | 85 | 86 | refcocog_buffer_loader = RefCOCOgBufferLoader() 87 | 88 | 89 | """ 90 | Example in obj tsv: 91 | FIELDNAMES = ["img_id", "img_h", "img_w", "objects_id", "objects_conf", 92 | "attrs_id", "attrs_conf", "num_boxes", "boxes", "features"] 93 | """ 94 | class RefCOCOgTorchDataset(Dataset): 95 | def __init__(self, dataset: RefCOCOgDataset): 96 | super().__init__() 97 | self.weakly_supervise = args.train_paradigm == 'weak' 98 | self.raw_dataset = dataset 99 | # self.img_info_data = json.load('data/gqa/gqa_spatial_merged_info.json') 100 | 101 | if args.tiny: 102 | topk = TINY_IMG_NUM 103 | elif args.fast: 104 | topk = FAST_IMG_NUM 105 | else: 106 | topk = -1 107 | 108 | # Loading detection features to img_data 109 | # Since images in train and valid both come from Visual Genome, 110 | # buffer the image loading to save memory. 111 | img_data = [] 112 | if 'test' in dataset.splits or 'test' in dataset.splits: # Always loading all the data in testdev 113 | img_data.extend(refcocog_buffer_loader.load_data('test', -1)) 114 | elif 'valid' in dataset.splits or 'valid' in dataset.splits: # Always loading all the data in testdev 115 | img_data.extend(refcocog_buffer_loader.load_data('valid', -1)) 116 | else: 117 | img_data.extend(refcocog_buffer_loader.load_data('train', topk)) 118 | self.imgid2img = {} 119 | for img_datum in img_data: 120 | self.imgid2img[img_datum['image_id']] = img_datum 121 | 122 | # Only kept the data with loaded image features 123 | self.data = [] 124 | for datum in self.raw_dataset.data: 125 | if datum['image_id'] in self.imgid2img: 126 | self.data.append(datum) 127 | print("Use %d data in torch dataset" % (len(self.data))) 128 | print() 129 | 130 | def __len__(self): 131 | return len(self.data) 132 | 133 | def __getitem__(self, item: int): 134 | datum = self.data[item] 135 | 136 | img_id = datum['image_id'] 137 | sent_id = datum['sent_id'] 138 | sent = datum['caption'] 139 | 140 | # If weakly supervision, replace the sentence with an sentence 141 | # corresponding to other image. 
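        # Sampling sketch for the weak setting: with probability 0.5 the caption is
        # swapped for one drawn from a different image and is_matched is set to 0, so a
        # downstream image-text matching loss can stand in for box supervision.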
142 |         is_matched = 1
143 |         if self.weakly_supervise:
144 |             if random.random() < 0.5:
145 |                 is_matched = 0
146 |                 other_datum = self.data[random.randint(0, len(self.data) - 1)]
147 |                 while other_datum['image_id'] == img_id:
148 |                     other_datum = self.data[random.randint(0, len(self.data) - 1)]
149 |                 sent = other_datum['caption']
150 | 
151 |         # Get image info
152 |         img_info = self.imgid2img[img_id]
153 |         obj_num = img_info['num_boxes']
154 |         # boxes = img_info['boxes'].copy()
155 | 
156 |         feats = img_info['features'].copy()
157 |         ##Aisha change:
158 | 
159 |         boxes = np.ones(feats.shape[1]*feats.shape[2]+1, dtype=np.float32) #assuming feats of shape [d, h, w]
160 |         # assert len(boxes) == len(feats) == obj_num
161 | 
162 |         target_box = datum['refBox']
163 |         # Normalize the boxes (to 0 ~ 1)
164 |         img_h, img_w = img_info['img_h'], img_info['img_w']
165 |         target_box = target_box.copy()
166 |         # target_box[:, (0, 2)] /= img_w
167 |         # target_box[:, (1, 3)] /= img_h
168 |         target_box[0] /= img_w
169 |         target_box[2] /= img_w
170 |         target_box[1] /= img_h
171 |         target_box[3] /= img_h
172 |         np.testing.assert_array_less(np.array(target_box), 1+1e-5)
173 |         np.testing.assert_array_less(-np.array(target_box), 0+1e-5)
174 | 
175 |         # Create target
176 |         # if 'label' in datum:
177 |         #     label = datum['label']
178 |         #     target = torch.zeros(self.raw_dataset.num_answers)
179 |         #     for ans, score in label.items():
180 |         #         if ans in self.raw_dataset.ans2label:
181 |         #             target[self.raw_dataset.ans2label[ans]] = score
182 |         #     return ref_id, feats, target_box, sent, target
183 |         # else:
184 |         return sent_id, feats, boxes, sent, torch.tensor(target_box), is_matched
185 | 
186 | 
187 | class RefCOCOgEvaluator:
188 |     def __init__(self, dataset: RefCOCOgDataset):
189 |         self.dataset = dataset
190 | 
191 |     def evaluate(self, sentid2box: dict):
192 |         sid2iou = {}
193 | 
194 |         for sentid, pred_box in sentid2box.items():
195 |             datum = self.dataset.id2datum[sentid.item()]
196 |             gt_box = datum['refBox']
197 |             miou, accu = eval_utils.trans_vg_eval_val(torch.as_tensor(pred_box), torch.as_tensor(gt_box))
198 |             sid2iou[sentid] = miou.detach().cpu().numpy()
199 | 
200 |         accu = self.iou_acc(sid2iou)
201 |         return accu.float() / len(sentid2box)
202 | 
203 |     def iou_acc(self, sid2iou):
204 |         accu = torch.sum(torch.FloatTensor(list(sid2iou.values())) >= 0.5)
205 |         return accu
206 | 
207 |     def save_json(self, data, file_path):
208 |         with open(file_path, "w") as f:
209 |             json.dump(data, f)
210 | 
211 |     def dump_result(self, sentid2box: dict, path):
212 |         """
213 |         Dump the result to a GQA-challenge submittable json file.
214 |         GQA json file submission requirement:
215 |             results = [result]
216 |             result = {
217 |                 "questionId": str,   # Note: it's actually an int, but the server requires a str.
218 |                 "prediction": str
219 |             }
220 | 
221 |         :param sentid2box: A dict mapping sentence id to its predicted box.
222 |         :param path: The file path to save the json file.
223 | :return: 224 | """ 225 | with open(path, 'w') as f: 226 | result = [] 227 | for sent_id, box in sentid2box.items(): 228 | result.append({ 229 | 'questionId': sent_id, 230 | 'prediction': box 231 | }) 232 | json.dump(result, f, indent=4, sort_keys=True) 233 | 234 | 235 | -------------------------------------------------------------------------------- /src/utils.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyleft 2019 Project LXRT 3 | import base64 4 | import csv 5 | import json 6 | import os 7 | import sys 8 | import time 9 | 10 | import h5py 11 | import numpy as np 12 | 13 | csv.field_size_limit(sys.maxsize) 14 | FIELDNAMES = ["img_id", "img_h", "img_w", "objects_id", "objects_conf", 15 | "attrs_id", "attrs_conf", "num_boxes", "boxes", "features"] 16 | 17 | 18 | def load_json(file_path): 19 | with open(file_path, "r") as f: 20 | return json.load(f) 21 | 22 | 23 | def save_json(data, file_path): 24 | with open(file_path, "w") as f: 25 | json.dump(data, f) 26 | 27 | 28 | def load_obj_tsv(fname, topk=None): 29 | """Load object features from tsv file. 30 | 31 | :param fname: The path to the tsv file. 32 | :param topk: Only load features for top K images (lines) in the tsv file. 33 | Will load all the features if topk is either -1 or None. 34 | :return: A list of image object features where each feature is a dict. 35 | See FILENAMES above for the keys in the feature dict. 36 | """ 37 | data = [] 38 | start_time = time.time() 39 | print("Start to load Faster-RCNN detected objects from %s" % fname) 40 | 41 | with open(fname) as f: 42 | reader = csv.DictReader(f, FIELDNAMES, delimiter="\t") 43 | for i, item in enumerate(reader): 44 | 45 | for key in ['img_h', 'img_w', 'num_boxes']: 46 | if item[key] == '': 47 | item[key] = 0 48 | item[key] = int(item[key]) 49 | 50 | boxes = item['num_boxes'] 51 | decode_config = [ 52 | ('objects_id', (boxes,), np.int64), 53 | ('objects_conf', (boxes,), np.float32), 54 | ('attrs_id', (boxes,), np.int64), 55 | ('attrs_conf', (boxes,), np.float32), 56 | ('boxes', (boxes, 4), np.float32), 57 | ('features', (7, 7, 1024), np.float64), 58 | ] 59 | for key, shape, dtype in decode_config: 60 | if key == 'features': 61 | decoded_item = base64.b64decode(item[key]) 62 | item[key] = np.frombuffer(decoded_item) # todo: replace dummy data with orig features 63 | else: 64 | item[key] = np.frombuffer(base64.b64decode(item[key]), dtype=dtype) 65 | item[key] = item[key].reshape(shape) 66 | item[key].setflags(write=False) 67 | 68 | data.append(item) 69 | if topk is not None and len(data) == topk: 70 | break 71 | elapsed_time = time.time() - start_time 72 | print("Loaded %d images in file %s in %d seconds." % (len(data), fname, elapsed_time)) 73 | return data 74 | 75 | ###### additional functions, Author: Aisha Urooj ####### 76 | def load_spatial_data(fname, topk=None): 77 | """Load object features from tsv file. 78 | 79 | :param fname: The path to the tsv file. 80 | :param topk: Only load features for top K images (lines) in the tsv file. 81 | Will load all the features if topk is either -1 or None. 82 | :return: A list of image object features where each feature is a dict. 83 | See FILENAMES above for the keys in the feature dict. 
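    In practice `fname` points at an hdf5 file of the form `<dir>/{split}_features.hdf5`;
    the function also reads a companion `<dir>/img_id2idx_{split}.json` mapping and pulls
    each image's grid features out of the hdf5 `data` dataset (see the body below), so,
    unlike load_obj_tsv above, no tsv file is read here.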
84 | """ 85 | data = [] 86 | fparts = fname.split('/') 87 | print(fparts[:-1]) 88 | fpath = os.path.join(*fparts[:-1]) 89 | fn = fparts[-1] 90 | split = fn.split('_')[0] 91 | mapping_fn = os.path.join(fpath, 'img_id2idx_{}.json'.format(split)) 92 | start_time = time.time() 93 | print("Reading %s file" % mapping_fn) 94 | img_id2idx_dict = load_json(mapping_fn) 95 | print("Start to load ResNet152 features from %s" % fname) 96 | 97 | h = h5py.File(os.path.join(fpath, '{}_features.hdf5'.format(split)), 'r') 98 | img_features = h['data'] 99 | 100 | for img_id, item in img_id2idx_dict.items(): 101 | item["features"] = img_features[item["i"]] 102 | item["img_id"] = img_id 103 | 104 | for key in ['objects_id', 'objects_conf', 'attrs_id', 'attrs_conf', 'boxes', 'features']: 105 | if item[key] is not None: 106 | item[key].setflags(write=False) 107 | else: 108 | if key == 'boxes': 109 | item[key] = np.zeros((1, 4)) 110 | else: 111 | item[key] = np.array([0, 0]) 112 | 113 | data.append(item) 114 | 115 | if topk is not None and len(data) == topk: 116 | break 117 | 118 | elapsed_time = time.time() - start_time 119 | print("Loaded %d images in file %s in %d seconds." % (len(data), fname, elapsed_time)) 120 | 121 | return data 122 | 123 | 124 | def load_spatial_gqa(fname, topk=None): 125 | """Load object features from tsv file. 126 | 127 | :param fname: The path to the tsv file. 128 | :param topk: Only load features for top K images (lines) in the tsv file. 129 | Will load all the features if topk is either -1 or None. 130 | :return: A list of image object features where each feature is a dict. 131 | See FILENAMES above for the keys in the feature dict. 132 | """ 133 | # todo: adopt function to read gqa data 134 | data = [] 135 | fparts = fname.split('/') 136 | print(fparts[:-1]) 137 | fpath = os.path.join(*fparts[:-1]) 138 | fn = fparts[-1] 139 | split = fn.split('_')[0] 140 | mapping_fn = os.path.join(fpath, 'gqa_spatial_merged_info.json') 141 | start_time = time.time() 142 | print("Reading %s file" % mapping_fn) 143 | img_id2idx_dict = load_json(mapping_fn) 144 | print("Start to load ResNet101 features from %s" % fname) 145 | 146 | h = h5py.File(os.path.join(fpath, 'gqa_spatial.h5'), 'r') 147 | img_features = h['features'] 148 | 149 | for img_id, item in img_id2idx_dict.items(): 150 | item["features"] = img_features[item["index"]] 151 | item["img_id"] = img_id 152 | 153 | item['objects_id'] = None 154 | item['objects_conf'] = None 155 | item['attrs_id'] = None 156 | item['attrs_conf'] = None 157 | item['boxes'] = None 158 | item['num_boxes'] = 0 159 | 160 | for key in ['objects_id', 'objects_conf', 'attrs_id', 'attrs_conf', 'boxes', 'features']: 161 | if item[key] is not None: 162 | item[key].setflags(write=False) 163 | else: 164 | if key == 'boxes': 165 | item[key] = np.zeros((1, 4)) 166 | else: 167 | item[key] = np.array([0, 0]) 168 | 169 | data.append(item) 170 | 171 | if topk is not None and len(data) == topk: 172 | break 173 | elapsed_time = time.time() - start_time 174 | print("Loaded %d images in file %s in %d seconds." % (len(data), fname, elapsed_time)) 175 | return data 176 | 177 | 178 | def load_patches(fname, dataset='', topk=None): 179 | """Load object features from tsv file. 180 | 181 | :param fname: The path to the tsv file. 182 | :param topk: Only load features for top K images (lines) in the tsv file. 183 | Will load all the features if topk is either -1 or None. 184 | :return: A list of image object features where each feature is a dict. 
185 | See FILENAMES above for the keys in the feature dict. 186 | """ 187 | # todo: adopt function to read gqa data 188 | assert dataset != '' 189 | data = [] 190 | fparts = fname.split('/') 191 | print(fparts[:-1]) 192 | fpath = os.path.join(*fparts[:-1]) 193 | fn = fparts[-1] 194 | split = fn.split('_')[0] 195 | mapping_fn = os.path.join(fpath, 'img_id2idx_{dataset}_{split}_32x32.json'.format(dataset=dataset, 196 | split=split)) 197 | start_time = time.time() 198 | print("Reading %s file" % mapping_fn) 199 | img_id2idx_dict = load_json(mapping_fn) 200 | print("Start to load image patches from %s" % fname) 201 | 202 | h = h5py.File(os.path.join(fpath, '{split}_patches_32x32.hdf5'.format(split=split)), 'r') 203 | img_features = h['data'] 204 | 205 | for img_id, item in img_id2idx_dict.items(): 206 | item["features"] = img_features[item["i"]] 207 | item["img_id"] = img_id 208 | 209 | item['objects_id'] = None 210 | item['objects_conf'] = None 211 | item['attrs_id'] = None 212 | item['attrs_conf'] = None 213 | item['boxes'] = None 214 | item['num_boxes'] = 0 215 | 216 | for key in ['objects_id', 'objects_conf', 'attrs_id', 'attrs_conf', 'boxes', 'features']: 217 | if item[key] is not None: 218 | item[key].setflags(write=False) 219 | else: 220 | if key == 'boxes': 221 | item[key] = np.zeros((1, 4)) 222 | else: 223 | item[key] = np.array([0, 0]) 224 | 225 | data.append(item) 226 | if topk is not None and len(data) == topk: 227 | break 228 | 229 | elapsed_time = time.time() - start_time 230 | print("Loaded %d images in file %s in %d seconds." % (len(data), fname, elapsed_time)) 231 | return data 232 | -------------------------------------------------------------------------------- /src/lxrt/file_utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | Utilities for working with the local dataset cache. 3 | This file is adapted from the AllenNLP library at https://github.com/allenai/allennlp 4 | Copyright by the AllenNLP authors. 5 | """ 6 | import json 7 | import logging 8 | import os 9 | import shutil 10 | import sys 11 | import tempfile 12 | from functools import wraps 13 | from hashlib import sha256 14 | from io import open 15 | 16 | import boto3 17 | import requests 18 | from botocore.exceptions import ClientError 19 | from tqdm import tqdm 20 | 21 | try: 22 | from urllib.parse import urlparse 23 | except ImportError: 24 | from urlparse import urlparse 25 | 26 | try: 27 | from pathlib import Path 28 | PYTORCH_PRETRAINED_BERT_CACHE = Path(os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', 29 | Path.home() / '.pytorch_pretrained_bert')) 30 | except (AttributeError, ImportError): 31 | PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', 32 | os.path.join(os.path.expanduser("~"), '.pytorch_pretrained_bert')) 33 | 34 | logger = logging.getLogger(__name__) # pylint: disable=invalid-name 35 | 36 | 37 | def url_to_filename(url, etag=None): 38 | """ 39 | Convert `url` into a hashed filename in a repeatable way. 40 | If `etag` is specified, append its hash to the url's, delimited 41 | by a period. 42 | """ 43 | url_bytes = url.encode('utf-8') 44 | url_hash = sha256(url_bytes) 45 | filename = url_hash.hexdigest() 46 | 47 | if etag: 48 | etag_bytes = etag.encode('utf-8') 49 | etag_hash = sha256(etag_bytes) 50 | filename += '.' + etag_hash.hexdigest() 51 | 52 | return filename 53 | 54 | 55 | def filename_to_url(filename, cache_dir=None): 56 | """ 57 | Return the url and etag (which may be ``None``) stored for `filename`. 
58 | Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist. 59 | """ 60 | if cache_dir is None: 61 | cache_dir = PYTORCH_PRETRAINED_BERT_CACHE 62 | if sys.version_info[0] == 3 and isinstance(cache_dir, Path): 63 | cache_dir = str(cache_dir) 64 | 65 | cache_path = os.path.join(cache_dir, filename) 66 | if not os.path.exists(cache_path): 67 | raise EnvironmentError("file {} not found".format(cache_path)) 68 | 69 | meta_path = cache_path + '.json' 70 | if not os.path.exists(meta_path): 71 | raise EnvironmentError("file {} not found".format(meta_path)) 72 | 73 | with open(meta_path, encoding="utf-8") as meta_file: 74 | metadata = json.load(meta_file) 75 | url = metadata['url'] 76 | etag = metadata['etag'] 77 | 78 | return url, etag 79 | 80 | 81 | def cached_path(url_or_filename, cache_dir=None): 82 | """ 83 | Given something that might be a URL (or might be a local path), 84 | determine which. If it's a URL, download the file and cache it, and 85 | return the path to the cached file. If it's already a local path, 86 | make sure the file exists and then return the path. 87 | """ 88 | if cache_dir is None: 89 | cache_dir = PYTORCH_PRETRAINED_BERT_CACHE 90 | if sys.version_info[0] == 3 and isinstance(url_or_filename, Path): 91 | url_or_filename = str(url_or_filename) 92 | if sys.version_info[0] == 3 and isinstance(cache_dir, Path): 93 | cache_dir = str(cache_dir) 94 | 95 | parsed = urlparse(url_or_filename) 96 | 97 | if parsed.scheme in ('http', 'https', 's3'): 98 | # URL, so get it from the cache (downloading if necessary) 99 | return get_from_cache(url_or_filename, cache_dir) 100 | elif os.path.exists(url_or_filename): 101 | # File, and it exists. 102 | return url_or_filename 103 | elif parsed.scheme == '': 104 | # File, but it doesn't exist. 105 | raise EnvironmentError("file {} not found".format(url_or_filename)) 106 | else: 107 | # Something unknown 108 | raise ValueError("unable to parse {} as a URL or as a local path".format(url_or_filename)) 109 | 110 | 111 | def split_s3_path(url): 112 | """Split a full s3 path into the bucket name and path.""" 113 | parsed = urlparse(url) 114 | if not parsed.netloc or not parsed.path: 115 | raise ValueError("bad s3 path {}".format(url)) 116 | bucket_name = parsed.netloc 117 | s3_path = parsed.path 118 | # Remove '/' at beginning of path. 119 | if s3_path.startswith("/"): 120 | s3_path = s3_path[1:] 121 | return bucket_name, s3_path 122 | 123 | 124 | def s3_request(func): 125 | """ 126 | Wrapper function for s3 requests in order to create more helpful error 127 | messages. 
128 | """ 129 | 130 | @wraps(func) 131 | def wrapper(url, *args, **kwargs): 132 | try: 133 | return func(url, *args, **kwargs) 134 | except ClientError as exc: 135 | if int(exc.response["Error"]["Code"]) == 404: 136 | raise EnvironmentError("file {} not found".format(url)) 137 | else: 138 | raise 139 | 140 | return wrapper 141 | 142 | 143 | @s3_request 144 | def s3_etag(url): 145 | """Check ETag on S3 object.""" 146 | s3_resource = boto3.resource("s3") 147 | bucket_name, s3_path = split_s3_path(url) 148 | s3_object = s3_resource.Object(bucket_name, s3_path) 149 | return s3_object.e_tag 150 | 151 | 152 | @s3_request 153 | def s3_get(url, temp_file): 154 | """Pull a file directly from S3.""" 155 | s3_resource = boto3.resource("s3") 156 | bucket_name, s3_path = split_s3_path(url) 157 | s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file) 158 | 159 | 160 | def http_get(url, temp_file): 161 | req = requests.get(url, stream=True) 162 | content_length = req.headers.get('Content-Length') 163 | total = int(content_length) if content_length is not None else None 164 | progress = tqdm(unit="B", total=total) 165 | for chunk in req.iter_content(chunk_size=1024): 166 | if chunk: # filter out keep-alive new chunks 167 | progress.update(len(chunk)) 168 | temp_file.write(chunk) 169 | progress.close() 170 | 171 | 172 | def get_from_cache(url, cache_dir=None): 173 | """ 174 | Given a URL, look for the corresponding dataset in the local cache. 175 | If it's not there, download it. Then return the path to the cached file. 176 | """ 177 | if cache_dir is None: 178 | cache_dir = PYTORCH_PRETRAINED_BERT_CACHE 179 | if sys.version_info[0] == 3 and isinstance(cache_dir, Path): 180 | cache_dir = str(cache_dir) 181 | 182 | if not os.path.exists(cache_dir): 183 | os.makedirs(cache_dir) 184 | 185 | # Get eTag to add to filename, if it exists. 186 | if url.startswith("s3://"): 187 | etag = s3_etag(url) 188 | else: 189 | response = requests.head(url, allow_redirects=True) 190 | if response.status_code != 200: 191 | raise IOError("HEAD request failed for url {} with status code {}" 192 | .format(url, response.status_code)) 193 | etag = response.headers.get("ETag") 194 | 195 | filename = url_to_filename(url, etag) 196 | 197 | # get cache path to put the file 198 | cache_path = os.path.join(cache_dir, filename) 199 | 200 | if not os.path.exists(cache_path): 201 | # Download to temporary file, then copy to cache dir once finished. 202 | # Otherwise you get corrupt cache entries if the download gets interrupted. 
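        # The download below goes to a NamedTemporaryFile first and is copied into
        # cache_path only after it finishes, so an interrupted transfer never leaves a
        # truncated file under the cache key. (Writing the temp file inside cache_dir and
        # finishing with os.replace() would also make the final step atomic; the explicit
        # copy is simply what this helper does.)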
203 | with tempfile.NamedTemporaryFile() as temp_file: 204 | logger.info("%s not found in cache, downloading to %s", url, temp_file.name) 205 | 206 | # GET file object 207 | if url.startswith("s3://"): 208 | s3_get(url, temp_file) 209 | else: 210 | http_get(url, temp_file) 211 | 212 | # we are copying the file before closing it, so flush to avoid truncation 213 | temp_file.flush() 214 | # shutil.copyfileobj() starts at the current position, so go to the start 215 | temp_file.seek(0) 216 | 217 | logger.info("copying %s to cache at %s", temp_file.name, cache_path) 218 | with open(cache_path, 'wb') as cache_file: 219 | shutil.copyfileobj(temp_file, cache_file) 220 | 221 | logger.info("creating metadata file for %s", cache_path) 222 | meta = {'url': url, 'etag': etag} 223 | meta_path = cache_path + '.json' 224 | with open(meta_path, 'w', encoding="utf-8") as meta_file: 225 | json.dump(meta, meta_file) 226 | 227 | logger.info("removing temp file %s", temp_file.name) 228 | 229 | return cache_path 230 | 231 | 232 | def read_set_from_file(filename): 233 | ''' 234 | Extract a de-duped collection (set) of text from a file. 235 | Expected file format is one item per line. 236 | ''' 237 | collection = set() 238 | with open(filename, 'r', encoding='utf-8') as file_: 239 | for line in file_: 240 | collection.add(line.rstrip()) 241 | return collection 242 | 243 | 244 | def get_file_extension(path, dot=True, lower=True): 245 | ext = os.path.splitext(path)[1] 246 | ext = ext if dot else ext[1:] 247 | return ext.lower() if lower else ext 248 | -------------------------------------------------------------------------------- /src/tasks/mscoco_retrieval_data.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyleft 2019 project LXRT. 3 | 4 | import json 5 | 6 | import numpy as np 7 | import torch 8 | from torch.utils.data import Dataset 9 | 10 | from src.param import args 11 | from src.utils import load_obj_tsv, load_spatial_data 12 | 13 | # Load part of the dataset for fast checking. 14 | # Notice that here is the number of images instead of the number of data, 15 | # which means all related data to the images would be used. 16 | TINY_IMG_NUM = 512 17 | FAST_IMG_NUM = 5000 18 | 19 | def make_uid(img_id, dset, sent_idx): 20 | return "%s_%s_%03d" % (img_id, dset, sent_idx), 21 | 22 | class MSCOCODataset: 23 | """ 24 | A GQA data example in json file: 25 | { 26 | "caption": caption, 27 | "sent_id": sent_id, 28 | "image_id": image_id, 29 | "refBox": refBox, 30 | "ref_id": ref_id, --> unique id assigned to each data sample 31 | } 32 | """ 33 | def __init__(self, splits: str): 34 | self.name = splits 35 | self.splits = splits.split(',') 36 | 37 | # Loading datasets to data 38 | self.data = [] 39 | for split in self.splits: 40 | if split == 'train': 41 | self.data.extend( 42 | json.load(open("/media/data/data/data/lxmert/mscoco_%s.json" % split))) 43 | else: 44 | self.data.extend(json.load(open("/media/data/data/data/lxmert/mscoco_karpathy_retrieval_%s.json" % split))) 45 | print("Load %d data from split(s) %s." 
% (len(self.data), self.name))
 46 | 
 47 |         data_flattened = self.flatten_data()
 48 |         self.data = data_flattened
 49 |         # List to dict (for evaluation and others)
 50 |         self.id2datum = {
 51 |             datum['uid']: datum
 52 |             for datum in self.data
 53 |         }
 54 | 
 55 |         # Answers
 56 |         # self.ans2label = json.load(open("data/refcoco/trainval_ans2label.json"))
 57 |         # self.label2ans = json.load(open("data/refcoco/trainval_label2ans.json"))
 58 |         # assert len(self.ans2label) == len(self.label2ans)
 59 |         # for ans, label in self.ans2label.items():
 60 |         #     assert self.label2ans[label] == ans
 61 | 
 62 |     def flatten_data(self):
 63 |         data_flattened = []
 64 |         for datum in self.data:
 65 |             sentf = datum['sentf']
 66 |             for sents_cat, sents in sentf.items():
 67 |                 if sents_cat == 'mscoco':
 68 |                     # print(sents_cat)
 69 |                     if sents_cat in datum['labelf']:
 70 |                         labels = datum['labelf'][sents_cat]
 71 |                     else:
 72 |                         labels = None
 73 |                     for sent_idx, sent in enumerate(sents):
 74 |                         new_datum = {
 75 |                             'uid': make_uid(datum['img_id'], sents_cat, sent_idx),
 76 |                             'img_id': datum['img_id'],
 77 |                             'sent_id': sent_idx,
 78 |                             'sent': sent
 79 |                         }
 80 |                         if labels is not None:
 81 |                             new_datum['label'] = labels[sent_idx]
 82 |                         data_flattened.append(new_datum)
 83 |                     break
 84 |         print("Flattened into %d image-sentence pairs." % (len(data_flattened)))
 85 |         return data_flattened
 86 | 
 87 |     # @property  # kept commented: an active decorator here would wrongly attach to __len__
 88 |     # def num_answers(self):
 89 |     #     return len(self.ans2label)
 90 | 
 91 |     def __len__(self):
 92 |         return len(self.data)
 93 | 
 94 | 
 95 | class MSCOCOBufferLoader():
 96 |     def __init__(self):
 97 |         self.key2data = {}
 98 | 
 99 |     def load_data(self, name, number):
100 |         # if name == 'testdev':
101 |         #     # path = "data/vg_gqa_imgfeat/gqa_testdev_obj36.tsv"
102 |         #     path = "data/refcoco/refcoco_testdev_spatial.h5"
103 |         # else:
104 |         #     # path = "data/vg_gqa_imgfeat/vg_gqa_obj36.tsv"
105 |         #     path = "data/refcoco/refcoco_testdev_spatial.h5"
106 |         path = "/media/data/data/data/mscoco_imgfeat/{}_features.hdf5".format(name)
107 |         key = "%s_%d" % (path, number)
108 |         if key not in self.key2data:
109 |             self.key2data[key] = load_spatial_data(
110 |                 path,
111 |                 topk=number
112 |             )
113 |         return self.key2data[key]
114 | 
115 | 
116 | mscoco_buffer_loader = MSCOCOBufferLoader()
117 | 
118 | 
119 | """
120 | Example in obj tsv:
121 | FIELDNAMES = ["img_id", "img_h", "img_w", "objects_id", "objects_conf",
122 |               "attrs_id", "attrs_conf", "num_boxes", "boxes", "features"]
123 | """
124 | class MSCOCOTorchDataset(Dataset):
125 |     def __init__(self, dataset: MSCOCODataset):
126 |         super().__init__()
127 |         self.raw_dataset = dataset
128 |         # self.img_info_data = json.load('data/gqa/gqa_spatial_merged_info.json')
129 | 
130 |         if args.tiny:
131 |             topk = TINY_IMG_NUM
132 |         elif args.fast:
133 |             topk = FAST_IMG_NUM
134 |         else:
135 |             topk = -1
136 | 
137 |         # Loading detection features to img_data
138 |         # Since images in train and valid both come from MS COCO,
139 |         # buffer the image loading to save memory.
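        # The module-level buffer loader below caches one feature dict per (path, topk)
        # key, so re-instantiating the dataset reuses already-loaded HDF5 features.
        # test/valid splits are always loaded in full; only the train split honours the
        # topk cap chosen above via --tiny / --fast.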
140 | img_data = [] 141 | if 'test' in dataset.splits or 'test' in dataset.splits: # Always loading all the data in testdev 142 | img_data.extend(mscoco_buffer_loader.load_data('test', -1)) 143 | elif 'valid' in dataset.splits or 'valid' in dataset.splits: # Always loading all the data in testdev 144 | img_data.extend(mscoco_buffer_loader.load_data('valid', -1)) 145 | else: 146 | img_data.extend(mscoco_buffer_loader.load_data('train', topk)) 147 | self.imgid2img = {} 148 | for img_datum in img_data: 149 | self.imgid2img[img_datum['img_id']] = img_datum 150 | 151 | # Only kept the data with loaded image features 152 | self.data = [] 153 | for datum in self.raw_dataset.data: 154 | if datum['img_id'] in self.imgid2img: 155 | self.data.append(datum) 156 | print("Use %d data in torch dataset" % (len(self.data))) 157 | print() 158 | 159 | def __len__(self): 160 | return len(self.data) 161 | 162 | def __getitem__(self, item: int): 163 | datum = self.data[item] 164 | 165 | img_id = datum['img_id'] 166 | sent_id = datum['uid'] 167 | sent = datum['sent'] 168 | 169 | # Get image info 170 | img_info = self.imgid2img[img_id] 171 | obj_num = img_info['num_boxes'] 172 | # boxes = img_info['boxes'].copy() 173 | 174 | feats = img_info['features'].copy() 175 | ##Aisha change: 176 | 177 | boxes = np.ones(feats.shape[1]*feats.shape[2]+1, dtype=np.float32) #assuming feats of shape [d, h, w] 178 | # assert len(boxes) == len(feats) == obj_num 179 | 180 | # target_box = datum['refBox'] 181 | # # Normalize the boxes (to 0 ~ 1) 182 | # img_h, img_w = img_info['img_h'], img_info['img_w'] 183 | # target_box = target_box.copy() 184 | # # target_box[:, (0, 2)] /= img_w 185 | # # target_box[:, (1, 3)] /= img_h 186 | # target_box[0] /= img_w 187 | # target_box[2] /= img_w 188 | # target_box[1] /= img_h 189 | # target_box[3] /= img_h 190 | # np.testing.assert_array_less(np.array(target_box), 1+1e-5) 191 | # np.testing.assert_array_less(-np.array(target_box), 0+1e-5) 192 | 193 | # Create target 194 | # if 'label' in datum: 195 | # label = datum['label'] 196 | # target = torch.zeros(self.raw_dataset.num_answers) 197 | # for ans, score in label.items(): 198 | # if ans in self.raw_dataset.ans2label: 199 | # target[self.raw_dataset.ans2label[ans]] = score 200 | # return ref_id, feats, target_box, sent, target 201 | # else: 202 | return sent_id, feats, boxes, sent 203 | 204 | 205 | class MSCOCOEvaluator: 206 | def __init__(self, dataset: MSCOCODataset): 207 | self.dataset = dataset 208 | 209 | def evaluate(self, sentid2box: dict): 210 | score = 0. 211 | for sentid, box in sentid2box.items(): 212 | datum = self.dataset.id2datum[sentid] 213 | label = datum['refBox'] 214 | if box in label: 215 | score += label[box] 216 | return score / len(sentid2box) 217 | 218 | def save_json(self, data, file_path): 219 | with open(file_path, "w") as f: 220 | json.dump(data, f) 221 | 222 | def dump_result(self, quesid2ans: dict, path): 223 | """ 224 | Dump the result to a GQA-challenge submittable json file. 225 | GQA json file submission requirement: 226 | results = [result] 227 | result = { 228 | "questionId": str, # Note: it's a actually an int number but the server requires an str. 229 | "prediction": str 230 | } 231 | 232 | :param quesid2ans: A dict mapping question id to its predicted answer. 233 | :param path: The file path to save the json file. 
234 | :return: 235 | """ 236 | with open(path, 'w') as f: 237 | result = [] 238 | for ques_id, ans in quesid2ans.items(): 239 | result.append({ 240 | 'questionId': ques_id, 241 | 'prediction': ans 242 | }) 243 | json.dump(result, f, indent=4, sort_keys=True) 244 | 245 | 246 | -------------------------------------------------------------------------------- /src/tasks/refcocoplus.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyleft 2019 project LXRT. 3 | 4 | import os 5 | import collections 6 | 7 | import gc 8 | import torch 9 | from tqdm import tqdm 10 | import torch.nn as nn 11 | from torch.utils.data.dataloader import DataLoader 12 | 13 | from src.param import args 14 | from src.pretrain.qa_answer_table import load_lxmert_qa 15 | from src.tasks.refcocoplus_model import RefCOCOplusModel 16 | from src.tasks.refcocoplus_data import RefCOCOplusDataset, RefCOCOplusTorchDataset, RefCOCOplusEvaluator 17 | 18 | print(args) 19 | DataTuple = collections.namedtuple("DataTuple", 'dataset loader evaluator') 20 | 21 | 22 | def get_tuple(splits: str, bs:int, shuffle=False, drop_last=False) -> DataTuple: 23 | dset = RefCOCOplusDataset(splits) 24 | tset = RefCOCOplusTorchDataset(dset) 25 | evaluator = RefCOCOplusEvaluator(dset) 26 | data_loader = DataLoader( 27 | tset, batch_size=bs, 28 | shuffle=shuffle, num_workers=args.num_workers, 29 | drop_last=drop_last, pin_memory=True 30 | ) 31 | 32 | return DataTuple(dataset=dset, loader=data_loader, evaluator=evaluator) 33 | 34 | 35 | class RefCOCOplus: 36 | def __init__(self): 37 | self.train_tuple = get_tuple( 38 | args.train, bs=args.batch_size, shuffle=True, drop_last=True 39 | ) 40 | if args.valid != "": 41 | valid_bsize = args.batch_size #2048 if args.multiGPU else args.batch_size#512 42 | self.valid_tuple = get_tuple( 43 | args.valid, bs=valid_bsize, 44 | shuffle=False, drop_last=False 45 | ) 46 | else: 47 | self.valid_tuple = None 48 | 49 | self.model = RefCOCOplusModel() 50 | 51 | # Load pre-trained weights 52 | if args.load_lxmert is not None: 53 | self.model.lxrt_encoder.load(args.load_lxmert) 54 | if args.load_lxmert_qa is not None: 55 | load_lxmert_qa(args.load_lxmert_qa, self.model, 56 | label2ans=self.train_tuple.dataset.label2ans) 57 | 58 | # GPU options 59 | self.model = self.model.cuda() 60 | if args.multiGPU: 61 | self.model.lxrt_encoder.multi_gpu() 62 | 63 | # Losses and optimizer 64 | self.bce_loss = nn.BCEWithLogitsLoss() 65 | self.mce_loss = nn.CrossEntropyLoss(ignore_index=-1) 66 | if 'bert' in args.optim: 67 | batch_per_epoch = len(self.train_tuple.loader) 68 | t_total = int(batch_per_epoch * args.epochs) 69 | print("Total Iters: %d" % t_total) 70 | from src.lxrt.optimization import BertAdam 71 | self.optim = BertAdam(list(self.model.parameters()), 72 | lr=args.lr, 73 | warmup=0.1, 74 | t_total=t_total) 75 | else: 76 | self.optim = args.optimizer(list(self.model.parameters()), args.lr) 77 | 78 | self.output = args.output 79 | 80 | os.makedirs(self.output, exist_ok=True) 81 | 82 | def train(self, train_tuple, eval_tuple): 83 | dset, loader, evaluator = train_tuple 84 | iter_wrapper = (lambda x: tqdm(x, total=len(loader))) if args.tqdm else (lambda x: x) 85 | 86 | best_valid = 0. 
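        # The epoch loop below optimises BCEWithLogitsLoss over the soft VQA-style answer
        # targets (scaled by the number of answer classes), or CrossEntropyLoss on the
        # argmax target when --mce_loss is set; gradients are clipped to norm 5 and the
        # model is checkpointed as BEST whenever the validation score improves.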
87 | for epoch in range(args.epochs): 88 | # log_str = '' 89 | quesid2ans = {} 90 | for i, (ques_id, feats, boxes, sent, target) in iter_wrapper(enumerate(loader)): 91 | 92 | self.model.train() 93 | self.optim.zero_grad() 94 | 95 | feats, boxes, target = feats.cuda(), boxes.cuda(), target.cuda() 96 | logit, attn_probs = self.model(feats, boxes, sent) 97 | assert logit.dim() == target.dim() == 2 98 | if args.mce_loss: 99 | max_value, target = target.max(1) 100 | loss = self.mce_loss(logit, target) * logit.size(1) 101 | else: 102 | loss = self.bce_loss(logit, target) 103 | loss = loss * logit.size(1) 104 | 105 | loss.backward() 106 | nn.utils.clip_grad_norm_(self.model.parameters(), 5.) 107 | self.optim.step() 108 | 109 | score, label = logit.max(1) 110 | for qid, l in zip(ques_id, label.cpu().numpy()): 111 | ans = dset.label2ans[l] 112 | quesid2ans[qid] = ans 113 | 114 | # del logit, attn_probs 115 | # gc.collect() 116 | 117 | log_str = "\nEpoch %d: Train %0.2f\n" % (epoch, evaluator.evaluate(quesid2ans) * 100.) 118 | 119 | # to handle GPU OOM error 120 | torch.cuda.empty_cache() 121 | 122 | if self.valid_tuple is not None: # Do Validation 123 | valid_score = self.evaluate(eval_tuple) 124 | if valid_score > best_valid: 125 | best_valid = valid_score 126 | self.save("BEST") 127 | 128 | log_str += "Epoch %d: Valid %0.2f\n" % (epoch, valid_score * 100.) + \ 129 | "Epoch %d: Best %0.2f\n" % (epoch, best_valid * 100.) 130 | 131 | print(log_str, end='') 132 | 133 | with open(self.output + "/log.log", 'a') as f: 134 | f.write(log_str) 135 | f.flush() 136 | 137 | self.save("LAST") 138 | 139 | def predict(self, eval_tuple: DataTuple, dump=None): 140 | self.model.eval() 141 | dset, loader, evaluator = eval_tuple 142 | quesid2ans = {} 143 | results = [] 144 | for i, datum_tuple in enumerate(loader): 145 | ques_id, feats, boxes, sent = datum_tuple[:4] # avoid handling target 146 | attention = [] 147 | 148 | with torch.no_grad(): 149 | feats, boxes = feats.cuda(), boxes.cuda() 150 | attn_probs = self.model(feats, boxes, sent) 151 | # print(attn_probs) 152 | if self.model.args.output_attention: 153 | last_layer_att_score = torch.squeeze(attn_probs[1][-1]['attn'][:, :, 0, :]) # batch_size, att_head, target_num_feat, source_num_feat -> use all att head and CLS as target 154 | # print(last_layer_att_score.shape) 155 | last_layer_att_score = last_layer_att_score.cpu().numpy().tolist() 156 | else: 157 | last_layer_att_score = [] 158 | 159 | # score, label = logit.max(1) 160 | for qid in ques_id: 161 | # ans = dset.label2ans[l] 162 | # quesid2ans[qid] = ans 163 | results.append( 164 | { 165 | "questionId": qid.tolist(), 166 | # "prediction": ans, 167 | "attention": last_layer_att_score 168 | } 169 | ) 170 | 171 | # del logit, attn_probs, datum_tuple 172 | # gc.collect() 173 | 174 | evaluator.save_json(results, '/data/Grounded-RL2021/lxmert/snap/refcoco+ /attentions.json') 175 | 176 | # if dump is not None: 177 | # evaluator.dump_result(quesid2ans, dump) 178 | # return quesid2ans 179 | 180 | def evaluate(self, eval_tuple: DataTuple, dump=None): 181 | dset, loader, evaluator = eval_tuple 182 | self.predict(eval_tuple, dump) 183 | # return evaluator.evaluate(quesid2ans) 184 | 185 | @staticmethod 186 | def oracle_score(data_tuple): 187 | dset, loader, evaluator = data_tuple 188 | quesid2ans = {} 189 | for i, (ques_id, feats, boxes, sent, target) in enumerate(loader): 190 | _, label = target.max(1) 191 | for qid, l in zip(ques_id, label.cpu().numpy()): 192 | ans = dset.label2ans[l] 193 | quesid2ans[qid] = ans 194 | 
return evaluator.evaluate(quesid2ans) 195 | 196 | def save(self, name): 197 | torch.save(self.model.state_dict(), 198 | os.path.join(self.output, "%s.pth" % name)) 199 | 200 | def load(self, path): 201 | print("Load model from %s" % path) 202 | state_dict = torch.load("%s.pth" % path) 203 | for key in list(state_dict.keys()): 204 | if '.module' in key: 205 | state_dict[key.replace('.module', '')] = state_dict.pop(key) 206 | self.model.load_state_dict(state_dict, strict=False) 207 | 208 | 209 | if __name__ == "__main__": 210 | torch.backends.cudnn.benchmark = True 211 | torch.backends.cudnn.enabled = True 212 | # Build Class 213 | refcoco = RefCOCOplus() 214 | 215 | 216 | # Load Model 217 | if args.load is not None: 218 | refcoco.load(args.load) 219 | 220 | # Test or Train 221 | if args.test is not None: 222 | args.fast = args.tiny = False # Always loading all data in test 223 | if 'submit' in args.test: 224 | refcoco.predict( 225 | get_tuple(args.test, bs=args.batch_size, 226 | shuffle=False, drop_last=False), 227 | dump=os.path.join(args.output, 'submit_predict.json') 228 | ) 229 | if 'test' in args.test: 230 | result = refcoco.evaluate( 231 | get_tuple('test', bs=args.batch_size, 232 | shuffle=False, drop_last=False), 233 | dump=os.path.join(args.output, 'test_predict.json') 234 | ) 235 | print(result) 236 | if 'valid' in args.test: 237 | result = refcoco.evaluate( 238 | get_tuple('valid', bs=args.batch_size, 239 | shuffle=False, drop_last=False), 240 | dump=os.path.join(args.output, 'valid_predict.json') 241 | ) 242 | print(result) 243 | else: 244 | # print("Train Oracle: %0.2f" % (gqa.oracle_score(gqa.train_tuple) * 100)) 245 | print('Splits in Train data:', refcoco.train_tuple.dataset.splits) 246 | if refcoco.valid_tuple is not None: 247 | print('Splits in Valid data:', refcoco.valid_tuple.dataset.splits) 248 | print("Valid Oracle: %0.2f" % (refcoco.oracle_score(refcoco.valid_tuple) * 100)) 249 | else: 250 | print("DO NOT USE VALIDATION") 251 | refcoco.train(refcoco.train_tuple, refcoco.valid_tuple) 252 | 253 | 254 | -------------------------------------------------------------------------------- /src/tasks/refcocog.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyleft 2019 project LXRT. 
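# Editor's sketch (not part of the original file): how the cross-attention that
# predict() stores below can be turned into a per-patch grounding map. The
# [batch, heads, target_tokens, source_tokens] shape and the use of row 0 (CLS) follow
# the inline comment in predict(); the 32x32 grid size is an assumption borrowed from
# the *_patches_32x32.hdf5 naming used elsewhere in this repo.
def _cls_attention_map_sketch():
    import torch
    batch, heads, n_txt, grid = 2, 12, 20, 32
    attn = torch.rand(batch, heads, n_txt, grid * grid)   # synthetic attention weights
    cls_to_visual = attn[:, :, 0, :]                       # CLS query row, every head
    grounding = cls_to_visual.mean(dim=1)                  # average heads -> [batch, patches]
    return grounding.view(batch, grid, grid)               # reshape to the patch grid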
3 | 4 | import os 5 | import collections 6 | 7 | import gc 8 | import torch 9 | from tqdm import tqdm 10 | import torch.nn as nn 11 | from torch.utils.data.dataloader import DataLoader 12 | 13 | from src import eval_utils 14 | from src.param import args 15 | from src.pretrain.qa_answer_table import load_lxmert_qa 16 | from src.tasks.refcocog_model import RefCOCOgModel 17 | from src.tasks.refcocog_data import RefCOCOgDataset, RefCOCOgTorchDataset, RefCOCOgEvaluator 18 | 19 | print(args) 20 | DataTuple = collections.namedtuple("DataTuple", 'dataset loader evaluator') 21 | 22 | 23 | def get_tuple(splits: str, bs:int, shuffle=False, drop_last=False) -> DataTuple: 24 | dset = RefCOCOgDataset(splits) 25 | tset = RefCOCOgTorchDataset(dset) 26 | evaluator = RefCOCOgEvaluator(dset) 27 | data_loader = DataLoader( 28 | tset, batch_size=bs, 29 | shuffle=shuffle, num_workers=args.num_workers, 30 | drop_last=drop_last, pin_memory=True 31 | ) 32 | 33 | return DataTuple(dataset=dset, loader=data_loader, evaluator=evaluator) 34 | 35 | 36 | class RefCOCOg: 37 | def __init__(self): 38 | self.weakly_supervise = args.train_paradigm == 'weak' 39 | self.train_tuple = get_tuple( 40 | args.train, bs=args.batch_size, shuffle=True, drop_last=True 41 | ) 42 | if args.valid != "": 43 | valid_bsize = args.batch_size #2048 if args.multiGPU else args.batch_size#512 44 | self.valid_tuple = get_tuple( 45 | args.valid, bs=valid_bsize, 46 | shuffle=False, drop_last=False 47 | ) 48 | else: 49 | self.valid_tuple = None 50 | 51 | self.model = RefCOCOgModel(train_paradigm=args.train_paradigm) 52 | 53 | # Load pre-trained weights 54 | if args.load_lxmert is not None: 55 | self.model.lxrt_encoder.load(args.load_lxmert) 56 | if args.load_lxmert_qa is not None: 57 | load_lxmert_qa(args.load_lxmert_qa, self.model, 58 | label2ans=self.train_tuple.dataset.label2ans) 59 | 60 | # GPU options 61 | if torch.cuda.is_available(): 62 | self.device = 'cuda' 63 | else: 64 | self.device = 'cpu' 65 | self.model = self.model.to(self.device) 66 | if args.multiGPU and self.device == 'cuda': 67 | self.model.lxrt_encoder.multi_gpu() 68 | 69 | # Losses and optimizer 70 | self.bce_loss = nn.BCEWithLogitsLoss() 71 | self.mce_loss = nn.CrossEntropyLoss(ignore_index=-1) 72 | self.l1_loss = nn.L1Loss(reduction='none') 73 | 74 | if 'bert' in args.optim: 75 | batch_per_epoch = len(self.train_tuple.loader) 76 | t_total = int(batch_per_epoch * args.epochs) 77 | print("Total Iters: %d" % t_total) 78 | from src.lxrt.optimization import BertAdam 79 | self.optim = BertAdam(list(self.model.parameters()), 80 | lr=args.lr, 81 | warmup=0.1, 82 | t_total=t_total) 83 | else: 84 | self.optim = args.optimizer(list(self.model.parameters()), args.lr) 85 | 86 | self.output = args.output 87 | 88 | os.makedirs(self.output, exist_ok=True) 89 | 90 | def train(self, train_tuple, eval_tuple): 91 | dset, loader, evaluator = train_tuple 92 | iter_wrapper = (lambda x: tqdm(x, total=len(loader))) if args.tqdm else (lambda x: x) 93 | 94 | best_valid = 0. 
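        # The epoch loop below switches on the training paradigm: in the weakly
        # supervised setting only the image-text matching BCE loss is optimised (no box
        # coordinates are used), while the fully supervised path regresses the referred
        # box with an L1 loss averaged over the batch. Both paths clip gradients to
        # norm 5 and checkpoint BEST on the validation score.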
95 | for epoch in range(args.epochs): 96 | # log_str = '' 97 | sentid2pbox = {} 98 | for i, (sent_id, feats, boxes, sent, target, is_matched) in iter_wrapper(enumerate(loader)): 99 | 100 | self.model.train() 101 | self.optim.zero_grad() 102 | 103 | feats, boxes, target, is_matched = feats.to(self.device), boxes.to(self.device), \ 104 | target.to(self.device), is_matched.to(self.device) 105 | 106 | logit, attn_probs = self.model(feats, boxes, sent) 107 | assert logit.dim() == target.dim() == 2 or logit.dim() == target.dim() == 4 108 | 109 | if self.weakly_supervise: 110 | loss = self.bce_loss(logit, is_matched) 111 | else: 112 | loss = self.l1_loss(logit, target) 113 | loss = loss.sum() / logit.shape[0] 114 | 115 | loss.backward() 116 | nn.utils.clip_grad_norm_(self.model.parameters(), 5.) 117 | self.optim.step() 118 | 119 | if self.weakly_supervise: 120 | pass 121 | else: 122 | miou, accu = eval_utils.trans_vg_eval_val(logit, target) 123 | print('Epoch: {epoch}, Iteration: {iter}, loss: {loss:.6f}, miou: {miou:.4f}, Accuracy: {acc:.4f}'.format( 124 | epoch=epoch, 125 | iter=i, 126 | loss=loss.item(), 127 | miou=miou.detach().mean().cpu().numpy(), 128 | acc=accu 129 | )) 130 | #todo: fix evaluation code for ref expression task 131 | pred_boxes = eval_utils.get_pred_boxes(logit) 132 | # score, label = logit.max(1) 133 | for sid, pbox in zip(sent_id, pred_boxes.cpu().detach().numpy()): 134 | sentid2pbox[sid] = pbox 135 | 136 | log_str = "\nEpoch %d: Train %0.2f\n" % (epoch, evaluator.evaluate(sentid2pbox) * 100.) 137 | 138 | # to handle GPU OOM error 139 | torch.cuda.empty_cache() 140 | 141 | if self.valid_tuple is not None: # Do Validation 142 | valid_score = self.evaluate(eval_tuple) 143 | if valid_score > best_valid: 144 | best_valid = valid_score 145 | self.save("BEST") 146 | 147 | log_str += "Epoch %d: Valid %0.2f\n" % (epoch, valid_score * 100.) + \ 148 | "Epoch %d: Best %0.2f\n" % (epoch, best_valid * 100.) 
149 | 
150 |             print(log_str, end='')
151 | 
152 |             with open(self.output + "/log.log", 'a') as f:
153 |                 f.write(log_str)
154 |                 f.flush()
155 | 
156 |         self.save("LAST")
157 | 
158 |     def predict(self, eval_tuple: DataTuple, dump=None):
159 |         self.model.eval()
160 |         dset, loader, evaluator = eval_tuple
161 |         sentid2ans = {}
162 |         results = []
163 |         for i, datum_tuple in enumerate(loader):
164 |             sent_id, feats, boxes, sent = datum_tuple[:4]  # avoid handling target
165 |             attention = []
166 | 
167 |             with torch.no_grad():
168 |                 feats, boxes = feats.to(self.device), boxes.to(self.device)
169 |                 logits, attn_probs = self.model(feats, boxes, sent)
170 |                 # print(attn_probs)
171 |                 if self.model.args.output_attention:
172 |                     last_layer_att_score = torch.squeeze(attn_probs[1][-1]['attn'][:, :, 0, :])  # batch_size, att_head, target_num_feat, source_num_feat -> use all att heads and CLS as target
173 |                     # print(last_layer_att_score.shape)
174 |                     last_layer_att_score = last_layer_att_score.cpu().numpy().tolist()
175 |                 else:
176 |                     last_layer_att_score = []
177 | 
178 |             pred_boxes = eval_utils.get_pred_boxes(logits)
179 | 
180 |             # One predicted box per sentence id, converted to lists so the json dump is serializable.
181 |             for sid, pbox in zip(sent_id, pred_boxes.cpu().numpy()):
182 |                 sentid2ans[sid] = pbox
183 |                 results.append(
184 |                     {
185 |                         "questionId": sid.tolist(),
186 |                         "prediction": pbox.tolist(),
187 |                         "attention": last_layer_att_score
188 |                     }
189 |                 )
190 | 
191 |             # del logit, attn_probs, datum_tuple
192 |             # gc.collect()
193 | 
194 |         evaluator.save_json(results, '/data/Grounded-RL2021/lxmert/snap/refcocog/attentions.json')
195 | 
196 |         if dump is not None:
197 |             evaluator.dump_result(sentid2ans, dump)
198 |         return sentid2ans
199 | 
200 |     def evaluate(self, eval_tuple: DataTuple, dump=None):
201 |         dset, loader, evaluator = eval_tuple
202 |         sentid2box = self.predict(eval_tuple, dump)
203 |         return evaluator.evaluate(sentid2box)
204 | 
205 |     @staticmethod
206 |     def oracle_score(data_tuple):
207 |         dset, loader, evaluator = data_tuple
208 |         sentid2box = {}
209 |         for i, (ques_id, feats, boxes, sent, target_box, is_matched) in enumerate(loader):
210 |             # target_ = torch.stack(target_box, dim=0).permute(1, 0)
211 |             miou, acc = eval_utils.trans_vg_eval_val(target_box, target_box, oracle=True)
212 |             # _, label = target_box.max(1)
213 |             for sid, iou in zip(ques_id, miou.cpu().numpy()):
214 |                 # ans = dset.label2ans[l]
215 |                 sentid2box[sid] = iou
216 |         return evaluator.iou_acc(sentid2box) / len(dset)
217 | 
218 |     def save(self, name):
219 |         torch.save(self.model.state_dict(),
220 |                    os.path.join(self.output, "%s.pth" % name))
221 | 
222 |     def load(self, path):
223 |         print("Load model from %s" % path)
224 |         state_dict = torch.load("%s.pth" % path)
225 |         for key in list(state_dict.keys()):
226 |             if '.module' in key:
227 |                 state_dict[key.replace('.module', '')] = state_dict.pop(key)
228 |         self.model.load_state_dict(state_dict, strict=False)
229 | 
230 | 
231 | if __name__ == "__main__":
232 |     torch.backends.cudnn.benchmark = True
233 |     torch.backends.cudnn.enabled = True
234 |     # Build Class
235 |     refcocog = RefCOCOg()
236 | 
237 |     # Load Model
238 |     if args.load is not None:
239 |         refcocog.load(args.load)
240 | 
241 |     # Test or Train
242 |     if args.test is not None:
243 |         args.fast = args.tiny = False       # Always loading all data in test
244 |         if 'submit' in args.test:
245 |             refcocog.predict(
246 |                 get_tuple(args.test, bs=args.batch_size,
247 |                           shuffle=False, drop_last=False),
248 |                 dump=os.path.join(args.output, 'submit_predict.json')
249 |             )
250 |         if 'test' in args.test:
251 |             result = refcocog.evaluate(
252 |                 get_tuple('test',
bs=args.batch_size, 253 | shuffle=False, drop_last=False), 254 | dump=os.path.join(args.output, 'test_predict.json') 255 | ) 256 | print(result) 257 | if 'valid' in args.test: 258 | result = refcocog.evaluate( 259 | get_tuple('valid', bs=args.batch_size, 260 | shuffle=False, drop_last=False), 261 | dump=os.path.join(args.output, 'valid_predict.json') 262 | ) 263 | print(result) 264 | else: 265 | # print("Train Oracle: %0.2f" % (gqa.oracle_score(gqa.train_tuple) * 100)) 266 | print('Splits in Train data:', refcocog.train_tuple.dataset.splits) 267 | if refcocog.valid_tuple is not None: 268 | print('Splits in Valid data:', refcocog.valid_tuple.dataset.splits) 269 | print("Valid Oracle: %0.2f" % (refcocog.oracle_score(refcocog.valid_tuple) * 100)) 270 | else: 271 | print("DO NOT USE VALIDATION") 272 | refcocog.train(refcocog.train_tuple, refcocog.valid_tuple) 273 | 274 | 275 | --------------------------------------------------------------------------------
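Editor's note: eval_utils (used by refcocog.py above for get_pred_boxes and
trans_vg_eval_val) is not included in this dump. The snippet below is only a minimal
sketch of the kind of IoU-based mean-IoU / accuracy metric such a helper typically
computes, assuming predicted and target boxes are [N, 4] tensors in (x1, y1, x2, y2)
format; the repository's actual implementation may differ.

import torch

def box_iou_sketch(pred, target):
    # Element-wise IoU between corresponding boxes in (x1, y1, x2, y2) format.
    lt = torch.max(pred[:, :2], target[:, :2])          # intersection top-left
    rb = torch.min(pred[:, 2:], target[:, 2:])          # intersection bottom-right
    wh = (rb - lt).clamp(min=0)                         # intersection width/height
    inter = wh[:, 0] * wh[:, 1]
    area_p = (pred[:, 2] - pred[:, 0]).clamp(min=0) * (pred[:, 3] - pred[:, 1]).clamp(min=0)
    area_t = (target[:, 2] - target[:, 0]).clamp(min=0) * (target[:, 3] - target[:, 1]).clamp(min=0)
    return inter / (area_p + area_t - inter + 1e-6)

def miou_and_accuracy_sketch(pred, target, iou_thresh=0.5):
    # Mean IoU plus the fraction of boxes whose IoU clears the threshold.
    iou = box_iou_sketch(pred, target)
    return iou.mean(), (iou > iou_thresh).float().mean()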