├── LICENSE ├── README.md ├── cfgs ├── pretrain │ ├── base_e2e_16x16G_fp16.yaml │ ├── base_prec_4x16G_fp32.yaml │ ├── base_prec_withouttextonly_4x16G_fp32.yaml │ ├── large_e2e_16x16G_fp16.yaml │ ├── large_prec_4x16G_fp16.yaml │ └── vis_attention_maps_coco.yaml ├── refcoco │ ├── base_detected_regions_4x16G.yaml │ ├── base_gt_boxes_4x16G.yaml │ ├── large_detected_regions_4x16G.yaml │ └── large_gt_boxes_4x16G.yaml ├── vcr │ ├── base_q2a_4x16G_fp32.yaml │ ├── base_qa2r_4x16G_fp32.yaml │ ├── large_q2a_16x16G_fp16.yaml │ ├── large_q2a_4x16G_fp16.yaml │ ├── large_qa2r_16x16G_fp16.yaml │ └── large_qa2r_4x16G_fp16.yaml └── vqa │ ├── base_4x16G_fp32.yaml │ └── large_4x16G_fp32.yaml ├── common ├── __init__.py ├── backbone │ ├── __init__.py │ └── resnet │ │ ├── __init__.py │ │ └── resnet.py ├── callbacks │ ├── batch_end_callbacks │ │ ├── __init__.py │ │ └── speedometer.py │ └── epoch_end_callbacks │ │ ├── __init__.py │ │ ├── checkpoint.py │ │ └── validation_monitor.py ├── fast_rcnn.py ├── lib │ └── roi_pooling │ │ ├── ROIAlign.h │ │ ├── ROIPool.h │ │ ├── __init__.py │ │ ├── cpu │ │ ├── ROIAlign_cpu.cpp │ │ └── vision.h │ │ ├── cuda │ │ ├── ROIAlign_cuda.cu │ │ ├── ROIPool_cuda.cu │ │ └── vision.h │ │ ├── debug.py │ │ ├── roi_align.py │ │ ├── roi_pool.py │ │ ├── setup.py │ │ └── vision.cpp ├── lr_scheduler.py ├── metrics │ ├── __init__.py │ ├── composite_eval_metric.py │ ├── eval_metric.py │ ├── pretrain_metrics.py │ ├── refcoco_metrics.py │ ├── vcr_metrics.py │ └── vqa_metrics.py ├── module.py ├── nlp │ ├── __init__.py │ ├── bert │ │ ├── __init__.py │ │ └── optimization.py │ ├── bert_encoder_wrapper.py │ ├── encoder_base.py │ ├── input_variational_dropout.py │ ├── misc.py │ ├── roberta │ │ ├── __init__.py │ │ ├── modeling_roberta.py │ │ ├── tokenization_roberta.py │ │ └── utils.py │ └── time_distributed.py ├── trainer.py ├── utils │ ├── __init__.py │ ├── bbox.py │ ├── clip_pad.py │ ├── create_logger.py │ ├── flatten.py │ ├── load.py │ ├── mask.py │ ├── masked_softmax.py │ ├── misc.py │ ├── multi_task_dataloader.py │ ├── pad_sequence.py │ └── zipreader.py └── visual_linguistic_bert.py ├── data ├── PREPARE_DATA.md └── conceptual-captions │ ├── ReadMe.txt │ └── utils │ ├── check_valid.py │ ├── download_train.sh │ ├── download_val.sh │ ├── gen_train4download.py │ ├── gen_train_image_json.py │ ├── gen_val4download.py │ └── gen_val_image_json.py ├── external └── pytorch_pretrained_bert │ ├── __init__.py │ ├── __main__.py │ ├── convert_gpt2_checkpoint_to_pytorch.py │ ├── convert_openai_checkpoint_to_pytorch.py │ ├── convert_tf_checkpoint_to_pytorch.py │ ├── convert_transfo_xl_checkpoint_to_pytorch.py │ ├── file_utils.py │ ├── modeling.py │ ├── modeling_gpt2.py │ ├── modeling_openai.py │ ├── modeling_transfo_xl.py │ ├── modeling_transfo_xl_utilities.py │ ├── optimization.py │ ├── optimization_openai.py │ ├── tokenization.py │ ├── tokenization_gpt2.py │ ├── tokenization_openai.py │ └── tokenization_transfo_xl.py ├── figs ├── attention_viz.png └── pretrain.png ├── model └── pretrained_model │ └── PREPARE_PRETRAINED_MODELS.md ├── pretrain ├── _init_paths.py ├── data │ ├── __init__.py │ ├── build.py │ ├── collate_batch.py │ ├── datasets │ │ ├── __init__.py │ │ ├── coco_captions.py │ │ ├── conceptual_captions.py │ │ └── general_corpus.py │ ├── samplers │ │ ├── __init__.py │ │ ├── distributed.py │ │ └── grouped_batch_sampler.py │ └── transforms │ │ ├── __init__.py │ │ ├── build.py │ │ └── transforms.py ├── function │ ├── __init__.py │ ├── config.py │ ├── train.py │ ├── val.py │ └── vis.py ├── modules │ ├── 
__init__.py │ ├── resnet_vlbert_for_attention_vis.py │ ├── resnet_vlbert_for_pretraining.py │ └── resnet_vlbert_for_pretraining_multitask.py ├── train_end2end.py └── vis_attention_maps.py ├── refcoco ├── _init_paths.py ├── data │ ├── __init__.py │ ├── build.py │ ├── collate_batch.py │ ├── datasets │ │ ├── __init__.py │ │ ├── refcoco.py │ │ └── refer │ │ │ ├── Makefile │ │ │ ├── __init__.py │ │ │ ├── external │ │ │ ├── .gitignore │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── _mask.pyx │ │ │ ├── mask.py │ │ │ ├── maskApi.c │ │ │ └── maskApi.h │ │ │ ├── refer.py │ │ │ └── setup.py │ ├── samplers │ │ ├── __init__.py │ │ ├── distributed.py │ │ └── grouped_batch_sampler.py │ └── transforms │ │ ├── __init__.py │ │ ├── build.py │ │ └── transforms.py ├── function │ ├── __init__.py │ ├── config.py │ ├── test.py │ ├── train.py │ └── val.py ├── modules │ ├── __init__.py │ └── resnet_vlbert_for_refcoco.py ├── test.py └── train_end2end.py ├── requirements.txt ├── scripts ├── dist_run_multi.sh ├── dist_run_single.sh ├── dist_run_slurm.sh ├── init.sh ├── init_slurm.sh ├── launch.py ├── nondist_run.sh └── nondist_run_slurm.sh ├── vcr ├── _init_paths.py ├── data │ ├── __init__.py │ ├── build.py │ ├── collate_batch.py │ ├── datasets │ │ ├── __init__.py │ │ └── vcr.py │ ├── samplers │ │ ├── __init__.py │ │ ├── distributed.py │ │ └── grouped_batch_sampler.py │ └── transforms │ │ ├── __init__.py │ │ ├── build.py │ │ └── transforms.py ├── function │ ├── __init__.py │ ├── config.py │ ├── test.py │ ├── train.py │ └── val.py ├── modules │ ├── __init__.py │ └── resnet_vlbert_for_vcr.py ├── test.py ├── train_end2end.py └── val.py ├── viz ├── VISUALIZATION.md ├── _init_paths.py ├── bertviz │ ├── __init__.py │ ├── attention.py │ ├── model_view.js │ └── model_view.py └── model_view_vl-bert_coco.ipynb └── vqa ├── _init_paths.py ├── data ├── __init__.py ├── build.py ├── collate_batch.py ├── datasets │ ├── __init__.py │ └── vqa.py ├── samplers │ ├── __init__.py │ ├── distributed.py │ └── grouped_batch_sampler.py └── transforms │ ├── __init__.py │ ├── build.py │ └── transforms.py ├── function ├── __init__.py ├── config.py ├── test.py ├── train.py └── val.py ├── modules ├── __init__.py └── resnet_vlbert_for_vqa.py ├── test.py └── train_end2end.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Weijie Su 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
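The YAML files under cfgs/ in the tree above configure every task in the repository: each one names the model MODULE, the DATASET section, the VLBERT architecture block, and the TRAIN/VAL/TEST settings reproduced below. As a quick orientation, here is a minimal sketch (not part of the repository) that loads one of these configs with PyYAML and builds a warmup schedule from its TRAIN block; the file path, the use of PyTorch's AdamW and LambdaLR, and the reading of LR_SCHEDULE 'triangle' as linear warmup followed by linear decay are assumptions made for illustration, not taken from the repo's own config or scheduler code.

import yaml   # PyYAML, assumed to be installed
import torch

# Example path; every cfgs/*/*.yaml file below shares the same top-level layout.
with open('./cfgs/vqa/base_4x16G_fp32.yaml') as f:
    cfg = yaml.safe_load(f)

train = cfg['TRAIN']
print(cfg['MODULE'], train['OPTIMIZER'], train['LR'], train['WARMUP_STEPS'])

# Assumed reading of LR_SCHEDULE: 'triangle' with WARMUP_METHOD: 'linear':
# ramp linearly up to the base LR over WARMUP_STEPS, then decay linearly to zero.
def triangle_lambda(step, warmup_steps, total_steps):
    if step < warmup_steps:
        return step / max(1, warmup_steps)
    return max(0.0, (total_steps - step) / max(1, total_steps - warmup_steps))

model = torch.nn.Linear(8, 8)  # stand-in for the real ResNetVLBERT network
optimizer = torch.optim.AdamW(model.parameters(), lr=train['LR'], weight_decay=train['WD'])
total_steps = 100000           # placeholder; the real value depends on dataset size and END_EPOCH
scheduler = torch.optim.lr_scheduler.LambdaLR(
    optimizer, lambda step: triangle_lambda(step, train['WARMUP_STEPS'], total_steps))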
22 | -------------------------------------------------------------------------------- /cfgs/pretrain/base_e2e_16x16G_fp16.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | RNG_SEED: 12345 3 | OUTPUT_PATH: './output/pretrain/vl-bert' 4 | MODULE: ResNetVLBERTForPretrainingMultitask 5 | GPUS: '0,1,2,3' 6 | LOG_FREQUENT: 100 7 | VAL_FREQUENT: 1 8 | CHECKPOINT_FREQUENT: 1 9 | MODEL_PREFIX: 'vl-bert_base_res101_pretrain_multitask' 10 | NUM_WORKERS_PER_GPU: 4 11 | SCALES: 12 | - 600 13 | - 1000 14 | 15 | DATASET: 16 | 17 | - DATASET: conceptual_captions 18 | APPEND_INDEX: false 19 | DATASET_PATH: './data/conceptual-captions' 20 | ROOT_PATH: './' 21 | TRAIN_IMAGE_SET: 'train' 22 | VAL_IMAGE_SET: 'val' 23 | TEST_IMAGE_SET: 'val' 24 | ADD_IMAGE_AS_A_BOX: true 25 | ZIP_MODE: false 26 | CACHE_MODE: false 27 | IGNORE_DB_CACHE: false 28 | MASK_SIZE: 14 29 | 30 | - DATASET: general_corpus 31 | TRAIN_ANNOTATION_FILE: './data/en_corpus/bc1g.doc+./data/en_corpus/wiki.doc' 32 | VAL_ANNOTATION_FILE: './data/en_corpus/bc1g.doc' 33 | TEST_ANNOTATION_FILE: './data/en_corpus/bc1g.doc' 34 | SEQ_LEN: 64 35 | MIN_SEQ_LEN: 64 36 | 37 | NETWORK: 38 | PARTIAL_PRETRAIN: "" 39 | PARTIAL_PRETRAIN_PREFIX_CHANGES: [] 40 | IMAGE_NUM_LAYERS: 101 41 | IMAGE_C5_DILATED: true 42 | IMAGE_STRIDE_IN_1x1: true 43 | PIXEL_MEANS: 44 | - 102.9801 45 | - 115.9465 46 | - 122.7717 47 | PIXEL_STDS: 48 | - 1.0 49 | - 1.0 50 | - 1.0 51 | IMAGE_FEAT_PRECOMPUTED: false 52 | IMAGE_PRETRAINED: './model/pretrained_model/resnet101-pt-vgbua' 53 | IMAGE_PRETRAINED_EPOCH: 0 54 | IMAGE_FROZEN_BACKBONE_STAGES: 55 | - 1 56 | - 2 57 | IMAGE_FROZEN_BN: true 58 | IMAGE_FINAL_DIM: 768 59 | IMAGE_SEMANTIC: false 60 | OUTPUT_CONV5: false 61 | BERT_MODEL_NAME: './model/pretrained_model/bert-base-uncased' 62 | BERT_PRETRAINED: '' 63 | BERT_PRETRAINED_EPOCH: 0 64 | BERT_FROZEN: false 65 | ENABLE_CNN_REG_LOSS: false 66 | MLM_LOSS_NORM_IN_BATCH_FIRST: false 67 | MVRC_LOSS_NORM_IN_BATCH_FIRST: false 68 | WITH_REL_LOSS: false 69 | WITH_MLM_LOSS: true 70 | WITH_MVRC_LOSS: true 71 | 72 | VLBERT: 73 | with_pooler: false 74 | input_transform_type: 1 75 | visual_size: 768 76 | hidden_size: 768 77 | num_hidden_layers: 12 78 | num_attention_heads: 12 79 | intermediate_size: 3072 80 | hidden_act: "gelu" 81 | hidden_dropout_prob: 0.1 82 | attention_probs_dropout_prob: 0.1 83 | max_position_embeddings: 512 84 | type_vocab_size: 3 85 | vocab_size: 30522 86 | initializer_range: 0.02 87 | visual_scale_text_init: 0.0 88 | visual_scale_object_init: 0.0 89 | visual_ln: true 90 | pos_embedding_frozen: false 91 | 92 | TRAIN: 93 | SHUFFLE: true 94 | FLIP_PROB: 0.5 95 | BATCH_IMAGES: 96 | - 8 97 | - 8 98 | ASPECT_GROUPING: false 99 | RESUME: false 100 | AUTO_RESUME: true 101 | BEGIN_EPOCH: 0 102 | END_EPOCH: 10 103 | OPTIMIZER: 'AdamW' 104 | CLIP_GRAD_NORM: 10 105 | LR: 1.0e-7 106 | LR_SCHEDULE: 'triangle' 107 | WD: 0.0001 108 | WARMUP: true 109 | WARMUP_METHOD: 'linear' 110 | WARMUP_FACTOR: 0.0 111 | WARMUP_STEPS: 16000 112 | FP16: true 113 | FP16_LOSS_SCALE: 'dynamic' 114 | LOSS_LOGGERS: 115 | - "mlm_loss_wvc,MLMLossWVC" 116 | - "mlm_loss_aux,MLMLossAUX" 117 | - "mvrc_loss,MVRCLoss" 118 | 119 | VAL: 120 | SHUFFLE: false 121 | FLIP_PROB: 0 122 | BATCH_IMAGES: 123 | - 8 124 | - 8 125 | 126 | TEST: 127 | SHUFFLE: false 128 | FLIP_PROB: 0 129 | TEST_EPOCH: 0 130 | BATCH_IMAGES: 131 | - 8 132 | - 8 -------------------------------------------------------------------------------- /cfgs/pretrain/base_prec_4x16G_fp32.yaml: 
-------------------------------------------------------------------------------- 1 | --- 2 | RNG_SEED: 12345 3 | OUTPUT_PATH: './output/pretrain/vlbert' 4 | MODULE: ResNetVLBERTForPretrainingMultitask 5 | GPUS: '0,1,2,3' 6 | LOG_FREQUENT: 100 7 | VAL_FREQUENT: 1 8 | CHECKPOINT_FREQUENT: 1 9 | MODEL_PREFIX: 'vl-bert_base_res101_pretrain_multitask' 10 | NUM_WORKERS_PER_GPU: 4 11 | SCALES: 12 | - 600 13 | - 1000 14 | 15 | DATASET: 16 | 17 | - DATASET: conceptual_captions 18 | APPEND_INDEX: false 19 | DATASET_PATH: './data/conceptual-captions' 20 | ROOT_PATH: './' 21 | TRAIN_IMAGE_SET: 'train' 22 | VAL_IMAGE_SET: 'val' 23 | TEST_IMAGE_SET: 'val' 24 | ADD_IMAGE_AS_A_BOX: true 25 | ZIP_MODE: false 26 | CACHE_MODE: false 27 | IGNORE_DB_CACHE: false 28 | MASK_SIZE: 14 29 | 30 | - DATASET: general_corpus 31 | TRAIN_ANNOTATION_FILE: './data/en_corpus/bc1g.doc+./data/en_corpus/wiki.doc' 32 | VAL_ANNOTATION_FILE: './data/en_corpus/bc1g.doc' 33 | TEST_ANNOTATION_FILE: './data/en_corpus/bc1g.doc' 34 | SEQ_LEN: 64 35 | MIN_SEQ_LEN: 64 36 | 37 | NETWORK: 38 | PARTIAL_PRETRAIN: "" 39 | PARTIAL_PRETRAIN_PREFIX_CHANGES: [] 40 | IMAGE_NUM_LAYERS: 101 41 | IMAGE_C5_DILATED: true 42 | IMAGE_STRIDE_IN_1x1: true 43 | PIXEL_MEANS: 44 | - 102.9801 45 | - 115.9465 46 | - 122.7717 47 | PIXEL_STDS: 48 | - 1.0 49 | - 1.0 50 | - 1.0 51 | IMAGE_FEAT_PRECOMPUTED: true 52 | IMAGE_PRETRAINED: '' 53 | IMAGE_PRETRAINED_EPOCH: 0 54 | IMAGE_FROZEN_BACKBONE_STAGES: 55 | - 1 56 | - 2 57 | IMAGE_FROZEN_BN: true 58 | IMAGE_FINAL_DIM: 768 59 | IMAGE_SEMANTIC: false 60 | OUTPUT_CONV5: false 61 | BERT_MODEL_NAME: './model/pretrained_model/bert-base-uncased' 62 | BERT_PRETRAINED: '' 63 | BERT_PRETRAINED_EPOCH: 0 64 | BERT_FROZEN: false 65 | ENABLE_CNN_REG_LOSS: false 66 | MLM_LOSS_NORM_IN_BATCH_FIRST: false 67 | MVRC_LOSS_NORM_IN_BATCH_FIRST: false 68 | WITH_REL_LOSS: false 69 | WITH_MLM_LOSS: true 70 | WITH_MVRC_LOSS: true 71 | 72 | VLBERT: 73 | with_pooler: false 74 | input_transform_type: 1 75 | visual_size: 768 76 | hidden_size: 768 77 | num_hidden_layers: 12 78 | num_attention_heads: 12 79 | intermediate_size: 3072 80 | hidden_act: "gelu" 81 | hidden_dropout_prob: 0.1 82 | attention_probs_dropout_prob: 0.1 83 | max_position_embeddings: 512 84 | type_vocab_size: 3 85 | vocab_size: 30522 86 | initializer_range: 0.02 87 | visual_scale_text_init: 0.0 88 | visual_scale_object_init: 0.0 89 | visual_ln: true 90 | pos_embedding_frozen: false 91 | 92 | TRAIN: 93 | SHUFFLE: true 94 | FLIP_PROB: 0.5 95 | BATCH_IMAGES: 96 | - 32 97 | - 32 98 | ASPECT_GROUPING: false 99 | RESUME: false 100 | AUTO_RESUME: true 101 | BEGIN_EPOCH: 0 102 | END_EPOCH: 10 103 | OPTIMIZER: 'AdamW' 104 | CLIP_GRAD_NORM: 10 105 | LR: 1.0e-7 106 | LR_SCHEDULE: 'triangle' 107 | WD: 0.0001 108 | WARMUP: true 109 | WARMUP_METHOD: 'linear' 110 | WARMUP_FACTOR: 0.0 111 | WARMUP_STEPS: 16000 112 | FP16: false 113 | FP16_LOSS_SCALE: 128.0 114 | LOSS_LOGGERS: 115 | - "mlm_loss_wvc,MLMLossWVC" 116 | - "mlm_loss_aux,MLMLossAUX" 117 | - "mvrc_loss,MVRCLoss" 118 | 119 | VAL: 120 | SHUFFLE: false 121 | FLIP_PROB: 0 122 | BATCH_IMAGES: 123 | - 32 124 | - 32 125 | 126 | TEST: 127 | SHUFFLE: false 128 | FLIP_PROB: 0 129 | TEST_EPOCH: 0 130 | BATCH_IMAGES: 131 | - 32 132 | - 32 -------------------------------------------------------------------------------- /cfgs/pretrain/base_prec_withouttextonly_4x16G_fp32.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | RNG_SEED: 12345 3 | OUTPUT_PATH: './output/pretrain/vlbert' 4 | MODULE: 
ResNetVLBERTForPretraining 5 | GPUS: '0,1,2,3' 6 | LOG_FREQUENT: 100 7 | VAL_FREQUENT: 1 8 | CHECKPOINT_FREQUENT: 1 9 | MODEL_PREFIX: 'vl-bert_base_res101_pretrain' 10 | NUM_WORKERS_PER_GPU: 4 11 | SCALES: 12 | - 600 13 | - 1000 14 | 15 | DATASET: 16 | DATASET: conceptual_captions 17 | APPEND_INDEX: false 18 | DATASET_PATH: './data/conceptual-captions' 19 | ROOT_PATH: './' 20 | TRAIN_IMAGE_SET: 'train' 21 | VAL_IMAGE_SET: 'val' 22 | TEST_IMAGE_SET: 'val' 23 | ADD_IMAGE_AS_A_BOX: true 24 | ZIP_MODE: false 25 | CACHE_MODE: false 26 | IGNORE_DB_CACHE: false 27 | MASK_SIZE: 14 28 | 29 | NETWORK: 30 | PARTIAL_PRETRAIN: "" 31 | PARTIAL_PRETRAIN_PREFIX_CHANGES: [] 32 | IMAGE_NUM_LAYERS: 101 33 | IMAGE_C5_DILATED: true 34 | IMAGE_STRIDE_IN_1x1: true 35 | PIXEL_MEANS: 36 | - 102.9801 37 | - 115.9465 38 | - 122.7717 39 | PIXEL_STDS: 40 | - 1.0 41 | - 1.0 42 | - 1.0 43 | IMAGE_FEAT_PRECOMPUTED: true 44 | IMAGE_PRETRAINED: '' 45 | IMAGE_PRETRAINED_EPOCH: 0 46 | IMAGE_FROZEN_BACKBONE_STAGES: 47 | - 1 48 | - 2 49 | IMAGE_FROZEN_BN: true 50 | IMAGE_FINAL_DIM: 768 51 | IMAGE_SEMANTIC: false 52 | OUTPUT_CONV5: false 53 | BERT_MODEL_NAME: './model/pretrained_model/bert-base-uncased' 54 | BERT_PRETRAINED: '' 55 | BERT_PRETRAINED_EPOCH: 0 56 | BERT_FROZEN: false 57 | ENABLE_CNN_REG_LOSS: false 58 | MLM_LOSS_NORM_IN_BATCH_FIRST: false 59 | MVRC_LOSS_NORM_IN_BATCH_FIRST: false 60 | WITH_REL_LOSS: false 61 | WITH_MLM_LOSS: true 62 | WITH_MVRC_LOSS: true 63 | 64 | VLBERT: 65 | with_pooler: false 66 | input_transform_type: 1 67 | visual_size: 768 68 | hidden_size: 768 69 | num_hidden_layers: 12 70 | num_attention_heads: 12 71 | intermediate_size: 3072 72 | hidden_act: "gelu" 73 | hidden_dropout_prob: 0.1 74 | attention_probs_dropout_prob: 0.1 75 | max_position_embeddings: 512 76 | type_vocab_size: 3 77 | vocab_size: 30522 78 | initializer_range: 0.02 79 | visual_scale_text_init: 0.0 80 | visual_scale_object_init: 0.0 81 | visual_ln: true 82 | pos_embedding_frozen: false 83 | 84 | TRAIN: 85 | SHUFFLE: true 86 | FLIP_PROB: 0.5 87 | BATCH_IMAGES: 64 88 | ASPECT_GROUPING: false 89 | RESUME: false 90 | AUTO_RESUME: true 91 | BEGIN_EPOCH: 0 92 | END_EPOCH: 10 93 | OPTIMIZER: 'AdamW' 94 | CLIP_GRAD_NORM: 10 95 | LR: 1.0e-7 96 | LR_SCHEDULE: 'triangle' 97 | WD: 0.0001 98 | WARMUP: true 99 | WARMUP_METHOD: 'linear' 100 | WARMUP_FACTOR: 0.0 101 | WARMUP_STEPS: 8000 102 | FP16: false 103 | FP16_LOSS_SCALE: 128.0 104 | LOSS_LOGGERS: 105 | - "mlm_loss,MLMLossWVC" 106 | - "mvrc_loss,MVRCLoss" 107 | 108 | VAL: 109 | SHUFFLE: false 110 | FLIP_PROB: 0 111 | BATCH_IMAGES: 64 112 | 113 | TEST: 114 | SHUFFLE: false 115 | FLIP_PROB: 0 116 | TEST_EPOCH: 0 117 | BATCH_IMAGES: 64 -------------------------------------------------------------------------------- /cfgs/pretrain/large_e2e_16x16G_fp16.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | RNG_SEED: 12345 3 | OUTPUT_PATH: './output/pretrain/vlbert' 4 | MODULE: ResNetVLBERTForPretrainingMultitask 5 | GPUS: '0,1,2,3' 6 | LOG_FREQUENT: 100 7 | VAL_FREQUENT: 1 8 | CHECKPOINT_FREQUENT: 1 9 | MODEL_PREFIX: 'vl-bert_large_res101_pretrain_multitask' 10 | NUM_WORKERS_PER_GPU: 4 11 | SCALES: 12 | - 600 13 | - 1000 14 | 15 | DATASET: 16 | 17 | - DATASET: conceptual_captions 18 | APPEND_INDEX: false 19 | DATASET_PATH: './data/conceptual-captions' 20 | ROOT_PATH: './' 21 | TRAIN_IMAGE_SET: 'train' 22 | VAL_IMAGE_SET: 'val' 23 | TEST_IMAGE_SET: 'val' 24 | ADD_IMAGE_AS_A_BOX: true 25 | ZIP_MODE: false 26 | CACHE_MODE: false 27 | IGNORE_DB_CACHE: 
false 28 | MASK_SIZE: 14 29 | 30 | - DATASET: general_corpus 31 | TRAIN_ANNOTATION_FILE: './data/en_corpus/bc1g.doc+./data/en_corpus/wiki.doc' 32 | VAL_ANNOTATION_FILE: './data/en_corpus/bc1g.doc' 33 | TEST_ANNOTATION_FILE: './data/en_corpus/bc1g.doc' 34 | SEQ_LEN: 64 35 | MIN_SEQ_LEN: 64 36 | 37 | NETWORK: 38 | PARTIAL_PRETRAIN: "" 39 | PARTIAL_PRETRAIN_PREFIX_CHANGES: [] 40 | IMAGE_NUM_LAYERS: 101 41 | IMAGE_C5_DILATED: true 42 | IMAGE_STRIDE_IN_1x1: true 43 | PIXEL_MEANS: 44 | - 102.9801 45 | - 115.9465 46 | - 122.7717 47 | PIXEL_STDS: 48 | - 1.0 49 | - 1.0 50 | - 1.0 51 | IMAGE_FEAT_PRECOMPUTED: false 52 | IMAGE_PRETRAINED: './model/pretrained_model/resnet101-pt-vgbua' 53 | IMAGE_PRETRAINED_EPOCH: 0 54 | IMAGE_FROZEN_BACKBONE_STAGES: 55 | - 1 56 | - 2 57 | IMAGE_FROZEN_BN: true 58 | IMAGE_FINAL_DIM: 1024 59 | IMAGE_SEMANTIC: false 60 | OUTPUT_CONV5: false 61 | BERT_MODEL_NAME: './model/pretrained_model/bert-large-uncased' 62 | BERT_PRETRAINED: '' 63 | BERT_PRETRAINED_EPOCH: 0 64 | BERT_FROZEN: false 65 | ENABLE_CNN_REG_LOSS: false 66 | MLM_LOSS_NORM_IN_BATCH_FIRST: false 67 | MVRC_LOSS_NORM_IN_BATCH_FIRST: false 68 | WITH_REL_LOSS: false 69 | WITH_MLM_LOSS: true 70 | WITH_MVRC_LOSS: true 71 | 72 | VLBERT: 73 | with_pooler: false 74 | input_transform_type: 1 75 | visual_size: 1024 76 | hidden_size: 1024 77 | num_hidden_layers: 24 78 | num_attention_heads: 16 79 | intermediate_size: 4096 80 | hidden_act: "gelu" 81 | hidden_dropout_prob: 0.1 82 | attention_probs_dropout_prob: 0.1 83 | max_position_embeddings: 512 84 | type_vocab_size: 3 85 | vocab_size: 30522 86 | initializer_range: 0.02 87 | visual_scale_text_init: 0.0 88 | visual_scale_object_init: 0.0 89 | visual_ln: true 90 | pos_embedding_frozen: false 91 | 92 | TRAIN: 93 | SHUFFLE: true 94 | FLIP_PROB: 0.5 95 | BATCH_IMAGES: 96 | - 4 97 | - 4 98 | ASPECT_GROUPING: false 99 | RESUME: false 100 | AUTO_RESUME: true 101 | BEGIN_EPOCH: 0 102 | END_EPOCH: 10 103 | OPTIMIZER: 'AdamW' 104 | CLIP_GRAD_NORM: 10 105 | GRAD_ACCUMULATE_STEPS: 2 106 | LR: 1.0e-7 107 | LR_SCHEDULE: 'triangle' 108 | WD: 0.0001 109 | WARMUP: true 110 | WARMUP_METHOD: 'linear' 111 | WARMUP_FACTOR: 0.0 112 | WARMUP_STEPS: 16000 113 | FP16: true 114 | FP16_LOSS_SCALE: 'dynamic' 115 | LOSS_LOGGERS: 116 | - "mlm_loss_wvc,MLMLossWVC" 117 | - "mlm_loss_aux,MLMLossAUX" 118 | - "mvrc_loss,MVRCLoss" 119 | 120 | VAL: 121 | SHUFFLE: false 122 | FLIP_PROB: 0 123 | BATCH_IMAGES: 124 | - 4 125 | - 4 126 | 127 | TEST: 128 | SHUFFLE: false 129 | FLIP_PROB: 0 130 | TEST_EPOCH: 0 131 | BATCH_IMAGES: 132 | - 4 133 | - 4 -------------------------------------------------------------------------------- /cfgs/pretrain/large_prec_4x16G_fp16.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | RNG_SEED: 12345 3 | OUTPUT_PATH: './output/pretrain/vlbert' 4 | MODULE: ResNetVLBERTForPretrainingMultitask 5 | GPUS: '0,1,2,3' 6 | LOG_FREQUENT: 100 7 | VAL_FREQUENT: 1 8 | CHECKPOINT_FREQUENT: 1 9 | MODEL_PREFIX: 'vl-bert_large_res101_pretrain_multitask' 10 | NUM_WORKERS_PER_GPU: 4 11 | SCALES: 12 | - 600 13 | - 1000 14 | 15 | DATASET: 16 | 17 | - DATASET: conceptual_captions 18 | APPEND_INDEX: false 19 | DATASET_PATH: './data/conceptual-captions' 20 | ROOT_PATH: './' 21 | TRAIN_IMAGE_SET: 'train' 22 | VAL_IMAGE_SET: 'val' 23 | TEST_IMAGE_SET: 'val' 24 | ADD_IMAGE_AS_A_BOX: true 25 | ZIP_MODE: false 26 | CACHE_MODE: false 27 | IGNORE_DB_CACHE: false 28 | MASK_SIZE: 14 29 | 30 | - DATASET: general_corpus 31 | TRAIN_ANNOTATION_FILE: 
'./data/en_corpus/bc1g.doc+./data/en_corpus/wiki.doc' 32 | VAL_ANNOTATION_FILE: './data/en_corpus/bc1g.doc' 33 | TEST_ANNOTATION_FILE: './data/en_corpus/bc1g.doc' 34 | SEQ_LEN: 64 35 | MIN_SEQ_LEN: 64 36 | 37 | NETWORK: 38 | PARTIAL_PRETRAIN: "" 39 | PARTIAL_PRETRAIN_PREFIX_CHANGES: [] 40 | IMAGE_NUM_LAYERS: 101 41 | IMAGE_C5_DILATED: true 42 | IMAGE_STRIDE_IN_1x1: true 43 | PIXEL_MEANS: 44 | - 102.9801 45 | - 115.9465 46 | - 122.7717 47 | PIXEL_STDS: 48 | - 1.0 49 | - 1.0 50 | - 1.0 51 | IMAGE_FEAT_PRECOMPUTED: true 52 | IMAGE_PRETRAINED: '' 53 | IMAGE_PRETRAINED_EPOCH: 0 54 | IMAGE_FROZEN_BACKBONE_STAGES: 55 | - 1 56 | - 2 57 | IMAGE_FROZEN_BN: true 58 | IMAGE_FINAL_DIM: 1024 59 | IMAGE_SEMANTIC: false 60 | OUTPUT_CONV5: false 61 | BERT_MODEL_NAME: './model/pretrained_model/bert-large-uncased' 62 | BERT_PRETRAINED: '' 63 | BERT_PRETRAINED_EPOCH: 0 64 | BERT_FROZEN: false 65 | ENABLE_CNN_REG_LOSS: false 66 | MLM_LOSS_NORM_IN_BATCH_FIRST: false 67 | MVRC_LOSS_NORM_IN_BATCH_FIRST: false 68 | WITH_REL_LOSS: false 69 | WITH_MLM_LOSS: true 70 | WITH_MVRC_LOSS: true 71 | 72 | VLBERT: 73 | with_pooler: false 74 | input_transform_type: 1 75 | visual_size: 1024 76 | hidden_size: 1024 77 | num_hidden_layers: 24 78 | num_attention_heads: 16 79 | intermediate_size: 4096 80 | hidden_act: "gelu" 81 | hidden_dropout_prob: 0.1 82 | attention_probs_dropout_prob: 0.1 83 | max_position_embeddings: 512 84 | type_vocab_size: 3 85 | vocab_size: 30522 86 | initializer_range: 0.02 87 | visual_scale_text_init: 0.0 88 | visual_scale_object_init: 0.0 89 | visual_ln: true 90 | pos_embedding_frozen: false 91 | 92 | TRAIN: 93 | SHUFFLE: true 94 | FLIP_PROB: 0.5 95 | BATCH_IMAGES: 96 | - 32 97 | - 32 98 | ASPECT_GROUPING: false 99 | RESUME: false 100 | AUTO_RESUME: true 101 | BEGIN_EPOCH: 0 102 | END_EPOCH: 10 103 | OPTIMIZER: 'AdamW' 104 | CLIP_GRAD_NORM: 10 105 | LR: 1.0e-7 106 | LR_SCHEDULE: 'triangle' 107 | WD: 0.0001 108 | WARMUP: true 109 | WARMUP_METHOD: 'linear' 110 | WARMUP_FACTOR: 0.0 111 | WARMUP_STEPS: 16000 112 | FP16: true 113 | FP16_LOSS_SCALE: 'dynamic' 114 | LOSS_LOGGERS: 115 | - "mlm_loss_wvc,MLMLossWVC" 116 | - "mlm_loss_aux,MLMLossAUX" 117 | - "mvrc_loss,MVRCLoss" 118 | 119 | VAL: 120 | SHUFFLE: false 121 | FLIP_PROB: 0 122 | BATCH_IMAGES: 123 | - 32 124 | - 32 125 | 126 | TEST: 127 | SHUFFLE: false 128 | FLIP_PROB: 0 129 | TEST_EPOCH: 0 130 | BATCH_IMAGES: 131 | - 32 132 | - 32 -------------------------------------------------------------------------------- /cfgs/pretrain/vis_attention_maps_coco.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | RNG_SEED: 12345 3 | OUTPUT_PATH: './output/pretrain/vlbert' 4 | MODULE: ResNetVLBERTForAttentionVis 5 | GPUS: '0' 6 | LOG_FREQUENT: 100 7 | VAL_FREQUENT: 1 8 | CHECKPOINT_FREQUENT: 1 9 | MODEL_PREFIX: 'vl-bert_base_res101_pretrain' 10 | NUM_WORKERS_PER_GPU: 2 11 | SCALES: 12 | - 600 13 | - 1000 14 | 15 | DATASET: 16 | DATASET: coco_captions 17 | APPEND_INDEX: false 18 | DATASET_PATH: './data/coco' 19 | ROOT_PATH: './' 20 | TRAIN_IMAGE_SET: 'train' 21 | VAL_IMAGE_SET: 'val' 22 | TEST_IMAGE_SET: 'val' 23 | ADD_IMAGE_AS_A_BOX: true 24 | ZIP_MODE: false 25 | CACHE_MODE: false 26 | IGNORE_DB_CACHE: false 27 | MASK_SIZE: 14 28 | 29 | NETWORK: 30 | PARTIAL_PRETRAIN: "./model/pretrained_model/vl-bert-base-e2e.model" 31 | PARTIAL_PRETRAIN_PREFIX_CHANGES: [] 32 | IMAGE_NUM_LAYERS: 101 33 | IMAGE_C5_DILATED: true 34 | IMAGE_STRIDE_IN_1x1: true 35 | PIXEL_MEANS: 36 | - 102.9801 37 | - 115.9465 38 | - 122.7717 39 | 
PIXEL_STDS: 40 | - 1.0 41 | - 1.0 42 | - 1.0 43 | IMAGE_FEAT_PRECOMPUTED: false 44 | IMAGE_PRETRAINED: './model/pretrained_model/resnet101-pt-vgbua' 45 | IMAGE_PRETRAINED_EPOCH: 0 46 | IMAGE_FROZEN_BACKBONE_STAGES: 47 | - 1 48 | - 2 49 | IMAGE_FROZEN_BN: true 50 | IMAGE_FINAL_DIM: 768 51 | IMAGE_SEMANTIC: false 52 | OUTPUT_CONV5: false 53 | BERT_MODEL_NAME: './model/pretrained_model/bert-base-uncased' 54 | BERT_PRETRAINED: '' 55 | BERT_PRETRAINED_EPOCH: 0 56 | BERT_FROZEN: false 57 | ENABLE_CNN_REG_LOSS: false 58 | MLM_LOSS_NORM_IN_BATCH_FIRST: false 59 | MVRC_LOSS_NORM_IN_BATCH_FIRST: false 60 | WITH_REL_LOSS: false 61 | WITH_MLM_LOSS: false 62 | WITH_MVRC_LOSS: false 63 | 64 | VLBERT: 65 | with_pooler: false 66 | input_transform_type: 1 67 | visual_size: 768 68 | hidden_size: 768 69 | num_hidden_layers: 12 70 | num_attention_heads: 12 71 | intermediate_size: 3072 72 | hidden_act: "gelu" 73 | hidden_dropout_prob: 0.1 74 | attention_probs_dropout_prob: 0.1 75 | max_position_embeddings: 512 76 | type_vocab_size: 3 77 | vocab_size: 30522 78 | initializer_range: 0.02 79 | visual_scale_text_init: 0.0 80 | visual_scale_object_init: 0.0 81 | visual_ln: true 82 | pos_embedding_frozen: false 83 | 84 | VAL: 85 | SHUFFLE: false 86 | FLIP_PROB: 0 87 | BATCH_IMAGES: 1 -------------------------------------------------------------------------------- /cfgs/refcoco/base_detected_regions_4x16G.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | RNG_SEED: 12345 3 | OUTPUT_PATH: './output/vl-bert/refcoco+' 4 | MODULE: ResNetVLBERT 5 | GPUS: '0,1,2,3' 6 | LOG_FREQUENT: 100 7 | VAL_FREQUENT: 1 8 | CHECKPOINT_FREQUENT: 1 9 | MODEL_PREFIX: 'vl-bert_base_res101_refcoco' 10 | NUM_WORKERS_PER_GPU: 4 11 | SCALES: 12 | - 600 13 | - 1000 14 | 15 | DATASET: 16 | DATASET: refcoco+ 17 | LABEL_INDEX_IN_BATCH: -1 18 | APPEND_INDEX: false 19 | DATASET_PATH: './data/coco' 20 | ROOT_PATH: './' 21 | TRAIN_IMAGE_SET: 'train' 22 | VAL_IMAGE_SET: 'val' 23 | TEST_IMAGE_SET: 'test' 24 | ADD_IMAGE_AS_A_BOX: true 25 | ZIP_MODE: false 26 | CACHE_MODE: false 27 | IGNORE_DB_CACHE: true 28 | TRAIN_BOXES: "proposal+gt" 29 | VAL_BOXES: "proposal" 30 | TEST_BOXES: "proposal" 31 | 32 | 33 | NETWORK: 34 | PARTIAL_PRETRAIN: "./model/pretrained_model/vl-bert-base-e2e.model" 35 | PARTIAL_PRETRAIN_PREFIX_CHANGES: 36 | - "vlbert.mvrc_head.transform->final_mlp.0" 37 | - "module.vlbert.mvrc_head.transform->module.final_mlp.0" 38 | - "vlbert->vlbert" 39 | - "module.vlbert->module.vlbert" 40 | IMAGE_NUM_LAYERS: 101 41 | IMAGE_C5_DILATED: true 42 | IMAGE_STRIDE_IN_1x1: true 43 | PIXEL_MEANS: 44 | - 102.9801 45 | - 115.9465 46 | - 122.7717 47 | PIXEL_STDS: 48 | - 1.0 49 | - 1.0 50 | - 1.0 51 | IMAGE_FEAT_PRECOMPUTED: false 52 | IMAGE_PRETRAINED: './model/pretrained_model/resnet101-pt-vgbua' 53 | IMAGE_PRETRAINED_EPOCH: 0 54 | IMAGE_FROZEN_BACKBONE_STAGES: 55 | - 1 56 | - 2 57 | IMAGE_FROZEN_BN: true 58 | IMAGE_FINAL_DIM: 768 59 | IMAGE_SEMANTIC: false 60 | OUTPUT_CONV5: false 61 | BERT_MODEL_NAME: './model/pretrained_model/bert-base-uncased' 62 | BERT_PRETRAINED: '' 63 | BERT_PRETRAINED_EPOCH: 0 64 | BERT_FROZEN: false 65 | ENABLE_CNN_REG_LOSS: false 66 | 67 | VLBERT: 68 | input_transform_type: 1 69 | visual_size: 768 70 | hidden_size: 768 71 | num_hidden_layers: 12 72 | num_attention_heads: 12 73 | intermediate_size: 3072 74 | hidden_act: "gelu" 75 | hidden_dropout_prob: 0.1 76 | attention_probs_dropout_prob: 0.1 77 | max_position_embeddings: 512 78 | type_vocab_size: 3 79 | vocab_size: 30522 80 | 
initializer_range: 0.02 81 | visual_scale_text_init: 0.0 82 | visual_scale_object_init: 0.0 83 | visual_ln: true 84 | 85 | CLASSIFIER_DROPOUT: 0.0 86 | 87 | TRAIN: 88 | SHUFFLE: true 89 | FLIP_PROB: 0.5 90 | BATCH_IMAGES: 4 91 | ASPECT_GROUPING: true 92 | RESUME: false 93 | AUTO_RESUME: true 94 | BEGIN_EPOCH: 0 95 | END_EPOCH: 20 96 | OPTIMIZER: 'AdamW' 97 | CLIP_GRAD_NORM: 1.0 98 | GRAD_ACCUMULATE_STEPS: 2 99 | LR: 8.00e-7 100 | LR_SCHEDULE: 'triangle' 101 | WD: 0.0001 102 | WARMUP: true 103 | WARMUP_METHOD: 'linear' 104 | WARMUP_FACTOR: 0.0 105 | WARMUP_STEPS: 3750 106 | FP16: false 107 | FP16_LOSS_SCALE: 128.0 108 | 109 | VAL: 110 | SHUFFLE: false 111 | FLIP_PROB: 0 112 | BATCH_IMAGES: 4 113 | 114 | TEST: 115 | SHUFFLE: false 116 | FLIP_PROB: 0 117 | TEST_EPOCH: 0 118 | BATCH_IMAGES: 4 119 | -------------------------------------------------------------------------------- /cfgs/refcoco/base_gt_boxes_4x16G.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | RNG_SEED: 12345 3 | OUTPUT_PATH: './output/vl-bert/refcoco+' 4 | MODULE: ResNetVLBERT 5 | GPUS: '0,1,2,3' 6 | LOG_FREQUENT: 100 7 | VAL_FREQUENT: 1 8 | CHECKPOINT_FREQUENT: 1 9 | MODEL_PREFIX: 'vl-bert_base_res101_refcoco' 10 | NUM_WORKERS_PER_GPU: 4 11 | SCALES: 12 | - 600 13 | - 1000 14 | 15 | DATASET: 16 | DATASET: refcoco+ 17 | LABEL_INDEX_IN_BATCH: -1 18 | APPEND_INDEX: false 19 | DATASET_PATH: './data/coco' 20 | ROOT_PATH: './' 21 | TRAIN_IMAGE_SET: 'train' 22 | VAL_IMAGE_SET: 'val' 23 | TEST_IMAGE_SET: 'test' 24 | ADD_IMAGE_AS_A_BOX: true 25 | ZIP_MODE: false 26 | CACHE_MODE: false 27 | IGNORE_DB_CACHE: true 28 | TRAIN_BOXES: "gt" 29 | VAL_BOXES: "gt" 30 | TEST_BOXES: "gt" 31 | 32 | 33 | NETWORK: 34 | PARTIAL_PRETRAIN: "./model/pretrained_model/vl-bert-base-e2e.model" 35 | PARTIAL_PRETRAIN_PREFIX_CHANGES: 36 | - "vlbert.mvrc_head.transform->final_mlp.0" 37 | - "module.vlbert.mvrc_head.transform->module.final_mlp.0" 38 | - "vlbert->vlbert" 39 | - "module.vlbert->module.vlbert" 40 | IMAGE_NUM_LAYERS: 101 41 | IMAGE_C5_DILATED: true 42 | IMAGE_STRIDE_IN_1x1: true 43 | PIXEL_MEANS: 44 | - 102.9801 45 | - 115.9465 46 | - 122.7717 47 | PIXEL_STDS: 48 | - 1.0 49 | - 1.0 50 | - 1.0 51 | IMAGE_FEAT_PRECOMPUTED: false 52 | IMAGE_PRETRAINED: './model/pretrained_model/resnet101-pt-vgbua' 53 | IMAGE_PRETRAINED_EPOCH: 0 54 | IMAGE_FROZEN_BACKBONE_STAGES: 55 | - 1 56 | - 2 57 | IMAGE_FROZEN_BN: true 58 | IMAGE_FINAL_DIM: 768 59 | IMAGE_SEMANTIC: false 60 | OUTPUT_CONV5: false 61 | BERT_MODEL_NAME: './model/pretrained_model/bert-base-uncased' 62 | BERT_PRETRAINED: '' 63 | BERT_PRETRAINED_EPOCH: 0 64 | BERT_FROZEN: false 65 | ENABLE_CNN_REG_LOSS: false 66 | 67 | VLBERT: 68 | input_transform_type: 1 69 | visual_size: 768 70 | hidden_size: 768 71 | num_hidden_layers: 12 72 | num_attention_heads: 12 73 | intermediate_size: 3072 74 | hidden_act: "gelu" 75 | hidden_dropout_prob: 0.1 76 | attention_probs_dropout_prob: 0.1 77 | max_position_embeddings: 512 78 | type_vocab_size: 3 79 | vocab_size: 30522 80 | initializer_range: 0.02 81 | visual_scale_text_init: 0.0 82 | visual_scale_object_init: 0.0 83 | visual_ln: true 84 | 85 | CLASSIFIER_DROPOUT: 0.0 86 | 87 | TRAIN: 88 | SHUFFLE: true 89 | FLIP_PROB: 0.5 90 | BATCH_IMAGES: 4 91 | ASPECT_GROUPING: true 92 | RESUME: false 93 | AUTO_RESUME: true 94 | BEGIN_EPOCH: 0 95 | END_EPOCH: 20 96 | OPTIMIZER: 'AdamW' 97 | CLIP_GRAD_NORM: 1.0 98 | GRAD_ACCUMULATE_STEPS: 2 99 | LR: 8.00e-7 100 | LR_SCHEDULE: 'triangle' 101 | WD: 0.0001 102 | WARMUP: true 103 | 
WARMUP_METHOD: 'linear' 104 | WARMUP_FACTOR: 0.0 105 | WARMUP_STEPS: 3750 106 | FP16: false 107 | FP16_LOSS_SCALE: 128.0 108 | 109 | VAL: 110 | SHUFFLE: false 111 | FLIP_PROB: 0 112 | BATCH_IMAGES: 4 113 | 114 | TEST: 115 | SHUFFLE: false 116 | FLIP_PROB: 0 117 | TEST_EPOCH: 0 118 | BATCH_IMAGES: 4 119 | -------------------------------------------------------------------------------- /cfgs/refcoco/large_detected_regions_4x16G.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | RNG_SEED: 12345 3 | OUTPUT_PATH: './output/vl-bert/refcoco+' 4 | MODULE: ResNetVLBERT 5 | GPUS: '0,1,2,3' 6 | LOG_FREQUENT: 100 7 | VAL_FREQUENT: 1 8 | CHECKPOINT_FREQUENT: 1 9 | MODEL_PREFIX: 'vl-bert_large_res101_refcoco' 10 | NUM_WORKERS_PER_GPU: 4 11 | SCALES: 12 | - 600 13 | - 1000 14 | 15 | DATASET: 16 | DATASET: refcoco+ 17 | LABEL_INDEX_IN_BATCH: -1 18 | APPEND_INDEX: false 19 | DATASET_PATH: './data/coco' 20 | ROOT_PATH: './' 21 | TRAIN_IMAGE_SET: 'train' 22 | VAL_IMAGE_SET: 'val' 23 | TEST_IMAGE_SET: 'test' 24 | ADD_IMAGE_AS_A_BOX: true 25 | ZIP_MODE: false 26 | CACHE_MODE: false 27 | IGNORE_DB_CACHE: true 28 | TRAIN_BOXES: "proposal+gt" 29 | VAL_BOXES: "proposal" 30 | TEST_BOXES: "proposal" 31 | 32 | 33 | NETWORK: 34 | PARTIAL_PRETRAIN: "./model/pretrained_model/vl-bert-large-e2e.model" 35 | PARTIAL_PRETRAIN_PREFIX_CHANGES: 36 | - "vlbert.mvrc_head.transform->final_mlp.0" 37 | - "module.vlbert.mvrc_head.transform->module.final_mlp.0" 38 | - "vlbert->vlbert" 39 | - "module.vlbert->module.vlbert" 40 | IMAGE_NUM_LAYERS: 101 41 | IMAGE_C5_DILATED: true 42 | IMAGE_STRIDE_IN_1x1: true 43 | PIXEL_MEANS: 44 | - 102.9801 45 | - 115.9465 46 | - 122.7717 47 | PIXEL_STDS: 48 | - 1.0 49 | - 1.0 50 | - 1.0 51 | IMAGE_FEAT_PRECOMPUTED: false 52 | IMAGE_PRETRAINED: './model/pretrained_model/resnet101-pt-vgbua' 53 | IMAGE_PRETRAINED_EPOCH: 0 54 | IMAGE_FROZEN_BACKBONE_STAGES: 55 | - 1 56 | - 2 57 | IMAGE_FROZEN_BN: true 58 | IMAGE_FINAL_DIM: 1024 59 | IMAGE_SEMANTIC: false 60 | OUTPUT_CONV5: false 61 | BERT_MODEL_NAME: './model/pretrained_model/bert-large-uncased' 62 | BERT_PRETRAINED: '' 63 | BERT_PRETRAINED_EPOCH: 0 64 | BERT_FROZEN: false 65 | ENABLE_CNN_REG_LOSS: false 66 | 67 | VLBERT: 68 | input_transform_type: 1 69 | visual_size: 1024 70 | hidden_size: 1024 71 | num_hidden_layers: 24 72 | num_attention_heads: 16 73 | intermediate_size: 4096 74 | hidden_act: "gelu" 75 | hidden_dropout_prob: 0.1 76 | attention_probs_dropout_prob: 0.1 77 | max_position_embeddings: 512 78 | type_vocab_size: 3 79 | vocab_size: 30522 80 | initializer_range: 0.02 81 | visual_scale_text_init: 0.0 82 | visual_scale_object_init: 0.0 83 | visual_ln: true 84 | 85 | CLASSIFIER_DROPOUT: 0.0 86 | 87 | TRAIN: 88 | SHUFFLE: true 89 | FLIP_PROB: 0.5 90 | BATCH_IMAGES: 2 91 | ASPECT_GROUPING: true 92 | RESUME: false 93 | AUTO_RESUME: true 94 | BEGIN_EPOCH: 0 95 | END_EPOCH: 20 96 | OPTIMIZER: 'AdamW' 97 | CLIP_GRAD_NORM: 1.0 98 | GRAD_ACCUMULATE_STEPS: 4 99 | LR: 8.00e-7 100 | LR_SCHEDULE: 'triangle' 101 | WD: 0.0001 102 | WARMUP: true 103 | WARMUP_METHOD: 'linear' 104 | WARMUP_FACTOR: 0.0 105 | WARMUP_STEPS: 3750 106 | FP16: false 107 | FP16_LOSS_SCALE: 128.0 108 | 109 | VAL: 110 | SHUFFLE: false 111 | FLIP_PROB: 0 112 | BATCH_IMAGES: 2 113 | 114 | TEST: 115 | SHUFFLE: false 116 | FLIP_PROB: 0 117 | TEST_EPOCH: 0 118 | BATCH_IMAGES: 2 -------------------------------------------------------------------------------- /cfgs/refcoco/large_gt_boxes_4x16G.yaml: 
-------------------------------------------------------------------------------- 1 | --- 2 | RNG_SEED: 12345 3 | OUTPUT_PATH: './output/vl-bert/refcoco+' 4 | MODULE: ResNetVLBERT 5 | GPUS: '0,1,2,3' 6 | LOG_FREQUENT: 100 7 | VAL_FREQUENT: 1 8 | CHECKPOINT_FREQUENT: 1 9 | MODEL_PREFIX: 'vl-bert_large_res101_refcoco' 10 | NUM_WORKERS_PER_GPU: 4 11 | SCALES: 12 | - 600 13 | - 1000 14 | 15 | DATASET: 16 | DATASET: refcoco+ 17 | LABEL_INDEX_IN_BATCH: -1 18 | APPEND_INDEX: false 19 | DATASET_PATH: './data/coco' 20 | ROOT_PATH: './' 21 | TRAIN_IMAGE_SET: 'train' 22 | VAL_IMAGE_SET: 'val' 23 | TEST_IMAGE_SET: 'test' 24 | ADD_IMAGE_AS_A_BOX: true 25 | ZIP_MODE: false 26 | CACHE_MODE: false 27 | IGNORE_DB_CACHE: true 28 | TRAIN_BOXES: "gt" 29 | VAL_BOXES: "gt" 30 | TEST_BOXES: "gt" 31 | 32 | 33 | NETWORK: 34 | PARTIAL_PRETRAIN: "./model/pretrained_model/vl-bert-large-e2e.model" 35 | PARTIAL_PRETRAIN_PREFIX_CHANGES: 36 | - "vlbert.mvrc_head.transform->final_mlp.0" 37 | - "module.vlbert.mvrc_head.transform->module.final_mlp.0" 38 | - "vlbert->vlbert" 39 | - "module.vlbert->module.vlbert" 40 | IMAGE_NUM_LAYERS: 101 41 | IMAGE_C5_DILATED: true 42 | IMAGE_STRIDE_IN_1x1: true 43 | PIXEL_MEANS: 44 | - 102.9801 45 | - 115.9465 46 | - 122.7717 47 | PIXEL_STDS: 48 | - 1.0 49 | - 1.0 50 | - 1.0 51 | IMAGE_FEAT_PRECOMPUTED: false 52 | IMAGE_PRETRAINED: './model/pretrained_model/resnet101-pt-vgbua' 53 | IMAGE_PRETRAINED_EPOCH: 0 54 | IMAGE_FROZEN_BACKBONE_STAGES: 55 | - 1 56 | - 2 57 | IMAGE_FROZEN_BN: true 58 | IMAGE_FINAL_DIM: 1024 59 | IMAGE_SEMANTIC: false 60 | OUTPUT_CONV5: false 61 | BERT_MODEL_NAME: './model/pretrained_model/bert-large-uncased' 62 | BERT_PRETRAINED: '' 63 | BERT_PRETRAINED_EPOCH: 0 64 | BERT_FROZEN: false 65 | ENABLE_CNN_REG_LOSS: false 66 | 67 | VLBERT: 68 | input_transform_type: 1 69 | visual_size: 1024 70 | hidden_size: 1024 71 | num_hidden_layers: 24 72 | num_attention_heads: 16 73 | intermediate_size: 4096 74 | hidden_act: "gelu" 75 | hidden_dropout_prob: 0.1 76 | attention_probs_dropout_prob: 0.1 77 | max_position_embeddings: 512 78 | type_vocab_size: 3 79 | vocab_size: 30522 80 | initializer_range: 0.02 81 | visual_scale_text_init: 0.0 82 | visual_scale_object_init: 0.0 83 | visual_ln: true 84 | 85 | CLASSIFIER_DROPOUT: 0.0 86 | 87 | TRAIN: 88 | SHUFFLE: true 89 | FLIP_PROB: 0.5 90 | BATCH_IMAGES: 2 91 | ASPECT_GROUPING: true 92 | RESUME: false 93 | AUTO_RESUME: true 94 | BEGIN_EPOCH: 0 95 | END_EPOCH: 20 96 | OPTIMIZER: 'AdamW' 97 | CLIP_GRAD_NORM: 1.0 98 | GRAD_ACCUMULATE_STEPS: 4 99 | LR: 8.00e-7 100 | LR_SCHEDULE: 'triangle' 101 | WD: 0.0001 102 | WARMUP: true 103 | WARMUP_METHOD: 'linear' 104 | WARMUP_FACTOR: 0.0 105 | WARMUP_STEPS: 3750 106 | FP16: false 107 | FP16_LOSS_SCALE: 128.0 108 | 109 | VAL: 110 | SHUFFLE: false 111 | FLIP_PROB: 0 112 | BATCH_IMAGES: 2 113 | 114 | TEST: 115 | SHUFFLE: false 116 | FLIP_PROB: 0 117 | TEST_EPOCH: 0 118 | BATCH_IMAGES: 2 -------------------------------------------------------------------------------- /cfgs/vcr/base_q2a_4x16G_fp32.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | RNG_SEED: 12345 3 | OUTPUT_PATH: './output/vl-bert/vcr' 4 | MODULE: ResNetVLBERT 5 | GPUS: '0,1,2,3' 6 | LOG_FREQUENT: 100 7 | VAL_FREQUENT: 1 8 | CHECKPOINT_FREQUENT: 1 9 | MODEL_PREFIX: 'vl-bert_base_a_res101' 10 | NUM_WORKERS_PER_GPU: 4 11 | SCALES: 12 | - 600 13 | - 1200 14 | 15 | DATASET: 16 | DATASET: vcr 17 | LABEL_INDEX_IN_BATCH: 7 18 | APPEND_INDEX: false 19 | TASK: 'Q2A' 20 | BASIC_ALIGN: false 21 | 
DATASET_PATH: './data/vcr' 22 | ROOT_PATH: './' 23 | TRAIN_IMAGE_SET: 'vcr1images' 24 | VAL_IMAGE_SET: 'vcr1images' 25 | TEST_IMAGE_SET: 'vcr1images' 26 | TRAIN_ANNOTATION_FILE: 'train.jsonl' 27 | VAL_ANNOTATION_FILE: 'val.jsonl' 28 | TEST_ANNOTATION_FILE: 'test.jsonl' 29 | ONLY_USE_RELEVANT_DETS: false 30 | ADD_IMAGE_AS_A_BOX: true 31 | ZIP_MODE: false 32 | CACHE_MODE: false 33 | IGNORE_DB_CACHE: true 34 | MASK_SIZE: 14 35 | 36 | NETWORK: 37 | PARTIAL_PRETRAIN: "./model/pretrained_model/vl-bert-base-e2e.model" 38 | PARTIAL_PRETRAIN_PREFIX_CHANGES: 39 | - "vlbert.mvrc_head.transform->cnn_loss_reg.0" 40 | - "module.vlbert.mvrc_head.transform->module.cnn_loss_reg.0" 41 | - "module.vlbert->module.vlbert._module" 42 | - "vlbert->vlbert._module" 43 | PARTIAL_PRETRAIN_SEGMB_INIT: true 44 | IMAGE_NUM_LAYERS: 101 45 | IMAGE_C5_DILATED: true 46 | IMAGE_STRIDE_IN_1x1: true 47 | PIXEL_MEANS: 48 | - 102.9801 49 | - 115.9465 50 | - 122.7717 51 | PIXEL_STDS: 52 | - 1.0 53 | - 1.0 54 | - 1.0 55 | IMAGE_PRETRAINED: './model/pretrained_model/resnet101-pt-vgbua' 56 | IMAGE_PRETRAINED_EPOCH: 0 57 | IMAGE_FROZEN_BACKBONE_STAGES: 58 | - 1 59 | - 2 60 | IMAGE_FROZEN_BN: true 61 | IMAGE_FINAL_DIM: 768 62 | IMAGE_SEMANTIC: false 63 | OUTPUT_CONV5: false 64 | BERT_MODEL_NAME: './model/pretrained_model/bert-base-uncased' 65 | BERT_PRETRAINED: '' 66 | BERT_PRETRAINED_EPOCH: 0 67 | BERT_FROZEN: false 68 | ENABLE_CNN_REG_LOSS: true 69 | ANS_LOSS_WEIGHT: 1.0 70 | CNN_LOSS_TOP: true 71 | 72 | VLBERT: 73 | input_transform_type: 1 74 | visual_size: 768 75 | hidden_size: 768 76 | num_hidden_layers: 12 77 | num_attention_heads: 12 78 | intermediate_size: 3072 79 | hidden_act: "gelu" 80 | hidden_dropout_prob: 0.1 81 | attention_probs_dropout_prob: 0.1 82 | max_position_embeddings: 512 83 | type_vocab_size: 3 84 | vocab_size: 30522 85 | initializer_range: 0.02 86 | visual_scale_text_init: 0.0 87 | visual_scale_object_init: 0.0 88 | visual_ln: true 89 | object_word_embed_mode: 2 90 | 91 | CLASSIFIER_TYPE: "1fc" 92 | CLASSIFIER_HIDDEN_SIZE: 1024 93 | CLASSIFIER_DROPOUT: 0.1 94 | CLASSIFIER_SIGMOID: true 95 | 96 | TRAIN: 97 | SHUFFLE: true 98 | FLIP_PROB: 0.5 99 | BATCH_IMAGES: 4 100 | ASPECT_GROUPING: false 101 | RESUME: false 102 | AUTO_RESUME: true 103 | BEGIN_EPOCH: 0 104 | END_EPOCH: 20 105 | OPTIMIZER: 'SGD' 106 | CLIP_GRAD_NORM: 10 107 | GRAD_ACCUMULATE_STEPS: 4 108 | LR_FACTOR: 0.1 109 | LR_STEP: "14,18" 110 | LR: 7.0e-5 111 | WD: 0.0001 112 | WARMUP: true 113 | WARMUP_METHOD: 'linear' 114 | WARMUP_FACTOR: 0.0 115 | WARMUP_STEPS: 1000 116 | MOMENTUM: 0.9 117 | FP16: false 118 | FP16_LOSS_SCALE: 128.0 119 | 120 | VAL: 121 | SHUFFLE: false 122 | FLIP_PROB: 0 123 | BATCH_IMAGES: 4 124 | 125 | TEST: 126 | SHUFFLE: false 127 | FLIP_PROB: 0 128 | TEST_EPOCH: 0 129 | BATCH_IMAGES: 4 -------------------------------------------------------------------------------- /cfgs/vcr/base_qa2r_4x16G_fp32.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | RNG_SEED: 12345 3 | OUTPUT_PATH: './output/vl-bert/vcr' 4 | MODULE: ResNetVLBERT 5 | GPUS: '0,1,2,3' 6 | LOG_FREQUENT: 100 7 | VAL_FREQUENT: 1 8 | CHECKPOINT_FREQUENT: 1 9 | MODEL_PREFIX: 'vl-bert_base_r_res101' 10 | NUM_WORKERS_PER_GPU: 4 11 | SCALES: 12 | - 600 13 | - 1200 14 | 15 | DATASET: 16 | DATASET: vcr 17 | LABEL_INDEX_IN_BATCH: 7 18 | APPEND_INDEX: false 19 | TASK: 'QA2R' 20 | BASIC_ALIGN: false 21 | DATASET_PATH: './data/vcr' 22 | ROOT_PATH: './' 23 | TRAIN_IMAGE_SET: 'vcr1images' 24 | VAL_IMAGE_SET: 'vcr1images' 25 | 
TEST_IMAGE_SET: 'vcr1images' 26 | TRAIN_ANNOTATION_FILE: 'train.jsonl' 27 | VAL_ANNOTATION_FILE: 'val.jsonl' 28 | TEST_ANNOTATION_FILE: 'test.jsonl' 29 | ONLY_USE_RELEVANT_DETS: false 30 | ADD_IMAGE_AS_A_BOX: true 31 | ZIP_MODE: false 32 | CACHE_MODE: false 33 | IGNORE_DB_CACHE: true 34 | MASK_SIZE: 14 35 | 36 | NETWORK: 37 | PARTIAL_PRETRAIN: "./model/pretrained_model/vl-bert-base-e2e.model" 38 | PARTIAL_PRETRAIN_PREFIX_CHANGES: 39 | - "vlbert.mvrc_head.transform->cnn_loss_reg.0" 40 | - "module.vlbert.mvrc_head.transform->module.cnn_loss_reg.0" 41 | - "module.vlbert->module.vlbert._module" 42 | - "vlbert->vlbert._module" 43 | PARTIAL_PRETRAIN_SEGMB_INIT: true 44 | IMAGE_NUM_LAYERS: 101 45 | IMAGE_C5_DILATED: true 46 | IMAGE_STRIDE_IN_1x1: true 47 | PIXEL_MEANS: 48 | - 102.9801 49 | - 115.9465 50 | - 122.7717 51 | PIXEL_STDS: 52 | - 1.0 53 | - 1.0 54 | - 1.0 55 | IMAGE_PRETRAINED: './model/pretrained_model/resnet101-pt-vgbua' 56 | IMAGE_PRETRAINED_EPOCH: 0 57 | IMAGE_FROZEN_BACKBONE_STAGES: 58 | - 1 59 | - 2 60 | IMAGE_FROZEN_BN: true 61 | IMAGE_FINAL_DIM: 768 62 | IMAGE_SEMANTIC: false 63 | OUTPUT_CONV5: false 64 | BERT_MODEL_NAME: './model/pretrained_model/bert-base-uncased' 65 | BERT_PRETRAINED: '' 66 | BERT_PRETRAINED_EPOCH: 0 67 | BERT_FROZEN: false 68 | ENABLE_CNN_REG_LOSS: true 69 | ANS_LOSS_WEIGHT: 1.0 70 | CNN_LOSS_TOP: true 71 | 72 | VLBERT: 73 | input_transform_type: 1 74 | visual_size: 768 75 | hidden_size: 768 76 | num_hidden_layers: 12 77 | num_attention_heads: 12 78 | intermediate_size: 3072 79 | hidden_act: "gelu" 80 | hidden_dropout_prob: 0.1 81 | attention_probs_dropout_prob: 0.1 82 | max_position_embeddings: 512 83 | type_vocab_size: 3 84 | vocab_size: 30522 85 | initializer_range: 0.02 86 | visual_scale_text_init: 0.0 87 | visual_scale_object_init: 0.0 88 | visual_ln: true 89 | object_word_embed_mode: 2 90 | 91 | CLASSIFIER_TYPE: "1fc" 92 | CLASSIFIER_HIDDEN_SIZE: 1024 93 | CLASSIFIER_DROPOUT: 0.1 94 | CLASSIFIER_SIGMOID: true 95 | 96 | TRAIN: 97 | SHUFFLE: true 98 | FLIP_PROB: 0.5 99 | BATCH_IMAGES: 4 100 | ASPECT_GROUPING: false 101 | RESUME: false 102 | AUTO_RESUME: true 103 | BEGIN_EPOCH: 0 104 | END_EPOCH: 20 105 | OPTIMIZER: 'SGD' 106 | CLIP_GRAD_NORM: 10 107 | GRAD_ACCUMULATE_STEPS: 4 108 | LR_FACTOR: 0.1 109 | LR_STEP: "14,18" 110 | LR: 7.0e-5 111 | WD: 0.0001 112 | WARMUP: true 113 | WARMUP_METHOD: 'linear' 114 | WARMUP_FACTOR: 0.0 115 | WARMUP_STEPS: 1000 116 | MOMENTUM: 0.9 117 | FP16: false 118 | FP16_LOSS_SCALE: 128.0 119 | 120 | VAL: 121 | SHUFFLE: false 122 | FLIP_PROB: 0 123 | BATCH_IMAGES: 4 124 | 125 | TEST: 126 | SHUFFLE: false 127 | FLIP_PROB: 0 128 | TEST_EPOCH: 0 129 | BATCH_IMAGES: 4 -------------------------------------------------------------------------------- /cfgs/vcr/large_q2a_16x16G_fp16.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | RNG_SEED: 12345 3 | OUTPUT_PATH: './output/vl-bert/vcr' 4 | MODULE: ResNetVLBERT 5 | GPUS: '0,1,2,3' 6 | LOG_FREQUENT: 100 7 | VAL_FREQUENT: 1 8 | CHECKPOINT_FREQUENT: 1 9 | MODEL_PREFIX: 'vl-bert_large_a_res101' 10 | NUM_WORKERS_PER_GPU: 4 11 | SCALES: 12 | - 600 13 | - 1200 14 | 15 | DATASET: 16 | DATASET: vcr 17 | LABEL_INDEX_IN_BATCH: 7 18 | APPEND_INDEX: false 19 | TASK: 'Q2A' 20 | BASIC_ALIGN: false 21 | DATASET_PATH: './data/vcr' 22 | ROOT_PATH: './' 23 | TRAIN_IMAGE_SET: 'vcr1images' 24 | VAL_IMAGE_SET: 'vcr1images' 25 | TEST_IMAGE_SET: 'vcr1images' 26 | TRAIN_ANNOTATION_FILE: 'train.jsonl' 27 | VAL_ANNOTATION_FILE: 'val.jsonl' 28 | 
TEST_ANNOTATION_FILE: 'test.jsonl' 29 | ONLY_USE_RELEVANT_DETS: false 30 | ADD_IMAGE_AS_A_BOX: true 31 | ZIP_MODE: false 32 | CACHE_MODE: false 33 | IGNORE_DB_CACHE: true 34 | MASK_SIZE: 14 35 | 36 | NETWORK: 37 | PARTIAL_PRETRAIN: "./model/pretrained_model/vl-bert-large-e2e.model" 38 | PARTIAL_PRETRAIN_PREFIX_CHANGES: 39 | - "vlbert.mvrc_head.transform->cnn_loss_reg.0" 40 | - "module.vlbert.mvrc_head.transform->module.cnn_loss_reg.0" 41 | - "module.vlbert->module.vlbert._module" 42 | - "vlbert->vlbert._module" 43 | PARTIAL_PRETRAIN_SEGMB_INIT: true 44 | IMAGE_NUM_LAYERS: 101 45 | IMAGE_C5_DILATED: true 46 | IMAGE_STRIDE_IN_1x1: true 47 | PIXEL_MEANS: 48 | - 102.9801 49 | - 115.9465 50 | - 122.7717 51 | PIXEL_STDS: 52 | - 1.0 53 | - 1.0 54 | - 1.0 55 | IMAGE_PRETRAINED: './model/pretrained_model/resnet101-pt-vgbua' 56 | IMAGE_PRETRAINED_EPOCH: 0 57 | IMAGE_FROZEN_BACKBONE_STAGES: 58 | - 1 59 | - 2 60 | IMAGE_FROZEN_BN: true 61 | IMAGE_FINAL_DIM: 1024 62 | IMAGE_SEMANTIC: false 63 | OUTPUT_CONV5: false 64 | BERT_MODEL_NAME: './model/pretrained_model/bert-large-uncased' 65 | BERT_PRETRAINED: '' 66 | BERT_PRETRAINED_EPOCH: 0 67 | BERT_FROZEN: false 68 | ENABLE_CNN_REG_LOSS: true 69 | ANS_LOSS_WEIGHT: 1.0 70 | CNN_LOSS_TOP: true 71 | 72 | VLBERT: 73 | input_transform_type: 1 74 | visual_size: 1024 75 | hidden_size: 1024 76 | num_hidden_layers: 24 77 | num_attention_heads: 16 78 | intermediate_size: 4096 79 | hidden_act: "gelu" 80 | hidden_dropout_prob: 0.1 81 | attention_probs_dropout_prob: 0.1 82 | max_position_embeddings: 512 83 | type_vocab_size: 3 84 | vocab_size: 30522 85 | initializer_range: 0.02 86 | visual_scale_text_init: 0.0 87 | visual_scale_object_init: 0.0 88 | visual_ln: true 89 | object_word_embed_mode: 2 90 | 91 | CLASSIFIER_TYPE: "1fc" 92 | CLASSIFIER_HIDDEN_SIZE: 1024 93 | CLASSIFIER_DROPOUT: 0.1 94 | CLASSIFIER_SIGMOID: true 95 | 96 | TRAIN: 97 | SHUFFLE: true 98 | FLIP_PROB: 0.5 99 | BATCH_IMAGES: 4 100 | ASPECT_GROUPING: false 101 | RESUME: false 102 | AUTO_RESUME: true 103 | BEGIN_EPOCH: 0 104 | END_EPOCH: 20 105 | OPTIMIZER: 'SGD' 106 | CLIP_GRAD_NORM: 10 107 | LR_FACTOR: 0.1 108 | LR_STEP: "14,18" 109 | LR: 7.0e-5 110 | WD: 0.0001 111 | WARMUP: true 112 | WARMUP_METHOD: 'linear' 113 | WARMUP_FACTOR: 0.0 114 | WARMUP_STEPS: 1000 115 | MOMENTUM: 0.9 116 | FP16: true 117 | FP16_LOSS_SCALE: 'dynamic' 118 | 119 | VAL: 120 | SHUFFLE: false 121 | FLIP_PROB: 0 122 | BATCH_IMAGES: 4 123 | 124 | TEST: 125 | SHUFFLE: false 126 | FLIP_PROB: 0 127 | TEST_EPOCH: 0 128 | BATCH_IMAGES: 4 -------------------------------------------------------------------------------- /cfgs/vcr/large_q2a_4x16G_fp16.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | RNG_SEED: 12345 3 | OUTPUT_PATH: './output/vl-bert/vcr' 4 | MODULE: ResNetVLBERT 5 | GPUS: '0,1,2,3' 6 | LOG_FREQUENT: 100 7 | VAL_FREQUENT: 1 8 | CHECKPOINT_FREQUENT: 1 9 | MODEL_PREFIX: 'vl-bert_large_a_res101' 10 | NUM_WORKERS_PER_GPU: 4 11 | SCALES: 12 | - 600 13 | - 1200 14 | 15 | DATASET: 16 | DATASET: vcr 17 | LABEL_INDEX_IN_BATCH: 7 18 | APPEND_INDEX: false 19 | TASK: 'Q2A' 20 | BASIC_ALIGN: false 21 | DATASET_PATH: './data/vcr' 22 | ROOT_PATH: './' 23 | TRAIN_IMAGE_SET: 'vcr1images' 24 | VAL_IMAGE_SET: 'vcr1images' 25 | TEST_IMAGE_SET: 'vcr1images' 26 | TRAIN_ANNOTATION_FILE: 'train.jsonl' 27 | VAL_ANNOTATION_FILE: 'val.jsonl' 28 | TEST_ANNOTATION_FILE: 'test.jsonl' 29 | ONLY_USE_RELEVANT_DETS: false 30 | ADD_IMAGE_AS_A_BOX: true 31 | ZIP_MODE: false 32 | CACHE_MODE: false 33 | 
IGNORE_DB_CACHE: true 34 | MASK_SIZE: 14 35 | 36 | NETWORK: 37 | PARTIAL_PRETRAIN: "./model/pretrained_model/vl-bert-large-e2e.model" 38 | PARTIAL_PRETRAIN_PREFIX_CHANGES: 39 | - "vlbert.mvrc_head.transform->cnn_loss_reg.0" 40 | - "module.vlbert.mvrc_head.transform->module.cnn_loss_reg.0" 41 | - "module.vlbert->module.vlbert._module" 42 | - "vlbert->vlbert._module" 43 | PARTIAL_PRETRAIN_SEGMB_INIT: true 44 | IMAGE_NUM_LAYERS: 101 45 | IMAGE_C5_DILATED: true 46 | IMAGE_STRIDE_IN_1x1: true 47 | PIXEL_MEANS: 48 | - 102.9801 49 | - 115.9465 50 | - 122.7717 51 | PIXEL_STDS: 52 | - 1.0 53 | - 1.0 54 | - 1.0 55 | IMAGE_PRETRAINED: './model/pretrained_model/resnet101-pt-vgbua' 56 | IMAGE_PRETRAINED_EPOCH: 0 57 | IMAGE_FROZEN_BACKBONE_STAGES: 58 | - 1 59 | - 2 60 | IMAGE_FROZEN_BN: true 61 | IMAGE_FINAL_DIM: 1024 62 | IMAGE_SEMANTIC: false 63 | OUTPUT_CONV5: false 64 | BERT_MODEL_NAME: './model/pretrained_model/bert-large-uncased' 65 | BERT_PRETRAINED: '' 66 | BERT_PRETRAINED_EPOCH: 0 67 | BERT_FROZEN: false 68 | ENABLE_CNN_REG_LOSS: true 69 | ANS_LOSS_WEIGHT: 1.0 70 | CNN_LOSS_TOP: true 71 | 72 | VLBERT: 73 | input_transform_type: 1 74 | visual_size: 1024 75 | hidden_size: 1024 76 | num_hidden_layers: 24 77 | num_attention_heads: 16 78 | intermediate_size: 4096 79 | hidden_act: "gelu" 80 | hidden_dropout_prob: 0.1 81 | attention_probs_dropout_prob: 0.1 82 | max_position_embeddings: 512 83 | type_vocab_size: 3 84 | vocab_size: 30522 85 | initializer_range: 0.02 86 | visual_scale_text_init: 0.0 87 | visual_scale_object_init: 0.0 88 | visual_ln: true 89 | object_word_embed_mode: 2 90 | 91 | CLASSIFIER_TYPE: "1fc" 92 | CLASSIFIER_HIDDEN_SIZE: 1024 93 | CLASSIFIER_DROPOUT: 0.1 94 | CLASSIFIER_SIGMOID: true 95 | 96 | TRAIN: 97 | SHUFFLE: true 98 | FLIP_PROB: 0.5 99 | BATCH_IMAGES: 4 100 | ASPECT_GROUPING: false 101 | RESUME: false 102 | AUTO_RESUME: true 103 | BEGIN_EPOCH: 0 104 | END_EPOCH: 20 105 | OPTIMIZER: 'SGD' 106 | CLIP_GRAD_NORM: 10 107 | GRAD_ACCUMULATE_STEPS: 4 108 | LR_FACTOR: 0.1 109 | LR_STEP: "14,18" 110 | LR: 7.0e-5 111 | WD: 0.0001 112 | WARMUP: true 113 | WARMUP_METHOD: 'linear' 114 | WARMUP_FACTOR: 0.0 115 | WARMUP_STEPS: 1000 116 | MOMENTUM: 0.9 117 | FP16: true 118 | FP16_LOSS_SCALE: 'dynamic' 119 | 120 | VAL: 121 | SHUFFLE: false 122 | FLIP_PROB: 0 123 | BATCH_IMAGES: 4 124 | 125 | TEST: 126 | SHUFFLE: false 127 | FLIP_PROB: 0 128 | TEST_EPOCH: 0 129 | BATCH_IMAGES: 4 -------------------------------------------------------------------------------- /cfgs/vcr/large_qa2r_16x16G_fp16.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | RNG_SEED: 12345 3 | OUTPUT_PATH: './output/vl-bert/vcr' 4 | MODULE: ResNetVLBERT 5 | GPUS: '0,1,2,3' 6 | LOG_FREQUENT: 100 7 | VAL_FREQUENT: 1 8 | CHECKPOINT_FREQUENT: 1 9 | MODEL_PREFIX: 'vl-bert_large_r_res101' 10 | NUM_WORKERS_PER_GPU: 4 11 | SCALES: 12 | - 600 13 | - 1200 14 | 15 | DATASET: 16 | DATASET: vcr 17 | LABEL_INDEX_IN_BATCH: 7 18 | APPEND_INDEX: false 19 | TASK: 'QA2R' 20 | BASIC_ALIGN: false 21 | DATASET_PATH: './data/vcr' 22 | ROOT_PATH: './' 23 | TRAIN_IMAGE_SET: 'vcr1images' 24 | VAL_IMAGE_SET: 'vcr1images' 25 | TEST_IMAGE_SET: 'vcr1images' 26 | TRAIN_ANNOTATION_FILE: 'train.jsonl' 27 | VAL_ANNOTATION_FILE: 'val.jsonl' 28 | TEST_ANNOTATION_FILE: 'test.jsonl' 29 | ONLY_USE_RELEVANT_DETS: false 30 | ADD_IMAGE_AS_A_BOX: true 31 | ZIP_MODE: false 32 | CACHE_MODE: false 33 | IGNORE_DB_CACHE: true 34 | MASK_SIZE: 14 35 | 36 | NETWORK: 37 | PARTIAL_PRETRAIN: 
"./model/pretrained_model/vl-bert-large-e2e.model" 38 | PARTIAL_PRETRAIN_PREFIX_CHANGES: 39 | - "vlbert.mvrc_head.transform->cnn_loss_reg.0" 40 | - "module.vlbert.mvrc_head.transform->module.cnn_loss_reg.0" 41 | - "module.vlbert->module.vlbert._module" 42 | - "vlbert->vlbert._module" 43 | PARTIAL_PRETRAIN_SEGMB_INIT: true 44 | IMAGE_NUM_LAYERS: 101 45 | IMAGE_C5_DILATED: true 46 | IMAGE_STRIDE_IN_1x1: true 47 | PIXEL_MEANS: 48 | - 102.9801 49 | - 115.9465 50 | - 122.7717 51 | PIXEL_STDS: 52 | - 1.0 53 | - 1.0 54 | - 1.0 55 | IMAGE_PRETRAINED: './model/pretrained_model/resnet101-pt-vgbua' 56 | IMAGE_PRETRAINED_EPOCH: 0 57 | IMAGE_FROZEN_BACKBONE_STAGES: 58 | - 1 59 | - 2 60 | IMAGE_FROZEN_BN: true 61 | IMAGE_FINAL_DIM: 1024 62 | IMAGE_SEMANTIC: false 63 | OUTPUT_CONV5: false 64 | BERT_MODEL_NAME: './model/pretrained_model/bert-large-uncased' 65 | BERT_PRETRAINED: '' 66 | BERT_PRETRAINED_EPOCH: 0 67 | BERT_FROZEN: false 68 | ENABLE_CNN_REG_LOSS: true 69 | ANS_LOSS_WEIGHT: 1.0 70 | CNN_LOSS_TOP: true 71 | 72 | VLBERT: 73 | input_transform_type: 1 74 | visual_size: 1024 75 | hidden_size: 1024 76 | num_hidden_layers: 24 77 | num_attention_heads: 16 78 | intermediate_size: 4096 79 | hidden_act: "gelu" 80 | hidden_dropout_prob: 0.1 81 | attention_probs_dropout_prob: 0.1 82 | max_position_embeddings: 512 83 | type_vocab_size: 3 84 | vocab_size: 30522 85 | initializer_range: 0.02 86 | visual_scale_text_init: 0.0 87 | visual_scale_object_init: 0.0 88 | visual_ln: true 89 | object_word_embed_mode: 2 90 | 91 | CLASSIFIER_TYPE: "1fc" 92 | CLASSIFIER_HIDDEN_SIZE: 1024 93 | CLASSIFIER_DROPOUT: 0.1 94 | CLASSIFIER_SIGMOID: true 95 | 96 | TRAIN: 97 | SHUFFLE: true 98 | FLIP_PROB: 0.5 99 | BATCH_IMAGES: 4 100 | ASPECT_GROUPING: false 101 | RESUME: false 102 | AUTO_RESUME: true 103 | BEGIN_EPOCH: 0 104 | END_EPOCH: 20 105 | OPTIMIZER: 'SGD' 106 | CLIP_GRAD_NORM: 10 107 | LR_FACTOR: 0.1 108 | LR_STEP: "14,18" 109 | LR: 7.0e-5 110 | WD: 0.0001 111 | WARMUP: true 112 | WARMUP_METHOD: 'linear' 113 | WARMUP_FACTOR: 0.0 114 | WARMUP_STEPS: 1000 115 | MOMENTUM: 0.9 116 | FP16: true 117 | FP16_LOSS_SCALE: 'dynamic' 118 | 119 | VAL: 120 | SHUFFLE: false 121 | FLIP_PROB: 0 122 | BATCH_IMAGES: 4 123 | 124 | TEST: 125 | SHUFFLE: false 126 | FLIP_PROB: 0 127 | TEST_EPOCH: 0 128 | BATCH_IMAGES: 4 -------------------------------------------------------------------------------- /cfgs/vcr/large_qa2r_4x16G_fp16.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | RNG_SEED: 12345 3 | OUTPUT_PATH: './output/vl-bert/vcr' 4 | MODULE: ResNetVLBERT 5 | GPUS: '0,1,2,3' 6 | LOG_FREQUENT: 100 7 | VAL_FREQUENT: 1 8 | CHECKPOINT_FREQUENT: 1 9 | MODEL_PREFIX: 'vl-bert_large_r_res101' 10 | NUM_WORKERS_PER_GPU: 4 11 | SCALES: 12 | - 600 13 | - 1200 14 | 15 | DATASET: 16 | DATASET: vcr 17 | LABEL_INDEX_IN_BATCH: 7 18 | APPEND_INDEX: false 19 | TASK: 'QA2R' 20 | BASIC_ALIGN: false 21 | DATASET_PATH: './data/vcr' 22 | ROOT_PATH: './' 23 | TRAIN_IMAGE_SET: 'vcr1images' 24 | VAL_IMAGE_SET: 'vcr1images' 25 | TEST_IMAGE_SET: 'vcr1images' 26 | TRAIN_ANNOTATION_FILE: 'train.jsonl' 27 | VAL_ANNOTATION_FILE: 'val.jsonl' 28 | TEST_ANNOTATION_FILE: 'test.jsonl' 29 | ONLY_USE_RELEVANT_DETS: false 30 | ADD_IMAGE_AS_A_BOX: true 31 | ZIP_MODE: false 32 | CACHE_MODE: false 33 | IGNORE_DB_CACHE: true 34 | MASK_SIZE: 14 35 | 36 | NETWORK: 37 | PARTIAL_PRETRAIN: "./model/pretrained_model/vl-bert-large-e2e.model" 38 | PARTIAL_PRETRAIN_PREFIX_CHANGES: 39 | - "vlbert.mvrc_head.transform->cnn_loss_reg.0" 40 | - 
"module.vlbert.mvrc_head.transform->module.cnn_loss_reg.0" 41 | - "module.vlbert->module.vlbert._module" 42 | - "vlbert->vlbert._module" 43 | PARTIAL_PRETRAIN_SEGMB_INIT: true 44 | IMAGE_NUM_LAYERS: 101 45 | IMAGE_C5_DILATED: true 46 | IMAGE_STRIDE_IN_1x1: true 47 | PIXEL_MEANS: 48 | - 102.9801 49 | - 115.9465 50 | - 122.7717 51 | PIXEL_STDS: 52 | - 1.0 53 | - 1.0 54 | - 1.0 55 | IMAGE_PRETRAINED: './model/pretrained_model/resnet101-pt-vgbua' 56 | IMAGE_PRETRAINED_EPOCH: 0 57 | IMAGE_FROZEN_BACKBONE_STAGES: 58 | - 1 59 | - 2 60 | IMAGE_FROZEN_BN: true 61 | IMAGE_FINAL_DIM: 1024 62 | IMAGE_SEMANTIC: false 63 | OUTPUT_CONV5: false 64 | BERT_MODEL_NAME: './model/pretrained_model/bert-large-uncased' 65 | BERT_PRETRAINED: '' 66 | BERT_PRETRAINED_EPOCH: 0 67 | BERT_FROZEN: false 68 | ENABLE_CNN_REG_LOSS: true 69 | ANS_LOSS_WEIGHT: 1.0 70 | CNN_LOSS_TOP: true 71 | 72 | VLBERT: 73 | input_transform_type: 1 74 | visual_size: 1024 75 | hidden_size: 1024 76 | num_hidden_layers: 24 77 | num_attention_heads: 16 78 | intermediate_size: 4096 79 | hidden_act: "gelu" 80 | hidden_dropout_prob: 0.1 81 | attention_probs_dropout_prob: 0.1 82 | max_position_embeddings: 512 83 | type_vocab_size: 3 84 | vocab_size: 30522 85 | initializer_range: 0.02 86 | visual_scale_text_init: 0.0 87 | visual_scale_object_init: 0.0 88 | visual_ln: true 89 | object_word_embed_mode: 2 90 | 91 | CLASSIFIER_TYPE: "1fc" 92 | CLASSIFIER_HIDDEN_SIZE: 1024 93 | CLASSIFIER_DROPOUT: 0.1 94 | CLASSIFIER_SIGMOID: true 95 | 96 | TRAIN: 97 | SHUFFLE: true 98 | FLIP_PROB: 0.5 99 | BATCH_IMAGES: 4 100 | ASPECT_GROUPING: false 101 | RESUME: false 102 | AUTO_RESUME: true 103 | BEGIN_EPOCH: 0 104 | END_EPOCH: 20 105 | OPTIMIZER: 'SGD' 106 | CLIP_GRAD_NORM: 10 107 | GRAD_ACCUMULATE_STEPS: 4 108 | LR_FACTOR: 0.1 109 | LR_STEP: "14,18" 110 | LR: 7.0e-5 111 | WD: 0.0001 112 | WARMUP: true 113 | WARMUP_METHOD: 'linear' 114 | WARMUP_FACTOR: 0.0 115 | WARMUP_STEPS: 1000 116 | MOMENTUM: 0.9 117 | FP16: true 118 | FP16_LOSS_SCALE: 'dynamic' 119 | 120 | VAL: 121 | SHUFFLE: false 122 | FLIP_PROB: 0 123 | BATCH_IMAGES: 4 124 | 125 | TEST: 126 | SHUFFLE: false 127 | FLIP_PROB: 0 128 | TEST_EPOCH: 0 129 | BATCH_IMAGES: 4 -------------------------------------------------------------------------------- /cfgs/vqa/base_4x16G_fp32.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | RNG_SEED: 12345 3 | OUTPUT_PATH: './output/vl-bert/vqa' 4 | MODULE: ResNetVLBERT 5 | GPUS: '0,1,2,3' 6 | LOG_FREQUENT: 100 7 | VAL_FREQUENT: 1 8 | CHECKPOINT_FREQUENT: 1 9 | MODEL_PREFIX: 'vl-bert_base_res101_vqa' 10 | NUM_WORKERS_PER_GPU: 4 11 | SCALES: 12 | - 600 13 | - 1000 14 | 15 | DATASET: 16 | DATASET: vqa 17 | ANSWER_VOCAB_FILE: './data/coco/vqa/answers_vqa.txt' 18 | LABEL_INDEX_IN_BATCH: -1 19 | APPEND_INDEX: false 20 | DATASET_PATH: './data/coco' 21 | ROOT_PATH: './' 22 | TRAIN_IMAGE_SET: 'train2014+val2014' 23 | VAL_IMAGE_SET: 'val2014' 24 | TEST_IMAGE_SET: 'test2015' 25 | ADD_IMAGE_AS_A_BOX: true 26 | ZIP_MODE: false 27 | CACHE_MODE: false 28 | IGNORE_DB_CACHE: false 29 | MASK_SIZE: 14 30 | BOXES: "10-100ada" 31 | USE_IMDB: false 32 | 33 | NETWORK: 34 | PARTIAL_PRETRAIN: "./model/pretrained_model/vl-bert-base-prec.model" 35 | PARTIAL_PRETRAIN_PREFIX_CHANGES: 36 | - "vlbert.mlm_head.predictions.transform->final_mlp.0" 37 | - "module.vlbert.mlm_head.predictions.transform->module.final_mlp.0" 38 | - "vlbert->vlbert" 39 | - "module.vlbert->module.vlbert" 40 | IMAGE_NUM_LAYERS: 101 41 | IMAGE_C5_DILATED: true 42 | IMAGE_STRIDE_IN_1x1: 
true 43 | PIXEL_MEANS: 44 | - 102.9801 45 | - 115.9465 46 | - 122.7717 47 | PIXEL_STDS: 48 | - 1.0 49 | - 1.0 50 | - 1.0 51 | IMAGE_FEAT_PRECOMPUTED: true 52 | IMAGE_PRETRAINED: '' 53 | IMAGE_PRETRAINED_EPOCH: 0 54 | IMAGE_FROZEN_BACKBONE_STAGES: 55 | - 1 56 | - 2 57 | IMAGE_FROZEN_BN: true 58 | IMAGE_FINAL_DIM: 768 59 | IMAGE_SEMANTIC: false 60 | OUTPUT_CONV5: false 61 | BERT_MODEL_NAME: './model/pretrained_model/bert-base-uncased' 62 | BERT_PRETRAINED: '' 63 | BERT_PRETRAINED_EPOCH: 0 64 | BERT_FROZEN: false 65 | ENABLE_CNN_REG_LOSS: false 66 | 67 | VLBERT: 68 | input_transform_type: 1 69 | visual_size: 768 70 | hidden_size: 768 71 | num_hidden_layers: 12 72 | num_attention_heads: 12 73 | intermediate_size: 3072 74 | hidden_act: "gelu" 75 | hidden_dropout_prob: 0.1 76 | attention_probs_dropout_prob: 0.1 77 | max_position_embeddings: 512 78 | type_vocab_size: 3 79 | vocab_size: 30522 80 | initializer_range: 0.02 81 | visual_scale_text_init: 0.0 82 | visual_scale_object_init: 0.0 83 | visual_ln: true 84 | 85 | CLASSIFIER_TYPE: "mlm" 86 | CLASSIFIER_PRETRAINED: true 87 | CLASSIFIER_DROPOUT: 0.1 88 | 89 | TRAIN: 90 | SHUFFLE: true 91 | FLIP_PROB: 0.5 92 | BATCH_IMAGES: 64 93 | ASPECT_GROUPING: false 94 | RESUME: false 95 | AUTO_RESUME: true 96 | BEGIN_EPOCH: 0 97 | END_EPOCH: 5 98 | OPTIMIZER: 'AdamW' 99 | CLIP_GRAD_NORM: 1.0 100 | LR: 6.25e-7 101 | LR_SCHEDULE: 'triangle' 102 | WD: 0.0001 103 | WARMUP: true 104 | WARMUP_METHOD: 'linear' 105 | WARMUP_FACTOR: 0.0 106 | WARMUP_STEPS: 500 107 | FP16: false 108 | FP16_LOSS_SCALE: 128.0 109 | 110 | VAL: 111 | SHUFFLE: false 112 | FLIP_PROB: 0 113 | BATCH_IMAGES: 64 114 | 115 | TEST: 116 | SHUFFLE: false 117 | FLIP_PROB: 0 118 | TEST_EPOCH: 0 119 | BATCH_IMAGES: 64 -------------------------------------------------------------------------------- /cfgs/vqa/large_4x16G_fp32.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | RNG_SEED: 12345 3 | OUTPUT_PATH: './output/vl-bert/vqa' 4 | MODULE: ResNetVLBERT 5 | GPUS: '0,1,2,3' 6 | LOG_FREQUENT: 100 7 | VAL_FREQUENT: 1 8 | CHECKPOINT_FREQUENT: 1 9 | MODEL_PREFIX: 'vl-bert_large_res101_vqa' 10 | NUM_WORKERS_PER_GPU: 4 11 | SCALES: 12 | - 600 13 | - 1000 14 | 15 | DATASET: 16 | DATASET: vqa 17 | ANSWER_VOCAB_FILE: './data/coco/vqa/answers_vqa.txt' 18 | LABEL_INDEX_IN_BATCH: -1 19 | APPEND_INDEX: false 20 | DATASET_PATH: './data/coco' 21 | ROOT_PATH: './' 22 | TRAIN_IMAGE_SET: 'train2014+val2014' 23 | VAL_IMAGE_SET: 'val2014' 24 | TEST_IMAGE_SET: 'test2015' 25 | ADD_IMAGE_AS_A_BOX: true 26 | ZIP_MODE: false 27 | CACHE_MODE: false 28 | IGNORE_DB_CACHE: false 29 | MASK_SIZE: 14 30 | BOXES: "10-100ada" 31 | USE_IMDB: false 32 | 33 | NETWORK: 34 | PARTIAL_PRETRAIN: "./model/pretrained_model/vl-bert-large-prec.model" 35 | PARTIAL_PRETRAIN_PREFIX_CHANGES: 36 | - "vlbert.mlm_head.predictions.transform->final_mlp.0" 37 | - "module.vlbert.mlm_head.predictions.transform->module.final_mlp.0" 38 | - "vlbert->vlbert" 39 | - "module.vlbert->module.vlbert" 40 | IMAGE_NUM_LAYERS: 101 41 | IMAGE_C5_DILATED: true 42 | IMAGE_STRIDE_IN_1x1: true 43 | PIXEL_MEANS: 44 | - 102.9801 45 | - 115.9465 46 | - 122.7717 47 | PIXEL_STDS: 48 | - 1.0 49 | - 1.0 50 | - 1.0 51 | IMAGE_FEAT_PRECOMPUTED: true 52 | IMAGE_PRETRAINED: '' 53 | IMAGE_PRETRAINED_EPOCH: 0 54 | IMAGE_FROZEN_BACKBONE_STAGES: 55 | - 1 56 | - 2 57 | IMAGE_FROZEN_BN: true 58 | IMAGE_FINAL_DIM: 1024 59 | IMAGE_SEMANTIC: false 60 | OUTPUT_CONV5: false 61 | BERT_MODEL_NAME: './model/pretrained_model/bert-large-uncased' 62 | 
BERT_PRETRAINED: '' 63 | BERT_PRETRAINED_EPOCH: 0 64 | BERT_FROZEN: false 65 | ENABLE_CNN_REG_LOSS: false 66 | 67 | VLBERT: 68 | input_transform_type: 1 69 | visual_size: 1024 70 | hidden_size: 1024 71 | num_hidden_layers: 24 72 | num_attention_heads: 16 73 | intermediate_size: 4096 74 | hidden_act: "gelu" 75 | hidden_dropout_prob: 0.1 76 | attention_probs_dropout_prob: 0.1 77 | max_position_embeddings: 512 78 | type_vocab_size: 3 79 | vocab_size: 30522 80 | initializer_range: 0.02 81 | visual_scale_text_init: 0.0 82 | visual_scale_object_init: 0.0 83 | visual_ln: true 84 | 85 | CLASSIFIER_TYPE: "mlm" 86 | CLASSIFIER_PRETRAINED: true 87 | CLASSIFIER_DROPOUT: 0.1 88 | 89 | TRAIN: 90 | SHUFFLE: true 91 | FLIP_PROB: 0.5 92 | BATCH_IMAGES: 16 93 | ASPECT_GROUPING: false 94 | RESUME: false 95 | AUTO_RESUME: true 96 | BEGIN_EPOCH: 0 97 | END_EPOCH: 5 98 | OPTIMIZER: 'AdamW' 99 | CLIP_GRAD_NORM: 1.0 100 | GRAD_ACCUMULATE_STEPS: 4 101 | LR: 6.25e-7 102 | LR_SCHEDULE: 'triangle' 103 | WD: 0.0001 104 | WARMUP: true 105 | WARMUP_METHOD: 'linear' 106 | WARMUP_FACTOR: 0.0 107 | WARMUP_STEPS: 500 108 | FP16: false 109 | FP16_LOSS_SCALE: 128.0 110 | 111 | VAL: 112 | SHUFFLE: false 113 | FLIP_PROB: 0 114 | BATCH_IMAGES: 16 115 | 116 | TEST: 117 | SHUFFLE: false 118 | FLIP_PROB: 0 119 | TEST_EPOCH: 0 120 | BATCH_IMAGES: 16 -------------------------------------------------------------------------------- /common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackroos/VL-BERT/4373674cbf2bcd6c09a2c26abfdb6705b870e3be/common/__init__.py -------------------------------------------------------------------------------- /common/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | from .resnet import resnet18, resnet34, resnet50, resnet101, resnet152 2 | -------------------------------------------------------------------------------- /common/backbone/resnet/__init__.py: -------------------------------------------------------------------------------- 1 | from .resnet import * 2 | -------------------------------------------------------------------------------- /common/callbacks/batch_end_callbacks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackroos/VL-BERT/4373674cbf2bcd6c09a2c26abfdb6705b870e3be/common/callbacks/batch_end_callbacks/__init__.py -------------------------------------------------------------------------------- /common/callbacks/epoch_end_callbacks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackroos/VL-BERT/4373674cbf2bcd6c09a2c26abfdb6705b870e3be/common/callbacks/epoch_end_callbacks/__init__.py -------------------------------------------------------------------------------- /common/callbacks/epoch_end_callbacks/checkpoint.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class Checkpoint(object): 5 | def __init__(self, prefix, frequent): 6 | super(Checkpoint, self).__init__() 7 | self.prefix = prefix 8 | self.frequent = frequent 9 | 10 | def __call__(self, epoch_num, net, optimizer, writer, validation_monitor=None): 11 | if (epoch_num + 1) % self.frequent == 0: 12 | param_name = '{}-{:04d}.model'.format(self.prefix, epoch_num) 13 | checkpoint_dict = dict() 14 | checkpoint_dict['state_dict'] = net.state_dict() 15 | checkpoint_dict['optimizer'] 
= optimizer.state_dict() 16 | save_to_best = False 17 | if validation_monitor is not None: 18 | checkpoint_dict['validation_monitor'] = validation_monitor.state_dict() 19 | if validation_monitor.best_epoch == epoch_num: 20 | save_to_best = True 21 | torch.save(checkpoint_dict, param_name) 22 | if save_to_best: 23 | best_param_name = '{}-best.model'.format(self.prefix) 24 | torch.save(checkpoint_dict, best_param_name) 25 | print('Save new best model to {}.'.format(best_param_name)) 26 | -------------------------------------------------------------------------------- /common/callbacks/epoch_end_callbacks/validation_monitor.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import shutil 3 | 4 | 5 | class ValidationMonitor(object): 6 | def __init__(self, val_func, val_loader, metrics, host_metric_name='Acc', label_index_in_batch=-1): 7 | super(ValidationMonitor, self).__init__() 8 | self.val_func = val_func 9 | self.val_loader = val_loader 10 | self.metrics = metrics 11 | self.host_metric_name = host_metric_name 12 | self.best_epoch = -1 13 | self.best_val = -1.0 14 | self.label_index_in_batch = label_index_in_batch 15 | 16 | def state_dict(self): 17 | return {'best_epoch': self.best_epoch, 18 | 'best_val': self.best_val} 19 | 20 | def load_state_dict(self, state_dict): 21 | assert 'best_epoch' in state_dict, 'miss key \'best_epoch\'' 22 | assert 'best_val' in state_dict, 'miss key \'best_val\'' 23 | self.best_epoch = state_dict['best_epoch'] 24 | self.best_val = state_dict['best_val'] 25 | 26 | def __call__(self, epoch_num, net, optimizer, writer): 27 | self.val_func(net, self.val_loader, self.metrics, self.label_index_in_batch) 28 | 29 | name, value = self.metrics.get() 30 | s = "Epoch[%d] \tVal-" % (epoch_num) 31 | for n, v in zip(name, value): 32 | if n == self.host_metric_name and v > self.best_val: 33 | self.best_epoch = epoch_num 34 | self.best_val = v 35 | logging.info('New Best Val {}: {}, Epoch: {}'.format(self.host_metric_name, self.best_val, self.best_epoch)) 36 | print('New Best Val {}: {}, Epoch: {}'.format(self.host_metric_name, self.best_val, self.best_epoch)) 37 | s += "%s=%f,\t" % (n, v) 38 | if writer is not None: 39 | writer.add_scalar(tag='Val-' + n, 40 | scalar_value=v, 41 | global_step=epoch_num + 1) 42 | logging.info(s) 43 | print(s) 44 | 45 | logging.info('Best Val {}: {}, Epoch: {}'.format(self.host_metric_name, self.best_val, self.best_epoch)) 46 | print('Best Val {}: {}, Epoch: {}'.format(self.host_metric_name, self.best_val, self.best_epoch)) 47 | 48 | 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /common/lib/roi_pooling/ROIAlign.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
2 | #pragma once 3 | 4 | #include "cpu/vision.h" 5 | 6 | #ifdef WITH_CUDA 7 | #include "cuda/vision.h" 8 | #endif 9 | 10 | // Interface for Python 11 | at::Tensor ROIAlign_forward(const at::Tensor& input, 12 | const at::Tensor& rois, 13 | const float spatial_scale, 14 | const int pooled_height, 15 | const int pooled_width, 16 | const int sampling_ratio) { 17 | if (input.type().is_cuda()) { 18 | #ifdef WITH_CUDA 19 | return ROIAlign_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio); 20 | #else 21 | AT_ERROR("Not compiled with GPU support"); 22 | #endif 23 | } 24 | return ROIAlign_forward_cpu(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio); 25 | } 26 | 27 | at::Tensor ROIAlign_backward(const at::Tensor& grad, 28 | const at::Tensor& rois, 29 | const float spatial_scale, 30 | const int pooled_height, 31 | const int pooled_width, 32 | const int batch_size, 33 | const int channels, 34 | const int height, 35 | const int width, 36 | const int sampling_ratio) { 37 | if (grad.type().is_cuda()) { 38 | #ifdef WITH_CUDA 39 | return ROIAlign_backward_cuda(grad, rois, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width, sampling_ratio); 40 | #else 41 | AT_ERROR("Not compiled with GPU support"); 42 | #endif 43 | } 44 | AT_ERROR("Not implemented on the CPU"); 45 | } 46 | 47 | -------------------------------------------------------------------------------- /common/lib/roi_pooling/ROIPool.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | #pragma once 3 | 4 | #include "cpu/vision.h" 5 | 6 | #ifdef WITH_CUDA 7 | #include "cuda/vision.h" 8 | #endif 9 | 10 | 11 | std::tuple ROIPool_forward(const at::Tensor& input, 12 | const at::Tensor& rois, 13 | const float spatial_scale, 14 | const int pooled_height, 15 | const int pooled_width) { 16 | if (input.type().is_cuda()) { 17 | #ifdef WITH_CUDA 18 | return ROIPool_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width); 19 | #else 20 | AT_ERROR("Not compiled with GPU support"); 21 | #endif 22 | } 23 | AT_ERROR("Not implemented on the CPU"); 24 | } 25 | 26 | at::Tensor ROIPool_backward(const at::Tensor& grad, 27 | const at::Tensor& input, 28 | const at::Tensor& rois, 29 | const at::Tensor& argmax, 30 | const float spatial_scale, 31 | const int pooled_height, 32 | const int pooled_width, 33 | const int batch_size, 34 | const int channels, 35 | const int height, 36 | const int width) { 37 | if (grad.type().is_cuda()) { 38 | #ifdef WITH_CUDA 39 | return ROIPool_backward_cuda(grad, input, rois, argmax, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width); 40 | #else 41 | AT_ERROR("Not compiled with GPU support"); 42 | #endif 43 | } 44 | AT_ERROR("Not implemented on the CPU"); 45 | } 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /common/lib/roi_pooling/__init__.py: -------------------------------------------------------------------------------- 1 | from .roi_align import ROIAlign 2 | from .roi_pool import ROIPool -------------------------------------------------------------------------------- /common/lib/roi_pooling/cpu/vision.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
2 | #pragma once 3 | #include 4 | 5 | 6 | at::Tensor ROIAlign_forward_cpu(const at::Tensor& input, 7 | const at::Tensor& rois, 8 | const float spatial_scale, 9 | const int pooled_height, 10 | const int pooled_width, 11 | const int sampling_ratio); 12 | 13 | 14 | -------------------------------------------------------------------------------- /common/lib/roi_pooling/cuda/vision.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | #pragma once 3 | #include 4 | 5 | 6 | at::Tensor ROIAlign_forward_cuda(const at::Tensor& input, 7 | const at::Tensor& rois, 8 | const float spatial_scale, 9 | const int pooled_height, 10 | const int pooled_width, 11 | const int sampling_ratio); 12 | 13 | at::Tensor ROIAlign_backward_cuda(const at::Tensor& grad, 14 | const at::Tensor& rois, 15 | const float spatial_scale, 16 | const int pooled_height, 17 | const int pooled_width, 18 | const int batch_size, 19 | const int channels, 20 | const int height, 21 | const int width, 22 | const int sampling_ratio); 23 | 24 | 25 | std::tuple ROIPool_forward_cuda(const at::Tensor& input, 26 | const at::Tensor& rois, 27 | const float spatial_scale, 28 | const int pooled_height, 29 | const int pooled_width); 30 | 31 | at::Tensor ROIPool_backward_cuda(const at::Tensor& grad, 32 | const at::Tensor& input, 33 | const at::Tensor& rois, 34 | const at::Tensor& argmax, 35 | const float spatial_scale, 36 | const int pooled_height, 37 | const int pooled_width, 38 | const int batch_size, 39 | const int channels, 40 | const int height, 41 | const int width); 42 | 43 | at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh); 44 | 45 | 46 | at::Tensor compute_flow_cuda(const at::Tensor& boxes, 47 | const int height, 48 | const int width); 49 | -------------------------------------------------------------------------------- /common/lib/roi_pooling/debug.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from roi_pool import ROIPool 3 | from roi_align import ROIAlign 4 | 5 | align = ROIAlign(output_size=(3, 3), spatial_scale=1.0, sampling_ratio=1) 6 | pool = ROIPool(output_size=(3, 3), spatial_scale=1.0) 7 | 8 | device = torch.device("cuda:0") 9 | 10 | feature = torch.arange(81*2*3).view((2,3,9,9)).float().to(device) 11 | rois = torch.Tensor([[0,0,0,9,9],[1,0,0,9,9],[1,0,0,7,7]]).to(device) 12 | 13 | pooled = pool(feature,rois) 14 | aligned = align(feature,rois) 15 | 16 | import IPython 17 | IPython.embed() 18 | -------------------------------------------------------------------------------- /common/lib/roi_pooling/roi_align.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import torch 3 | from torch import nn 4 | from torch.autograd import Function 5 | from torch.autograd.function import once_differentiable 6 | from torch.nn.modules.utils import _pair 7 | 8 | from . 
import C_ROIPooling 9 | 10 | 11 | class _ROIAlign(Function): 12 | @staticmethod 13 | def forward(ctx, input, rois, output_size, spatial_scale, sampling_ratio): 14 | ctx.save_for_backward(rois) 15 | ctx.output_size = _pair(output_size) 16 | ctx.spatial_scale = spatial_scale 17 | ctx.sampling_ratio = sampling_ratio 18 | ctx.input_shape = input.size() 19 | output = C_ROIPooling.roi_align_forward( 20 | input, rois, spatial_scale, output_size[0], output_size[1], sampling_ratio 21 | ) 22 | return output 23 | 24 | @staticmethod 25 | @once_differentiable 26 | def backward(ctx, grad_output): 27 | rois, = ctx.saved_tensors 28 | output_size = ctx.output_size 29 | spatial_scale = ctx.spatial_scale 30 | sampling_ratio = ctx.sampling_ratio 31 | bs, ch, h, w = ctx.input_shape 32 | grad_input = C_ROIPooling.roi_align_backward( 33 | grad_output, 34 | rois, 35 | spatial_scale, 36 | output_size[0], 37 | output_size[1], 38 | bs, 39 | ch, 40 | h, 41 | w, 42 | sampling_ratio, 43 | ) 44 | return grad_input, None, None, None, None 45 | 46 | 47 | roi_align = _ROIAlign.apply 48 | 49 | 50 | class ROIAlign(nn.Module): 51 | def __init__(self, output_size, spatial_scale, sampling_ratio=1): 52 | """ 53 | :param output_size: e.g. (3,3) 54 | :param spatial_scale: e.g. 1.0/16 55 | :param sampling_ratio: e.g. 1 56 | """ 57 | super(ROIAlign, self).__init__() 58 | self.output_size = output_size 59 | self.spatial_scale = spatial_scale 60 | self.sampling_ratio = sampling_ratio 61 | 62 | def forward(self, input, rois): 63 | """ 64 | :param input: the input features [B C H W] 65 | :param rois: [k, 5]: (im_index, x1, y1, x2, y2) 66 | :return: pooled features [K C H W], K = k 67 | """ 68 | return roi_align( 69 | input.float(), rois.float(), self.output_size, self.spatial_scale, self.sampling_ratio 70 | ) 71 | 72 | def __repr__(self): 73 | tmpstr = self.__class__.__name__ + "(" 74 | tmpstr += "output_size=" + str(self.output_size) 75 | tmpstr += ", spatial_scale=" + str(self.spatial_scale) 76 | tmpstr += ", sampling_ratio=" + str(self.sampling_ratio) 77 | tmpstr += ")" 78 | return tmpstr 79 | -------------------------------------------------------------------------------- /common/lib/roi_pooling/roi_pool.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import torch 3 | from torch import nn 4 | from torch.autograd import Function 5 | from torch.autograd.function import once_differentiable 6 | from torch.nn.modules.utils import _pair 7 | 8 | from . 
import C_ROIPooling 9 | 10 | 11 | class _ROIPool(Function): 12 | @staticmethod 13 | def forward(ctx, input, rois, output_size, spatial_scale): 14 | ctx.output_size = _pair(output_size) 15 | ctx.spatial_scale = spatial_scale 16 | ctx.input_shape = input.size() 17 | output, argmax = C_ROIPooling.roi_pool_forward( 18 | input, rois, spatial_scale, output_size[0], output_size[1] 19 | ) 20 | ctx.save_for_backward(input, rois, argmax) 21 | return output 22 | 23 | @staticmethod 24 | @once_differentiable 25 | def backward(ctx, grad_output): 26 | input, rois, argmax = ctx.saved_tensors 27 | output_size = ctx.output_size 28 | spatial_scale = ctx.spatial_scale 29 | bs, ch, h, w = ctx.input_shape 30 | grad_input = C_ROIPooling.roi_pool_backward( 31 | grad_output, 32 | input, 33 | rois, 34 | argmax, 35 | spatial_scale, 36 | output_size[0], 37 | output_size[1], 38 | bs, 39 | ch, 40 | h, 41 | w, 42 | ) 43 | return grad_input, None, None, None 44 | 45 | 46 | roi_pool = _ROIPool.apply 47 | 48 | 49 | class ROIPool(nn.Module): 50 | def __init__(self, output_size, spatial_scale): 51 | """ 52 | :param output_size: e.g. (3,3) 53 | :param spatial_scale: e.g. 1.0/16 54 | """ 55 | super(ROIPool, self).__init__() 56 | self.output_size = output_size 57 | self.spatial_scale = spatial_scale 58 | 59 | def forward(self, input, rois): 60 | """ 61 | :param input: the input features [B C H W] 62 | :param rois: [k, 5] : (im_index, x1, y1, x2, y2) 63 | :return: pooled features (K C H W), K = k 64 | """ 65 | return roi_pool(input.float(), rois.float(), self.output_size, self.spatial_scale) 66 | 67 | def __repr__(self): 68 | tmpstr = self.__class__.__name__ + "(" 69 | tmpstr += "output_size=" + str(self.output_size) 70 | tmpstr += ", spatial_scale=" + str(self.spatial_scale) 71 | tmpstr += ")" 72 | return tmpstr 73 | -------------------------------------------------------------------------------- /common/lib/roi_pooling/setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
2 | #!/usr/bin/env python 3 | 4 | import glob 5 | import os 6 | 7 | import torch 8 | from setuptools import find_packages 9 | from setuptools import setup 10 | from torch.utils.cpp_extension import CUDA_HOME 11 | from torch.utils.cpp_extension import CppExtension 12 | from torch.utils.cpp_extension import CUDAExtension 13 | 14 | requirements = ["torch", "torchvision"] 15 | 16 | 17 | def get_extensions(): 18 | this_dir = os.path.dirname(os.path.abspath(__file__)) 19 | extensions_dir = this_dir 20 | 21 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 22 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 23 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 24 | 25 | sources = main_file + source_cpu 26 | extension = CppExtension 27 | 28 | extra_compile_args = {"cxx": []} 29 | define_macros = [] 30 | 31 | if torch.cuda.is_available() and CUDA_HOME is not None: 32 | extension = CUDAExtension 33 | sources += source_cuda 34 | define_macros += [("WITH_CUDA", None)] 35 | extra_compile_args["nvcc"] = [ 36 | "-DCUDA_HAS_FP16=1", 37 | "-D__CUDA_NO_HALF_OPERATORS__", 38 | "-D__CUDA_NO_HALF_CONVERSIONS__", 39 | "-D__CUDA_NO_HALF2_OPERATORS__", 40 | ] 41 | 42 | sources = [os.path.join(extensions_dir, s) for s in sources] 43 | 44 | include_dirs = [extensions_dir] 45 | 46 | ext_modules = [ 47 | extension( 48 | "C_ROIPooling", 49 | sources, 50 | include_dirs=include_dirs, 51 | define_macros=define_macros, 52 | extra_compile_args=extra_compile_args, 53 | ) 54 | ] 55 | 56 | return ext_modules 57 | 58 | 59 | setup( 60 | name="C_ROIPooling", 61 | version="0.1", 62 | description="ROIPooling in C++ or CUDA", 63 | ext_modules=get_extensions(), 64 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 65 | ) 66 | -------------------------------------------------------------------------------- /common/lib/roi_pooling/vision.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | #include "ROIAlign.h" 4 | #include "ROIPool.h" 5 | 6 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 7 | m.def("roi_align_forward", &ROIAlign_forward, "ROIAlign_forward"); 8 | m.def("roi_align_backward", &ROIAlign_backward, "ROIAlign_backward"); 9 | m.def("roi_pool_forward", &ROIPool_forward, "ROIPool_forward"); 10 | m.def("roi_pool_backward", &ROIPool_backward, "ROIPool_backward"); 11 | } 12 | -------------------------------------------------------------------------------- /common/lr_scheduler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from bisect import bisect_right 3 | 4 | import torch 5 | 6 | 7 | # FIXME ideally this would be achieved with a CombinedLRScheduler, 8 | # separating MultiStepLR with WarmupLR 9 | # but the current LRScheduler design doesn't allow it 10 | class WarmupMultiStepLR(torch.optim.lr_scheduler._LRScheduler): 11 | def __init__( 12 | self, 13 | optimizer, 14 | milestones, 15 | gamma=0.1, 16 | warmup_factor=1.0 / 3, 17 | warmup_iters=500, 18 | warmup_method="linear", 19 | last_epoch=-1, 20 | ): 21 | if not list(milestones) == sorted(milestones): 22 | raise ValueError( 23 | "Milestones should be a list of" " increasing integers. 
Got {}", 24 | milestones, 25 | ) 26 | 27 | if warmup_method not in ("constant", "linear"): 28 | raise ValueError( 29 | "Only 'constant' or 'linear' warmup_method accepted" 30 | "got {}".format(warmup_method) 31 | ) 32 | self.milestones = milestones 33 | self.gamma = gamma 34 | self.warmup_factor = warmup_factor 35 | self.warmup_iters = warmup_iters 36 | self.warmup_method = warmup_method 37 | super(WarmupMultiStepLR, self).__init__(optimizer, last_epoch) 38 | 39 | def get_lr(self): 40 | warmup_factor = 1 41 | if self.last_epoch < self.warmup_iters: 42 | if self.warmup_method == "constant": 43 | warmup_factor = self.warmup_factor 44 | elif self.warmup_method == "linear": 45 | alpha = self.last_epoch / self.warmup_iters 46 | warmup_factor = self.warmup_factor * (1 - alpha) + alpha 47 | return [ 48 | base_lr 49 | * warmup_factor 50 | * self.gamma ** bisect_right(self.milestones, self.last_epoch) 51 | for base_lr in self.base_lrs 52 | ] 53 | -------------------------------------------------------------------------------- /common/metrics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackroos/VL-BERT/4373674cbf2bcd6c09a2c26abfdb6705b870e3be/common/metrics/__init__.py -------------------------------------------------------------------------------- /common/metrics/composite_eval_metric.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from .eval_metric import EvalMetric 3 | import torch 4 | 5 | class CompositeEvalMetric(EvalMetric): 6 | """Manages multiple evaluation metrics. 7 | Args: 8 | metrics (list of EvalMetric): List of child metrics. 9 | name (str): Name of this metric instance for display. 10 | """ 11 | 12 | def __init__(self, metrics=None, name='composite'): 13 | super(CompositeEvalMetric, self).__init__(name) 14 | if metrics is None: 15 | metrics = [] 16 | self.metrics = metrics 17 | 18 | def add(self, metric): 19 | """Adds a child metric. 20 | Args: 21 | metric (EvalMetric): A metric instance. 22 | """ 23 | self.metrics.append(metric) 24 | 25 | def get_metric(self, index): 26 | """Returns a child metric. 27 | Args: 28 | index (int): Index of child metric in the list of metrics. 29 | """ 30 | try: 31 | return self.metrics[index] 32 | except IndexError: 33 | return ValueError("Metric index {} is out of range 0 and {}".format( 34 | index, len(self.metrics))) 35 | 36 | def update(self, outputs): 37 | """Updates the internal evaluation result. 38 | Args: 39 | labels (dict of `NDArray`): The labels of the data. 40 | preds (dict of `NDArray`): Predicted values. 41 | """ 42 | for metric in self.metrics: 43 | metric.update(outputs) 44 | 45 | def reset(self): 46 | """Resets the internal evaluation result to initial state.""" 47 | try: 48 | for metric in self.metrics: 49 | metric.reset() 50 | except AttributeError: 51 | pass 52 | 53 | def get(self): 54 | """Returns the current evaluation result. 55 | Returns: 56 | names (list of str): Name of the metrics. 57 | values (list of float): Value of the evaluations. 
58 | """ 59 | names = [] 60 | values = [] 61 | for metric in self.metrics: 62 | name, value = metric.get() 63 | if isinstance(name, str): 64 | name = [name] 65 | if isinstance(value, (float, int, np.generic,torch.Tensor)): 66 | value = [value] 67 | names.extend(name) 68 | values.extend(value) 69 | return names, values 70 | -------------------------------------------------------------------------------- /common/metrics/eval_metric.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.distributed as distributed 3 | 4 | 5 | class EvalMetric(object): 6 | """Base class for all evaluation metrics. 7 | .. note:: 8 | This is a base class that provides common metric interfaces. 9 | One should not use this class directly, but instead create new metric 10 | classes that extend it. 11 | Args 12 | name (str): Name of this metric instance for display. 13 | """ 14 | 15 | def __init__(self, name, allreduce=False, num_replicas=1, **kwargs): 16 | self.name = str(name) 17 | self.allreduce=allreduce 18 | self.num_replicas = num_replicas 19 | self._kwargs = kwargs 20 | self.reset() 21 | 22 | def __str__(self): 23 | return "EvalMetric: {}".format(dict(self.get_name_value())) 24 | 25 | def update(self, outputs): 26 | """Updates the internal evaluation result. 27 | Args 28 | labels (list of `NDArray`): The labels of the data. 29 | preds (list of `NDArray`): Predicted values. 30 | """ 31 | raise NotImplementedError() 32 | 33 | def reset(self): 34 | """Resets the internal evaluation result to initial state.""" 35 | self.num_inst = torch.tensor(0.) 36 | self.sum_metric = torch.tensor(0.) 37 | 38 | def get(self): 39 | """Returns the current evaluation result. 40 | Returns: 41 | names (list of str): Name of the metrics. 42 | values (list of float): Value of the evaluations. 43 | """ 44 | if self.num_inst.item() == 0: 45 | return (self.name, float('nan')) 46 | else: 47 | if self.allreduce: 48 | num_inst = self.num_inst.clone().cuda() 49 | sum_metric = self.sum_metric.clone().cuda() 50 | distributed.all_reduce(num_inst, op=distributed.ReduceOp.SUM) 51 | distributed.all_reduce(sum_metric, op=distributed.ReduceOp.SUM) 52 | metric_tensor = (sum_metric / num_inst).detach().cpu() 53 | else: 54 | metric_tensor = (self.sum_metric / self.num_inst).detach().cpu() 55 | 56 | return (self.name, metric_tensor.item()) 57 | 58 | def get_name_value(self): 59 | """Returns zipped name and value pairs. 60 | Returns 61 | A (list of tuples): (name, value) tuple list. 
62 | """ 63 | name, value = self.get() 64 | if not isinstance(name, list): 65 | name = [name] 66 | if not isinstance(value, list): 67 | value = [value] 68 | return list(zip(name, value)) 69 | -------------------------------------------------------------------------------- /common/metrics/pretrain_metrics.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from .eval_metric import EvalMetric 3 | 4 | 5 | class LossLogger(EvalMetric): 6 | def __init__(self, output_name, display_name=None, 7 | allreduce=False, num_replicas=1): 8 | self.output_name = output_name 9 | if display_name is None: 10 | display_name = output_name 11 | super(LossLogger, self).__init__(display_name, allreduce, num_replicas) 12 | 13 | def update(self, outputs): 14 | with torch.no_grad(): 15 | if self.output_name in outputs: 16 | self.sum_metric += float(outputs[self.output_name].mean().item()) 17 | self.num_inst += 1 18 | 19 | 20 | class RelationshipAccuracy(EvalMetric): 21 | def __init__(self, allreduce=False, num_replicas=1): 22 | super(RelationshipAccuracy, self).__init__('RelAcc', allreduce, num_replicas) 23 | 24 | def update(self, outputs): 25 | with torch.no_grad(): 26 | logits = outputs['relationship_logits'] 27 | label = outputs['relationship_label'] 28 | self.sum_metric += float((logits.argmax(dim=1) == label).sum().item()) 29 | self.num_inst += logits.shape[0] 30 | 31 | 32 | class MLMAccuracy(EvalMetric): 33 | def __init__(self, allreduce=False, num_replicas=1): 34 | super(MLMAccuracy, self).__init__('MLMAcc', allreduce, num_replicas) 35 | 36 | def update(self, outputs): 37 | with torch.no_grad(): 38 | logits = outputs['mlm_logits'] 39 | label = outputs['mlm_label'] 40 | keep = (label != -1) 41 | if keep.sum() > 0: 42 | self.sum_metric += float((logits[keep].argmax(dim=1) == label[keep]).sum().item()) 43 | self.num_inst += keep.sum().item() 44 | 45 | 46 | class MLMAccuracyWVC(EvalMetric): 47 | def __init__(self, allreduce=False, num_replicas=1): 48 | super(MLMAccuracyWVC, self).__init__('MLMAccWVC', allreduce, num_replicas) 49 | 50 | def update(self, outputs): 51 | with torch.no_grad(): 52 | logits = outputs['mlm_logits_wvc'] 53 | label = outputs['mlm_label_wvc'] 54 | keep = (label != -1) 55 | if keep.sum() > 0: 56 | self.sum_metric += float((logits[keep].argmax(dim=1) == label[keep]).sum().item()) 57 | self.num_inst += keep.sum().item() 58 | 59 | 60 | class MLMAccuracyAUX(EvalMetric): 61 | def __init__(self, allreduce=False, num_replicas=1): 62 | super(MLMAccuracyAUX, self).__init__('MLMAccAUX', allreduce, num_replicas) 63 | 64 | def update(self, outputs): 65 | with torch.no_grad(): 66 | logits = outputs['mlm_logits_aux'] 67 | label = outputs['mlm_label_aux'] 68 | keep = (label != -1) 69 | if keep.sum() > 0: 70 | self.sum_metric += float((logits[keep].argmax(dim=1) == label[keep]).sum().item()) 71 | self.num_inst += keep.sum().item() 72 | 73 | 74 | class MVRCAccuracy(EvalMetric): 75 | def __init__(self, allreduce=False, num_replicas=1): 76 | super(MVRCAccuracy, self).__init__('MVRCAccuracy', allreduce, num_replicas) 77 | 78 | def update(self, outputs): 79 | with torch.no_grad(): 80 | logits = outputs['mvrc_logits'] 81 | label = outputs['mvrc_label'] 82 | keep = (label.sum(2) - 1.0).abs() < 0.1 83 | if keep.sum() > 0: 84 | self.sum_metric += float((logits[keep].argmax(dim=1) == label[keep].argmax(dim=1)).sum().item()) 85 | self.num_inst += keep.sum().item() 86 | 87 | 88 | 89 | 90 | 
-------------------------------------------------------------------------------- /common/metrics/refcoco_metrics.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from .eval_metric import EvalMetric 3 | 4 | 5 | class LossLogger(EvalMetric): 6 | def __init__(self, output_name, display_name=None, 7 | allreduce=False, num_replicas=1): 8 | self.output_name = output_name 9 | if display_name is None: 10 | display_name = output_name 11 | super(LossLogger, self).__init__(display_name, allreduce, num_replicas) 12 | 13 | def update(self, outputs): 14 | with torch.no_grad(): 15 | if self.output_name in outputs: 16 | self.sum_metric += float(outputs[self.output_name].mean().item()) 17 | self.num_inst += 1 18 | 19 | 20 | class RefAccuracy(EvalMetric): 21 | def __init__(self, allreduce=False, num_replicas=1): 22 | super(RefAccuracy, self).__init__('RefAcc', allreduce, num_replicas) 23 | 24 | def update(self, outputs): 25 | with torch.no_grad(): 26 | cls_logits = outputs['label_logits'] 27 | label = outputs['label'] 28 | bs, _ = cls_logits.shape 29 | batch_inds = torch.arange(bs, device=cls_logits.device) 30 | self.sum_metric += float((label[batch_inds, cls_logits.argmax(1)] > 0.5).sum().item()) 31 | self.num_inst += cls_logits.shape[0] 32 | 33 | 34 | class ClsAccuracy(EvalMetric): 35 | def __init__(self, allreduce=False, num_replicas=1): 36 | super(ClsAccuracy, self).__init__('ClsAcc', allreduce, num_replicas) 37 | 38 | def update(self, outputs): 39 | with torch.no_grad(): 40 | cls_logits = outputs['label_logits'] 41 | cls_pred = (cls_logits > 0).long() 42 | label = outputs['label'].long() 43 | keep = (label >= 0) 44 | self.sum_metric += float((cls_pred[keep] == label[keep]).sum().item()) 45 | self.num_inst += keep.sum().item() 46 | 47 | 48 | class ClsPosAccuracy(EvalMetric): 49 | def __init__(self, allreduce=False, num_replicas=1): 50 | super(ClsPosAccuracy, self).__init__('ClsPosAcc', allreduce, num_replicas) 51 | 52 | def update(self, outputs): 53 | with torch.no_grad(): 54 | cls_logits = outputs['label_logits'] 55 | cls_pred = (cls_logits > 0).long() 56 | label = outputs['label'].long() 57 | keep = (label == 1) 58 | self.sum_metric += float((cls_pred[keep] == label[keep]).sum().item()) 59 | self.num_inst += keep.sum().item() 60 | 61 | 62 | class ClsPosFraction(EvalMetric): 63 | def __init__(self, allreduce=False, num_replicas=1): 64 | super(ClsPosFraction, self).__init__('ClsPosFrac', allreduce, num_replicas) 65 | 66 | def update(self, outputs): 67 | with torch.no_grad(): 68 | label = outputs['label'].long() 69 | num_pos = (label == 1).sum().item() 70 | num_valid = (label >= 0).sum().item() 71 | self.sum_metric += float(num_pos) 72 | self.num_inst += float(num_valid) 73 | 74 | 75 | 76 | 77 | 78 | 79 | -------------------------------------------------------------------------------- /common/metrics/vcr_metrics.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from .eval_metric import EvalMetric 3 | 4 | 5 | class LossLogger(EvalMetric): 6 | def __init__(self, output_name, display_name=None, 7 | allreduce=False, num_replicas=1): 8 | self.output_name = output_name 9 | if display_name is None: 10 | display_name = output_name 11 | super(LossLogger, self).__init__(display_name, allreduce, num_replicas) 12 | 13 | def update(self, outputs): 14 | with torch.no_grad(): 15 | if self.output_name in outputs: 16 | self.sum_metric += float(outputs[self.output_name].mean().item()) 17 | self.num_inst += 1 18 
| 19 | 20 | class Accuracy(EvalMetric): 21 | def __init__(self, allreduce=False, num_replicas=1): 22 | super(Accuracy, self).__init__('Acc', allreduce, num_replicas) 23 | 24 | def update(self, outputs): 25 | with torch.no_grad(): 26 | _filter = outputs['label'] != -1 27 | cls_logits = outputs['label_logits'][_filter] 28 | label = outputs['label'][_filter] 29 | if cls_logits.dim() == 1: 30 | cls_logits = cls_logits.view((-1, 4)) 31 | label = label.view((-1, 4)).argmax(1) 32 | self.sum_metric += float((cls_logits.argmax(dim=1) == label).sum().item()) 33 | self.num_inst += cls_logits.shape[0] 34 | 35 | 36 | class AnsLoss(EvalMetric): 37 | def __init__(self, allreduce=False, num_replicas=1): 38 | super(AnsLoss, self).__init__('AnsLoss', allreduce, num_replicas) 39 | 40 | def update(self, outputs): 41 | with torch.no_grad(): 42 | self.sum_metric += float(outputs['ans_loss'].mean().item()) 43 | self.num_inst += 1 44 | 45 | 46 | class CNNRegLoss(EvalMetric): 47 | def __init__(self, allreduce=False, num_replicas=1): 48 | super(CNNRegLoss, self).__init__('CNNRegLoss', allreduce, num_replicas) 49 | 50 | def update(self, outputs): 51 | with torch.no_grad(): 52 | if 'cnn_regularization_loss' in outputs: 53 | self.sum_metric += float(outputs['cnn_regularization_loss'].mean().item()) 54 | self.num_inst += 1 55 | 56 | 57 | class PositiveFraction(EvalMetric): 58 | def __init__(self, allreduce=False, num_replicas=1): 59 | super(PositiveFraction, self).__init__('PosFraction', allreduce, num_replicas) 60 | 61 | def update(self, outputs): 62 | with torch.no_grad(): 63 | self.sum_metric += float(outputs['positive_fraction'].mean().item()) 64 | self.num_inst += 1 65 | 66 | 67 | class JointAccuracy(EvalMetric): 68 | def __init__(self, allreduce=False, num_replicas=1): 69 | super(JointAccuracy, self).__init__('JointAcc', allreduce, num_replicas) 70 | 71 | def update(self, outputs): 72 | a_cls_logits = outputs['answer_label_logits'] 73 | a_label = outputs['answer_label'] 74 | r_cls_logits = outputs['rationale_label_logits'] 75 | r_label = outputs['rationale_label'] 76 | self.sum_metric += float(((a_cls_logits.argmax(dim=1) == a_label) 77 | & (r_cls_logits.argmax(dim=1) == r_label)).sum().item()) 78 | self.num_inst += a_cls_logits.shape[0] 79 | 80 | 81 | 82 | -------------------------------------------------------------------------------- /common/metrics/vqa_metrics.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from .eval_metric import EvalMetric 3 | 4 | 5 | class LossLogger(EvalMetric): 6 | def __init__(self, output_name, display_name=None, 7 | allreduce=False, num_replicas=1): 8 | self.output_name = output_name 9 | if display_name is None: 10 | display_name = output_name 11 | super(LossLogger, self).__init__(display_name, allreduce, num_replicas) 12 | 13 | def update(self, outputs): 14 | with torch.no_grad(): 15 | if self.output_name in outputs: 16 | self.sum_metric += float(outputs[self.output_name].mean().item()) 17 | self.num_inst += 1 18 | 19 | 20 | class SoftAccuracy(EvalMetric): 21 | def __init__(self, allreduce=False, num_replicas=1): 22 | super(SoftAccuracy, self).__init__('SoftAcc', allreduce, num_replicas) 23 | 24 | def update(self, outputs): 25 | with torch.no_grad(): 26 | cls_logits = outputs['label_logits'] 27 | label = outputs['label'] 28 | bs, num_classes = cls_logits.shape 29 | batch_inds = torch.arange(bs, device=cls_logits.device) 30 | self.sum_metric += float(label[batch_inds, cls_logits.argmax(1)].sum().item()) 31 | self.num_inst += 
cls_logits.shape[0] 32 | 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /common/module.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | from typing import Dict 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | 8 | class Module(nn.Module): 9 | def __init__(self, config): 10 | super(Module, self).__init__() 11 | self.config = config 12 | 13 | def init_weight(self): 14 | raise NotImplementedError() 15 | 16 | def fix_params(self): 17 | raise NotImplementedError() 18 | 19 | def forward(self, *inputs, **kwargs): 20 | inputs, kwargs = self.preprocess(*inputs, **kwargs) 21 | if self.training: 22 | return self.train_forward(*inputs, **kwargs) 23 | else: 24 | return self.inference_forward(*inputs, **kwargs) 25 | 26 | def train_forward(self, *inputs, **kwargs): 27 | """ 28 | def train_forward(self, data, label, **kwargs): 29 | # this is a toy example for 1 output, 2 loss function 30 | 31 | output = None 32 | loss1 = torch.tensor(0.0) 33 | loss2 = torch.tensor(0.0) 34 | 35 | outputs = {'output': output, 36 | 'loss1': loss1, 37 | 'loss2': loss2} 38 | loss = loss1 + loss2 39 | 40 | return outputs, loss 41 | """ 42 | raise NotImplemented 43 | 44 | def inference_forward(self, *inputs, **kwargs): 45 | """ 46 | def inference_forward(self, data, **kwargs): 47 | output = None 48 | outputs = {'output': output} 49 | return outputs 50 | """ 51 | raise NotImplemented 52 | 53 | def preprocess(self, *inputs, **kwargs): 54 | if self.training: 55 | return self.train_preprocess(*inputs, **kwargs) 56 | else: 57 | return self.inference_preprocess(*inputs, **kwargs) 58 | 59 | def train_preprocess(self, *inputs, **kwargs): 60 | return inputs, kwargs 61 | 62 | def inference_preprocess(self, *inputs, **kwargs): 63 | return inputs, kwargs 64 | -------------------------------------------------------------------------------- /common/nlp/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackroos/VL-BERT/4373674cbf2bcd6c09a2c26abfdb6705b870e3be/common/nlp/__init__.py -------------------------------------------------------------------------------- /common/nlp/bert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackroos/VL-BERT/4373674cbf2bcd6c09a2c26abfdb6705b870e3be/common/nlp/bert/__init__.py -------------------------------------------------------------------------------- /common/nlp/bert_encoder_wrapper.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from external.pytorch_pretrained_bert.modeling import BertEncoder, BertLayerNorm 4 | 5 | 6 | class BertEncoderWrapper(nn.Module): 7 | def __init__(self, bert_config, input_size, output_all_encoded_layers=False): 8 | super(BertEncoderWrapper, self).__init__() 9 | self.bert_config = bert_config 10 | self.output_all_encoded_layers = output_all_encoded_layers 11 | self.input_transform = nn.Linear(input_size, bert_config.hidden_size) 12 | self.with_position_embeddings = False if 'with_position_embeddings' not in bert_config \ 13 | else bert_config.with_position_embeddings 14 | if self.with_position_embeddings: 15 | self.position_embedding = nn.Embedding(bert_config.max_position_embeddings, bert_config.hidden_size) 16 | self.LayerNorm = BertLayerNorm(bert_config.hidden_size, 
eps=1e-12) 17 | self.dropout = nn.Dropout(bert_config.hidden_dropout_prob) 18 | self.bert_encoder = BertEncoder(bert_config) 19 | 20 | self.apply(self.init_bert_weights) 21 | 22 | def init_bert_weights(self, module): 23 | """ Initialize the weights. 24 | """ 25 | if isinstance(module, (nn.Linear, nn.Embedding)): 26 | # Slightly different from the TF version which uses truncated_normal for initialization 27 | # cf https://github.com/pytorch/pytorch/pull/5617 28 | module.weight.data.normal_(mean=0.0, std=self.bert_config.initializer_range) 29 | elif isinstance(module, BertLayerNorm): 30 | module.bias.data.zero_() 31 | module.weight.data.fill_(1.0) 32 | if isinstance(module, nn.Linear) and module.bias is not None: 33 | module.bias.data.zero_() 34 | 35 | def get_output_dim(self): 36 | return self.bert_config.hidden_size 37 | 38 | def forward(self, inputs, mask): 39 | inputs = self.input_transform(inputs) 40 | if self.with_position_embeddings: 41 | seq_length = inputs.size(1) 42 | position_ids = torch.arange(seq_length, dtype=torch.long, device=inputs.device) 43 | position_ids = position_ids.unsqueeze(0).expand((inputs.shape[0], inputs.shape[1])) 44 | position_embeddings = self.position_embedding(position_ids) 45 | inputs = inputs + position_embeddings 46 | inputs = self.LayerNorm(inputs) 47 | inputs = self.dropout(inputs) 48 | 49 | extended_attention_mask = mask.unsqueeze(1).unsqueeze(2) 50 | # Since attention_mask is 1.0 for positions we want to attend and 0.0 for 51 | # masked positions, this operation will create a tensor which is 0.0 for 52 | # positions we want to attend and -10000.0 for masked positions. 53 | # Since we are adding it to the raw scores before the softmax, this is 54 | # effectively the same as removing these entirely. 55 | extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) 56 | extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 57 | output = self.bert_encoder(inputs, 58 | extended_attention_mask, 59 | output_all_encoded_layers=self.output_all_encoded_layers) 60 | if not self.output_all_encoded_layers: 61 | output = output[0] 62 | return output 63 | 64 | -------------------------------------------------------------------------------- /common/nlp/input_variational_dropout.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | class InputVariationalDropout(torch.nn.Dropout): 4 | """ 5 | Apply the dropout technique in Gal and Ghahramani, "Dropout as a Bayesian Approximation: 6 | Representing Model Uncertainty in Deep Learning" (https://arxiv.org/abs/1506.02142) to a 7 | 3D tensor. 8 | 9 | This module accepts a 3D tensor of shape ``(batch_size, num_timesteps, embedding_dim)`` 10 | and samples a single dropout mask of shape ``(batch_size, embedding_dim)`` and applies 11 | it to every time step. 12 | """ 13 | def forward(self, input_tensor): 14 | # pylint: disable=arguments-differ 15 | """ 16 | Apply dropout to input tensor. 17 | 18 | Parameters 19 | ---------- 20 | input_tensor: ``torch.FloatTensor`` 21 | A tensor of shape ``(batch_size, num_timesteps, embedding_dim)`` 22 | 23 | Returns 24 | ------- 25 | output: ``torch.FloatTensor`` 26 | A tensor of shape ``(batch_size, num_timesteps, embedding_dim)`` with dropout applied. 
27 | """ 28 | ones = input_tensor.data.new_ones(input_tensor.shape[0], input_tensor.shape[-1]) 29 | dropout_mask = torch.nn.functional.dropout(ones, self.p, self.training, inplace=False) 30 | if self.inplace: 31 | input_tensor *= dropout_mask.unsqueeze(1) 32 | return None 33 | else: 34 | return dropout_mask.unsqueeze(1) * input_tensor -------------------------------------------------------------------------------- /common/nlp/misc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import random 3 | 4 | 5 | def get_align_matrix(aligned_ids, sparse=False, device=None, dtype=torch.float32): 6 | """ 7 | Get aligned matrix for feature alignment in sentence embedding 8 | :param aligned_ids: list, aligned_ids[k] means original index of k-th token 9 | :param sparse: whether to return sparse matrix 10 | :param device: device of returned align matrix 11 | :param dtype: dtype of returned align matrix 12 | :return: align_matrix: torch.FloatTensor, shape: (L, L') 13 | 14 | Example: 15 | >> aligned_ids = [0, 0, 1, 2, 2, 2] 16 | >> get_align_matrix(aligned_ids) 17 | tensor([[0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000], 18 | [0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 0.0000], 19 | [0.0000, 0.0000, 0.0000, 0.3333, 0.3333, 0.3333]]) 20 | """ 21 | 22 | l0 = max(aligned_ids) + 1 23 | l1 = len(aligned_ids) 24 | if sparse: 25 | raise NotImplementedError 26 | else: 27 | align_matrix = torch.zeros((l0, l1), dtype=dtype, device=device) 28 | align_matrix[aligned_ids, torch.arange(l1)] = 1 29 | align_matrix = align_matrix / align_matrix.sum(dim=1, keepdim=True) 30 | 31 | return align_matrix 32 | 33 | 34 | def get_all_ngrams(words): 35 | """ 36 | Get all n-grams of words 37 | :param words: list of str 38 | :return: ngrams, list of (list of str) 39 | """ 40 | ngrams = [] 41 | N = len(words) 42 | for n in range(1, N + 1): 43 | for i in range(0, N - n + 1): 44 | ngrams.append([words[j] for j in range(i, i + n)]) 45 | 46 | return ngrams 47 | 48 | 49 | def random_word_with_token_ids(token_ids, tokenizer): 50 | """ 51 | Masking some random tokens for Language Model task with probabilities as in the original BERT paper. 52 | :param token_ids: list of int, list of token id. 
53 | :param tokenizer: Tokenizer, object used for tokenization (we need it's vocab here) 54 | :return: (list of str, list of int), masked tokens and related labels for LM prediction 55 | """ 56 | output_label = [] 57 | mask_id = tokenizer.convert_tokens_to_ids(['[MASK]'])[0] 58 | 59 | for i, token_id in enumerate(token_ids): 60 | prob = random.random() 61 | # mask token with 15% probability 62 | if prob < 0.15: 63 | prob /= 0.15 64 | 65 | # 80% randomly change token to mask token 66 | if prob < 0.8: 67 | token_ids[i] = mask_id 68 | 69 | # 10% randomly change token to random token 70 | elif prob < 0.9: 71 | token_ids[i] = random.choice(list(tokenizer.vocab.items()))[1] 72 | 73 | # -> rest 10% randomly keep current token 74 | 75 | # append current token to output (we will predict these later) 76 | output_label.append(token_id) 77 | else: 78 | # no masking token (will be ignored by loss function later) 79 | output_label.append(-1) 80 | 81 | return token_ids, output_label 82 | 83 | 84 | 85 | 86 | 87 | -------------------------------------------------------------------------------- /common/nlp/roberta/__init__.py: -------------------------------------------------------------------------------- 1 | from .tokenization_roberta import RobertaTokenizer 2 | -------------------------------------------------------------------------------- /common/nlp/time_distributed.py: -------------------------------------------------------------------------------- 1 | """ 2 | A wrapper that unrolls the second (time) dimension of a tensor 3 | into the first (batch) dimension, applies some other ``Module``, 4 | and then rolls the time dimension back up. 5 | """ 6 | 7 | import torch 8 | 9 | 10 | class TimeDistributed(torch.nn.Module): 11 | """ 12 | Given an input shaped like ``(batch_size, time_steps, [rest])`` and a ``Module`` that takes 13 | inputs like ``(batch_size, [rest])``, ``TimeDistributed`` reshapes the input to be 14 | ``(batch_size * time_steps, [rest])``, applies the contained ``Module``, then reshapes it back. 15 | 16 | Note that while the above gives shapes with ``batch_size`` first, this ``Module`` also works if 17 | ``batch_size`` is second - we always just combine the first two dimensions, then split them. 18 | """ 19 | def __init__(self, module): 20 | super(TimeDistributed, self).__init__() 21 | self._module = module 22 | 23 | def forward(self, *inputs, **kwargs): # pylint: disable=arguments-differ 24 | reshaped_inputs = [] 25 | for input_tensor in inputs: 26 | input_size = input_tensor.size() 27 | if len(input_size) <= 2: 28 | raise RuntimeError("No dimension to distribute: " + str(input_size)) 29 | 30 | # Squash batch_size and time_steps into a single axis; result has shape 31 | # (batch_size * time_steps, input_size). 32 | squashed_shape = [-1] + [x for x in input_size[2:]] 33 | reshaped_inputs.append(input_tensor.contiguous().view(*squashed_shape)) 34 | 35 | reshaped_outputs = self._module(*reshaped_inputs, **kwargs) 36 | 37 | if isinstance(reshaped_outputs, torch.Tensor): 38 | # Now get the output back into the right shape. 
39 | # (batch_size, time_steps, [hidden_size]) 40 | new_shape = [input_size[0], input_size[1]] + [x for x in reshaped_outputs.size()[1:]] 41 | outputs = reshaped_outputs.contiguous().view(*new_shape) 42 | elif isinstance(reshaped_outputs, tuple): 43 | outputs = [] 44 | for output in reshaped_outputs: 45 | new_shape = [input_size[0], input_size[1]] + [x for x in output.size()[1:]] 46 | outputs.append(output.contiguous().view(*new_shape)) 47 | outputs = tuple(outputs) 48 | else: 49 | raise ValueError("Not support!") 50 | 51 | return outputs 52 | -------------------------------------------------------------------------------- /common/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackroos/VL-BERT/4373674cbf2bcd6c09a2c26abfdb6705b870e3be/common/utils/__init__.py -------------------------------------------------------------------------------- /common/utils/bbox.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def nonlinear_transform(ex_rois, gt_rois): 5 | """ 6 | compute bounding box regression targets from ex_rois to gt_rois 7 | :param ex_rois: [k, 4] ([x1, y1, x2, y2]) 8 | :param gt_rois: [k, 4] (corresponding gt_boxes [x1, y1, x2, y2] ) 9 | :return: bbox_targets: [k, 4] 10 | """ 11 | assert ex_rois.shape[0] == gt_rois.shape[0], 'inconsistent rois number' 12 | 13 | ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + 1.0 14 | ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + 1.0 15 | ex_ctr_x = ex_rois[:, 0] + 0.5 * (ex_widths - 1.0) 16 | ex_ctr_y = ex_rois[:, 1] + 0.5 * (ex_heights - 1.0) 17 | 18 | gt_widths = gt_rois[:, 2] - gt_rois[:, 0] + 1.0 19 | gt_heights = gt_rois[:, 3] - gt_rois[:, 1] + 1.0 20 | gt_ctr_x = gt_rois[:, 0] + 0.5 * (gt_widths - 1.0) 21 | gt_ctr_y = gt_rois[:, 1] + 0.5 * (gt_heights - 1.0) 22 | 23 | targets_dx = (gt_ctr_x - ex_ctr_x) / (ex_widths + 1e-6) 24 | targets_dy = (gt_ctr_y - ex_ctr_y) / (ex_heights + 1e-6) 25 | targets_dw = torch.log(gt_widths / (ex_widths).clamp(min=1e-6)) 26 | targets_dh = torch.log(gt_heights / ((ex_heights).clamp(min=1e-6))) 27 | 28 | targets = torch.cat( 29 | (targets_dx.view(-1, 1), targets_dy.view(-1, 1), targets_dw.view(-1, 1), targets_dh.view(-1, 1)), dim=-1) 30 | return targets 31 | 32 | 33 | def coordinate_embeddings(boxes, dim): 34 | """ 35 | Coordinate embeddings of bounding boxes 36 | :param boxes: [K, 6] ([x1, y1, x2, y2, w_image, h_image]) 37 | :param dim: sin/cos embedding dimension 38 | :return: [K, 4, 2 * dim] 39 | """ 40 | 41 | num_boxes = boxes.shape[0] 42 | w = boxes[:, 4] 43 | h = boxes[:, 5] 44 | 45 | # transform to (x_c, y_c, w, h) format 46 | boxes_ = boxes.new_zeros((num_boxes, 4)) 47 | boxes_[:, 0] = (boxes[:, 0] + boxes[:, 2]) / 2 48 | boxes_[:, 1] = (boxes[:, 1] + boxes[:, 3]) / 2 49 | boxes_[:, 2] = boxes[:, 2] - boxes[:, 0] 50 | boxes_[:, 3] = boxes[:, 3] - boxes[:, 1] 51 | boxes = boxes_ 52 | 53 | # position 54 | pos = boxes.new_zeros((num_boxes, 4)) 55 | pos[:, 0] = boxes[:, 0] / w * 100 56 | pos[:, 1] = boxes[:, 1] / h * 100 57 | pos[:, 2] = boxes[:, 2] / w * 100 58 | pos[:, 3] = boxes[:, 3] / h * 100 59 | 60 | # sin/cos embedding 61 | dim_mat = 1000 ** (torch.arange(dim, dtype=boxes.dtype, device=boxes.device) / dim) 62 | sin_embedding = (pos.view((num_boxes, 4, 1)) / dim_mat.view((1, 1, -1))).sin() 63 | cos_embedding = (pos.view((num_boxes, 4, 1)) / dim_mat.view((1, 1, -1))).cos() 64 | 65 | return torch.cat((sin_embedding, cos_embedding), dim=-1) 66 | 67 | 68 | def 
bbox_iou_py_vectorized(boxes, query_boxes): 69 | n_ = boxes.shape[0] 70 | k_ = query_boxes.shape[0] 71 | n_mesh, k_mesh = torch.meshgrid([torch.arange(n_), torch.arange(k_)]) 72 | n_mesh = n_mesh.contiguous().view(-1) 73 | k_mesh = k_mesh.contiguous().view(-1) 74 | boxes = boxes[n_mesh] 75 | query_boxes = query_boxes[k_mesh] 76 | 77 | x11, y11, x12, y12 = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3] 78 | x21, y21, x22, y22 = query_boxes[:, 0], query_boxes[:, 1], query_boxes[:, 2], query_boxes[:, 3] 79 | xA = torch.max(x11, x21) 80 | yA = torch.max(y11, y21) 81 | xB = torch.min(x12, x22) 82 | yB = torch.min(y12, y22) 83 | interArea = torch.clamp(xB - xA + 1, min=0) * torch.clamp(yB - yA + 1, min=0) 84 | boxAArea = (x12 - x11 + 1) * (y12 - y11 + 1) 85 | boxBArea = (x22 - x21 + 1) * (y22 - y21 + 1) 86 | iou = interArea / (boxAArea + boxBArea - interArea) 87 | 88 | return iou.view(n_, k_).to(boxes.device) 89 | 90 | 91 | 92 | 93 | 94 | 95 | -------------------------------------------------------------------------------- /common/utils/clip_pad.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def clip_pad_images(tensor, pad_shape, pad=0): 5 | """ 6 | Clip clip_pad_images of the pad area. 7 | :param tensor: [c, H, W] 8 | :param pad_shape: [h, w] 9 | :return: [c, h, w] 10 | """ 11 | if not isinstance(tensor, torch.Tensor): 12 | tensor = torch.as_tensor(tensor) 13 | H, W = tensor.shape[1:] 14 | h = pad_shape[1] 15 | w = pad_shape[2] 16 | 17 | tensor_ret = torch.zeros((tensor.shape[0], h, w), dtype=tensor.dtype) + pad 18 | tensor_ret[:, :min(h, H), :min(w, W)] = tensor[:, :min(h, H), :min(w, W)] 19 | 20 | return tensor_ret 21 | 22 | 23 | def clip_pad_boxes(tensor, pad_length, pad=0): 24 | """ 25 | Clip boxes of the pad area. 
26 | :param tensor: [k, d] 27 | :param pad_shape: K 28 | :return: [K, d] 29 | """ 30 | if not isinstance(tensor, torch.Tensor): 31 | tensor = torch.as_tensor(tensor) 32 | k = tensor.shape[0] 33 | d = tensor.shape[1] 34 | K = pad_length 35 | tensor_ret = torch.zeros((K, d), dtype=tensor.dtype) + pad 36 | tensor_ret[:min(k, K), :] = tensor[:min(k, K), :] 37 | 38 | return tensor_ret 39 | 40 | 41 | def clip_pad_1d(tensor, pad_length, pad=0): 42 | if not isinstance(tensor, torch.Tensor): 43 | tensor = torch.as_tensor(tensor) 44 | tensor_ret = torch.zeros((pad_length, ), dtype=tensor.dtype) + pad 45 | tensor_ret[:min(tensor.shape[0], pad_length)] = tensor[:min(tensor.shape[0], pad_length)] 46 | 47 | return tensor_ret 48 | 49 | 50 | def clip_pad_2d(tensor, pad_shape, pad=0): 51 | if not isinstance(tensor, torch.Tensor): 52 | tensor = torch.as_tensor(tensor) 53 | tensor_ret = torch.zeros(*pad_shape, dtype=tensor.dtype) + pad 54 | tensor_ret[:min(tensor.shape[0], pad_shape[0]), :min(tensor.shape[1], pad_shape[1])] \ 55 | = tensor[:min(tensor.shape[0], pad_shape[0]), :min(tensor.shape[1], pad_shape[1])] 56 | 57 | return tensor_ret 58 | -------------------------------------------------------------------------------- /common/utils/create_logger.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Deformable Convolutional Networks 3 | # Copyright (c) 2017 Microsoft 4 | # Licensed under The Apache-2.0 License [see LICENSE for details] 5 | # Written by Bin Xiao 6 | # -------------------------------------------------------- 7 | 8 | import os 9 | import logging 10 | import time 11 | import errno 12 | 13 | 14 | def makedirsExist(path): 15 | try: 16 | os.makedirs(path) 17 | except OSError as e: 18 | if e.errno == errno.EEXIST: 19 | print('Directory not created.') 20 | else: 21 | raise 22 | 23 | 24 | def create_logger(root_output_path, config_file, image_set, split='train', hypers=()): 25 | # set up logger 26 | if not os.path.exists(root_output_path): 27 | makedirsExist(root_output_path) 28 | assert os.path.exists(root_output_path), '{} does not exist'.format(root_output_path) 29 | 30 | cfg_name = os.path.splitext(os.path.basename(config_file))[0] 31 | 32 | config_output_path = os.path.join(root_output_path, '{}'.format(cfg_name)) 33 | for (hyper_name, hyper_val) in hypers: 34 | config_output_path += '@{}={}'.format(hyper_name, hyper_val) 35 | if not os.path.exists(config_output_path): 36 | makedirsExist(config_output_path) 37 | 38 | final_output_path = os.path.join(config_output_path, image_set + '_' + split) 39 | if not os.path.exists(final_output_path): 40 | makedirsExist(final_output_path) 41 | 42 | log_file = '{}_{}.log'.format(cfg_name, time.strftime('%Y-%m-%d-%H-%M')) 43 | head = '%(asctime)-15s %(message)s' 44 | logging.basicConfig(filename=os.path.join(final_output_path, log_file), format=head) 45 | logger = logging.getLogger() 46 | logger.setLevel(logging.INFO) 47 | 48 | return logger, final_output_path 49 | 50 | 51 | -------------------------------------------------------------------------------- /common/utils/flatten.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class Flattener(torch.nn.Module): 5 | def __init__(self): 6 | """ 7 | Flattens last 3 dimensions to make it only batch size, -1 8 | """ 9 | super(Flattener, self).__init__() 10 | 11 | def forward(self, x): 12 | return x.view(x.size(0), -1) 13 | 
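# Usage sketch (illustrative): Flattener keeps the batch dimension and collapses
# everything else, e.g. flattening a pooled convolutional feature map before a linear head.
if __name__ == '__main__':
    feats = torch.randn(4, 256, 7, 7)        # (batch, channels, h, w)
    print(Flattener()(feats).shape)          # torch.Size([4, 12544]) == (4, 256 * 7 * 7)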
-------------------------------------------------------------------------------- /common/utils/mask.py: -------------------------------------------------------------------------------- 1 | from skimage.draw import polygon 2 | import torch 3 | 4 | 5 | def generate_instance_mask(seg_polys, box, mask_size=(14, 14), dtype=torch.float32, copy=True): 6 | """ 7 | Generate instance mask from polygon 8 | :param seg_poly: torch.Tensor, (N, 2), (x, y) coordinate of N vertices of segmented foreground polygon 9 | :param box: array-like, (4, ), (xmin, ymin, xmax, ymax), instance bounding box 10 | :param mask_size: tuple, (mask_height, mask_weight) 11 | :param dtype: data type of generated mask 12 | :param copy: whether copy seg_polys to a new tensor first 13 | :return: torch.Tensor, of mask_size, instance mask 14 | """ 15 | mask = torch.zeros(mask_size, dtype=dtype) 16 | w_ratio = float(mask_size[0]) / (box[2] - box[0] + 1) 17 | h_ratio = float(mask_size[1]) / (box[3] - box[1] + 1) 18 | 19 | # import IPython 20 | # IPython.embed() 21 | 22 | for seg_poly in seg_polys: 23 | if copy: 24 | seg_poly = seg_poly.detach().clone() 25 | seg_poly = seg_poly.type(torch.float32) 26 | seg_poly[:, 0] = (seg_poly[:, 0] - box[0]) * w_ratio 27 | seg_poly[:, 1] = (seg_poly[:, 1] - box[1]) * h_ratio 28 | rr, cc = polygon(seg_poly[:, 1].clamp(min=0, max=mask_size[1] - 1), 29 | seg_poly[:, 0].clamp(min=0, max=mask_size[0] - 1)) 30 | 31 | mask[rr, cc] = 1 32 | return mask 33 | 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /common/utils/masked_softmax.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def masked_softmax(vector: torch.Tensor, mask: torch.Tensor, dim: int = -1) -> torch.Tensor: 5 | """ 6 | ``torch.nn.functional.softmax(vector)`` does not work if some elements of ``vector`` should be 7 | masked. This performs a softmax on just the non-masked portions of ``vector``. Passing 8 | ``None`` in for the mask is also acceptable; you'll just get a regular softmax. 9 | 10 | ``vector`` can have an arbitrary number of dimensions; the only requirement is that ``mask`` is 11 | broadcastable to ``vector's`` shape. If ``mask`` has fewer dimensions than ``vector``, we will 12 | unsqueeze on dimension 1 until they match. If you need a different unsqueezing of your mask, 13 | do it yourself before passing the mask into this function. 14 | 15 | In the case that the input vector is completely masked, this function returns an array 16 | of ``0.0``. This behavior may cause ``NaN`` if this is used as the last layer of a model 17 | that uses categorical cross-entropy loss. 18 | """ 19 | if mask is None: 20 | result = torch.nn.functional.softmax(vector, dim=dim) 21 | else: 22 | mask = mask.type(vector.dtype) 23 | while mask.dim() < vector.dim(): 24 | mask = mask.unsqueeze(1) 25 | # To limit numerical errors from large vector elements outside the mask, we zero these out. 
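        # Note that multiplying by the mask only zeroes the masked logits; those
        # positions still receive exp(0) = 1 inside the softmax. The actual masking
        # happens afterwards: masked probabilities are zeroed and the rest are
        # renormalized, with a small epsilon (larger for fp16) preventing a
        # division by zero when the whole vector is masked.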
26 | result = torch.nn.functional.softmax(vector * mask, dim=dim) 27 | result = result * mask 28 | result = result / (result.sum(dim=dim, keepdim=True) + (1e-7 if vector.dtype == torch.half else 1e-13)) 29 | return result 30 | -------------------------------------------------------------------------------- /common/utils/multi_task_dataloader.py: -------------------------------------------------------------------------------- 1 | from functools import reduce 2 | import operator 3 | from typing import List 4 | from torch.utils.data import DataLoader 5 | import sys 6 | 7 | INT_MAX = sys.maxsize 8 | 9 | 10 | def prod(iterable): 11 | if len(list(iterable)) > 0: 12 | return reduce(operator.mul, iterable) 13 | else: 14 | return 1 15 | 16 | 17 | class MultiTaskDataLoader(object): 18 | """ 19 | Multi-task DataLoader, the first dataloader is master dataloader 20 | """ 21 | def __init__(self, 22 | loaders: List[DataLoader]): 23 | assert len(loaders) > 1, "Less than 2 loader!" 24 | self.loaders = loaders 25 | self.iters = [iter(loader) for loader in loaders] 26 | self.lens = [len(loader) for loader in loaders] 27 | self.global_idx_in_cycle = 0 28 | 29 | def __iter__(self): 30 | if self.global_idx_in_cycle > 0: 31 | self.iters[0] = iter(self.loaders[0]) 32 | return self 33 | 34 | def __next__(self): 35 | output_tuple = (*next(self.iters[0]), ) 36 | for k, (loader, _iter) in enumerate(zip(self.loaders[1:], self.iters[1:])): 37 | if hasattr(loader.batch_sampler.sampler, 'set_epoch'): 38 | loader.batch_sampler.sampler.set_epoch(int(self.global_idx_in_cycle / self.lens[k+1])) 39 | try: 40 | output_tuple += (*next(_iter), ) 41 | except StopIteration: 42 | _iter = iter(loader) 43 | self.iters[k+1] = _iter 44 | output_tuple += (*next(_iter), ) 45 | 46 | if self.global_idx_in_cycle < INT_MAX - 1: 47 | self.global_idx_in_cycle += 1 48 | else: 49 | self.global_idx_in_cycle = 0 50 | 51 | return output_tuple 52 | 53 | def __len__(self): 54 | return self.lens[0] 55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /common/utils/pad_sequence.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def pad_sequence(sequence, lengths): 5 | """ 6 | :param sequence: [\sum b, .....] sequence 7 | :param lengths: [b1, b2, b3...] that sum to \sum b 8 | :return: [len(lengths), maxlen(b), .....] 
tensor 9 | """ 10 | 11 | output = sequence.new_zeros(len(lengths), max(lengths), *sequence.shape[1:]) 12 | start = 0 13 | for i, diff in enumerate(lengths): 14 | if diff > 0: 15 | output[i, :diff] = sequence[start:(start + diff)] 16 | start += diff 17 | return output 18 | -------------------------------------------------------------------------------- /common/utils/zipreader.py: -------------------------------------------------------------------------------- 1 | import zipfile 2 | import os 3 | import io 4 | import time 5 | from PIL import Image 6 | 7 | 8 | class ZipReader(object): 9 | zip_bank = dict() 10 | 11 | def __init__(self): 12 | super(ZipReader, self).__init__() 13 | 14 | @staticmethod 15 | def get_zipfile(path): 16 | zip_bank = ZipReader.zip_bank 17 | if path in zip_bank: 18 | return zip_bank[path] 19 | else: 20 | print("creating new zip_bank") 21 | zfile = zipfile.ZipFile(path, 'r') 22 | zip_bank[path] = zfile 23 | return zip_bank[path] 24 | 25 | @staticmethod 26 | def split_zip_style_path(path): 27 | pos_zip_at = path.index('.zip@') 28 | if pos_zip_at == len(path): 29 | print("character '@' is not found from the given path '%s'" % (path)) 30 | assert 0 31 | pos_at = pos_zip_at + len('.zip@') - 1 32 | 33 | zip_path = path[0: pos_at] 34 | folder_path = path[pos_at + 1:] 35 | folder_path = str.strip(folder_path, '/') 36 | return zip_path, folder_path 37 | 38 | @staticmethod 39 | def list_folder(path): 40 | zip_path, folder_path = ZipReader.split_zip_style_path(path) 41 | 42 | zfile = ZipReader.get_zipfile(zip_path) 43 | folder_list = [] 44 | for file_foler_name in zfile.namelist(): 45 | file_foler_name = str.strip(file_foler_name, '/') 46 | if file_foler_name.startswith(folder_path) and \ 47 | len(os.path.splitext(file_foler_name)[-1]) == 0 and \ 48 | file_foler_name != folder_path: 49 | if len(folder_path) == 0: 50 | folder_list.append(file_foler_name) 51 | else: 52 | folder_list.append(file_foler_name[len(folder_path)+1:]) 53 | 54 | return folder_list 55 | 56 | @staticmethod 57 | def list_files(path, extension=['.*']): 58 | zip_path, folder_path = ZipReader.split_zip_style_path(path) 59 | 60 | zfile = ZipReader.get_zipfile(zip_path) 61 | file_lists = [] 62 | for file_foler_name in zfile.namelist(): 63 | file_foler_name = str.strip(file_foler_name, '/') 64 | if file_foler_name.startswith(folder_path) and str.lower(os.path.splitext(file_foler_name)[-1]) in extension: 65 | if len(folder_path) == 0: 66 | file_lists.append(file_foler_name) 67 | else: 68 | file_lists.append(file_foler_name[len(folder_path)+1:]) 69 | 70 | return file_lists 71 | 72 | @staticmethod 73 | def imread(path): 74 | zip_path, path_img = ZipReader.split_zip_style_path(path) 75 | zfile = ZipReader.get_zipfile(zip_path) 76 | data = zfile.read(path_img) 77 | im = Image.open(io.BytesIO(data)) 78 | return im 79 | 80 | @staticmethod 81 | def read(path): 82 | zip_path, path_img = ZipReader.split_zip_style_path(path) 83 | zfile = ZipReader.get_zipfile(zip_path) 84 | data = zfile.read(path_img) 85 | return data 86 | -------------------------------------------------------------------------------- /data/PREPARE_DATA.md: -------------------------------------------------------------------------------- 1 | # Prepare Data 2 | 3 | Download datasets as you need, and organize them as following: 4 | ``` 5 | code_root/ 6 | └── data/ 7 | ├── conceptual-captions/ 8 | │ ├── train_image/ 9 | │   ├── val_image/ 10 | │ ├── train_frcnn/ 11 | │ ├── val_frcnn/ 12 | │ ├── train.json 13 | │ ├── val.json 14 | │ ├── train_frcnn.json 15 | │ └── 
val_frcnn.json 16 | ├── en_corpus/ 17 | │ ├── wiki.doc 18 | │ └── bc1g.doc 19 | ├── vcr/ 20 | │   ├── vcr1images/ 21 | │ ├── train.jsonl 22 | │ ├── val.jsonl 23 | │ └── test.jsonl 24 | └── coco/ 25 | ├── train2014/ 26 | ├── val2014/ 27 | ├── test2015/ 28 | ├── annotations/ 29 | ├── vqa/ 30 | ├── refcoco+/ 31 | │ └── proposal/ 32 | └── vgbua_res101_precomputed/ 33 | ├── trainval2014_resnet101_faster_rcnn_genome 34 | └── test2015_resnet101_faster_rcnn_genome 35 | 36 | ``` 37 | ## Pre-training Data 38 | 39 | ### Conceptual Captions 40 | See [ReadMe.txt](./conceptual-captions/ReadMe.txt). 41 | 42 | ### English Wikipedia & BooksCorpus 43 | * Wikipedia: [GoogleDrive](https://drive.google.com/file/d/1rZJ-Nj_SSqwu85tME3wbN8tfGhljfAsf/view?usp=sharing) / [BaiduPan](https://pan.baidu.com/s/1HSgUZXRESxVnx9ATOHwSrQ) 44 | * BooksCorpus: [GoogleDrive](https://drive.google.com/file/d/16T5EYqIjO-tAj1OFxz6bnnzEABCusCcv/view?usp=sharing) / [BaiduPan](https://pan.baidu.com/s/1797WFFUTnRJakgGxefSrBg) 45 | 46 | ## Fine-tuning Data 47 | 48 | ### VCR 49 | * Download and unzip images & annotations from [here](https://visualcommonsense.com/download/). 50 | 51 | ### VQA & RefCOCO+ 52 | 53 | #### Common 54 | * Download and unzip COCO 2014 images & annotations from [here](http://cocodataset.org/#download). 55 | 56 | #### VQA 57 | * Download and unzip annotations from [here](https://visualqa.org/download.html) (including "VQA Annotations" and "VQA Input Questions"), 58 | place all these files directly under ```./data/coco/vqa```. 59 | * Download and unzip following precomputed boxes & features into ```./data/coco/vgbua_res101_precomputed```. 60 | * train2014 + val2014: [GoogleDrive](https://drive.google.com/file/d/1KyLyqTqBsMX7QtLTma0xFrmhAzdQDUed/view?usp=sharing) / [BaiduPan](https://pan.baidu.com/s/1Udtoi2TC-nAimZf-vLC9PQ) 61 | * test2015: [GoogleDrive](https://drive.google.com/file/d/10nM3kRz2c827aqwVvLnv430YYFp0po6O/view?usp=sharing) / [BaiduPan](https://pan.baidu.com/s/1wd3rWfPWLBhGkEc10N9e1Q) 62 | 63 | * Download answer vocabulary from [GoogleDrive](https://drive.google.com/file/d/1CPnYcOgIOP5CZkp_KChuCg54_Ljr6-fp/view?usp=sharing) / [BaiduPan](https://pan.baidu.com/s/1IvPsH-mmqHi2glgznaBuYw), place it under the folder ```./data/coco/vqa/```. 64 | 65 | #### RefCOCO+ 66 | 67 | * Download and unzip [annotations](http://bvisionweb1.cs.unc.edu/licheng/referit/data/refcoco+.zip), place all files in ```refcoco+/``` directly under ```./data/coco/refcoco+```. 68 | * Download [region proposals](http://bvision.cs.unc.edu/licheng/MattNet/detections.zip), place all files in ```detections/refcoco+_unc``` directly under ```./data/coco/refcoco+/proposal```. -------------------------------------------------------------------------------- /data/conceptual-captions/ReadMe.txt: -------------------------------------------------------------------------------- 1 | 0. create a python 2.7 conda environment: 2 | 3 | conda create -n cc python=2.7 pip 4 | conda activate cc 5 | pip install Cython numpy Pillow 6 | 7 | 1. download "Train_GCC-training.tsv" and "Validation_GCC-1.1.0-Validation.tsv" from 8 | https://ai.google.com/research/ConceptualCaptions/download 9 | 10 | 2. move "Train_GCC-training.tsv" and "Validation_GCC-1.1.0-Validation.tsv" into 11 | conceptual-captions/utils/ 12 | 13 | 3. cd to conceptual-captions/utils/ 14 | 15 | 4. python gen_train4download.py 16 | python gen_val4download.py 17 | 18 | 5. 
sh download_train.sh 19 | sh download_val.sh 20 | 21 | * you may need to run these commands multiple times to avoid temporary network failures and download as more images as possible 22 | * these commands will skip already successfully downloaded images, so don't worry about wasting time 23 | 24 | 6. 1) zip (without compression) "train_image" by 25 | 26 | cd ../train_image 27 | zip -0 ../train_image.zip ./* 28 | cd ../utils/ 29 | 30 | 2) zip (without compression) "val_image" by 31 | 32 | cd ../val_image 33 | zip -0 ../val_image.zip ./* 34 | cd ../utils/ 35 | 36 | 7. python gen_train_image_json.py 37 | python gen_val_image_json.py 38 | 39 | 40 | 8. git clone https://github.com/jackroos/bottom-up-attention and follow "Installation" : 41 | 42 | 1) Build the Cython modules 43 | 44 | cd $REPO_ROOT/lib 45 | make 46 | 47 | 2) Build Caffe and pycaffe 48 | 49 | cd $REPO_ROOT/caffe 50 | # Now follow the Caffe installation instructions here: 51 | # http://caffe.berkeleyvision.org/installation.html 52 | 53 | # If you're experienced with Caffe and have all of the requirements installed 54 | # and your Makefile.config in place, then simply do: 55 | make -j8 && make pycaffe 56 | 57 | 3) Download pretrained model (https://www.dropbox.com/s/5xethd2nxa8qrnq/resnet101_faster_rcnn_final.caffemodel?dl=1), and put it under data/faster_rcnn_models. 58 | 59 | 9. python ./tools/generate_tsv_v2.py --gpu 0,1,2,3,4,5,6,7 --cfg experiments/cfgs/faster_rcnn_end2end_resnet.yml --def models/vg/ResNet-101/faster_rcnn_end2end_final/test.prototxt --net data/faster_rcnn_models/resnet101_faster_rcnn_final.caffemodel --split conceptual_captions_train --data_root {Conceptual_Captions_Root} --out {Conceptual_Captions_Root}/train_frcnn/ 60 | 61 | python ./tools/generate_tsv_v2.py --gpu 0,1,2,3,4,5,6,7 --cfg experiments/cfgs/faster_rcnn_end2end_resnet.yml --def models/vg/ResNet-101/faster_rcnn_end2end_final/test.prototxt --net data/faster_rcnn_models/resnet101_faster_rcnn_final.caffemodel --split conceptual_captions_val --data_root {Conceptual_Captions_Root} --out {Conceptual_Captions_Root}/val_frcnn/ 62 | 63 | 10. zip (without compression) "train_frcnn" and "val_frcnn" similar to step 6. 
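    e.g. (assuming the train_frcnn/ and val_frcnn/ folders produced in step 9 sit next to train_image/ and val_image/):

        cd ../train_frcnn
        zip -0 ../train_frcnn.zip ./*
        cd ../utils/

        cd ../val_frcnn
        zip -0 ../val_frcnn.zip ./*
        cd ../utils/

    so that the archives match the "train_frcnn.zip@/..." and "val_frcnn.zip@/..." paths written by gen_train_image_json.py and gen_val_image_json.py.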
64 | -------------------------------------------------------------------------------- /data/conceptual-captions/utils/check_valid.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from PIL import Image 3 | 4 | import warnings 5 | 6 | warnings.filterwarnings("ignore", "(Possibly )?corrupt EXIF data", UserWarning) 7 | 8 | try: 9 | im = Image.open(sys.argv[1]).convert('RGB') 10 | # remove images with too small or too large size 11 | if (im.size[0] < 10 or im.size[1] < 10 or im.size[0] > 10000 or im.size[1] > 10000): 12 | raise Exception('') 13 | except: 14 | print(sys.argv[1]) 15 | -------------------------------------------------------------------------------- /data/conceptual-captions/utils/download_train.sh: -------------------------------------------------------------------------------- 1 | # use 20 threads 2 | 3 | cat train4download.txt | xargs -n 2 -P 20 wget -nc -U 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17' --timeout=1 --waitretry=0 --tries=5 --retry-connrefused -nv -O 4 | find ../train_image -type f -size -1c -exec rm {} \; 5 | ls -d ../train_image/* | xargs -n 1 -P 20 python check_valid.py | tee train_size_invalid.txt 6 | xargs rm < train_size_invalid.txt 7 | rm train_size_invalid.txt 8 | ls ../train_image > train_valid.txt 9 | -------------------------------------------------------------------------------- /data/conceptual-captions/utils/download_val.sh: -------------------------------------------------------------------------------- 1 | # use 20 threads 2 | 3 | cat val4download.txt | xargs -n 2 -P 20 wget -nc -U 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17' --timeout=1 --waitretry=0 --tries=5 --retry-connrefused -nv -O 4 | find ../val_image -type f -size -1c -exec rm {} \; 5 | ls -d ../val_image/* | xargs -n 1 -P 20 python check_valid.py | tee val_size_invalid.txt 6 | xargs rm < val_size_invalid.txt 7 | rm val_size_invalid.txt 8 | ls ../val_image > val_valid.txt 9 | -------------------------------------------------------------------------------- /data/conceptual-captions/utils/gen_train4download.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | captions = [] 4 | urls = [] 5 | 6 | with open('Train_GCC-training.tsv') as fp: 7 | for cnt, line in enumerate(fp): 8 | s = line.split('\t') 9 | captions.append(s[0].split(' ')) 10 | urls.append(s[1][:-1]) 11 | 12 | with open('train4download.txt', 'w') as fp: 13 | for cnt, url in enumerate(urls): 14 | fp.write("../train_image/{:08d}.jpg\t\"{}\"\n".format(cnt, url)) 15 | 16 | if not os.path.exists('../train_image'): 17 | os.makedirs('../train_image') -------------------------------------------------------------------------------- /data/conceptual-captions/utils/gen_train_image_json.py: -------------------------------------------------------------------------------- 1 | captions = [] 2 | urls = [] 3 | 4 | with open('Train_GCC-training.tsv') as fp: 5 | for cnt, line in enumerate(fp): 6 | s = line.split('\t') 7 | captions.append(s[0].split(' ')) 8 | urls.append(s[1][:-1]) 9 | 10 | valids = set([]) 11 | with open('train_valid.txt') as fp: 12 | for cnt, line in enumerate(fp): 13 | valids.add(line[:-1]) 14 | 15 | import json 16 | with open('train.json', 'w') as outfile: 17 | for cnt, (cap, url) in enumerate(zip(captions, urls)): 18 | im = "{:08d}.jpg".format(cnt) 19 | if (im in valids): 20 | d = 
{'image':"train_image.zip@/{}".format(im), 'caption':cap} 21 | json.dump(d, outfile) 22 | outfile.write('\n') 23 | 24 | 25 | import json 26 | with open('train_frcnn.json', 'w') as outfile: 27 | for cnt, (cap, url) in enumerate(zip(captions, urls)): 28 | im = "{:08d}.jpg".format(cnt) 29 | if (im in valids): 30 | d = {'image':"train_image.zip@/{}".format(im), 'caption':cap, 'frcnn':"train_frcnn.zip@/{:08d}.json".format(cnt)} 31 | json.dump(d, outfile) 32 | outfile.write('\n') -------------------------------------------------------------------------------- /data/conceptual-captions/utils/gen_val4download.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | captions = [] 4 | urls = [] 5 | 6 | with open('Validation_GCC-1.1.0-Validation.tsv') as fp: 7 | for cnt, line in enumerate(fp): 8 | s = line.split('\t') 9 | captions.append(s[0].split(' ')) 10 | urls.append(s[1][:-1]) 11 | 12 | with open('val4download.txt', 'w') as fp: 13 | for cnt, url in enumerate(urls): 14 | fp.write("../val_image/{:08d}.jpg\t\"{}\"\n".format(cnt, url)) 15 | 16 | if not os.path.exists('../val_image'): 17 | os.makedirs('../val_image') -------------------------------------------------------------------------------- /data/conceptual-captions/utils/gen_val_image_json.py: -------------------------------------------------------------------------------- 1 | captions = [] 2 | urls = [] 3 | 4 | with open('Validation_GCC-1.1.0-Validation.tsv') as fp: 5 | for cnt, line in enumerate(fp): 6 | s = line.split('\t') 7 | captions.append(s[0].split(' ')) 8 | urls.append(s[1][:-1]) 9 | 10 | valids = set([]) 11 | with open('val_valid.txt') as fp: 12 | for cnt, line in enumerate(fp): 13 | valids.add(line[:-1]) 14 | 15 | import json 16 | with open('val.json', 'w') as outfile: 17 | for cnt, (cap, url) in enumerate(zip(captions, urls)): 18 | im = "{:08d}.jpg".format(cnt) 19 | if (im in valids): 20 | d = {'image':"val_image.zip@/{}".format(im), 'caption':cap} 21 | json.dump(d, outfile) 22 | outfile.write('\n') 23 | 24 | import json 25 | with open('val_frcnn.json', 'w') as outfile: 26 | for cnt, (cap, url) in enumerate(zip(captions, urls)): 27 | im = "{:08d}.jpg".format(cnt) 28 | if (im in valids): 29 | d = {'image':"val_image.zip@/{}".format(im), 'caption':cap, 'frcnn':"val_frcnn.zip@/{:08d}.json".format(cnt)} 30 | json.dump(d, outfile) 31 | outfile.write('\n') -------------------------------------------------------------------------------- /external/pytorch_pretrained_bert/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.6.0" 2 | from .tokenization import BertTokenizer, BasicTokenizer, WordpieceTokenizer 3 | from .tokenization_openai import OpenAIGPTTokenizer 4 | from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus) 5 | from .tokenization_gpt2 import GPT2Tokenizer 6 | 7 | from .modeling import (BertConfig, BertModel, BertForPreTraining, 8 | BertForMaskedLM, BertForNextSentencePrediction, 9 | BertForSequenceClassification, BertForMultipleChoice, 10 | BertForTokenClassification, BertForQuestionAnswering, 11 | load_tf_weights_in_bert) 12 | from .modeling_openai import (OpenAIGPTConfig, OpenAIGPTModel, 13 | OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel, 14 | load_tf_weights_in_openai_gpt) 15 | from .modeling_transfo_xl import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel, 16 | load_tf_weights_in_transfo_xl) 17 | from .modeling_gpt2 import (GPT2Config, GPT2Model, 18 | GPT2LMHeadModel, 
GPT2DoubleHeadsModel, 19 | load_tf_weights_in_gpt2) 20 | 21 | from .optimization import BertAdam 22 | from .optimization_openai import OpenAIAdam 23 | 24 | from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE, cached_path 25 | -------------------------------------------------------------------------------- /external/pytorch_pretrained_bert/convert_gpt2_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HugginFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | from __future__ import absolute_import, division, print_function 18 | 19 | import argparse 20 | from io import open 21 | 22 | import torch 23 | 24 | from external.pytorch_pretrained_bert.modeling_gpt2 import (CONFIG_NAME, WEIGHTS_NAME, 25 | GPT2Config, 26 | GPT2Model, 27 | load_tf_weights_in_gpt2) 28 | 29 | 30 | def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, pytorch_dump_folder_path): 31 | # Construct model 32 | if gpt2_config_file == "": 33 | config = GPT2Config() 34 | else: 35 | config = GPT2Config(gpt2_config_file) 36 | model = GPT2Model(config) 37 | 38 | # Load weights from numpy 39 | load_tf_weights_in_gpt2(model, gpt2_checkpoint_path) 40 | 41 | # Save pytorch-model 42 | pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME 43 | pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME 44 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 45 | torch.save(model.state_dict(), pytorch_weights_dump_path) 46 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 47 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 48 | f.write(config.to_json_string()) 49 | 50 | 51 | if __name__ == "__main__": 52 | parser = argparse.ArgumentParser() 53 | ## Required parameters 54 | parser.add_argument("--gpt2_checkpoint_path", 55 | default = None, 56 | type = str, 57 | required = True, 58 | help = "Path the TensorFlow checkpoint path.") 59 | parser.add_argument("--pytorch_dump_folder_path", 60 | default = None, 61 | type = str, 62 | required = True, 63 | help = "Path to the output PyTorch model.") 64 | parser.add_argument("--gpt2_config_file", 65 | default = "", 66 | type = str, 67 | help = "An optional config json file corresponding to the pre-trained OpenAI model. \n" 68 | "This specifies the model architecture.") 69 | args = parser.parse_args() 70 | convert_gpt2_checkpoint_to_pytorch(args.gpt2_checkpoint_path, 71 | args.gpt2_config_file, 72 | args.pytorch_dump_folder_path) 73 | -------------------------------------------------------------------------------- /external/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HugginFace Inc. team. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | from __future__ import absolute_import, division, print_function 18 | 19 | import argparse 20 | from io import open 21 | 22 | import torch 23 | 24 | from external.pytorch_pretrained_bert.modeling_openai import (CONFIG_NAME, WEIGHTS_NAME, 25 | OpenAIGPTConfig, 26 | OpenAIGPTModel, 27 | load_tf_weights_in_openai_gpt) 28 | 29 | 30 | def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_config_file, pytorch_dump_folder_path): 31 | # Construct model 32 | if openai_config_file == "": 33 | config = OpenAIGPTConfig() 34 | else: 35 | config = OpenAIGPTConfig(openai_config_file) 36 | model = OpenAIGPTModel(config) 37 | 38 | # Load weights from numpy 39 | load_tf_weights_in_openai_gpt(model, openai_checkpoint_folder_path) 40 | 41 | # Save pytorch-model 42 | pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME 43 | pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME 44 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 45 | torch.save(model.state_dict(), pytorch_weights_dump_path) 46 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 47 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 48 | f.write(config.to_json_string()) 49 | 50 | 51 | if __name__ == "__main__": 52 | parser = argparse.ArgumentParser() 53 | ## Required parameters 54 | parser.add_argument("--openai_checkpoint_folder_path", 55 | default = None, 56 | type = str, 57 | required = True, 58 | help = "Path the TensorFlow checkpoint path.") 59 | parser.add_argument("--pytorch_dump_folder_path", 60 | default = None, 61 | type = str, 62 | required = True, 63 | help = "Path to the output PyTorch model.") 64 | parser.add_argument("--openai_config_file", 65 | default = "", 66 | type = str, 67 | help = "An optional config json file corresponding to the pre-trained OpenAI model. \n" 68 | "This specifies the model architecture.") 69 | args = parser.parse_args() 70 | convert_openai_checkpoint_to_pytorch(args.openai_checkpoint_folder_path, 71 | args.openai_config_file, 72 | args.pytorch_dump_folder_path) 73 | -------------------------------------------------------------------------------- /external/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HugginFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert BERT checkpoint.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import argparse 22 | import torch 23 | 24 | from external.pytorch_pretrained_bert.modeling import BertConfig, BertForPreTraining, load_tf_weights_in_bert 25 | 26 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path): 27 | # Initialise PyTorch model 28 | config = BertConfig.from_json_file(bert_config_file) 29 | print("Building PyTorch model from configuration: {}".format(str(config))) 30 | model = BertForPreTraining(config) 31 | 32 | # Load weights from tf checkpoint 33 | load_tf_weights_in_bert(model, tf_checkpoint_path) 34 | 35 | # Save pytorch-model 36 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 37 | torch.save(model.state_dict(), pytorch_dump_path) 38 | 39 | 40 | if __name__ == "__main__": 41 | parser = argparse.ArgumentParser() 42 | ## Required parameters 43 | parser.add_argument("--tf_checkpoint_path", 44 | default = None, 45 | type = str, 46 | required = True, 47 | help = "Path the TensorFlow checkpoint path.") 48 | parser.add_argument("--bert_config_file", 49 | default = None, 50 | type = str, 51 | required = True, 52 | help = "The config json file corresponding to the pre-trained BERT model. \n" 53 | "This specifies the model architecture.") 54 | parser.add_argument("--pytorch_dump_path", 55 | default = None, 56 | type = str, 57 | required = True, 58 | help = "Path to the output PyTorch model.") 59 | args = parser.parse_args() 60 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, 61 | args.bert_config_file, 62 | args.pytorch_dump_path) 63 | -------------------------------------------------------------------------------- /figs/attention_viz.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackroos/VL-BERT/4373674cbf2bcd6c09a2c26abfdb6705b870e3be/figs/attention_viz.png -------------------------------------------------------------------------------- /figs/pretrain.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackroos/VL-BERT/4373674cbf2bcd6c09a2c26abfdb6705b870e3be/figs/pretrain.png -------------------------------------------------------------------------------- /model/pretrained_model/PREPARE_PRETRAINED_MODELS.md: -------------------------------------------------------------------------------- 1 | # Prepare Pre-trained Models 2 | Download pre-trained models and organize them as following: 3 | ``` 4 | code_root/ 5 | └── model/ 6 | └── pretrained_model/ 7 | ├── vl-bert-base-e2e.model 8 |    ├── vl-bert-large-e2e.model 9 | ├── vl-bert-base-prec.model 10 | ├── vl-bert-large-prec.model 11 | ├── bert-base-uncased/ 12 | │ ├── vocab.txt 13 |    │ ├── bert_config.json 14 | │ └── pytorch_model.bin 15 | ├── bert-large-uncased/ 16 | │ ├── vocab.txt 17 |    │ ├── bert_config.json 18 | │ └── pytorch_model.bin 19 | └── resnet101-pt-vgbua-0000.model 20 | ``` 21 | 22 | 23 | ## VL-BERT 24 | 25 | | Model Name | Download Link | 26 | | ------------------ | --------------- | 27 | | vl-bert-base-e2e | [GoogleDrive](https://drive.google.com/file/d/1jjV1ARYMs37tOaBalhJmwq7LcWeMai96/view?usp=sharing) / [BaiduPan](https://pan.baidu.com/s/1rl0Hl-iZZHL-3fj8hE_Uug) | 28 | | vl-bert-large-e2e | 
[GoogleDrive](https://drive.google.com/file/d/1YTHWWyP7Kq6zPySoEcTs3STaQdc5OJ7f/view?usp=sharing) / [BaiduPan](https://pan.baidu.com/s/1yqpDZRuGLsRXpklDgSC_Jw) | 29 | | vl-bert-base-prec | [GoogleDrive](https://drive.google.com/file/d/1YBFsyoWwz83VPzbimKymSBxE37gYtfgh/view?usp=sharing) / [BaiduPan](https://pan.baidu.com/s/1SvGbE2cjw8jEGWwSfJBFQQ) | 30 | | vl-bert-large-prec | [GoogleDrive](https://drive.google.com/file/d/1REZLN7c3JCHVFoi_nEO-Nn6A4PTKIygG/view?usp=sharing) / [BaiduPan](https://pan.baidu.com/s/1k4eQe2rGGGVD24ZksJteNA) | 31 | 32 | ***Note***: models with suffix "e2e" means parameters of Fast-RCNN is tuned during pre-training, 33 | while "prec" means Fast-RCNN is fixed during pre-training and for effeciency the visual features is precomputed using 34 | [bottom-up-attention](https://github.com/peteanderson80/bottom-up-attention). 35 | 36 | ## BERT & ResNet 37 | 38 | Download following pre-trained BERT and ResNet and place them under this folder. 39 | 40 | * BERT: [GoogleDrive](https://drive.google.com/file/d/14VceZht89V5i54-_xWiw58Rosa5NDL2H/view?usp=sharing) / [BaiduPan](https://pan.baidu.com/s/1dyYcw50eZznL02ilG676Yw) 41 | * ResNet101 pretrained on Visual Genome: 42 | [GoogleDrive](https://drive.google.com/file/d/1qJYtsGw1SfAyvknDZeRBnp2cF4VNjiDE/view?usp=sharing) / [BaiduPan](https://pan.baidu.com/s/1_yfZG8VqbWmp5Kr9w2DKGQ) 43 | (converted from [caffe model](https://www.dropbox.com/s/wqada4qiv1dz9dk/resnet101_faster_rcnn_final.caffemodel?dl=1)) -------------------------------------------------------------------------------- /pretrain/_init_paths.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | this_dir = os.path.abspath(os.path.dirname(__file__)) 5 | 6 | 7 | def add_path(path): 8 | if path not in sys.path: 9 | sys.path.insert(0, path) 10 | 11 | 12 | root_path = os.path.join(this_dir, '../') 13 | add_path(root_path) 14 | -------------------------------------------------------------------------------- /pretrain/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackroos/VL-BERT/4373674cbf2bcd6c09a2c26abfdb6705b870e3be/pretrain/data/__init__.py -------------------------------------------------------------------------------- /pretrain/data/collate_batch.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from common.utils.clip_pad import * 3 | 4 | 5 | class BatchCollator(object): 6 | def __init__(self, dataset, append_ind=False): 7 | self.dataset = dataset 8 | self.test_mode = self.dataset.test_mode 9 | self.data_names = self.dataset.data_names 10 | self.append_ind = append_ind 11 | 12 | def __call__(self, batch): 13 | if not isinstance(batch, list): 14 | batch = list(batch) 15 | 16 | if 'image' in self.data_names: 17 | if batch[0][self.data_names.index('image')] is not None: 18 | max_shape = tuple(max(s) for s in zip(*[data[self.data_names.index('image')].shape for data in batch])) 19 | image_none = False 20 | else: 21 | image_none = True 22 | if 'boxes' in self.data_names: 23 | max_boxes = max([data[self.data_names.index('boxes')].shape[0] for data in batch]) 24 | if 'text' in self.data_names: 25 | max_text_length = max([len(data[self.data_names.index('text')]) for data in batch]) 26 | 27 | for i, ibatch in enumerate(batch): 28 | out = {} 29 | 30 | if 'image' in self.data_names: 31 | if image_none: 32 | out['image'] = None 33 | else: 34 | image = 
ibatch[self.data_names.index('image')] 35 | out['image'] = clip_pad_images(image, max_shape, pad=0) 36 | 37 | if 'boxes' in self.data_names: 38 | boxes = ibatch[self.data_names.index('boxes')] 39 | out['boxes'] = clip_pad_boxes(boxes, max_boxes, pad=-2) 40 | 41 | if 'text' in self.data_names: 42 | text = ibatch[self.data_names.index('text')] 43 | out['text'] = clip_pad_1d(text, max_text_length, pad=0) 44 | 45 | if 'mlm_labels' in self.data_names: 46 | mlm_labels = ibatch[self.data_names.index('mlm_labels')] 47 | out['mlm_labels'] = clip_pad_1d(mlm_labels, max_text_length, pad=-1) 48 | 49 | if 'mvrc_ops' in self.data_names: 50 | mvrc_ops = ibatch[self.data_names.index('mvrc_ops')] 51 | out['mvrc_ops'] = clip_pad_1d(mvrc_ops, max_boxes, pad=0) 52 | 53 | if 'mvrc_labels' in self.data_names: 54 | mvrc_labels = ibatch[self.data_names.index('mvrc_labels')] 55 | out['mvrc_labels'] = clip_pad_boxes(mvrc_labels, max_boxes, pad=0) 56 | 57 | other_names = [data_name for data_name in self.data_names if data_name not in out] 58 | for name in other_names: 59 | out[name] = torch.as_tensor(ibatch[self.data_names.index(name)]) 60 | 61 | batch[i] = tuple(out[data_name] for data_name in self.data_names) 62 | if self.append_ind: 63 | batch[i] += (torch.tensor(i, dtype=torch.int64),) 64 | 65 | out_tuple = () 66 | for items in zip(*batch): 67 | if items[0] is None: 68 | out_tuple += (None,) 69 | else: 70 | out_tuple += (torch.stack(tuple(items), dim=0), ) 71 | 72 | return out_tuple 73 | 74 | -------------------------------------------------------------------------------- /pretrain/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .conceptual_captions import ConceptualCaptionsDataset 2 | from .coco_captions import COCOCaptionsDataset 3 | from .general_corpus import GeneralCorpus 4 | 5 | 6 | -------------------------------------------------------------------------------- /pretrain/data/samplers/__init__.py: -------------------------------------------------------------------------------- 1 | from .distributed import DistributedSampler 2 | from .grouped_batch_sampler import GroupedBatchSampler 3 | 4 | -------------------------------------------------------------------------------- /pretrain/data/samplers/distributed.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | # Code is copy-pasted exactly as in torch.utils.data.distributed. 3 | # FIXME remove this once c10d fixes the bug it has 4 | import math 5 | import torch 6 | import torch.distributed as dist 7 | from torch.utils.data.sampler import Sampler 8 | 9 | 10 | class DistributedSampler(Sampler): 11 | """Sampler that restricts data loading to a subset of the dataset. 12 | It is especially useful in conjunction with 13 | :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each 14 | process can pass a DistributedSampler instance as a DataLoader sampler, 15 | and load a subset of the original dataset that is exclusive to it. 16 | .. note:: 17 | Dataset is assumed to be of constant size. 18 | Arguments: 19 | dataset: Dataset used for sampling. 20 | num_replicas (optional): Number of processes participating in 21 | distributed training. 22 | rank (optional): Rank of the current process within num_replicas. 
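        shuffle (optional): Whether to reshuffle the indices every epoch.
            Shuffling is seeded with the current epoch, so call set_epoch()
            at the start of each epoch to obtain a new deterministic order.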
23 | """ 24 | 25 | def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True): 26 | if num_replicas is None: 27 | if not dist.is_available(): 28 | raise RuntimeError("Requires distributed package to be available") 29 | num_replicas = dist.get_world_size() 30 | if rank is None: 31 | if not dist.is_available(): 32 | raise RuntimeError("Requires distributed package to be available") 33 | rank = dist.get_rank() 34 | self.dataset = dataset 35 | self.num_replicas = num_replicas 36 | self.rank = rank 37 | self.epoch = 0 38 | self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) 39 | self.total_size = self.num_samples * self.num_replicas 40 | self.shuffle = shuffle 41 | 42 | def __iter__(self): 43 | if self.shuffle: 44 | # deterministically shuffle based on epoch 45 | g = torch.Generator() 46 | g.manual_seed(self.epoch) 47 | indices = torch.randperm(len(self.dataset), generator=g).tolist() 48 | else: 49 | indices = torch.arange(len(self.dataset)).tolist() 50 | 51 | # add extra samples to make it evenly divisible 52 | indices += indices[: (self.total_size - len(indices))] 53 | assert len(indices) == self.total_size 54 | 55 | # subsample 56 | offset = self.num_samples * self.rank 57 | indices = indices[offset : offset + self.num_samples] 58 | assert len(indices) == self.num_samples 59 | 60 | return iter(indices) 61 | 62 | def __len__(self): 63 | return self.num_samples 64 | 65 | def set_epoch(self, epoch): 66 | self.epoch = epoch -------------------------------------------------------------------------------- /pretrain/data/transforms/__init__.py: -------------------------------------------------------------------------------- 1 | from .transforms import Compose 2 | from .transforms import Resize 3 | from .transforms import RandomHorizontalFlip 4 | from .transforms import ToTensor 5 | from .transforms import Normalize 6 | 7 | from .build import build_transforms 8 | -------------------------------------------------------------------------------- /pretrain/data/transforms/build.py: -------------------------------------------------------------------------------- 1 | from . 
import transforms as T 2 | 3 | 4 | def build_transforms(cfg, mode='train'): 5 | assert mode in ['train', 'test', 'val'] 6 | min_size = cfg.SCALES[0] 7 | max_size = cfg.SCALES[1] 8 | assert min_size <= max_size 9 | 10 | if mode == 'train': 11 | flip_prob = cfg.TRAIN.FLIP_PROB 12 | elif mode == 'test': 13 | flip_prob = cfg.TEST.FLIP_PROB 14 | else: 15 | flip_prob = cfg.VAL.FLIP_PROB 16 | 17 | to_bgr255 = True 18 | 19 | normalize_transform = T.Normalize( 20 | mean=cfg.NETWORK.PIXEL_MEANS, std=cfg.NETWORK.PIXEL_STDS, to_bgr255=to_bgr255 21 | ) 22 | 23 | # transform = T.Compose( 24 | # [ 25 | # T.Resize(min_size, max_size), 26 | # T.RandomHorizontalFlip(flip_prob), 27 | # T.ToTensor(), 28 | # normalize_transform, 29 | # T.FixPadding(min_size, max_size, pad=0) 30 | # ] 31 | # ) 32 | 33 | transform = T.Compose( 34 | [ 35 | T.Resize(min_size, max_size), 36 | T.RandomHorizontalFlip(flip_prob), 37 | T.ToTensor(), 38 | normalize_transform, 39 | ] 40 | ) 41 | 42 | return transform 43 | -------------------------------------------------------------------------------- /pretrain/function/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackroos/VL-BERT/4373674cbf2bcd6c09a2c26abfdb6705b870e3be/pretrain/function/__init__.py -------------------------------------------------------------------------------- /pretrain/function/val.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | import torch 3 | from common.trainer import to_cuda 4 | 5 | 6 | @torch.no_grad() 7 | def do_validation(net, val_loader, metrics, label_index_in_batch): 8 | net.eval() 9 | metrics.reset() 10 | for nbatch, batch in enumerate(val_loader): 11 | batch = to_cuda(batch) 12 | outputs, _ = net(*batch) 13 | metrics.update(outputs) 14 | 15 | -------------------------------------------------------------------------------- /pretrain/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .resnet_vlbert_for_pretraining import ResNetVLBERTForPretraining 2 | from .resnet_vlbert_for_pretraining_multitask import ResNetVLBERTForPretrainingMultitask 3 | from .resnet_vlbert_for_attention_vis import ResNetVLBERTForAttentionVis 4 | 5 | 6 | -------------------------------------------------------------------------------- /pretrain/train_end2end.py: -------------------------------------------------------------------------------- 1 | import _init_paths 2 | import os 3 | import argparse 4 | import torch 5 | import subprocess 6 | 7 | from pretrain.function.config import config, update_config 8 | from pretrain.function.train import train_net 9 | 10 | 11 | def parse_args(): 12 | parser = argparse.ArgumentParser('Train Cognition Network') 13 | parser.add_argument('--cfg', type=str, help='path to config file') 14 | parser.add_argument('--model-dir', type=str, help='root path to store checkpoint') 15 | parser.add_argument('--log-dir', type=str, help='tensorboard log dir') 16 | parser.add_argument('--dist', help='whether to use distributed training', default=False, action='store_true') 17 | parser.add_argument('--slurm', help='whether this is a slurm job', default=False, action='store_true') 18 | parser.add_argument('--do-test', help='whether to generate csv result on test set', 19 | default=False, action='store_true') 20 | parser.add_argument('--cudnn-off', help='disable cudnn', default=False, action='store_true') 21 | 22 | args = parser.parse_args() 23 | 24 | 
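    # Merge the YAML file passed via --cfg into the global config; if --model-dir is
    # given, checkpoints are written under <model-dir>/<OUTPUT_PATH from the config>.
    # The SLURM branch below translates SLURM_* environment variables into the
    # MASTER_ADDR / MASTER_PORT / WORLD_SIZE / RANK / LOCAL_RANK variables expected
    # for distributed initialization.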
if args.cfg is not None: 25 | update_config(args.cfg) 26 | if args.model_dir is not None: 27 | config.OUTPUT_PATH = os.path.join(args.model_dir, config.OUTPUT_PATH) 28 | 29 | if args.slurm: 30 | proc_id = int(os.environ['SLURM_PROCID']) 31 | ntasks = int(os.environ['SLURM_NTASKS']) 32 | node_list = os.environ['SLURM_NODELIST'] 33 | num_gpus = torch.cuda.device_count() 34 | addr = subprocess.getoutput( 35 | 'scontrol show hostname {} | head -n1'.format(node_list)) 36 | os.environ['MASTER_PORT'] = str(29500) 37 | os.environ['MASTER_ADDR'] = addr 38 | os.environ['WORLD_SIZE'] = str(ntasks) 39 | os.environ['RANK'] = str(proc_id) 40 | os.environ['LOCAL_RANK'] = str(proc_id % num_gpus) 41 | 42 | return args, config 43 | 44 | 45 | def main(): 46 | args, config = parse_args() 47 | rank, model = train_net(args, config) 48 | 49 | 50 | if __name__ == '__main__': 51 | main() 52 | 53 | 54 | -------------------------------------------------------------------------------- /pretrain/vis_attention_maps.py: -------------------------------------------------------------------------------- 1 | import _init_paths 2 | import os 3 | import argparse 4 | import torch 5 | import subprocess 6 | 7 | from pretrain.function.config import config, update_config 8 | from pretrain.function.vis import vis_net 9 | 10 | 11 | def parse_args(): 12 | parser = argparse.ArgumentParser('Visualize Attention Maps') 13 | parser.add_argument('--cfg', type=str, help='path to config file') 14 | parser.add_argument('--dist', help='whether to use distributed training', default=False, action='store_true') 15 | parser.add_argument('--slurm', help='whether this is a slurm job', default=False, action='store_true') 16 | parser.add_argument('--save-dir', help='directory to save attention maps', type=str, default='./attention_maps') 17 | 18 | args = parser.parse_args() 19 | 20 | if args.cfg is not None: 21 | update_config(args.cfg) 22 | 23 | if args.slurm: 24 | proc_id = int(os.environ['SLURM_PROCID']) 25 | ntasks = int(os.environ['SLURM_NTASKS']) 26 | node_list = os.environ['SLURM_NODELIST'] 27 | num_gpus = torch.cuda.device_count() 28 | addr = subprocess.getoutput( 29 | 'scontrol show hostname {} | head -n1'.format(node_list)) 30 | os.environ['MASTER_PORT'] = str(29500) 31 | os.environ['MASTER_ADDR'] = addr 32 | os.environ['WORLD_SIZE'] = str(ntasks) 33 | os.environ['RANK'] = str(proc_id) 34 | os.environ['LOCAL_RANK'] = str(proc_id % num_gpus) 35 | 36 | return args, config 37 | 38 | 39 | def main(): 40 | args, config = parse_args() 41 | rank, model = vis_net(args, config, args.save_dir) 42 | 43 | 44 | if __name__ == '__main__': 45 | main() -------------------------------------------------------------------------------- /refcoco/_init_paths.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | this_dir = os.path.abspath(os.path.dirname(__file__)) 5 | 6 | 7 | def add_path(path): 8 | if path not in sys.path: 9 | sys.path.insert(0, path) 10 | 11 | 12 | root_path = os.path.join(this_dir, '../') 13 | add_path(root_path) 14 | -------------------------------------------------------------------------------- /refcoco/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackroos/VL-BERT/4373674cbf2bcd6c09a2c26abfdb6705b870e3be/refcoco/data/__init__.py -------------------------------------------------------------------------------- /refcoco/data/collate_batch.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | from common.utils.clip_pad import * 3 | 4 | 5 | class BatchCollator(object): 6 | def __init__(self, dataset, append_ind=False): 7 | self.dataset = dataset 8 | self.test_mode = self.dataset.test_mode 9 | self.data_names = self.dataset.data_names 10 | self.append_ind = append_ind 11 | 12 | def __call__(self, batch): 13 | if not isinstance(batch, list): 14 | batch = list(batch) 15 | 16 | if batch[0][self.data_names.index('image')] is not None: 17 | max_shape = tuple(max(s) for s in zip(*[data[self.data_names.index('image')].shape for data in batch])) 18 | image_none = False 19 | else: 20 | image_none = True 21 | max_boxes = max([data[self.data_names.index('boxes')].shape[0] for data in batch]) 22 | max_expression_length = max([len(data[self.data_names.index('expression')]) for data in batch]) 23 | 24 | for i, ibatch in enumerate(batch): 25 | out = {} 26 | 27 | if image_none: 28 | out['image'] = None 29 | else: 30 | image = ibatch[self.data_names.index('image')] 31 | out['image'] = clip_pad_images(image, max_shape, pad=0) 32 | 33 | boxes = ibatch[self.data_names.index('boxes')] 34 | out['boxes'] = clip_pad_boxes(boxes, max_boxes, pad=-2) 35 | 36 | expression = ibatch[self.data_names.index('expression')] 37 | out['expression'] = clip_pad_1d(expression, max_expression_length, pad=0) 38 | 39 | if 'label' in self.data_names: 40 | label = ibatch[self.data_names.index('label')] 41 | out['label'] = clip_pad_1d(label, max_boxes, pad=-1) 42 | 43 | other_names = [data_name for data_name in self.data_names if data_name not in out] 44 | for name in other_names: 45 | out[name] = torch.as_tensor(ibatch[self.data_names.index(name)]) 46 | 47 | batch[i] = tuple(out[data_name] for data_name in self.data_names) 48 | if self.append_ind: 49 | batch[i] += (torch.tensor(i, dtype=torch.int64),) 50 | 51 | out_tuple = () 52 | for items in zip(*batch): 53 | if items[0] is None: 54 | out_tuple += (None,) 55 | else: 56 | out_tuple += (torch.stack(tuple(items), dim=0), ) 57 | 58 | return out_tuple 59 | 60 | -------------------------------------------------------------------------------- /refcoco/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .refcoco import RefCOCO 2 | 3 | -------------------------------------------------------------------------------- /refcoco/data/datasets/refer/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | # install pycocotools/mask locally 3 | # copy from https://github.com/pdollar/coco.git 4 | python setup.py build_ext --inplace 5 | rm -rf build 6 | 7 | -------------------------------------------------------------------------------- /refcoco/data/datasets/refer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackroos/VL-BERT/4373674cbf2bcd6c09a2c26abfdb6705b870e3be/refcoco/data/datasets/refer/__init__.py -------------------------------------------------------------------------------- /refcoco/data/datasets/refer/external/.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | _mask.so 3 | _mask.c 4 | -------------------------------------------------------------------------------- /refcoco/data/datasets/refer/external/README.md: -------------------------------------------------------------------------------- 1 | The codes inside this folder are 
copied from pycocotools: https://github.com/pdollar/coco -------------------------------------------------------------------------------- /refcoco/data/datasets/refer/external/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /refcoco/data/datasets/refer/external/maskApi.h: -------------------------------------------------------------------------------- 1 | /************************************************************************** 2 | * Microsoft COCO Toolbox. version 2.0 3 | * Data, paper, and tutorials available at: http://mscoco.org/ 4 | * Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 5 | * Licensed under the Simplified BSD License [see coco/license.txt] 6 | **************************************************************************/ 7 | #pragma once 8 | 9 | typedef unsigned int uint; 10 | typedef unsigned long siz; 11 | typedef unsigned char byte; 12 | typedef double* BB; 13 | typedef struct { siz h, w, m; uint *cnts; } RLE; 14 | 15 | /* Initialize/destroy RLE. */ 16 | void rleInit( RLE *R, siz h, siz w, siz m, uint *cnts ); 17 | void rleFree( RLE *R ); 18 | 19 | /* Initialize/destroy RLE array. */ 20 | void rlesInit( RLE **R, siz n ); 21 | void rlesFree( RLE **R, siz n ); 22 | 23 | /* Encode binary masks using RLE. */ 24 | void rleEncode( RLE *R, const byte *mask, siz h, siz w, siz n ); 25 | 26 | /* Decode binary masks encoded via RLE. */ 27 | void rleDecode( const RLE *R, byte *mask, siz n ); 28 | 29 | /* Compute union or intersection of encoded masks. */ 30 | void rleMerge( const RLE *R, RLE *M, siz n, int intersect ); 31 | 32 | /* Compute area of encoded masks. */ 33 | void rleArea( const RLE *R, siz n, uint *a ); 34 | 35 | /* Compute intersection over union between masks. */ 36 | void rleIou( RLE *dt, RLE *gt, siz m, siz n, byte *iscrowd, double *o ); 37 | 38 | /* Compute non-maximum suppression between bounding masks */ 39 | void rleNms( RLE *dt, siz n, uint *keep, double thr ); 40 | 41 | /* Compute intersection over union between bounding boxes. */ 42 | void bbIou( BB dt, BB gt, siz m, siz n, byte *iscrowd, double *o ); 43 | 44 | /* Compute non-maximum suppression between bounding boxes */ 45 | void bbNms( BB dt, siz n, uint *keep, double thr ); 46 | 47 | /* Get bounding boxes surrounding encoded masks. */ 48 | void rleToBbox( const RLE *R, BB bb, siz n ); 49 | 50 | /* Convert bounding boxes to encoded masks. */ 51 | void rleFrBbox( RLE *R, const BB bb, siz h, siz w, siz n ); 52 | 53 | /* Convert polygon to encoded mask. */ 54 | void rleFrPoly( RLE *R, const double *xy, siz k, siz h, siz w ); 55 | 56 | /* Get compressed string representation of encoded mask. */ 57 | char* rleToString( const RLE *R ); 58 | 59 | /* Convert from compressed string representation of encoded mask. */ 60 | void rleFrString( RLE *R, char *s, siz h, siz w ); 61 | -------------------------------------------------------------------------------- /refcoco/data/datasets/refer/setup.py: -------------------------------------------------------------------------------- 1 | """ 2 | This code is for making mask.so, used to visualize the segmentation of referred object. 
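Build it in place by running `make` in this directory (see the Makefile), which simply calls `python setup.py build_ext --inplace`.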
3 | All "mask" related code is copied from https://github.com/pdollar/coco.git 4 | """ 5 | from distutils.core import setup 6 | from Cython.Build import cythonize 7 | from distutils.extension import Extension 8 | import numpy as np 9 | 10 | ext_modules = [ 11 | Extension( 12 | 'external._mask', 13 | sources=['external/maskApi.c', 'external/_mask.pyx'], 14 | include_dirs = [np.get_include(), 'external'], 15 | extra_compile_args=['-Wno-cpp', '-Wno-unused-function', '-std=c99'], 16 | ) 17 | ] 18 | 19 | setup( 20 | name='external', 21 | packages=['external'], 22 | package_dir = {'external': 'external'}, 23 | version='2.0', 24 | ext_modules=cythonize(ext_modules) 25 | ) 26 | -------------------------------------------------------------------------------- /refcoco/data/samplers/__init__.py: -------------------------------------------------------------------------------- 1 | from .distributed import DistributedSampler 2 | from .grouped_batch_sampler import GroupedBatchSampler 3 | 4 | -------------------------------------------------------------------------------- /refcoco/data/samplers/distributed.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | # Code is copy-pasted exactly as in torch.utils.data.distributed. 3 | # FIXME remove this once c10d fixes the bug it has 4 | import math 5 | import torch 6 | import torch.distributed as dist 7 | from torch.utils.data.sampler import Sampler 8 | 9 | 10 | class DistributedSampler(Sampler): 11 | """Sampler that restricts data loading to a subset of the dataset. 12 | It is especially useful in conjunction with 13 | :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each 14 | process can pass a DistributedSampler instance as a DataLoader sampler, 15 | and load a subset of the original dataset that is exclusive to it. 16 | .. note:: 17 | Dataset is assumed to be of constant size. 18 | Arguments: 19 | dataset: Dataset used for sampling. 20 | num_replicas (optional): Number of processes participating in 21 | distributed training. 22 | rank (optional): Rank of the current process within num_replicas. 
23 | """ 24 | 25 | def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True): 26 | if num_replicas is None: 27 | if not dist.is_available(): 28 | raise RuntimeError("Requires distributed package to be available") 29 | num_replicas = dist.get_world_size() 30 | if rank is None: 31 | if not dist.is_available(): 32 | raise RuntimeError("Requires distributed package to be available") 33 | rank = dist.get_rank() 34 | self.dataset = dataset 35 | self.num_replicas = num_replicas 36 | self.rank = rank 37 | self.epoch = 0 38 | self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) 39 | self.total_size = self.num_samples * self.num_replicas 40 | self.shuffle = shuffle 41 | 42 | def __iter__(self): 43 | if self.shuffle: 44 | # deterministically shuffle based on epoch 45 | g = torch.Generator() 46 | g.manual_seed(self.epoch) 47 | indices = torch.randperm(len(self.dataset), generator=g).tolist() 48 | else: 49 | indices = torch.arange(len(self.dataset)).tolist() 50 | 51 | # add extra samples to make it evenly divisible 52 | indices += indices[: (self.total_size - len(indices))] 53 | assert len(indices) == self.total_size 54 | 55 | # subsample 56 | offset = self.num_samples * self.rank 57 | indices = indices[offset : offset + self.num_samples] 58 | assert len(indices) == self.num_samples 59 | 60 | return iter(indices) 61 | 62 | def __len__(self): 63 | return self.num_samples 64 | 65 | def set_epoch(self, epoch): 66 | self.epoch = epoch -------------------------------------------------------------------------------- /refcoco/data/transforms/__init__.py: -------------------------------------------------------------------------------- 1 | from .transforms import Compose 2 | from .transforms import Resize 3 | from .transforms import RandomHorizontalFlip 4 | from .transforms import ToTensor 5 | from .transforms import Normalize 6 | 7 | from .build import build_transforms 8 | -------------------------------------------------------------------------------- /refcoco/data/transforms/build.py: -------------------------------------------------------------------------------- 1 | from . 
import transforms as T 2 | 3 | 4 | def build_transforms(cfg, mode='train'): 5 | assert mode in ['train', 'test', 'val'] 6 | min_size = cfg.SCALES[0] 7 | max_size = cfg.SCALES[1] 8 | assert min_size <= max_size 9 | 10 | if mode == 'train': 11 | flip_prob = cfg.TRAIN.FLIP_PROB 12 | elif mode == 'test': 13 | flip_prob = cfg.TEST.FLIP_PROB 14 | else: 15 | flip_prob = cfg.VAL.FLIP_PROB 16 | 17 | to_bgr255 = True 18 | 19 | normalize_transform = T.Normalize( 20 | mean=cfg.NETWORK.PIXEL_MEANS, std=cfg.NETWORK.PIXEL_STDS, to_bgr255=to_bgr255 21 | ) 22 | 23 | # transform = T.Compose( 24 | # [ 25 | # T.Resize(min_size, max_size), 26 | # T.RandomHorizontalFlip(flip_prob), 27 | # T.ToTensor(), 28 | # normalize_transform, 29 | # T.FixPadding(min_size, max_size, pad=0) 30 | # ] 31 | # ) 32 | 33 | transform = T.Compose( 34 | [ 35 | T.Resize(min_size, max_size), 36 | T.RandomHorizontalFlip(flip_prob), 37 | T.ToTensor(), 38 | normalize_transform, 39 | ] 40 | ) 41 | 42 | return transform 43 | -------------------------------------------------------------------------------- /refcoco/function/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackroos/VL-BERT/4373674cbf2bcd6c09a2c26abfdb6705b870e3be/refcoco/function/__init__.py -------------------------------------------------------------------------------- /refcoco/function/val.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | import torch 3 | from common.trainer import to_cuda 4 | 5 | 6 | @torch.no_grad() 7 | def do_validation(net, val_loader, metrics, label_index_in_batch): 8 | net.eval() 9 | metrics.reset() 10 | for nbatch, batch in enumerate(val_loader): 11 | batch = to_cuda(batch) 12 | label = batch[label_index_in_batch] 13 | datas = [batch[i] for i in range(len(batch)) if i != label_index_in_batch % len(batch)] 14 | 15 | outputs = net(*datas) 16 | outputs.update({'label': label}) 17 | metrics.update(outputs) 18 | 19 | -------------------------------------------------------------------------------- /refcoco/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .resnet_vlbert_for_refcoco import ResNetVLBERT 2 | 3 | 4 | -------------------------------------------------------------------------------- /refcoco/test.py: -------------------------------------------------------------------------------- 1 | import _init_paths 2 | import os 3 | import argparse 4 | 5 | from refcoco.function.config import config, update_config 6 | from refcoco.function.test import test_net 7 | 8 | 9 | def parse_args(): 10 | parser = argparse.ArgumentParser('Train Cognition Network') 11 | parser.add_argument('--cfg', type=str, help='path to config file') 12 | parser.add_argument('--ckpt', type=str, help='root path to store checkpoint') 13 | parser.add_argument('--gpus', type=int, nargs='+', help='indices of GPUs to use', default=[0]) 14 | parser.add_argument('--bs', type=int) 15 | parser.add_argument('--split', type=str, choices=['test', 'testA', 'testB', 'val'], default='val') 16 | parser.add_argument('--result-path', type=str, help='dir to save result file') 17 | parser.add_argument('--result-name', type=str, help='name of result file') 18 | 19 | args = parser.parse_args() 20 | 21 | if args.cfg is not None: 22 | update_config(args.cfg) 23 | 24 | config.GPUS = ','.join([str(index) for index in args.gpus]) 25 | 26 | if args.bs is not None: 27 | config.TEST.BATCH_IMAGES = 
args.bs 28 | 29 | return args, config 30 | 31 | 32 | def main(): 33 | args, config = parse_args() 34 | 35 | test_net(args, config) 36 | 37 | 38 | if __name__ == '__main__': 39 | main() 40 | -------------------------------------------------------------------------------- /refcoco/train_end2end.py: -------------------------------------------------------------------------------- 1 | import _init_paths 2 | import os 3 | import argparse 4 | import torch 5 | import subprocess 6 | 7 | from refcoco.function.config import config, update_config 8 | from refcoco.function.train import train_net 9 | from refcoco.function.test import test_net 10 | 11 | 12 | def parse_args(): 13 | parser = argparse.ArgumentParser('Train Cognition Network') 14 | parser.add_argument('--cfg', type=str, help='path to config file') 15 | parser.add_argument('--model-dir', type=str, help='root path to store checkpoint') 16 | parser.add_argument('--log-dir', type=str, help='tensorboard log dir') 17 | parser.add_argument('--dist', help='whether to use distributed training', default=False, action='store_true') 18 | parser.add_argument('--slurm', help='whether this is a slurm job', default=False, action='store_true') 19 | parser.add_argument('--do-test', help='whether to generate csv result on test set', 20 | default=False, action='store_true') 21 | parser.add_argument('--cudnn-off', help='disable cudnn', default=False, action='store_true') 22 | 23 | # easy test pretrain model 24 | parser.add_argument('--partial-pretrain', type=str) 25 | 26 | args = parser.parse_args() 27 | 28 | if args.cfg is not None: 29 | update_config(args.cfg) 30 | if args.model_dir is not None: 31 | config.OUTPUT_PATH = os.path.join(args.model_dir, config.OUTPUT_PATH) 32 | 33 | if args.partial_pretrain is not None: 34 | config.NETWORK.PARTIAL_PRETRAIN = args.partial_pretrain 35 | 36 | if args.slurm: 37 | proc_id = int(os.environ['SLURM_PROCID']) 38 | ntasks = int(os.environ['SLURM_NTASKS']) 39 | node_list = os.environ['SLURM_NODELIST'] 40 | num_gpus = torch.cuda.device_count() 41 | addr = subprocess.getoutput( 42 | 'scontrol show hostname {} | head -n1'.format(node_list)) 43 | os.environ['MASTER_PORT'] = str(29500) 44 | os.environ['MASTER_ADDR'] = addr 45 | os.environ['WORLD_SIZE'] = str(ntasks) 46 | os.environ['RANK'] = str(proc_id) 47 | os.environ['LOCAL_RANK'] = str(proc_id % num_gpus) 48 | 49 | return args, config 50 | 51 | 52 | def main(): 53 | args, config = parse_args() 54 | rank, model = train_net(args, config) 55 | if args.do_test and (rank is None or rank == 0): 56 | test_net(args, config) 57 | 58 | 59 | if __name__ == '__main__': 60 | main() 61 | 62 | 63 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | easydict 2 | jsonlines 3 | matplotlib 4 | networkx 5 | numpy 6 | opencv-python 7 | pandas 8 | Pillow 9 | protobuf==3.10.0 10 | pycocotools 11 | PyYAML 12 | regex==2019.8.19 13 | requests==2.22.0 14 | scikit-image 15 | scipy 16 | tensorboard 17 | tensorboardX 18 | tensorflow 19 | tqdm 20 | urllib3 21 | boto3 22 | -------------------------------------------------------------------------------- /scripts/dist_run_multi.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | python ./scripts/launch.py \ 4 | --nnodes "$1" --node_rank "$2" --master_addr "$3" --nproc_per_node "$4" \ 5 | "$5" --cfg "$6" --model-dir "$7" 
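Note on the distributed launch plumbing above: `scripts/dist_run_multi.sh` simply forwards the node count, node rank, master address and processes-per-node to `scripts/launch.py`, while the `--slurm` branch of `train_end2end.py` derives the same `MASTER_ADDR`, `MASTER_PORT`, `WORLD_SIZE`, `RANK` and `LOCAL_RANK` variables from the SLURM environment. The sketch below is editorial, not code from this repo; it only illustrates how a worker process typically consumes those variables through `torch.distributed`'s `env://` initializer, and the repo's actual initialization inside its training entry points may differ.

```python
# Editorial sketch (not repo code): consuming MASTER_ADDR / MASTER_PORT /
# WORLD_SIZE / RANK / LOCAL_RANK, as exported by the --slurm branch above,
# via torch.distributed's env:// rendezvous.
import os

import torch
import torch.distributed as dist


def init_distributed_from_env():
    local_rank = int(os.environ.get('LOCAL_RANK', '0'))
    torch.cuda.set_device(local_rank)              # pin this process to one GPU
    dist.init_process_group(backend='nccl',        # env:// reads MASTER_ADDR, MASTER_PORT,
                            init_method='env://')  # WORLD_SIZE and RANK from the environment
    return dist.get_rank(), dist.get_world_size(), local_rank
```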
-------------------------------------------------------------------------------- /scripts/dist_run_single.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | python ./scripts/launch.py \ 4 | --nproc_per_node "$1" \ 5 | "$2" --cfg "$3" --model-dir "$4" 6 | -------------------------------------------------------------------------------- /scripts/dist_run_slurm.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | PARTITION=$1 6 | JOB_NAME=$2 7 | RUN_SCRIPT=$3 8 | CONFIG=$4 9 | WORK_DIR=$5 10 | GPUS=${6:-8} 11 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 12 | CPUS_PER_TASK=${CPUS_PER_TASK:-5} 13 | SRUN_ARGS=${SRUN_ARGS:-""} 14 | PY_ARGS=${PY_ARGS:-""} 15 | 16 | srun -p ${PARTITION} \ 17 | --job-name=${JOB_NAME} \ 18 | --gres=gpu:${GPUS_PER_NODE} \ 19 | --ntasks=${GPUS} \ 20 | --ntasks-per-node=${GPUS_PER_NODE} \ 21 | --cpus-per-task=${CPUS_PER_TASK} \ 22 | --kill-on-bad-exit=1 \ 23 | ${SRUN_ARGS} \ 24 | python -u ${RUN_SCRIPT} \ 25 | --cfg ${CONFIG} \ 26 | --model-dir ${WORK_DIR} \ 27 | --slurm --dist ${PY_ARGS} 28 | -------------------------------------------------------------------------------- /scripts/init.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | cd ./common/lib/roi_pooling/ 4 | python setup.py build_ext --inplace 5 | cd ../../../ 6 | 7 | #cd ./refcoco/data/datasets/refer/ 8 | #make 9 | #cd ../../../../ 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /scripts/init_slurm.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | PARTITION=$1 6 | JOB_NAME=$2 7 | GPUS=${3:-1} 8 | GPUS_PER_NODE=${GPUS_PER_NODE:-1} 9 | CPUS_PER_TASK=${CPUS_PER_TASK:-5} 10 | SRUN_ARGS=${SRUN_ARGS:-""} 11 | 12 | srun -p ${PARTITION} \ 13 | --job-name=${JOB_NAME} \ 14 | --gres=gpu:${GPUS_PER_NODE} \ 15 | --ntasks=${GPUS} \ 16 | --ntasks-per-node=${GPUS_PER_NODE} \ 17 | --cpus-per-task=${CPUS_PER_TASK} \ 18 | --kill-on-bad-exit=1 \ 19 | ${SRUN_ARGS} \ 20 | ./scripts/init.sh 21 | 22 | 23 | -------------------------------------------------------------------------------- /scripts/nondist_run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | python "$1" --cfg "$2" --model-dir "$3" 4 | 5 | -------------------------------------------------------------------------------- /scripts/nondist_run_slurm.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | PARTITION=$1 6 | JOB_NAME=$2 7 | RUN_SCRIPT=$3 8 | CONFIG=$4 9 | WORK_DIR=$5 10 | GPUS=${6:-8} 11 | CPUS_PER_TASK=${CPUS_PER_TASK:-40} 12 | SRUN_ARGS=${SRUN_ARGS:-""} 13 | PY_ARGS=${PY_ARGS:-""} 14 | 15 | srun -p ${PARTITION} \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS} \ 18 | --ntasks=1 \ 19 | --ntasks-per-node=1 \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | --kill-on-bad-exit=1 \ 22 | ${SRUN_ARGS} \ 23 | python -u ${RUN_SCRIPT} \ 24 | --cfg ${CONFIG} \ 25 | --model-dir ${WORK_DIR} \ 26 | ${PY_ARGS} 27 | -------------------------------------------------------------------------------- /vcr/_init_paths.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | this_dir = os.path.abspath(os.path.dirname(__file__)) 5 | 6 | 7 | def 
add_path(path): 8 | if path not in sys.path: 9 | sys.path.insert(0, path) 10 | 11 | 12 | root_path = os.path.join(this_dir, '../') 13 | add_path(root_path) 14 | -------------------------------------------------------------------------------- /vcr/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackroos/VL-BERT/4373674cbf2bcd6c09a2c26abfdb6705b870e3be/vcr/data/__init__.py -------------------------------------------------------------------------------- /vcr/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .vcr import VCRDataset 2 | 3 | 4 | -------------------------------------------------------------------------------- /vcr/data/samplers/__init__.py: -------------------------------------------------------------------------------- 1 | from .distributed import DistributedSampler 2 | from .grouped_batch_sampler import GroupedBatchSampler 3 | 4 | -------------------------------------------------------------------------------- /vcr/data/samplers/distributed.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | # Code is copy-pasted exactly as in torch.utils.data.distributed. 3 | # FIXME remove this once c10d fixes the bug it has 4 | import math 5 | import torch 6 | import torch.distributed as dist 7 | from torch.utils.data.sampler import Sampler 8 | 9 | 10 | class DistributedSampler(Sampler): 11 | """Sampler that restricts data loading to a subset of the dataset. 12 | It is especially useful in conjunction with 13 | :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each 14 | process can pass a DistributedSampler instance as a DataLoader sampler, 15 | and load a subset of the original dataset that is exclusive to it. 16 | .. note:: 17 | Dataset is assumed to be of constant size. 18 | Arguments: 19 | dataset: Dataset used for sampling. 20 | num_replicas (optional): Number of processes participating in 21 | distributed training. 22 | rank (optional): Rank of the current process within num_replicas. 
23 | """ 24 | 25 | def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True): 26 | if num_replicas is None: 27 | if not dist.is_available(): 28 | raise RuntimeError("Requires distributed package to be available") 29 | num_replicas = dist.get_world_size() 30 | if rank is None: 31 | if not dist.is_available(): 32 | raise RuntimeError("Requires distributed package to be available") 33 | rank = dist.get_rank() 34 | self.dataset = dataset 35 | self.num_replicas = num_replicas 36 | self.rank = rank 37 | self.epoch = 0 38 | self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) 39 | self.total_size = self.num_samples * self.num_replicas 40 | self.shuffle = shuffle 41 | 42 | def __iter__(self): 43 | if self.shuffle: 44 | # deterministically shuffle based on epoch 45 | g = torch.Generator() 46 | g.manual_seed(self.epoch) 47 | indices = torch.randperm(len(self.dataset), generator=g).tolist() 48 | else: 49 | indices = torch.arange(len(self.dataset)).tolist() 50 | 51 | # add extra samples to make it evenly divisible 52 | indices += indices[: (self.total_size - len(indices))] 53 | assert len(indices) == self.total_size 54 | 55 | # subsample 56 | offset = self.num_samples * self.rank 57 | indices = indices[offset : offset + self.num_samples] 58 | assert len(indices) == self.num_samples 59 | 60 | return iter(indices) 61 | 62 | def __len__(self): 63 | return self.num_samples 64 | 65 | def set_epoch(self, epoch): 66 | self.epoch = epoch -------------------------------------------------------------------------------- /vcr/data/transforms/__init__.py: -------------------------------------------------------------------------------- 1 | from .transforms import Compose 2 | from .transforms import Resize 3 | from .transforms import RandomHorizontalFlip 4 | from .transforms import ToTensor 5 | from .transforms import Normalize 6 | 7 | from .build import build_transforms 8 | -------------------------------------------------------------------------------- /vcr/data/transforms/build.py: -------------------------------------------------------------------------------- 1 | from . 
import transforms as T 2 | 3 | 4 | def build_transforms(cfg, mode='train'): 5 | assert mode in ['train', 'test', 'val'] 6 | min_size = cfg.SCALES[0] 7 | max_size = cfg.SCALES[1] 8 | assert min_size <= max_size 9 | 10 | if mode == 'train': 11 | flip_prob = cfg.TRAIN.FLIP_PROB 12 | elif mode == 'test': 13 | flip_prob = cfg.TEST.FLIP_PROB 14 | else: 15 | flip_prob = cfg.VAL.FLIP_PROB 16 | 17 | to_bgr255 = True 18 | 19 | normalize_transform = T.Normalize( 20 | mean=cfg.NETWORK.PIXEL_MEANS, std=cfg.NETWORK.PIXEL_STDS, to_bgr255=to_bgr255 21 | ) 22 | 23 | transform = T.Compose( 24 | [ 25 | T.Resize(min_size, max_size), 26 | T.RandomHorizontalFlip(flip_prob), 27 | T.ToTensor(), 28 | normalize_transform, 29 | T.FixPadding(min_size, max_size, pad=0) 30 | ] 31 | ) 32 | return transform 33 | -------------------------------------------------------------------------------- /vcr/function/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackroos/VL-BERT/4373674cbf2bcd6c09a2c26abfdb6705b870e3be/vcr/function/__init__.py -------------------------------------------------------------------------------- /vcr/function/val.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | import torch 3 | from common.trainer import to_cuda 4 | 5 | 6 | @torch.no_grad() 7 | def do_validation(net, val_loader, metrics, label_index_in_batch): 8 | net.eval() 9 | metrics.reset() 10 | for nbatch, batch in enumerate(val_loader): 11 | batch = to_cuda(batch) 12 | label = batch[label_index_in_batch] 13 | datas = [batch[i] for i in range(len(batch)) if i != label_index_in_batch % len(batch)] 14 | 15 | outputs = net(*datas) 16 | outputs.update({'label': label}) 17 | metrics.update(outputs) 18 | 19 | 20 | @torch.no_grad() 21 | def joint_validation(answer_net, rationale_net, answer_val_loader, rationale_val_loader, metrics, label_index_in_batch, 22 | show_progress=False): 23 | answer_net.eval() 24 | rationale_net.eval() 25 | metrics.reset() 26 | 27 | def step(a_batch, r_batch): 28 | a_batch = to_cuda(a_batch) 29 | a_label = a_batch[label_index_in_batch] 30 | a_datas = [a_batch[i] for i in range(len(a_batch)) if i != label_index_in_batch % len(a_batch)] 31 | r_batch = to_cuda(r_batch) 32 | r_label = r_batch[label_index_in_batch] 33 | r_datas = [r_batch[i] for i in range(len(r_batch)) if i != label_index_in_batch % len(r_batch)] 34 | 35 | a_outputs = answer_net(*a_datas) 36 | r_outputs = rationale_net(*r_datas) 37 | outputs = {'answer_' + k: v for k, v in a_outputs.items()} 38 | outputs.update({'rationale_' + k: v for k, v in r_outputs.items()}) 39 | outputs.update({'answer_label': a_label, 40 | 'rationale_label': r_label}) 41 | metrics.update(outputs) 42 | 43 | if show_progress: 44 | from tqdm import tqdm 45 | for a_batch, r_batch in tqdm(zip(answer_val_loader, rationale_val_loader)): 46 | step(a_batch, r_batch) 47 | else: 48 | for a_batch, r_batch in zip(answer_val_loader, rationale_val_loader): 49 | step(a_batch, r_batch) 50 | -------------------------------------------------------------------------------- /vcr/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .resnet_vlbert_for_vcr import ResNetVLBERT 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /vcr/test.py: -------------------------------------------------------------------------------- 1 | import _init_paths 2 | import os 3 
| import argparse 4 | from copy import deepcopy 5 | 6 | from vcr.function.config import config, update_config 7 | from vcr.function.test import test_net, merge_result 8 | 9 | 10 | def parse_args(): 11 | parser = argparse.ArgumentParser('Get Jointly Test Result of Cognition Network') 12 | parser.add_argument('--a-cfg', type=str, help='path to answer net config yaml') 13 | parser.add_argument('--r-cfg', type=str, help='path to rationale net config yaml') 14 | parser.add_argument('--a-ckpt', type=str, help='path to checkpoint of answer net') 15 | parser.add_argument('--r-ckpt', type=str, help='path to checkpoint of rationale net') 16 | parser.add_argument('--a-bs', type=int) 17 | parser.add_argument('--r-bs', type=int) 18 | parser.add_argument('--gpus', type=int, nargs='+', default=[0]) 19 | parser.add_argument('--test-file', type=str) 20 | parser.add_argument('--result-path', type=str, help='path to store test result csv file.', default='./test_result') 21 | parser.add_argument('--result-name', type=str) 22 | parser.add_argument('--fp16', default=False, action='store_true') 23 | parser.add_argument('--use-cache', default=False, action='store_true') 24 | 25 | args = parser.parse_args() 26 | a_config = r_config = None 27 | reset_config = deepcopy(config) 28 | if args.a_cfg is not None: 29 | a_config = config 30 | if reset_config is not None: 31 | a_config.update(deepcopy(reset_config)) 32 | if args.a_cfg is not None: 33 | update_config(args.a_cfg) 34 | a_config = deepcopy(a_config) 35 | if args.r_cfg is not None: 36 | r_config = config 37 | if reset_config is not None: 38 | r_config.update(deepcopy(reset_config)) 39 | if args.r_cfg is not None: 40 | update_config(args.r_cfg) 41 | r_config = deepcopy(r_config) 42 | if args.a_bs is not None: 43 | a_config.TEST.BATCH_IMAGES = args.a_bs 44 | if args.r_bs is not None: 45 | r_config.TEST.BATCH_IMAGES = args.r_bs 46 | 47 | if args.test_file is not None: 48 | a_config.DATASET.TEST_ANNOTATION_FILE = args.test_file 49 | r_config.DATASET.TEST_ANNOTATION_FILE = args.test_file 50 | 51 | return args, a_config, r_config 52 | 53 | 54 | def main(): 55 | args, a_config, r_config = parse_args() 56 | 57 | if args.a_ckpt: 58 | a_config.DATASET.TASK = 'Q2A' 59 | a_config.GPUS = ','.join([str(k) for k in args.gpus]) 60 | a_result_csv = test_net(args, 61 | a_config, 62 | ckpt_path=args.a_ckpt, 63 | save_path=args.result_path, 64 | save_name=args.result_name) 65 | if args.r_ckpt: 66 | r_config.DATASET.TASK = 'QA2R' 67 | r_config.GPUS = ','.join([str(k) for k in args.gpus]) 68 | r_result_csv = test_net(args, 69 | r_config, 70 | ckpt_path=args.r_ckpt, 71 | save_path=args.result_path, 72 | save_name=args.result_name) 73 | if args.a_ckpt and args.r_ckpt: 74 | merge_result(a_result_csv, r_result_csv, 75 | os.path.join(args.result_path, '{}_test_result_Q2AR.csv'.format(args.result_name))) 76 | 77 | 78 | if __name__ == '__main__': 79 | main() 80 | 81 | 82 | -------------------------------------------------------------------------------- /vcr/train_end2end.py: -------------------------------------------------------------------------------- 1 | import _init_paths 2 | import os 3 | import argparse 4 | import torch 5 | import subprocess 6 | 7 | from vcr.function.config import config, update_config 8 | from vcr.function.train import train_net 9 | from vcr.function.test import test_net 10 | 11 | def parse_args(): 12 | parser = argparse.ArgumentParser('Train Cognition Network') 13 | parser.add_argument('--cfg', type=str, help='path to config file') 14 | 
parser.add_argument('--model-dir', type=str, help='root path to store checkpoint') 15 | parser.add_argument('--log-dir', type=str, help='tensorboard log dir') 16 | parser.add_argument('--dist', help='whether to use distributed training', default=False, action='store_true') 17 | parser.add_argument('--slurm', help='whether this is a slurm job', default=False, action='store_true') 18 | parser.add_argument('--do-test', help='whether to generate csv result on test set', 19 | default=False, action='store_true') 20 | parser.add_argument('--cudnn-off', help='disable cudnn', default=False, action='store_true') 21 | 22 | # easy test pretrain model 23 | parser.add_argument('--partial-pretrain', type=str) 24 | 25 | args = parser.parse_args() 26 | 27 | if args.cfg is not None: 28 | update_config(args.cfg) 29 | if args.model_dir is not None: 30 | config.OUTPUT_PATH = os.path.join(args.model_dir, config.OUTPUT_PATH) 31 | 32 | if args.partial_pretrain is not None: 33 | config.NETWORK.PARTIAL_PRETRAIN = args.partial_pretrain 34 | 35 | if args.slurm: 36 | proc_id = int(os.environ['SLURM_PROCID']) 37 | ntasks = int(os.environ['SLURM_NTASKS']) 38 | node_list = os.environ['SLURM_NODELIST'] 39 | num_gpus = torch.cuda.device_count() 40 | addr = subprocess.getoutput( 41 | 'scontrol show hostname {} | head -n1'.format(node_list)) 42 | os.environ['MASTER_PORT'] = str(29500) 43 | os.environ['MASTER_ADDR'] = addr 44 | os.environ['WORLD_SIZE'] = str(ntasks) 45 | os.environ['RANK'] = str(proc_id) 46 | os.environ['LOCAL_RANK'] = str(proc_id % num_gpus) 47 | 48 | return args, config 49 | 50 | 51 | def main(): 52 | args, config = parse_args() 53 | rank, model = train_net(args, config) 54 | if args.do_test and (rank is None or rank == 0): 55 | test_net(args, config) 56 | 57 | 58 | if __name__ == '__main__': 59 | main() 60 | 61 | 62 | -------------------------------------------------------------------------------- /viz/VISUALIZATION.md: -------------------------------------------------------------------------------- 1 | # Visualization 2 | 3 | The code is based on [bertviz](https://github.com/jessevig/bertviz), a nice tool for BERT visualization. 4 | 5 | ## Prepare 6 | 7 | * Change work directory to this directory. 8 | 9 | ```bash 10 | cd ./viz 11 | ``` 12 | 13 | * Create a soft link to the data folder (If you are working on Windows, please modify the data path in the jupyter notebook by yourself). 14 | 15 | ```bash 16 | ln -s ../data ./ 17 | ``` 18 | 19 | * Download and unzip COCO val2017: [images](http://images.cocodataset.org/zips/val2017.zip), [annotations](http://images.cocodataset.org/annotations/annotations_trainval2017.zip), place them under ```./data/coco```. 20 | 21 | * (Optional) Download pre-trained models as described in [PREPARE_PRETRAINED_MODELS.md](../model/pretrained_model/PREPARE_PRETRAINED_MODELS.md), if you want to precompute all attention maps by yourself. 22 | 23 | ## Pre-compute attention maps 24 | * Pre-computing all attention maps on COCO val2017: 25 | 26 | ```bash 27 | python pretrain/vis_attention_maps.py --cfg cfgs/pretrain/vis_attention_maps_coco.yaml --save-dir ./vl-bert_viz 28 | ``` 29 | * We provide 100 pre-computed attention maps of vl-bert-base-e2e on COCO val2017: [GoogleDrive](https://drive.google.com/file/d/1TFfqArX3lwOPQ8EklZ6px5-gvOvoGdTr/view?usp=sharing) [BaiduPan](https://pan.baidu.com/s/1l0T5vAuklQTrAmD3wbJ7uQ), please download and unzip it into ```./data```. 
30 | 31 | ## Visualization on Jupyter Notebook 32 | * Open Jupyter Notebook in this directory and select ```model_view_vl-bert_coco.ipynb```. 33 | ```bash 34 | jupyter notebook 35 | ``` 36 | 37 | * Run all cells in the notebook in order. 38 | 39 | * Browse attention maps in the last cell, you can change the image id to visualize other examples. 40 | 41 | 42 | -------------------------------------------------------------------------------- /viz/_init_paths.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | this_dir = os.path.abspath(os.path.dirname(__file__)) 5 | 6 | 7 | def add_path(path): 8 | if path not in sys.path: 9 | sys.path.insert(0, path) 10 | 11 | 12 | root_path = os.path.join(this_dir, '../') 13 | add_path(root_path) 14 | -------------------------------------------------------------------------------- /viz/bertviz/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackroos/VL-BERT/4373674cbf2bcd6c09a2c26abfdb6705b870e3be/viz/bertviz/__init__.py -------------------------------------------------------------------------------- /viz/bertviz/model_view.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Tensor2Tensor Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | # Change log 17 | # 12/12/18 Jesse Vig Adapted to BERT model 18 | # 12/19/18 Jesse Vig Assorted cleanup. Changed orientation of attention matrices. Updated comments. 19 | 20 | 21 | """Module for postprocessing and displaying transformer attentions. 22 | 23 | This module is designed to be called from an ipython notebook. 24 | """ 25 | 26 | import json 27 | from IPython.core.display import display, HTML, Javascript 28 | import os 29 | 30 | def show(model, model_type, tokenizer, sentence_a, sentence_b=None, attn_data=None): 31 | 32 | if sentence_b: 33 | vis_html = """ 34 | 35 | Attention: 42 | 43 |
44 | """ 45 | else: 46 | vis_html = """ 47 | <div id='vis'></div>
48 | """ 49 | 50 | display(HTML(vis_html)) 51 | __location__ = os.path.realpath( 52 | os.path.join(os.getcwd(), os.path.dirname(__file__))) 53 | vis_js = open(os.path.join(__location__, 'model_view.js')).read() 54 | if attn_data is None: 55 | from bertviz.attention import get_attention 56 | attn_data = get_attention(model, model_type, tokenizer, sentence_a, sentence_b) 57 | params = { 58 | 'attention': attn_data, 59 | 'default_filter': "ab" 60 | } 61 | display(Javascript('window.params = %s' % json.dumps(params))) 62 | display(Javascript(vis_js)) 63 | 64 | -------------------------------------------------------------------------------- /vqa/_init_paths.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | this_dir = os.path.abspath(os.path.dirname(__file__)) 5 | 6 | 7 | def add_path(path): 8 | if path not in sys.path: 9 | sys.path.insert(0, path) 10 | 11 | 12 | root_path = os.path.join(this_dir, '../') 13 | add_path(root_path) 14 | -------------------------------------------------------------------------------- /vqa/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackroos/VL-BERT/4373674cbf2bcd6c09a2c26abfdb6705b870e3be/vqa/data/__init__.py -------------------------------------------------------------------------------- /vqa/data/collate_batch.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from common.utils.clip_pad import * 3 | 4 | 5 | class BatchCollator(object): 6 | def __init__(self, dataset, append_ind=False): 7 | self.dataset = dataset 8 | self.test_mode = self.dataset.test_mode 9 | self.data_names = self.dataset.data_names 10 | self.append_ind = append_ind 11 | 12 | def __call__(self, batch): 13 | if not isinstance(batch, list): 14 | batch = list(batch) 15 | 16 | if batch[0][self.data_names.index('image')] is not None: 17 | max_shape = tuple(max(s) for s in zip(*[data[self.data_names.index('image')].shape for data in batch])) 18 | image_none = False 19 | else: 20 | image_none = True 21 | max_boxes = max([data[self.data_names.index('boxes')].shape[0] for data in batch]) 22 | max_question_length = max([len(data[self.data_names.index('question')]) for data in batch]) 23 | 24 | for i, ibatch in enumerate(batch): 25 | out = {} 26 | 27 | if image_none: 28 | out['image'] = None 29 | else: 30 | image = ibatch[self.data_names.index('image')] 31 | out['image'] = clip_pad_images(image, max_shape, pad=0) 32 | 33 | boxes = ibatch[self.data_names.index('boxes')] 34 | out['boxes'] = clip_pad_boxes(boxes, max_boxes, pad=-2) 35 | 36 | question = ibatch[self.data_names.index('question')] 37 | out['question'] = clip_pad_1d(question, max_question_length, pad=0) 38 | 39 | other_names = [data_name for data_name in self.data_names if data_name not in out] 40 | for name in other_names: 41 | out[name] = torch.as_tensor(ibatch[self.data_names.index(name)]) 42 | 43 | batch[i] = tuple(out[data_name] for data_name in self.data_names) 44 | if self.append_ind: 45 | batch[i] += (torch.tensor(i, dtype=torch.int64),) 46 | 47 | out_tuple = () 48 | for items in zip(*batch): 49 | if items[0] is None: 50 | out_tuple += (None,) 51 | else: 52 | out_tuple += (torch.stack(tuple(items), dim=0), ) 53 | 54 | return out_tuple 55 | 56 | -------------------------------------------------------------------------------- /vqa/data/datasets/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .vqa import VQA 2 | 3 | -------------------------------------------------------------------------------- /vqa/data/samplers/__init__.py: -------------------------------------------------------------------------------- 1 | from .distributed import DistributedSampler 2 | from .grouped_batch_sampler import GroupedBatchSampler 3 | 4 | -------------------------------------------------------------------------------- /vqa/data/samplers/distributed.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | # Code is copy-pasted exactly as in torch.utils.data.distributed. 3 | # FIXME remove this once c10d fixes the bug it has 4 | import math 5 | import torch 6 | import torch.distributed as dist 7 | from torch.utils.data.sampler import Sampler 8 | 9 | 10 | class DistributedSampler(Sampler): 11 | """Sampler that restricts data loading to a subset of the dataset. 12 | It is especially useful in conjunction with 13 | :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each 14 | process can pass a DistributedSampler instance as a DataLoader sampler, 15 | and load a subset of the original dataset that is exclusive to it. 16 | .. note:: 17 | Dataset is assumed to be of constant size. 18 | Arguments: 19 | dataset: Dataset used for sampling. 20 | num_replicas (optional): Number of processes participating in 21 | distributed training. 22 | rank (optional): Rank of the current process within num_replicas. 23 | """ 24 | 25 | def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True): 26 | if num_replicas is None: 27 | if not dist.is_available(): 28 | raise RuntimeError("Requires distributed package to be available") 29 | num_replicas = dist.get_world_size() 30 | if rank is None: 31 | if not dist.is_available(): 32 | raise RuntimeError("Requires distributed package to be available") 33 | rank = dist.get_rank() 34 | self.dataset = dataset 35 | self.num_replicas = num_replicas 36 | self.rank = rank 37 | self.epoch = 0 38 | self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) 39 | self.total_size = self.num_samples * self.num_replicas 40 | self.shuffle = shuffle 41 | 42 | def __iter__(self): 43 | if self.shuffle: 44 | # deterministically shuffle based on epoch 45 | g = torch.Generator() 46 | g.manual_seed(self.epoch) 47 | indices = torch.randperm(len(self.dataset), generator=g).tolist() 48 | else: 49 | indices = torch.arange(len(self.dataset)).tolist() 50 | 51 | # add extra samples to make it evenly divisible 52 | indices += indices[: (self.total_size - len(indices))] 53 | assert len(indices) == self.total_size 54 | 55 | # subsample 56 | offset = self.num_samples * self.rank 57 | indices = indices[offset : offset + self.num_samples] 58 | assert len(indices) == self.num_samples 59 | 60 | return iter(indices) 61 | 62 | def __len__(self): 63 | return self.num_samples 64 | 65 | def set_epoch(self, epoch): 66 | self.epoch = epoch -------------------------------------------------------------------------------- /vqa/data/transforms/__init__.py: -------------------------------------------------------------------------------- 1 | from .transforms import Compose 2 | from .transforms import Resize 3 | from .transforms import RandomHorizontalFlip 4 | from .transforms import ToTensor 5 | from .transforms import Normalize 6 | 7 | from .build import build_transforms 8 | 
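The VQA data pipeline mirrors the refcoco and vcr ones: `BatchCollator` pads every field in a batch to the largest size present, `DistributedSampler` gives each process a disjoint shard of the dataset, and `build_transforms` (next file) assembles the image preprocessing. A minimal sketch of how these pieces are typically wired into a `DataLoader` follows; `make_train_loader` is a hypothetical helper, and the repo's real construction (in `vqa/data/build.py`) may differ.

```python
# Illustrative wiring only -- make_train_loader is a hypothetical helper; the
# repo's actual dataloader construction lives in vqa/data/build.py.
from torch.utils.data import DataLoader

from vqa.data.collate_batch import BatchCollator
from vqa.data.samplers import DistributedSampler


def make_train_loader(dataset, batch_size, distributed=False):
    # `dataset` is assumed to be an already-built VQA dataset exposing the
    # `data_names` and `test_mode` attributes that BatchCollator relies on.
    sampler = DistributedSampler(dataset, shuffle=True) if distributed else None
    loader = DataLoader(dataset,
                        batch_size=batch_size,
                        shuffle=(sampler is None),
                        sampler=sampler,
                        collate_fn=BatchCollator(dataset, append_ind=False))
    return loader, sampler
```

When training is distributed, `sampler.set_epoch(epoch)` should be called at the start of each epoch so that every process reshuffles with the same seed and the per-process shards stay disjoint.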
-------------------------------------------------------------------------------- /vqa/data/transforms/build.py: -------------------------------------------------------------------------------- 1 | from . import transforms as T 2 | 3 | 4 | def build_transforms(cfg, mode='train'): 5 | assert mode in ['train', 'test', 'val'] 6 | min_size = cfg.SCALES[0] 7 | max_size = cfg.SCALES[1] 8 | assert min_size <= max_size 9 | 10 | if mode == 'train': 11 | flip_prob = cfg.TRAIN.FLIP_PROB 12 | elif mode == 'test': 13 | flip_prob = cfg.TEST.FLIP_PROB 14 | else: 15 | flip_prob = cfg.VAL.FLIP_PROB 16 | 17 | to_bgr255 = True 18 | 19 | normalize_transform = T.Normalize( 20 | mean=cfg.NETWORK.PIXEL_MEANS, std=cfg.NETWORK.PIXEL_STDS, to_bgr255=to_bgr255 21 | ) 22 | 23 | # transform = T.Compose( 24 | # [ 25 | # T.Resize(min_size, max_size), 26 | # T.RandomHorizontalFlip(flip_prob), 27 | # T.ToTensor(), 28 | # normalize_transform, 29 | # T.FixPadding(min_size, max_size, pad=0) 30 | # ] 31 | # ) 32 | 33 | transform = T.Compose( 34 | [ 35 | T.Resize(min_size, max_size), 36 | T.RandomHorizontalFlip(flip_prob), 37 | T.ToTensor(), 38 | normalize_transform, 39 | ] 40 | ) 41 | 42 | return transform 43 | -------------------------------------------------------------------------------- /vqa/function/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackroos/VL-BERT/4373674cbf2bcd6c09a2c26abfdb6705b870e3be/vqa/function/__init__.py -------------------------------------------------------------------------------- /vqa/function/val.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | import torch 3 | from common.trainer import to_cuda 4 | 5 | 6 | @torch.no_grad() 7 | def do_validation(net, val_loader, metrics, label_index_in_batch): 8 | net.eval() 9 | metrics.reset() 10 | for nbatch, batch in enumerate(val_loader): 11 | batch = to_cuda(batch) 12 | label = batch[label_index_in_batch] 13 | datas = [batch[i] for i in range(len(batch)) if i != label_index_in_batch % len(batch)] 14 | 15 | outputs = net(*datas) 16 | outputs.update({'label': label}) 17 | metrics.update(outputs) 18 | 19 | -------------------------------------------------------------------------------- /vqa/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .resnet_vlbert_for_vqa import ResNetVLBERT 2 | 3 | 4 | -------------------------------------------------------------------------------- /vqa/test.py: -------------------------------------------------------------------------------- 1 | import _init_paths 2 | import os 3 | import argparse 4 | from copy import deepcopy 5 | 6 | from vqa.function.config import config, update_config 7 | from vqa.function.test import test_net 8 | 9 | 10 | def parse_args(): 11 | parser = argparse.ArgumentParser('Get Test Result of VQA Network') 12 | parser.add_argument('--cfg', type=str, help='path to answer net config yaml') 13 | parser.add_argument('--ckpt', type=str, help='path to checkpoint of answer net') 14 | parser.add_argument('--bs', type=int) 15 | parser.add_argument('--gpus', type=int, nargs='+') 16 | parser.add_argument('--model-dir', type=str, help='root path to store checkpoint') 17 | parser.add_argument('--result-path', type=str, help='path to store test result file.') 18 | parser.add_argument('--result-name', type=str) 19 | parser.add_argument('--split', default='test2015') 20 | 21 | args = parser.parse_args() 22 | 
23 | if args.cfg is not None: 24 | update_config(args.cfg) 25 | if args.bs is not None: 26 | config.TEST.BATCH_IMAGES = args.bs 27 | if args.gpus is not None: 28 | config.GPUS = ','.join([str(gpu) for gpu in args.gpus]) 29 | if args.split is not None: 30 | config.DATASET.TEST_IMAGE_SET = args.split 31 | if args.model_dir is not None: 32 | config.OUTPUT_PATH = os.path.join(args.model_dir, config.OUTPUT_PATH) 33 | 34 | return args, config 35 | 36 | 37 | def main(): 38 | args, config = parse_args() 39 | 40 | result_json_path = test_net(args, config, 41 | ckpt_path=args.ckpt, save_path=args.result_path, save_name=args.result_name) 42 | 43 | 44 | if __name__ == '__main__': 45 | main() 46 | -------------------------------------------------------------------------------- /vqa/train_end2end.py: -------------------------------------------------------------------------------- 1 | import _init_paths 2 | import os 3 | import argparse 4 | import torch 5 | import subprocess 6 | 7 | from vqa.function.config import config, update_config 8 | from vqa.function.train import train_net 9 | from vqa.function.test import test_net 10 | 11 | 12 | def parse_args(): 13 | parser = argparse.ArgumentParser('Train Cognition Network') 14 | parser.add_argument('--cfg', type=str, help='path to config file') 15 | parser.add_argument('--model-dir', type=str, help='root path to store checkpoint') 16 | parser.add_argument('--log-dir', type=str, help='tensorboard log dir') 17 | parser.add_argument('--dist', help='whether to use distributed training', default=False, action='store_true') 18 | parser.add_argument('--slurm', help='whether this is a slurm job', default=False, action='store_true') 19 | parser.add_argument('--do-test', help='whether to generate csv result on test set', 20 | default=False, action='store_true') 21 | parser.add_argument('--cudnn-off', help='disable cudnn', default=False, action='store_true') 22 | 23 | # easy test pretrain model 24 | parser.add_argument('--partial-pretrain', type=str) 25 | 26 | args = parser.parse_args() 27 | 28 | if args.cfg is not None: 29 | update_config(args.cfg) 30 | if args.model_dir is not None: 31 | config.OUTPUT_PATH = os.path.join(args.model_dir, config.OUTPUT_PATH) 32 | 33 | if args.partial_pretrain is not None: 34 | config.NETWORK.PARTIAL_PRETRAIN = args.partial_pretrain 35 | 36 | if args.slurm: 37 | proc_id = int(os.environ['SLURM_PROCID']) 38 | ntasks = int(os.environ['SLURM_NTASKS']) 39 | node_list = os.environ['SLURM_NODELIST'] 40 | num_gpus = torch.cuda.device_count() 41 | addr = subprocess.getoutput( 42 | 'scontrol show hostname {} | head -n1'.format(node_list)) 43 | os.environ['MASTER_PORT'] = str(29500) 44 | os.environ['MASTER_ADDR'] = addr 45 | os.environ['WORLD_SIZE'] = str(ntasks) 46 | os.environ['RANK'] = str(proc_id) 47 | os.environ['LOCAL_RANK'] = str(proc_id % num_gpus) 48 | 49 | return args, config 50 | 51 | 52 | def main(): 53 | args, config = parse_args() 54 | rank, model = train_net(args, config) 55 | if args.do_test and (rank is None or rank == 0): 56 | test_net(args, config) 57 | 58 | 59 | if __name__ == '__main__': 60 | main() 61 | 62 | 63 | --------------------------------------------------------------------------------
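A final note on the shared `do_validation` helpers (refcoco, vcr and vqa all use the same pattern): `label_index_in_batch` may be negative, and the `i != label_index_in_batch % len(batch)` test is what lets a value such as -1 exclude exactly the label tensor from the inputs forwarded to the network. A tiny stand-alone illustration, with placeholder field names rather than the datasets' real `data_names`:

```python
# Stand-in illustration of the indexing convention in do_validation; the field
# names are placeholders, not the datasets' actual data_names.
batch = ['image', 'boxes', 'question', 'label']   # pretend these are tensors
label_index_in_batch = -1

label = batch[label_index_in_batch]                    # -> 'label'
datas = [batch[i] for i in range(len(batch))
         if i != label_index_in_batch % len(batch)]    # -1 % 4 == 3, so index 3 is skipped

assert label == 'label'
assert datas == ['image', 'boxes', 'question']
```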