├── LICENSE ├── README.md ├── cfgs ├── pretrain │ ├── base_e2e_16x16G_fp16.yaml │ ├── base_prec_4x16G_fp32.yaml │ ├── base_prec_withouttextonly_4x16G_fp32.yaml │ ├── large_e2e_16x16G_fp16.yaml │ ├── large_prec_4x16G_fp16.yaml │ └── vis_attention_maps_coco.yaml ├── refcoco │ ├── base_detected_regions_4x16G.yaml │ ├── base_gt_boxes_4x16G.yaml │ ├── large_detected_regions_4x16G.yaml │ └── large_gt_boxes_4x16G.yaml ├── vcr │ ├── base_q2a_4x16G_fp32.yaml │ ├── base_qa2r_4x16G_fp32.yaml │ ├── large_q2a_16x16G_fp16.yaml │ ├── large_q2a_4x16G_fp16.yaml │ ├── large_qa2r_16x16G_fp16.yaml │ └── large_qa2r_4x16G_fp16.yaml └── vqa │ ├── base_4x16G_fp32.yaml │ └── large_4x16G_fp32.yaml ├── common ├── __init__.py ├── backbone │ ├── __init__.py │ └── resnet │ │ ├── __init__.py │ │ └── resnet.py ├── callbacks │ ├── batch_end_callbacks │ │ ├── __init__.py │ │ └── speedometer.py │ └── epoch_end_callbacks │ │ ├── __init__.py │ │ ├── checkpoint.py │ │ └── validation_monitor.py ├── fast_rcnn.py ├── lib │ └── roi_pooling │ │ ├── ROIAlign.h │ │ ├── ROIPool.h │ │ ├── __init__.py │ │ ├── cpu │ │ ├── ROIAlign_cpu.cpp │ │ └── vision.h │ │ ├── cuda │ │ ├── ROIAlign_cuda.cu │ │ ├── ROIPool_cuda.cu │ │ └── vision.h │ │ ├── debug.py │ │ ├── roi_align.py │ │ ├── roi_pool.py │ │ ├── setup.py │ │ └── vision.cpp ├── lr_scheduler.py ├── metrics │ ├── __init__.py │ ├── composite_eval_metric.py │ ├── eval_metric.py │ ├── pretrain_metrics.py │ ├── refcoco_metrics.py │ ├── vcr_metrics.py │ └── vqa_metrics.py ├── module.py ├── nlp │ ├── __init__.py │ ├── bert │ │ ├── __init__.py │ │ └── optimization.py │ ├── bert_encoder_wrapper.py │ ├── encoder_base.py │ ├── input_variational_dropout.py │ ├── misc.py │ ├── roberta │ │ ├── __init__.py │ │ ├── modeling_roberta.py │ │ ├── tokenization_roberta.py │ │ └── utils.py │ └── time_distributed.py ├── trainer.py ├── utils │ ├── __init__.py │ ├── bbox.py │ ├── clip_pad.py │ ├── create_logger.py │ ├── flatten.py │ ├── load.py │ ├── mask.py │ ├── masked_softmax.py │ ├── misc.py │ ├── multi_task_dataloader.py │ ├── pad_sequence.py │ └── zipreader.py └── visual_linguistic_bert.py ├── data ├── PREPARE_DATA.md └── conceptual-captions │ ├── ReadMe.txt │ └── utils │ ├── check_valid.py │ ├── download_train.sh │ ├── download_val.sh │ ├── gen_train4download.py │ ├── gen_train_image_json.py │ ├── gen_val4download.py │ └── gen_val_image_json.py ├── external └── pytorch_pretrained_bert │ ├── __init__.py │ ├── __main__.py │ ├── convert_gpt2_checkpoint_to_pytorch.py │ ├── convert_openai_checkpoint_to_pytorch.py │ ├── convert_tf_checkpoint_to_pytorch.py │ ├── convert_transfo_xl_checkpoint_to_pytorch.py │ ├── file_utils.py │ ├── modeling.py │ ├── modeling_gpt2.py │ ├── modeling_openai.py │ ├── modeling_transfo_xl.py │ ├── modeling_transfo_xl_utilities.py │ ├── optimization.py │ ├── optimization_openai.py │ ├── tokenization.py │ ├── tokenization_gpt2.py │ ├── tokenization_openai.py │ └── tokenization_transfo_xl.py ├── figs ├── attention_viz.png └── pretrain.png ├── model └── pretrained_model │ └── PREPARE_PRETRAINED_MODELS.md ├── pretrain ├── _init_paths.py ├── data │ ├── __init__.py │ ├── build.py │ ├── collate_batch.py │ ├── datasets │ │ ├── __init__.py │ │ ├── coco_captions.py │ │ ├── conceptual_captions.py │ │ └── general_corpus.py │ ├── samplers │ │ ├── __init__.py │ │ ├── distributed.py │ │ └── grouped_batch_sampler.py │ └── transforms │ │ ├── __init__.py │ │ ├── build.py │ │ └── transforms.py ├── function │ ├── __init__.py │ ├── config.py │ ├── train.py │ ├── val.py │ └── vis.py ├── modules │ ├── 
__init__.py │ ├── resnet_vlbert_for_attention_vis.py │ ├── resnet_vlbert_for_pretraining.py │ └── resnet_vlbert_for_pretraining_multitask.py ├── train_end2end.py └── vis_attention_maps.py ├── refcoco ├── _init_paths.py ├── data │ ├── __init__.py │ ├── build.py │ ├── collate_batch.py │ ├── datasets │ │ ├── __init__.py │ │ ├── refcoco.py │ │ └── refer │ │ │ ├── Makefile │ │ │ ├── __init__.py │ │ │ ├── external │ │ │ ├── .gitignore │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── _mask.pyx │ │ │ ├── mask.py │ │ │ ├── maskApi.c │ │ │ └── maskApi.h │ │ │ ├── refer.py │ │ │ └── setup.py │ ├── samplers │ │ ├── __init__.py │ │ ├── distributed.py │ │ └── grouped_batch_sampler.py │ └── transforms │ │ ├── __init__.py │ │ ├── build.py │ │ └── transforms.py ├── function │ ├── __init__.py │ ├── config.py │ ├── test.py │ ├── train.py │ └── val.py ├── modules │ ├── __init__.py │ └── resnet_vlbert_for_refcoco.py ├── test.py └── train_end2end.py ├── requirements.txt ├── scripts ├── dist_run_multi.sh ├── dist_run_single.sh ├── dist_run_slurm.sh ├── init.sh ├── init_slurm.sh ├── launch.py ├── nondist_run.sh └── nondist_run_slurm.sh ├── vcr ├── _init_paths.py ├── data │ ├── __init__.py │ ├── build.py │ ├── collate_batch.py │ ├── datasets │ │ ├── __init__.py │ │ └── vcr.py │ ├── samplers │ │ ├── __init__.py │ │ ├── distributed.py │ │ └── grouped_batch_sampler.py │ └── transforms │ │ ├── __init__.py │ │ ├── build.py │ │ └── transforms.py ├── function │ ├── __init__.py │ ├── config.py │ ├── test.py │ ├── train.py │ └── val.py ├── modules │ ├── __init__.py │ └── resnet_vlbert_for_vcr.py ├── test.py ├── train_end2end.py └── val.py ├── viz ├── VISUALIZATION.md ├── _init_paths.py ├── bertviz │ ├── __init__.py │ ├── attention.py │ ├── model_view.js │ └── model_view.py └── model_view_vl-bert_coco.ipynb └── vqa ├── _init_paths.py ├── data ├── __init__.py ├── build.py ├── collate_batch.py ├── datasets │ ├── __init__.py │ └── vqa.py ├── samplers │ ├── __init__.py │ ├── distributed.py │ └── grouped_batch_sampler.py └── transforms │ ├── __init__.py │ ├── build.py │ └── transforms.py ├── function ├── __init__.py ├── config.py ├── test.py ├── train.py └── val.py ├── modules ├── __init__.py └── resnet_vlbert_for_vqa.py ├── test.py └── train_end2end.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Weijie Su 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
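The YAML files under cfgs/ in the tree above configure every task in the repository: each one names the model MODULE, the DATASET section, the VLBERT architecture block, and the TRAIN/VAL/TEST settings reproduced below. As a quick orientation, here is a minimal sketch (not part of the repository) that loads one of these configs with PyYAML and builds a warmup schedule from its TRAIN block; the file path, the use of PyTorch's AdamW and LambdaLR, and the reading of LR_SCHEDULE 'triangle' as linear warmup followed by linear decay are assumptions made for illustration, not taken from the repo's own config or scheduler code.

import yaml   # PyYAML, assumed to be installed
import torch

# Example path; every cfgs/*/*.yaml file below shares the same top-level layout.
with open('./cfgs/vqa/base_4x16G_fp32.yaml') as f:
    cfg = yaml.safe_load(f)

train = cfg['TRAIN']
print(cfg['MODULE'], train['OPTIMIZER'], train['LR'], train['WARMUP_STEPS'])

# Assumed reading of LR_SCHEDULE: 'triangle' with WARMUP_METHOD: 'linear':
# ramp linearly up to the base LR over WARMUP_STEPS, then decay linearly to zero.
def triangle_lambda(step, warmup_steps, total_steps):
    if step < warmup_steps:
        return step / max(1, warmup_steps)
    return max(0.0, (total_steps - step) / max(1, total_steps - warmup_steps))

model = torch.nn.Linear(8, 8)  # stand-in for the real ResNetVLBERT network
optimizer = torch.optim.AdamW(model.parameters(), lr=train['LR'], weight_decay=train['WD'])
total_steps = 100000           # placeholder; the real value depends on dataset size and END_EPOCH
scheduler = torch.optim.lr_scheduler.LambdaLR(
    optimizer, lambda step: triangle_lambda(step, train['WARMUP_STEPS'], total_steps))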
22 | -------------------------------------------------------------------------------- /cfgs/pretrain/base_e2e_16x16G_fp16.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | RNG_SEED: 12345 3 | OUTPUT_PATH: './output/pretrain/vl-bert' 4 | MODULE: ResNetVLBERTForPretrainingMultitask 5 | GPUS: '0,1,2,3' 6 | LOG_FREQUENT: 100 7 | VAL_FREQUENT: 1 8 | CHECKPOINT_FREQUENT: 1 9 | MODEL_PREFIX: 'vl-bert_base_res101_pretrain_multitask' 10 | NUM_WORKERS_PER_GPU: 4 11 | SCALES: 12 | - 600 13 | - 1000 14 | 15 | DATASET: 16 | 17 | - DATASET: conceptual_captions 18 | APPEND_INDEX: false 19 | DATASET_PATH: './data/conceptual-captions' 20 | ROOT_PATH: './' 21 | TRAIN_IMAGE_SET: 'train' 22 | VAL_IMAGE_SET: 'val' 23 | TEST_IMAGE_SET: 'val' 24 | ADD_IMAGE_AS_A_BOX: true 25 | ZIP_MODE: false 26 | CACHE_MODE: false 27 | IGNORE_DB_CACHE: false 28 | MASK_SIZE: 14 29 | 30 | - DATASET: general_corpus 31 | TRAIN_ANNOTATION_FILE: './data/en_corpus/bc1g.doc+./data/en_corpus/wiki.doc' 32 | VAL_ANNOTATION_FILE: './data/en_corpus/bc1g.doc' 33 | TEST_ANNOTATION_FILE: './data/en_corpus/bc1g.doc' 34 | SEQ_LEN: 64 35 | MIN_SEQ_LEN: 64 36 | 37 | NETWORK: 38 | PARTIAL_PRETRAIN: "" 39 | PARTIAL_PRETRAIN_PREFIX_CHANGES: [] 40 | IMAGE_NUM_LAYERS: 101 41 | IMAGE_C5_DILATED: true 42 | IMAGE_STRIDE_IN_1x1: true 43 | PIXEL_MEANS: 44 | - 102.9801 45 | - 115.9465 46 | - 122.7717 47 | PIXEL_STDS: 48 | - 1.0 49 | - 1.0 50 | - 1.0 51 | IMAGE_FEAT_PRECOMPUTED: false 52 | IMAGE_PRETRAINED: './model/pretrained_model/resnet101-pt-vgbua' 53 | IMAGE_PRETRAINED_EPOCH: 0 54 | IMAGE_FROZEN_BACKBONE_STAGES: 55 | - 1 56 | - 2 57 | IMAGE_FROZEN_BN: true 58 | IMAGE_FINAL_DIM: 768 59 | IMAGE_SEMANTIC: false 60 | OUTPUT_CONV5: false 61 | BERT_MODEL_NAME: './model/pretrained_model/bert-base-uncased' 62 | BERT_PRETRAINED: '' 63 | BERT_PRETRAINED_EPOCH: 0 64 | BERT_FROZEN: false 65 | ENABLE_CNN_REG_LOSS: false 66 | MLM_LOSS_NORM_IN_BATCH_FIRST: false 67 | MVRC_LOSS_NORM_IN_BATCH_FIRST: false 68 | WITH_REL_LOSS: false 69 | WITH_MLM_LOSS: true 70 | WITH_MVRC_LOSS: true 71 | 72 | VLBERT: 73 | with_pooler: false 74 | input_transform_type: 1 75 | visual_size: 768 76 | hidden_size: 768 77 | num_hidden_layers: 12 78 | num_attention_heads: 12 79 | intermediate_size: 3072 80 | hidden_act: "gelu" 81 | hidden_dropout_prob: 0.1 82 | attention_probs_dropout_prob: 0.1 83 | max_position_embeddings: 512 84 | type_vocab_size: 3 85 | vocab_size: 30522 86 | initializer_range: 0.02 87 | visual_scale_text_init: 0.0 88 | visual_scale_object_init: 0.0 89 | visual_ln: true 90 | pos_embedding_frozen: false 91 | 92 | TRAIN: 93 | SHUFFLE: true 94 | FLIP_PROB: 0.5 95 | BATCH_IMAGES: 96 | - 8 97 | - 8 98 | ASPECT_GROUPING: false 99 | RESUME: false 100 | AUTO_RESUME: true 101 | BEGIN_EPOCH: 0 102 | END_EPOCH: 10 103 | OPTIMIZER: 'AdamW' 104 | CLIP_GRAD_NORM: 10 105 | LR: 1.0e-7 106 | LR_SCHEDULE: 'triangle' 107 | WD: 0.0001 108 | WARMUP: true 109 | WARMUP_METHOD: 'linear' 110 | WARMUP_FACTOR: 0.0 111 | WARMUP_STEPS: 16000 112 | FP16: true 113 | FP16_LOSS_SCALE: 'dynamic' 114 | LOSS_LOGGERS: 115 | - "mlm_loss_wvc,MLMLossWVC" 116 | - "mlm_loss_aux,MLMLossAUX" 117 | - "mvrc_loss,MVRCLoss" 118 | 119 | VAL: 120 | SHUFFLE: false 121 | FLIP_PROB: 0 122 | BATCH_IMAGES: 123 | - 8 124 | - 8 125 | 126 | TEST: 127 | SHUFFLE: false 128 | FLIP_PROB: 0 129 | TEST_EPOCH: 0 130 | BATCH_IMAGES: 131 | - 8 132 | - 8 -------------------------------------------------------------------------------- /cfgs/pretrain/base_prec_4x16G_fp32.yaml: 
-------------------------------------------------------------------------------- 1 | --- 2 | RNG_SEED: 12345 3 | OUTPUT_PATH: './output/pretrain/vlbert' 4 | MODULE: ResNetVLBERTForPretrainingMultitask 5 | GPUS: '0,1,2,3' 6 | LOG_FREQUENT: 100 7 | VAL_FREQUENT: 1 8 | CHECKPOINT_FREQUENT: 1 9 | MODEL_PREFIX: 'vl-bert_base_res101_pretrain_multitask' 10 | NUM_WORKERS_PER_GPU: 4 11 | SCALES: 12 | - 600 13 | - 1000 14 | 15 | DATASET: 16 | 17 | - DATASET: conceptual_captions 18 | APPEND_INDEX: false 19 | DATASET_PATH: './data/conceptual-captions' 20 | ROOT_PATH: './' 21 | TRAIN_IMAGE_SET: 'train' 22 | VAL_IMAGE_SET: 'val' 23 | TEST_IMAGE_SET: 'val' 24 | ADD_IMAGE_AS_A_BOX: true 25 | ZIP_MODE: false 26 | CACHE_MODE: false 27 | IGNORE_DB_CACHE: false 28 | MASK_SIZE: 14 29 | 30 | - DATASET: general_corpus 31 | TRAIN_ANNOTATION_FILE: './data/en_corpus/bc1g.doc+./data/en_corpus/wiki.doc' 32 | VAL_ANNOTATION_FILE: './data/en_corpus/bc1g.doc' 33 | TEST_ANNOTATION_FILE: './data/en_corpus/bc1g.doc' 34 | SEQ_LEN: 64 35 | MIN_SEQ_LEN: 64 36 | 37 | NETWORK: 38 | PARTIAL_PRETRAIN: "" 39 | PARTIAL_PRETRAIN_PREFIX_CHANGES: [] 40 | IMAGE_NUM_LAYERS: 101 41 | IMAGE_C5_DILATED: true 42 | IMAGE_STRIDE_IN_1x1: true 43 | PIXEL_MEANS: 44 | - 102.9801 45 | - 115.9465 46 | - 122.7717 47 | PIXEL_STDS: 48 | - 1.0 49 | - 1.0 50 | - 1.0 51 | IMAGE_FEAT_PRECOMPUTED: true 52 | IMAGE_PRETRAINED: '' 53 | IMAGE_PRETRAINED_EPOCH: 0 54 | IMAGE_FROZEN_BACKBONE_STAGES: 55 | - 1 56 | - 2 57 | IMAGE_FROZEN_BN: true 58 | IMAGE_FINAL_DIM: 768 59 | IMAGE_SEMANTIC: false 60 | OUTPUT_CONV5: false 61 | BERT_MODEL_NAME: './model/pretrained_model/bert-base-uncased' 62 | BERT_PRETRAINED: '' 63 | BERT_PRETRAINED_EPOCH: 0 64 | BERT_FROZEN: false 65 | ENABLE_CNN_REG_LOSS: false 66 | MLM_LOSS_NORM_IN_BATCH_FIRST: false 67 | MVRC_LOSS_NORM_IN_BATCH_FIRST: false 68 | WITH_REL_LOSS: false 69 | WITH_MLM_LOSS: true 70 | WITH_MVRC_LOSS: true 71 | 72 | VLBERT: 73 | with_pooler: false 74 | input_transform_type: 1 75 | visual_size: 768 76 | hidden_size: 768 77 | num_hidden_layers: 12 78 | num_attention_heads: 12 79 | intermediate_size: 3072 80 | hidden_act: "gelu" 81 | hidden_dropout_prob: 0.1 82 | attention_probs_dropout_prob: 0.1 83 | max_position_embeddings: 512 84 | type_vocab_size: 3 85 | vocab_size: 30522 86 | initializer_range: 0.02 87 | visual_scale_text_init: 0.0 88 | visual_scale_object_init: 0.0 89 | visual_ln: true 90 | pos_embedding_frozen: false 91 | 92 | TRAIN: 93 | SHUFFLE: true 94 | FLIP_PROB: 0.5 95 | BATCH_IMAGES: 96 | - 32 97 | - 32 98 | ASPECT_GROUPING: false 99 | RESUME: false 100 | AUTO_RESUME: true 101 | BEGIN_EPOCH: 0 102 | END_EPOCH: 10 103 | OPTIMIZER: 'AdamW' 104 | CLIP_GRAD_NORM: 10 105 | LR: 1.0e-7 106 | LR_SCHEDULE: 'triangle' 107 | WD: 0.0001 108 | WARMUP: true 109 | WARMUP_METHOD: 'linear' 110 | WARMUP_FACTOR: 0.0 111 | WARMUP_STEPS: 16000 112 | FP16: false 113 | FP16_LOSS_SCALE: 128.0 114 | LOSS_LOGGERS: 115 | - "mlm_loss_wvc,MLMLossWVC" 116 | - "mlm_loss_aux,MLMLossAUX" 117 | - "mvrc_loss,MVRCLoss" 118 | 119 | VAL: 120 | SHUFFLE: false 121 | FLIP_PROB: 0 122 | BATCH_IMAGES: 123 | - 32 124 | - 32 125 | 126 | TEST: 127 | SHUFFLE: false 128 | FLIP_PROB: 0 129 | TEST_EPOCH: 0 130 | BATCH_IMAGES: 131 | - 32 132 | - 32 -------------------------------------------------------------------------------- /cfgs/pretrain/base_prec_withouttextonly_4x16G_fp32.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | RNG_SEED: 12345 3 | OUTPUT_PATH: './output/pretrain/vlbert' 4 | MODULE: 
ResNetVLBERTForPretraining 5 | GPUS: '0,1,2,3' 6 | LOG_FREQUENT: 100 7 | VAL_FREQUENT: 1 8 | CHECKPOINT_FREQUENT: 1 9 | MODEL_PREFIX: 'vl-bert_base_res101_pretrain' 10 | NUM_WORKERS_PER_GPU: 4 11 | SCALES: 12 | - 600 13 | - 1000 14 | 15 | DATASET: 16 | DATASET: conceptual_captions 17 | APPEND_INDEX: false 18 | DATASET_PATH: './data/conceptual-captions' 19 | ROOT_PATH: './' 20 | TRAIN_IMAGE_SET: 'train' 21 | VAL_IMAGE_SET: 'val' 22 | TEST_IMAGE_SET: 'val' 23 | ADD_IMAGE_AS_A_BOX: true 24 | ZIP_MODE: false 25 | CACHE_MODE: false 26 | IGNORE_DB_CACHE: false 27 | MASK_SIZE: 14 28 | 29 | NETWORK: 30 | PARTIAL_PRETRAIN: "" 31 | PARTIAL_PRETRAIN_PREFIX_CHANGES: [] 32 | IMAGE_NUM_LAYERS: 101 33 | IMAGE_C5_DILATED: true 34 | IMAGE_STRIDE_IN_1x1: true 35 | PIXEL_MEANS: 36 | - 102.9801 37 | - 115.9465 38 | - 122.7717 39 | PIXEL_STDS: 40 | - 1.0 41 | - 1.0 42 | - 1.0 43 | IMAGE_FEAT_PRECOMPUTED: true 44 | IMAGE_PRETRAINED: '' 45 | IMAGE_PRETRAINED_EPOCH: 0 46 | IMAGE_FROZEN_BACKBONE_STAGES: 47 | - 1 48 | - 2 49 | IMAGE_FROZEN_BN: true 50 | IMAGE_FINAL_DIM: 768 51 | IMAGE_SEMANTIC: false 52 | OUTPUT_CONV5: false 53 | BERT_MODEL_NAME: './model/pretrained_model/bert-base-uncased' 54 | BERT_PRETRAINED: '' 55 | BERT_PRETRAINED_EPOCH: 0 56 | BERT_FROZEN: false 57 | ENABLE_CNN_REG_LOSS: false 58 | MLM_LOSS_NORM_IN_BATCH_FIRST: false 59 | MVRC_LOSS_NORM_IN_BATCH_FIRST: false 60 | WITH_REL_LOSS: false 61 | WITH_MLM_LOSS: true 62 | WITH_MVRC_LOSS: true 63 | 64 | VLBERT: 65 | with_pooler: false 66 | input_transform_type: 1 67 | visual_size: 768 68 | hidden_size: 768 69 | num_hidden_layers: 12 70 | num_attention_heads: 12 71 | intermediate_size: 3072 72 | hidden_act: "gelu" 73 | hidden_dropout_prob: 0.1 74 | attention_probs_dropout_prob: 0.1 75 | max_position_embeddings: 512 76 | type_vocab_size: 3 77 | vocab_size: 30522 78 | initializer_range: 0.02 79 | visual_scale_text_init: 0.0 80 | visual_scale_object_init: 0.0 81 | visual_ln: true 82 | pos_embedding_frozen: false 83 | 84 | TRAIN: 85 | SHUFFLE: true 86 | FLIP_PROB: 0.5 87 | BATCH_IMAGES: 64 88 | ASPECT_GROUPING: false 89 | RESUME: false 90 | AUTO_RESUME: true 91 | BEGIN_EPOCH: 0 92 | END_EPOCH: 10 93 | OPTIMIZER: 'AdamW' 94 | CLIP_GRAD_NORM: 10 95 | LR: 1.0e-7 96 | LR_SCHEDULE: 'triangle' 97 | WD: 0.0001 98 | WARMUP: true 99 | WARMUP_METHOD: 'linear' 100 | WARMUP_FACTOR: 0.0 101 | WARMUP_STEPS: 8000 102 | FP16: false 103 | FP16_LOSS_SCALE: 128.0 104 | LOSS_LOGGERS: 105 | - "mlm_loss,MLMLossWVC" 106 | - "mvrc_loss,MVRCLoss" 107 | 108 | VAL: 109 | SHUFFLE: false 110 | FLIP_PROB: 0 111 | BATCH_IMAGES: 64 112 | 113 | TEST: 114 | SHUFFLE: false 115 | FLIP_PROB: 0 116 | TEST_EPOCH: 0 117 | BATCH_IMAGES: 64 -------------------------------------------------------------------------------- /cfgs/pretrain/large_e2e_16x16G_fp16.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | RNG_SEED: 12345 3 | OUTPUT_PATH: './output/pretrain/vlbert' 4 | MODULE: ResNetVLBERTForPretrainingMultitask 5 | GPUS: '0,1,2,3' 6 | LOG_FREQUENT: 100 7 | VAL_FREQUENT: 1 8 | CHECKPOINT_FREQUENT: 1 9 | MODEL_PREFIX: 'vl-bert_large_res101_pretrain_multitask' 10 | NUM_WORKERS_PER_GPU: 4 11 | SCALES: 12 | - 600 13 | - 1000 14 | 15 | DATASET: 16 | 17 | - DATASET: conceptual_captions 18 | APPEND_INDEX: false 19 | DATASET_PATH: './data/conceptual-captions' 20 | ROOT_PATH: './' 21 | TRAIN_IMAGE_SET: 'train' 22 | VAL_IMAGE_SET: 'val' 23 | TEST_IMAGE_SET: 'val' 24 | ADD_IMAGE_AS_A_BOX: true 25 | ZIP_MODE: false 26 | CACHE_MODE: false 27 | IGNORE_DB_CACHE: 
false 28 | MASK_SIZE: 14 29 | 30 | - DATASET: general_corpus 31 | TRAIN_ANNOTATION_FILE: './data/en_corpus/bc1g.doc+./data/en_corpus/wiki.doc' 32 | VAL_ANNOTATION_FILE: './data/en_corpus/bc1g.doc' 33 | TEST_ANNOTATION_FILE: './data/en_corpus/bc1g.doc' 34 | SEQ_LEN: 64 35 | MIN_SEQ_LEN: 64 36 | 37 | NETWORK: 38 | PARTIAL_PRETRAIN: "" 39 | PARTIAL_PRETRAIN_PREFIX_CHANGES: [] 40 | IMAGE_NUM_LAYERS: 101 41 | IMAGE_C5_DILATED: true 42 | IMAGE_STRIDE_IN_1x1: true 43 | PIXEL_MEANS: 44 | - 102.9801 45 | - 115.9465 46 | - 122.7717 47 | PIXEL_STDS: 48 | - 1.0 49 | - 1.0 50 | - 1.0 51 | IMAGE_FEAT_PRECOMPUTED: false 52 | IMAGE_PRETRAINED: './model/pretrained_model/resnet101-pt-vgbua' 53 | IMAGE_PRETRAINED_EPOCH: 0 54 | IMAGE_FROZEN_BACKBONE_STAGES: 55 | - 1 56 | - 2 57 | IMAGE_FROZEN_BN: true 58 | IMAGE_FINAL_DIM: 1024 59 | IMAGE_SEMANTIC: false 60 | OUTPUT_CONV5: false 61 | BERT_MODEL_NAME: './model/pretrained_model/bert-large-uncased' 62 | BERT_PRETRAINED: '' 63 | BERT_PRETRAINED_EPOCH: 0 64 | BERT_FROZEN: false 65 | ENABLE_CNN_REG_LOSS: false 66 | MLM_LOSS_NORM_IN_BATCH_FIRST: false 67 | MVRC_LOSS_NORM_IN_BATCH_FIRST: false 68 | WITH_REL_LOSS: false 69 | WITH_MLM_LOSS: true 70 | WITH_MVRC_LOSS: true 71 | 72 | VLBERT: 73 | with_pooler: false 74 | input_transform_type: 1 75 | visual_size: 1024 76 | hidden_size: 1024 77 | num_hidden_layers: 24 78 | num_attention_heads: 16 79 | intermediate_size: 4096 80 | hidden_act: "gelu" 81 | hidden_dropout_prob: 0.1 82 | attention_probs_dropout_prob: 0.1 83 | max_position_embeddings: 512 84 | type_vocab_size: 3 85 | vocab_size: 30522 86 | initializer_range: 0.02 87 | visual_scale_text_init: 0.0 88 | visual_scale_object_init: 0.0 89 | visual_ln: true 90 | pos_embedding_frozen: false 91 | 92 | TRAIN: 93 | SHUFFLE: true 94 | FLIP_PROB: 0.5 95 | BATCH_IMAGES: 96 | - 4 97 | - 4 98 | ASPECT_GROUPING: false 99 | RESUME: false 100 | AUTO_RESUME: true 101 | BEGIN_EPOCH: 0 102 | END_EPOCH: 10 103 | OPTIMIZER: 'AdamW' 104 | CLIP_GRAD_NORM: 10 105 | GRAD_ACCUMULATE_STEPS: 2 106 | LR: 1.0e-7 107 | LR_SCHEDULE: 'triangle' 108 | WD: 0.0001 109 | WARMUP: true 110 | WARMUP_METHOD: 'linear' 111 | WARMUP_FACTOR: 0.0 112 | WARMUP_STEPS: 16000 113 | FP16: true 114 | FP16_LOSS_SCALE: 'dynamic' 115 | LOSS_LOGGERS: 116 | - "mlm_loss_wvc,MLMLossWVC" 117 | - "mlm_loss_aux,MLMLossAUX" 118 | - "mvrc_loss,MVRCLoss" 119 | 120 | VAL: 121 | SHUFFLE: false 122 | FLIP_PROB: 0 123 | BATCH_IMAGES: 124 | - 4 125 | - 4 126 | 127 | TEST: 128 | SHUFFLE: false 129 | FLIP_PROB: 0 130 | TEST_EPOCH: 0 131 | BATCH_IMAGES: 132 | - 4 133 | - 4 -------------------------------------------------------------------------------- /cfgs/pretrain/large_prec_4x16G_fp16.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | RNG_SEED: 12345 3 | OUTPUT_PATH: './output/pretrain/vlbert' 4 | MODULE: ResNetVLBERTForPretrainingMultitask 5 | GPUS: '0,1,2,3' 6 | LOG_FREQUENT: 100 7 | VAL_FREQUENT: 1 8 | CHECKPOINT_FREQUENT: 1 9 | MODEL_PREFIX: 'vl-bert_large_res101_pretrain_multitask' 10 | NUM_WORKERS_PER_GPU: 4 11 | SCALES: 12 | - 600 13 | - 1000 14 | 15 | DATASET: 16 | 17 | - DATASET: conceptual_captions 18 | APPEND_INDEX: false 19 | DATASET_PATH: './data/conceptual-captions' 20 | ROOT_PATH: './' 21 | TRAIN_IMAGE_SET: 'train' 22 | VAL_IMAGE_SET: 'val' 23 | TEST_IMAGE_SET: 'val' 24 | ADD_IMAGE_AS_A_BOX: true 25 | ZIP_MODE: false 26 | CACHE_MODE: false 27 | IGNORE_DB_CACHE: false 28 | MASK_SIZE: 14 29 | 30 | - DATASET: general_corpus 31 | TRAIN_ANNOTATION_FILE: 
'./data/en_corpus/bc1g.doc+./data/en_corpus/wiki.doc' 32 | VAL_ANNOTATION_FILE: './data/en_corpus/bc1g.doc' 33 | TEST_ANNOTATION_FILE: './data/en_corpus/bc1g.doc' 34 | SEQ_LEN: 64 35 | MIN_SEQ_LEN: 64 36 | 37 | NETWORK: 38 | PARTIAL_PRETRAIN: "" 39 | PARTIAL_PRETRAIN_PREFIX_CHANGES: [] 40 | IMAGE_NUM_LAYERS: 101 41 | IMAGE_C5_DILATED: true 42 | IMAGE_STRIDE_IN_1x1: true 43 | PIXEL_MEANS: 44 | - 102.9801 45 | - 115.9465 46 | - 122.7717 47 | PIXEL_STDS: 48 | - 1.0 49 | - 1.0 50 | - 1.0 51 | IMAGE_FEAT_PRECOMPUTED: true 52 | IMAGE_PRETRAINED: '' 53 | IMAGE_PRETRAINED_EPOCH: 0 54 | IMAGE_FROZEN_BACKBONE_STAGES: 55 | - 1 56 | - 2 57 | IMAGE_FROZEN_BN: true 58 | IMAGE_FINAL_DIM: 1024 59 | IMAGE_SEMANTIC: false 60 | OUTPUT_CONV5: false 61 | BERT_MODEL_NAME: './model/pretrained_model/bert-large-uncased' 62 | BERT_PRETRAINED: '' 63 | BERT_PRETRAINED_EPOCH: 0 64 | BERT_FROZEN: false 65 | ENABLE_CNN_REG_LOSS: false 66 | MLM_LOSS_NORM_IN_BATCH_FIRST: false 67 | MVRC_LOSS_NORM_IN_BATCH_FIRST: false 68 | WITH_REL_LOSS: false 69 | WITH_MLM_LOSS: true 70 | WITH_MVRC_LOSS: true 71 | 72 | VLBERT: 73 | with_pooler: false 74 | input_transform_type: 1 75 | visual_size: 1024 76 | hidden_size: 1024 77 | num_hidden_layers: 24 78 | num_attention_heads: 16 79 | intermediate_size: 4096 80 | hidden_act: "gelu" 81 | hidden_dropout_prob: 0.1 82 | attention_probs_dropout_prob: 0.1 83 | max_position_embeddings: 512 84 | type_vocab_size: 3 85 | vocab_size: 30522 86 | initializer_range: 0.02 87 | visual_scale_text_init: 0.0 88 | visual_scale_object_init: 0.0 89 | visual_ln: true 90 | pos_embedding_frozen: false 91 | 92 | TRAIN: 93 | SHUFFLE: true 94 | FLIP_PROB: 0.5 95 | BATCH_IMAGES: 96 | - 32 97 | - 32 98 | ASPECT_GROUPING: false 99 | RESUME: false 100 | AUTO_RESUME: true 101 | BEGIN_EPOCH: 0 102 | END_EPOCH: 10 103 | OPTIMIZER: 'AdamW' 104 | CLIP_GRAD_NORM: 10 105 | LR: 1.0e-7 106 | LR_SCHEDULE: 'triangle' 107 | WD: 0.0001 108 | WARMUP: true 109 | WARMUP_METHOD: 'linear' 110 | WARMUP_FACTOR: 0.0 111 | WARMUP_STEPS: 16000 112 | FP16: true 113 | FP16_LOSS_SCALE: 'dynamic' 114 | LOSS_LOGGERS: 115 | - "mlm_loss_wvc,MLMLossWVC" 116 | - "mlm_loss_aux,MLMLossAUX" 117 | - "mvrc_loss,MVRCLoss" 118 | 119 | VAL: 120 | SHUFFLE: false 121 | FLIP_PROB: 0 122 | BATCH_IMAGES: 123 | - 32 124 | - 32 125 | 126 | TEST: 127 | SHUFFLE: false 128 | FLIP_PROB: 0 129 | TEST_EPOCH: 0 130 | BATCH_IMAGES: 131 | - 32 132 | - 32 -------------------------------------------------------------------------------- /cfgs/pretrain/vis_attention_maps_coco.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | RNG_SEED: 12345 3 | OUTPUT_PATH: './output/pretrain/vlbert' 4 | MODULE: ResNetVLBERTForAttentionVis 5 | GPUS: '0' 6 | LOG_FREQUENT: 100 7 | VAL_FREQUENT: 1 8 | CHECKPOINT_FREQUENT: 1 9 | MODEL_PREFIX: 'vl-bert_base_res101_pretrain' 10 | NUM_WORKERS_PER_GPU: 2 11 | SCALES: 12 | - 600 13 | - 1000 14 | 15 | DATASET: 16 | DATASET: coco_captions 17 | APPEND_INDEX: false 18 | DATASET_PATH: './data/coco' 19 | ROOT_PATH: './' 20 | TRAIN_IMAGE_SET: 'train' 21 | VAL_IMAGE_SET: 'val' 22 | TEST_IMAGE_SET: 'val' 23 | ADD_IMAGE_AS_A_BOX: true 24 | ZIP_MODE: false 25 | CACHE_MODE: false 26 | IGNORE_DB_CACHE: false 27 | MASK_SIZE: 14 28 | 29 | NETWORK: 30 | PARTIAL_PRETRAIN: "./model/pretrained_model/vl-bert-base-e2e.model" 31 | PARTIAL_PRETRAIN_PREFIX_CHANGES: [] 32 | IMAGE_NUM_LAYERS: 101 33 | IMAGE_C5_DILATED: true 34 | IMAGE_STRIDE_IN_1x1: true 35 | PIXEL_MEANS: 36 | - 102.9801 37 | - 115.9465 38 | - 122.7717 39 | 
PIXEL_STDS: 40 | - 1.0 41 | - 1.0 42 | - 1.0 43 | IMAGE_FEAT_PRECOMPUTED: false 44 | IMAGE_PRETRAINED: './model/pretrained_model/resnet101-pt-vgbua' 45 | IMAGE_PRETRAINED_EPOCH: 0 46 | IMAGE_FROZEN_BACKBONE_STAGES: 47 | - 1 48 | - 2 49 | IMAGE_FROZEN_BN: true 50 | IMAGE_FINAL_DIM: 768 51 | IMAGE_SEMANTIC: false 52 | OUTPUT_CONV5: false 53 | BERT_MODEL_NAME: './model/pretrained_model/bert-base-uncased' 54 | BERT_PRETRAINED: '' 55 | BERT_PRETRAINED_EPOCH: 0 56 | BERT_FROZEN: false 57 | ENABLE_CNN_REG_LOSS: false 58 | MLM_LOSS_NORM_IN_BATCH_FIRST: false 59 | MVRC_LOSS_NORM_IN_BATCH_FIRST: false 60 | WITH_REL_LOSS: false 61 | WITH_MLM_LOSS: false 62 | WITH_MVRC_LOSS: false 63 | 64 | VLBERT: 65 | with_pooler: false 66 | input_transform_type: 1 67 | visual_size: 768 68 | hidden_size: 768 69 | num_hidden_layers: 12 70 | num_attention_heads: 12 71 | intermediate_size: 3072 72 | hidden_act: "gelu" 73 | hidden_dropout_prob: 0.1 74 | attention_probs_dropout_prob: 0.1 75 | max_position_embeddings: 512 76 | type_vocab_size: 3 77 | vocab_size: 30522 78 | initializer_range: 0.02 79 | visual_scale_text_init: 0.0 80 | visual_scale_object_init: 0.0 81 | visual_ln: true 82 | pos_embedding_frozen: false 83 | 84 | VAL: 85 | SHUFFLE: false 86 | FLIP_PROB: 0 87 | BATCH_IMAGES: 1 -------------------------------------------------------------------------------- /cfgs/refcoco/base_detected_regions_4x16G.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | RNG_SEED: 12345 3 | OUTPUT_PATH: './output/vl-bert/refcoco+' 4 | MODULE: ResNetVLBERT 5 | GPUS: '0,1,2,3' 6 | LOG_FREQUENT: 100 7 | VAL_FREQUENT: 1 8 | CHECKPOINT_FREQUENT: 1 9 | MODEL_PREFIX: 'vl-bert_base_res101_refcoco' 10 | NUM_WORKERS_PER_GPU: 4 11 | SCALES: 12 | - 600 13 | - 1000 14 | 15 | DATASET: 16 | DATASET: refcoco+ 17 | LABEL_INDEX_IN_BATCH: -1 18 | APPEND_INDEX: false 19 | DATASET_PATH: './data/coco' 20 | ROOT_PATH: './' 21 | TRAIN_IMAGE_SET: 'train' 22 | VAL_IMAGE_SET: 'val' 23 | TEST_IMAGE_SET: 'test' 24 | ADD_IMAGE_AS_A_BOX: true 25 | ZIP_MODE: false 26 | CACHE_MODE: false 27 | IGNORE_DB_CACHE: true 28 | TRAIN_BOXES: "proposal+gt" 29 | VAL_BOXES: "proposal" 30 | TEST_BOXES: "proposal" 31 | 32 | 33 | NETWORK: 34 | PARTIAL_PRETRAIN: "./model/pretrained_model/vl-bert-base-e2e.model" 35 | PARTIAL_PRETRAIN_PREFIX_CHANGES: 36 | - "vlbert.mvrc_head.transform->final_mlp.0" 37 | - "module.vlbert.mvrc_head.transform->module.final_mlp.0" 38 | - "vlbert->vlbert" 39 | - "module.vlbert->module.vlbert" 40 | IMAGE_NUM_LAYERS: 101 41 | IMAGE_C5_DILATED: true 42 | IMAGE_STRIDE_IN_1x1: true 43 | PIXEL_MEANS: 44 | - 102.9801 45 | - 115.9465 46 | - 122.7717 47 | PIXEL_STDS: 48 | - 1.0 49 | - 1.0 50 | - 1.0 51 | IMAGE_FEAT_PRECOMPUTED: false 52 | IMAGE_PRETRAINED: './model/pretrained_model/resnet101-pt-vgbua' 53 | IMAGE_PRETRAINED_EPOCH: 0 54 | IMAGE_FROZEN_BACKBONE_STAGES: 55 | - 1 56 | - 2 57 | IMAGE_FROZEN_BN: true 58 | IMAGE_FINAL_DIM: 768 59 | IMAGE_SEMANTIC: false 60 | OUTPUT_CONV5: false 61 | BERT_MODEL_NAME: './model/pretrained_model/bert-base-uncased' 62 | BERT_PRETRAINED: '' 63 | BERT_PRETRAINED_EPOCH: 0 64 | BERT_FROZEN: false 65 | ENABLE_CNN_REG_LOSS: false 66 | 67 | VLBERT: 68 | input_transform_type: 1 69 | visual_size: 768 70 | hidden_size: 768 71 | num_hidden_layers: 12 72 | num_attention_heads: 12 73 | intermediate_size: 3072 74 | hidden_act: "gelu" 75 | hidden_dropout_prob: 0.1 76 | attention_probs_dropout_prob: 0.1 77 | max_position_embeddings: 512 78 | type_vocab_size: 3 79 | vocab_size: 30522 80 | 
initializer_range: 0.02 81 | visual_scale_text_init: 0.0 82 | visual_scale_object_init: 0.0 83 | visual_ln: true 84 | 85 | CLASSIFIER_DROPOUT: 0.0 86 | 87 | TRAIN: 88 | SHUFFLE: true 89 | FLIP_PROB: 0.5 90 | BATCH_IMAGES: 4 91 | ASPECT_GROUPING: true 92 | RESUME: false 93 | AUTO_RESUME: true 94 | BEGIN_EPOCH: 0 95 | END_EPOCH: 20 96 | OPTIMIZER: 'AdamW' 97 | CLIP_GRAD_NORM: 1.0 98 | GRAD_ACCUMULATE_STEPS: 2 99 | LR: 8.00e-7 100 | LR_SCHEDULE: 'triangle' 101 | WD: 0.0001 102 | WARMUP: true 103 | WARMUP_METHOD: 'linear' 104 | WARMUP_FACTOR: 0.0 105 | WARMUP_STEPS: 3750 106 | FP16: false 107 | FP16_LOSS_SCALE: 128.0 108 | 109 | VAL: 110 | SHUFFLE: false 111 | FLIP_PROB: 0 112 | BATCH_IMAGES: 4 113 | 114 | TEST: 115 | SHUFFLE: false 116 | FLIP_PROB: 0 117 | TEST_EPOCH: 0 118 | BATCH_IMAGES: 4 119 | -------------------------------------------------------------------------------- /cfgs/refcoco/base_gt_boxes_4x16G.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | RNG_SEED: 12345 3 | OUTPUT_PATH: './output/vl-bert/refcoco+' 4 | MODULE: ResNetVLBERT 5 | GPUS: '0,1,2,3' 6 | LOG_FREQUENT: 100 7 | VAL_FREQUENT: 1 8 | CHECKPOINT_FREQUENT: 1 9 | MODEL_PREFIX: 'vl-bert_base_res101_refcoco' 10 | NUM_WORKERS_PER_GPU: 4 11 | SCALES: 12 | - 600 13 | - 1000 14 | 15 | DATASET: 16 | DATASET: refcoco+ 17 | LABEL_INDEX_IN_BATCH: -1 18 | APPEND_INDEX: false 19 | DATASET_PATH: './data/coco' 20 | ROOT_PATH: './' 21 | TRAIN_IMAGE_SET: 'train' 22 | VAL_IMAGE_SET: 'val' 23 | TEST_IMAGE_SET: 'test' 24 | ADD_IMAGE_AS_A_BOX: true 25 | ZIP_MODE: false 26 | CACHE_MODE: false 27 | IGNORE_DB_CACHE: true 28 | TRAIN_BOXES: "gt" 29 | VAL_BOXES: "gt" 30 | TEST_BOXES: "gt" 31 | 32 | 33 | NETWORK: 34 | PARTIAL_PRETRAIN: "./model/pretrained_model/vl-bert-base-e2e.model" 35 | PARTIAL_PRETRAIN_PREFIX_CHANGES: 36 | - "vlbert.mvrc_head.transform->final_mlp.0" 37 | - "module.vlbert.mvrc_head.transform->module.final_mlp.0" 38 | - "vlbert->vlbert" 39 | - "module.vlbert->module.vlbert" 40 | IMAGE_NUM_LAYERS: 101 41 | IMAGE_C5_DILATED: true 42 | IMAGE_STRIDE_IN_1x1: true 43 | PIXEL_MEANS: 44 | - 102.9801 45 | - 115.9465 46 | - 122.7717 47 | PIXEL_STDS: 48 | - 1.0 49 | - 1.0 50 | - 1.0 51 | IMAGE_FEAT_PRECOMPUTED: false 52 | IMAGE_PRETRAINED: './model/pretrained_model/resnet101-pt-vgbua' 53 | IMAGE_PRETRAINED_EPOCH: 0 54 | IMAGE_FROZEN_BACKBONE_STAGES: 55 | - 1 56 | - 2 57 | IMAGE_FROZEN_BN: true 58 | IMAGE_FINAL_DIM: 768 59 | IMAGE_SEMANTIC: false 60 | OUTPUT_CONV5: false 61 | BERT_MODEL_NAME: './model/pretrained_model/bert-base-uncased' 62 | BERT_PRETRAINED: '' 63 | BERT_PRETRAINED_EPOCH: 0 64 | BERT_FROZEN: false 65 | ENABLE_CNN_REG_LOSS: false 66 | 67 | VLBERT: 68 | input_transform_type: 1 69 | visual_size: 768 70 | hidden_size: 768 71 | num_hidden_layers: 12 72 | num_attention_heads: 12 73 | intermediate_size: 3072 74 | hidden_act: "gelu" 75 | hidden_dropout_prob: 0.1 76 | attention_probs_dropout_prob: 0.1 77 | max_position_embeddings: 512 78 | type_vocab_size: 3 79 | vocab_size: 30522 80 | initializer_range: 0.02 81 | visual_scale_text_init: 0.0 82 | visual_scale_object_init: 0.0 83 | visual_ln: true 84 | 85 | CLASSIFIER_DROPOUT: 0.0 86 | 87 | TRAIN: 88 | SHUFFLE: true 89 | FLIP_PROB: 0.5 90 | BATCH_IMAGES: 4 91 | ASPECT_GROUPING: true 92 | RESUME: false 93 | AUTO_RESUME: true 94 | BEGIN_EPOCH: 0 95 | END_EPOCH: 20 96 | OPTIMIZER: 'AdamW' 97 | CLIP_GRAD_NORM: 1.0 98 | GRAD_ACCUMULATE_STEPS: 2 99 | LR: 8.00e-7 100 | LR_SCHEDULE: 'triangle' 101 | WD: 0.0001 102 | WARMUP: true 103 | 
WARMUP_METHOD: 'linear' 104 | WARMUP_FACTOR: 0.0 105 | WARMUP_STEPS: 3750 106 | FP16: false 107 | FP16_LOSS_SCALE: 128.0 108 | 109 | VAL: 110 | SHUFFLE: false 111 | FLIP_PROB: 0 112 | BATCH_IMAGES: 4 113 | 114 | TEST: 115 | SHUFFLE: false 116 | FLIP_PROB: 0 117 | TEST_EPOCH: 0 118 | BATCH_IMAGES: 4 119 | -------------------------------------------------------------------------------- /cfgs/refcoco/large_detected_regions_4x16G.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | RNG_SEED: 12345 3 | OUTPUT_PATH: './output/vl-bert/refcoco+' 4 | MODULE: ResNetVLBERT 5 | GPUS: '0,1,2,3' 6 | LOG_FREQUENT: 100 7 | VAL_FREQUENT: 1 8 | CHECKPOINT_FREQUENT: 1 9 | MODEL_PREFIX: 'vl-bert_large_res101_refcoco' 10 | NUM_WORKERS_PER_GPU: 4 11 | SCALES: 12 | - 600 13 | - 1000 14 | 15 | DATASET: 16 | DATASET: refcoco+ 17 | LABEL_INDEX_IN_BATCH: -1 18 | APPEND_INDEX: false 19 | DATASET_PATH: './data/coco' 20 | ROOT_PATH: './' 21 | TRAIN_IMAGE_SET: 'train' 22 | VAL_IMAGE_SET: 'val' 23 | TEST_IMAGE_SET: 'test' 24 | ADD_IMAGE_AS_A_BOX: true 25 | ZIP_MODE: false 26 | CACHE_MODE: false 27 | IGNORE_DB_CACHE: true 28 | TRAIN_BOXES: "proposal+gt" 29 | VAL_BOXES: "proposal" 30 | TEST_BOXES: "proposal" 31 | 32 | 33 | NETWORK: 34 | PARTIAL_PRETRAIN: "./model/pretrained_model/vl-bert-large-e2e.model" 35 | PARTIAL_PRETRAIN_PREFIX_CHANGES: 36 | - "vlbert.mvrc_head.transform->final_mlp.0" 37 | - "module.vlbert.mvrc_head.transform->module.final_mlp.0" 38 | - "vlbert->vlbert" 39 | - "module.vlbert->module.vlbert" 40 | IMAGE_NUM_LAYERS: 101 41 | IMAGE_C5_DILATED: true 42 | IMAGE_STRIDE_IN_1x1: true 43 | PIXEL_MEANS: 44 | - 102.9801 45 | - 115.9465 46 | - 122.7717 47 | PIXEL_STDS: 48 | - 1.0 49 | - 1.0 50 | - 1.0 51 | IMAGE_FEAT_PRECOMPUTED: false 52 | IMAGE_PRETRAINED: './model/pretrained_model/resnet101-pt-vgbua' 53 | IMAGE_PRETRAINED_EPOCH: 0 54 | IMAGE_FROZEN_BACKBONE_STAGES: 55 | - 1 56 | - 2 57 | IMAGE_FROZEN_BN: true 58 | IMAGE_FINAL_DIM: 1024 59 | IMAGE_SEMANTIC: false 60 | OUTPUT_CONV5: false 61 | BERT_MODEL_NAME: './model/pretrained_model/bert-large-uncased' 62 | BERT_PRETRAINED: '' 63 | BERT_PRETRAINED_EPOCH: 0 64 | BERT_FROZEN: false 65 | ENABLE_CNN_REG_LOSS: false 66 | 67 | VLBERT: 68 | input_transform_type: 1 69 | visual_size: 1024 70 | hidden_size: 1024 71 | num_hidden_layers: 24 72 | num_attention_heads: 16 73 | intermediate_size: 4096 74 | hidden_act: "gelu" 75 | hidden_dropout_prob: 0.1 76 | attention_probs_dropout_prob: 0.1 77 | max_position_embeddings: 512 78 | type_vocab_size: 3 79 | vocab_size: 30522 80 | initializer_range: 0.02 81 | visual_scale_text_init: 0.0 82 | visual_scale_object_init: 0.0 83 | visual_ln: true 84 | 85 | CLASSIFIER_DROPOUT: 0.0 86 | 87 | TRAIN: 88 | SHUFFLE: true 89 | FLIP_PROB: 0.5 90 | BATCH_IMAGES: 2 91 | ASPECT_GROUPING: true 92 | RESUME: false 93 | AUTO_RESUME: true 94 | BEGIN_EPOCH: 0 95 | END_EPOCH: 20 96 | OPTIMIZER: 'AdamW' 97 | CLIP_GRAD_NORM: 1.0 98 | GRAD_ACCUMULATE_STEPS: 4 99 | LR: 8.00e-7 100 | LR_SCHEDULE: 'triangle' 101 | WD: 0.0001 102 | WARMUP: true 103 | WARMUP_METHOD: 'linear' 104 | WARMUP_FACTOR: 0.0 105 | WARMUP_STEPS: 3750 106 | FP16: false 107 | FP16_LOSS_SCALE: 128.0 108 | 109 | VAL: 110 | SHUFFLE: false 111 | FLIP_PROB: 0 112 | BATCH_IMAGES: 2 113 | 114 | TEST: 115 | SHUFFLE: false 116 | FLIP_PROB: 0 117 | TEST_EPOCH: 0 118 | BATCH_IMAGES: 2 -------------------------------------------------------------------------------- /cfgs/refcoco/large_gt_boxes_4x16G.yaml: 
-------------------------------------------------------------------------------- 1 | --- 2 | RNG_SEED: 12345 3 | OUTPUT_PATH: './output/vl-bert/refcoco+' 4 | MODULE: ResNetVLBERT 5 | GPUS: '0,1,2,3' 6 | LOG_FREQUENT: 100 7 | VAL_FREQUENT: 1 8 | CHECKPOINT_FREQUENT: 1 9 | MODEL_PREFIX: 'vl-bert_large_res101_refcoco' 10 | NUM_WORKERS_PER_GPU: 4 11 | SCALES: 12 | - 600 13 | - 1000 14 | 15 | DATASET: 16 | DATASET: refcoco+ 17 | LABEL_INDEX_IN_BATCH: -1 18 | APPEND_INDEX: false 19 | DATASET_PATH: './data/coco' 20 | ROOT_PATH: './' 21 | TRAIN_IMAGE_SET: 'train' 22 | VAL_IMAGE_SET: 'val' 23 | TEST_IMAGE_SET: 'test' 24 | ADD_IMAGE_AS_A_BOX: true 25 | ZIP_MODE: false 26 | CACHE_MODE: false 27 | IGNORE_DB_CACHE: true 28 | TRAIN_BOXES: "gt" 29 | VAL_BOXES: "gt" 30 | TEST_BOXES: "gt" 31 | 32 | 33 | NETWORK: 34 | PARTIAL_PRETRAIN: "./model/pretrained_model/vl-bert-large-e2e.model" 35 | PARTIAL_PRETRAIN_PREFIX_CHANGES: 36 | - "vlbert.mvrc_head.transform->final_mlp.0" 37 | - "module.vlbert.mvrc_head.transform->module.final_mlp.0" 38 | - "vlbert->vlbert" 39 | - "module.vlbert->module.vlbert" 40 | IMAGE_NUM_LAYERS: 101 41 | IMAGE_C5_DILATED: true 42 | IMAGE_STRIDE_IN_1x1: true 43 | PIXEL_MEANS: 44 | - 102.9801 45 | - 115.9465 46 | - 122.7717 47 | PIXEL_STDS: 48 | - 1.0 49 | - 1.0 50 | - 1.0 51 | IMAGE_FEAT_PRECOMPUTED: false 52 | IMAGE_PRETRAINED: './model/pretrained_model/resnet101-pt-vgbua' 53 | IMAGE_PRETRAINED_EPOCH: 0 54 | IMAGE_FROZEN_BACKBONE_STAGES: 55 | - 1 56 | - 2 57 | IMAGE_FROZEN_BN: true 58 | IMAGE_FINAL_DIM: 1024 59 | IMAGE_SEMANTIC: false 60 | OUTPUT_CONV5: false 61 | BERT_MODEL_NAME: './model/pretrained_model/bert-large-uncased' 62 | BERT_PRETRAINED: '' 63 | BERT_PRETRAINED_EPOCH: 0 64 | BERT_FROZEN: false 65 | ENABLE_CNN_REG_LOSS: false 66 | 67 | VLBERT: 68 | input_transform_type: 1 69 | visual_size: 1024 70 | hidden_size: 1024 71 | num_hidden_layers: 24 72 | num_attention_heads: 16 73 | intermediate_size: 4096 74 | hidden_act: "gelu" 75 | hidden_dropout_prob: 0.1 76 | attention_probs_dropout_prob: 0.1 77 | max_position_embeddings: 512 78 | type_vocab_size: 3 79 | vocab_size: 30522 80 | initializer_range: 0.02 81 | visual_scale_text_init: 0.0 82 | visual_scale_object_init: 0.0 83 | visual_ln: true 84 | 85 | CLASSIFIER_DROPOUT: 0.0 86 | 87 | TRAIN: 88 | SHUFFLE: true 89 | FLIP_PROB: 0.5 90 | BATCH_IMAGES: 2 91 | ASPECT_GROUPING: true 92 | RESUME: false 93 | AUTO_RESUME: true 94 | BEGIN_EPOCH: 0 95 | END_EPOCH: 20 96 | OPTIMIZER: 'AdamW' 97 | CLIP_GRAD_NORM: 1.0 98 | GRAD_ACCUMULATE_STEPS: 4 99 | LR: 8.00e-7 100 | LR_SCHEDULE: 'triangle' 101 | WD: 0.0001 102 | WARMUP: true 103 | WARMUP_METHOD: 'linear' 104 | WARMUP_FACTOR: 0.0 105 | WARMUP_STEPS: 3750 106 | FP16: false 107 | FP16_LOSS_SCALE: 128.0 108 | 109 | VAL: 110 | SHUFFLE: false 111 | FLIP_PROB: 0 112 | BATCH_IMAGES: 2 113 | 114 | TEST: 115 | SHUFFLE: false 116 | FLIP_PROB: 0 117 | TEST_EPOCH: 0 118 | BATCH_IMAGES: 2 -------------------------------------------------------------------------------- /cfgs/vcr/base_q2a_4x16G_fp32.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | RNG_SEED: 12345 3 | OUTPUT_PATH: './output/vl-bert/vcr' 4 | MODULE: ResNetVLBERT 5 | GPUS: '0,1,2,3' 6 | LOG_FREQUENT: 100 7 | VAL_FREQUENT: 1 8 | CHECKPOINT_FREQUENT: 1 9 | MODEL_PREFIX: 'vl-bert_base_a_res101' 10 | NUM_WORKERS_PER_GPU: 4 11 | SCALES: 12 | - 600 13 | - 1200 14 | 15 | DATASET: 16 | DATASET: vcr 17 | LABEL_INDEX_IN_BATCH: 7 18 | APPEND_INDEX: false 19 | TASK: 'Q2A' 20 | BASIC_ALIGN: false 21 | 
DATASET_PATH: './data/vcr' 22 | ROOT_PATH: './' 23 | TRAIN_IMAGE_SET: 'vcr1images' 24 | VAL_IMAGE_SET: 'vcr1images' 25 | TEST_IMAGE_SET: 'vcr1images' 26 | TRAIN_ANNOTATION_FILE: 'train.jsonl' 27 | VAL_ANNOTATION_FILE: 'val.jsonl' 28 | TEST_ANNOTATION_FILE: 'test.jsonl' 29 | ONLY_USE_RELEVANT_DETS: false 30 | ADD_IMAGE_AS_A_BOX: true 31 | ZIP_MODE: false 32 | CACHE_MODE: false 33 | IGNORE_DB_CACHE: true 34 | MASK_SIZE: 14 35 | 36 | NETWORK: 37 | PARTIAL_PRETRAIN: "./model/pretrained_model/vl-bert-base-e2e.model" 38 | PARTIAL_PRETRAIN_PREFIX_CHANGES: 39 | - "vlbert.mvrc_head.transform->cnn_loss_reg.0" 40 | - "module.vlbert.mvrc_head.transform->module.cnn_loss_reg.0" 41 | - "module.vlbert->module.vlbert._module" 42 | - "vlbert->vlbert._module" 43 | PARTIAL_PRETRAIN_SEGMB_INIT: true 44 | IMAGE_NUM_LAYERS: 101 45 | IMAGE_C5_DILATED: true 46 | IMAGE_STRIDE_IN_1x1: true 47 | PIXEL_MEANS: 48 | - 102.9801 49 | - 115.9465 50 | - 122.7717 51 | PIXEL_STDS: 52 | - 1.0 53 | - 1.0 54 | - 1.0 55 | IMAGE_PRETRAINED: './model/pretrained_model/resnet101-pt-vgbua' 56 | IMAGE_PRETRAINED_EPOCH: 0 57 | IMAGE_FROZEN_BACKBONE_STAGES: 58 | - 1 59 | - 2 60 | IMAGE_FROZEN_BN: true 61 | IMAGE_FINAL_DIM: 768 62 | IMAGE_SEMANTIC: false 63 | OUTPUT_CONV5: false 64 | BERT_MODEL_NAME: './model/pretrained_model/bert-base-uncased' 65 | BERT_PRETRAINED: '' 66 | BERT_PRETRAINED_EPOCH: 0 67 | BERT_FROZEN: false 68 | ENABLE_CNN_REG_LOSS: true 69 | ANS_LOSS_WEIGHT: 1.0 70 | CNN_LOSS_TOP: true 71 | 72 | VLBERT: 73 | input_transform_type: 1 74 | visual_size: 768 75 | hidden_size: 768 76 | num_hidden_layers: 12 77 | num_attention_heads: 12 78 | intermediate_size: 3072 79 | hidden_act: "gelu" 80 | hidden_dropout_prob: 0.1 81 | attention_probs_dropout_prob: 0.1 82 | max_position_embeddings: 512 83 | type_vocab_size: 3 84 | vocab_size: 30522 85 | initializer_range: 0.02 86 | visual_scale_text_init: 0.0 87 | visual_scale_object_init: 0.0 88 | visual_ln: true 89 | object_word_embed_mode: 2 90 | 91 | CLASSIFIER_TYPE: "1fc" 92 | CLASSIFIER_HIDDEN_SIZE: 1024 93 | CLASSIFIER_DROPOUT: 0.1 94 | CLASSIFIER_SIGMOID: true 95 | 96 | TRAIN: 97 | SHUFFLE: true 98 | FLIP_PROB: 0.5 99 | BATCH_IMAGES: 4 100 | ASPECT_GROUPING: false 101 | RESUME: false 102 | AUTO_RESUME: true 103 | BEGIN_EPOCH: 0 104 | END_EPOCH: 20 105 | OPTIMIZER: 'SGD' 106 | CLIP_GRAD_NORM: 10 107 | GRAD_ACCUMULATE_STEPS: 4 108 | LR_FACTOR: 0.1 109 | LR_STEP: "14,18" 110 | LR: 7.0e-5 111 | WD: 0.0001 112 | WARMUP: true 113 | WARMUP_METHOD: 'linear' 114 | WARMUP_FACTOR: 0.0 115 | WARMUP_STEPS: 1000 116 | MOMENTUM: 0.9 117 | FP16: false 118 | FP16_LOSS_SCALE: 128.0 119 | 120 | VAL: 121 | SHUFFLE: false 122 | FLIP_PROB: 0 123 | BATCH_IMAGES: 4 124 | 125 | TEST: 126 | SHUFFLE: false 127 | FLIP_PROB: 0 128 | TEST_EPOCH: 0 129 | BATCH_IMAGES: 4 -------------------------------------------------------------------------------- /cfgs/vcr/base_qa2r_4x16G_fp32.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | RNG_SEED: 12345 3 | OUTPUT_PATH: './output/vl-bert/vcr' 4 | MODULE: ResNetVLBERT 5 | GPUS: '0,1,2,3' 6 | LOG_FREQUENT: 100 7 | VAL_FREQUENT: 1 8 | CHECKPOINT_FREQUENT: 1 9 | MODEL_PREFIX: 'vl-bert_base_r_res101' 10 | NUM_WORKERS_PER_GPU: 4 11 | SCALES: 12 | - 600 13 | - 1200 14 | 15 | DATASET: 16 | DATASET: vcr 17 | LABEL_INDEX_IN_BATCH: 7 18 | APPEND_INDEX: false 19 | TASK: 'QA2R' 20 | BASIC_ALIGN: false 21 | DATASET_PATH: './data/vcr' 22 | ROOT_PATH: './' 23 | TRAIN_IMAGE_SET: 'vcr1images' 24 | VAL_IMAGE_SET: 'vcr1images' 25 | 
TEST_IMAGE_SET: 'vcr1images' 26 | TRAIN_ANNOTATION_FILE: 'train.jsonl' 27 | VAL_ANNOTATION_FILE: 'val.jsonl' 28 | TEST_ANNOTATION_FILE: 'test.jsonl' 29 | ONLY_USE_RELEVANT_DETS: false 30 | ADD_IMAGE_AS_A_BOX: true 31 | ZIP_MODE: false 32 | CACHE_MODE: false 33 | IGNORE_DB_CACHE: true 34 | MASK_SIZE: 14 35 | 36 | NETWORK: 37 | PARTIAL_PRETRAIN: "./model/pretrained_model/vl-bert-base-e2e.model" 38 | PARTIAL_PRETRAIN_PREFIX_CHANGES: 39 | - "vlbert.mvrc_head.transform->cnn_loss_reg.0" 40 | - "module.vlbert.mvrc_head.transform->module.cnn_loss_reg.0" 41 | - "module.vlbert->module.vlbert._module" 42 | - "vlbert->vlbert._module" 43 | PARTIAL_PRETRAIN_SEGMB_INIT: true 44 | IMAGE_NUM_LAYERS: 101 45 | IMAGE_C5_DILATED: true 46 | IMAGE_STRIDE_IN_1x1: true 47 | PIXEL_MEANS: 48 | - 102.9801 49 | - 115.9465 50 | - 122.7717 51 | PIXEL_STDS: 52 | - 1.0 53 | - 1.0 54 | - 1.0 55 | IMAGE_PRETRAINED: './model/pretrained_model/resnet101-pt-vgbua' 56 | IMAGE_PRETRAINED_EPOCH: 0 57 | IMAGE_FROZEN_BACKBONE_STAGES: 58 | - 1 59 | - 2 60 | IMAGE_FROZEN_BN: true 61 | IMAGE_FINAL_DIM: 768 62 | IMAGE_SEMANTIC: false 63 | OUTPUT_CONV5: false 64 | BERT_MODEL_NAME: './model/pretrained_model/bert-base-uncased' 65 | BERT_PRETRAINED: '' 66 | BERT_PRETRAINED_EPOCH: 0 67 | BERT_FROZEN: false 68 | ENABLE_CNN_REG_LOSS: true 69 | ANS_LOSS_WEIGHT: 1.0 70 | CNN_LOSS_TOP: true 71 | 72 | VLBERT: 73 | input_transform_type: 1 74 | visual_size: 768 75 | hidden_size: 768 76 | num_hidden_layers: 12 77 | num_attention_heads: 12 78 | intermediate_size: 3072 79 | hidden_act: "gelu" 80 | hidden_dropout_prob: 0.1 81 | attention_probs_dropout_prob: 0.1 82 | max_position_embeddings: 512 83 | type_vocab_size: 3 84 | vocab_size: 30522 85 | initializer_range: 0.02 86 | visual_scale_text_init: 0.0 87 | visual_scale_object_init: 0.0 88 | visual_ln: true 89 | object_word_embed_mode: 2 90 | 91 | CLASSIFIER_TYPE: "1fc" 92 | CLASSIFIER_HIDDEN_SIZE: 1024 93 | CLASSIFIER_DROPOUT: 0.1 94 | CLASSIFIER_SIGMOID: true 95 | 96 | TRAIN: 97 | SHUFFLE: true 98 | FLIP_PROB: 0.5 99 | BATCH_IMAGES: 4 100 | ASPECT_GROUPING: false 101 | RESUME: false 102 | AUTO_RESUME: true 103 | BEGIN_EPOCH: 0 104 | END_EPOCH: 20 105 | OPTIMIZER: 'SGD' 106 | CLIP_GRAD_NORM: 10 107 | GRAD_ACCUMULATE_STEPS: 4 108 | LR_FACTOR: 0.1 109 | LR_STEP: "14,18" 110 | LR: 7.0e-5 111 | WD: 0.0001 112 | WARMUP: true 113 | WARMUP_METHOD: 'linear' 114 | WARMUP_FACTOR: 0.0 115 | WARMUP_STEPS: 1000 116 | MOMENTUM: 0.9 117 | FP16: false 118 | FP16_LOSS_SCALE: 128.0 119 | 120 | VAL: 121 | SHUFFLE: false 122 | FLIP_PROB: 0 123 | BATCH_IMAGES: 4 124 | 125 | TEST: 126 | SHUFFLE: false 127 | FLIP_PROB: 0 128 | TEST_EPOCH: 0 129 | BATCH_IMAGES: 4 -------------------------------------------------------------------------------- /cfgs/vcr/large_q2a_16x16G_fp16.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | RNG_SEED: 12345 3 | OUTPUT_PATH: './output/vl-bert/vcr' 4 | MODULE: ResNetVLBERT 5 | GPUS: '0,1,2,3' 6 | LOG_FREQUENT: 100 7 | VAL_FREQUENT: 1 8 | CHECKPOINT_FREQUENT: 1 9 | MODEL_PREFIX: 'vl-bert_large_a_res101' 10 | NUM_WORKERS_PER_GPU: 4 11 | SCALES: 12 | - 600 13 | - 1200 14 | 15 | DATASET: 16 | DATASET: vcr 17 | LABEL_INDEX_IN_BATCH: 7 18 | APPEND_INDEX: false 19 | TASK: 'Q2A' 20 | BASIC_ALIGN: false 21 | DATASET_PATH: './data/vcr' 22 | ROOT_PATH: './' 23 | TRAIN_IMAGE_SET: 'vcr1images' 24 | VAL_IMAGE_SET: 'vcr1images' 25 | TEST_IMAGE_SET: 'vcr1images' 26 | TRAIN_ANNOTATION_FILE: 'train.jsonl' 27 | VAL_ANNOTATION_FILE: 'val.jsonl' 28 | 
TEST_ANNOTATION_FILE: 'test.jsonl' 29 | ONLY_USE_RELEVANT_DETS: false 30 | ADD_IMAGE_AS_A_BOX: true 31 | ZIP_MODE: false 32 | CACHE_MODE: false 33 | IGNORE_DB_CACHE: true 34 | MASK_SIZE: 14 35 | 36 | NETWORK: 37 | PARTIAL_PRETRAIN: "./model/pretrained_model/vl-bert-large-e2e.model" 38 | PARTIAL_PRETRAIN_PREFIX_CHANGES: 39 | - "vlbert.mvrc_head.transform->cnn_loss_reg.0" 40 | - "module.vlbert.mvrc_head.transform->module.cnn_loss_reg.0" 41 | - "module.vlbert->module.vlbert._module" 42 | - "vlbert->vlbert._module" 43 | PARTIAL_PRETRAIN_SEGMB_INIT: true 44 | IMAGE_NUM_LAYERS: 101 45 | IMAGE_C5_DILATED: true 46 | IMAGE_STRIDE_IN_1x1: true 47 | PIXEL_MEANS: 48 | - 102.9801 49 | - 115.9465 50 | - 122.7717 51 | PIXEL_STDS: 52 | - 1.0 53 | - 1.0 54 | - 1.0 55 | IMAGE_PRETRAINED: './model/pretrained_model/resnet101-pt-vgbua' 56 | IMAGE_PRETRAINED_EPOCH: 0 57 | IMAGE_FROZEN_BACKBONE_STAGES: 58 | - 1 59 | - 2 60 | IMAGE_FROZEN_BN: true 61 | IMAGE_FINAL_DIM: 1024 62 | IMAGE_SEMANTIC: false 63 | OUTPUT_CONV5: false 64 | BERT_MODEL_NAME: './model/pretrained_model/bert-large-uncased' 65 | BERT_PRETRAINED: '' 66 | BERT_PRETRAINED_EPOCH: 0 67 | BERT_FROZEN: false 68 | ENABLE_CNN_REG_LOSS: true 69 | ANS_LOSS_WEIGHT: 1.0 70 | CNN_LOSS_TOP: true 71 | 72 | VLBERT: 73 | input_transform_type: 1 74 | visual_size: 1024 75 | hidden_size: 1024 76 | num_hidden_layers: 24 77 | num_attention_heads: 16 78 | intermediate_size: 4096 79 | hidden_act: "gelu" 80 | hidden_dropout_prob: 0.1 81 | attention_probs_dropout_prob: 0.1 82 | max_position_embeddings: 512 83 | type_vocab_size: 3 84 | vocab_size: 30522 85 | initializer_range: 0.02 86 | visual_scale_text_init: 0.0 87 | visual_scale_object_init: 0.0 88 | visual_ln: true 89 | object_word_embed_mode: 2 90 | 91 | CLASSIFIER_TYPE: "1fc" 92 | CLASSIFIER_HIDDEN_SIZE: 1024 93 | CLASSIFIER_DROPOUT: 0.1 94 | CLASSIFIER_SIGMOID: true 95 | 96 | TRAIN: 97 | SHUFFLE: true 98 | FLIP_PROB: 0.5 99 | BATCH_IMAGES: 4 100 | ASPECT_GROUPING: false 101 | RESUME: false 102 | AUTO_RESUME: true 103 | BEGIN_EPOCH: 0 104 | END_EPOCH: 20 105 | OPTIMIZER: 'SGD' 106 | CLIP_GRAD_NORM: 10 107 | LR_FACTOR: 0.1 108 | LR_STEP: "14,18" 109 | LR: 7.0e-5 110 | WD: 0.0001 111 | WARMUP: true 112 | WARMUP_METHOD: 'linear' 113 | WARMUP_FACTOR: 0.0 114 | WARMUP_STEPS: 1000 115 | MOMENTUM: 0.9 116 | FP16: true 117 | FP16_LOSS_SCALE: 'dynamic' 118 | 119 | VAL: 120 | SHUFFLE: false 121 | FLIP_PROB: 0 122 | BATCH_IMAGES: 4 123 | 124 | TEST: 125 | SHUFFLE: false 126 | FLIP_PROB: 0 127 | TEST_EPOCH: 0 128 | BATCH_IMAGES: 4 -------------------------------------------------------------------------------- /cfgs/vcr/large_q2a_4x16G_fp16.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | RNG_SEED: 12345 3 | OUTPUT_PATH: './output/vl-bert/vcr' 4 | MODULE: ResNetVLBERT 5 | GPUS: '0,1,2,3' 6 | LOG_FREQUENT: 100 7 | VAL_FREQUENT: 1 8 | CHECKPOINT_FREQUENT: 1 9 | MODEL_PREFIX: 'vl-bert_large_a_res101' 10 | NUM_WORKERS_PER_GPU: 4 11 | SCALES: 12 | - 600 13 | - 1200 14 | 15 | DATASET: 16 | DATASET: vcr 17 | LABEL_INDEX_IN_BATCH: 7 18 | APPEND_INDEX: false 19 | TASK: 'Q2A' 20 | BASIC_ALIGN: false 21 | DATASET_PATH: './data/vcr' 22 | ROOT_PATH: './' 23 | TRAIN_IMAGE_SET: 'vcr1images' 24 | VAL_IMAGE_SET: 'vcr1images' 25 | TEST_IMAGE_SET: 'vcr1images' 26 | TRAIN_ANNOTATION_FILE: 'train.jsonl' 27 | VAL_ANNOTATION_FILE: 'val.jsonl' 28 | TEST_ANNOTATION_FILE: 'test.jsonl' 29 | ONLY_USE_RELEVANT_DETS: false 30 | ADD_IMAGE_AS_A_BOX: true 31 | ZIP_MODE: false 32 | CACHE_MODE: false 33 | 
IGNORE_DB_CACHE: true 34 | MASK_SIZE: 14 35 | 36 | NETWORK: 37 | PARTIAL_PRETRAIN: "./model/pretrained_model/vl-bert-large-e2e.model" 38 | PARTIAL_PRETRAIN_PREFIX_CHANGES: 39 | - "vlbert.mvrc_head.transform->cnn_loss_reg.0" 40 | - "module.vlbert.mvrc_head.transform->module.cnn_loss_reg.0" 41 | - "module.vlbert->module.vlbert._module" 42 | - "vlbert->vlbert._module" 43 | PARTIAL_PRETRAIN_SEGMB_INIT: true 44 | IMAGE_NUM_LAYERS: 101 45 | IMAGE_C5_DILATED: true 46 | IMAGE_STRIDE_IN_1x1: true 47 | PIXEL_MEANS: 48 | - 102.9801 49 | - 115.9465 50 | - 122.7717 51 | PIXEL_STDS: 52 | - 1.0 53 | - 1.0 54 | - 1.0 55 | IMAGE_PRETRAINED: './model/pretrained_model/resnet101-pt-vgbua' 56 | IMAGE_PRETRAINED_EPOCH: 0 57 | IMAGE_FROZEN_BACKBONE_STAGES: 58 | - 1 59 | - 2 60 | IMAGE_FROZEN_BN: true 61 | IMAGE_FINAL_DIM: 1024 62 | IMAGE_SEMANTIC: false 63 | OUTPUT_CONV5: false 64 | BERT_MODEL_NAME: './model/pretrained_model/bert-large-uncased' 65 | BERT_PRETRAINED: '' 66 | BERT_PRETRAINED_EPOCH: 0 67 | BERT_FROZEN: false 68 | ENABLE_CNN_REG_LOSS: true 69 | ANS_LOSS_WEIGHT: 1.0 70 | CNN_LOSS_TOP: true 71 | 72 | VLBERT: 73 | input_transform_type: 1 74 | visual_size: 1024 75 | hidden_size: 1024 76 | num_hidden_layers: 24 77 | num_attention_heads: 16 78 | intermediate_size: 4096 79 | hidden_act: "gelu" 80 | hidden_dropout_prob: 0.1 81 | attention_probs_dropout_prob: 0.1 82 | max_position_embeddings: 512 83 | type_vocab_size: 3 84 | vocab_size: 30522 85 | initializer_range: 0.02 86 | visual_scale_text_init: 0.0 87 | visual_scale_object_init: 0.0 88 | visual_ln: true 89 | object_word_embed_mode: 2 90 | 91 | CLASSIFIER_TYPE: "1fc" 92 | CLASSIFIER_HIDDEN_SIZE: 1024 93 | CLASSIFIER_DROPOUT: 0.1 94 | CLASSIFIER_SIGMOID: true 95 | 96 | TRAIN: 97 | SHUFFLE: true 98 | FLIP_PROB: 0.5 99 | BATCH_IMAGES: 4 100 | ASPECT_GROUPING: false 101 | RESUME: false 102 | AUTO_RESUME: true 103 | BEGIN_EPOCH: 0 104 | END_EPOCH: 20 105 | OPTIMIZER: 'SGD' 106 | CLIP_GRAD_NORM: 10 107 | GRAD_ACCUMULATE_STEPS: 4 108 | LR_FACTOR: 0.1 109 | LR_STEP: "14,18" 110 | LR: 7.0e-5 111 | WD: 0.0001 112 | WARMUP: true 113 | WARMUP_METHOD: 'linear' 114 | WARMUP_FACTOR: 0.0 115 | WARMUP_STEPS: 1000 116 | MOMENTUM: 0.9 117 | FP16: true 118 | FP16_LOSS_SCALE: 'dynamic' 119 | 120 | VAL: 121 | SHUFFLE: false 122 | FLIP_PROB: 0 123 | BATCH_IMAGES: 4 124 | 125 | TEST: 126 | SHUFFLE: false 127 | FLIP_PROB: 0 128 | TEST_EPOCH: 0 129 | BATCH_IMAGES: 4 -------------------------------------------------------------------------------- /cfgs/vcr/large_qa2r_16x16G_fp16.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | RNG_SEED: 12345 3 | OUTPUT_PATH: './output/vl-bert/vcr' 4 | MODULE: ResNetVLBERT 5 | GPUS: '0,1,2,3' 6 | LOG_FREQUENT: 100 7 | VAL_FREQUENT: 1 8 | CHECKPOINT_FREQUENT: 1 9 | MODEL_PREFIX: 'vl-bert_large_r_res101' 10 | NUM_WORKERS_PER_GPU: 4 11 | SCALES: 12 | - 600 13 | - 1200 14 | 15 | DATASET: 16 | DATASET: vcr 17 | LABEL_INDEX_IN_BATCH: 7 18 | APPEND_INDEX: false 19 | TASK: 'QA2R' 20 | BASIC_ALIGN: false 21 | DATASET_PATH: './data/vcr' 22 | ROOT_PATH: './' 23 | TRAIN_IMAGE_SET: 'vcr1images' 24 | VAL_IMAGE_SET: 'vcr1images' 25 | TEST_IMAGE_SET: 'vcr1images' 26 | TRAIN_ANNOTATION_FILE: 'train.jsonl' 27 | VAL_ANNOTATION_FILE: 'val.jsonl' 28 | TEST_ANNOTATION_FILE: 'test.jsonl' 29 | ONLY_USE_RELEVANT_DETS: false 30 | ADD_IMAGE_AS_A_BOX: true 31 | ZIP_MODE: false 32 | CACHE_MODE: false 33 | IGNORE_DB_CACHE: true 34 | MASK_SIZE: 14 35 | 36 | NETWORK: 37 | PARTIAL_PRETRAIN: 
"./model/pretrained_model/vl-bert-large-e2e.model" 38 | PARTIAL_PRETRAIN_PREFIX_CHANGES: 39 | - "vlbert.mvrc_head.transform->cnn_loss_reg.0" 40 | - "module.vlbert.mvrc_head.transform->module.cnn_loss_reg.0" 41 | - "module.vlbert->module.vlbert._module" 42 | - "vlbert->vlbert._module" 43 | PARTIAL_PRETRAIN_SEGMB_INIT: true 44 | IMAGE_NUM_LAYERS: 101 45 | IMAGE_C5_DILATED: true 46 | IMAGE_STRIDE_IN_1x1: true 47 | PIXEL_MEANS: 48 | - 102.9801 49 | - 115.9465 50 | - 122.7717 51 | PIXEL_STDS: 52 | - 1.0 53 | - 1.0 54 | - 1.0 55 | IMAGE_PRETRAINED: './model/pretrained_model/resnet101-pt-vgbua' 56 | IMAGE_PRETRAINED_EPOCH: 0 57 | IMAGE_FROZEN_BACKBONE_STAGES: 58 | - 1 59 | - 2 60 | IMAGE_FROZEN_BN: true 61 | IMAGE_FINAL_DIM: 1024 62 | IMAGE_SEMANTIC: false 63 | OUTPUT_CONV5: false 64 | BERT_MODEL_NAME: './model/pretrained_model/bert-large-uncased' 65 | BERT_PRETRAINED: '' 66 | BERT_PRETRAINED_EPOCH: 0 67 | BERT_FROZEN: false 68 | ENABLE_CNN_REG_LOSS: true 69 | ANS_LOSS_WEIGHT: 1.0 70 | CNN_LOSS_TOP: true 71 | 72 | VLBERT: 73 | input_transform_type: 1 74 | visual_size: 1024 75 | hidden_size: 1024 76 | num_hidden_layers: 24 77 | num_attention_heads: 16 78 | intermediate_size: 4096 79 | hidden_act: "gelu" 80 | hidden_dropout_prob: 0.1 81 | attention_probs_dropout_prob: 0.1 82 | max_position_embeddings: 512 83 | type_vocab_size: 3 84 | vocab_size: 30522 85 | initializer_range: 0.02 86 | visual_scale_text_init: 0.0 87 | visual_scale_object_init: 0.0 88 | visual_ln: true 89 | object_word_embed_mode: 2 90 | 91 | CLASSIFIER_TYPE: "1fc" 92 | CLASSIFIER_HIDDEN_SIZE: 1024 93 | CLASSIFIER_DROPOUT: 0.1 94 | CLASSIFIER_SIGMOID: true 95 | 96 | TRAIN: 97 | SHUFFLE: true 98 | FLIP_PROB: 0.5 99 | BATCH_IMAGES: 4 100 | ASPECT_GROUPING: false 101 | RESUME: false 102 | AUTO_RESUME: true 103 | BEGIN_EPOCH: 0 104 | END_EPOCH: 20 105 | OPTIMIZER: 'SGD' 106 | CLIP_GRAD_NORM: 10 107 | LR_FACTOR: 0.1 108 | LR_STEP: "14,18" 109 | LR: 7.0e-5 110 | WD: 0.0001 111 | WARMUP: true 112 | WARMUP_METHOD: 'linear' 113 | WARMUP_FACTOR: 0.0 114 | WARMUP_STEPS: 1000 115 | MOMENTUM: 0.9 116 | FP16: true 117 | FP16_LOSS_SCALE: 'dynamic' 118 | 119 | VAL: 120 | SHUFFLE: false 121 | FLIP_PROB: 0 122 | BATCH_IMAGES: 4 123 | 124 | TEST: 125 | SHUFFLE: false 126 | FLIP_PROB: 0 127 | TEST_EPOCH: 0 128 | BATCH_IMAGES: 4 -------------------------------------------------------------------------------- /cfgs/vcr/large_qa2r_4x16G_fp16.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | RNG_SEED: 12345 3 | OUTPUT_PATH: './output/vl-bert/vcr' 4 | MODULE: ResNetVLBERT 5 | GPUS: '0,1,2,3' 6 | LOG_FREQUENT: 100 7 | VAL_FREQUENT: 1 8 | CHECKPOINT_FREQUENT: 1 9 | MODEL_PREFIX: 'vl-bert_large_r_res101' 10 | NUM_WORKERS_PER_GPU: 4 11 | SCALES: 12 | - 600 13 | - 1200 14 | 15 | DATASET: 16 | DATASET: vcr 17 | LABEL_INDEX_IN_BATCH: 7 18 | APPEND_INDEX: false 19 | TASK: 'QA2R' 20 | BASIC_ALIGN: false 21 | DATASET_PATH: './data/vcr' 22 | ROOT_PATH: './' 23 | TRAIN_IMAGE_SET: 'vcr1images' 24 | VAL_IMAGE_SET: 'vcr1images' 25 | TEST_IMAGE_SET: 'vcr1images' 26 | TRAIN_ANNOTATION_FILE: 'train.jsonl' 27 | VAL_ANNOTATION_FILE: 'val.jsonl' 28 | TEST_ANNOTATION_FILE: 'test.jsonl' 29 | ONLY_USE_RELEVANT_DETS: false 30 | ADD_IMAGE_AS_A_BOX: true 31 | ZIP_MODE: false 32 | CACHE_MODE: false 33 | IGNORE_DB_CACHE: true 34 | MASK_SIZE: 14 35 | 36 | NETWORK: 37 | PARTIAL_PRETRAIN: "./model/pretrained_model/vl-bert-large-e2e.model" 38 | PARTIAL_PRETRAIN_PREFIX_CHANGES: 39 | - "vlbert.mvrc_head.transform->cnn_loss_reg.0" 40 | - 
"module.vlbert.mvrc_head.transform->module.cnn_loss_reg.0" 41 | - "module.vlbert->module.vlbert._module" 42 | - "vlbert->vlbert._module" 43 | PARTIAL_PRETRAIN_SEGMB_INIT: true 44 | IMAGE_NUM_LAYERS: 101 45 | IMAGE_C5_DILATED: true 46 | IMAGE_STRIDE_IN_1x1: true 47 | PIXEL_MEANS: 48 | - 102.9801 49 | - 115.9465 50 | - 122.7717 51 | PIXEL_STDS: 52 | - 1.0 53 | - 1.0 54 | - 1.0 55 | IMAGE_PRETRAINED: './model/pretrained_model/resnet101-pt-vgbua' 56 | IMAGE_PRETRAINED_EPOCH: 0 57 | IMAGE_FROZEN_BACKBONE_STAGES: 58 | - 1 59 | - 2 60 | IMAGE_FROZEN_BN: true 61 | IMAGE_FINAL_DIM: 1024 62 | IMAGE_SEMANTIC: false 63 | OUTPUT_CONV5: false 64 | BERT_MODEL_NAME: './model/pretrained_model/bert-large-uncased' 65 | BERT_PRETRAINED: '' 66 | BERT_PRETRAINED_EPOCH: 0 67 | BERT_FROZEN: false 68 | ENABLE_CNN_REG_LOSS: true 69 | ANS_LOSS_WEIGHT: 1.0 70 | CNN_LOSS_TOP: true 71 | 72 | VLBERT: 73 | input_transform_type: 1 74 | visual_size: 1024 75 | hidden_size: 1024 76 | num_hidden_layers: 24 77 | num_attention_heads: 16 78 | intermediate_size: 4096 79 | hidden_act: "gelu" 80 | hidden_dropout_prob: 0.1 81 | attention_probs_dropout_prob: 0.1 82 | max_position_embeddings: 512 83 | type_vocab_size: 3 84 | vocab_size: 30522 85 | initializer_range: 0.02 86 | visual_scale_text_init: 0.0 87 | visual_scale_object_init: 0.0 88 | visual_ln: true 89 | object_word_embed_mode: 2 90 | 91 | CLASSIFIER_TYPE: "1fc" 92 | CLASSIFIER_HIDDEN_SIZE: 1024 93 | CLASSIFIER_DROPOUT: 0.1 94 | CLASSIFIER_SIGMOID: true 95 | 96 | TRAIN: 97 | SHUFFLE: true 98 | FLIP_PROB: 0.5 99 | BATCH_IMAGES: 4 100 | ASPECT_GROUPING: false 101 | RESUME: false 102 | AUTO_RESUME: true 103 | BEGIN_EPOCH: 0 104 | END_EPOCH: 20 105 | OPTIMIZER: 'SGD' 106 | CLIP_GRAD_NORM: 10 107 | GRAD_ACCUMULATE_STEPS: 4 108 | LR_FACTOR: 0.1 109 | LR_STEP: "14,18" 110 | LR: 7.0e-5 111 | WD: 0.0001 112 | WARMUP: true 113 | WARMUP_METHOD: 'linear' 114 | WARMUP_FACTOR: 0.0 115 | WARMUP_STEPS: 1000 116 | MOMENTUM: 0.9 117 | FP16: true 118 | FP16_LOSS_SCALE: 'dynamic' 119 | 120 | VAL: 121 | SHUFFLE: false 122 | FLIP_PROB: 0 123 | BATCH_IMAGES: 4 124 | 125 | TEST: 126 | SHUFFLE: false 127 | FLIP_PROB: 0 128 | TEST_EPOCH: 0 129 | BATCH_IMAGES: 4 -------------------------------------------------------------------------------- /cfgs/vqa/base_4x16G_fp32.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | RNG_SEED: 12345 3 | OUTPUT_PATH: './output/vl-bert/vqa' 4 | MODULE: ResNetVLBERT 5 | GPUS: '0,1,2,3' 6 | LOG_FREQUENT: 100 7 | VAL_FREQUENT: 1 8 | CHECKPOINT_FREQUENT: 1 9 | MODEL_PREFIX: 'vl-bert_base_res101_vqa' 10 | NUM_WORKERS_PER_GPU: 4 11 | SCALES: 12 | - 600 13 | - 1000 14 | 15 | DATASET: 16 | DATASET: vqa 17 | ANSWER_VOCAB_FILE: './data/coco/vqa/answers_vqa.txt' 18 | LABEL_INDEX_IN_BATCH: -1 19 | APPEND_INDEX: false 20 | DATASET_PATH: './data/coco' 21 | ROOT_PATH: './' 22 | TRAIN_IMAGE_SET: 'train2014+val2014' 23 | VAL_IMAGE_SET: 'val2014' 24 | TEST_IMAGE_SET: 'test2015' 25 | ADD_IMAGE_AS_A_BOX: true 26 | ZIP_MODE: false 27 | CACHE_MODE: false 28 | IGNORE_DB_CACHE: false 29 | MASK_SIZE: 14 30 | BOXES: "10-100ada" 31 | USE_IMDB: false 32 | 33 | NETWORK: 34 | PARTIAL_PRETRAIN: "./model/pretrained_model/vl-bert-base-prec.model" 35 | PARTIAL_PRETRAIN_PREFIX_CHANGES: 36 | - "vlbert.mlm_head.predictions.transform->final_mlp.0" 37 | - "module.vlbert.mlm_head.predictions.transform->module.final_mlp.0" 38 | - "vlbert->vlbert" 39 | - "module.vlbert->module.vlbert" 40 | IMAGE_NUM_LAYERS: 101 41 | IMAGE_C5_DILATED: true 42 | IMAGE_STRIDE_IN_1x1: 
true 43 | PIXEL_MEANS: 44 | - 102.9801 45 | - 115.9465 46 | - 122.7717 47 | PIXEL_STDS: 48 | - 1.0 49 | - 1.0 50 | - 1.0 51 | IMAGE_FEAT_PRECOMPUTED: true 52 | IMAGE_PRETRAINED: '' 53 | IMAGE_PRETRAINED_EPOCH: 0 54 | IMAGE_FROZEN_BACKBONE_STAGES: 55 | - 1 56 | - 2 57 | IMAGE_FROZEN_BN: true 58 | IMAGE_FINAL_DIM: 768 59 | IMAGE_SEMANTIC: false 60 | OUTPUT_CONV5: false 61 | BERT_MODEL_NAME: './model/pretrained_model/bert-base-uncased' 62 | BERT_PRETRAINED: '' 63 | BERT_PRETRAINED_EPOCH: 0 64 | BERT_FROZEN: false 65 | ENABLE_CNN_REG_LOSS: false 66 | 67 | VLBERT: 68 | input_transform_type: 1 69 | visual_size: 768 70 | hidden_size: 768 71 | num_hidden_layers: 12 72 | num_attention_heads: 12 73 | intermediate_size: 3072 74 | hidden_act: "gelu" 75 | hidden_dropout_prob: 0.1 76 | attention_probs_dropout_prob: 0.1 77 | max_position_embeddings: 512 78 | type_vocab_size: 3 79 | vocab_size: 30522 80 | initializer_range: 0.02 81 | visual_scale_text_init: 0.0 82 | visual_scale_object_init: 0.0 83 | visual_ln: true 84 | 85 | CLASSIFIER_TYPE: "mlm" 86 | CLASSIFIER_PRETRAINED: true 87 | CLASSIFIER_DROPOUT: 0.1 88 | 89 | TRAIN: 90 | SHUFFLE: true 91 | FLIP_PROB: 0.5 92 | BATCH_IMAGES: 64 93 | ASPECT_GROUPING: false 94 | RESUME: false 95 | AUTO_RESUME: true 96 | BEGIN_EPOCH: 0 97 | END_EPOCH: 5 98 | OPTIMIZER: 'AdamW' 99 | CLIP_GRAD_NORM: 1.0 100 | LR: 6.25e-7 101 | LR_SCHEDULE: 'triangle' 102 | WD: 0.0001 103 | WARMUP: true 104 | WARMUP_METHOD: 'linear' 105 | WARMUP_FACTOR: 0.0 106 | WARMUP_STEPS: 500 107 | FP16: false 108 | FP16_LOSS_SCALE: 128.0 109 | 110 | VAL: 111 | SHUFFLE: false 112 | FLIP_PROB: 0 113 | BATCH_IMAGES: 64 114 | 115 | TEST: 116 | SHUFFLE: false 117 | FLIP_PROB: 0 118 | TEST_EPOCH: 0 119 | BATCH_IMAGES: 64 -------------------------------------------------------------------------------- /cfgs/vqa/large_4x16G_fp32.yaml: -------------------------------------------------------------------------------- 1 | --- 2 | RNG_SEED: 12345 3 | OUTPUT_PATH: './output/vl-bert/vqa' 4 | MODULE: ResNetVLBERT 5 | GPUS: '0,1,2,3' 6 | LOG_FREQUENT: 100 7 | VAL_FREQUENT: 1 8 | CHECKPOINT_FREQUENT: 1 9 | MODEL_PREFIX: 'vl-bert_large_res101_vqa' 10 | NUM_WORKERS_PER_GPU: 4 11 | SCALES: 12 | - 600 13 | - 1000 14 | 15 | DATASET: 16 | DATASET: vqa 17 | ANSWER_VOCAB_FILE: './data/coco/vqa/answers_vqa.txt' 18 | LABEL_INDEX_IN_BATCH: -1 19 | APPEND_INDEX: false 20 | DATASET_PATH: './data/coco' 21 | ROOT_PATH: './' 22 | TRAIN_IMAGE_SET: 'train2014+val2014' 23 | VAL_IMAGE_SET: 'val2014' 24 | TEST_IMAGE_SET: 'test2015' 25 | ADD_IMAGE_AS_A_BOX: true 26 | ZIP_MODE: false 27 | CACHE_MODE: false 28 | IGNORE_DB_CACHE: false 29 | MASK_SIZE: 14 30 | BOXES: "10-100ada" 31 | USE_IMDB: false 32 | 33 | NETWORK: 34 | PARTIAL_PRETRAIN: "./model/pretrained_model/vl-bert-large-prec.model" 35 | PARTIAL_PRETRAIN_PREFIX_CHANGES: 36 | - "vlbert.mlm_head.predictions.transform->final_mlp.0" 37 | - "module.vlbert.mlm_head.predictions.transform->module.final_mlp.0" 38 | - "vlbert->vlbert" 39 | - "module.vlbert->module.vlbert" 40 | IMAGE_NUM_LAYERS: 101 41 | IMAGE_C5_DILATED: true 42 | IMAGE_STRIDE_IN_1x1: true 43 | PIXEL_MEANS: 44 | - 102.9801 45 | - 115.9465 46 | - 122.7717 47 | PIXEL_STDS: 48 | - 1.0 49 | - 1.0 50 | - 1.0 51 | IMAGE_FEAT_PRECOMPUTED: true 52 | IMAGE_PRETRAINED: '' 53 | IMAGE_PRETRAINED_EPOCH: 0 54 | IMAGE_FROZEN_BACKBONE_STAGES: 55 | - 1 56 | - 2 57 | IMAGE_FROZEN_BN: true 58 | IMAGE_FINAL_DIM: 1024 59 | IMAGE_SEMANTIC: false 60 | OUTPUT_CONV5: false 61 | BERT_MODEL_NAME: './model/pretrained_model/bert-large-uncased' 62 | 
BERT_PRETRAINED: '' 63 | BERT_PRETRAINED_EPOCH: 0 64 | BERT_FROZEN: false 65 | ENABLE_CNN_REG_LOSS: false 66 | 67 | VLBERT: 68 | input_transform_type: 1 69 | visual_size: 1024 70 | hidden_size: 1024 71 | num_hidden_layers: 24 72 | num_attention_heads: 16 73 | intermediate_size: 4096 74 | hidden_act: "gelu" 75 | hidden_dropout_prob: 0.1 76 | attention_probs_dropout_prob: 0.1 77 | max_position_embeddings: 512 78 | type_vocab_size: 3 79 | vocab_size: 30522 80 | initializer_range: 0.02 81 | visual_scale_text_init: 0.0 82 | visual_scale_object_init: 0.0 83 | visual_ln: true 84 | 85 | CLASSIFIER_TYPE: "mlm" 86 | CLASSIFIER_PRETRAINED: true 87 | CLASSIFIER_DROPOUT: 0.1 88 | 89 | TRAIN: 90 | SHUFFLE: true 91 | FLIP_PROB: 0.5 92 | BATCH_IMAGES: 16 93 | ASPECT_GROUPING: false 94 | RESUME: false 95 | AUTO_RESUME: true 96 | BEGIN_EPOCH: 0 97 | END_EPOCH: 5 98 | OPTIMIZER: 'AdamW' 99 | CLIP_GRAD_NORM: 1.0 100 | GRAD_ACCUMULATE_STEPS: 4 101 | LR: 6.25e-7 102 | LR_SCHEDULE: 'triangle' 103 | WD: 0.0001 104 | WARMUP: true 105 | WARMUP_METHOD: 'linear' 106 | WARMUP_FACTOR: 0.0 107 | WARMUP_STEPS: 500 108 | FP16: false 109 | FP16_LOSS_SCALE: 128.0 110 | 111 | VAL: 112 | SHUFFLE: false 113 | FLIP_PROB: 0 114 | BATCH_IMAGES: 16 115 | 116 | TEST: 117 | SHUFFLE: false 118 | FLIP_PROB: 0 119 | TEST_EPOCH: 0 120 | BATCH_IMAGES: 16 -------------------------------------------------------------------------------- /common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackroos/VL-BERT/4373674cbf2bcd6c09a2c26abfdb6705b870e3be/common/__init__.py -------------------------------------------------------------------------------- /common/backbone/__init__.py: -------------------------------------------------------------------------------- 1 | from .resnet import resnet18, resnet34, resnet50, resnet101, resnet152 2 | -------------------------------------------------------------------------------- /common/backbone/resnet/__init__.py: -------------------------------------------------------------------------------- 1 | from .resnet import * 2 | -------------------------------------------------------------------------------- /common/callbacks/batch_end_callbacks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackroos/VL-BERT/4373674cbf2bcd6c09a2c26abfdb6705b870e3be/common/callbacks/batch_end_callbacks/__init__.py -------------------------------------------------------------------------------- /common/callbacks/epoch_end_callbacks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackroos/VL-BERT/4373674cbf2bcd6c09a2c26abfdb6705b870e3be/common/callbacks/epoch_end_callbacks/__init__.py -------------------------------------------------------------------------------- /common/callbacks/epoch_end_callbacks/checkpoint.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class Checkpoint(object): 5 | def __init__(self, prefix, frequent): 6 | super(Checkpoint, self).__init__() 7 | self.prefix = prefix 8 | self.frequent = frequent 9 | 10 | def __call__(self, epoch_num, net, optimizer, writer, validation_monitor=None): 11 | if (epoch_num + 1) % self.frequent == 0: 12 | param_name = '{}-{:04d}.model'.format(self.prefix, epoch_num) 13 | checkpoint_dict = dict() 14 | checkpoint_dict['state_dict'] = net.state_dict() 15 | checkpoint_dict['optimizer'] 
= optimizer.state_dict() 16 | save_to_best = False 17 | if validation_monitor is not None: 18 | checkpoint_dict['validation_monitor'] = validation_monitor.state_dict() 19 | if validation_monitor.best_epoch == epoch_num: 20 | save_to_best = True 21 | torch.save(checkpoint_dict, param_name) 22 | if save_to_best: 23 | best_param_name = '{}-best.model'.format(self.prefix) 24 | torch.save(checkpoint_dict, best_param_name) 25 | print('Save new best model to {}.'.format(best_param_name)) 26 | -------------------------------------------------------------------------------- /common/callbacks/epoch_end_callbacks/validation_monitor.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import shutil 3 | 4 | 5 | class ValidationMonitor(object): 6 | def __init__(self, val_func, val_loader, metrics, host_metric_name='Acc', label_index_in_batch=-1): 7 | super(ValidationMonitor, self).__init__() 8 | self.val_func = val_func 9 | self.val_loader = val_loader 10 | self.metrics = metrics 11 | self.host_metric_name = host_metric_name 12 | self.best_epoch = -1 13 | self.best_val = -1.0 14 | self.label_index_in_batch = label_index_in_batch 15 | 16 | def state_dict(self): 17 | return {'best_epoch': self.best_epoch, 18 | 'best_val': self.best_val} 19 | 20 | def load_state_dict(self, state_dict): 21 | assert 'best_epoch' in state_dict, 'miss key \'best_epoch\'' 22 | assert 'best_val' in state_dict, 'miss key \'best_val\'' 23 | self.best_epoch = state_dict['best_epoch'] 24 | self.best_val = state_dict['best_val'] 25 | 26 | def __call__(self, epoch_num, net, optimizer, writer): 27 | self.val_func(net, self.val_loader, self.metrics, self.label_index_in_batch) 28 | 29 | name, value = self.metrics.get() 30 | s = "Epoch[%d] \tVal-" % (epoch_num) 31 | for n, v in zip(name, value): 32 | if n == self.host_metric_name and v > self.best_val: 33 | self.best_epoch = epoch_num 34 | self.best_val = v 35 | logging.info('New Best Val {}: {}, Epoch: {}'.format(self.host_metric_name, self.best_val, self.best_epoch)) 36 | print('New Best Val {}: {}, Epoch: {}'.format(self.host_metric_name, self.best_val, self.best_epoch)) 37 | s += "%s=%f,\t" % (n, v) 38 | if writer is not None: 39 | writer.add_scalar(tag='Val-' + n, 40 | scalar_value=v, 41 | global_step=epoch_num + 1) 42 | logging.info(s) 43 | print(s) 44 | 45 | logging.info('Best Val {}: {}, Epoch: {}'.format(self.host_metric_name, self.best_val, self.best_epoch)) 46 | print('Best Val {}: {}, Epoch: {}'.format(self.host_metric_name, self.best_val, self.best_epoch)) 47 | 48 | 49 | 50 | 51 | 52 | -------------------------------------------------------------------------------- /common/lib/roi_pooling/ROIAlign.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
2 | #pragma once 3 | 4 | #include "cpu/vision.h" 5 | 6 | #ifdef WITH_CUDA 7 | #include "cuda/vision.h" 8 | #endif 9 | 10 | // Interface for Python 11 | at::Tensor ROIAlign_forward(const at::Tensor& input, 12 | const at::Tensor& rois, 13 | const float spatial_scale, 14 | const int pooled_height, 15 | const int pooled_width, 16 | const int sampling_ratio) { 17 | if (input.type().is_cuda()) { 18 | #ifdef WITH_CUDA 19 | return ROIAlign_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio); 20 | #else 21 | AT_ERROR("Not compiled with GPU support"); 22 | #endif 23 | } 24 | return ROIAlign_forward_cpu(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio); 25 | } 26 | 27 | at::Tensor ROIAlign_backward(const at::Tensor& grad, 28 | const at::Tensor& rois, 29 | const float spatial_scale, 30 | const int pooled_height, 31 | const int pooled_width, 32 | const int batch_size, 33 | const int channels, 34 | const int height, 35 | const int width, 36 | const int sampling_ratio) { 37 | if (grad.type().is_cuda()) { 38 | #ifdef WITH_CUDA 39 | return ROIAlign_backward_cuda(grad, rois, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width, sampling_ratio); 40 | #else 41 | AT_ERROR("Not compiled with GPU support"); 42 | #endif 43 | } 44 | AT_ERROR("Not implemented on the CPU"); 45 | } 46 | 47 | -------------------------------------------------------------------------------- /common/lib/roi_pooling/ROIPool.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | #pragma once 3 | 4 | #include "cpu/vision.h" 5 | 6 | #ifdef WITH_CUDA 7 | #include "cuda/vision.h" 8 | #endif 9 | 10 | 11 | std::tuple ROIPool_forward(const at::Tensor& input, 12 | const at::Tensor& rois, 13 | const float spatial_scale, 14 | const int pooled_height, 15 | const int pooled_width) { 16 | if (input.type().is_cuda()) { 17 | #ifdef WITH_CUDA 18 | return ROIPool_forward_cuda(input, rois, spatial_scale, pooled_height, pooled_width); 19 | #else 20 | AT_ERROR("Not compiled with GPU support"); 21 | #endif 22 | } 23 | AT_ERROR("Not implemented on the CPU"); 24 | } 25 | 26 | at::Tensor ROIPool_backward(const at::Tensor& grad, 27 | const at::Tensor& input, 28 | const at::Tensor& rois, 29 | const at::Tensor& argmax, 30 | const float spatial_scale, 31 | const int pooled_height, 32 | const int pooled_width, 33 | const int batch_size, 34 | const int channels, 35 | const int height, 36 | const int width) { 37 | if (grad.type().is_cuda()) { 38 | #ifdef WITH_CUDA 39 | return ROIPool_backward_cuda(grad, input, rois, argmax, spatial_scale, pooled_height, pooled_width, batch_size, channels, height, width); 40 | #else 41 | AT_ERROR("Not compiled with GPU support"); 42 | #endif 43 | } 44 | AT_ERROR("Not implemented on the CPU"); 45 | } 46 | 47 | 48 | 49 | -------------------------------------------------------------------------------- /common/lib/roi_pooling/__init__.py: -------------------------------------------------------------------------------- 1 | from .roi_align import ROIAlign 2 | from .roi_pool import ROIPool -------------------------------------------------------------------------------- /common/lib/roi_pooling/cpu/vision.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
2 | #pragma once 3 | #include 4 | 5 | 6 | at::Tensor ROIAlign_forward_cpu(const at::Tensor& input, 7 | const at::Tensor& rois, 8 | const float spatial_scale, 9 | const int pooled_height, 10 | const int pooled_width, 11 | const int sampling_ratio); 12 | 13 | 14 | -------------------------------------------------------------------------------- /common/lib/roi_pooling/cuda/vision.h: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | #pragma once 3 | #include 4 | 5 | 6 | at::Tensor ROIAlign_forward_cuda(const at::Tensor& input, 7 | const at::Tensor& rois, 8 | const float spatial_scale, 9 | const int pooled_height, 10 | const int pooled_width, 11 | const int sampling_ratio); 12 | 13 | at::Tensor ROIAlign_backward_cuda(const at::Tensor& grad, 14 | const at::Tensor& rois, 15 | const float spatial_scale, 16 | const int pooled_height, 17 | const int pooled_width, 18 | const int batch_size, 19 | const int channels, 20 | const int height, 21 | const int width, 22 | const int sampling_ratio); 23 | 24 | 25 | std::tuple ROIPool_forward_cuda(const at::Tensor& input, 26 | const at::Tensor& rois, 27 | const float spatial_scale, 28 | const int pooled_height, 29 | const int pooled_width); 30 | 31 | at::Tensor ROIPool_backward_cuda(const at::Tensor& grad, 32 | const at::Tensor& input, 33 | const at::Tensor& rois, 34 | const at::Tensor& argmax, 35 | const float spatial_scale, 36 | const int pooled_height, 37 | const int pooled_width, 38 | const int batch_size, 39 | const int channels, 40 | const int height, 41 | const int width); 42 | 43 | at::Tensor nms_cuda(const at::Tensor boxes, float nms_overlap_thresh); 44 | 45 | 46 | at::Tensor compute_flow_cuda(const at::Tensor& boxes, 47 | const int height, 48 | const int width); 49 | -------------------------------------------------------------------------------- /common/lib/roi_pooling/debug.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from roi_pool import ROIPool 3 | from roi_align import ROIAlign 4 | 5 | align = ROIAlign(output_size=(3, 3), spatial_scale=1.0, sampling_ratio=1) 6 | pool = ROIPool(output_size=(3, 3), spatial_scale=1.0) 7 | 8 | device = torch.device("cuda:0") 9 | 10 | feature = torch.arange(81*2*3).view((2,3,9,9)).float().to(device) 11 | rois = torch.Tensor([[0,0,0,9,9],[1,0,0,9,9],[1,0,0,7,7]]).to(device) 12 | 13 | pooled = pool(feature,rois) 14 | aligned = align(feature,rois) 15 | 16 | import IPython 17 | IPython.embed() 18 | -------------------------------------------------------------------------------- /common/lib/roi_pooling/roi_align.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import torch 3 | from torch import nn 4 | from torch.autograd import Function 5 | from torch.autograd.function import once_differentiable 6 | from torch.nn.modules.utils import _pair 7 | 8 | from . 
import C_ROIPooling 9 | 10 | 11 | class _ROIAlign(Function): 12 | @staticmethod 13 | def forward(ctx, input, rois, output_size, spatial_scale, sampling_ratio): 14 | ctx.save_for_backward(rois) 15 | ctx.output_size = _pair(output_size) 16 | ctx.spatial_scale = spatial_scale 17 | ctx.sampling_ratio = sampling_ratio 18 | ctx.input_shape = input.size() 19 | output = C_ROIPooling.roi_align_forward( 20 | input, rois, spatial_scale, output_size[0], output_size[1], sampling_ratio 21 | ) 22 | return output 23 | 24 | @staticmethod 25 | @once_differentiable 26 | def backward(ctx, grad_output): 27 | rois, = ctx.saved_tensors 28 | output_size = ctx.output_size 29 | spatial_scale = ctx.spatial_scale 30 | sampling_ratio = ctx.sampling_ratio 31 | bs, ch, h, w = ctx.input_shape 32 | grad_input = C_ROIPooling.roi_align_backward( 33 | grad_output, 34 | rois, 35 | spatial_scale, 36 | output_size[0], 37 | output_size[1], 38 | bs, 39 | ch, 40 | h, 41 | w, 42 | sampling_ratio, 43 | ) 44 | return grad_input, None, None, None, None 45 | 46 | 47 | roi_align = _ROIAlign.apply 48 | 49 | 50 | class ROIAlign(nn.Module): 51 | def __init__(self, output_size, spatial_scale, sampling_ratio=1): 52 | """ 53 | :param output_size: e.g. (3,3) 54 | :param spatial_scale: e.g. 1.0/16 55 | :param sampling_ratio: e.g. 1 56 | """ 57 | super(ROIAlign, self).__init__() 58 | self.output_size = output_size 59 | self.spatial_scale = spatial_scale 60 | self.sampling_ratio = sampling_ratio 61 | 62 | def forward(self, input, rois): 63 | """ 64 | :param input: the input features [B C H W] 65 | :param rois: [k, 5]: (im_index, x1, y1, x2, y2) 66 | :return: pooled features [K C H W], K = k 67 | """ 68 | return roi_align( 69 | input.float(), rois.float(), self.output_size, self.spatial_scale, self.sampling_ratio 70 | ) 71 | 72 | def __repr__(self): 73 | tmpstr = self.__class__.__name__ + "(" 74 | tmpstr += "output_size=" + str(self.output_size) 75 | tmpstr += ", spatial_scale=" + str(self.spatial_scale) 76 | tmpstr += ", sampling_ratio=" + str(self.sampling_ratio) 77 | tmpstr += ")" 78 | return tmpstr 79 | -------------------------------------------------------------------------------- /common/lib/roi_pooling/roi_pool.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | import torch 3 | from torch import nn 4 | from torch.autograd import Function 5 | from torch.autograd.function import once_differentiable 6 | from torch.nn.modules.utils import _pair 7 | 8 | from . 
import C_ROIPooling 9 | 10 | 11 | class _ROIPool(Function): 12 | @staticmethod 13 | def forward(ctx, input, rois, output_size, spatial_scale): 14 | ctx.output_size = _pair(output_size) 15 | ctx.spatial_scale = spatial_scale 16 | ctx.input_shape = input.size() 17 | output, argmax = C_ROIPooling.roi_pool_forward( 18 | input, rois, spatial_scale, output_size[0], output_size[1] 19 | ) 20 | ctx.save_for_backward(input, rois, argmax) 21 | return output 22 | 23 | @staticmethod 24 | @once_differentiable 25 | def backward(ctx, grad_output): 26 | input, rois, argmax = ctx.saved_tensors 27 | output_size = ctx.output_size 28 | spatial_scale = ctx.spatial_scale 29 | bs, ch, h, w = ctx.input_shape 30 | grad_input = C_ROIPooling.roi_pool_backward( 31 | grad_output, 32 | input, 33 | rois, 34 | argmax, 35 | spatial_scale, 36 | output_size[0], 37 | output_size[1], 38 | bs, 39 | ch, 40 | h, 41 | w, 42 | ) 43 | return grad_input, None, None, None 44 | 45 | 46 | roi_pool = _ROIPool.apply 47 | 48 | 49 | class ROIPool(nn.Module): 50 | def __init__(self, output_size, spatial_scale): 51 | """ 52 | :param output_size: e.g. (3,3) 53 | :param spatial_scale: e.g. 1.0/16 54 | """ 55 | super(ROIPool, self).__init__() 56 | self.output_size = output_size 57 | self.spatial_scale = spatial_scale 58 | 59 | def forward(self, input, rois): 60 | """ 61 | :param input: the input features [B C H W] 62 | :param rois: [k, 5] : (im_index, x1, y1, x2, y2) 63 | :return: pooled features (K C H W), K = k 64 | """ 65 | return roi_pool(input.float(), rois.float(), self.output_size, self.spatial_scale) 66 | 67 | def __repr__(self): 68 | tmpstr = self.__class__.__name__ + "(" 69 | tmpstr += "output_size=" + str(self.output_size) 70 | tmpstr += ", spatial_scale=" + str(self.spatial_scale) 71 | tmpstr += ")" 72 | return tmpstr 73 | -------------------------------------------------------------------------------- /common/lib/roi_pooling/setup.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 
2 | #!/usr/bin/env python 3 | 4 | import glob 5 | import os 6 | 7 | import torch 8 | from setuptools import find_packages 9 | from setuptools import setup 10 | from torch.utils.cpp_extension import CUDA_HOME 11 | from torch.utils.cpp_extension import CppExtension 12 | from torch.utils.cpp_extension import CUDAExtension 13 | 14 | requirements = ["torch", "torchvision"] 15 | 16 | 17 | def get_extensions(): 18 | this_dir = os.path.dirname(os.path.abspath(__file__)) 19 | extensions_dir = this_dir 20 | 21 | main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) 22 | source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) 23 | source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) 24 | 25 | sources = main_file + source_cpu 26 | extension = CppExtension 27 | 28 | extra_compile_args = {"cxx": []} 29 | define_macros = [] 30 | 31 | if torch.cuda.is_available() and CUDA_HOME is not None: 32 | extension = CUDAExtension 33 | sources += source_cuda 34 | define_macros += [("WITH_CUDA", None)] 35 | extra_compile_args["nvcc"] = [ 36 | "-DCUDA_HAS_FP16=1", 37 | "-D__CUDA_NO_HALF_OPERATORS__", 38 | "-D__CUDA_NO_HALF_CONVERSIONS__", 39 | "-D__CUDA_NO_HALF2_OPERATORS__", 40 | ] 41 | 42 | sources = [os.path.join(extensions_dir, s) for s in sources] 43 | 44 | include_dirs = [extensions_dir] 45 | 46 | ext_modules = [ 47 | extension( 48 | "C_ROIPooling", 49 | sources, 50 | include_dirs=include_dirs, 51 | define_macros=define_macros, 52 | extra_compile_args=extra_compile_args, 53 | ) 54 | ] 55 | 56 | return ext_modules 57 | 58 | 59 | setup( 60 | name="C_ROIPooling", 61 | version="0.1", 62 | description="ROIPooling in C++ or CUDA", 63 | ext_modules=get_extensions(), 64 | cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, 65 | ) 66 | -------------------------------------------------------------------------------- /common/lib/roi_pooling/vision.cpp: -------------------------------------------------------------------------------- 1 | // Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | 3 | #include "ROIAlign.h" 4 | #include "ROIPool.h" 5 | 6 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 7 | m.def("roi_align_forward", &ROIAlign_forward, "ROIAlign_forward"); 8 | m.def("roi_align_backward", &ROIAlign_backward, "ROIAlign_backward"); 9 | m.def("roi_pool_forward", &ROIPool_forward, "ROIPool_forward"); 10 | m.def("roi_pool_backward", &ROIPool_backward, "ROIPool_backward"); 11 | } 12 | -------------------------------------------------------------------------------- /common/lr_scheduler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | from bisect import bisect_right 3 | 4 | import torch 5 | 6 | 7 | # FIXME ideally this would be achieved with a CombinedLRScheduler, 8 | # separating MultiStepLR with WarmupLR 9 | # but the current LRScheduler design doesn't allow it 10 | class WarmupMultiStepLR(torch.optim.lr_scheduler._LRScheduler): 11 | def __init__( 12 | self, 13 | optimizer, 14 | milestones, 15 | gamma=0.1, 16 | warmup_factor=1.0 / 3, 17 | warmup_iters=500, 18 | warmup_method="linear", 19 | last_epoch=-1, 20 | ): 21 | if not list(milestones) == sorted(milestones): 22 | raise ValueError( 23 | "Milestones should be a list of" " increasing integers. 
Got {}", 24 | milestones, 25 | ) 26 | 27 | if warmup_method not in ("constant", "linear"): 28 | raise ValueError( 29 | "Only 'constant' or 'linear' warmup_method accepted" 30 | "got {}".format(warmup_method) 31 | ) 32 | self.milestones = milestones 33 | self.gamma = gamma 34 | self.warmup_factor = warmup_factor 35 | self.warmup_iters = warmup_iters 36 | self.warmup_method = warmup_method 37 | super(WarmupMultiStepLR, self).__init__(optimizer, last_epoch) 38 | 39 | def get_lr(self): 40 | warmup_factor = 1 41 | if self.last_epoch < self.warmup_iters: 42 | if self.warmup_method == "constant": 43 | warmup_factor = self.warmup_factor 44 | elif self.warmup_method == "linear": 45 | alpha = self.last_epoch / self.warmup_iters 46 | warmup_factor = self.warmup_factor * (1 - alpha) + alpha 47 | return [ 48 | base_lr 49 | * warmup_factor 50 | * self.gamma ** bisect_right(self.milestones, self.last_epoch) 51 | for base_lr in self.base_lrs 52 | ] 53 | -------------------------------------------------------------------------------- /common/metrics/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackroos/VL-BERT/4373674cbf2bcd6c09a2c26abfdb6705b870e3be/common/metrics/__init__.py -------------------------------------------------------------------------------- /common/metrics/composite_eval_metric.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from .eval_metric import EvalMetric 3 | import torch 4 | 5 | class CompositeEvalMetric(EvalMetric): 6 | """Manages multiple evaluation metrics. 7 | Args: 8 | metrics (list of EvalMetric): List of child metrics. 9 | name (str): Name of this metric instance for display. 10 | """ 11 | 12 | def __init__(self, metrics=None, name='composite'): 13 | super(CompositeEvalMetric, self).__init__(name) 14 | if metrics is None: 15 | metrics = [] 16 | self.metrics = metrics 17 | 18 | def add(self, metric): 19 | """Adds a child metric. 20 | Args: 21 | metric (EvalMetric): A metric instance. 22 | """ 23 | self.metrics.append(metric) 24 | 25 | def get_metric(self, index): 26 | """Returns a child metric. 27 | Args: 28 | index (int): Index of child metric in the list of metrics. 29 | """ 30 | try: 31 | return self.metrics[index] 32 | except IndexError: 33 | return ValueError("Metric index {} is out of range 0 and {}".format( 34 | index, len(self.metrics))) 35 | 36 | def update(self, outputs): 37 | """Updates the internal evaluation result. 38 | Args: 39 | labels (dict of `NDArray`): The labels of the data. 40 | preds (dict of `NDArray`): Predicted values. 41 | """ 42 | for metric in self.metrics: 43 | metric.update(outputs) 44 | 45 | def reset(self): 46 | """Resets the internal evaluation result to initial state.""" 47 | try: 48 | for metric in self.metrics: 49 | metric.reset() 50 | except AttributeError: 51 | pass 52 | 53 | def get(self): 54 | """Returns the current evaluation result. 55 | Returns: 56 | names (list of str): Name of the metrics. 57 | values (list of float): Value of the evaluations. 
58 | """ 59 | names = [] 60 | values = [] 61 | for metric in self.metrics: 62 | name, value = metric.get() 63 | if isinstance(name, str): 64 | name = [name] 65 | if isinstance(value, (float, int, np.generic,torch.Tensor)): 66 | value = [value] 67 | names.extend(name) 68 | values.extend(value) 69 | return names, values 70 | -------------------------------------------------------------------------------- /common/metrics/eval_metric.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.distributed as distributed 3 | 4 | 5 | class EvalMetric(object): 6 | """Base class for all evaluation metrics. 7 | .. note:: 8 | This is a base class that provides common metric interfaces. 9 | One should not use this class directly, but instead create new metric 10 | classes that extend it. 11 | Args 12 | name (str): Name of this metric instance for display. 13 | """ 14 | 15 | def __init__(self, name, allreduce=False, num_replicas=1, **kwargs): 16 | self.name = str(name) 17 | self.allreduce=allreduce 18 | self.num_replicas = num_replicas 19 | self._kwargs = kwargs 20 | self.reset() 21 | 22 | def __str__(self): 23 | return "EvalMetric: {}".format(dict(self.get_name_value())) 24 | 25 | def update(self, outputs): 26 | """Updates the internal evaluation result. 27 | Args 28 | labels (list of `NDArray`): The labels of the data. 29 | preds (list of `NDArray`): Predicted values. 30 | """ 31 | raise NotImplementedError() 32 | 33 | def reset(self): 34 | """Resets the internal evaluation result to initial state.""" 35 | self.num_inst = torch.tensor(0.) 36 | self.sum_metric = torch.tensor(0.) 37 | 38 | def get(self): 39 | """Returns the current evaluation result. 40 | Returns: 41 | names (list of str): Name of the metrics. 42 | values (list of float): Value of the evaluations. 43 | """ 44 | if self.num_inst.item() == 0: 45 | return (self.name, float('nan')) 46 | else: 47 | if self.allreduce: 48 | num_inst = self.num_inst.clone().cuda() 49 | sum_metric = self.sum_metric.clone().cuda() 50 | distributed.all_reduce(num_inst, op=distributed.ReduceOp.SUM) 51 | distributed.all_reduce(sum_metric, op=distributed.ReduceOp.SUM) 52 | metric_tensor = (sum_metric / num_inst).detach().cpu() 53 | else: 54 | metric_tensor = (self.sum_metric / self.num_inst).detach().cpu() 55 | 56 | return (self.name, metric_tensor.item()) 57 | 58 | def get_name_value(self): 59 | """Returns zipped name and value pairs. 60 | Returns 61 | A (list of tuples): (name, value) tuple list. 
62 | """ 63 | name, value = self.get() 64 | if not isinstance(name, list): 65 | name = [name] 66 | if not isinstance(value, list): 67 | value = [value] 68 | return list(zip(name, value)) 69 | -------------------------------------------------------------------------------- /common/metrics/pretrain_metrics.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from .eval_metric import EvalMetric 3 | 4 | 5 | class LossLogger(EvalMetric): 6 | def __init__(self, output_name, display_name=None, 7 | allreduce=False, num_replicas=1): 8 | self.output_name = output_name 9 | if display_name is None: 10 | display_name = output_name 11 | super(LossLogger, self).__init__(display_name, allreduce, num_replicas) 12 | 13 | def update(self, outputs): 14 | with torch.no_grad(): 15 | if self.output_name in outputs: 16 | self.sum_metric += float(outputs[self.output_name].mean().item()) 17 | self.num_inst += 1 18 | 19 | 20 | class RelationshipAccuracy(EvalMetric): 21 | def __init__(self, allreduce=False, num_replicas=1): 22 | super(RelationshipAccuracy, self).__init__('RelAcc', allreduce, num_replicas) 23 | 24 | def update(self, outputs): 25 | with torch.no_grad(): 26 | logits = outputs['relationship_logits'] 27 | label = outputs['relationship_label'] 28 | self.sum_metric += float((logits.argmax(dim=1) == label).sum().item()) 29 | self.num_inst += logits.shape[0] 30 | 31 | 32 | class MLMAccuracy(EvalMetric): 33 | def __init__(self, allreduce=False, num_replicas=1): 34 | super(MLMAccuracy, self).__init__('MLMAcc', allreduce, num_replicas) 35 | 36 | def update(self, outputs): 37 | with torch.no_grad(): 38 | logits = outputs['mlm_logits'] 39 | label = outputs['mlm_label'] 40 | keep = (label != -1) 41 | if keep.sum() > 0: 42 | self.sum_metric += float((logits[keep].argmax(dim=1) == label[keep]).sum().item()) 43 | self.num_inst += keep.sum().item() 44 | 45 | 46 | class MLMAccuracyWVC(EvalMetric): 47 | def __init__(self, allreduce=False, num_replicas=1): 48 | super(MLMAccuracyWVC, self).__init__('MLMAccWVC', allreduce, num_replicas) 49 | 50 | def update(self, outputs): 51 | with torch.no_grad(): 52 | logits = outputs['mlm_logits_wvc'] 53 | label = outputs['mlm_label_wvc'] 54 | keep = (label != -1) 55 | if keep.sum() > 0: 56 | self.sum_metric += float((logits[keep].argmax(dim=1) == label[keep]).sum().item()) 57 | self.num_inst += keep.sum().item() 58 | 59 | 60 | class MLMAccuracyAUX(EvalMetric): 61 | def __init__(self, allreduce=False, num_replicas=1): 62 | super(MLMAccuracyAUX, self).__init__('MLMAccAUX', allreduce, num_replicas) 63 | 64 | def update(self, outputs): 65 | with torch.no_grad(): 66 | logits = outputs['mlm_logits_aux'] 67 | label = outputs['mlm_label_aux'] 68 | keep = (label != -1) 69 | if keep.sum() > 0: 70 | self.sum_metric += float((logits[keep].argmax(dim=1) == label[keep]).sum().item()) 71 | self.num_inst += keep.sum().item() 72 | 73 | 74 | class MVRCAccuracy(EvalMetric): 75 | def __init__(self, allreduce=False, num_replicas=1): 76 | super(MVRCAccuracy, self).__init__('MVRCAccuracy', allreduce, num_replicas) 77 | 78 | def update(self, outputs): 79 | with torch.no_grad(): 80 | logits = outputs['mvrc_logits'] 81 | label = outputs['mvrc_label'] 82 | keep = (label.sum(2) - 1.0).abs() < 0.1 83 | if keep.sum() > 0: 84 | self.sum_metric += float((logits[keep].argmax(dim=1) == label[keep].argmax(dim=1)).sum().item()) 85 | self.num_inst += keep.sum().item() 86 | 87 | 88 | 89 | 90 | 
-------------------------------------------------------------------------------- /common/metrics/refcoco_metrics.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from .eval_metric import EvalMetric 3 | 4 | 5 | class LossLogger(EvalMetric): 6 | def __init__(self, output_name, display_name=None, 7 | allreduce=False, num_replicas=1): 8 | self.output_name = output_name 9 | if display_name is None: 10 | display_name = output_name 11 | super(LossLogger, self).__init__(display_name, allreduce, num_replicas) 12 | 13 | def update(self, outputs): 14 | with torch.no_grad(): 15 | if self.output_name in outputs: 16 | self.sum_metric += float(outputs[self.output_name].mean().item()) 17 | self.num_inst += 1 18 | 19 | 20 | class RefAccuracy(EvalMetric): 21 | def __init__(self, allreduce=False, num_replicas=1): 22 | super(RefAccuracy, self).__init__('RefAcc', allreduce, num_replicas) 23 | 24 | def update(self, outputs): 25 | with torch.no_grad(): 26 | cls_logits = outputs['label_logits'] 27 | label = outputs['label'] 28 | bs, _ = cls_logits.shape 29 | batch_inds = torch.arange(bs, device=cls_logits.device) 30 | self.sum_metric += float((label[batch_inds, cls_logits.argmax(1)] > 0.5).sum().item()) 31 | self.num_inst += cls_logits.shape[0] 32 | 33 | 34 | class ClsAccuracy(EvalMetric): 35 | def __init__(self, allreduce=False, num_replicas=1): 36 | super(ClsAccuracy, self).__init__('ClsAcc', allreduce, num_replicas) 37 | 38 | def update(self, outputs): 39 | with torch.no_grad(): 40 | cls_logits = outputs['label_logits'] 41 | cls_pred = (cls_logits > 0).long() 42 | label = outputs['label'].long() 43 | keep = (label >= 0) 44 | self.sum_metric += float((cls_pred[keep] == label[keep]).sum().item()) 45 | self.num_inst += keep.sum().item() 46 | 47 | 48 | class ClsPosAccuracy(EvalMetric): 49 | def __init__(self, allreduce=False, num_replicas=1): 50 | super(ClsPosAccuracy, self).__init__('ClsPosAcc', allreduce, num_replicas) 51 | 52 | def update(self, outputs): 53 | with torch.no_grad(): 54 | cls_logits = outputs['label_logits'] 55 | cls_pred = (cls_logits > 0).long() 56 | label = outputs['label'].long() 57 | keep = (label == 1) 58 | self.sum_metric += float((cls_pred[keep] == label[keep]).sum().item()) 59 | self.num_inst += keep.sum().item() 60 | 61 | 62 | class ClsPosFraction(EvalMetric): 63 | def __init__(self, allreduce=False, num_replicas=1): 64 | super(ClsPosFraction, self).__init__('ClsPosFrac', allreduce, num_replicas) 65 | 66 | def update(self, outputs): 67 | with torch.no_grad(): 68 | label = outputs['label'].long() 69 | num_pos = (label == 1).sum().item() 70 | num_valid = (label >= 0).sum().item() 71 | self.sum_metric += float(num_pos) 72 | self.num_inst += float(num_valid) 73 | 74 | 75 | 76 | 77 | 78 | 79 | -------------------------------------------------------------------------------- /common/metrics/vcr_metrics.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from .eval_metric import EvalMetric 3 | 4 | 5 | class LossLogger(EvalMetric): 6 | def __init__(self, output_name, display_name=None, 7 | allreduce=False, num_replicas=1): 8 | self.output_name = output_name 9 | if display_name is None: 10 | display_name = output_name 11 | super(LossLogger, self).__init__(display_name, allreduce, num_replicas) 12 | 13 | def update(self, outputs): 14 | with torch.no_grad(): 15 | if self.output_name in outputs: 16 | self.sum_metric += float(outputs[self.output_name].mean().item()) 17 | self.num_inst += 1 18 
| 19 | 20 | class Accuracy(EvalMetric): 21 | def __init__(self, allreduce=False, num_replicas=1): 22 | super(Accuracy, self).__init__('Acc', allreduce, num_replicas) 23 | 24 | def update(self, outputs): 25 | with torch.no_grad(): 26 | _filter = outputs['label'] != -1 27 | cls_logits = outputs['label_logits'][_filter] 28 | label = outputs['label'][_filter] 29 | if cls_logits.dim() == 1: 30 | cls_logits = cls_logits.view((-1, 4)) 31 | label = label.view((-1, 4)).argmax(1) 32 | self.sum_metric += float((cls_logits.argmax(dim=1) == label).sum().item()) 33 | self.num_inst += cls_logits.shape[0] 34 | 35 | 36 | class AnsLoss(EvalMetric): 37 | def __init__(self, allreduce=False, num_replicas=1): 38 | super(AnsLoss, self).__init__('AnsLoss', allreduce, num_replicas) 39 | 40 | def update(self, outputs): 41 | with torch.no_grad(): 42 | self.sum_metric += float(outputs['ans_loss'].mean().item()) 43 | self.num_inst += 1 44 | 45 | 46 | class CNNRegLoss(EvalMetric): 47 | def __init__(self, allreduce=False, num_replicas=1): 48 | super(CNNRegLoss, self).__init__('CNNRegLoss', allreduce, num_replicas) 49 | 50 | def update(self, outputs): 51 | with torch.no_grad(): 52 | if 'cnn_regularization_loss' in outputs: 53 | self.sum_metric += float(outputs['cnn_regularization_loss'].mean().item()) 54 | self.num_inst += 1 55 | 56 | 57 | class PositiveFraction(EvalMetric): 58 | def __init__(self, allreduce=False, num_replicas=1): 59 | super(PositiveFraction, self).__init__('PosFraction', allreduce, num_replicas) 60 | 61 | def update(self, outputs): 62 | with torch.no_grad(): 63 | self.sum_metric += float(outputs['positive_fraction'].mean().item()) 64 | self.num_inst += 1 65 | 66 | 67 | class JointAccuracy(EvalMetric): 68 | def __init__(self, allreduce=False, num_replicas=1): 69 | super(JointAccuracy, self).__init__('JointAcc', allreduce, num_replicas) 70 | 71 | def update(self, outputs): 72 | a_cls_logits = outputs['answer_label_logits'] 73 | a_label = outputs['answer_label'] 74 | r_cls_logits = outputs['rationale_label_logits'] 75 | r_label = outputs['rationale_label'] 76 | self.sum_metric += float(((a_cls_logits.argmax(dim=1) == a_label) 77 | & (r_cls_logits.argmax(dim=1) == r_label)).sum().item()) 78 | self.num_inst += a_cls_logits.shape[0] 79 | 80 | 81 | 82 | -------------------------------------------------------------------------------- /common/metrics/vqa_metrics.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from .eval_metric import EvalMetric 3 | 4 | 5 | class LossLogger(EvalMetric): 6 | def __init__(self, output_name, display_name=None, 7 | allreduce=False, num_replicas=1): 8 | self.output_name = output_name 9 | if display_name is None: 10 | display_name = output_name 11 | super(LossLogger, self).__init__(display_name, allreduce, num_replicas) 12 | 13 | def update(self, outputs): 14 | with torch.no_grad(): 15 | if self.output_name in outputs: 16 | self.sum_metric += float(outputs[self.output_name].mean().item()) 17 | self.num_inst += 1 18 | 19 | 20 | class SoftAccuracy(EvalMetric): 21 | def __init__(self, allreduce=False, num_replicas=1): 22 | super(SoftAccuracy, self).__init__('SoftAcc', allreduce, num_replicas) 23 | 24 | def update(self, outputs): 25 | with torch.no_grad(): 26 | cls_logits = outputs['label_logits'] 27 | label = outputs['label'] 28 | bs, num_classes = cls_logits.shape 29 | batch_inds = torch.arange(bs, device=cls_logits.device) 30 | self.sum_metric += float(label[batch_inds, cls_logits.argmax(1)].sum().item()) 31 | self.num_inst += 
cls_logits.shape[0] 32 | 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /common/module.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | from typing import Dict 3 | import torch 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | 7 | 8 | class Module(nn.Module): 9 | def __init__(self, config): 10 | super(Module, self).__init__() 11 | self.config = config 12 | 13 | def init_weight(self): 14 | raise NotImplementedError() 15 | 16 | def fix_params(self): 17 | raise NotImplementedError() 18 | 19 | def forward(self, *inputs, **kwargs): 20 | inputs, kwargs = self.preprocess(*inputs, **kwargs) 21 | if self.training: 22 | return self.train_forward(*inputs, **kwargs) 23 | else: 24 | return self.inference_forward(*inputs, **kwargs) 25 | 26 | def train_forward(self, *inputs, **kwargs): 27 | """ 28 | def train_forward(self, data, label, **kwargs): 29 | # this is a toy example for 1 output, 2 loss function 30 | 31 | output = None 32 | loss1 = torch.tensor(0.0) 33 | loss2 = torch.tensor(0.0) 34 | 35 | outputs = {'output': output, 36 | 'loss1': loss1, 37 | 'loss2': loss2} 38 | loss = loss1 + loss2 39 | 40 | return outputs, loss 41 | """ 42 | raise NotImplemented 43 | 44 | def inference_forward(self, *inputs, **kwargs): 45 | """ 46 | def inference_forward(self, data, **kwargs): 47 | output = None 48 | outputs = {'output': output} 49 | return outputs 50 | """ 51 | raise NotImplemented 52 | 53 | def preprocess(self, *inputs, **kwargs): 54 | if self.training: 55 | return self.train_preprocess(*inputs, **kwargs) 56 | else: 57 | return self.inference_preprocess(*inputs, **kwargs) 58 | 59 | def train_preprocess(self, *inputs, **kwargs): 60 | return inputs, kwargs 61 | 62 | def inference_preprocess(self, *inputs, **kwargs): 63 | return inputs, kwargs 64 | -------------------------------------------------------------------------------- /common/nlp/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackroos/VL-BERT/4373674cbf2bcd6c09a2c26abfdb6705b870e3be/common/nlp/__init__.py -------------------------------------------------------------------------------- /common/nlp/bert/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackroos/VL-BERT/4373674cbf2bcd6c09a2c26abfdb6705b870e3be/common/nlp/bert/__init__.py -------------------------------------------------------------------------------- /common/nlp/bert_encoder_wrapper.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | from external.pytorch_pretrained_bert.modeling import BertEncoder, BertLayerNorm 4 | 5 | 6 | class BertEncoderWrapper(nn.Module): 7 | def __init__(self, bert_config, input_size, output_all_encoded_layers=False): 8 | super(BertEncoderWrapper, self).__init__() 9 | self.bert_config = bert_config 10 | self.output_all_encoded_layers = output_all_encoded_layers 11 | self.input_transform = nn.Linear(input_size, bert_config.hidden_size) 12 | self.with_position_embeddings = False if 'with_position_embeddings' not in bert_config \ 13 | else bert_config.with_position_embeddings 14 | if self.with_position_embeddings: 15 | self.position_embedding = nn.Embedding(bert_config.max_position_embeddings, bert_config.hidden_size) 16 | self.LayerNorm = BertLayerNorm(bert_config.hidden_size, 
eps=1e-12) 17 | self.dropout = nn.Dropout(bert_config.hidden_dropout_prob) 18 | self.bert_encoder = BertEncoder(bert_config) 19 | 20 | self.apply(self.init_bert_weights) 21 | 22 | def init_bert_weights(self, module): 23 | """ Initialize the weights. 24 | """ 25 | if isinstance(module, (nn.Linear, nn.Embedding)): 26 | # Slightly different from the TF version which uses truncated_normal for initialization 27 | # cf https://github.com/pytorch/pytorch/pull/5617 28 | module.weight.data.normal_(mean=0.0, std=self.bert_config.initializer_range) 29 | elif isinstance(module, BertLayerNorm): 30 | module.bias.data.zero_() 31 | module.weight.data.fill_(1.0) 32 | if isinstance(module, nn.Linear) and module.bias is not None: 33 | module.bias.data.zero_() 34 | 35 | def get_output_dim(self): 36 | return self.bert_config.hidden_size 37 | 38 | def forward(self, inputs, mask): 39 | inputs = self.input_transform(inputs) 40 | if self.with_position_embeddings: 41 | seq_length = inputs.size(1) 42 | position_ids = torch.arange(seq_length, dtype=torch.long, device=inputs.device) 43 | position_ids = position_ids.unsqueeze(0).expand((inputs.shape[0], inputs.shape[1])) 44 | position_embeddings = self.position_embedding(position_ids) 45 | inputs = inputs + position_embeddings 46 | inputs = self.LayerNorm(inputs) 47 | inputs = self.dropout(inputs) 48 | 49 | extended_attention_mask = mask.unsqueeze(1).unsqueeze(2) 50 | # Since attention_mask is 1.0 for positions we want to attend and 0.0 for 51 | # masked positions, this operation will create a tensor which is 0.0 for 52 | # positions we want to attend and -10000.0 for masked positions. 53 | # Since we are adding it to the raw scores before the softmax, this is 54 | # effectively the same as removing these entirely. 55 | extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) 56 | extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0 57 | output = self.bert_encoder(inputs, 58 | extended_attention_mask, 59 | output_all_encoded_layers=self.output_all_encoded_layers) 60 | if not self.output_all_encoded_layers: 61 | output = output[0] 62 | return output 63 | 64 | -------------------------------------------------------------------------------- /common/nlp/input_variational_dropout.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | class InputVariationalDropout(torch.nn.Dropout): 4 | """ 5 | Apply the dropout technique in Gal and Ghahramani, "Dropout as a Bayesian Approximation: 6 | Representing Model Uncertainty in Deep Learning" (https://arxiv.org/abs/1506.02142) to a 7 | 3D tensor. 8 | 9 | This module accepts a 3D tensor of shape ``(batch_size, num_timesteps, embedding_dim)`` 10 | and samples a single dropout mask of shape ``(batch_size, embedding_dim)`` and applies 11 | it to every time step. 12 | """ 13 | def forward(self, input_tensor): 14 | # pylint: disable=arguments-differ 15 | """ 16 | Apply dropout to input tensor. 17 | 18 | Parameters 19 | ---------- 20 | input_tensor: ``torch.FloatTensor`` 21 | A tensor of shape ``(batch_size, num_timesteps, embedding_dim)`` 22 | 23 | Returns 24 | ------- 25 | output: ``torch.FloatTensor`` 26 | A tensor of shape ``(batch_size, num_timesteps, embedding_dim)`` with dropout applied. 
27 | """ 28 | ones = input_tensor.data.new_ones(input_tensor.shape[0], input_tensor.shape[-1]) 29 | dropout_mask = torch.nn.functional.dropout(ones, self.p, self.training, inplace=False) 30 | if self.inplace: 31 | input_tensor *= dropout_mask.unsqueeze(1) 32 | return None 33 | else: 34 | return dropout_mask.unsqueeze(1) * input_tensor -------------------------------------------------------------------------------- /common/nlp/misc.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import random 3 | 4 | 5 | def get_align_matrix(aligned_ids, sparse=False, device=None, dtype=torch.float32): 6 | """ 7 | Get aligned matrix for feature alignment in sentence embedding 8 | :param aligned_ids: list, aligned_ids[k] means original index of k-th token 9 | :param sparse: whether to return sparse matrix 10 | :param device: device of returned align matrix 11 | :param dtype: dtype of returned align matrix 12 | :return: align_matrix: torch.FloatTensor, shape: (L, L') 13 | 14 | Example: 15 | >> aligned_ids = [0, 0, 1, 2, 2, 2] 16 | >> get_align_matrix(aligned_ids) 17 | tensor([[0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000], 18 | [0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 0.0000], 19 | [0.0000, 0.0000, 0.0000, 0.3333, 0.3333, 0.3333]]) 20 | """ 21 | 22 | l0 = max(aligned_ids) + 1 23 | l1 = len(aligned_ids) 24 | if sparse: 25 | raise NotImplementedError 26 | else: 27 | align_matrix = torch.zeros((l0, l1), dtype=dtype, device=device) 28 | align_matrix[aligned_ids, torch.arange(l1)] = 1 29 | align_matrix = align_matrix / align_matrix.sum(dim=1, keepdim=True) 30 | 31 | return align_matrix 32 | 33 | 34 | def get_all_ngrams(words): 35 | """ 36 | Get all n-grams of words 37 | :param words: list of str 38 | :return: ngrams, list of (list of str) 39 | """ 40 | ngrams = [] 41 | N = len(words) 42 | for n in range(1, N + 1): 43 | for i in range(0, N - n + 1): 44 | ngrams.append([words[j] for j in range(i, i + n)]) 45 | 46 | return ngrams 47 | 48 | 49 | def random_word_with_token_ids(token_ids, tokenizer): 50 | """ 51 | Masking some random tokens for Language Model task with probabilities as in the original BERT paper. 52 | :param token_ids: list of int, list of token id. 
53 | :param tokenizer: Tokenizer, object used for tokenization (we need it's vocab here) 54 | :return: (list of str, list of int), masked tokens and related labels for LM prediction 55 | """ 56 | output_label = [] 57 | mask_id = tokenizer.convert_tokens_to_ids(['[MASK]'])[0] 58 | 59 | for i, token_id in enumerate(token_ids): 60 | prob = random.random() 61 | # mask token with 15% probability 62 | if prob < 0.15: 63 | prob /= 0.15 64 | 65 | # 80% randomly change token to mask token 66 | if prob < 0.8: 67 | token_ids[i] = mask_id 68 | 69 | # 10% randomly change token to random token 70 | elif prob < 0.9: 71 | token_ids[i] = random.choice(list(tokenizer.vocab.items()))[1] 72 | 73 | # -> rest 10% randomly keep current token 74 | 75 | # append current token to output (we will predict these later) 76 | output_label.append(token_id) 77 | else: 78 | # no masking token (will be ignored by loss function later) 79 | output_label.append(-1) 80 | 81 | return token_ids, output_label 82 | 83 | 84 | 85 | 86 | 87 | -------------------------------------------------------------------------------- /common/nlp/roberta/__init__.py: -------------------------------------------------------------------------------- 1 | from .tokenization_roberta import RobertaTokenizer 2 | -------------------------------------------------------------------------------- /common/nlp/time_distributed.py: -------------------------------------------------------------------------------- 1 | """ 2 | A wrapper that unrolls the second (time) dimension of a tensor 3 | into the first (batch) dimension, applies some other ``Module``, 4 | and then rolls the time dimension back up. 5 | """ 6 | 7 | import torch 8 | 9 | 10 | class TimeDistributed(torch.nn.Module): 11 | """ 12 | Given an input shaped like ``(batch_size, time_steps, [rest])`` and a ``Module`` that takes 13 | inputs like ``(batch_size, [rest])``, ``TimeDistributed`` reshapes the input to be 14 | ``(batch_size * time_steps, [rest])``, applies the contained ``Module``, then reshapes it back. 15 | 16 | Note that while the above gives shapes with ``batch_size`` first, this ``Module`` also works if 17 | ``batch_size`` is second - we always just combine the first two dimensions, then split them. 18 | """ 19 | def __init__(self, module): 20 | super(TimeDistributed, self).__init__() 21 | self._module = module 22 | 23 | def forward(self, *inputs, **kwargs): # pylint: disable=arguments-differ 24 | reshaped_inputs = [] 25 | for input_tensor in inputs: 26 | input_size = input_tensor.size() 27 | if len(input_size) <= 2: 28 | raise RuntimeError("No dimension to distribute: " + str(input_size)) 29 | 30 | # Squash batch_size and time_steps into a single axis; result has shape 31 | # (batch_size * time_steps, input_size). 32 | squashed_shape = [-1] + [x for x in input_size[2:]] 33 | reshaped_inputs.append(input_tensor.contiguous().view(*squashed_shape)) 34 | 35 | reshaped_outputs = self._module(*reshaped_inputs, **kwargs) 36 | 37 | if isinstance(reshaped_outputs, torch.Tensor): 38 | # Now get the output back into the right shape. 
39 | # (batch_size, time_steps, [hidden_size]) 40 | new_shape = [input_size[0], input_size[1]] + [x for x in reshaped_outputs.size()[1:]] 41 | outputs = reshaped_outputs.contiguous().view(*new_shape) 42 | elif isinstance(reshaped_outputs, tuple): 43 | outputs = [] 44 | for output in reshaped_outputs: 45 | new_shape = [input_size[0], input_size[1]] + [x for x in output.size()[1:]] 46 | outputs.append(output.contiguous().view(*new_shape)) 47 | outputs = tuple(outputs) 48 | else: 49 | raise ValueError("Not support!") 50 | 51 | return outputs 52 | -------------------------------------------------------------------------------- /common/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackroos/VL-BERT/4373674cbf2bcd6c09a2c26abfdb6705b870e3be/common/utils/__init__.py -------------------------------------------------------------------------------- /common/utils/bbox.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def nonlinear_transform(ex_rois, gt_rois): 5 | """ 6 | compute bounding box regression targets from ex_rois to gt_rois 7 | :param ex_rois: [k, 4] ([x1, y1, x2, y2]) 8 | :param gt_rois: [k, 4] (corresponding gt_boxes [x1, y1, x2, y2] ) 9 | :return: bbox_targets: [k, 4] 10 | """ 11 | assert ex_rois.shape[0] == gt_rois.shape[0], 'inconsistent rois number' 12 | 13 | ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + 1.0 14 | ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + 1.0 15 | ex_ctr_x = ex_rois[:, 0] + 0.5 * (ex_widths - 1.0) 16 | ex_ctr_y = ex_rois[:, 1] + 0.5 * (ex_heights - 1.0) 17 | 18 | gt_widths = gt_rois[:, 2] - gt_rois[:, 0] + 1.0 19 | gt_heights = gt_rois[:, 3] - gt_rois[:, 1] + 1.0 20 | gt_ctr_x = gt_rois[:, 0] + 0.5 * (gt_widths - 1.0) 21 | gt_ctr_y = gt_rois[:, 1] + 0.5 * (gt_heights - 1.0) 22 | 23 | targets_dx = (gt_ctr_x - ex_ctr_x) / (ex_widths + 1e-6) 24 | targets_dy = (gt_ctr_y - ex_ctr_y) / (ex_heights + 1e-6) 25 | targets_dw = torch.log(gt_widths / (ex_widths).clamp(min=1e-6)) 26 | targets_dh = torch.log(gt_heights / ((ex_heights).clamp(min=1e-6))) 27 | 28 | targets = torch.cat( 29 | (targets_dx.view(-1, 1), targets_dy.view(-1, 1), targets_dw.view(-1, 1), targets_dh.view(-1, 1)), dim=-1) 30 | return targets 31 | 32 | 33 | def coordinate_embeddings(boxes, dim): 34 | """ 35 | Coordinate embeddings of bounding boxes 36 | :param boxes: [K, 6] ([x1, y1, x2, y2, w_image, h_image]) 37 | :param dim: sin/cos embedding dimension 38 | :return: [K, 4, 2 * dim] 39 | """ 40 | 41 | num_boxes = boxes.shape[0] 42 | w = boxes[:, 4] 43 | h = boxes[:, 5] 44 | 45 | # transform to (x_c, y_c, w, h) format 46 | boxes_ = boxes.new_zeros((num_boxes, 4)) 47 | boxes_[:, 0] = (boxes[:, 0] + boxes[:, 2]) / 2 48 | boxes_[:, 1] = (boxes[:, 1] + boxes[:, 3]) / 2 49 | boxes_[:, 2] = boxes[:, 2] - boxes[:, 0] 50 | boxes_[:, 3] = boxes[:, 3] - boxes[:, 1] 51 | boxes = boxes_ 52 | 53 | # position 54 | pos = boxes.new_zeros((num_boxes, 4)) 55 | pos[:, 0] = boxes[:, 0] / w * 100 56 | pos[:, 1] = boxes[:, 1] / h * 100 57 | pos[:, 2] = boxes[:, 2] / w * 100 58 | pos[:, 3] = boxes[:, 3] / h * 100 59 | 60 | # sin/cos embedding 61 | dim_mat = 1000 ** (torch.arange(dim, dtype=boxes.dtype, device=boxes.device) / dim) 62 | sin_embedding = (pos.view((num_boxes, 4, 1)) / dim_mat.view((1, 1, -1))).sin() 63 | cos_embedding = (pos.view((num_boxes, 4, 1)) / dim_mat.view((1, 1, -1))).cos() 64 | 65 | return torch.cat((sin_embedding, cos_embedding), dim=-1) 66 | 67 | 68 | def 
bbox_iou_py_vectorized(boxes, query_boxes): 69 | n_ = boxes.shape[0] 70 | k_ = query_boxes.shape[0] 71 | n_mesh, k_mesh = torch.meshgrid([torch.arange(n_), torch.arange(k_)]) 72 | n_mesh = n_mesh.contiguous().view(-1) 73 | k_mesh = k_mesh.contiguous().view(-1) 74 | boxes = boxes[n_mesh] 75 | query_boxes = query_boxes[k_mesh] 76 | 77 | x11, y11, x12, y12 = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3] 78 | x21, y21, x22, y22 = query_boxes[:, 0], query_boxes[:, 1], query_boxes[:, 2], query_boxes[:, 3] 79 | xA = torch.max(x11, x21) 80 | yA = torch.max(y11, y21) 81 | xB = torch.min(x12, x22) 82 | yB = torch.min(y12, y22) 83 | interArea = torch.clamp(xB - xA + 1, min=0) * torch.clamp(yB - yA + 1, min=0) 84 | boxAArea = (x12 - x11 + 1) * (y12 - y11 + 1) 85 | boxBArea = (x22 - x21 + 1) * (y22 - y21 + 1) 86 | iou = interArea / (boxAArea + boxBArea - interArea) 87 | 88 | return iou.view(n_, k_).to(boxes.device) 89 | 90 | 91 | 92 | 93 | 94 | 95 | -------------------------------------------------------------------------------- /common/utils/clip_pad.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def clip_pad_images(tensor, pad_shape, pad=0): 5 | """ 6 | Clip clip_pad_images of the pad area. 7 | :param tensor: [c, H, W] 8 | :param pad_shape: [h, w] 9 | :return: [c, h, w] 10 | """ 11 | if not isinstance(tensor, torch.Tensor): 12 | tensor = torch.as_tensor(tensor) 13 | H, W = tensor.shape[1:] 14 | h = pad_shape[1] 15 | w = pad_shape[2] 16 | 17 | tensor_ret = torch.zeros((tensor.shape[0], h, w), dtype=tensor.dtype) + pad 18 | tensor_ret[:, :min(h, H), :min(w, W)] = tensor[:, :min(h, H), :min(w, W)] 19 | 20 | return tensor_ret 21 | 22 | 23 | def clip_pad_boxes(tensor, pad_length, pad=0): 24 | """ 25 | Clip boxes of the pad area. 
26 | :param tensor: [k, d] 27 | :param pad_shape: K 28 | :return: [K, d] 29 | """ 30 | if not isinstance(tensor, torch.Tensor): 31 | tensor = torch.as_tensor(tensor) 32 | k = tensor.shape[0] 33 | d = tensor.shape[1] 34 | K = pad_length 35 | tensor_ret = torch.zeros((K, d), dtype=tensor.dtype) + pad 36 | tensor_ret[:min(k, K), :] = tensor[:min(k, K), :] 37 | 38 | return tensor_ret 39 | 40 | 41 | def clip_pad_1d(tensor, pad_length, pad=0): 42 | if not isinstance(tensor, torch.Tensor): 43 | tensor = torch.as_tensor(tensor) 44 | tensor_ret = torch.zeros((pad_length, ), dtype=tensor.dtype) + pad 45 | tensor_ret[:min(tensor.shape[0], pad_length)] = tensor[:min(tensor.shape[0], pad_length)] 46 | 47 | return tensor_ret 48 | 49 | 50 | def clip_pad_2d(tensor, pad_shape, pad=0): 51 | if not isinstance(tensor, torch.Tensor): 52 | tensor = torch.as_tensor(tensor) 53 | tensor_ret = torch.zeros(*pad_shape, dtype=tensor.dtype) + pad 54 | tensor_ret[:min(tensor.shape[0], pad_shape[0]), :min(tensor.shape[1], pad_shape[1])] \ 55 | = tensor[:min(tensor.shape[0], pad_shape[0]), :min(tensor.shape[1], pad_shape[1])] 56 | 57 | return tensor_ret 58 | -------------------------------------------------------------------------------- /common/utils/create_logger.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------- 2 | # Deformable Convolutional Networks 3 | # Copyright (c) 2017 Microsoft 4 | # Licensed under The Apache-2.0 License [see LICENSE for details] 5 | # Written by Bin Xiao 6 | # -------------------------------------------------------- 7 | 8 | import os 9 | import logging 10 | import time 11 | import errno 12 | 13 | 14 | def makedirsExist(path): 15 | try: 16 | os.makedirs(path) 17 | except OSError as e: 18 | if e.errno == errno.EEXIST: 19 | print('Directory not created.') 20 | else: 21 | raise 22 | 23 | 24 | def create_logger(root_output_path, config_file, image_set, split='train', hypers=()): 25 | # set up logger 26 | if not os.path.exists(root_output_path): 27 | makedirsExist(root_output_path) 28 | assert os.path.exists(root_output_path), '{} does not exist'.format(root_output_path) 29 | 30 | cfg_name = os.path.splitext(os.path.basename(config_file))[0] 31 | 32 | config_output_path = os.path.join(root_output_path, '{}'.format(cfg_name)) 33 | for (hyper_name, hyper_val) in hypers: 34 | config_output_path += '@{}={}'.format(hyper_name, hyper_val) 35 | if not os.path.exists(config_output_path): 36 | makedirsExist(config_output_path) 37 | 38 | final_output_path = os.path.join(config_output_path, image_set + '_' + split) 39 | if not os.path.exists(final_output_path): 40 | makedirsExist(final_output_path) 41 | 42 | log_file = '{}_{}.log'.format(cfg_name, time.strftime('%Y-%m-%d-%H-%M')) 43 | head = '%(asctime)-15s %(message)s' 44 | logging.basicConfig(filename=os.path.join(final_output_path, log_file), format=head) 45 | logger = logging.getLogger() 46 | logger.setLevel(logging.INFO) 47 | 48 | return logger, final_output_path 49 | 50 | 51 | -------------------------------------------------------------------------------- /common/utils/flatten.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class Flattener(torch.nn.Module): 5 | def __init__(self): 6 | """ 7 | Flattens last 3 dimensions to make it only batch size, -1 8 | """ 9 | super(Flattener, self).__init__() 10 | 11 | def forward(self, x): 12 | return x.view(x.size(0), -1) 13 | 
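# Usage sketch (illustrative): Flattener keeps the batch dimension and collapses
# everything else, e.g. flattening a pooled convolutional feature map before a linear head.
if __name__ == '__main__':
    feats = torch.randn(4, 256, 7, 7)        # (batch, channels, h, w)
    print(Flattener()(feats).shape)          # torch.Size([4, 12544]) == (4, 256 * 7 * 7)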
-------------------------------------------------------------------------------- /common/utils/mask.py: -------------------------------------------------------------------------------- 1 | from skimage.draw import polygon 2 | import torch 3 | 4 | 5 | def generate_instance_mask(seg_polys, box, mask_size=(14, 14), dtype=torch.float32, copy=True): 6 | """ 7 | Generate instance mask from polygon 8 | :param seg_poly: torch.Tensor, (N, 2), (x, y) coordinate of N vertices of segmented foreground polygon 9 | :param box: array-like, (4, ), (xmin, ymin, xmax, ymax), instance bounding box 10 | :param mask_size: tuple, (mask_height, mask_weight) 11 | :param dtype: data type of generated mask 12 | :param copy: whether copy seg_polys to a new tensor first 13 | :return: torch.Tensor, of mask_size, instance mask 14 | """ 15 | mask = torch.zeros(mask_size, dtype=dtype) 16 | w_ratio = float(mask_size[0]) / (box[2] - box[0] + 1) 17 | h_ratio = float(mask_size[1]) / (box[3] - box[1] + 1) 18 | 19 | # import IPython 20 | # IPython.embed() 21 | 22 | for seg_poly in seg_polys: 23 | if copy: 24 | seg_poly = seg_poly.detach().clone() 25 | seg_poly = seg_poly.type(torch.float32) 26 | seg_poly[:, 0] = (seg_poly[:, 0] - box[0]) * w_ratio 27 | seg_poly[:, 1] = (seg_poly[:, 1] - box[1]) * h_ratio 28 | rr, cc = polygon(seg_poly[:, 1].clamp(min=0, max=mask_size[1] - 1), 29 | seg_poly[:, 0].clamp(min=0, max=mask_size[0] - 1)) 30 | 31 | mask[rr, cc] = 1 32 | return mask 33 | 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /common/utils/masked_softmax.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def masked_softmax(vector: torch.Tensor, mask: torch.Tensor, dim: int = -1) -> torch.Tensor: 5 | """ 6 | ``torch.nn.functional.softmax(vector)`` does not work if some elements of ``vector`` should be 7 | masked. This performs a softmax on just the non-masked portions of ``vector``. Passing 8 | ``None`` in for the mask is also acceptable; you'll just get a regular softmax. 9 | 10 | ``vector`` can have an arbitrary number of dimensions; the only requirement is that ``mask`` is 11 | broadcastable to ``vector's`` shape. If ``mask`` has fewer dimensions than ``vector``, we will 12 | unsqueeze on dimension 1 until they match. If you need a different unsqueezing of your mask, 13 | do it yourself before passing the mask into this function. 14 | 15 | In the case that the input vector is completely masked, this function returns an array 16 | of ``0.0``. This behavior may cause ``NaN`` if this is used as the last layer of a model 17 | that uses categorical cross-entropy loss. 18 | """ 19 | if mask is None: 20 | result = torch.nn.functional.softmax(vector, dim=dim) 21 | else: 22 | mask = mask.type(vector.dtype) 23 | while mask.dim() < vector.dim(): 24 | mask = mask.unsqueeze(1) 25 | # To limit numerical errors from large vector elements outside the mask, we zero these out. 
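        # Note that multiplying by the mask only zeroes the masked logits; those
        # positions still receive exp(0) = 1 inside the softmax. The actual masking
        # happens afterwards: masked probabilities are zeroed and the rest are
        # renormalized, with a small epsilon (larger for fp16) preventing a
        # division by zero when the whole vector is masked.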
26 | result = torch.nn.functional.softmax(vector * mask, dim=dim) 27 | result = result * mask 28 | result = result / (result.sum(dim=dim, keepdim=True) + (1e-7 if vector.dtype == torch.half else 1e-13)) 29 | return result 30 | -------------------------------------------------------------------------------- /common/utils/multi_task_dataloader.py: -------------------------------------------------------------------------------- 1 | from functools import reduce 2 | import operator 3 | from typing import List 4 | from torch.utils.data import DataLoader 5 | import sys 6 | 7 | INT_MAX = sys.maxsize 8 | 9 | 10 | def prod(iterable): 11 | if len(list(iterable)) > 0: 12 | return reduce(operator.mul, iterable) 13 | else: 14 | return 1 15 | 16 | 17 | class MultiTaskDataLoader(object): 18 | """ 19 | Multi-task DataLoader, the first dataloader is master dataloader 20 | """ 21 | def __init__(self, 22 | loaders: List[DataLoader]): 23 | assert len(loaders) > 1, "Less than 2 loader!" 24 | self.loaders = loaders 25 | self.iters = [iter(loader) for loader in loaders] 26 | self.lens = [len(loader) for loader in loaders] 27 | self.global_idx_in_cycle = 0 28 | 29 | def __iter__(self): 30 | if self.global_idx_in_cycle > 0: 31 | self.iters[0] = iter(self.loaders[0]) 32 | return self 33 | 34 | def __next__(self): 35 | output_tuple = (*next(self.iters[0]), ) 36 | for k, (loader, _iter) in enumerate(zip(self.loaders[1:], self.iters[1:])): 37 | if hasattr(loader.batch_sampler.sampler, 'set_epoch'): 38 | loader.batch_sampler.sampler.set_epoch(int(self.global_idx_in_cycle / self.lens[k+1])) 39 | try: 40 | output_tuple += (*next(_iter), ) 41 | except StopIteration: 42 | _iter = iter(loader) 43 | self.iters[k+1] = _iter 44 | output_tuple += (*next(_iter), ) 45 | 46 | if self.global_idx_in_cycle < INT_MAX - 1: 47 | self.global_idx_in_cycle += 1 48 | else: 49 | self.global_idx_in_cycle = 0 50 | 51 | return output_tuple 52 | 53 | def __len__(self): 54 | return self.lens[0] 55 | 56 | 57 | 58 | -------------------------------------------------------------------------------- /common/utils/pad_sequence.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def pad_sequence(sequence, lengths): 5 | """ 6 | :param sequence: [\sum b, .....] sequence 7 | :param lengths: [b1, b2, b3...] that sum to \sum b 8 | :return: [len(lengths), maxlen(b), .....] 
tensor 9 | """ 10 | 11 | output = sequence.new_zeros(len(lengths), max(lengths), *sequence.shape[1:]) 12 | start = 0 13 | for i, diff in enumerate(lengths): 14 | if diff > 0: 15 | output[i, :diff] = sequence[start:(start + diff)] 16 | start += diff 17 | return output 18 | -------------------------------------------------------------------------------- /common/utils/zipreader.py: -------------------------------------------------------------------------------- 1 | import zipfile 2 | import os 3 | import io 4 | import time 5 | from PIL import Image 6 | 7 | 8 | class ZipReader(object): 9 | zip_bank = dict() 10 | 11 | def __init__(self): 12 | super(ZipReader, self).__init__() 13 | 14 | @staticmethod 15 | def get_zipfile(path): 16 | zip_bank = ZipReader.zip_bank 17 | if path in zip_bank: 18 | return zip_bank[path] 19 | else: 20 | print("creating new zip_bank") 21 | zfile = zipfile.ZipFile(path, 'r') 22 | zip_bank[path] = zfile 23 | return zip_bank[path] 24 | 25 | @staticmethod 26 | def split_zip_style_path(path): 27 | pos_zip_at = path.index('.zip@') 28 | if pos_zip_at == len(path): 29 | print("character '@' is not found from the given path '%s'" % (path)) 30 | assert 0 31 | pos_at = pos_zip_at + len('.zip@') - 1 32 | 33 | zip_path = path[0: pos_at] 34 | folder_path = path[pos_at + 1:] 35 | folder_path = str.strip(folder_path, '/') 36 | return zip_path, folder_path 37 | 38 | @staticmethod 39 | def list_folder(path): 40 | zip_path, folder_path = ZipReader.split_zip_style_path(path) 41 | 42 | zfile = ZipReader.get_zipfile(zip_path) 43 | folder_list = [] 44 | for file_foler_name in zfile.namelist(): 45 | file_foler_name = str.strip(file_foler_name, '/') 46 | if file_foler_name.startswith(folder_path) and \ 47 | len(os.path.splitext(file_foler_name)[-1]) == 0 and \ 48 | file_foler_name != folder_path: 49 | if len(folder_path) == 0: 50 | folder_list.append(file_foler_name) 51 | else: 52 | folder_list.append(file_foler_name[len(folder_path)+1:]) 53 | 54 | return folder_list 55 | 56 | @staticmethod 57 | def list_files(path, extension=['.*']): 58 | zip_path, folder_path = ZipReader.split_zip_style_path(path) 59 | 60 | zfile = ZipReader.get_zipfile(zip_path) 61 | file_lists = [] 62 | for file_foler_name in zfile.namelist(): 63 | file_foler_name = str.strip(file_foler_name, '/') 64 | if file_foler_name.startswith(folder_path) and str.lower(os.path.splitext(file_foler_name)[-1]) in extension: 65 | if len(folder_path) == 0: 66 | file_lists.append(file_foler_name) 67 | else: 68 | file_lists.append(file_foler_name[len(folder_path)+1:]) 69 | 70 | return file_lists 71 | 72 | @staticmethod 73 | def imread(path): 74 | zip_path, path_img = ZipReader.split_zip_style_path(path) 75 | zfile = ZipReader.get_zipfile(zip_path) 76 | data = zfile.read(path_img) 77 | im = Image.open(io.BytesIO(data)) 78 | return im 79 | 80 | @staticmethod 81 | def read(path): 82 | zip_path, path_img = ZipReader.split_zip_style_path(path) 83 | zfile = ZipReader.get_zipfile(zip_path) 84 | data = zfile.read(path_img) 85 | return data 86 | -------------------------------------------------------------------------------- /data/PREPARE_DATA.md: -------------------------------------------------------------------------------- 1 | # Prepare Data 2 | 3 | Download datasets as you need, and organize them as following: 4 | ``` 5 | code_root/ 6 | └── data/ 7 | ├── conceptual-captions/ 8 | │ ├── train_image/ 9 | │   ├── val_image/ 10 | │ ├── train_frcnn/ 11 | │ ├── val_frcnn/ 12 | │ ├── train.json 13 | │ ├── val.json 14 | │ ├── train_frcnn.json 15 | │ └── 
val_frcnn.json 16 | ├── en_corpus/ 17 | │ ├── wiki.doc 18 | │ └── bc1g.doc 19 | ├── vcr/ 20 | │   ├── vcr1images/ 21 | │ ├── train.jsonl 22 | │ ├── val.jsonl 23 | │ └── test.jsonl 24 | └── coco/ 25 | ├── train2014/ 26 | ├── val2014/ 27 | ├── test2015/ 28 | ├── annotations/ 29 | ├── vqa/ 30 | ├── refcoco+/ 31 | │ └── proposal/ 32 | └── vgbua_res101_precomputed/ 33 | ├── trainval2014_resnet101_faster_rcnn_genome 34 | └── test2015_resnet101_faster_rcnn_genome 35 | 36 | ``` 37 | ## Pre-training Data 38 | 39 | ### Conceptual Captions 40 | See [ReadMe.txt](./conceptual-captions/ReadMe.txt). 41 | 42 | ### English Wikipedia & BooksCorpus 43 | * Wikipedia: [GoogleDrive](https://drive.google.com/file/d/1rZJ-Nj_SSqwu85tME3wbN8tfGhljfAsf/view?usp=sharing) / [BaiduPan](https://pan.baidu.com/s/1HSgUZXRESxVnx9ATOHwSrQ) 44 | * BooksCorpus: [GoogleDrive](https://drive.google.com/file/d/16T5EYqIjO-tAj1OFxz6bnnzEABCusCcv/view?usp=sharing) / [BaiduPan](https://pan.baidu.com/s/1797WFFUTnRJakgGxefSrBg) 45 | 46 | ## Fine-tuning Data 47 | 48 | ### VCR 49 | * Download and unzip images & annotations from [here](https://visualcommonsense.com/download/). 50 | 51 | ### VQA & RefCOCO+ 52 | 53 | #### Common 54 | * Download and unzip COCO 2014 images & annotations from [here](http://cocodataset.org/#download). 55 | 56 | #### VQA 57 | * Download and unzip annotations from [here](https://visualqa.org/download.html) (including "VQA Annotations" and "VQA Input Questions"), 58 | place all these files directly under ```./data/coco/vqa```. 59 | * Download and unzip following precomputed boxes & features into ```./data/coco/vgbua_res101_precomputed```. 60 | * train2014 + val2014: [GoogleDrive](https://drive.google.com/file/d/1KyLyqTqBsMX7QtLTma0xFrmhAzdQDUed/view?usp=sharing) / [BaiduPan](https://pan.baidu.com/s/1Udtoi2TC-nAimZf-vLC9PQ) 61 | * test2015: [GoogleDrive](https://drive.google.com/file/d/10nM3kRz2c827aqwVvLnv430YYFp0po6O/view?usp=sharing) / [BaiduPan](https://pan.baidu.com/s/1wd3rWfPWLBhGkEc10N9e1Q) 62 | 63 | * Download answer vocabulary from [GoogleDrive](https://drive.google.com/file/d/1CPnYcOgIOP5CZkp_KChuCg54_Ljr6-fp/view?usp=sharing) / [BaiduPan](https://pan.baidu.com/s/1IvPsH-mmqHi2glgznaBuYw), place it under the folder ```./data/coco/vqa/```. 64 | 65 | #### RefCOCO+ 66 | 67 | * Download and unzip [annotations](http://bvisionweb1.cs.unc.edu/licheng/referit/data/refcoco+.zip), place all files in ```refcoco+/``` directly under ```./data/coco/refcoco+```. 68 | * Download [region proposals](http://bvision.cs.unc.edu/licheng/MattNet/detections.zip), place all files in ```detections/refcoco+_unc``` directly under ```./data/coco/refcoco+/proposal```. -------------------------------------------------------------------------------- /data/conceptual-captions/ReadMe.txt: -------------------------------------------------------------------------------- 1 | 0. create a python 2.7 conda environment: 2 | 3 | conda create -n cc python=2.7 pip 4 | conda activate cc 5 | pip install Cython numpy Pillow 6 | 7 | 1. download "Train_GCC-training.tsv" and "Validation_GCC-1.1.0-Validation.tsv" from 8 | https://ai.google.com/research/ConceptualCaptions/download 9 | 10 | 2. move "Train_GCC-training.tsv" and "Validation_GCC-1.1.0-Validation.tsv" into 11 | conceptual-captions/utils/ 12 | 13 | 3. cd to conceptual-captions/utils/ 14 | 15 | 4. python gen_train4download.py 16 | python gen_val4download.py 17 | 18 | 5. 
sh download_train.sh 19 | sh download_val.sh 20 | 21 | * you may need to run these commands multiple times to avoid temporary network failures and download as more images as possible 22 | * these commands will skip already successfully downloaded images, so don't worry about wasting time 23 | 24 | 6. 1) zip (without compression) "train_image" by 25 | 26 | cd ../train_image 27 | zip -0 ../train_image.zip ./* 28 | cd ../utils/ 29 | 30 | 2) zip (without compression) "val_image" by 31 | 32 | cd ../val_image 33 | zip -0 ../val_image.zip ./* 34 | cd ../utils/ 35 | 36 | 7. python gen_train_image_json.py 37 | python gen_val_image_json.py 38 | 39 | 40 | 8. git clone https://github.com/jackroos/bottom-up-attention and follow "Installation" : 41 | 42 | 1) Build the Cython modules 43 | 44 | cd $REPO_ROOT/lib 45 | make 46 | 47 | 2) Build Caffe and pycaffe 48 | 49 | cd $REPO_ROOT/caffe 50 | # Now follow the Caffe installation instructions here: 51 | # http://caffe.berkeleyvision.org/installation.html 52 | 53 | # If you're experienced with Caffe and have all of the requirements installed 54 | # and your Makefile.config in place, then simply do: 55 | make -j8 && make pycaffe 56 | 57 | 3) Download pretrained model (https://www.dropbox.com/s/5xethd2nxa8qrnq/resnet101_faster_rcnn_final.caffemodel?dl=1), and put it under data/faster_rcnn_models. 58 | 59 | 9. python ./tools/generate_tsv_v2.py --gpu 0,1,2,3,4,5,6,7 --cfg experiments/cfgs/faster_rcnn_end2end_resnet.yml --def models/vg/ResNet-101/faster_rcnn_end2end_final/test.prototxt --net data/faster_rcnn_models/resnet101_faster_rcnn_final.caffemodel --split conceptual_captions_train --data_root {Conceptual_Captions_Root} --out {Conceptual_Captions_Root}/train_frcnn/ 60 | 61 | python ./tools/generate_tsv_v2.py --gpu 0,1,2,3,4,5,6,7 --cfg experiments/cfgs/faster_rcnn_end2end_resnet.yml --def models/vg/ResNet-101/faster_rcnn_end2end_final/test.prototxt --net data/faster_rcnn_models/resnet101_faster_rcnn_final.caffemodel --split conceptual_captions_val --data_root {Conceptual_Captions_Root} --out {Conceptual_Captions_Root}/val_frcnn/ 62 | 63 | 10. zip (without compression) "train_frcnn" and "val_frcnn" similar to step 6. 
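    e.g. (assuming the train_frcnn/ and val_frcnn/ folders produced in step 9 sit next to train_image/ and val_image/):

        cd ../train_frcnn
        zip -0 ../train_frcnn.zip ./*
        cd ../utils/

        cd ../val_frcnn
        zip -0 ../val_frcnn.zip ./*
        cd ../utils/

    so that the archives match the "train_frcnn.zip@/..." and "val_frcnn.zip@/..." paths written by gen_train_image_json.py and gen_val_image_json.py.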
64 | -------------------------------------------------------------------------------- /data/conceptual-captions/utils/check_valid.py: -------------------------------------------------------------------------------- 1 | import sys 2 | from PIL import Image 3 | 4 | import warnings 5 | 6 | warnings.filterwarnings("ignore", "(Possibly )?corrupt EXIF data", UserWarning) 7 | 8 | try: 9 | im = Image.open(sys.argv[1]).convert('RGB') 10 | # remove images with too small or too large size 11 | if (im.size[0] < 10 or im.size[1] < 10 or im.size[0] > 10000 or im.size[1] > 10000): 12 | raise Exception('') 13 | except: 14 | print(sys.argv[1]) 15 | -------------------------------------------------------------------------------- /data/conceptual-captions/utils/download_train.sh: -------------------------------------------------------------------------------- 1 | # use 20 threads 2 | 3 | cat train4download.txt | xargs -n 2 -P 20 wget -nc -U 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17' --timeout=1 --waitretry=0 --tries=5 --retry-connrefused -nv -O 4 | find ../train_image -type f -size -1c -exec rm {} \; 5 | ls -d ../train_image/* | xargs -n 1 -P 20 python check_valid.py | tee train_size_invalid.txt 6 | xargs rm < train_size_invalid.txt 7 | rm train_size_invalid.txt 8 | ls ../train_image > train_valid.txt 9 | -------------------------------------------------------------------------------- /data/conceptual-captions/utils/download_val.sh: -------------------------------------------------------------------------------- 1 | # use 20 threads 2 | 3 | cat val4download.txt | xargs -n 2 -P 20 wget -nc -U 'Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17' --timeout=1 --waitretry=0 --tries=5 --retry-connrefused -nv -O 4 | find ../val_image -type f -size -1c -exec rm {} \; 5 | ls -d ../val_image/* | xargs -n 1 -P 20 python check_valid.py | tee val_size_invalid.txt 6 | xargs rm < val_size_invalid.txt 7 | rm val_size_invalid.txt 8 | ls ../val_image > val_valid.txt 9 | -------------------------------------------------------------------------------- /data/conceptual-captions/utils/gen_train4download.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | captions = [] 4 | urls = [] 5 | 6 | with open('Train_GCC-training.tsv') as fp: 7 | for cnt, line in enumerate(fp): 8 | s = line.split('\t') 9 | captions.append(s[0].split(' ')) 10 | urls.append(s[1][:-1]) 11 | 12 | with open('train4download.txt', 'w') as fp: 13 | for cnt, url in enumerate(urls): 14 | fp.write("../train_image/{:08d}.jpg\t\"{}\"\n".format(cnt, url)) 15 | 16 | if not os.path.exists('../train_image'): 17 | os.makedirs('../train_image') -------------------------------------------------------------------------------- /data/conceptual-captions/utils/gen_train_image_json.py: -------------------------------------------------------------------------------- 1 | captions = [] 2 | urls = [] 3 | 4 | with open('Train_GCC-training.tsv') as fp: 5 | for cnt, line in enumerate(fp): 6 | s = line.split('\t') 7 | captions.append(s[0].split(' ')) 8 | urls.append(s[1][:-1]) 9 | 10 | valids = set([]) 11 | with open('train_valid.txt') as fp: 12 | for cnt, line in enumerate(fp): 13 | valids.add(line[:-1]) 14 | 15 | import json 16 | with open('train.json', 'w') as outfile: 17 | for cnt, (cap, url) in enumerate(zip(captions, urls)): 18 | im = "{:08d}.jpg".format(cnt) 19 | if (im in valids): 20 | d = 
{'image':"train_image.zip@/{}".format(im), 'caption':cap} 21 | json.dump(d, outfile) 22 | outfile.write('\n') 23 | 24 | 25 | import json 26 | with open('train_frcnn.json', 'w') as outfile: 27 | for cnt, (cap, url) in enumerate(zip(captions, urls)): 28 | im = "{:08d}.jpg".format(cnt) 29 | if (im in valids): 30 | d = {'image':"train_image.zip@/{}".format(im), 'caption':cap, 'frcnn':"train_frcnn.zip@/{:08d}.json".format(cnt)} 31 | json.dump(d, outfile) 32 | outfile.write('\n') -------------------------------------------------------------------------------- /data/conceptual-captions/utils/gen_val4download.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | captions = [] 4 | urls = [] 5 | 6 | with open('Validation_GCC-1.1.0-Validation.tsv') as fp: 7 | for cnt, line in enumerate(fp): 8 | s = line.split('\t') 9 | captions.append(s[0].split(' ')) 10 | urls.append(s[1][:-1]) 11 | 12 | with open('val4download.txt', 'w') as fp: 13 | for cnt, url in enumerate(urls): 14 | fp.write("../val_image/{:08d}.jpg\t\"{}\"\n".format(cnt, url)) 15 | 16 | if not os.path.exists('../val_image'): 17 | os.makedirs('../val_image') -------------------------------------------------------------------------------- /data/conceptual-captions/utils/gen_val_image_json.py: -------------------------------------------------------------------------------- 1 | captions = [] 2 | urls = [] 3 | 4 | with open('Validation_GCC-1.1.0-Validation.tsv') as fp: 5 | for cnt, line in enumerate(fp): 6 | s = line.split('\t') 7 | captions.append(s[0].split(' ')) 8 | urls.append(s[1][:-1]) 9 | 10 | valids = set([]) 11 | with open('val_valid.txt') as fp: 12 | for cnt, line in enumerate(fp): 13 | valids.add(line[:-1]) 14 | 15 | import json 16 | with open('val.json', 'w') as outfile: 17 | for cnt, (cap, url) in enumerate(zip(captions, urls)): 18 | im = "{:08d}.jpg".format(cnt) 19 | if (im in valids): 20 | d = {'image':"val_image.zip@/{}".format(im), 'caption':cap} 21 | json.dump(d, outfile) 22 | outfile.write('\n') 23 | 24 | import json 25 | with open('val_frcnn.json', 'w') as outfile: 26 | for cnt, (cap, url) in enumerate(zip(captions, urls)): 27 | im = "{:08d}.jpg".format(cnt) 28 | if (im in valids): 29 | d = {'image':"val_image.zip@/{}".format(im), 'caption':cap, 'frcnn':"val_frcnn.zip@/{:08d}.json".format(cnt)} 30 | json.dump(d, outfile) 31 | outfile.write('\n') -------------------------------------------------------------------------------- /external/pytorch_pretrained_bert/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.6.0" 2 | from .tokenization import BertTokenizer, BasicTokenizer, WordpieceTokenizer 3 | from .tokenization_openai import OpenAIGPTTokenizer 4 | from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus) 5 | from .tokenization_gpt2 import GPT2Tokenizer 6 | 7 | from .modeling import (BertConfig, BertModel, BertForPreTraining, 8 | BertForMaskedLM, BertForNextSentencePrediction, 9 | BertForSequenceClassification, BertForMultipleChoice, 10 | BertForTokenClassification, BertForQuestionAnswering, 11 | load_tf_weights_in_bert) 12 | from .modeling_openai import (OpenAIGPTConfig, OpenAIGPTModel, 13 | OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel, 14 | load_tf_weights_in_openai_gpt) 15 | from .modeling_transfo_xl import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel, 16 | load_tf_weights_in_transfo_xl) 17 | from .modeling_gpt2 import (GPT2Config, GPT2Model, 18 | GPT2LMHeadModel, 
GPT2DoubleHeadsModel, 19 | load_tf_weights_in_gpt2) 20 | 21 | from .optimization import BertAdam 22 | from .optimization_openai import OpenAIAdam 23 | 24 | from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE, cached_path 25 | -------------------------------------------------------------------------------- /external/pytorch_pretrained_bert/convert_gpt2_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HugginFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | from __future__ import absolute_import, division, print_function 18 | 19 | import argparse 20 | from io import open 21 | 22 | import torch 23 | 24 | from external.pytorch_pretrained_bert.modeling_gpt2 import (CONFIG_NAME, WEIGHTS_NAME, 25 | GPT2Config, 26 | GPT2Model, 27 | load_tf_weights_in_gpt2) 28 | 29 | 30 | def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, pytorch_dump_folder_path): 31 | # Construct model 32 | if gpt2_config_file == "": 33 | config = GPT2Config() 34 | else: 35 | config = GPT2Config(gpt2_config_file) 36 | model = GPT2Model(config) 37 | 38 | # Load weights from numpy 39 | load_tf_weights_in_gpt2(model, gpt2_checkpoint_path) 40 | 41 | # Save pytorch-model 42 | pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME 43 | pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME 44 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 45 | torch.save(model.state_dict(), pytorch_weights_dump_path) 46 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 47 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 48 | f.write(config.to_json_string()) 49 | 50 | 51 | if __name__ == "__main__": 52 | parser = argparse.ArgumentParser() 53 | ## Required parameters 54 | parser.add_argument("--gpt2_checkpoint_path", 55 | default = None, 56 | type = str, 57 | required = True, 58 | help = "Path the TensorFlow checkpoint path.") 59 | parser.add_argument("--pytorch_dump_folder_path", 60 | default = None, 61 | type = str, 62 | required = True, 63 | help = "Path to the output PyTorch model.") 64 | parser.add_argument("--gpt2_config_file", 65 | default = "", 66 | type = str, 67 | help = "An optional config json file corresponding to the pre-trained OpenAI model. \n" 68 | "This specifies the model architecture.") 69 | args = parser.parse_args() 70 | convert_gpt2_checkpoint_to_pytorch(args.gpt2_checkpoint_path, 71 | args.gpt2_config_file, 72 | args.pytorch_dump_folder_path) 73 | -------------------------------------------------------------------------------- /external/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HugginFace Inc. team. 
3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert OpenAI GPT checkpoint.""" 16 | 17 | from __future__ import absolute_import, division, print_function 18 | 19 | import argparse 20 | from io import open 21 | 22 | import torch 23 | 24 | from external.pytorch_pretrained_bert.modeling_openai import (CONFIG_NAME, WEIGHTS_NAME, 25 | OpenAIGPTConfig, 26 | OpenAIGPTModel, 27 | load_tf_weights_in_openai_gpt) 28 | 29 | 30 | def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_config_file, pytorch_dump_folder_path): 31 | # Construct model 32 | if openai_config_file == "": 33 | config = OpenAIGPTConfig() 34 | else: 35 | config = OpenAIGPTConfig(openai_config_file) 36 | model = OpenAIGPTModel(config) 37 | 38 | # Load weights from numpy 39 | load_tf_weights_in_openai_gpt(model, openai_checkpoint_folder_path) 40 | 41 | # Save pytorch-model 42 | pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME 43 | pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME 44 | print("Save PyTorch model to {}".format(pytorch_weights_dump_path)) 45 | torch.save(model.state_dict(), pytorch_weights_dump_path) 46 | print("Save configuration file to {}".format(pytorch_config_dump_path)) 47 | with open(pytorch_config_dump_path, "w", encoding="utf-8") as f: 48 | f.write(config.to_json_string()) 49 | 50 | 51 | if __name__ == "__main__": 52 | parser = argparse.ArgumentParser() 53 | ## Required parameters 54 | parser.add_argument("--openai_checkpoint_folder_path", 55 | default = None, 56 | type = str, 57 | required = True, 58 | help = "Path the TensorFlow checkpoint path.") 59 | parser.add_argument("--pytorch_dump_folder_path", 60 | default = None, 61 | type = str, 62 | required = True, 63 | help = "Path to the output PyTorch model.") 64 | parser.add_argument("--openai_config_file", 65 | default = "", 66 | type = str, 67 | help = "An optional config json file corresponding to the pre-trained OpenAI model. \n" 68 | "This specifies the model architecture.") 69 | args = parser.parse_args() 70 | convert_openai_checkpoint_to_pytorch(args.openai_checkpoint_folder_path, 71 | args.openai_config_file, 72 | args.pytorch_dump_folder_path) 73 | -------------------------------------------------------------------------------- /external/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The HugginFace Inc. team. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | """Convert BERT checkpoint.""" 16 | 17 | from __future__ import absolute_import 18 | from __future__ import division 19 | from __future__ import print_function 20 | 21 | import argparse 22 | import torch 23 | 24 | from external.pytorch_pretrained_bert.modeling import BertConfig, BertForPreTraining, load_tf_weights_in_bert 25 | 26 | def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path): 27 | # Initialise PyTorch model 28 | config = BertConfig.from_json_file(bert_config_file) 29 | print("Building PyTorch model from configuration: {}".format(str(config))) 30 | model = BertForPreTraining(config) 31 | 32 | # Load weights from tf checkpoint 33 | load_tf_weights_in_bert(model, tf_checkpoint_path) 34 | 35 | # Save pytorch-model 36 | print("Save PyTorch model to {}".format(pytorch_dump_path)) 37 | torch.save(model.state_dict(), pytorch_dump_path) 38 | 39 | 40 | if __name__ == "__main__": 41 | parser = argparse.ArgumentParser() 42 | ## Required parameters 43 | parser.add_argument("--tf_checkpoint_path", 44 | default = None, 45 | type = str, 46 | required = True, 47 | help = "Path the TensorFlow checkpoint path.") 48 | parser.add_argument("--bert_config_file", 49 | default = None, 50 | type = str, 51 | required = True, 52 | help = "The config json file corresponding to the pre-trained BERT model. \n" 53 | "This specifies the model architecture.") 54 | parser.add_argument("--pytorch_dump_path", 55 | default = None, 56 | type = str, 57 | required = True, 58 | help = "Path to the output PyTorch model.") 59 | args = parser.parse_args() 60 | convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path, 61 | args.bert_config_file, 62 | args.pytorch_dump_path) 63 | -------------------------------------------------------------------------------- /figs/attention_viz.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackroos/VL-BERT/4373674cbf2bcd6c09a2c26abfdb6705b870e3be/figs/attention_viz.png -------------------------------------------------------------------------------- /figs/pretrain.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackroos/VL-BERT/4373674cbf2bcd6c09a2c26abfdb6705b870e3be/figs/pretrain.png -------------------------------------------------------------------------------- /model/pretrained_model/PREPARE_PRETRAINED_MODELS.md: -------------------------------------------------------------------------------- 1 | # Prepare Pre-trained Models 2 | Download pre-trained models and organize them as following: 3 | ``` 4 | code_root/ 5 | └── model/ 6 | └── pretrained_model/ 7 | ├── vl-bert-base-e2e.model 8 |    ├── vl-bert-large-e2e.model 9 | ├── vl-bert-base-prec.model 10 | ├── vl-bert-large-prec.model 11 | ├── bert-base-uncased/ 12 | │ ├── vocab.txt 13 |    │ ├── bert_config.json 14 | │ └── pytorch_model.bin 15 | ├── bert-large-uncased/ 16 | │ ├── vocab.txt 17 |    │ ├── bert_config.json 18 | │ └── pytorch_model.bin 19 | └── resnet101-pt-vgbua-0000.model 20 | ``` 21 | 22 | 23 | ## VL-BERT 24 | 25 | | Model Name | Download Link | 26 | | ------------------ | --------------- | 27 | | vl-bert-base-e2e | [GoogleDrive](https://drive.google.com/file/d/1jjV1ARYMs37tOaBalhJmwq7LcWeMai96/view?usp=sharing) / [BaiduPan](https://pan.baidu.com/s/1rl0Hl-iZZHL-3fj8hE_Uug) | 28 | | vl-bert-large-e2e | 
[GoogleDrive](https://drive.google.com/file/d/1YTHWWyP7Kq6zPySoEcTs3STaQdc5OJ7f/view?usp=sharing) / [BaiduPan](https://pan.baidu.com/s/1yqpDZRuGLsRXpklDgSC_Jw) | 29 | | vl-bert-base-prec | [GoogleDrive](https://drive.google.com/file/d/1YBFsyoWwz83VPzbimKymSBxE37gYtfgh/view?usp=sharing) / [BaiduPan](https://pan.baidu.com/s/1SvGbE2cjw8jEGWwSfJBFQQ) | 30 | | vl-bert-large-prec | [GoogleDrive](https://drive.google.com/file/d/1REZLN7c3JCHVFoi_nEO-Nn6A4PTKIygG/view?usp=sharing) / [BaiduPan](https://pan.baidu.com/s/1k4eQe2rGGGVD24ZksJteNA) | 31 | 32 | ***Note***: models with suffix "e2e" means parameters of Fast-RCNN is tuned during pre-training, 33 | while "prec" means Fast-RCNN is fixed during pre-training and for effeciency the visual features is precomputed using 34 | [bottom-up-attention](https://github.com/peteanderson80/bottom-up-attention). 35 | 36 | ## BERT & ResNet 37 | 38 | Download following pre-trained BERT and ResNet and place them under this folder. 39 | 40 | * BERT: [GoogleDrive](https://drive.google.com/file/d/14VceZht89V5i54-_xWiw58Rosa5NDL2H/view?usp=sharing) / [BaiduPan](https://pan.baidu.com/s/1dyYcw50eZznL02ilG676Yw) 41 | * ResNet101 pretrained on Visual Genome: 42 | [GoogleDrive](https://drive.google.com/file/d/1qJYtsGw1SfAyvknDZeRBnp2cF4VNjiDE/view?usp=sharing) / [BaiduPan](https://pan.baidu.com/s/1_yfZG8VqbWmp5Kr9w2DKGQ) 43 | (converted from [caffe model](https://www.dropbox.com/s/wqada4qiv1dz9dk/resnet101_faster_rcnn_final.caffemodel?dl=1)) -------------------------------------------------------------------------------- /pretrain/_init_paths.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | this_dir = os.path.abspath(os.path.dirname(__file__)) 5 | 6 | 7 | def add_path(path): 8 | if path not in sys.path: 9 | sys.path.insert(0, path) 10 | 11 | 12 | root_path = os.path.join(this_dir, '../') 13 | add_path(root_path) 14 | -------------------------------------------------------------------------------- /pretrain/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackroos/VL-BERT/4373674cbf2bcd6c09a2c26abfdb6705b870e3be/pretrain/data/__init__.py -------------------------------------------------------------------------------- /pretrain/data/collate_batch.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from common.utils.clip_pad import * 3 | 4 | 5 | class BatchCollator(object): 6 | def __init__(self, dataset, append_ind=False): 7 | self.dataset = dataset 8 | self.test_mode = self.dataset.test_mode 9 | self.data_names = self.dataset.data_names 10 | self.append_ind = append_ind 11 | 12 | def __call__(self, batch): 13 | if not isinstance(batch, list): 14 | batch = list(batch) 15 | 16 | if 'image' in self.data_names: 17 | if batch[0][self.data_names.index('image')] is not None: 18 | max_shape = tuple(max(s) for s in zip(*[data[self.data_names.index('image')].shape for data in batch])) 19 | image_none = False 20 | else: 21 | image_none = True 22 | if 'boxes' in self.data_names: 23 | max_boxes = max([data[self.data_names.index('boxes')].shape[0] for data in batch]) 24 | if 'text' in self.data_names: 25 | max_text_length = max([len(data[self.data_names.index('text')]) for data in batch]) 26 | 27 | for i, ibatch in enumerate(batch): 28 | out = {} 29 | 30 | if 'image' in self.data_names: 31 | if image_none: 32 | out['image'] = None 33 | else: 34 | image = 
ibatch[self.data_names.index('image')] 35 | out['image'] = clip_pad_images(image, max_shape, pad=0) 36 | 37 | if 'boxes' in self.data_names: 38 | boxes = ibatch[self.data_names.index('boxes')] 39 | out['boxes'] = clip_pad_boxes(boxes, max_boxes, pad=-2) 40 | 41 | if 'text' in self.data_names: 42 | text = ibatch[self.data_names.index('text')] 43 | out['text'] = clip_pad_1d(text, max_text_length, pad=0) 44 | 45 | if 'mlm_labels' in self.data_names: 46 | mlm_labels = ibatch[self.data_names.index('mlm_labels')] 47 | out['mlm_labels'] = clip_pad_1d(mlm_labels, max_text_length, pad=-1) 48 | 49 | if 'mvrc_ops' in self.data_names: 50 | mvrc_ops = ibatch[self.data_names.index('mvrc_ops')] 51 | out['mvrc_ops'] = clip_pad_1d(mvrc_ops, max_boxes, pad=0) 52 | 53 | if 'mvrc_labels' in self.data_names: 54 | mvrc_labels = ibatch[self.data_names.index('mvrc_labels')] 55 | out['mvrc_labels'] = clip_pad_boxes(mvrc_labels, max_boxes, pad=0) 56 | 57 | other_names = [data_name for data_name in self.data_names if data_name not in out] 58 | for name in other_names: 59 | out[name] = torch.as_tensor(ibatch[self.data_names.index(name)]) 60 | 61 | batch[i] = tuple(out[data_name] for data_name in self.data_names) 62 | if self.append_ind: 63 | batch[i] += (torch.tensor(i, dtype=torch.int64),) 64 | 65 | out_tuple = () 66 | for items in zip(*batch): 67 | if items[0] is None: 68 | out_tuple += (None,) 69 | else: 70 | out_tuple += (torch.stack(tuple(items), dim=0), ) 71 | 72 | return out_tuple 73 | 74 | -------------------------------------------------------------------------------- /pretrain/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .conceptual_captions import ConceptualCaptionsDataset 2 | from .coco_captions import COCOCaptionsDataset 3 | from .general_corpus import GeneralCorpus 4 | 5 | 6 | -------------------------------------------------------------------------------- /pretrain/data/samplers/__init__.py: -------------------------------------------------------------------------------- 1 | from .distributed import DistributedSampler 2 | from .grouped_batch_sampler import GroupedBatchSampler 3 | 4 | -------------------------------------------------------------------------------- /pretrain/data/samplers/distributed.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | # Code is copy-pasted exactly as in torch.utils.data.distributed. 3 | # FIXME remove this once c10d fixes the bug it has 4 | import math 5 | import torch 6 | import torch.distributed as dist 7 | from torch.utils.data.sampler import Sampler 8 | 9 | 10 | class DistributedSampler(Sampler): 11 | """Sampler that restricts data loading to a subset of the dataset. 12 | It is especially useful in conjunction with 13 | :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each 14 | process can pass a DistributedSampler instance as a DataLoader sampler, 15 | and load a subset of the original dataset that is exclusive to it. 16 | .. note:: 17 | Dataset is assumed to be of constant size. 18 | Arguments: 19 | dataset: Dataset used for sampling. 20 | num_replicas (optional): Number of processes participating in 21 | distributed training. 22 | rank (optional): Rank of the current process within num_replicas. 
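        shuffle (optional): Whether to reshuffle the indices every epoch.
            Shuffling is seeded with the current epoch, so call set_epoch()
            at the start of each epoch to obtain a new deterministic order.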
23 | """ 24 | 25 | def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True): 26 | if num_replicas is None: 27 | if not dist.is_available(): 28 | raise RuntimeError("Requires distributed package to be available") 29 | num_replicas = dist.get_world_size() 30 | if rank is None: 31 | if not dist.is_available(): 32 | raise RuntimeError("Requires distributed package to be available") 33 | rank = dist.get_rank() 34 | self.dataset = dataset 35 | self.num_replicas = num_replicas 36 | self.rank = rank 37 | self.epoch = 0 38 | self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) 39 | self.total_size = self.num_samples * self.num_replicas 40 | self.shuffle = shuffle 41 | 42 | def __iter__(self): 43 | if self.shuffle: 44 | # deterministically shuffle based on epoch 45 | g = torch.Generator() 46 | g.manual_seed(self.epoch) 47 | indices = torch.randperm(len(self.dataset), generator=g).tolist() 48 | else: 49 | indices = torch.arange(len(self.dataset)).tolist() 50 | 51 | # add extra samples to make it evenly divisible 52 | indices += indices[: (self.total_size - len(indices))] 53 | assert len(indices) == self.total_size 54 | 55 | # subsample 56 | offset = self.num_samples * self.rank 57 | indices = indices[offset : offset + self.num_samples] 58 | assert len(indices) == self.num_samples 59 | 60 | return iter(indices) 61 | 62 | def __len__(self): 63 | return self.num_samples 64 | 65 | def set_epoch(self, epoch): 66 | self.epoch = epoch -------------------------------------------------------------------------------- /pretrain/data/transforms/__init__.py: -------------------------------------------------------------------------------- 1 | from .transforms import Compose 2 | from .transforms import Resize 3 | from .transforms import RandomHorizontalFlip 4 | from .transforms import ToTensor 5 | from .transforms import Normalize 6 | 7 | from .build import build_transforms 8 | -------------------------------------------------------------------------------- /pretrain/data/transforms/build.py: -------------------------------------------------------------------------------- 1 | from . 
import transforms as T 2 | 3 | 4 | def build_transforms(cfg, mode='train'): 5 | assert mode in ['train', 'test', 'val'] 6 | min_size = cfg.SCALES[0] 7 | max_size = cfg.SCALES[1] 8 | assert min_size <= max_size 9 | 10 | if mode == 'train': 11 | flip_prob = cfg.TRAIN.FLIP_PROB 12 | elif mode == 'test': 13 | flip_prob = cfg.TEST.FLIP_PROB 14 | else: 15 | flip_prob = cfg.VAL.FLIP_PROB 16 | 17 | to_bgr255 = True 18 | 19 | normalize_transform = T.Normalize( 20 | mean=cfg.NETWORK.PIXEL_MEANS, std=cfg.NETWORK.PIXEL_STDS, to_bgr255=to_bgr255 21 | ) 22 | 23 | # transform = T.Compose( 24 | # [ 25 | # T.Resize(min_size, max_size), 26 | # T.RandomHorizontalFlip(flip_prob), 27 | # T.ToTensor(), 28 | # normalize_transform, 29 | # T.FixPadding(min_size, max_size, pad=0) 30 | # ] 31 | # ) 32 | 33 | transform = T.Compose( 34 | [ 35 | T.Resize(min_size, max_size), 36 | T.RandomHorizontalFlip(flip_prob), 37 | T.ToTensor(), 38 | normalize_transform, 39 | ] 40 | ) 41 | 42 | return transform 43 | -------------------------------------------------------------------------------- /pretrain/function/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackroos/VL-BERT/4373674cbf2bcd6c09a2c26abfdb6705b870e3be/pretrain/function/__init__.py -------------------------------------------------------------------------------- /pretrain/function/val.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | import torch 3 | from common.trainer import to_cuda 4 | 5 | 6 | @torch.no_grad() 7 | def do_validation(net, val_loader, metrics, label_index_in_batch): 8 | net.eval() 9 | metrics.reset() 10 | for nbatch, batch in enumerate(val_loader): 11 | batch = to_cuda(batch) 12 | outputs, _ = net(*batch) 13 | metrics.update(outputs) 14 | 15 | -------------------------------------------------------------------------------- /pretrain/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .resnet_vlbert_for_pretraining import ResNetVLBERTForPretraining 2 | from .resnet_vlbert_for_pretraining_multitask import ResNetVLBERTForPretrainingMultitask 3 | from .resnet_vlbert_for_attention_vis import ResNetVLBERTForAttentionVis 4 | 5 | 6 | -------------------------------------------------------------------------------- /pretrain/train_end2end.py: -------------------------------------------------------------------------------- 1 | import _init_paths 2 | import os 3 | import argparse 4 | import torch 5 | import subprocess 6 | 7 | from pretrain.function.config import config, update_config 8 | from pretrain.function.train import train_net 9 | 10 | 11 | def parse_args(): 12 | parser = argparse.ArgumentParser('Train Cognition Network') 13 | parser.add_argument('--cfg', type=str, help='path to config file') 14 | parser.add_argument('--model-dir', type=str, help='root path to store checkpoint') 15 | parser.add_argument('--log-dir', type=str, help='tensorboard log dir') 16 | parser.add_argument('--dist', help='whether to use distributed training', default=False, action='store_true') 17 | parser.add_argument('--slurm', help='whether this is a slurm job', default=False, action='store_true') 18 | parser.add_argument('--do-test', help='whether to generate csv result on test set', 19 | default=False, action='store_true') 20 | parser.add_argument('--cudnn-off', help='disable cudnn', default=False, action='store_true') 21 | 22 | args = parser.parse_args() 23 | 24 | 
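    # Merge the YAML file passed via --cfg into the global config; if --model-dir is
    # given, checkpoints are written under <model-dir>/<OUTPUT_PATH from the config>.
    # The SLURM branch below translates SLURM_* environment variables into the
    # MASTER_ADDR / MASTER_PORT / WORLD_SIZE / RANK / LOCAL_RANK variables expected
    # for distributed initialization.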
if args.cfg is not None: 25 | update_config(args.cfg) 26 | if args.model_dir is not None: 27 | config.OUTPUT_PATH = os.path.join(args.model_dir, config.OUTPUT_PATH) 28 | 29 | if args.slurm: 30 | proc_id = int(os.environ['SLURM_PROCID']) 31 | ntasks = int(os.environ['SLURM_NTASKS']) 32 | node_list = os.environ['SLURM_NODELIST'] 33 | num_gpus = torch.cuda.device_count() 34 | addr = subprocess.getoutput( 35 | 'scontrol show hostname {} | head -n1'.format(node_list)) 36 | os.environ['MASTER_PORT'] = str(29500) 37 | os.environ['MASTER_ADDR'] = addr 38 | os.environ['WORLD_SIZE'] = str(ntasks) 39 | os.environ['RANK'] = str(proc_id) 40 | os.environ['LOCAL_RANK'] = str(proc_id % num_gpus) 41 | 42 | return args, config 43 | 44 | 45 | def main(): 46 | args, config = parse_args() 47 | rank, model = train_net(args, config) 48 | 49 | 50 | if __name__ == '__main__': 51 | main() 52 | 53 | 54 | -------------------------------------------------------------------------------- /pretrain/vis_attention_maps.py: -------------------------------------------------------------------------------- 1 | import _init_paths 2 | import os 3 | import argparse 4 | import torch 5 | import subprocess 6 | 7 | from pretrain.function.config import config, update_config 8 | from pretrain.function.vis import vis_net 9 | 10 | 11 | def parse_args(): 12 | parser = argparse.ArgumentParser('Visualize Attention Maps') 13 | parser.add_argument('--cfg', type=str, help='path to config file') 14 | parser.add_argument('--dist', help='whether to use distributed training', default=False, action='store_true') 15 | parser.add_argument('--slurm', help='whether this is a slurm job', default=False, action='store_true') 16 | parser.add_argument('--save-dir', help='directory to save attention maps', type=str, default='./attention_maps') 17 | 18 | args = parser.parse_args() 19 | 20 | if args.cfg is not None: 21 | update_config(args.cfg) 22 | 23 | if args.slurm: 24 | proc_id = int(os.environ['SLURM_PROCID']) 25 | ntasks = int(os.environ['SLURM_NTASKS']) 26 | node_list = os.environ['SLURM_NODELIST'] 27 | num_gpus = torch.cuda.device_count() 28 | addr = subprocess.getoutput( 29 | 'scontrol show hostname {} | head -n1'.format(node_list)) 30 | os.environ['MASTER_PORT'] = str(29500) 31 | os.environ['MASTER_ADDR'] = addr 32 | os.environ['WORLD_SIZE'] = str(ntasks) 33 | os.environ['RANK'] = str(proc_id) 34 | os.environ['LOCAL_RANK'] = str(proc_id % num_gpus) 35 | 36 | return args, config 37 | 38 | 39 | def main(): 40 | args, config = parse_args() 41 | rank, model = vis_net(args, config, args.save_dir) 42 | 43 | 44 | if __name__ == '__main__': 45 | main() -------------------------------------------------------------------------------- /refcoco/_init_paths.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | this_dir = os.path.abspath(os.path.dirname(__file__)) 5 | 6 | 7 | def add_path(path): 8 | if path not in sys.path: 9 | sys.path.insert(0, path) 10 | 11 | 12 | root_path = os.path.join(this_dir, '../') 13 | add_path(root_path) 14 | -------------------------------------------------------------------------------- /refcoco/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackroos/VL-BERT/4373674cbf2bcd6c09a2c26abfdb6705b870e3be/refcoco/data/__init__.py -------------------------------------------------------------------------------- /refcoco/data/collate_batch.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | from common.utils.clip_pad import * 3 | 4 | 5 | class BatchCollator(object): 6 | def __init__(self, dataset, append_ind=False): 7 | self.dataset = dataset 8 | self.test_mode = self.dataset.test_mode 9 | self.data_names = self.dataset.data_names 10 | self.append_ind = append_ind 11 | 12 | def __call__(self, batch): 13 | if not isinstance(batch, list): 14 | batch = list(batch) 15 | 16 | if batch[0][self.data_names.index('image')] is not None: 17 | max_shape = tuple(max(s) for s in zip(*[data[self.data_names.index('image')].shape for data in batch])) 18 | image_none = False 19 | else: 20 | image_none = True 21 | max_boxes = max([data[self.data_names.index('boxes')].shape[0] for data in batch]) 22 | max_expression_length = max([len(data[self.data_names.index('expression')]) for data in batch]) 23 | 24 | for i, ibatch in enumerate(batch): 25 | out = {} 26 | 27 | if image_none: 28 | out['image'] = None 29 | else: 30 | image = ibatch[self.data_names.index('image')] 31 | out['image'] = clip_pad_images(image, max_shape, pad=0) 32 | 33 | boxes = ibatch[self.data_names.index('boxes')] 34 | out['boxes'] = clip_pad_boxes(boxes, max_boxes, pad=-2) 35 | 36 | expression = ibatch[self.data_names.index('expression')] 37 | out['expression'] = clip_pad_1d(expression, max_expression_length, pad=0) 38 | 39 | if 'label' in self.data_names: 40 | label = ibatch[self.data_names.index('label')] 41 | out['label'] = clip_pad_1d(label, max_boxes, pad=-1) 42 | 43 | other_names = [data_name for data_name in self.data_names if data_name not in out] 44 | for name in other_names: 45 | out[name] = torch.as_tensor(ibatch[self.data_names.index(name)]) 46 | 47 | batch[i] = tuple(out[data_name] for data_name in self.data_names) 48 | if self.append_ind: 49 | batch[i] += (torch.tensor(i, dtype=torch.int64),) 50 | 51 | out_tuple = () 52 | for items in zip(*batch): 53 | if items[0] is None: 54 | out_tuple += (None,) 55 | else: 56 | out_tuple += (torch.stack(tuple(items), dim=0), ) 57 | 58 | return out_tuple 59 | 60 | -------------------------------------------------------------------------------- /refcoco/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .refcoco import RefCOCO 2 | 3 | -------------------------------------------------------------------------------- /refcoco/data/datasets/refer/Makefile: -------------------------------------------------------------------------------- 1 | all: 2 | # install pycocotools/mask locally 3 | # copy from https://github.com/pdollar/coco.git 4 | python setup.py build_ext --inplace 5 | rm -rf build 6 | 7 | -------------------------------------------------------------------------------- /refcoco/data/datasets/refer/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackroos/VL-BERT/4373674cbf2bcd6c09a2c26abfdb6705b870e3be/refcoco/data/datasets/refer/__init__.py -------------------------------------------------------------------------------- /refcoco/data/datasets/refer/external/.gitignore: -------------------------------------------------------------------------------- 1 | *.pyc 2 | _mask.so 3 | _mask.c 4 | -------------------------------------------------------------------------------- /refcoco/data/datasets/refer/external/README.md: -------------------------------------------------------------------------------- 1 | The codes inside this folder are 
copied from pycocotools: https://github.com/pdollar/coco -------------------------------------------------------------------------------- /refcoco/data/datasets/refer/external/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'tylin' 2 | -------------------------------------------------------------------------------- /refcoco/data/datasets/refer/external/maskApi.h: -------------------------------------------------------------------------------- 1 | /************************************************************************** 2 | * Microsoft COCO Toolbox. version 2.0 3 | * Data, paper, and tutorials available at: http://mscoco.org/ 4 | * Code written by Piotr Dollar and Tsung-Yi Lin, 2015. 5 | * Licensed under the Simplified BSD License [see coco/license.txt] 6 | **************************************************************************/ 7 | #pragma once 8 | 9 | typedef unsigned int uint; 10 | typedef unsigned long siz; 11 | typedef unsigned char byte; 12 | typedef double* BB; 13 | typedef struct { siz h, w, m; uint *cnts; } RLE; 14 | 15 | /* Initialize/destroy RLE. */ 16 | void rleInit( RLE *R, siz h, siz w, siz m, uint *cnts ); 17 | void rleFree( RLE *R ); 18 | 19 | /* Initialize/destroy RLE array. */ 20 | void rlesInit( RLE **R, siz n ); 21 | void rlesFree( RLE **R, siz n ); 22 | 23 | /* Encode binary masks using RLE. */ 24 | void rleEncode( RLE *R, const byte *mask, siz h, siz w, siz n ); 25 | 26 | /* Decode binary masks encoded via RLE. */ 27 | void rleDecode( const RLE *R, byte *mask, siz n ); 28 | 29 | /* Compute union or intersection of encoded masks. */ 30 | void rleMerge( const RLE *R, RLE *M, siz n, int intersect ); 31 | 32 | /* Compute area of encoded masks. */ 33 | void rleArea( const RLE *R, siz n, uint *a ); 34 | 35 | /* Compute intersection over union between masks. */ 36 | void rleIou( RLE *dt, RLE *gt, siz m, siz n, byte *iscrowd, double *o ); 37 | 38 | /* Compute non-maximum suppression between bounding masks */ 39 | void rleNms( RLE *dt, siz n, uint *keep, double thr ); 40 | 41 | /* Compute intersection over union between bounding boxes. */ 42 | void bbIou( BB dt, BB gt, siz m, siz n, byte *iscrowd, double *o ); 43 | 44 | /* Compute non-maximum suppression between bounding boxes */ 45 | void bbNms( BB dt, siz n, uint *keep, double thr ); 46 | 47 | /* Get bounding boxes surrounding encoded masks. */ 48 | void rleToBbox( const RLE *R, BB bb, siz n ); 49 | 50 | /* Convert bounding boxes to encoded masks. */ 51 | void rleFrBbox( RLE *R, const BB bb, siz h, siz w, siz n ); 52 | 53 | /* Convert polygon to encoded mask. */ 54 | void rleFrPoly( RLE *R, const double *xy, siz k, siz h, siz w ); 55 | 56 | /* Get compressed string representation of encoded mask. */ 57 | char* rleToString( const RLE *R ); 58 | 59 | /* Convert from compressed string representation of encoded mask. */ 60 | void rleFrString( RLE *R, char *s, siz h, siz w ); 61 | -------------------------------------------------------------------------------- /refcoco/data/datasets/refer/setup.py: -------------------------------------------------------------------------------- 1 | """ 2 | This code is for making mask.so, used to visualize the segmentation of referred object. 
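Build it in place by running `make` in this directory (see the Makefile), which simply calls `python setup.py build_ext --inplace`.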
3 | All "mask" related code is copied from https://github.com/pdollar/coco.git 4 | """ 5 | from distutils.core import setup 6 | from Cython.Build import cythonize 7 | from distutils.extension import Extension 8 | import numpy as np 9 | 10 | ext_modules = [ 11 | Extension( 12 | 'external._mask', 13 | sources=['external/maskApi.c', 'external/_mask.pyx'], 14 | include_dirs = [np.get_include(), 'external'], 15 | extra_compile_args=['-Wno-cpp', '-Wno-unused-function', '-std=c99'], 16 | ) 17 | ] 18 | 19 | setup( 20 | name='external', 21 | packages=['external'], 22 | package_dir = {'external': 'external'}, 23 | version='2.0', 24 | ext_modules=cythonize(ext_modules) 25 | ) 26 | -------------------------------------------------------------------------------- /refcoco/data/samplers/__init__.py: -------------------------------------------------------------------------------- 1 | from .distributed import DistributedSampler 2 | from .grouped_batch_sampler import GroupedBatchSampler 3 | 4 | -------------------------------------------------------------------------------- /refcoco/data/samplers/distributed.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | # Code is copy-pasted exactly as in torch.utils.data.distributed. 3 | # FIXME remove this once c10d fixes the bug it has 4 | import math 5 | import torch 6 | import torch.distributed as dist 7 | from torch.utils.data.sampler import Sampler 8 | 9 | 10 | class DistributedSampler(Sampler): 11 | """Sampler that restricts data loading to a subset of the dataset. 12 | It is especially useful in conjunction with 13 | :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each 14 | process can pass a DistributedSampler instance as a DataLoader sampler, 15 | and load a subset of the original dataset that is exclusive to it. 16 | .. note:: 17 | Dataset is assumed to be of constant size. 18 | Arguments: 19 | dataset: Dataset used for sampling. 20 | num_replicas (optional): Number of processes participating in 21 | distributed training. 22 | rank (optional): Rank of the current process within num_replicas. 
23 | """ 24 | 25 | def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True): 26 | if num_replicas is None: 27 | if not dist.is_available(): 28 | raise RuntimeError("Requires distributed package to be available") 29 | num_replicas = dist.get_world_size() 30 | if rank is None: 31 | if not dist.is_available(): 32 | raise RuntimeError("Requires distributed package to be available") 33 | rank = dist.get_rank() 34 | self.dataset = dataset 35 | self.num_replicas = num_replicas 36 | self.rank = rank 37 | self.epoch = 0 38 | self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) 39 | self.total_size = self.num_samples * self.num_replicas 40 | self.shuffle = shuffle 41 | 42 | def __iter__(self): 43 | if self.shuffle: 44 | # deterministically shuffle based on epoch 45 | g = torch.Generator() 46 | g.manual_seed(self.epoch) 47 | indices = torch.randperm(len(self.dataset), generator=g).tolist() 48 | else: 49 | indices = torch.arange(len(self.dataset)).tolist() 50 | 51 | # add extra samples to make it evenly divisible 52 | indices += indices[: (self.total_size - len(indices))] 53 | assert len(indices) == self.total_size 54 | 55 | # subsample 56 | offset = self.num_samples * self.rank 57 | indices = indices[offset : offset + self.num_samples] 58 | assert len(indices) == self.num_samples 59 | 60 | return iter(indices) 61 | 62 | def __len__(self): 63 | return self.num_samples 64 | 65 | def set_epoch(self, epoch): 66 | self.epoch = epoch -------------------------------------------------------------------------------- /refcoco/data/transforms/__init__.py: -------------------------------------------------------------------------------- 1 | from .transforms import Compose 2 | from .transforms import Resize 3 | from .transforms import RandomHorizontalFlip 4 | from .transforms import ToTensor 5 | from .transforms import Normalize 6 | 7 | from .build import build_transforms 8 | -------------------------------------------------------------------------------- /refcoco/data/transforms/build.py: -------------------------------------------------------------------------------- 1 | from . 
import transforms as T 2 | 3 | 4 | def build_transforms(cfg, mode='train'): 5 | assert mode in ['train', 'test', 'val'] 6 | min_size = cfg.SCALES[0] 7 | max_size = cfg.SCALES[1] 8 | assert min_size <= max_size 9 | 10 | if mode == 'train': 11 | flip_prob = cfg.TRAIN.FLIP_PROB 12 | elif mode == 'test': 13 | flip_prob = cfg.TEST.FLIP_PROB 14 | else: 15 | flip_prob = cfg.VAL.FLIP_PROB 16 | 17 | to_bgr255 = True 18 | 19 | normalize_transform = T.Normalize( 20 | mean=cfg.NETWORK.PIXEL_MEANS, std=cfg.NETWORK.PIXEL_STDS, to_bgr255=to_bgr255 21 | ) 22 | 23 | # transform = T.Compose( 24 | # [ 25 | # T.Resize(min_size, max_size), 26 | # T.RandomHorizontalFlip(flip_prob), 27 | # T.ToTensor(), 28 | # normalize_transform, 29 | # T.FixPadding(min_size, max_size, pad=0) 30 | # ] 31 | # ) 32 | 33 | transform = T.Compose( 34 | [ 35 | T.Resize(min_size, max_size), 36 | T.RandomHorizontalFlip(flip_prob), 37 | T.ToTensor(), 38 | normalize_transform, 39 | ] 40 | ) 41 | 42 | return transform 43 | -------------------------------------------------------------------------------- /refcoco/function/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackroos/VL-BERT/4373674cbf2bcd6c09a2c26abfdb6705b870e3be/refcoco/function/__init__.py -------------------------------------------------------------------------------- /refcoco/function/val.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | import torch 3 | from common.trainer import to_cuda 4 | 5 | 6 | @torch.no_grad() 7 | def do_validation(net, val_loader, metrics, label_index_in_batch): 8 | net.eval() 9 | metrics.reset() 10 | for nbatch, batch in enumerate(val_loader): 11 | batch = to_cuda(batch) 12 | label = batch[label_index_in_batch] 13 | datas = [batch[i] for i in range(len(batch)) if i != label_index_in_batch % len(batch)] 14 | 15 | outputs = net(*datas) 16 | outputs.update({'label': label}) 17 | metrics.update(outputs) 18 | 19 | -------------------------------------------------------------------------------- /refcoco/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .resnet_vlbert_for_refcoco import ResNetVLBERT 2 | 3 | 4 | -------------------------------------------------------------------------------- /refcoco/test.py: -------------------------------------------------------------------------------- 1 | import _init_paths 2 | import os 3 | import argparse 4 | 5 | from refcoco.function.config import config, update_config 6 | from refcoco.function.test import test_net 7 | 8 | 9 | def parse_args(): 10 | parser = argparse.ArgumentParser('Train Cognition Network') 11 | parser.add_argument('--cfg', type=str, help='path to config file') 12 | parser.add_argument('--ckpt', type=str, help='root path to store checkpoint') 13 | parser.add_argument('--gpus', type=int, nargs='+', help='indices of GPUs to use', default=[0]) 14 | parser.add_argument('--bs', type=int) 15 | parser.add_argument('--split', type=str, choices=['test', 'testA', 'testB', 'val'], default='val') 16 | parser.add_argument('--result-path', type=str, help='dir to save result file') 17 | parser.add_argument('--result-name', type=str, help='name of result file') 18 | 19 | args = parser.parse_args() 20 | 21 | if args.cfg is not None: 22 | update_config(args.cfg) 23 | 24 | config.GPUS = ','.join([str(index) for index in args.gpus]) 25 | 26 | if args.bs is not None: 27 | config.TEST.BATCH_IMAGES = 
args.bs 28 | 29 | return args, config 30 | 31 | 32 | def main(): 33 | args, config = parse_args() 34 | 35 | test_net(args, config) 36 | 37 | 38 | if __name__ == '__main__': 39 | main() 40 | -------------------------------------------------------------------------------- /refcoco/train_end2end.py: -------------------------------------------------------------------------------- 1 | import _init_paths 2 | import os 3 | import argparse 4 | import torch 5 | import subprocess 6 | 7 | from refcoco.function.config import config, update_config 8 | from refcoco.function.train import train_net 9 | from refcoco.function.test import test_net 10 | 11 | 12 | def parse_args(): 13 | parser = argparse.ArgumentParser('Train Cognition Network') 14 | parser.add_argument('--cfg', type=str, help='path to config file') 15 | parser.add_argument('--model-dir', type=str, help='root path to store checkpoint') 16 | parser.add_argument('--log-dir', type=str, help='tensorboard log dir') 17 | parser.add_argument('--dist', help='whether to use distributed training', default=False, action='store_true') 18 | parser.add_argument('--slurm', help='whether this is a slurm job', default=False, action='store_true') 19 | parser.add_argument('--do-test', help='whether to generate csv result on test set', 20 | default=False, action='store_true') 21 | parser.add_argument('--cudnn-off', help='disable cudnn', default=False, action='store_true') 22 | 23 | # easy test pretrain model 24 | parser.add_argument('--partial-pretrain', type=str) 25 | 26 | args = parser.parse_args() 27 | 28 | if args.cfg is not None: 29 | update_config(args.cfg) 30 | if args.model_dir is not None: 31 | config.OUTPUT_PATH = os.path.join(args.model_dir, config.OUTPUT_PATH) 32 | 33 | if args.partial_pretrain is not None: 34 | config.NETWORK.PARTIAL_PRETRAIN = args.partial_pretrain 35 | 36 | if args.slurm: 37 | proc_id = int(os.environ['SLURM_PROCID']) 38 | ntasks = int(os.environ['SLURM_NTASKS']) 39 | node_list = os.environ['SLURM_NODELIST'] 40 | num_gpus = torch.cuda.device_count() 41 | addr = subprocess.getoutput( 42 | 'scontrol show hostname {} | head -n1'.format(node_list)) 43 | os.environ['MASTER_PORT'] = str(29500) 44 | os.environ['MASTER_ADDR'] = addr 45 | os.environ['WORLD_SIZE'] = str(ntasks) 46 | os.environ['RANK'] = str(proc_id) 47 | os.environ['LOCAL_RANK'] = str(proc_id % num_gpus) 48 | 49 | return args, config 50 | 51 | 52 | def main(): 53 | args, config = parse_args() 54 | rank, model = train_net(args, config) 55 | if args.do_test and (rank is None or rank == 0): 56 | test_net(args, config) 57 | 58 | 59 | if __name__ == '__main__': 60 | main() 61 | 62 | 63 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | easydict 2 | jsonlines 3 | matplotlib 4 | networkx 5 | numpy 6 | opencv-python 7 | pandas 8 | Pillow 9 | protobuf==3.10.0 10 | pycocotools 11 | PyYAML 12 | regex==2019.8.19 13 | requests==2.22.0 14 | scikit-image 15 | scipy 16 | tensorboard 17 | tensorboardX 18 | tensorflow 19 | tqdm 20 | urllib3 21 | boto3 22 | -------------------------------------------------------------------------------- /scripts/dist_run_multi.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | python ./scripts/launch.py \ 4 | --nnodes "$1" --node_rank "$2" --master_addr "$3" --nproc_per_node "$4" \ 5 | "$5" --cfg "$6" --model-dir "$7" 
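Note on the distributed launch plumbing above: `scripts/dist_run_multi.sh` simply forwards the node count, node rank, master address and processes-per-node to `scripts/launch.py`, while the `--slurm` branch of `train_end2end.py` derives the same `MASTER_ADDR`, `MASTER_PORT`, `WORLD_SIZE`, `RANK` and `LOCAL_RANK` variables from the SLURM environment. The sketch below is editorial, not code from this repo; it only illustrates how a worker process typically consumes those variables through `torch.distributed`'s `env://` initializer, and the repo's actual initialization inside its training entry points may differ.

```python
# Editorial sketch (not repo code): consuming MASTER_ADDR / MASTER_PORT /
# WORLD_SIZE / RANK / LOCAL_RANK, as exported by the --slurm branch above,
# via torch.distributed's env:// rendezvous.
import os

import torch
import torch.distributed as dist


def init_distributed_from_env():
    local_rank = int(os.environ.get('LOCAL_RANK', '0'))
    torch.cuda.set_device(local_rank)              # pin this process to one GPU
    dist.init_process_group(backend='nccl',        # env:// reads MASTER_ADDR, MASTER_PORT,
                            init_method='env://')  # WORLD_SIZE and RANK from the environment
    return dist.get_rank(), dist.get_world_size(), local_rank
```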
-------------------------------------------------------------------------------- /scripts/dist_run_single.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | python ./scripts/launch.py \ 4 | --nproc_per_node "$1" \ 5 | "$2" --cfg "$3" --model-dir "$4" 6 | -------------------------------------------------------------------------------- /scripts/dist_run_slurm.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | PARTITION=$1 6 | JOB_NAME=$2 7 | RUN_SCRIPT=$3 8 | CONFIG=$4 9 | WORK_DIR=$5 10 | GPUS=${6:-8} 11 | GPUS_PER_NODE=${GPUS_PER_NODE:-8} 12 | CPUS_PER_TASK=${CPUS_PER_TASK:-5} 13 | SRUN_ARGS=${SRUN_ARGS:-""} 14 | PY_ARGS=${PY_ARGS:-""} 15 | 16 | srun -p ${PARTITION} \ 17 | --job-name=${JOB_NAME} \ 18 | --gres=gpu:${GPUS_PER_NODE} \ 19 | --ntasks=${GPUS} \ 20 | --ntasks-per-node=${GPUS_PER_NODE} \ 21 | --cpus-per-task=${CPUS_PER_TASK} \ 22 | --kill-on-bad-exit=1 \ 23 | ${SRUN_ARGS} \ 24 | python -u ${RUN_SCRIPT} \ 25 | --cfg ${CONFIG} \ 26 | --model-dir ${WORK_DIR} \ 27 | --slurm --dist ${PY_ARGS} 28 | -------------------------------------------------------------------------------- /scripts/init.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | cd ./common/lib/roi_pooling/ 4 | python setup.py build_ext --inplace 5 | cd ../../../ 6 | 7 | #cd ./refcoco/data/datasets/refer/ 8 | #make 9 | #cd ../../../../ 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /scripts/init_slurm.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | PARTITION=$1 6 | JOB_NAME=$2 7 | GPUS=${3:-1} 8 | GPUS_PER_NODE=${GPUS_PER_NODE:-1} 9 | CPUS_PER_TASK=${CPUS_PER_TASK:-5} 10 | SRUN_ARGS=${SRUN_ARGS:-""} 11 | 12 | srun -p ${PARTITION} \ 13 | --job-name=${JOB_NAME} \ 14 | --gres=gpu:${GPUS_PER_NODE} \ 15 | --ntasks=${GPUS} \ 16 | --ntasks-per-node=${GPUS_PER_NODE} \ 17 | --cpus-per-task=${CPUS_PER_TASK} \ 18 | --kill-on-bad-exit=1 \ 19 | ${SRUN_ARGS} \ 20 | ./scripts/init.sh 21 | 22 | 23 | -------------------------------------------------------------------------------- /scripts/nondist_run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | python "$1" --cfg "$2" --model-dir "$3" 4 | 5 | -------------------------------------------------------------------------------- /scripts/nondist_run_slurm.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | set -x 4 | 5 | PARTITION=$1 6 | JOB_NAME=$2 7 | RUN_SCRIPT=$3 8 | CONFIG=$4 9 | WORK_DIR=$5 10 | GPUS=${6:-8} 11 | CPUS_PER_TASK=${CPUS_PER_TASK:-40} 12 | SRUN_ARGS=${SRUN_ARGS:-""} 13 | PY_ARGS=${PY_ARGS:-""} 14 | 15 | srun -p ${PARTITION} \ 16 | --job-name=${JOB_NAME} \ 17 | --gres=gpu:${GPUS} \ 18 | --ntasks=1 \ 19 | --ntasks-per-node=1 \ 20 | --cpus-per-task=${CPUS_PER_TASK} \ 21 | --kill-on-bad-exit=1 \ 22 | ${SRUN_ARGS} \ 23 | python -u ${RUN_SCRIPT} \ 24 | --cfg ${CONFIG} \ 25 | --model-dir ${WORK_DIR} \ 26 | ${PY_ARGS} 27 | -------------------------------------------------------------------------------- /vcr/_init_paths.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | this_dir = os.path.abspath(os.path.dirname(__file__)) 5 | 6 | 7 | def 
add_path(path): 8 | if path not in sys.path: 9 | sys.path.insert(0, path) 10 | 11 | 12 | root_path = os.path.join(this_dir, '../') 13 | add_path(root_path) 14 | -------------------------------------------------------------------------------- /vcr/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackroos/VL-BERT/4373674cbf2bcd6c09a2c26abfdb6705b870e3be/vcr/data/__init__.py -------------------------------------------------------------------------------- /vcr/data/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .vcr import VCRDataset 2 | 3 | 4 | -------------------------------------------------------------------------------- /vcr/data/samplers/__init__.py: -------------------------------------------------------------------------------- 1 | from .distributed import DistributedSampler 2 | from .grouped_batch_sampler import GroupedBatchSampler 3 | 4 | -------------------------------------------------------------------------------- /vcr/data/samplers/distributed.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | # Code is copy-pasted exactly as in torch.utils.data.distributed. 3 | # FIXME remove this once c10d fixes the bug it has 4 | import math 5 | import torch 6 | import torch.distributed as dist 7 | from torch.utils.data.sampler import Sampler 8 | 9 | 10 | class DistributedSampler(Sampler): 11 | """Sampler that restricts data loading to a subset of the dataset. 12 | It is especially useful in conjunction with 13 | :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each 14 | process can pass a DistributedSampler instance as a DataLoader sampler, 15 | and load a subset of the original dataset that is exclusive to it. 16 | .. note:: 17 | Dataset is assumed to be of constant size. 18 | Arguments: 19 | dataset: Dataset used for sampling. 20 | num_replicas (optional): Number of processes participating in 21 | distributed training. 22 | rank (optional): Rank of the current process within num_replicas. 
23 | """ 24 | 25 | def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True): 26 | if num_replicas is None: 27 | if not dist.is_available(): 28 | raise RuntimeError("Requires distributed package to be available") 29 | num_replicas = dist.get_world_size() 30 | if rank is None: 31 | if not dist.is_available(): 32 | raise RuntimeError("Requires distributed package to be available") 33 | rank = dist.get_rank() 34 | self.dataset = dataset 35 | self.num_replicas = num_replicas 36 | self.rank = rank 37 | self.epoch = 0 38 | self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) 39 | self.total_size = self.num_samples * self.num_replicas 40 | self.shuffle = shuffle 41 | 42 | def __iter__(self): 43 | if self.shuffle: 44 | # deterministically shuffle based on epoch 45 | g = torch.Generator() 46 | g.manual_seed(self.epoch) 47 | indices = torch.randperm(len(self.dataset), generator=g).tolist() 48 | else: 49 | indices = torch.arange(len(self.dataset)).tolist() 50 | 51 | # add extra samples to make it evenly divisible 52 | indices += indices[: (self.total_size - len(indices))] 53 | assert len(indices) == self.total_size 54 | 55 | # subsample 56 | offset = self.num_samples * self.rank 57 | indices = indices[offset : offset + self.num_samples] 58 | assert len(indices) == self.num_samples 59 | 60 | return iter(indices) 61 | 62 | def __len__(self): 63 | return self.num_samples 64 | 65 | def set_epoch(self, epoch): 66 | self.epoch = epoch -------------------------------------------------------------------------------- /vcr/data/transforms/__init__.py: -------------------------------------------------------------------------------- 1 | from .transforms import Compose 2 | from .transforms import Resize 3 | from .transforms import RandomHorizontalFlip 4 | from .transforms import ToTensor 5 | from .transforms import Normalize 6 | 7 | from .build import build_transforms 8 | -------------------------------------------------------------------------------- /vcr/data/transforms/build.py: -------------------------------------------------------------------------------- 1 | from . 
import transforms as T 2 | 3 | 4 | def build_transforms(cfg, mode='train'): 5 | assert mode in ['train', 'test', 'val'] 6 | min_size = cfg.SCALES[0] 7 | max_size = cfg.SCALES[1] 8 | assert min_size <= max_size 9 | 10 | if mode == 'train': 11 | flip_prob = cfg.TRAIN.FLIP_PROB 12 | elif mode == 'test': 13 | flip_prob = cfg.TEST.FLIP_PROB 14 | else: 15 | flip_prob = cfg.VAL.FLIP_PROB 16 | 17 | to_bgr255 = True 18 | 19 | normalize_transform = T.Normalize( 20 | mean=cfg.NETWORK.PIXEL_MEANS, std=cfg.NETWORK.PIXEL_STDS, to_bgr255=to_bgr255 21 | ) 22 | 23 | transform = T.Compose( 24 | [ 25 | T.Resize(min_size, max_size), 26 | T.RandomHorizontalFlip(flip_prob), 27 | T.ToTensor(), 28 | normalize_transform, 29 | T.FixPadding(min_size, max_size, pad=0) 30 | ] 31 | ) 32 | return transform 33 | -------------------------------------------------------------------------------- /vcr/function/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackroos/VL-BERT/4373674cbf2bcd6c09a2c26abfdb6705b870e3be/vcr/function/__init__.py -------------------------------------------------------------------------------- /vcr/function/val.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | import torch 3 | from common.trainer import to_cuda 4 | 5 | 6 | @torch.no_grad() 7 | def do_validation(net, val_loader, metrics, label_index_in_batch): 8 | net.eval() 9 | metrics.reset() 10 | for nbatch, batch in enumerate(val_loader): 11 | batch = to_cuda(batch) 12 | label = batch[label_index_in_batch] 13 | datas = [batch[i] for i in range(len(batch)) if i != label_index_in_batch % len(batch)] 14 | 15 | outputs = net(*datas) 16 | outputs.update({'label': label}) 17 | metrics.update(outputs) 18 | 19 | 20 | @torch.no_grad() 21 | def joint_validation(answer_net, rationale_net, answer_val_loader, rationale_val_loader, metrics, label_index_in_batch, 22 | show_progress=False): 23 | answer_net.eval() 24 | rationale_net.eval() 25 | metrics.reset() 26 | 27 | def step(a_batch, r_batch): 28 | a_batch = to_cuda(a_batch) 29 | a_label = a_batch[label_index_in_batch] 30 | a_datas = [a_batch[i] for i in range(len(a_batch)) if i != label_index_in_batch % len(a_batch)] 31 | r_batch = to_cuda(r_batch) 32 | r_label = r_batch[label_index_in_batch] 33 | r_datas = [r_batch[i] for i in range(len(r_batch)) if i != label_index_in_batch % len(r_batch)] 34 | 35 | a_outputs = answer_net(*a_datas) 36 | r_outputs = rationale_net(*r_datas) 37 | outputs = {'answer_' + k: v for k, v in a_outputs.items()} 38 | outputs.update({'rationale_' + k: v for k, v in r_outputs.items()}) 39 | outputs.update({'answer_label': a_label, 40 | 'rationale_label': r_label}) 41 | metrics.update(outputs) 42 | 43 | if show_progress: 44 | from tqdm import tqdm 45 | for a_batch, r_batch in tqdm(zip(answer_val_loader, rationale_val_loader)): 46 | step(a_batch, r_batch) 47 | else: 48 | for a_batch, r_batch in zip(answer_val_loader, rationale_val_loader): 49 | step(a_batch, r_batch) 50 | -------------------------------------------------------------------------------- /vcr/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .resnet_vlbert_for_vcr import ResNetVLBERT 2 | 3 | 4 | 5 | -------------------------------------------------------------------------------- /vcr/test.py: -------------------------------------------------------------------------------- 1 | import _init_paths 2 | import os 3 
| import argparse 4 | from copy import deepcopy 5 | 6 | from vcr.function.config import config, update_config 7 | from vcr.function.test import test_net, merge_result 8 | 9 | 10 | def parse_args(): 11 | parser = argparse.ArgumentParser('Get Jointly Test Result of Cognition Network') 12 | parser.add_argument('--a-cfg', type=str, help='path to answer net config yaml') 13 | parser.add_argument('--r-cfg', type=str, help='path to rationale net config yaml') 14 | parser.add_argument('--a-ckpt', type=str, help='path to checkpoint of answer net') 15 | parser.add_argument('--r-ckpt', type=str, help='path to checkpoint of rationale net') 16 | parser.add_argument('--a-bs', type=int) 17 | parser.add_argument('--r-bs', type=int) 18 | parser.add_argument('--gpus', type=int, nargs='+', default=[0]) 19 | parser.add_argument('--test-file', type=str) 20 | parser.add_argument('--result-path', type=str, help='path to store test result csv file.', default='./test_result') 21 | parser.add_argument('--result-name', type=str) 22 | parser.add_argument('--fp16', default=False, action='store_true') 23 | parser.add_argument('--use-cache', default=False, action='store_true') 24 | 25 | args = parser.parse_args() 26 | a_config = r_config = None 27 | reset_config = deepcopy(config) 28 | if args.a_cfg is not None: 29 | a_config = config 30 | if reset_config is not None: 31 | a_config.update(deepcopy(reset_config)) 32 | if args.a_cfg is not None: 33 | update_config(args.a_cfg) 34 | a_config = deepcopy(a_config) 35 | if args.r_cfg is not None: 36 | r_config = config 37 | if reset_config is not None: 38 | r_config.update(deepcopy(reset_config)) 39 | if args.r_cfg is not None: 40 | update_config(args.r_cfg) 41 | r_config = deepcopy(r_config) 42 | if args.a_bs is not None: 43 | a_config.TEST.BATCH_IMAGES = args.a_bs 44 | if args.r_bs is not None: 45 | r_config.TEST.BATCH_IMAGES = args.r_bs 46 | 47 | if args.test_file is not None: 48 | a_config.DATASET.TEST_ANNOTATION_FILE = args.test_file 49 | r_config.DATASET.TEST_ANNOTATION_FILE = args.test_file 50 | 51 | return args, a_config, r_config 52 | 53 | 54 | def main(): 55 | args, a_config, r_config = parse_args() 56 | 57 | if args.a_ckpt: 58 | a_config.DATASET.TASK = 'Q2A' 59 | a_config.GPUS = ','.join([str(k) for k in args.gpus]) 60 | a_result_csv = test_net(args, 61 | a_config, 62 | ckpt_path=args.a_ckpt, 63 | save_path=args.result_path, 64 | save_name=args.result_name) 65 | if args.r_ckpt: 66 | r_config.DATASET.TASK = 'QA2R' 67 | r_config.GPUS = ','.join([str(k) for k in args.gpus]) 68 | r_result_csv = test_net(args, 69 | r_config, 70 | ckpt_path=args.r_ckpt, 71 | save_path=args.result_path, 72 | save_name=args.result_name) 73 | if args.a_ckpt and args.r_ckpt: 74 | merge_result(a_result_csv, r_result_csv, 75 | os.path.join(args.result_path, '{}_test_result_Q2AR.csv'.format(args.result_name))) 76 | 77 | 78 | if __name__ == '__main__': 79 | main() 80 | 81 | 82 | -------------------------------------------------------------------------------- /vcr/train_end2end.py: -------------------------------------------------------------------------------- 1 | import _init_paths 2 | import os 3 | import argparse 4 | import torch 5 | import subprocess 6 | 7 | from vcr.function.config import config, update_config 8 | from vcr.function.train import train_net 9 | from vcr.function.test import test_net 10 | 11 | def parse_args(): 12 | parser = argparse.ArgumentParser('Train Cognition Network') 13 | parser.add_argument('--cfg', type=str, help='path to config file') 14 | 
parser.add_argument('--model-dir', type=str, help='root path to store checkpoint') 15 | parser.add_argument('--log-dir', type=str, help='tensorboard log dir') 16 | parser.add_argument('--dist', help='whether to use distributed training', default=False, action='store_true') 17 | parser.add_argument('--slurm', help='whether this is a slurm job', default=False, action='store_true') 18 | parser.add_argument('--do-test', help='whether to generate csv result on test set', 19 | default=False, action='store_true') 20 | parser.add_argument('--cudnn-off', help='disable cudnn', default=False, action='store_true') 21 | 22 | # easy test pretrain model 23 | parser.add_argument('--partial-pretrain', type=str) 24 | 25 | args = parser.parse_args() 26 | 27 | if args.cfg is not None: 28 | update_config(args.cfg) 29 | if args.model_dir is not None: 30 | config.OUTPUT_PATH = os.path.join(args.model_dir, config.OUTPUT_PATH) 31 | 32 | if args.partial_pretrain is not None: 33 | config.NETWORK.PARTIAL_PRETRAIN = args.partial_pretrain 34 | 35 | if args.slurm: 36 | proc_id = int(os.environ['SLURM_PROCID']) 37 | ntasks = int(os.environ['SLURM_NTASKS']) 38 | node_list = os.environ['SLURM_NODELIST'] 39 | num_gpus = torch.cuda.device_count() 40 | addr = subprocess.getoutput( 41 | 'scontrol show hostname {} | head -n1'.format(node_list)) 42 | os.environ['MASTER_PORT'] = str(29500) 43 | os.environ['MASTER_ADDR'] = addr 44 | os.environ['WORLD_SIZE'] = str(ntasks) 45 | os.environ['RANK'] = str(proc_id) 46 | os.environ['LOCAL_RANK'] = str(proc_id % num_gpus) 47 | 48 | return args, config 49 | 50 | 51 | def main(): 52 | args, config = parse_args() 53 | rank, model = train_net(args, config) 54 | if args.do_test and (rank is None or rank == 0): 55 | test_net(args, config) 56 | 57 | 58 | if __name__ == '__main__': 59 | main() 60 | 61 | 62 | -------------------------------------------------------------------------------- /viz/VISUALIZATION.md: -------------------------------------------------------------------------------- 1 | # Visualization 2 | 3 | The code is based on [bertviz](https://github.com/jessevig/bertviz), a nice tool for BERT visualization. 4 | 5 | ## Prepare 6 | 7 | * Change work directory to this directory. 8 | 9 | ```bash 10 | cd ./viz 11 | ``` 12 | 13 | * Create a soft link to the data folder (If you are working on Windows, please modify the data path in the jupyter notebook by yourself). 14 | 15 | ```bash 16 | ln -s ../data ./ 17 | ``` 18 | 19 | * Download and unzip COCO val2017: [images](http://images.cocodataset.org/zips/val2017.zip), [annotations](http://images.cocodataset.org/annotations/annotations_trainval2017.zip), place them under ```./data/coco```. 20 | 21 | * (Optional) Download pre-trained models as described in [PREPARE_PRETRAINED_MODELS.md](../model/pretrained_model/PREPARE_PRETRAINED_MODELS.md), if you want to precompute all attention maps by yourself. 22 | 23 | ## Pre-compute attention maps 24 | * Pre-computing all attention maps on COCO val2017: 25 | 26 | ```bash 27 | python pretrain/vis_attention_maps.py --cfg cfgs/pretrain/vis_attention_maps_coco.yaml --save-dir ./vl-bert_viz 28 | ``` 29 | * We provide 100 pre-computed attention maps of vl-bert-base-e2e on COCO val2017: [GoogleDrive](https://drive.google.com/file/d/1TFfqArX3lwOPQ8EklZ6px5-gvOvoGdTr/view?usp=sharing) [BaiduPan](https://pan.baidu.com/s/1l0T5vAuklQTrAmD3wbJ7uQ), please download and unzip it into ```./data```. 
30 | 31 | ## Visualization on Jupyter Notebook 32 | * Open Jupyter Notebook in this directory and select ```model_view_vl-bert_coco.ipynb```. 33 | ```bash 34 | jupyter notebook 35 | ``` 36 | 37 | * Run all cells in the notebook in order. 38 | 39 | * Browse attention maps in the last cell, you can change the image id to visualize other examples. 40 | 41 | 42 | -------------------------------------------------------------------------------- /viz/_init_paths.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | this_dir = os.path.abspath(os.path.dirname(__file__)) 5 | 6 | 7 | def add_path(path): 8 | if path not in sys.path: 9 | sys.path.insert(0, path) 10 | 11 | 12 | root_path = os.path.join(this_dir, '../') 13 | add_path(root_path) 14 | -------------------------------------------------------------------------------- /viz/bertviz/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackroos/VL-BERT/4373674cbf2bcd6c09a2c26abfdb6705b870e3be/viz/bertviz/__init__.py -------------------------------------------------------------------------------- /viz/bertviz/model_view.py: -------------------------------------------------------------------------------- 1 | # coding=utf-8 2 | # Copyright 2018 The Tensor2Tensor Authors. 3 | # 4 | # Licensed under the Apache License, Version 2.0 (the "License"); 5 | # you may not use this file except in compliance with the License. 6 | # You may obtain a copy of the License at 7 | # 8 | # http://www.apache.org/licenses/LICENSE-2.0 9 | # 10 | # Unless required by applicable law or agreed to in writing, software 11 | # distributed under the License is distributed on an "AS IS" BASIS, 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 | # See the License for the specific language governing permissions and 14 | # limitations under the License. 15 | # 16 | # Change log 17 | # 12/12/18 Jesse Vig Adapted to BERT model 18 | # 12/19/18 Jesse Vig Assorted cleanup. Changed orientation of attention matrices. Updated comments. 19 | 20 | 21 | """Module for postprocessing and displaying transformer attentions. 22 | 23 | This module is designed to be called from an ipython notebook. 24 | """ 25 | 26 | import json 27 | from IPython.core.display import display, HTML, Javascript 28 | import os 29 | 30 | def show(model, model_type, tokenizer, sentence_a, sentence_b=None, attn_data=None): 31 | 32 | if sentence_b: 33 | vis_html = """ 34 | 35 | Attention: 42 | 43 |
44 | """ 45 | else: 46 | vis_html = """ 47 | <div id='vis'></div>
48 | """ 49 | 50 | display(HTML(vis_html)) 51 | __location__ = os.path.realpath( 52 | os.path.join(os.getcwd(), os.path.dirname(__file__))) 53 | vis_js = open(os.path.join(__location__, 'model_view.js')).read() 54 | if attn_data is None: 55 | from bertviz.attention import get_attention 56 | attn_data = get_attention(model, model_type, tokenizer, sentence_a, sentence_b) 57 | params = { 58 | 'attention': attn_data, 59 | 'default_filter': "ab" 60 | } 61 | display(Javascript('window.params = %s' % json.dumps(params))) 62 | display(Javascript(vis_js)) 63 | 64 | -------------------------------------------------------------------------------- /vqa/_init_paths.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | 4 | this_dir = os.path.abspath(os.path.dirname(__file__)) 5 | 6 | 7 | def add_path(path): 8 | if path not in sys.path: 9 | sys.path.insert(0, path) 10 | 11 | 12 | root_path = os.path.join(this_dir, '../') 13 | add_path(root_path) 14 | -------------------------------------------------------------------------------- /vqa/data/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackroos/VL-BERT/4373674cbf2bcd6c09a2c26abfdb6705b870e3be/vqa/data/__init__.py -------------------------------------------------------------------------------- /vqa/data/collate_batch.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from common.utils.clip_pad import * 3 | 4 | 5 | class BatchCollator(object): 6 | def __init__(self, dataset, append_ind=False): 7 | self.dataset = dataset 8 | self.test_mode = self.dataset.test_mode 9 | self.data_names = self.dataset.data_names 10 | self.append_ind = append_ind 11 | 12 | def __call__(self, batch): 13 | if not isinstance(batch, list): 14 | batch = list(batch) 15 | 16 | if batch[0][self.data_names.index('image')] is not None: 17 | max_shape = tuple(max(s) for s in zip(*[data[self.data_names.index('image')].shape for data in batch])) 18 | image_none = False 19 | else: 20 | image_none = True 21 | max_boxes = max([data[self.data_names.index('boxes')].shape[0] for data in batch]) 22 | max_question_length = max([len(data[self.data_names.index('question')]) for data in batch]) 23 | 24 | for i, ibatch in enumerate(batch): 25 | out = {} 26 | 27 | if image_none: 28 | out['image'] = None 29 | else: 30 | image = ibatch[self.data_names.index('image')] 31 | out['image'] = clip_pad_images(image, max_shape, pad=0) 32 | 33 | boxes = ibatch[self.data_names.index('boxes')] 34 | out['boxes'] = clip_pad_boxes(boxes, max_boxes, pad=-2) 35 | 36 | question = ibatch[self.data_names.index('question')] 37 | out['question'] = clip_pad_1d(question, max_question_length, pad=0) 38 | 39 | other_names = [data_name for data_name in self.data_names if data_name not in out] 40 | for name in other_names: 41 | out[name] = torch.as_tensor(ibatch[self.data_names.index(name)]) 42 | 43 | batch[i] = tuple(out[data_name] for data_name in self.data_names) 44 | if self.append_ind: 45 | batch[i] += (torch.tensor(i, dtype=torch.int64),) 46 | 47 | out_tuple = () 48 | for items in zip(*batch): 49 | if items[0] is None: 50 | out_tuple += (None,) 51 | else: 52 | out_tuple += (torch.stack(tuple(items), dim=0), ) 53 | 54 | return out_tuple 55 | 56 | -------------------------------------------------------------------------------- /vqa/data/datasets/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .vqa import VQA 2 | 3 | -------------------------------------------------------------------------------- /vqa/data/samplers/__init__.py: -------------------------------------------------------------------------------- 1 | from .distributed import DistributedSampler 2 | from .grouped_batch_sampler import GroupedBatchSampler 3 | 4 | -------------------------------------------------------------------------------- /vqa/data/samplers/distributed.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | # Code is copy-pasted exactly as in torch.utils.data.distributed. 3 | # FIXME remove this once c10d fixes the bug it has 4 | import math 5 | import torch 6 | import torch.distributed as dist 7 | from torch.utils.data.sampler import Sampler 8 | 9 | 10 | class DistributedSampler(Sampler): 11 | """Sampler that restricts data loading to a subset of the dataset. 12 | It is especially useful in conjunction with 13 | :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each 14 | process can pass a DistributedSampler instance as a DataLoader sampler, 15 | and load a subset of the original dataset that is exclusive to it. 16 | .. note:: 17 | Dataset is assumed to be of constant size. 18 | Arguments: 19 | dataset: Dataset used for sampling. 20 | num_replicas (optional): Number of processes participating in 21 | distributed training. 22 | rank (optional): Rank of the current process within num_replicas. 23 | """ 24 | 25 | def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True): 26 | if num_replicas is None: 27 | if not dist.is_available(): 28 | raise RuntimeError("Requires distributed package to be available") 29 | num_replicas = dist.get_world_size() 30 | if rank is None: 31 | if not dist.is_available(): 32 | raise RuntimeError("Requires distributed package to be available") 33 | rank = dist.get_rank() 34 | self.dataset = dataset 35 | self.num_replicas = num_replicas 36 | self.rank = rank 37 | self.epoch = 0 38 | self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) 39 | self.total_size = self.num_samples * self.num_replicas 40 | self.shuffle = shuffle 41 | 42 | def __iter__(self): 43 | if self.shuffle: 44 | # deterministically shuffle based on epoch 45 | g = torch.Generator() 46 | g.manual_seed(self.epoch) 47 | indices = torch.randperm(len(self.dataset), generator=g).tolist() 48 | else: 49 | indices = torch.arange(len(self.dataset)).tolist() 50 | 51 | # add extra samples to make it evenly divisible 52 | indices += indices[: (self.total_size - len(indices))] 53 | assert len(indices) == self.total_size 54 | 55 | # subsample 56 | offset = self.num_samples * self.rank 57 | indices = indices[offset : offset + self.num_samples] 58 | assert len(indices) == self.num_samples 59 | 60 | return iter(indices) 61 | 62 | def __len__(self): 63 | return self.num_samples 64 | 65 | def set_epoch(self, epoch): 66 | self.epoch = epoch -------------------------------------------------------------------------------- /vqa/data/transforms/__init__.py: -------------------------------------------------------------------------------- 1 | from .transforms import Compose 2 | from .transforms import Resize 3 | from .transforms import RandomHorizontalFlip 4 | from .transforms import ToTensor 5 | from .transforms import Normalize 6 | 7 | from .build import build_transforms 8 | 
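The VQA data pipeline mirrors the refcoco and vcr ones: `BatchCollator` pads every field in a batch to the largest size present, `DistributedSampler` gives each process a disjoint shard of the dataset, and `build_transforms` (next file) assembles the image preprocessing. A minimal sketch of how these pieces are typically wired into a `DataLoader` follows; `make_train_loader` is a hypothetical helper, and the repo's real construction (in `vqa/data/build.py`) may differ.

```python
# Illustrative wiring only -- make_train_loader is a hypothetical helper; the
# repo's actual dataloader construction lives in vqa/data/build.py.
from torch.utils.data import DataLoader

from vqa.data.collate_batch import BatchCollator
from vqa.data.samplers import DistributedSampler


def make_train_loader(dataset, batch_size, distributed=False):
    # `dataset` is assumed to be an already-built VQA dataset exposing the
    # `data_names` and `test_mode` attributes that BatchCollator relies on.
    sampler = DistributedSampler(dataset, shuffle=True) if distributed else None
    loader = DataLoader(dataset,
                        batch_size=batch_size,
                        shuffle=(sampler is None),
                        sampler=sampler,
                        collate_fn=BatchCollator(dataset, append_ind=False))
    return loader, sampler
```

When training is distributed, `sampler.set_epoch(epoch)` should be called at the start of each epoch so that every process reshuffles with the same seed and the per-process shards stay disjoint.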
-------------------------------------------------------------------------------- /vqa/data/transforms/build.py: -------------------------------------------------------------------------------- 1 | from . import transforms as T 2 | 3 | 4 | def build_transforms(cfg, mode='train'): 5 | assert mode in ['train', 'test', 'val'] 6 | min_size = cfg.SCALES[0] 7 | max_size = cfg.SCALES[1] 8 | assert min_size <= max_size 9 | 10 | if mode == 'train': 11 | flip_prob = cfg.TRAIN.FLIP_PROB 12 | elif mode == 'test': 13 | flip_prob = cfg.TEST.FLIP_PROB 14 | else: 15 | flip_prob = cfg.VAL.FLIP_PROB 16 | 17 | to_bgr255 = True 18 | 19 | normalize_transform = T.Normalize( 20 | mean=cfg.NETWORK.PIXEL_MEANS, std=cfg.NETWORK.PIXEL_STDS, to_bgr255=to_bgr255 21 | ) 22 | 23 | # transform = T.Compose( 24 | # [ 25 | # T.Resize(min_size, max_size), 26 | # T.RandomHorizontalFlip(flip_prob), 27 | # T.ToTensor(), 28 | # normalize_transform, 29 | # T.FixPadding(min_size, max_size, pad=0) 30 | # ] 31 | # ) 32 | 33 | transform = T.Compose( 34 | [ 35 | T.Resize(min_size, max_size), 36 | T.RandomHorizontalFlip(flip_prob), 37 | T.ToTensor(), 38 | normalize_transform, 39 | ] 40 | ) 41 | 42 | return transform 43 | -------------------------------------------------------------------------------- /vqa/function/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jackroos/VL-BERT/4373674cbf2bcd6c09a2c26abfdb6705b870e3be/vqa/function/__init__.py -------------------------------------------------------------------------------- /vqa/function/val.py: -------------------------------------------------------------------------------- 1 | from collections import namedtuple 2 | import torch 3 | from common.trainer import to_cuda 4 | 5 | 6 | @torch.no_grad() 7 | def do_validation(net, val_loader, metrics, label_index_in_batch): 8 | net.eval() 9 | metrics.reset() 10 | for nbatch, batch in enumerate(val_loader): 11 | batch = to_cuda(batch) 12 | label = batch[label_index_in_batch] 13 | datas = [batch[i] for i in range(len(batch)) if i != label_index_in_batch % len(batch)] 14 | 15 | outputs = net(*datas) 16 | outputs.update({'label': label}) 17 | metrics.update(outputs) 18 | 19 | -------------------------------------------------------------------------------- /vqa/modules/__init__.py: -------------------------------------------------------------------------------- 1 | from .resnet_vlbert_for_vqa import ResNetVLBERT 2 | 3 | 4 | -------------------------------------------------------------------------------- /vqa/test.py: -------------------------------------------------------------------------------- 1 | import _init_paths 2 | import os 3 | import argparse 4 | from copy import deepcopy 5 | 6 | from vqa.function.config import config, update_config 7 | from vqa.function.test import test_net 8 | 9 | 10 | def parse_args(): 11 | parser = argparse.ArgumentParser('Get Test Result of VQA Network') 12 | parser.add_argument('--cfg', type=str, help='path to answer net config yaml') 13 | parser.add_argument('--ckpt', type=str, help='path to checkpoint of answer net') 14 | parser.add_argument('--bs', type=int) 15 | parser.add_argument('--gpus', type=int, nargs='+') 16 | parser.add_argument('--model-dir', type=str, help='root path to store checkpoint') 17 | parser.add_argument('--result-path', type=str, help='path to store test result file.') 18 | parser.add_argument('--result-name', type=str) 19 | parser.add_argument('--split', default='test2015') 20 | 21 | args = parser.parse_args() 22 | 
23 | if args.cfg is not None: 24 | update_config(args.cfg) 25 | if args.bs is not None: 26 | config.TEST.BATCH_IMAGES = args.bs 27 | if args.gpus is not None: 28 | config.GPUS = ','.join([str(gpu) for gpu in args.gpus]) 29 | if args.split is not None: 30 | config.DATASET.TEST_IMAGE_SET = args.split 31 | if args.model_dir is not None: 32 | config.OUTPUT_PATH = os.path.join(args.model_dir, config.OUTPUT_PATH) 33 | 34 | return args, config 35 | 36 | 37 | def main(): 38 | args, config = parse_args() 39 | 40 | result_json_path = test_net(args, config, 41 | ckpt_path=args.ckpt, save_path=args.result_path, save_name=args.result_name) 42 | 43 | 44 | if __name__ == '__main__': 45 | main() 46 | -------------------------------------------------------------------------------- /vqa/train_end2end.py: -------------------------------------------------------------------------------- 1 | import _init_paths 2 | import os 3 | import argparse 4 | import torch 5 | import subprocess 6 | 7 | from vqa.function.config import config, update_config 8 | from vqa.function.train import train_net 9 | from vqa.function.test import test_net 10 | 11 | 12 | def parse_args(): 13 | parser = argparse.ArgumentParser('Train Cognition Network') 14 | parser.add_argument('--cfg', type=str, help='path to config file') 15 | parser.add_argument('--model-dir', type=str, help='root path to store checkpoint') 16 | parser.add_argument('--log-dir', type=str, help='tensorboard log dir') 17 | parser.add_argument('--dist', help='whether to use distributed training', default=False, action='store_true') 18 | parser.add_argument('--slurm', help='whether this is a slurm job', default=False, action='store_true') 19 | parser.add_argument('--do-test', help='whether to generate csv result on test set', 20 | default=False, action='store_true') 21 | parser.add_argument('--cudnn-off', help='disable cudnn', default=False, action='store_true') 22 | 23 | # easy test pretrain model 24 | parser.add_argument('--partial-pretrain', type=str) 25 | 26 | args = parser.parse_args() 27 | 28 | if args.cfg is not None: 29 | update_config(args.cfg) 30 | if args.model_dir is not None: 31 | config.OUTPUT_PATH = os.path.join(args.model_dir, config.OUTPUT_PATH) 32 | 33 | if args.partial_pretrain is not None: 34 | config.NETWORK.PARTIAL_PRETRAIN = args.partial_pretrain 35 | 36 | if args.slurm: 37 | proc_id = int(os.environ['SLURM_PROCID']) 38 | ntasks = int(os.environ['SLURM_NTASKS']) 39 | node_list = os.environ['SLURM_NODELIST'] 40 | num_gpus = torch.cuda.device_count() 41 | addr = subprocess.getoutput( 42 | 'scontrol show hostname {} | head -n1'.format(node_list)) 43 | os.environ['MASTER_PORT'] = str(29500) 44 | os.environ['MASTER_ADDR'] = addr 45 | os.environ['WORLD_SIZE'] = str(ntasks) 46 | os.environ['RANK'] = str(proc_id) 47 | os.environ['LOCAL_RANK'] = str(proc_id % num_gpus) 48 | 49 | return args, config 50 | 51 | 52 | def main(): 53 | args, config = parse_args() 54 | rank, model = train_net(args, config) 55 | if args.do_test and (rank is None or rank == 0): 56 | test_net(args, config) 57 | 58 | 59 | if __name__ == '__main__': 60 | main() 61 | 62 | 63 | --------------------------------------------------------------------------------
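A final note on the shared `do_validation` helpers (refcoco, vcr and vqa all use the same pattern): `label_index_in_batch` may be negative, and the `i != label_index_in_batch % len(batch)` test is what lets a value such as -1 exclude exactly the label tensor from the inputs forwarded to the network. A tiny stand-alone illustration, with placeholder field names rather than the datasets' real `data_names`:

```python
# Stand-in illustration of the indexing convention in do_validation; the field
# names are placeholders, not the datasets' actual data_names.
batch = ['image', 'boxes', 'question', 'label']   # pretend these are tensors
label_index_in_batch = -1

label = batch[label_index_in_batch]                    # -> 'label'
datas = [batch[i] for i in range(len(batch))
         if i != label_index_in_batch % len(batch)]    # -1 % 4 == 3, so index 3 is skipped

assert label == 'label'
assert datas == ['image', 'boxes', 'question']
```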