├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── configs ├── __init__.py ├── captioning │ ├── coco │ │ ├── butd.yml │ │ └── butd_beam_search.yml │ └── m4c_textcaps │ │ ├── butd.yml │ │ ├── butd_beam_search.yml │ │ ├── butd_eval_pretrained_coco_model.yml │ │ ├── m4c_captioner.yml │ │ ├── m4c_captioner_coco.yml │ │ ├── m4c_captioner_coco_eval_on_textcaps.yml │ │ ├── m4c_captioner_coco_textcaps_joint.yml │ │ └── m4c_captioner_without_ocr.yml └── vqa │ ├── clevr │ └── cnn_lstm.yml │ ├── m4c_ocrvqa │ └── m4c.yml │ ├── m4c_stvqa │ └── m4c.yml │ ├── m4c_textvqa │ ├── m4c.yml │ ├── m4c_ocr_ml.yml │ └── m4c_with_stvqa.yml │ ├── textvqa │ ├── ban.yml │ ├── lorra.yml │ └── pythia.yml │ ├── visual_genome │ └── pythia.yml │ ├── vizwiz │ ├── ban.yml │ ├── lorra.yml │ └── pythia.yml │ └── vqa2 │ ├── ban.yml │ ├── lorra.yml │ ├── lorra_train_and_val.yml │ ├── pythia.yml │ ├── pythia_12k_iterations_no_resnet.yml │ └── pythia_train_and_val.yml ├── docs ├── Makefile ├── requirements.txt └── source │ ├── common │ ├── registry.rst │ └── sample.rst │ ├── conf.py │ ├── datasets │ ├── base_dataset.rst │ ├── base_dataset_builder.rst │ ├── base_task.rst │ └── processors.rst │ ├── index.rst │ ├── models │ └── base_model.rst │ ├── modules │ ├── losses.rst │ └── metrics.rst │ └── tutorials │ ├── challenge.md │ ├── concepts.md │ ├── dataset.rst │ ├── features.rst │ ├── pretrained_models.md │ └── quickstart.md ├── overview.png ├── projects ├── M4C │ ├── README.md │ └── scripts │ │ └── extract_ocr_frcn_feature.py ├── M4C_Captioner │ ├── README.md │ └── scripts │ │ ├── coco_eval.py │ │ └── textcaps_eval.py └── TextCap_CVPR.pdf ├── pythia ├── __init__.py ├── common │ ├── __init__.py │ ├── batch_collator.py │ ├── constants.py │ ├── dataset_loader.py │ ├── defaults │ │ ├── __init__.py │ │ └── configs │ │ │ ├── base.yml │ │ │ └── datasets │ │ │ ├── captioning │ │ │ ├── coco.yml │ │ │ └── m4c_textcaps.yml │ │ │ ├── dialog │ │ │ └── visual_dialog.yml │ │ │ └── vqa │ │ │ ├── clevr.yml │ │ │ ├── m4c_ocrvqa.yml │ │ │ ├── m4c_stvqa.yml │ │ │ ├── m4c_textvqa.yml │ │ │ ├── textvqa.yml │ │ │ ├── visual_genome.yml │ │ │ ├── vizwiz.yml │ │ │ └── vqa2.yml │ ├── meter.py │ ├── registry.py │ ├── report.py │ ├── sample.py │ └── test_reporter.py ├── datasets │ ├── __init__.py │ ├── base_dataset.py │ ├── base_dataset_builder.py │ ├── captioning │ │ ├── __init__.py │ │ ├── coco │ │ │ ├── __init__.py │ │ │ ├── builder.py │ │ │ └── dataset.py │ │ └── m4c_textcaps │ │ │ ├── __init__.py │ │ │ ├── builder.py │ │ │ └── dataset.py │ ├── concat_dataset.py │ ├── dialog │ │ ├── __init__.py │ │ ├── original.py │ │ └── visual_dialog │ │ │ ├── config.yml │ │ │ └── scripts │ │ │ ├── build_imdb.py │ │ │ └── extract_vocabulary.py │ ├── feature_readers.py │ ├── features_dataset.py │ ├── image_database.py │ ├── multi_dataset.py │ ├── processors.py │ ├── samplers.py │ ├── scene_graph_database.py │ └── vqa │ │ ├── __init__.py │ │ ├── clevr │ │ ├── __init__.py │ │ ├── builder.py │ │ └── dataset.py │ │ ├── m4c_ocrvqa │ │ ├── __init__.py │ │ ├── builder.py │ │ └── dataset.py │ │ ├── m4c_stvqa │ │ ├── __init__.py │ │ ├── builder.py │ │ └── dataset.py │ │ ├── m4c_textvqa │ │ ├── __init__.py │ │ ├── builder.py │ │ └── dataset.py │ │ ├── textvqa │ │ ├── __init__.py │ │ ├── builder.py │ │ └── dataset.py │ │ ├── visual_genome │ │ ├── builder.py │ │ └── dataset.py │ │ ├── vizwiz │ │ ├── __init__.py │ │ ├── builder.py │ │ └── dataset.py │ │ └── vqa2 │ │ ├── __init__.py │ │ ├── builder.py │ │ ├── dataset.py │ │ ├── ocr_builder.py │ │ └── 
ocr_dataset.py ├── legacy │ ├── best_model │ │ ├── best_model_predict_test.json │ │ └── config.yaml │ ├── config │ │ ├── collections.py │ │ ├── config.py │ │ ├── config_utils.py │ │ ├── demo │ │ │ └── config.yaml │ │ ├── function_config_lib.py │ │ ├── keep │ │ │ ├── MFH_ft.yaml │ │ │ └── detectron.yaml │ │ └── verbose │ │ │ ├── MFH_module.yaml │ │ │ ├── dectectron_finetune.yaml │ │ │ └── default.yaml │ ├── data │ │ └── demo │ │ │ ├── features │ │ │ └── COCO_test2015_000000000001.npy │ │ │ ├── images │ │ │ └── COCO_test2015_000000000001.jpg │ │ │ └── imdb │ │ │ └── imdb_demo.npy │ ├── data_prep │ │ ├── data_preprocess.md │ │ └── vqa_v2.0 │ │ │ ├── build_vqa_2.0_imdb.py │ │ │ ├── download_vqa_2.0.sh │ │ │ ├── extract_ques_info.py │ │ │ ├── extract_vocabulary.py │ │ │ ├── extract_word_glove_embedding.py │ │ │ ├── genome_ids.pkl │ │ │ ├── process_answers.py │ │ │ ├── train_ids.pkl │ │ │ └── val_ids.pkl │ ├── dataset_utils │ │ ├── __init__.py │ │ ├── create_imdb_header.py │ │ ├── dataSet.py │ │ ├── text_processing.py │ │ ├── utils.py │ │ ├── vqa_collates.py │ │ ├── vqa_concate_dataset.py │ │ └── vqa_html_writer.py │ ├── ensemble.py │ ├── eval_model │ │ ├── eval_demo.py │ │ └── vqaEval.py │ ├── global_variables │ │ ├── __init__.py │ │ └── global_variables.py │ ├── info │ │ ├── code_structure_plot.png │ │ ├── pythia.jpg │ │ └── vqa_example.png │ ├── install.sh │ ├── run_test.py │ ├── tools │ │ ├── convert_VD_to_COCO_qa.py │ │ ├── convert_VG_to_COCO.py │ │ ├── convert_VG_to_COCO_qa.py │ │ ├── convert_tsv_feature_to_indiv.py │ │ ├── eval_ensemble_on_val.py │ │ ├── extract_detectron_weights.py │ │ ├── extract_minival_ids.py │ │ ├── extract_visual_features_vgg_pool5.py │ │ ├── generate_minival_annotation.py │ │ ├── mirror_images.py │ │ ├── model_path.py │ │ ├── process_log.py │ │ ├── rename_genome_file.py │ │ ├── subset_val.py │ │ ├── timer.py │ │ └── visualize_bbox.py │ ├── top_down_bottom_up │ │ ├── classifier.py │ │ ├── image_attention.py │ │ ├── image_embedding.py │ │ ├── image_feature_encoding.py │ │ ├── intermediate_layer.py │ │ ├── multi_modal_combine.py │ │ ├── nonlinear_layer.py │ │ ├── post_combine_transform.py │ │ ├── question_embeding.py │ │ ├── top_down_bottom_up_model.py │ │ └── unittests.py │ ├── train.py │ ├── train_model │ │ ├── Engineer.py │ │ ├── Error_analysis.py │ │ ├── Loss.py │ │ ├── __init__.py │ │ ├── dataset_utils.py │ │ ├── eval_utils.py │ │ ├── evaluate.py │ │ ├── evaluate_with_ensemble.py │ │ ├── helper.py │ │ └── model_factory.py │ └── vqa_demo.ipynb ├── models │ ├── __init__.py │ ├── ban.py │ ├── base_model.py │ ├── butd.py │ ├── cnn_lstm.py │ ├── lorra.py │ ├── m4c.py │ ├── m4c_captioner.py │ ├── pythia.py │ ├── top_down_bottom_up.py │ └── visdial_multi_modal.py ├── modules │ ├── __init__.py │ ├── attention.py │ ├── decoders.py │ ├── embeddings.py │ ├── encoders.py │ ├── gpn.py │ ├── layers.py │ ├── losses.py │ ├── metrics.py │ └── refine_mmt.py ├── scripts │ ├── coco │ │ └── coco_caption_eval.py │ ├── extract_vocabulary.py │ ├── features │ │ ├── extract_features.md │ │ ├── extract_features.py │ │ ├── extract_features_vmb.py │ │ ├── extract_resnet152_feat.py │ │ └── extract_resnet_features.py │ └── gqa │ │ ├── README.md │ │ └── convert_gqa_to_vqa.py ├── trainers │ ├── __init__.py │ └── base_trainer.py └── utils │ ├── __init__.py │ ├── build_utils.py │ ├── checkpoint.py │ ├── configuration.py │ ├── dataset_utils.py │ ├── distributed_utils.py │ ├── early_stopping.py │ ├── flags.py │ ├── general.py │ ├── logger.py │ ├── m4c_evaluators.py │ ├── objects_to_byte_tensor.py │ 
├── phoc │ ├── __init__.py │ ├── build_phoc.py │ └── src │ │ └── cphoc.c │ ├── process_answers.py │ ├── text_utils.py │ ├── timer.py │ └── vocab.py ├── requirements.txt ├── run.sh ├── setup.py ├── tests ├── __init__.py ├── data │ └── vocab.txt ├── models │ └── test_cnn_lstm.py ├── modules │ ├── __init__.py │ ├── test_layers.py │ ├── test_losses.py │ └── test_metrics.py ├── tasks │ ├── __init__.py │ ├── test_base_dataset.py │ └── test_processors.py ├── test_utils.py └── utils │ ├── __init__.py │ ├── test_general.py │ ├── test_text_utils.py │ └── test_timer.py ├── textcap.yaml ├── tools ├── bert │ ├── extract_bert.sh │ └── extract_bert_embeddings.py └── run.py └── val.sh /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | 4 | Facebook has adopted a Code of Conduct that we expect project participants to adhere to. 5 | Please read the [full text](https://code.fb.com/codeofconduct/) 6 | so that you can understand what actions will and will not be tolerated. 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD License 2 | 3 | For Pythia software 4 | 5 | Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. 6 | 7 | Redistribution and use in source and binary forms, with or without modification, 8 | are permitted provided that the following conditions are met: 9 | 10 | * Redistributions of source code must retain the above copyright notice, this 11 | list of conditions and the following disclaimer. 12 | 13 | * Redistributions in binary form must reproduce the above copyright notice, 14 | this list of conditions and the following disclaimer in the documentation 15 | and/or other materials provided with the distribution. 16 | 17 | * Neither the name Facebook nor the names of its contributors may be used to 18 | endorse or promote products derived from this software without specific 19 | prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 22 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 25 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 26 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 28 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Towards Accurate Text-based Image Captioning with Content Diversity Exploration 2 | 3 | Pytorch implementation for the CVPR 2021 paper: [Towards Accurate Text-based Image Captioning with Content Diversity Exploration](https://github.com/guanghuixu/AnchorCaptioner/blob/main/projects/TextCap_CVPR.pdf) 4 | 5 |

6 | ![Anchor Captioner](overview.png) 7 | 

8 | 9 | ## Install 10 | 11 | Clone this repository, and build it with the following command. 12 | 13 | ``` 14 | # activate your own conda environment 15 | # [Alternative] 16 | # conda env create -f textcap.yaml 17 | # conda activate textcap 18 | 19 | git clone git@github.com:guanghuixu/AnchorCaptioner.git 20 | cd AnchorCaptioner 21 | python setup.py build develop 22 | ``` 23 | 24 | ## Data and running scripts 25 | 26 | Some specific annotations required by our method are provided in [here](https://github.com/guanghuixu/AnchorCaptioner/releases/download/data/data.zip). More details please refer to [projects/M4C_Captioner/README.md](https://github.com/guanghuixu/AnchorCaptioner/blob/main/projects/M4C_Captioner/README.md) 27 | 28 | ## Citation 29 | 30 | If you use any part of our code in your research, please cite our paper: 31 | 32 | ```BibTex 33 | @InProceedings{xu2021textcap, 34 | title = {Towards Accurate Text-based Image Captioning with Content Diversity Exploration}, 35 | author = {Guanghui Xu and Mingkui Tan and Shuaicheng Niu and Yucheng Luo and Qing Du and Qi Wu}, 36 | booktitle = {The IEEE Conference on Computer Vision and Pattern Recognition}, 37 | year = {2021} 38 | } 39 | ``` 40 | 41 | ## Acknowledgment 42 | 43 | The code is greatly inspired by the [MMF](https://mmf.readthedocs.io/en/latest/) and [M4C-Captioner](https://github.com/ronghanghu/mmf). 44 | 45 | -------------------------------------------------------------------------------- /configs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guanghuixu/AnchorCaptioner/3a49ce5de025087cbea00075ec0636aee0525382/configs/__init__.py -------------------------------------------------------------------------------- /configs/captioning/coco/butd.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - common/defaults/configs/datasets/captioning/coco.yml 3 | model_attributes: 4 | butd: &butd 5 | model_data_dir: ../data/ 6 | metrics: 7 | - type: caption_bleu4 8 | losses: 9 | - type: caption_cross_entropy 10 | classifier: 11 | type: language_decoder 12 | params: 13 | dropout: 0.5 14 | hidden_dim: 1024 15 | feature_dim: 2048 16 | fc_bias_init: 0 17 | image_feature_embeddings: 18 | - modal_combine: 19 | type: top_down_attention_lstm 20 | params: 21 | dropout: 0.5 22 | hidden_dim: 1024 23 | attention_dim: 1024 24 | normalization: softmax 25 | transform: 26 | type: linear 27 | params: 28 | out_dim: 1 29 | image_feature_dim: 2048 30 | embedding_dim: 300 31 | image_feature_encodings: 32 | - type: finetune_faster_rcnn_fpn_fc7 33 | params: 34 | bias_file: detectron/fc6/fc7_b.pkl 35 | weights_file: detectron/fc6/fc7_w.pkl 36 | inference: 37 | type: greedy 38 | optimizer_attributes: 39 | type: Adamax 40 | params: 41 | eps: 1.0e-08 42 | lr: 0.01 43 | weight_decay: 0 44 | training_parameters: 45 | clip_norm_mode: all 46 | clip_gradients: true 47 | lr_ratio: 0.1 48 | lr_scheduler: true 49 | lr_steps: 50 | - 15000 51 | - 25000 52 | - 35000 53 | - 45000 54 | max_grad_l2_norm: 0.25 55 | max_iterations: 50000 56 | use_warmup: true 57 | warmup_factor: 0.2 58 | warmup_iterations: 1000 59 | patience: 4000 60 | batch_size: 256 61 | num_workers: 7 62 | task_size_proportional_sampling: true 63 | monitored_metric: coco/caption_bleu4 64 | metric_minimize: false 65 | -------------------------------------------------------------------------------- /configs/captioning/coco/butd_beam_search.yml: 
-------------------------------------------------------------------------------- 1 | includes: 2 | - ../configs/captioning/coco/butd.yml 3 | model_attributes: 4 | butd: &butd 5 | inference: 6 | type: beam_search 7 | params: 8 | beam_length: 5 9 | training_parameters: 10 | batch_size: 1 11 | -------------------------------------------------------------------------------- /configs/captioning/m4c_textcaps/butd_beam_search.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - ../configs/captioning/m4c_textcaps/butd.yml 3 | dataset_attributes: 4 | coco: 5 | imdb_files: 6 | val: 7 | - imdb/m4c_textcaps/imdb_val_filtered_by_image_id.npy 8 | model_attributes: 9 | butd: &butd 10 | inference: 11 | type: beam_search 12 | params: 13 | beam_length: 5 14 | training_parameters: 15 | batch_size: 1 16 | -------------------------------------------------------------------------------- /configs/captioning/m4c_textcaps/butd_eval_pretrained_coco_model.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - ../configs/captioning/m4c_textcaps/butd_beam_search.yml 3 | dataset_attributes: 4 | coco: 5 | processors: 6 | text_processor: 7 | params: 8 | vocab: 9 | vocab_file: vocabs/vocabulary_captioning_thresh5.txt 10 | caption_processor: 11 | params: 12 | vocab: 13 | vocab_file: vocabs/vocabulary_captioning_thresh5.txt 14 | -------------------------------------------------------------------------------- /configs/captioning/m4c_textcaps/m4c_captioner_coco_eval_on_textcaps.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - ../configs/captioning/m4c_textcaps/m4c_captioner_coco.yml 3 | dataset_attributes: 4 | m4c_textcaps: 5 | image_features: 6 | val: 7 | - open_images/detectron_fix_100/fc6/train,m4c_textvqa_ocr_en_frcn_features/train_images 8 | test: 9 | - open_images/detectron_fix_100/fc6/test,m4c_textvqa_ocr_en_frcn_features/test_images 10 | imdb_files: 11 | val: 12 | - imdb/m4c_textcaps/imdb_val_filtered_by_image_id.npy # only one sample per image_id 13 | test: 14 | - imdb/m4c_textcaps/imdb_test_filtered_by_image_id.npy # only one sample per image_id 15 | -------------------------------------------------------------------------------- /configs/vqa/clevr/cnn_lstm.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - common/defaults/configs/datasets/vqa/clevr.yml 3 | 4 | model_attributes: 5 | cnn_lstm: 6 | metrics: 7 | - type: accuracy 8 | losses: 9 | - type: logit_bce 10 | text_embedding: 11 | embedding_dim: 20 12 | lstm: 13 | input_size: 20 14 | hidden_size: 50 15 | bidirectional: true 16 | batch_first: true 17 | cnn: 18 | layers: 19 | input_dims: [3, 64, 128, 128, 64, 64] 20 | output_dims: [64, 128, 128, 64, 64, 10] 21 | kernel_sizes: [7, 5, 5, 5, 5, 1] 22 | classifier: 23 | input_dim: 450 24 | 25 | optimizer_attributes: 26 | type: Adamax 27 | params: 28 | eps: 1.0e-08 29 | lr: 0.01 30 | weight_decay: 0 31 | 32 | training_parameters: 33 | batch_size: 128 34 | snapshot_interval: 6000 35 | monitored_metric: clevr/accuracy 36 | metric_minimize: false 37 | -------------------------------------------------------------------------------- /configs/vqa/m4c_ocrvqa/m4c.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - common/defaults/configs/datasets/vqa/m4c_ocrvqa.yml 3 | # Use soft copy 4 | dataset_attributes: 5 | m4c_ocrvqa: 6 | image_features: 7 | train: 8 | - 
m4c_ocrvqa_obj_frcn_features/all,m4c_ocrvqa_ocr_en_frcn_features/all 9 | val: 10 | - m4c_ocrvqa_obj_frcn_features/all,m4c_ocrvqa_ocr_en_frcn_features/all 11 | test: 12 | - m4c_ocrvqa_obj_frcn_features/all,m4c_ocrvqa_ocr_en_frcn_features/all 13 | imdb_files: 14 | train: 15 | - imdb/m4c_ocrvqa/imdb_train.npy 16 | val: 17 | - imdb/m4c_ocrvqa/imdb_val.npy 18 | test: 19 | - imdb/m4c_ocrvqa/imdb_test.npy 20 | processors: 21 | text_processor: 22 | type: bert_tokenizer 23 | params: 24 | max_length: 20 25 | answer_processor: 26 | type: m4c_answer 27 | params: 28 | vocab_file: m4c_vocabs/ocrvqa/fixed_answer_vocab_ocrvqa_82.txt 29 | preprocessor: 30 | type: simple_word 31 | params: {} 32 | context_preprocessor: 33 | type: simple_word 34 | params: {} 35 | max_length: 50 36 | max_copy_steps: 12 37 | num_answers: 10 38 | copy_processor: 39 | type: copy 40 | params: 41 | max_length: 100 42 | phoc_processor: 43 | type: phoc 44 | params: 45 | max_length: 50 46 | model_attributes: 47 | m4c: 48 | lr_scale_frcn: 0.1 49 | lr_scale_text_bert: 0.1 50 | lr_scale_mmt: 1.0 # no scaling 51 | text_bert_init_from_bert_base: true 52 | text_bert: 53 | num_hidden_layers: 3 54 | obj: 55 | mmt_in_dim: 2048 56 | dropout_prob: 0.1 57 | ocr: 58 | mmt_in_dim: 3002 # 300 (FastText) + 604 (PHOC) + 2048 (Faster R-CNN) + 50 (all zeros; legacy) 59 | dropout_prob: 0.1 60 | mmt: 61 | hidden_size: 768 62 | num_hidden_layers: 4 63 | classifier: 64 | type: linear 65 | ocr_max_num: 50 66 | ocr_ptr_net: 67 | hidden_size: 768 68 | query_key_size: 768 69 | params: {} 70 | model_data_dir: ../data 71 | metrics: 72 | - type: ocrvqa_accuracy 73 | losses: 74 | - type: m4c_decoding_bce_with_mask 75 | optimizer_attributes: 76 | params: 77 | eps: 1.0e-08 78 | lr: 1e-4 79 | weight_decay: 0 80 | type: Adam 81 | training_parameters: 82 | clip_norm_mode: all 83 | clip_gradients: true 84 | max_grad_l2_norm: 0.25 85 | lr_scheduler: true 86 | lr_steps: 87 | - 28000 88 | - 38000 89 | lr_ratio: 0.1 90 | use_warmup: true 91 | warmup_factor: 0.2 92 | warmup_iterations: 1000 93 | max_iterations: 48000 94 | batch_size: 128 95 | num_workers: 8 96 | task_size_proportional_sampling: true 97 | monitored_metric: m4c_ocrvqa/ocrvqa_accuracy 98 | metric_minimize: false 99 | -------------------------------------------------------------------------------- /configs/vqa/m4c_stvqa/m4c.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - common/defaults/configs/datasets/vqa/m4c_stvqa.yml 3 | # Use soft copy 4 | dataset_attributes: 5 | m4c_stvqa: 6 | image_features: 7 | train: 8 | - m4c_stvqa_obj_frcn_features/train,m4c_stvqa_ocr_en_frcn_features/train 9 | val: 10 | - m4c_stvqa_obj_frcn_features/train,m4c_stvqa_ocr_en_frcn_features/train 11 | test: 12 | - m4c_stvqa_obj_frcn_features/test_task3,m4c_stvqa_ocr_en_frcn_features/test_task3 13 | imdb_files: 14 | train: 15 | - imdb/m4c_stvqa/imdb_subtrain.npy 16 | val: 17 | - imdb/m4c_stvqa/imdb_subval.npy 18 | test: 19 | - imdb/m4c_stvqa/imdb_test_task3.npy 20 | processors: 21 | text_processor: 22 | type: bert_tokenizer 23 | params: 24 | max_length: 20 25 | answer_processor: 26 | type: m4c_answer 27 | params: 28 | vocab_file: m4c_vocabs/stvqa/fixed_answer_vocab_stvqa_5k.txt 29 | preprocessor: 30 | type: simple_word 31 | params: {} 32 | context_preprocessor: 33 | type: simple_word 34 | params: {} 35 | max_length: 50 36 | max_copy_steps: 12 37 | num_answers: 10 38 | copy_processor: 39 | type: copy 40 | params: 41 | max_length: 100 42 | phoc_processor: 43 | type: phoc 44 | params: 
45 | max_length: 50 46 | model_attributes: 47 | m4c: 48 | lr_scale_frcn: 0.1 49 | lr_scale_text_bert: 0.1 50 | lr_scale_mmt: 1.0 # no scaling 51 | text_bert_init_from_bert_base: true 52 | text_bert: 53 | num_hidden_layers: 3 54 | obj: 55 | mmt_in_dim: 2048 56 | dropout_prob: 0.1 57 | ocr: 58 | mmt_in_dim: 3002 # 300 (FastText) + 604 (PHOC) + 2048 (Faster R-CNN) + 50 (all zeros; legacy) 59 | dropout_prob: 0.1 60 | mmt: 61 | hidden_size: 768 62 | num_hidden_layers: 4 63 | classifier: 64 | type: linear 65 | ocr_max_num: 50 66 | ocr_ptr_net: 67 | hidden_size: 768 68 | query_key_size: 768 69 | params: {} 70 | model_data_dir: ../data 71 | metrics: 72 | - type: stvqa_accuracy 73 | - type: stvqa_anls 74 | losses: 75 | - type: m4c_decoding_bce_with_mask 76 | optimizer_attributes: 77 | params: 78 | eps: 1.0e-08 79 | lr: 1e-4 80 | weight_decay: 0 81 | type: Adam 82 | training_parameters: 83 | clip_norm_mode: all 84 | clip_gradients: true 85 | max_grad_l2_norm: 0.25 86 | lr_scheduler: true 87 | lr_steps: 88 | - 14000 89 | - 19000 90 | lr_ratio: 0.1 91 | use_warmup: true 92 | warmup_factor: 0.2 93 | warmup_iterations: 1000 94 | max_iterations: 24000 95 | batch_size: 128 96 | num_workers: 8 97 | task_size_proportional_sampling: true 98 | monitored_metric: m4c_stvqa/stvqa_accuracy 99 | metric_minimize: false 100 | -------------------------------------------------------------------------------- /configs/vqa/m4c_textvqa/m4c.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - common/defaults/configs/datasets/vqa/m4c_textvqa.yml 3 | # Use soft copy 4 | dataset_attributes: 5 | m4c_textvqa: 6 | image_features: 7 | train: 8 | - open_images/detectron_fix_100/fc6/train,m4c_textvqa_ocr_en_frcn_features/train_images 9 | val: 10 | - open_images/detectron_fix_100/fc6/train,m4c_textvqa_ocr_en_frcn_features/train_images 11 | test: 12 | - open_images/detectron_fix_100/fc6/test,m4c_textvqa_ocr_en_frcn_features/test_images 13 | imdb_files: 14 | train: 15 | - imdb/m4c_textvqa/imdb_train_ocr_en.npy 16 | val: 17 | - imdb/m4c_textvqa/imdb_val_ocr_en.npy 18 | test: 19 | - imdb/m4c_textvqa/imdb_test_ocr_en.npy 20 | processors: 21 | text_processor: 22 | type: bert_tokenizer 23 | params: 24 | max_length: 20 25 | answer_processor: 26 | type: m4c_answer 27 | params: 28 | vocab_file: m4c_vocabs/textvqa/fixed_answer_vocab_textvqa_5k.txt 29 | preprocessor: 30 | type: simple_word 31 | params: {} 32 | context_preprocessor: 33 | type: simple_word 34 | params: {} 35 | max_length: 50 36 | max_copy_steps: 12 37 | num_answers: 10 38 | copy_processor: 39 | type: copy 40 | params: 41 | max_length: 100 42 | phoc_processor: 43 | type: phoc 44 | params: 45 | max_length: 50 46 | model_attributes: 47 | m4c: 48 | lr_scale_frcn: 0.1 49 | lr_scale_text_bert: 0.1 50 | lr_scale_mmt: 1.0 # no scaling 51 | text_bert_init_from_bert_base: true 52 | text_bert: 53 | num_hidden_layers: 3 54 | obj: 55 | mmt_in_dim: 2048 56 | dropout_prob: 0.1 57 | ocr: 58 | mmt_in_dim: 3002 # 300 (FastText) + 604 (PHOC) + 2048 (Faster R-CNN) + 50 (all zeros; legacy) 59 | dropout_prob: 0.1 60 | mmt: 61 | hidden_size: 768 62 | num_hidden_layers: 4 63 | classifier: 64 | type: linear 65 | ocr_max_num: 50 66 | ocr_ptr_net: 67 | hidden_size: 768 68 | query_key_size: 768 69 | params: {} 70 | model_data_dir: ../data 71 | metrics: 72 | - type: textvqa_accuracy 73 | losses: 74 | - type: m4c_decoding_bce_with_mask 75 | optimizer_attributes: 76 | params: 77 | eps: 1.0e-08 78 | lr: 1e-4 79 | weight_decay: 0 80 | type: Adam 81 | 
training_parameters: 82 | clip_norm_mode: all 83 | clip_gradients: true 84 | max_grad_l2_norm: 0.25 85 | lr_scheduler: true 86 | lr_steps: 87 | - 14000 88 | - 19000 89 | lr_ratio: 0.1 90 | use_warmup: true 91 | warmup_factor: 0.2 92 | warmup_iterations: 1000 93 | max_iterations: 24000 94 | batch_size: 128 95 | num_workers: 8 96 | task_size_proportional_sampling: true 97 | monitored_metric: m4c_textvqa/textvqa_accuracy 98 | metric_minimize: false 99 | -------------------------------------------------------------------------------- /configs/vqa/m4c_textvqa/m4c_ocr_ml.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - common/defaults/configs/datasets/vqa/m4c_textvqa.yml 3 | # Use soft copy 4 | dataset_attributes: 5 | m4c_textvqa: 6 | image_features: 7 | train: 8 | - open_images/detectron_fix_100/fc6/train,m4c_textvqa_ocr_ml_frcn_features/train_images 9 | val: 10 | - open_images/detectron_fix_100/fc6/train,m4c_textvqa_ocr_ml_frcn_features/train_images 11 | test: 12 | - open_images/detectron_fix_100/fc6/test,m4c_textvqa_ocr_ml_frcn_features/test_images 13 | imdb_files: 14 | train: 15 | - imdb/m4c_textvqa/imdb_train_ocr_ml.npy 16 | val: 17 | - imdb/m4c_textvqa/imdb_val_ocr_ml.npy 18 | test: 19 | - imdb/m4c_textvqa/imdb_test_ocr_ml.npy 20 | processors: 21 | text_processor: 22 | type: bert_tokenizer 23 | params: 24 | max_length: 20 25 | answer_processor: 26 | type: m4c_answer 27 | params: 28 | vocab_file: m4c_vocabs/textvqa/fixed_answer_vocab_textvqa_5k.txt 29 | preprocessor: 30 | type: simple_word 31 | params: {} 32 | context_preprocessor: 33 | type: simple_word 34 | params: {} 35 | max_length: 50 36 | max_copy_steps: 12 37 | num_answers: 10 38 | copy_processor: 39 | type: copy 40 | params: 41 | max_length: 100 42 | phoc_processor: 43 | type: phoc 44 | params: 45 | max_length: 50 46 | model_attributes: 47 | m4c: 48 | lr_scale_frcn: 0.1 49 | lr_scale_text_bert: 0.1 50 | lr_scale_mmt: 1.0 # no scaling 51 | text_bert_init_from_bert_base: true 52 | text_bert: 53 | num_hidden_layers: 3 54 | obj: 55 | mmt_in_dim: 2048 56 | dropout_prob: 0.1 57 | ocr: 58 | mmt_in_dim: 3002 # 300 (FastText) + 604 (PHOC) + 2048 (Faster R-CNN) + 50 (all zeros; legacy) 59 | dropout_prob: 0.1 60 | mmt: 61 | hidden_size: 768 62 | num_hidden_layers: 4 63 | classifier: 64 | type: linear 65 | ocr_max_num: 50 66 | ocr_ptr_net: 67 | hidden_size: 768 68 | query_key_size: 768 69 | params: {} 70 | model_data_dir: ../data 71 | metrics: 72 | - type: textvqa_accuracy 73 | losses: 74 | - type: m4c_decoding_bce_with_mask 75 | optimizer_attributes: 76 | params: 77 | eps: 1.0e-08 78 | lr: 1e-4 79 | weight_decay: 0 80 | type: Adam 81 | training_parameters: 82 | clip_norm_mode: all 83 | clip_gradients: true 84 | max_grad_l2_norm: 0.25 85 | lr_scheduler: true 86 | lr_steps: 87 | - 14000 88 | - 19000 89 | lr_ratio: 0.1 90 | use_warmup: true 91 | warmup_factor: 0.2 92 | warmup_iterations: 1000 93 | max_iterations: 24000 94 | batch_size: 128 95 | num_workers: 8 96 | task_size_proportional_sampling: true 97 | monitored_metric: m4c_textvqa/textvqa_accuracy 98 | metric_minimize: false 99 | -------------------------------------------------------------------------------- /configs/vqa/textvqa/ban.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - ../configs/vqa/textvqa/pythia.yml 3 | model_attributes: 4 | ban: 5 | metrics: 6 | - type: vqa_accuracy 7 | losses: 8 | - type: logit_bce 9 | text_embedding: 10 | num_hidden: 1280 11 | vocab_size: 1280 
12 | emb_size: 300 13 | num_layers: 1 14 | dropout: 0.0 15 | bidirectional: False 16 | rnn_type: 'GRU' 17 | bilinear_attention: 18 | bc_net: 19 | k: 1 20 | dropout: [0.2, 0.5] 21 | h_out: 22 | fc_net: 23 | dims: 600 24 | activation: 25 | dropout: 0.2 26 | gamma: 4 27 | visual_feat_dim: 2048 28 | classifier: 29 | # out dim will be taken from registry as set by dataset builder 30 | hidden_size: 600 31 | dropout: 0.5 32 | -------------------------------------------------------------------------------- /configs/vqa/textvqa/pythia.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - common/defaults/configs/datasets/vqa/textvqa.yml 3 | model_attributes: 4 | pythia: &pythia 5 | model_data_dir: ../data 6 | metrics: 7 | - type: vqa_accuracy 8 | losses: 9 | - type: logit_bce 10 | num_context_features: 1 11 | context_feature_dim: 300 12 | image_feature_dim: 2048 13 | context_max_len: 50 14 | classifier: 15 | type: logit 16 | params: 17 | img_hidden_dim: 5000 18 | text_hidden_dim: 300 19 | image_feature_embeddings: 20 | - modal_combine: 21 | type: non_linear_element_multiply 22 | params: 23 | dropout: 0 24 | hidden_dim: 5000 25 | normalization: softmax 26 | transform: 27 | type: linear 28 | params: 29 | out_dim: 1 30 | image_feature_encodings: 31 | - type: finetune_faster_rcnn_fpn_fc7 32 | params: 33 | bias_file: detectron/fc6/fc7_b.pkl 34 | weights_file: detectron/fc6/fc7_w.pkl 35 | - type: default 36 | params: {} 37 | context_feature_encodings: 38 | - type: default 39 | params: {} 40 | image_text_modal_combine: 41 | type: non_linear_element_multiply 42 | params: 43 | dropout: 0 44 | hidden_dim: 5000 45 | text_embeddings: 46 | - type: attention 47 | params: 48 | hidden_dim: 1024 49 | num_layers: 1 50 | conv1_out: 512 51 | conv2_out: 2 52 | dropout: 0 53 | embedding_dim: 300 54 | kernel_size: 1 55 | padding: 0 56 | pythia_image_only: *pythia 57 | pythia_question_only: *pythia 58 | optimizer_attributes: 59 | type: Adamax 60 | params: 61 | lr: 0.005 62 | training_parameters: 63 | clip_norm_mode: all 64 | clip_gradients: false 65 | max_grad_l2_norm: 0.25 66 | lr_scheduler: true 67 | lr_steps: 68 | - 14000 69 | lr_ratio: 0.01 70 | use_warmup: true 71 | warmup_factor: 0.2 72 | warmup_iterations: 1000 73 | max_iterations: 24000 74 | batch_size: 128 75 | num_workers: 7 76 | task_size_proportional_sampling: true 77 | monitored_metric: textvqa/vqa_accuracy 78 | pretrained_mapping: 79 | text_embeddings: text_embeddings 80 | image_feature_encoders: image_feature_encoders 81 | image_feature_embeddings_list: image_feature_embeddings_list 82 | image_text_multi_modal_combine_layer: image_text_multi_modal_combine_layer 83 | metric_minimize: false 84 | -------------------------------------------------------------------------------- /configs/vqa/visual_genome/pythia.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - ../configs/vqa/vqa2/pythia.yml 3 | - common/defaults/configs/datasets/vqa/visual_genome.yml 4 | dataset_attributes: 5 | visual_genome: 6 | return_scene_graph: false 7 | return_objects: false 8 | return_relationships: false 9 | return_info: false 10 | no_unk: true 11 | training_parameters: 12 | monitored_metric: vqa2/vqa_accuracy -------------------------------------------------------------------------------- /configs/vqa/vizwiz/ban.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - ../configs/vqa/vizwiz/pythia.yml 3 | model_attributes: 4 | ban: 5 
| metrics: 6 | - type: vqa_accuracy 7 | losses: 8 | - type: logit_bce 9 | text_embedding: 10 | num_hidden: 1280 11 | vocab_size: 1280 12 | emb_size: 300 13 | num_layers: 1 14 | dropout: 0.0 15 | bidirectional: False 16 | rnn_type: 'GRU' 17 | bilinear_attention: 18 | bc_net: 19 | k: 1 20 | dropout: [0.2, 0.5] 21 | h_out: 22 | fc_net: 23 | dims: 600 24 | activation: 25 | dropout: 0.2 26 | gamma: 4 27 | visual_feat_dim: 2048 28 | classifier: 29 | # out dim will be taken from registry as set by dataset builder 30 | hidden_size: 600 31 | dropout: 0.5 32 | -------------------------------------------------------------------------------- /configs/vqa/vizwiz/pythia.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - common/defaults/configs/datasets/vqa/vizwiz.yml 3 | model_attributes: 4 | pythia: &pythia 5 | model_data_dir: ../data/ 6 | metrics: 7 | - type: vqa_accuracy 8 | losses: 9 | - type: logit_bce 10 | classifier: 11 | type: logit 12 | params: 13 | img_hidden_dim: 5000 14 | text_hidden_dim: 300 15 | image_feature_embeddings: 16 | - modal_combine: 17 | type: non_linear_element_multiply 18 | params: 19 | dropout: 0 20 | hidden_dim: 5000 21 | normalization: softmax 22 | transform: 23 | type: linear 24 | params: 25 | out_dim: 1 26 | image_feature_dim: 2048 27 | image_feature_encodings: 28 | - type: finetune_faster_rcnn_fpn_fc7 29 | params: 30 | bias_file: detectron/fc6/fc7_b.pkl 31 | weights_file: detectron/fc6/fc7_w.pkl 32 | - type: default 33 | params: {} 34 | image_text_modal_combine: 35 | type: non_linear_element_multiply 36 | params: 37 | dropout: 0 38 | hidden_dim: 5000 39 | text_embeddings: 40 | - type: attention 41 | params: 42 | hidden_dim: 1024 43 | num_layers: 1 44 | conv1_out: 512 45 | conv2_out: 2 46 | dropout: 0 47 | embedding_dim: 300 48 | kernel_size: 1 49 | padding: 0 50 | pythia_image_only: *pythia 51 | pythia_question_only: *pythia 52 | optimizer_attributes: 53 | type: Adamax 54 | params: 55 | lr: 0.005 56 | training_parameters: 57 | clip_norm_mode: all 58 | clip_gradients: true 59 | max_grad_l2_norm: 0.25 60 | lr_scheduler: true 61 | lr_steps: 62 | - 14000 63 | lr_ratio: 0.01 64 | use_warmup: true 65 | warmup_factor: 0.2 66 | warmup_iterations: 1000 67 | max_iterations: 24000 68 | batch_size: 128 69 | num_workers: 7 70 | task_size_proportional_sampling: true 71 | monitored_metric: vizwiz/vqa_accuracy 72 | metric_minimize: false 73 | pretrained_mapping: 74 | word_embedding: word_embedding 75 | text_embeddings: text_embeddings 76 | image_feature_encoders: image_feature_encoders 77 | image_feature_embeddings_list: image_feature_embeddings_list 78 | image_text_multi_modal_combine_layer: image_text_multi_modal_combine_layer 79 | -------------------------------------------------------------------------------- /configs/vqa/vqa2/ban.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - common/defaults/configs/datasets/vqa/vqa2.yml 3 | model_attributes: 4 | ban: 5 | metrics: 6 | - vqa_accuracy 7 | losses: 8 | - type: logit_bce 9 | text_embedding: 10 | num_hidden: 1280 11 | vocab_size: 1280 12 | emb_size: 300 13 | num_layers: 1 14 | dropout: 0.0 15 | bidirectional: False 16 | rnn_type: 'GRU' 17 | bilinear_attention: 18 | bc_net: 19 | k: 1 20 | dropout: [0.2, 0.5] 21 | h_out: 22 | fc_net: 23 | dims: 600 24 | activation: 25 | dropout: 0.2 26 | gamma: 4 27 | visual_feat_dim: 2048 28 | classifier: 29 | # out dim will be taken from registry as set by dataset builder 30 | hidden_size: 600 
31 | dropout: 0.5 32 | optimizer_attributes: 33 | type: Adamax 34 | params: 35 | eps: 1.0e-08 36 | lr: 0.01 37 | weight_decay: 0 38 | training_parameters: 39 | clip_norm_mode: all 40 | clip_gradients: true 41 | lr_ratio: 0.1 42 | lr_scheduler: true 43 | lr_steps: 44 | - 15000 45 | - 18000 46 | - 20000 47 | - 21000 48 | max_grad_l2_norm: 0.25 49 | max_iterations: 22000 50 | use_warmup: true 51 | warmup_factor: 0.2 52 | warmup_iterations: 1000 53 | patience: 4000 54 | batch_size: 512 55 | num_workers: 7 56 | task_size_proportional_sampling: true 57 | monitored_metric: vqa2/vqa_accuracy 58 | metric_minimize: false 59 | -------------------------------------------------------------------------------- /configs/vqa/vqa2/lorra_train_and_val.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - ../configs/vqa/vqa2/lorra.yml 3 | task_attributes: 4 | vqa: 5 | dataset_attributes: 6 | vqa2: 7 | image_features: 8 | train: 9 | - coco/detectron_fix_100/fc6/train_val_2014,coco/resnet152/train_val_2014 10 | - coco/detectron_fix_100/fc6/train_val_2014,coco/resnet152/train_val_2014 11 | val: 12 | - coco/detectron_fix_100/fc6/train_val_2014,coco/resnet152/train_val_2014 13 | imdb_files: 14 | train: 15 | - imdb/vqa/imdb_train2014.npy 16 | - imdb/vqa/imdb_val2014.npy 17 | val: 18 | - imdb/vqa/imdb_minival2014.npy 19 | -------------------------------------------------------------------------------- /configs/vqa/vqa2/pythia.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - common/defaults/configs/datasets/vqa/vqa2.yml 3 | model_attributes: 4 | pythia: &pythia 5 | model_data_dir: ../data/ 6 | metrics: 7 | - type: vqa_accuracy 8 | losses: 9 | - type: logit_bce 10 | classifier: 11 | type: logit 12 | params: 13 | img_hidden_dim: 5000 14 | text_hidden_dim: 300 15 | image_feature_embeddings: 16 | - modal_combine: 17 | type: non_linear_element_multiply 18 | params: 19 | dropout: 0 20 | hidden_dim: 5000 21 | normalization: softmax 22 | transform: 23 | type: linear 24 | params: 25 | out_dim: 1 26 | image_feature_dim: 2048 27 | image_feature_encodings: 28 | - type: finetune_faster_rcnn_fpn_fc7 29 | params: 30 | bias_file: detectron/fc6/fc7_b.pkl 31 | weights_file: detectron/fc6/fc7_w.pkl 32 | - type: default 33 | params: {} 34 | image_text_modal_combine: 35 | type: non_linear_element_multiply 36 | params: 37 | dropout: 0 38 | hidden_dim: 5000 39 | text_embeddings: 40 | - type: attention 41 | params: 42 | hidden_dim: 1024 43 | num_layers: 1 44 | conv1_out: 512 45 | conv2_out: 2 46 | dropout: 0 47 | embedding_dim: 300 48 | kernel_size: 1 49 | padding: 0 50 | pythia_image_only: *pythia 51 | pythia_question_only: *pythia 52 | optimizer_attributes: 53 | type: Adamax 54 | params: 55 | eps: 1.0e-08 56 | lr: 0.01 57 | weight_decay: 0 58 | training_parameters: 59 | clip_norm_mode: all 60 | clip_gradients: true 61 | lr_ratio: 0.1 62 | lr_scheduler: true 63 | lr_steps: 64 | - 15000 65 | - 18000 66 | - 20000 67 | - 21000 68 | max_grad_l2_norm: 0.25 69 | max_iterations: 22000 70 | use_warmup: true 71 | warmup_factor: 0.2 72 | warmup_iterations: 1000 73 | patience: 4000 74 | batch_size: 512 75 | num_workers: 7 76 | task_size_proportional_sampling: true 77 | monitored_metric: vqa2/vqa_accuracy 78 | metric_minimize: false 79 | pretrained_mapping: 80 | word_embedding: word_embedding 81 | text_embeddings: text_embeddings 82 | image_feature_encoders: image_feature_encoders 83 | image_feature_embeddings_list: 
image_feature_embeddings_list 84 | image_text_multi_modal_combine_layer: image_text_multi_modal_combine_layer 85 | classifier: classifier 86 | -------------------------------------------------------------------------------- /configs/vqa/vqa2/pythia_12k_iterations_no_resnet.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - ../configs/vqa/vqa2/pythia.yml 3 | dataset_attributes: 4 | vqa2: 5 | image_features: 6 | train: 7 | - coco/detectron_fix_100/fc6/train_val_2014 8 | - coco/detectron_fix_100/fc6/train_val_2014 9 | val: 10 | - coco/detectron_fix_100/fc6/train_val_2014 11 | test: 12 | - coco/detectron_fix_100/fc6/test2015 13 | imdb_files: 14 | train: 15 | - imdb/vqa/imdb_train2014.npy 16 | - imdb/vqa/imdb_val2014.npy 17 | val: 18 | - imdb/vqa/imdb_minival2014.npy 19 | model_attributes: 20 | pythia: 21 | image_feature_encodings: 22 | - type: finetune_faster_rcnn_fpn_fc7 23 | params: 24 | bias_file: detectron/fc6/fc7_b.pkl 25 | weights_file: detectron/fc6/fc7_w.pkl 26 | training_parameters: 27 | max_iterations: 12000 28 | lr_steps: 29 | - 5000 30 | - 7000 31 | - 9000 32 | - 11000 33 | -------------------------------------------------------------------------------- /configs/vqa/vqa2/pythia_train_and_val.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - ../configs/vqa/vqa2/pythia.yml 3 | dataset_attributes: 4 | vqa2: 5 | image_features: 6 | train: 7 | - coco/detectron_fix_100/fc6/train_val_2014,coco/resnet152/train_val_2014 8 | - coco/detectron_fix_100/fc6/train_val_2014,coco/resnet152/train_val_2014 9 | val: 10 | - coco/detectron_fix_100/fc6/train_val_2014,coco/resnet152/train_val_2014 11 | test: 12 | - coco/detectron_fix_100/fc6/test2015,coco/resnet152/test2015 13 | imdb_files: 14 | train: 15 | - imdb/vqa/imdb_train2014.npy 16 | - imdb/vqa/imdb_val2014.npy 17 | val: 18 | - imdb/vqa/imdb_minival2014.npy 19 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = pythia 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | recommonmark==0.5.0 2 | sphinx 3 | sphinx_rtd_theme==0.4.3 4 | -------------------------------------------------------------------------------- /docs/source/common/registry.rst: -------------------------------------------------------------------------------- 1 | common.registry 2 | =============== 3 | 4 | .. 
automodule:: pythia.common.registry 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/source/common/sample.rst: -------------------------------------------------------------------------------- 1 | common.sample 2 | =============== 3 | 4 | .. automodule:: pythia.common.sample 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/source/datasets/base_dataset.rst: -------------------------------------------------------------------------------- 1 | datasets.base_dataset 2 | ================== 3 | 4 | .. automodule:: pythia.datasets.base_dataset 5 | :members: 6 | :private-members: 7 | -------------------------------------------------------------------------------- /docs/source/datasets/base_dataset_builder.rst: -------------------------------------------------------------------------------- 1 | datasets.base_dataset_builder 2 | ========================== 3 | 4 | .. automodule:: pythia.datasets.base_dataset_builder 5 | :members: 6 | :private-members: 7 | -------------------------------------------------------------------------------- /docs/source/datasets/base_task.rst: -------------------------------------------------------------------------------- 1 | datasets.base_task 2 | ========================== 3 | 4 | .. automodule:: pythia.datasets.base_task 5 | :members: 6 | :private-members: 7 | -------------------------------------------------------------------------------- /docs/source/datasets/processors.rst: -------------------------------------------------------------------------------- 1 | datasets.processors 2 | ========================== 3 | 4 | .. automodule:: pythia.datasets.processors 5 | :members: 6 | :private-members: 7 | -------------------------------------------------------------------------------- /docs/source/models/base_model.rst: -------------------------------------------------------------------------------- 1 | models.base_model 2 | ================= 3 | 4 | .. automodule:: pythia.models.base_model 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/source/modules/losses.rst: -------------------------------------------------------------------------------- 1 | modules.losses 2 | =============== 3 | 4 | .. automodule:: pythia.modules.losses 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/source/modules/metrics.rst: -------------------------------------------------------------------------------- 1 | modules.metrics 2 | =============== 3 | 4 | .. automodule:: pythia.modules.metrics 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/source/tutorials/features.rst: -------------------------------------------------------------------------------- 1 | Features 2 | ======== 3 | 4 | Pythia features: 5 | 6 | - **Model Zoo**: Reference implementations for state-of-the-art vision and language model including 7 | LoRRA_ (SoTA on VQA and TextVQA), Pythia_ model (VQA 2018 challenge winner), BAN and BUTD_. 8 | - **Multi-Tasking**: Support for multi-tasking which allows training on 9 | multiple datasets together. 10 | - **Datasets**: Includes support for various datasets built-in including VQA, VizWiz, 11 | TextVQA, VisualDialog, MS COCO Captioning. 
12 | - **Modules**: Provides implementations for many commonly used layers 13 | in vision and language domain 14 | - **Distributed**: Support for distributed training based on DataParallel 15 | as well as DistributedDataParallel. 16 | - **Unopinionated**: Unopinionated about the dataset and model implementations 17 | built on top of it. 18 | - **Customization**: Custom losses, metrics, scheduling, optimizers, tensorboard; 19 | suits all your custom needs. 20 | 21 | You can use Pythia to **bootstrap** for your next vision and language multimodal 22 | research project. 23 | 24 | Pythia can also act as **starter codebase** for challenges around vision and 25 | language datasets (TextVQA challenge, VQA challenge). 26 | 27 | .. _lorra: https://arxiv.org/abs/1904.08920 28 | .. _pythia: https://arxiv.org/abs/1807.09956 29 | .. _butd: https://arxiv.org/abs/1707.07998 30 | -------------------------------------------------------------------------------- /overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guanghuixu/AnchorCaptioner/3a49ce5de025087cbea00075ec0636aee0525382/overview.png -------------------------------------------------------------------------------- /projects/M4C_Captioner/scripts/coco_eval.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | import numpy as np 4 | import os 5 | 6 | sys.path.append( 7 | os.path.join(os.path.dirname(__file__), '../../../pythia/scripts/coco/') 8 | ) 9 | import coco_caption_eval # NoQA 10 | 11 | 12 | def print_metrics(res_metrics): 13 | print(res_metrics) 14 | keys = ['Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4', 'METEOR', 'ROUGE_L', 'SPICE', 'CIDEr'] 15 | print('\n\n**********\nFinal model performance:\n**********') 16 | for k in keys: 17 | print(k, ': %.1f' % (res_metrics[k] * 100)) 18 | 19 | 20 | if __name__ == '__main__': 21 | import argparse 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument('--pred_file', type=str, required=True) 24 | parser.add_argument('--set', type=str, default='karpathy_val') 25 | args = parser.parse_args() 26 | 27 | with open(args.pred_file) as f: 28 | preds = json.load(f) 29 | imdb_file = os.path.join( 30 | os.path.dirname(__file__), 31 | '../../../data/imdb/m4c_coco/imdb_{}.npy'.format(args.set) 32 | ) 33 | imdb = np.load(imdb_file, allow_pickle=True) 34 | imdb = imdb[1:] 35 | 36 | gts = [ 37 | {'image_id': info['image_id'], 'caption': info['caption_str']} 38 | for info in imdb 39 | ] 40 | preds = [ 41 | {'image_id': int(p['image_id']), 'caption': p['caption']} 42 | for p in preds 43 | ] 44 | imgids = list(set(g['image_id'] for g in gts)) 45 | 46 | metrics = coco_caption_eval.calculate_metrics( 47 | imgids, {'annotations': gts}, {'annotations': preds} 48 | ) 49 | 50 | print_metrics(metrics) 51 | -------------------------------------------------------------------------------- /projects/M4C_Captioner/scripts/textcaps_eval.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | import numpy as np 4 | import os 5 | 6 | sys.path.append( 7 | os.path.join(os.path.dirname(__file__), '../../../pythia/scripts/coco/') 8 | ) 9 | import coco_caption_eval # NoQA 10 | 11 | 12 | def print_metrics(res_metrics): 13 | print(res_metrics) 14 | keys = ['Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4', 'METEOR', 'ROUGE_L', 'SPICE', 'CIDEr'] 15 | print('\n\n**********\nFinal model performance:\n**********') 16 | for k in keys: 17 | print(k, ': 
%.1f' % (res_metrics[k] * 100)) 18 | 19 | 20 | if __name__ == '__main__': 21 | import argparse 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument('--pred_file', type=str, required=True) 24 | parser.add_argument('--set', type=str, default='val') 25 | args = parser.parse_args() 26 | 27 | if args.set not in ['train', 'val']: 28 | raise Exception( 29 | 'this script only supports TextCaps train and val set. ' 30 | 'Please use the EvalAI server for test set evaluation' 31 | ) 32 | 33 | with open(args.pred_file) as f: 34 | preds = json.load(f) 35 | imdb_file = os.path.join( 36 | os.path.dirname(__file__), 37 | '../../../data/imdb/m4c_textcaps/imdb_{}.npy'.format(args.set) 38 | ) 39 | imdb = np.load(imdb_file, allow_pickle=True) 40 | imdb = imdb[1:] 41 | 42 | gts = [ 43 | {'image_id': info['image_id'], 'caption': info['caption_str']} 44 | for info in imdb 45 | ] 46 | preds = [ 47 | {'image_id': p['image_id'], 'caption': p['caption']} 48 | for p in preds 49 | ] 50 | imgids = list(set(g['image_id'] for g in gts)) 51 | 52 | metrics = coco_caption_eval.calculate_metrics( 53 | imgids, {'annotations': gts}, {'annotations': preds} 54 | ) 55 | 56 | print_metrics(metrics) 57 | -------------------------------------------------------------------------------- /projects/TextCap_CVPR.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guanghuixu/AnchorCaptioner/3a49ce5de025087cbea00075ec0636aee0525382/projects/TextCap_CVPR.pdf -------------------------------------------------------------------------------- /pythia/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.3.0" 2 | -------------------------------------------------------------------------------- /pythia/common/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /pythia/common/batch_collator.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from pythia.common.sample import SampleList 3 | 4 | 5 | class BatchCollator: 6 | # TODO: Think more if there is a better way to do this 7 | _IDENTICAL_VALUE_KEYS = ["dataset_type", "dataset_name"] 8 | 9 | def __call__(self, batch): 10 | sample_list = SampleList(batch) 11 | for key in self._IDENTICAL_VALUE_KEYS: 12 | sample_list[key + "_"] = sample_list[key] 13 | sample_list[key] = sample_list[key][0] 14 | 15 | return sample_list 16 | -------------------------------------------------------------------------------- /pythia/common/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | import os 3 | 4 | 5 | imdb_version = 1 6 | FASTTEXT_WIKI_URL = ( 7 | "https://dl.fbaipublicfiles.com/pythia/pretrained_models/fasttext/wiki.en.bin" 8 | ) 9 | 10 | CLEVR_DOWNLOAD_URL = ( 11 | "https://dl.fbaipublicfiles.com/clevr/CLEVR_v1.0.zip" 12 | ) 13 | 14 | VISUAL_GENOME_CONSTS = { 15 | "imdb_url": "https://dl.fbaipublicfiles.com/pythia/data/imdb/visual_genome.tar.gz", 16 | "features_url": "https://dl.fbaipublicfiles.com/pythia/features/visual_genome.tar.gz", 17 | "synset_file": "vg_synsets.txt", 18 | "vocabs": "https://dl.fbaipublicfiles.com/pythia/data/vocab.tar.gz" 19 | } 20 | 21 | VISUAL_DIALOG_CONSTS = { 22 | "imdb_url": { 23 | "train": "https://www.dropbox.com/s/ix8keeudqrd8hn8/visdial_1.0_train.zip?dl=1", 24 | "val": "https://www.dropbox.com/s/ibs3a0zhw74zisc/visdial_1.0_val.zip?dl=1", 25 | "test": "https://www.dropbox.com/s/ibs3a0zhw74zisc/visdial_1.0_test.zip?dl=1" 26 | }, 27 | "features_url": { 28 | "visual_dialog": "https://dl.fbaipublicfiles.com/pythia/features/visual_dialog.tar.gz", 29 | "coco": "https://dl.fbaipublicfiles.com/pythia/features/coco.tar.gz" 30 | }, 31 | "vocabs": "https://dl.fbaipublicfiles.com/pythia/data/vocab.tar.gz" 32 | } 33 | 34 | DOWNLOAD_CHUNK_SIZE = 1024 * 1024 35 | -------------------------------------------------------------------------------- /pythia/common/dataset_loader.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import os 3 | 4 | import yaml 5 | from torch.utils.data import DataLoader 6 | 7 | from pythia.common.batch_collator import BatchCollator 8 | from pythia.common.test_reporter import TestReporter 9 | from pythia.datasets.multi_dataset import MultiDataset 10 | from pythia.datasets.samplers import DistributedSampler 11 | from pythia.utils.general import get_batch_size 12 | 13 | 14 | class DatasetLoader: 15 | def __init__(self, config): 16 | self.config = config 17 | 18 | def load_datasets(self): 19 | self.train_dataset = MultiDataset("train") 20 | self.val_dataset = MultiDataset("val") 21 | self.test_dataset = MultiDataset("test") 22 | 23 | self.train_dataset.load(**self.config) 24 | self.val_dataset.load(**self.config) 25 | self.test_dataset.load(**self.config) 26 | 27 | if self.train_dataset.num_datasets == 1: 28 | self.train_loader = self.train_dataset.first_loader 29 | self.val_loader = self.val_dataset.first_loader 30 | self.test_loader = self.test_dataset.first_loader 31 | else: 32 | self.train_loader = self.train_dataset 33 | self.val_loader = self.val_dataset 34 | self.test_loader = self.test_dataset 35 | 36 | self.mapping = { 37 | "train": self.train_dataset, 38 | "val": self.val_dataset, 39 | "test": self.test_dataset, 40 | } 41 | 42 | self.test_reporter = None 43 | self.should_not_log = self.config.training_parameters.should_not_log 44 | 45 | @property 46 | def dataset_config(self): 47 | return self._dataset_config 48 | 49 | @dataset_config.setter 50 | def dataset_config(self, config): 51 | self._dataset_config = config 52 | 53 | def get_config(self): 54 | return self._dataset_config 55 | 56 | def get_test_reporter(self, dataset_type): 57 | dataset = getattr(self, "{}_dataset".format(dataset_type)) 58 | return TestReporter(dataset) 59 | 60 | def update_registry_for_model(self, config): 61 | self.train_dataset.update_registry_for_model(config) 62 | self.val_dataset.update_registry_for_model(config) 63 | self.test_dataset.update_registry_for_model(config) 64 | 65 | def clean_config(self, config): 66 | 
self.train_dataset.clean_config(config) 67 | self.val_dataset.clean_config(config) 68 | self.test_dataset.clean_config(config) 69 | 70 | def prepare_batch(self, batch, *args, **kwargs): 71 | return self.mapping[batch.dataset_type].prepare_batch(batch) 72 | 73 | def verbose_dump(self, report, *args, **kwargs): 74 | if self.config.training_parameters.verbose_dump: 75 | dataset_type = report.dataset_type 76 | self.mapping[dataset_type].verbose_dump(report, *args, **kwargs) 77 | 78 | def seed_sampler(self, dataset_type, seed): 79 | dataset = getattr(self, "{}_dataset".format(dataset_type)) 80 | dataset.seed_sampler(seed) 81 | -------------------------------------------------------------------------------- /pythia/common/defaults/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guanghuixu/AnchorCaptioner/3a49ce5de025087cbea00075ec0636aee0525382/pythia/common/defaults/__init__.py -------------------------------------------------------------------------------- /pythia/common/defaults/configs/datasets/captioning/coco.yml: -------------------------------------------------------------------------------- 1 | dataset_attributes: 2 | coco: 3 | data_root_dir: ../data 4 | image_depth_first: false 5 | fast_read: false 6 | image_features: 7 | train: 8 | - coco/detectron_fix_100/fc6/train_val_2014 9 | val: 10 | - coco/detectron_fix_100/fc6/train_val_2014 11 | test: 12 | - coco/detectron_fix_100/fc6/train_val_2014 13 | imdb_files: 14 | train: 15 | - imdb/coco_captions/imdb_karpathy_train.npy 16 | val: 17 | - imdb/coco_captions/imdb_karpathy_val.npy 18 | test: 19 | - imdb/coco_captions/imdb_karpathy_test.npy 20 | features_max_len: 100 21 | processors: 22 | text_processor: 23 | type: vocab 24 | params: 25 | max_length: 52 26 | vocab: 27 | type: intersected 28 | embedding_name: glove.6B.300d 29 | vocab_file: vocabs/vocabulary_captioning_thresh5.txt 30 | preprocessor: 31 | type: simple_sentence 32 | params: {} 33 | caption_processor: 34 | type: caption 35 | params: 36 | vocab: 37 | type: intersected 38 | embedding_name: glove.6B.300d 39 | vocab_file: vocabs/vocabulary_captioning_thresh5.txt 40 | min_captions_per_img: 5 41 | return_info: false 42 | # Return OCR information 43 | use_ocr: false 44 | # Return spatial information of OCR tokens if present 45 | use_ocr_info: false 46 | training_parameters: 47 | monitored_metric: coco/caption_bleu4 48 | metric_minimize: false 49 | -------------------------------------------------------------------------------- /pythia/common/defaults/configs/datasets/captioning/m4c_textcaps.yml: -------------------------------------------------------------------------------- 1 | dataset_attributes: 2 | m4c_textcaps: 3 | data_root_dir: ../data 4 | image_depth_first: false 5 | fast_read: false 6 | features_max_len: 100 7 | processors: 8 | context_processor: 9 | type: fasttext 10 | params: 11 | max_length: 50 12 | model_file: .vector_cache/wiki.en.bin 13 | ocr_token_processor: 14 | type: simple_word 15 | params: {} 16 | bbox_processor: 17 | type: bbox 18 | params: 19 | max_length: 50 20 | return_info: true 21 | use_ocr: true 22 | use_ocr_info: true -------------------------------------------------------------------------------- /pythia/common/defaults/configs/datasets/dialog/visual_dialog.yml: -------------------------------------------------------------------------------- 1 | dataset_attributes: 2 | visual_genome: 3 | data_root_dir: ../data 4 | image_depth_first: false 5 | fast_read: false 6 | image_features: 7 
| train: 8 | - coco/detectron_fix_100/fc6/train_val_2014,coco/resnet152/train_val_2014 9 | val: 10 | - visual_dialog/detectron_fix_100/fc6/val2018,visual_dialog/resnet152/ 11 | test: 12 | - visual_dialog/detectron_fix_100/fc6/test2018,visual_dialog/resnet152/ 13 | imdb_files: 14 | train: 15 | - imdb/visual_dialog/visdial_1.0_train.json 16 | val: 17 | - imdb/visual_dialog/visdial_1.0_val.json 18 | test: 19 | - imdb/visual_dialog/visdial_1.0_test.json 20 | features_max_len: 100 21 | processors: 22 | text_processor: 23 | type: vocab 24 | params: 25 | max_length: 14 26 | vocab: 27 | type: intersected 28 | embedding_name: glove.6B.300d 29 | vocab_file: vocabs/vocabulary_100k.txt 30 | preprocessor: 31 | type: simple_sentence 32 | params: {} 33 | answer_processor: 34 | type: vqa_answer 35 | params: 36 | num_answers: 1 37 | vocab_file: vocabs/answers_vqa.txt 38 | preprocessor: 39 | type: simple_word 40 | params: {} 41 | discriminative_answer_processor: 42 | type: vocab 43 | params: 44 | max_length: 1 45 | vocab: 46 | type: random 47 | vocab_file: vocabs/vocabulary_100k.txt 48 | vg_answer_preprocessor: 49 | type: simple_word 50 | params: {} 51 | history_processor: 52 | type: vocab 53 | params: 54 | max_length: 100 55 | vocab: 56 | type: intersected 57 | embedding_name: glove.6B.300d 58 | vocab_file: vocabs/vocabulary_100k.txt 59 | preprocessor: 60 | type: simple_sentence 61 | params: {} 62 | bbox_processor: 63 | type: bbox 64 | params: 65 | max_length: 50 66 | return_history: true 67 | # Means you have to rank 100 candidate answers 68 | discriminative: 69 | enabled: true 70 | # Only return answer indices, otherwise it will return 71 | # glove embeddings 72 | return_indices: true 73 | no_unk: false 74 | # Return OCR information 75 | use_ocr: false 76 | # Return spatial information of OCR tokens if present 77 | use_ocr_info: false 78 | training_parameters: 79 | monitored_metric: visual_dialog/r@1 80 | metric_minimize: false 81 | -------------------------------------------------------------------------------- /pythia/common/defaults/configs/datasets/vqa/clevr.yml: -------------------------------------------------------------------------------- 1 | dataset_attributes: 2 | clevr: 3 | data_root_dir: ../data 4 | data_folder: CLEVR_v1.0 5 | build_attributes: 6 | min_count: 1 7 | split_regex: " " 8 | keep: 9 | - ";" 10 | - "," 11 | remove: 12 | - "?" 13 | - "." 
14 | processors: 15 | text_processor: 16 | type: vocab 17 | params: 18 | max_length: 10 19 | vocab: 20 | type: random 21 | vocab_file: vocabs/clevr_question_vocab.txt 22 | preprocessor: 23 | type: simple_sentence 24 | params: {} 25 | answer_processor: 26 | type: multi_hot_answer_from_vocab 27 | params: 28 | num_answers: 1 29 | # Vocab file is relative to [data_root_dir]/[data_folder] 30 | vocab_file: vocabs/clevr_answer_vocab.txt 31 | preprocessor: 32 | type: simple_word 33 | params: {} -------------------------------------------------------------------------------- /pythia/common/defaults/configs/datasets/vqa/m4c_ocrvqa.yml: -------------------------------------------------------------------------------- 1 | dataset_attributes: 2 | m4c_ocrvqa: 3 | data_root_dir: ../data 4 | image_depth_first: false 5 | fast_read: false 6 | features_max_len: 100 7 | processors: 8 | context_processor: 9 | type: fasttext 10 | params: 11 | max_length: 50 12 | model_file: .vector_cache/wiki.en.bin 13 | ocr_token_processor: 14 | type: simple_word 15 | params: {} 16 | bbox_processor: 17 | type: bbox 18 | params: 19 | max_length: 50 20 | return_info: true 21 | use_ocr: true 22 | use_ocr_info: true -------------------------------------------------------------------------------- /pythia/common/defaults/configs/datasets/vqa/m4c_stvqa.yml: -------------------------------------------------------------------------------- 1 | dataset_attributes: 2 | m4c_stvqa: 3 | data_root_dir: ../data 4 | image_depth_first: false 5 | fast_read: false 6 | features_max_len: 100 7 | processors: 8 | context_processor: 9 | type: fasttext 10 | params: 11 | max_length: 50 12 | model_file: .vector_cache/wiki.en.bin 13 | ocr_token_processor: 14 | type: simple_word 15 | params: {} 16 | bbox_processor: 17 | type: bbox 18 | params: 19 | max_length: 50 20 | return_info: true 21 | use_ocr: true 22 | use_ocr_info: true -------------------------------------------------------------------------------- /pythia/common/defaults/configs/datasets/vqa/m4c_textvqa.yml: -------------------------------------------------------------------------------- 1 | dataset_attributes: 2 | m4c_textvqa: 3 | data_root_dir: ../data 4 | image_depth_first: false 5 | fast_read: false 6 | features_max_len: 100 7 | processors: 8 | context_processor: 9 | type: fasttext 10 | params: 11 | max_length: 50 12 | model_file: .vector_cache/wiki.en.bin 13 | ocr_token_processor: 14 | type: simple_word 15 | params: {} 16 | bbox_processor: 17 | type: bbox 18 | params: 19 | max_length: 50 20 | return_info: true 21 | use_ocr: true 22 | use_ocr_info: true -------------------------------------------------------------------------------- /pythia/common/defaults/configs/datasets/vqa/textvqa.yml: -------------------------------------------------------------------------------- 1 | dataset_attributes: 2 | textvqa: 3 | data_root_dir: ../data 4 | image_depth_first: false 5 | fast_read: false 6 | image_features: 7 | train: 8 | - open_images/detectron_fix_100/fc6/train,open_images/resnet152/train 9 | val: 10 | - open_images/detectron_fix_100/fc6/train,open_images/resnet152/train 11 | test: 12 | - open_images/detectron_fix_100/fc6/test,open_images/resnet152/test 13 | imdb_files: 14 | train: 15 | - imdb/textvqa_0.5/imdb_textvqa_train.npy 16 | val: 17 | - imdb/textvqa_0.5/imdb_textvqa_val.npy 18 | test: 19 | - imdb/textvqa_0.5/imdb_textvqa_test.npy 20 | features_max_len: 137 21 | processors: 22 | text_processor: 23 | type: vocab 24 | params: 25 | max_length: 14 26 | vocab: 27 | type: intersected 28 | 
embedding_name: glove.6B.300d 29 | vocab_file: vocabs/vocabulary_100k.txt 30 | preprocessor: 31 | type: simple_sentence 32 | params: {} 33 | answer_processor: 34 | type: vqa_answer 35 | params: 36 | vocab_file: vocabs/answers_textvqa_8k.txt 37 | preprocessor: 38 | type: simple_word 39 | params: {} 40 | num_answers: 10 41 | context_processor: 42 | type: fasttext 43 | params: 44 | max_length: 50 45 | model_file: .vector_cache/wiki.en.bin 46 | ocr_token_processor: 47 | type: simple_word 48 | params: {} 49 | bbox_processor: 50 | type: bbox 51 | params: 52 | max_length: 50 53 | return_info: true 54 | # Return OCR information 55 | use_ocr: true 56 | # Return spatial information of OCR tokens if present 57 | use_ocr_info: false 58 | training_parameters: 59 | monitored_metric: textvqa/vqa_accuracy 60 | metric_minimize: false 61 | -------------------------------------------------------------------------------- /pythia/common/defaults/configs/datasets/vqa/vizwiz.yml: -------------------------------------------------------------------------------- 1 | dataset_attributes: 2 | vizwiz: 3 | data_root_dir: ../data 4 | image_depth_first: false 5 | fast_read: false 6 | image_features: 7 | train: 8 | - vizwiz/detectron_fix_100/fc6/train,vizwiz/resnet152/train 9 | val: 10 | - vizwiz/detectron_fix_100/fc6/val,vizwiz/resnet152/val 11 | test: 12 | - vizwiz/detectron_fix_100/fc6/test,vizwiz/resnet152/test 13 | imdb_files: 14 | train: 15 | - imdb/vizwiz/imdb_vizwiz_train.npy 16 | val: 17 | - imdb/vizwiz/imdb_vizwiz_val.npy 18 | test: 19 | - imdb/vizwiz/imdb_vizwiz_test.npy 20 | features_max_len: 100 21 | processors: 22 | text_processor: 23 | type: vocab 24 | params: 25 | max_length: 14 26 | vocab: 27 | type: intersected 28 | embedding_name: glove.6B.300d 29 | vocab_file: vocabs/vocabulary_100k.txt 30 | preprocessor: 31 | type: simple_sentence 32 | params: {} 33 | answer_processor: 34 | type: vqa_answer 35 | params: 36 | vocab_file: vocabs/answers_vizwiz_7k.txt 37 | preprocessor: 38 | type: simple_word 39 | params: {} 40 | num_answers: 10 41 | context_processor: 42 | type: fasttext 43 | params: 44 | max_length: 50 45 | model_file: .vector_cache/wiki.en.bin 46 | ocr_token_processor: 47 | type: simple_word 48 | params: {} 49 | bbox_processor: 50 | type: bbox 51 | params: 52 | max_length: 50 53 | return_info: true 54 | # Return OCR information 55 | use_ocr: false 56 | # Return spatial information of OCR tokens if present 57 | use_ocr_info: false 58 | training_parameters: 59 | monitored_metric: vizwiz/vqa_accuracy 60 | metric_minimize: false 61 | -------------------------------------------------------------------------------- /pythia/common/defaults/configs/datasets/vqa/vqa2.yml: -------------------------------------------------------------------------------- 1 | dataset_attributes: 2 | vqa2: 3 | data_root_dir: ../data 4 | image_depth_first: false 5 | fast_read: false 6 | image_features: 7 | train: 8 | - coco/detectron_fix_100/fc6/train_val_2014,coco/resnet152/train_val_2014 9 | val: 10 | - coco/detectron_fix_100/fc6/train_val_2014,coco/resnet152/train_val_2014 11 | test: 12 | - coco/detectron_fix_100/fc6/test2015,coco/resnet152/test2015 13 | imdb_files: 14 | train: 15 | - imdb/vqa/imdb_train2014.npy 16 | val: 17 | - imdb/vqa/imdb_val2014.npy 18 | test: 19 | - imdb/vqa/imdb_test2015.npy 20 | features_max_len: 100 21 | processors: 22 | text_processor: 23 | type: vocab 24 | params: 25 | max_length: 14 26 | vocab: 27 | type: intersected 28 | embedding_name: glove.6B.300d 29 | vocab_file: vocabs/vocabulary_100k.txt 30 | 
preprocessor: 31 | type: simple_sentence 32 | params: {} 33 | answer_processor: 34 | type: vqa_answer 35 | params: 36 | num_answers: 10 37 | vocab_file: vocabs/answers_vqa.txt 38 | preprocessor: 39 | type: simple_word 40 | params: {} 41 | context_processor: 42 | type: fasttext 43 | params: 44 | download_initially: false 45 | max_length: 50 46 | model_file: .vector_cache/wiki.en.bin 47 | ocr_token_processor: 48 | type: simple_word 49 | params: {} 50 | bbox_processor: 51 | type: bbox 52 | params: 53 | max_length: 50 54 | return_info: true 55 | # Return OCR information 56 | use_ocr: false 57 | # Return spatial information of OCR tokens if present 58 | use_ocr_info: false 59 | training_parameters: 60 | monitored_metric: vqa2/vqa_accuracy 61 | metric_minimize: false 62 | -------------------------------------------------------------------------------- /pythia/common/meter.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Inspired from maskrcnn benchmark 3 | from collections import defaultdict, deque 4 | 5 | import torch 6 | 7 | 8 | class SmoothedValue: 9 | """Track a series of values and provide access to smoothed values over a 10 | window or the global series average. 11 | """ 12 | 13 | def __init__(self, window_size=20): 14 | self.window_size = window_size 15 | self.reset() 16 | 17 | def reset(self): 18 | self.deque = deque(maxlen=self.window_size) 19 | self.series = [] 20 | self.total = 0.0 21 | self.count = 0 22 | 23 | def update(self, value): 24 | self.deque.append(value) 25 | self.series.append(value) 26 | self.count += 1 27 | self.total += value 28 | 29 | @property 30 | def median(self): 31 | d = torch.tensor(list(self.deque)) 32 | return d.median().item() 33 | 34 | @property 35 | def avg(self): 36 | d = torch.tensor(list(self.deque)) 37 | return d.mean().item() 38 | 39 | @property 40 | def global_avg(self): 41 | return self.total / self.count 42 | 43 | def get_latest(self): 44 | return self.deque[-1] 45 | 46 | 47 | class Meter: 48 | def __init__(self, delimiter=", "): 49 | self.meters = defaultdict(SmoothedValue) 50 | self.delimiter = delimiter 51 | 52 | def update(self, update_dict): 53 | for k, v in update_dict.items(): 54 | if isinstance(v, torch.Tensor): 55 | if v.dim() != 0: 56 | v = v.mean() 57 | v = v.item() 58 | assert isinstance(v, (float, int)) 59 | self.meters[k].update(v) 60 | 61 | def update_from_meter(self, meter): 62 | for key, value in meter.meters.items(): 63 | assert isinstance(value, SmoothedValue) 64 | self.meters[key] = value 65 | 66 | def __getattr__(self, attr): 67 | if attr in self.meters: 68 | return self.meters[attr] 69 | if attr in self.__dict__: 70 | return self.__dict__[attr] 71 | raise AttributeError( 72 | "'{}' object has no attribute '{}'".format(type(self).__name__, attr) 73 | ) 74 | 75 | def get_scalar_dict(self): 76 | scalar_dict = {} 77 | for k, v in self.meters.items(): 78 | scalar_dict[k] = v.get_latest() 79 | 80 | return scalar_dict 81 | 82 | def __str__(self): 83 | loss_str = [] 84 | for name, meter in self.meters.items(): 85 | if "train" in name: 86 | loss_str.append( 87 | "{}: {:.4f} ({:.4f})".format(name, meter.median, meter.global_avg) 88 | ) 89 | else: 90 | # In case of val print global avg 91 | loss_str.append("{}: {:.4f}".format(name, meter.global_avg)) 92 | 93 | return self.delimiter.join(loss_str) 94 | -------------------------------------------------------------------------------- /pythia/common/report.py: 
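A minimal usage sketch for the SmoothedValue/Meter classes defined in pythia/common/meter.py above. This snippet is not part of the repository; the metric keys are made up for illustration and it assumes the pythia package is importable.

import torch

from pythia.common.meter import Meter

meter = Meter()
for _ in range(5):
    # floats/ints are tracked directly; non-scalar tensors are averaged and .item() is taken
    meter.update({"train/total_loss": torch.rand(4), "val/vqa_accuracy": 0.5})

print(meter.get_scalar_dict())  # latest value recorded for every key
print(meter)                    # "train/*" keys print median (global avg); other keys print global avg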
-------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import collections 3 | import warnings 4 | from collections import OrderedDict 5 | 6 | from pythia.common.registry import registry 7 | 8 | 9 | class Report(OrderedDict): 10 | def __init__(self, batch, model_output={}, *args): 11 | super().__init__(self) 12 | if self._check_and_load_tuple(batch): 13 | return 14 | 15 | all_args = [batch, model_output] + [*args] 16 | for idx, arg in enumerate(all_args): 17 | if not isinstance(arg, collections.abc.Mapping): 18 | raise TypeError( 19 | "Argument {:d}, {} must be of instance of " 20 | "collections.abc.Mapping".format(idx, arg) 21 | ) 22 | 23 | self.writer = registry.get("writer") 24 | 25 | self.warning_string = ( 26 | "Updating forward report with key {}" 27 | "{}, but it already exists in {}. " 28 | "Please consider using a different key, " 29 | "as this can cause issues during loss and " 30 | "metric calculations." 31 | ) 32 | 33 | for idx, arg in enumerate(all_args): 34 | for key, item in arg.items(): 35 | if key in self and idx >= 2: 36 | log = self.warning_string.format( 37 | key, "", "in previous arguments to report" 38 | ) 39 | warnings.warn(log) 40 | self[key] = item 41 | 42 | def _check_and_load_tuple(self, batch): 43 | if isinstance(batch, collections.abc.Mapping): 44 | return False 45 | 46 | if isinstance(batch[0], (tuple, list)) and isinstance(batch[0][0], str): 47 | for kv_pair in batch: 48 | self[kv_pair[0]] = kv_pair[1] 49 | return True 50 | else: 51 | return False 52 | 53 | def __setattr__(self, key, value): 54 | self[key] = value 55 | 56 | def __getattr__(self, key): 57 | try: 58 | return self[key] 59 | except KeyError: 60 | raise AttributeError(key) 61 | 62 | def fields(self): 63 | return list(self.keys()) 64 | -------------------------------------------------------------------------------- /pythia/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .base_dataset_builder import BaseDatasetBuilder 3 | from .multi_dataset import MultiDataset 4 | from .base_dataset import BaseDataset 5 | 6 | __all__ = ["BaseDataset", "BaseDatasetBuilder", "MultiDataset"] 7 | -------------------------------------------------------------------------------- /pythia/datasets/captioning/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /pythia/datasets/captioning/coco/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | __all__ = ["COCOBuilder", "COCODataset"] 3 | 4 | from .builder import COCOBuilder 5 | from .dataset import COCODataset 6 | -------------------------------------------------------------------------------- /pythia/datasets/captioning/coco/builder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | from pythia.common.registry import registry 9 | from pythia.datasets.vqa.vqa2 import VQA2Builder 10 | 11 | from .dataset import COCODataset 12 | 13 | 14 | @registry.register_builder("coco") 15 | class COCOBuilder(VQA2Builder): 16 | def __init__(self): 17 | super().__init__() 18 | self.dataset_name = "coco" 19 | self.set_dataset_class(COCODataset) 20 | 21 | def update_registry_for_model(self, config): 22 | registry.register( 23 | self.dataset_name + "_text_vocab_size", 24 | self.dataset.text_processor.get_vocab_size(), 25 | ) 26 | -------------------------------------------------------------------------------- /pythia/datasets/captioning/coco/dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import torch 3 | 4 | from pythia.common.sample import Sample 5 | from pythia.datasets.vqa.vqa2 import VQA2Dataset 6 | 7 | 8 | class COCODataset(VQA2Dataset): 9 | def __init__(self, dataset_type, imdb_file_index, config, *args, **kwargs): 10 | super().__init__(dataset_type, imdb_file_index, config, *args, **kwargs) 11 | self._name = "coco" 12 | 13 | def load_item(self, idx): 14 | sample_info = self.imdb[idx] 15 | current_sample = Sample() 16 | 17 | if self._dataset_type != "test": 18 | text_processor_argument = {"tokens": sample_info["caption_tokens"]} 19 | processed_caption = self.text_processor(text_processor_argument) 20 | current_sample.text = processed_caption["text"] 21 | current_sample.caption_id = torch.tensor( 22 | sample_info["caption_id"], dtype=torch.int 23 | ) 24 | current_sample.caption_len = torch.tensor( 25 | len(sample_info["caption_tokens"]), dtype=torch.int 26 | ) 27 | 28 | if isinstance(sample_info["image_id"], int): 29 | current_sample.image_id = torch.tensor( 30 | sample_info["image_id"], dtype=torch.int 31 | ) 32 | else: 33 | current_sample.image_id = sample_info["image_id"] 34 | 35 | if self._use_features is True: 36 | features = self.features_db[idx] 37 | current_sample.update(features) 38 | 39 | # Add reference captions to sample 40 | current_sample = self.add_reference_caption(sample_info, current_sample) 41 | 42 | return current_sample 43 | 44 | def add_reference_caption(self, sample_info, sample): 45 | reference_list = [] 46 | for reference in sample_info["reference_tokens"]: 47 | text_processor_argument = {"tokens": reference} 48 | processed_reference = self.text_processor(text_processor_argument) 49 | reference_list.append(processed_reference["text"]) 50 | 51 | # Restrict to minimum reference captions available per image 52 | sample.answers = torch.stack(reference_list)[: self.config.min_captions_per_img] 53 | 54 | return sample 55 | 56 | def format_for_evalai(self, report): 57 | captions = report.captions.tolist() 58 | predictions = [] 59 | remove_unk_from_caption_prediction = getattr( 60 | self.config, 'remove_unk_from_caption_prediction', False 61 | ) 62 | for idx, image_id in enumerate(report.image_id): 63 | caption = self.caption_processor(captions[idx])["caption"] 64 | if remove_unk_from_caption_prediction: 65 | caption = caption.replace('<unk>', '') 66 | caption = caption.replace('  ', ' ').strip() 67 | if isinstance(image_id, torch.Tensor): 68 | image_id = image_id.item() 69 | predictions.append({"image_id": image_id, "caption": caption}) 70 | 71 | return predictions 72 | -------------------------------------------------------------------------------- /pythia/datasets/captioning/m4c_textcaps/__init__.py:
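The builder-registration pattern shown in COCOBuilder above is repeated for the m4c_* datasets that follow: each dataset registers a builder in the registry, inherits from an existing builder, and only swaps in its own dataset name and dataset class. A hedged sketch of how a new captioning dataset could plug in; the names my_caption_dataset/MyCaptionDataset are hypothetical and not part of the repository.

from pythia.common.registry import registry
from pythia.datasets.captioning.coco import COCOBuilder, COCODataset


class MyCaptionDataset(COCODataset):
    # Hypothetical dataset class: reuse COCODataset loading, change only the name
    def __init__(self, dataset_type, imdb_file_index, config, *args, **kwargs):
        super().__init__(dataset_type, imdb_file_index, config, *args, **kwargs)
        self._name = "my_caption_dataset"


@registry.register_builder("my_caption_dataset")
class MyCaptionBuilder(COCOBuilder):
    # Hypothetical builder: the registry key is the dataset name used under
    # dataset_attributes in the YAML configs above
    def __init__(self):
        super().__init__()
        self.dataset_name = "my_caption_dataset"
        self.set_dataset_class(MyCaptionDataset)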
-------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /pythia/datasets/captioning/m4c_textcaps/builder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from pythia.common.registry import Registry 3 | from pythia.datasets.captioning.m4c_textcaps.dataset import M4CTextCapsDataset 4 | from pythia.datasets.vqa.m4c_textvqa.builder import M4CTextVQABuilder 5 | 6 | 7 | @Registry.register_builder("m4c_textcaps") 8 | class M4CTextCapsBuilder(M4CTextVQABuilder): 9 | def __init__(self): 10 | super().__init__() 11 | self.dataset_name = "m4c_textcaps" 12 | self.set_dataset_class(M4CTextCapsDataset) 13 | -------------------------------------------------------------------------------- /pythia/datasets/captioning/m4c_textcaps/dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from pythia.datasets.vqa.m4c_textvqa.dataset import M4CTextVQADataset 3 | from pythia.utils.objects_to_byte_tensor import enc_obj2bytes 4 | 5 | 6 | class M4CTextCapsDataset(M4CTextVQADataset): 7 | def __init__(self, dataset_type, imdb_file_index, config, *args, **kwargs): 8 | super().__init__( 9 | dataset_type, imdb_file_index, config, *args, **kwargs 10 | ) 11 | self._name = "m4c_textcaps" 12 | 13 | def preprocess_sample_info(self, sample_info): 14 | # add dummy questions to train with M4C (for TextVQA) 15 | sample_info['question_str'] = '' # empty question 16 | sample_info['question_id'] = sample_info['caption_id'] 17 | return sample_info 18 | 19 | def postprocess_evalai_entry(self, entry): 20 | new_entry = { 21 | 'caption_id': entry['question_id'], 22 | 'image_id': entry['image_id'], 23 | 'caption': entry['answer'], 24 | 'pred_source': entry['pred_source'], 25 | } 26 | return new_entry 27 | 28 | def add_answer_info(self, sample_info, sample): 29 | sample_has_caption = ('caption_str' in sample_info) 30 | if sample_has_caption: 31 | sample_info['answers'] = [sample_info['caption_str']] 32 | 33 | sample = super().add_answer_info(sample_info, sample) 34 | 35 | if sample_has_caption: 36 | sample.caption_str = enc_obj2bytes(sample_info['caption_str']) 37 | sample.ref_strs = enc_obj2bytes(sample_info['reference_strs']) 38 | sample.pop('gt_answers_enc') 39 | 40 | return sample 41 | -------------------------------------------------------------------------------- /pythia/datasets/concat_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | import functools 3 | import types 4 | 5 | from torch.utils.data import ConcatDataset 6 | 7 | 8 | class PythiaConcatDataset(ConcatDataset): 9 | # These functions should only be called once even if they return nothing 10 | _SINGLE_CALL_FUNCS = [] 11 | 12 | def __init__(self, datasets): 13 | super().__init__(datasets) 14 | self._dir_representation = dir(self) 15 | 16 | def __getattr__(self, name): 17 | if name in self._dir_representation: 18 | return getattr(self, name) 19 | elif hasattr(self.datasets[0], name): 20 | attr = getattr(self.datasets[0], name) 21 | # Check if the current attribute is class method function 22 | if isinstance(attr, types.MethodType): 23 | # if it is the, we to call this function for 24 | # each of the child datasets 25 | attr = functools.partial(self._call_all_datasets_func, name) 26 | return attr 27 | else: 28 | raise AttributeError(name) 29 | 30 | def _get_single_call_funcs(self): 31 | return PythiaConcatDataset._SINGLE_CALL_FUNCS 32 | 33 | def _call_all_datasets_func(self, name, *args, **kwargs): 34 | for dataset in self.datasets: 35 | value = getattr(dataset, name)(*args, **kwargs) 36 | if value is not None: 37 | # TODO: Log a warning here 38 | return value 39 | # raise RuntimeError("Functions returning values can't be " 40 | # "called through PythiaConcatDataset") 41 | if ( 42 | hasattr(dataset, "get_single_call_funcs") 43 | and name in dataset.get_single_call_funcs() 44 | ): 45 | return 46 | -------------------------------------------------------------------------------- /pythia/datasets/dialog/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /pythia/datasets/dialog/visual_dialog/config.yml: -------------------------------------------------------------------------------- 1 | task_attributes: 2 | data_root_dir: data 3 | batch_size: 10 4 | vocab_file: visdial/visdial_vocabulary.txt 5 | max_seq_len: 20 6 | max_history_len: 300 7 | embedding_name: glove.6B.300d 8 | image_depth_first: false 9 | image_fast_reader: false 10 | image_feat_test: 11 | - /checkpoint02/tinayujiang/features/visdial/detectron_23/fc6/ 12 | image_feat_train: 13 | - detec/detectron/fc6/vqa/train+val2014 14 | image_feat_val: 15 | - /checkpoint02/tinayujiang/features/visdial/detectron_23/fc6/ 16 | image_max_loc: 100 17 | imdb_file_test: 18 | - visdial/visdial_1.0_val_imdb.json 19 | imdb_file_train: 20 | - visdial/visdial_1.0_train_imdb.json 21 | imdb_file_val: 22 | - visdial/visdial_1.0_val_imdb.json 23 | num_workers: 12 24 | enforce_slow_reader: false 25 | metrics: 26 | - r@1 27 | - r@5 28 | - r@10 29 | - mean_r 30 | - mean_rr 31 | monitored_metric: 0 32 | metric_minimize: False 33 | should_early_stop: True 34 | exp_name: baseline 35 | loss: logit_bce 36 | lr_scheduler: true 37 | model_attributes: 38 | visdial_top_down_bottom_up: 39 | classifier: 40 | type: logit 41 | params: 42 | img_hidden_dim: 5000 43 | text_hidden_dim: 300 44 | image_embeddings: 45 | - modal_combine: 46 | type: non_linear_element_multiply 47 | params: 48 | dropout: 0 49 | hidden_dim: 5000 50 | normalization: softmax 51 | transform: 52 | type: linear 53 | params: 54 | out_dim: 1 55 | image_feature_dim: 2048 56 | image_feature_encodings: 57 | - type: finetune_faster_rcnn_fpn_fc7 58 | params: 59 | bias_file: detec/detectron/fc6/fc7_b.pkl 60 | weights_file: detec/detectron/fc6/fc7_w.pkl 61 | modal_combine: 62 | type: 
non_linear_element_multiply 63 | params: 64 | dropout: 0 65 | hidden_dim: 5000 66 | text_embeddings: 67 | - type: attention 68 | params: 69 | hidden_dim: 1024 70 | num_layers: 1 71 | conv1_out: 512 72 | conv2_out: 2 73 | dropout: 0 74 | embedding_dim: 300 75 | embedding_init_file: vqa2.0_glove.6B.300d.txt.npy 76 | kernel_size: 1 77 | padding: 0 78 | optimizer_attributes: 79 | type: Adamax 80 | params: 81 | eps: 1.0e-08 82 | lr: 0.01 83 | weight_decay: 0 84 | run: train+predict 85 | training_parameters: 86 | clip_norm_mode: all 87 | clip_gradients: true 88 | lr_ratio: 0.1 89 | lr_steps: 90 | - 15000 91 | - 18000 92 | - 20000 93 | - 21000 94 | max_grad_l2_norm: 0.25 95 | max_iterations: 22000 96 | log_interval: 100 97 | snapshot_interval: 3000 98 | wu_factor: 0.2 99 | wu_iters: 1000 100 | patience: 3500 101 | -------------------------------------------------------------------------------- /pythia/datasets/dialog/visual_dialog/scripts/extract_vocabulary.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import json 3 | 4 | from pythia.scripts.extract_vocabulary import ExtractVocabulary 5 | 6 | 7 | class ExtractVisdialVocabulary(ExtractVocabulary): 8 | def __init__(self): 9 | super(ExtractVisdialVocabulary, self).__init__() 10 | 11 | def get_text(self): 12 | text = [] 13 | 14 | for input_file in self.input_files: 15 | with open(input_file, "r") as f: 16 | f_json = json.load(f) 17 | # Add 'questions' from visdial 18 | text += f_json["data"]["questions"] 19 | # Add 'answers' from visdial 20 | text += f_json["data"]["answers"] 21 | 22 | for dialog in f_json["data"]["dialogs"]: 23 | text += [dialog["caption"]] 24 | return text 25 | 26 | 27 | if __name__ == "__main__": 28 | extractor = ExtractVisdialVocabulary() 29 | extractor.extract() 30 | -------------------------------------------------------------------------------- /pythia/datasets/samplers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | # Code is copy-pasted exactly as in torch.utils.data.distributed. 3 | # FIXME remove this once c10d fixes the bug it has 4 | import math 5 | import torch 6 | import torch.distributed as dist 7 | from torch.utils.data.sampler import Sampler 8 | 9 | 10 | class DistributedSampler(Sampler): 11 | """Sampler that restricts data loading to a subset of the dataset. 12 | It is especially useful in conjunction with 13 | :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each 14 | process can pass a DistributedSampler instance as a DataLoader sampler, 15 | and load a subset of the original dataset that is exclusive to it. 16 | .. note:: 17 | Dataset is assumed to be of constant size. 18 | Arguments: 19 | dataset: Dataset used for sampling. 20 | num_replicas (optional): Number of processes participating in 21 | distributed training. 22 | rank (optional): Rank of the current process within num_replicas. 
23 | """ 24 | 25 | def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True): 26 | if num_replicas is None: 27 | if not dist.is_available(): 28 | raise RuntimeError("Requires distributed package to be available") 29 | num_replicas = dist.get_world_size() 30 | if rank is None: 31 | if not dist.is_available(): 32 | raise RuntimeError("Requires distributed package to be available") 33 | rank = dist.get_rank() 34 | self.dataset = dataset 35 | self.num_replicas = num_replicas 36 | self.rank = rank 37 | self.epoch = 0 38 | self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) 39 | self.total_size = self.num_samples * self.num_replicas 40 | self.shuffle = shuffle 41 | 42 | def __iter__(self): 43 | if self.shuffle: 44 | # deterministically shuffle based on epoch 45 | g = torch.Generator() 46 | g.manual_seed(self.epoch) 47 | indices = torch.randperm(len(self.dataset), generator=g).tolist() 48 | else: 49 | indices = torch.arange(len(self.dataset)).tolist() 50 | 51 | # add extra samples to make it evenly divisible 52 | indices += indices[: (self.total_size - len(indices))] 53 | assert len(indices) == self.total_size 54 | 55 | # subsample 56 | offset = self.num_samples * self.rank 57 | indices = indices[offset : offset + self.num_samples] 58 | assert len(indices) == self.num_samples 59 | 60 | return iter(indices) 61 | 62 | def __len__(self): 63 | return self.num_samples 64 | 65 | def set_epoch(self, epoch): 66 | self.epoch = epoch 67 | -------------------------------------------------------------------------------- /pythia/datasets/scene_graph_database.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from pythia.datasets.image_database import ImageDatabase 3 | 4 | 5 | class SceneGraphDatabase(ImageDatabase): 6 | def __init__(self, scene_graph_path): 7 | super().__init__(scene_graph_path) 8 | self.data_dict = {} 9 | for item in self.data: 10 | self.data_dict[item["image_id"]] = item 11 | 12 | def __getitem__(self, idx): 13 | return self.data_dict[idx] 14 | -------------------------------------------------------------------------------- /pythia/datasets/vqa/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | -------------------------------------------------------------------------------- /pythia/datasets/vqa/clevr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guanghuixu/AnchorCaptioner/3a49ce5de025087cbea00075ec0636aee0525382/pythia/datasets/vqa/clevr/__init__.py -------------------------------------------------------------------------------- /pythia/datasets/vqa/clevr/builder.py: -------------------------------------------------------------------------------- 1 | import json 2 | import math 3 | import os 4 | import zipfile 5 | from collections import Counter 6 | 7 | from pythia.common.registry import registry 8 | from pythia.common.constants import CLEVR_DOWNLOAD_URL 9 | from pythia.datasets.base_dataset_builder import BaseDatasetBuilder 10 | from pythia.datasets.vqa.clevr.dataset import CLEVRDataset 11 | from pythia.utils.general import download_file, get_pythia_root 12 | 13 | 14 | @registry.register_builder("clevr") 15 | class CLEVRBuilder(BaseDatasetBuilder): 16 | def __init__(self): 17 | super().__init__("clevr") 18 | self.writer = registry.get("writer") 19 | self.dataset_class = CLEVRDataset 20 | 21 | def _build(self, dataset_type, config): 22 | download_folder = os.path.join(get_pythia_root(), config.data_root_dir, config.data_folder) 23 | 24 | file_name = CLEVR_DOWNLOAD_URL.split("/")[-1] 25 | local_filename = os.path.join(download_folder, file_name) 26 | 27 | extraction_folder = os.path.join(download_folder, ".".join(file_name.split(".")[:-1])) 28 | self.data_folder = extraction_folder 29 | 30 | # Either if the zip file is already present or if there are some 31 | # files inside the folder we don't continue download process 32 | if os.path.exists(local_filename): 33 | self.writer.write("CLEVR dataset is already present. Skipping download.") 34 | return 35 | 36 | if os.path.exists(extraction_folder) and \ 37 | len(os.listdir(extraction_folder)) != 0: 38 | return 39 | 40 | self.writer.write("Downloading the CLEVR dataset now") 41 | download_file(CLEVR_DOWNLOAD_URL, output_dir=download_folder) 42 | 43 | self.writer.write("Downloaded. Extracting now. This can take time.") 44 | with zipfile.ZipFile(local_filename, "r") as zip_ref: 45 | zip_ref.extractall(download_folder) 46 | 47 | 48 | def _load(self, dataset_type, config, *args, **kwargs): 49 | self.dataset = CLEVRDataset( 50 | dataset_type, config, data_folder=self.data_folder 51 | ) 52 | return self.dataset 53 | 54 | def update_registry_for_model(self, config): 55 | registry.register( 56 | self.dataset_name + "_text_vocab_size", 57 | self.dataset.text_processor.get_vocab_size(), 58 | ) 59 | registry.register( 60 | self.dataset_name + "_num_final_outputs", 61 | self.dataset.answer_processor.get_vocab_size(), 62 | ) 63 | -------------------------------------------------------------------------------- /pythia/datasets/vqa/m4c_ocrvqa/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /pythia/datasets/vqa/m4c_ocrvqa/builder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | from pythia.common.registry import Registry 3 | from pythia.datasets.vqa.m4c_ocrvqa.dataset import M4COCRVQADataset 4 | from pythia.datasets.vqa.m4c_textvqa.builder import M4CTextVQABuilder 5 | 6 | 7 | @Registry.register_builder("m4c_ocrvqa") 8 | class M4COCRVQABuilder(M4CTextVQABuilder): 9 | def __init__(self): 10 | super().__init__() 11 | self.dataset_name = "m4c_ocrvqa" 12 | self.set_dataset_class(M4COCRVQADataset) 13 | -------------------------------------------------------------------------------- /pythia/datasets/vqa/m4c_ocrvqa/dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from pythia.datasets.vqa.m4c_textvqa.dataset import M4CTextVQADataset 3 | 4 | 5 | class M4COCRVQADataset(M4CTextVQADataset): 6 | def __init__(self, dataset_type, imdb_file_index, config, *args, **kwargs): 7 | super().__init__( 8 | dataset_type, imdb_file_index, config, *args, **kwargs 9 | ) 10 | self._name = "m4c_ocrvqa" 11 | -------------------------------------------------------------------------------- /pythia/datasets/vqa/m4c_stvqa/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /pythia/datasets/vqa/m4c_stvqa/builder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from pythia.common.registry import Registry 3 | from pythia.datasets.vqa.m4c_stvqa.dataset import M4CSTVQADataset 4 | from pythia.datasets.vqa.m4c_textvqa.builder import M4CTextVQABuilder 5 | 6 | 7 | @Registry.register_builder("m4c_stvqa") 8 | class M4CSTVQABuilder(M4CTextVQABuilder): 9 | def __init__(self): 10 | super().__init__() 11 | self.dataset_name = "m4c_stvqa" 12 | self.set_dataset_class(M4CSTVQADataset) 13 | -------------------------------------------------------------------------------- /pythia/datasets/vqa/m4c_stvqa/dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from pythia.datasets.vqa.m4c_textvqa.dataset import M4CTextVQADataset 3 | 4 | 5 | class M4CSTVQADataset(M4CTextVQADataset): 6 | def __init__(self, dataset_type, imdb_file_index, config, *args, **kwargs): 7 | super().__init__( 8 | dataset_type, imdb_file_index, config, *args, **kwargs 9 | ) 10 | self._name = "m4c_stvqa" 11 | -------------------------------------------------------------------------------- /pythia/datasets/vqa/m4c_textvqa/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /pythia/datasets/vqa/m4c_textvqa/builder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | from pythia.common.registry import Registry 3 | from pythia.datasets.vqa.m4c_textvqa.dataset import M4CTextVQADataset 4 | from pythia.datasets.vqa.textvqa.builder import TextVQABuilder 5 | 6 | 7 | @Registry.register_builder("m4c_textvqa") 8 | class M4CTextVQABuilder(TextVQABuilder): 9 | def __init__(self): 10 | super().__init__() 11 | self.dataset_name = "m4c_textvqa" 12 | self.set_dataset_class(M4CTextVQADataset) 13 | -------------------------------------------------------------------------------- /pythia/datasets/vqa/textvqa/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /pythia/datasets/vqa/textvqa/builder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from pythia.common.registry import Registry 3 | from pythia.datasets.vqa.textvqa.dataset import TextVQADataset 4 | from pythia.datasets.vqa.vizwiz import VizWizBuilder 5 | 6 | 7 | @Registry.register_builder("textvqa") 8 | class TextVQABuilder(VizWizBuilder): 9 | def __init__(self): 10 | super().__init__() 11 | self.dataset_name = "textvqa" 12 | self.set_dataset_class(TextVQADataset) 13 | -------------------------------------------------------------------------------- /pythia/datasets/vqa/textvqa/dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from pythia.datasets.vqa.vizwiz import VizWizDataset 3 | from pythia.utils.text_utils import word_tokenize 4 | 5 | 6 | class TextVQADataset(VizWizDataset): 7 | def __init__(self, dataset_type, imdb_file_index, config, *args, **kwargs): 8 | super().__init__(dataset_type, imdb_file_index, config, *args, **kwargs) 9 | self._name = "textvqa" 10 | 11 | def format_for_evalai(self, report): 12 | answers = report.scores.argmax(dim=1) 13 | 14 | predictions = [] 15 | answer_space_size = self.answer_processor.get_true_vocab_size() 16 | 17 | for idx, question_id in enumerate(report.question_id): 18 | answer_id = answers[idx].item() 19 | print(answer_id, idx, len(answers), len(report.question_id), len(report.context_tokens)) 20 | if answer_id >= answer_space_size: 21 | answer_id -= answer_space_size 22 | answer = word_tokenize(report.context_tokens[idx][answer_id]) 23 | else: 24 | answer = self.answer_processor.idx2word(answer_id) 25 | 26 | predictions.append({"question_id": question_id.item(), "answer": answer}) 27 | return predictions 28 | -------------------------------------------------------------------------------- /pythia/datasets/vqa/vizwiz/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .builder import VizWizBuilder 3 | from .dataset import VizWizDataset 4 | 5 | 6 | __all__ = ["VizWizBuilder", "VizWizDataset"] 7 | -------------------------------------------------------------------------------- /pythia/datasets/vqa/vizwiz/builder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | from pythia.common.registry import registry 3 | from pythia.datasets.vqa.vizwiz.dataset import VizWizDataset 4 | from pythia.datasets.vqa.vqa2 import VQA2Builder 5 | 6 | 7 | @registry.register_builder("vizwiz") 8 | class VizWizBuilder(VQA2Builder): 9 | def __init__(self): 10 | super().__init__() 11 | self.dataset_name = "vizwiz" 12 | self.set_dataset_class(VizWizDataset) 13 | 14 | def update_registry_for_model(self, config): 15 | super().update_registry_for_model(config) 16 | -------------------------------------------------------------------------------- /pythia/datasets/vqa/vizwiz/dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import torch 3 | 4 | from pythia.common.sample import Sample 5 | from pythia.datasets.vqa.vqa2 import VQA2Dataset 6 | 7 | 8 | class VizWizDataset(VQA2Dataset): 9 | def __init__(self, dataset_type, imdb_file_index, config, *args, **kwargs): 10 | super().__init__(dataset_type, imdb_file_index, config, *args, **kwargs) 11 | 12 | # Update name as default would be 'vqa2' due to inheritance 13 | self._name = "vizwiz" 14 | 15 | def load_item(self, idx): 16 | sample = super().load_item(idx) 17 | 18 | sample_info = self.imdb[idx] 19 | 20 | if "image_name" in sample_info: 21 | sample.image_id = sample_info["image_name"] 22 | 23 | return sample 24 | 25 | def format_for_evalai(self, report): 26 | answers = report.scores.argmax(dim=1) 27 | 28 | predictions = [] 29 | answer_space_size = self.answer_processor.get_true_vocab_size() 30 | 31 | for idx, image_id in enumerate(report.image_id): 32 | answer_id = answers[idx].item() 33 | 34 | if answer_id >= answer_space_size: 35 | answer_id -= answer_space_size 36 | answer = report.context_tokens[idx][answer_id] 37 | else: 38 | answer = self.answer_processor.idx2word(answer_id) 39 | if answer == self.context_processor.PAD_TOKEN: 40 | answer = "unanswerable" 41 | predictions.append( 42 | { 43 | "image": "_".join(["VizWiz"] + image_id.split("_")[2:]) + ".jpg", 44 | "answer": answer, 45 | } 46 | ) 47 | 48 | return predictions 49 | -------------------------------------------------------------------------------- /pythia/datasets/vqa/vqa2/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | __all__ = ["VQA2Builder", "VQA2Dataset"] 3 | 4 | from .builder import VQA2Builder 5 | from .dataset import VQA2Dataset 6 | -------------------------------------------------------------------------------- /pythia/datasets/vqa/vqa2/builder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | 9 | import os 10 | import warnings 11 | 12 | from pythia.common.registry import registry 13 | from pythia.datasets.base_dataset_builder import BaseDatasetBuilder 14 | from pythia.datasets.concat_dataset import PythiaConcatDataset 15 | from pythia.datasets.vqa.vqa2.dataset import VQA2Dataset 16 | 17 | 18 | @registry.register_builder("vqa2") 19 | class VQA2Builder(BaseDatasetBuilder): 20 | def __init__(self): 21 | super().__init__("vqa2") 22 | self.dataset_class = VQA2Dataset 23 | 24 | def _load(self, dataset_type, config, *args, **kwargs): 25 | self.config = config 26 | 27 | image_features = config["image_features"]["train"][0].split(",") 28 | self.num_image_features = len(image_features) 29 | 30 | registry.register("num_image_features", self.num_image_features) 31 | 32 | self.dataset = self.prepare_data_set(dataset_type, config) 33 | 34 | return self.dataset 35 | 36 | def _build(self, dataset_type, config): 37 | # TODO: Build actually here 38 | return 39 | 40 | def update_registry_for_model(self, config): 41 | registry.register( 42 | self.dataset_name + "_text_vocab_size", 43 | self.dataset.text_processor.get_vocab_size(), 44 | ) 45 | registry.register( 46 | self.dataset_name + "_num_final_outputs", 47 | self.dataset.answer_processor.get_vocab_size(), 48 | ) 49 | 50 | def init_args(self, parser): 51 | parser.add_argument_group("VQA2 task specific arguments") 52 | parser.add_argument( 53 | "--data_root_dir", 54 | type=str, 55 | default="../data", 56 | help="Root directory for data", 57 | ) 58 | parser.add_argument( 59 | "-nfr", 60 | "--fast_read", 61 | type=bool, 62 | default=None, 63 | help="Disable fast read and load features on fly", 64 | ) 65 | 66 | def set_dataset_class(self, cls): 67 | self.dataset_class = cls 68 | 69 | def prepare_data_set(self, dataset_type, config): 70 | if dataset_type not in config.imdb_files: 71 | warnings.warn( 72 | "Dataset type {} is not present in " 73 | "imdb_files of dataset config. Returning None. " 74 | "This dataset won't be used.".format(dataset_type) 75 | ) 76 | return None 77 | 78 | imdb_files = config["imdb_files"][dataset_type] 79 | 80 | datasets = [] 81 | 82 | for imdb_idx in range(len(imdb_files)): 83 | cls = self.dataset_class 84 | dataset = cls(dataset_type, imdb_idx, config) 85 | datasets.append(dataset) 86 | 87 | dataset = PythiaConcatDataset(datasets) 88 | 89 | return dataset 90 | -------------------------------------------------------------------------------- /pythia/datasets/vqa/vqa2/ocr_builder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from pythia.common.registry import Registry 3 | from pythia.datasets.vqa.vizwiz import VizWizBuilder 4 | from pythia.datasets.vqa.vqa2.ocr_dataset import VQA2OCRDataset 5 | 6 | 7 | @Registry.register_builder("vqa2_ocr") 8 | class TextVQABuilder(VizWizBuilder): 9 | def __init__(self): 10 | super().__init__() 11 | self.dataset_name = "VQA2_OCR" 12 | self.set_dataset_class(VQA2OCRDataset) 13 | -------------------------------------------------------------------------------- /pythia/datasets/vqa/vqa2/ocr_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | from pythia.datasets.vqa.vizwiz import VizWizDataset 3 | from pythia.utils.text_utils import word_tokenize 4 | 5 | 6 | class VQA2OCRDataset(VizWizDataset): 7 | def __init__(self, imdb_file, image_feat_directories, verbose=False, **data_params): 8 | super(VQA2OCRDataset, self).__init__( 9 | imdb_file, image_feat_directories, verbose, **data_params 10 | ) 11 | self.name = "vqa2_ocr" 12 | 13 | def format_for_evalai(self, batch, answers): 14 | answers = answers.argmax(dim=1) 15 | 16 | predictions = [] 17 | for idx, question_id in enumerate(batch["question_id"]): 18 | answer_id = answers[idx] 19 | 20 | if answer_id >= self.answer_space_size: 21 | answer_id -= self.answer_space_size 22 | answer = word_tokenize(batch["ocr_tokens"][answer_id][idx]) 23 | else: 24 | answer = self.answer_dict.idx2word(answer_id) 25 | predictions.append({"question_id": question_id.item(), "answer": answer}) 26 | 27 | return predictions 28 | 29 | def __getitem__(self, idx): 30 | sample = super(VQA2OCRDataset, self).__getitem__(idx) 31 | 32 | if sample["question_id"] is None: 33 | sample["question_id"] = -1 34 | return sample 35 | -------------------------------------------------------------------------------- /pythia/legacy/best_model/config.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | batch_size: 512 3 | data_root_dir: data/ 4 | dataset: vqa_2.0 5 | image_depth_first: false 6 | image_fast_reader: false 7 | image_feat_test: 8 | - rcnn_10_100/test2015 9 | image_feat_train: 10 | - rcnn_10_100/train2014 11 | - rcnn_10_100/val2014 12 | image_feat_val: 13 | - rcnn_10_100/val2014 14 | image_max_loc: 100 15 | imdb_file_test: 16 | - imdb/imdb_test2015.npy 17 | imdb_file_train: 18 | - imdb/imdb_train2014.npy 19 | - imdb/imdb_val2014.npy 20 | imdb_file_val: 21 | - imdb/imdb_minival2014.npy 22 | num_workers: 5 23 | question_max_len: 14 24 | vocab_answer_file: answers_vqa.txt 25 | vocab_question_file: large_vocabulary_vqa.txt 26 | exp_name: baseline 27 | loss: logitBCE 28 | model: 29 | classifier: 30 | method: logit_classifier 31 | par: 32 | img_hidden_dim: 5000 33 | txt_hidden_dim: 300 34 | image_embedding_models: 35 | - modal_combine: 36 | method: non_linear_elmt_multiply 37 | par: 38 | dropout: 0 39 | hidden_size: 5000 40 | normalization: softmax 41 | transform: 42 | method: linear_transform 43 | par: 44 | out_dim: 1 45 | image_feat_dim: 2048 46 | image_feature_encoding: 47 | - method: default_image 48 | par: {} 49 | modal_combine: 50 | method: non_linear_elmt_multiply 51 | par: 52 | dropout: 0 53 | hidden_size: 5000 54 | question_embedding: 55 | - method: att_que_embed 56 | par: 57 | LSTM_hidden_size: 1024 58 | LSTM_layer: 1 59 | conv1_out: 512 60 | conv2_out: 2 61 | dropout: 0 62 | embedding_dim: 300 63 | embedding_init_file: large_vqa2.0_glove.6B.300d.txt.npy 64 | kernel_size: 1 65 | padding: 0 66 | optimizer: 67 | method: Adamax 68 | par: 69 | eps: 1.0e-08 70 | lr: 0.01 71 | weight_decay: 0 72 | run: train+predict 73 | training_parameters: 74 | clip_norm_mode: all 75 | lr_ratio: 0.1 76 | lr_steps: 77 | - 15000 78 | - 18000 79 | - 20000 80 | - 21000 81 | max_grad_l2_norm: 0.25 82 | max_iter: 22000 83 | report_interval: 100 84 | snapshot_interval: 1000 85 | wu_factor: 0.2 86 | wu_iters: 1000 87 | -------------------------------------------------------------------------------- /pythia/legacy/config/collections.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | 9 | """A simple attribute dictionary used for representing configuration \ 10 | options.""" 11 | 12 | 13 | class AttrDict(dict): 14 | 15 | IMMUTABLE = "__immutable__" 16 | 17 | def __init__(self, *args, **kwargs): 18 | super(AttrDict, self).__init__(*args, **kwargs) 19 | self.__dict__[AttrDict.IMMUTABLE] = False 20 | 21 | def __getattr__(self, name): 22 | if name in self.__dict__: 23 | return self.__dict__[name] 24 | elif name in self: 25 | return self[name] 26 | else: 27 | raise AttributeError(name) 28 | 29 | def __setattr__(self, name, value): 30 | if not self.__dict__[AttrDict.IMMUTABLE]: 31 | if name in self.__dict__: 32 | self.__dict__[name] = value 33 | else: 34 | self[name] = value 35 | else: 36 | raise AttributeError( 37 | 'Attempted to set "{}" to "{}", but AttrDict is immutable'.format( 38 | name, value 39 | ) 40 | ) 41 | 42 | def immutable(self, is_immutable): 43 | """Set immutability to is_immutable and recursively apply the setting 44 | to all nested AttrDicts. 45 | """ 46 | self.__dict__[AttrDict.IMMUTABLE] = is_immutable 47 | # Recursively set immutable state 48 | for v in self.__dict__.values(): 49 | if isinstance(v, AttrDict): 50 | v.immutable(is_immutable) 51 | for v in self.values(): 52 | if isinstance(v, AttrDict): 53 | v.immutable(is_immutable) 54 | 55 | def is_immutable(self): 56 | return self.__dict__[AttrDict.IMMUTABLE] 57 | -------------------------------------------------------------------------------- /pythia/legacy/config/demo/config.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | batch_size: 512 3 | data_root_dir: data/ 4 | dataset: vqa_2.0 5 | image_depth_first: false 6 | image_fast_reader: false 7 | image_feat_test: 8 | - demo/features/ 9 | image_feat_train: 10 | - rcnn_10_100/train2014 11 | - rcnn_10_100/val2014 12 | image_feat_val: 13 | - rcnn_10_100/val2014 14 | image_max_loc: 100 15 | imdb_file_test: 16 | - demo/imdb/imdb_demo.npy 17 | imdb_file_train: 18 | - imdb/imdb_train2014.npy 19 | - imdb/imdb_val2014.npy 20 | imdb_file_val: 21 | - imdb/imdb_minival2014.npy 22 | num_workers: 5 23 | question_max_len: 14 24 | vocab_answer_file: answers_vqa.txt 25 | vocab_question_file: large_vocabulary_vqa.txt 26 | exp_name: baseline 27 | loss: logitBCE 28 | model: 29 | classifier: 30 | method: logit_classifier 31 | par: 32 | img_hidden_dim: 5000 33 | txt_hidden_dim: 300 34 | image_embedding_models: 35 | - modal_combine: 36 | method: non_linear_elmt_multiply 37 | par: 38 | dropout: 0 39 | hidden_size: 5000 40 | normalization: softmax 41 | transform: 42 | method: linear_transform 43 | par: 44 | out_dim: 1 45 | image_feat_dim: 2048 46 | image_feature_encoding: 47 | - method: default_image 48 | par: {} 49 | modal_combine: 50 | method: non_linear_elmt_multiply 51 | par: 52 | dropout: 0 53 | hidden_size: 5000 54 | question_embedding: 55 | - method: att_que_embed 56 | par: 57 | LSTM_hidden_size: 1024 58 | LSTM_layer: 1 59 | conv1_out: 512 60 | conv2_out: 2 61 | dropout: 0 62 | embedding_dim: 300 63 | embedding_init_file: large_vqa2.0_glove.6B.300d.txt.npy 64 | kernel_size: 1 65 | padding: 0 66 | optimizer: 67 | method: Adamax 68 | par: 69 | eps: 1.0e-08 70 | lr: 0.01 71 | weight_decay: 0 72 | run: train+predict 73 | training_parameters: 74 | clip_norm_mode: all 75 | lr_ratio: 0.1 76 | lr_steps: 77 | - 15000 78 | - 18000 79 | - 20000 80 | - 21000 81 | 
max_grad_l2_norm: 0.25 82 | max_iter: 22000 83 | report_interval: 100 84 | snapshot_interval: 1000 85 | wu_factor: 0.2 86 | wu_iters: 1000 87 | -------------------------------------------------------------------------------- /pythia/legacy/config/keep/MFH_ft.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | data_root_dir: data 3 | image_feat_test: 4 | - detectron/fc6/vqa/test2015 5 | image_feat_train: 6 | - detectron_23/fc6/vqa/train2014 7 | - detectron_23/fc6/vqa/val2014 8 | image_feat_val: 9 | - detectron_23/fc6/vqa/val2014 10 | image_max_loc: 100 11 | imdb_file_test: 12 | - imdb/imdb_test2015.npy 13 | imdb_file_train: 14 | - imdb/imdb_train2014.npy 15 | - imdb/imdb_val2train2014.npy 16 | imdb_file_val: 17 | - imdb/imdb_minival2014.npy 18 | batch_size: 512 19 | loss: softmaxKL 20 | model: 21 | image_feature_encoding: 22 | - method: finetune_faster_rcnn_fpn_fc7 23 | par: 24 | weights_file: detectron/fc6/fc7_w.pkl 25 | bias_file: detectron/fc6/fc7_b.pkl 26 | classifier: 27 | method: linear_classifier 28 | image_embedding_models: 29 | - modal_combine: 30 | method: MFH 31 | par: 32 | order: 1 33 | hidden_sizes: 34 | - 5000 35 | dropout: 0.1 36 | pool_size: 5 37 | normalization: softmax 38 | transform: 39 | method: conv_transform 40 | modal_combine: 41 | method: MFH 42 | par: 43 | order: 2 44 | hidden_sizes: 45 | - 5000 46 | - 5000 47 | dropout: 0.1 48 | pool_size: 5 49 | -------------------------------------------------------------------------------- /pythia/legacy/config/keep/detectron.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | data_root_dir: data 3 | image_feat_test: 4 | - detectron/fc6/vqa/test2015 5 | image_feat_train: 6 | - detectron/fc6/vqa/train2014 7 | - detectron/fc6/vqa/val2014 8 | image_feat_val: 9 | - detectron/fc6/vqa/val2014 10 | image_max_loc: 100 11 | imdb_file_test: 12 | - imdb/imdb_test2015.npy 13 | imdb_file_train: 14 | - imdb/imdb_train2014.npy 15 | - imdb/imdb_val2train2014.npy 16 | imdb_file_val: 17 | - imdb/imdb_minival2014.npy 18 | model: 19 | image_feature_encoding: 20 | - method: finetune_faster_rcnn_fpn_fc7 21 | par: 22 | weights_file: detectron/fc6/fc7_w.pkl 23 | bias_file: detectron/fc6/fc7_b.pkl 24 | -------------------------------------------------------------------------------- /pythia/legacy/config/verbose/MFH_module.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | batch_size: 512 3 | data_root_dir: data 4 | dataset: vqa_2.0 5 | image_depth_first: false 6 | image_fast_reader: true 7 | image_feat_test: 8 | - detectron/fc6/vqa/test2015 9 | image_feat_train: 10 | - detectron/fc6/vqa/train2014 11 | - detectron/fc6/vqa/val2014 12 | image_feat_val: 13 | - detectron/fc6/vqa/val2014 14 | image_max_loc: 100 15 | imdb_file_test: 16 | - imdb/imdb_test2015.npy 17 | imdb_file_train: 18 | - imdb/imdb_train2014.npy 19 | - imdb/imdb_val2train2014.npy 20 | imdb_file_val: 21 | - imdb/imdb_minival2014.npy 22 | num_workers: 5 23 | question_max_len: 14 24 | vocab_answer_file: answers_vqa.txt 25 | vocab_question_file: vocabulary_vqa.txt 26 | exp_name: baseline 27 | loss: softmaxKL 28 | model: 29 | classifier: 30 | method: linear_classifier 31 | par: {} 32 | image_embedding_models: 33 | - modal_combine: 34 | method: MFH 35 | par: 36 | dropout: 0.1 37 | hidden_sizes: 38 | - 5000 39 | - 5000 40 | order: 2 41 | pool_size: 5 42 | normalization: softmax 43 | transform: 44 | method: conv_transform 45 | par: 46 | hidden_dim: 512 
47 | out_dim: 2 48 | image_feat_dim: 2048 49 | image_feature_encoding: 50 | - method: finetune_faster_rcnn_fpn_fc7 51 | par: 52 | bias_file: detectron/fc6/fc7_b.pkl 53 | weights_file: detectron/fc6/fc7_w.pkl 54 | modal_combine: 55 | method: MFH 56 | par: 57 | dropout: 0.1 58 | hidden_sizes: 59 | - 5000 60 | - 5000 61 | order: 2 62 | pool_size: 5 63 | question_embedding: 64 | - method: att_que_embed 65 | par: 66 | LSTM_hidden_size: 1024 67 | LSTM_layer: 1 68 | conv1_out: 512 69 | conv2_out: 2 70 | dropout: 0 71 | embedding_dim: 300 72 | embedding_init_file: vqa2.0_glove.6B.300d.txt.npy 73 | kernel_size: 1 74 | padding: 0 75 | optimizer: 76 | method: Adamax 77 | par: 78 | eps: 1.0e-08 79 | lr: 0.01 80 | weight_decay: 0 81 | run: train+predict 82 | training_parameters: 83 | clip_norm_mode: all 84 | lr_ratio: 0.1 85 | lr_steps: 86 | - 5000 87 | - 7000 88 | - 9000 89 | - 11000 90 | max_grad_l2_norm: 0.25 91 | max_iter: 12000 92 | report_interval: 100 93 | snapshot_interval: 1000 94 | wu_factor: 0.2 95 | wu_iters: 1000 96 | -------------------------------------------------------------------------------- /pythia/legacy/config/verbose/dectectron_finetune.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | batch_size: 512 3 | data_root_dir: data 4 | dataset: vqa_2.0 5 | image_depth_first: false 6 | image_fast_reader: true 7 | image_feat_test: 8 | - detectron/fc6/vqa/test2015 9 | image_feat_train: 10 | - detectron/fc6/vqa/train2014 11 | - detectron/fc6/vqa/val2014 12 | image_feat_val: 13 | - detectron/fc6/vqa/val2014 14 | image_max_loc: 100 15 | imdb_file_test: 16 | - imdb/imdb_test2015.npy 17 | imdb_file_train: 18 | - imdb/imdb_train2014.npy 19 | - imdb/imdb_val2train2014.npy 20 | imdb_file_val: 21 | - imdb/imdb_minival2014.npy 22 | num_workers: 5 23 | question_max_len: 14 24 | vocab_answer_file: answers_vqa.txt 25 | vocab_question_file: vocabulary_vqa.txt 26 | exp_name: baseline 27 | loss: logitBCE 28 | model: 29 | classifier: 30 | method: logit_classifier 31 | par: 32 | img_hidden_dim: 5000 33 | txt_hidden_dim: 300 34 | image_embedding_models: 35 | - modal_combine: 36 | method: non_linear_elmt_multiply 37 | par: 38 | dropout: 0 39 | hidden_size: 5000 40 | normalization: softmax 41 | transform: 42 | method: linear_transform 43 | par: 44 | out_dim: 1 45 | image_feat_dim: 2048 46 | image_feature_encoding: 47 | - method: finetune_faster_rcnn_fpn_fc7 48 | par: 49 | bias_file: detectron/fc6/fc7_b.pkl 50 | weights_file: detectron/fc6/fc7_w.pkl 51 | modal_combine: 52 | method: non_linear_elmt_multiply 53 | par: 54 | dropout: 0 55 | hidden_size: 5000 56 | question_embedding: 57 | - method: att_que_embed 58 | par: 59 | LSTM_hidden_size: 1024 60 | LSTM_layer: 1 61 | conv1_out: 512 62 | conv2_out: 2 63 | dropout: 0 64 | embedding_dim: 300 65 | embedding_init_file: vqa2.0_glove.6B.300d.txt.npy 66 | kernel_size: 1 67 | padding: 0 68 | optimizer: 69 | method: Adamax 70 | par: 71 | eps: 1.0e-08 72 | lr: 0.01 73 | weight_decay: 0 74 | run: train+predict 75 | training_parameters: 76 | clip_norm_mode: all 77 | lr_ratio: 0.1 78 | lr_steps: 79 | - 5000 80 | - 7000 81 | - 9000 82 | - 11000 83 | max_grad_l2_norm: 0.25 84 | max_iter: 12000 85 | report_interval: 100 86 | snapshot_interval: 1000 87 | wu_factor: 0.2 88 | wu_iters: 1000 89 | -------------------------------------------------------------------------------- /pythia/legacy/config/verbose/default.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | batch_size: 512 3 
| data_root_dir: data 4 | dataset: vqa_2.0 5 | image_depth_first: false 6 | image_fast_reader: true 7 | image_feat_test: 8 | - rcnn_10_100/vqa/test2015 9 | image_feat_train: 10 | - rcnn_10_100/vqa/train2014 11 | - rcnn_10_100/vqa/val2014 12 | image_feat_val: 13 | - rcnn_10_100/vqa/val2014 14 | image_max_loc: 100 15 | imdb_file_test: 16 | - imdb/imdb_test2015.npy 17 | imdb_file_train: 18 | - imdb/imdb_train2014.npy 19 | - imdb/imdb_val2train2014.npy 20 | imdb_file_val: 21 | - imdb/imdb_minival2014.npy 22 | num_workers: 5 23 | question_max_len: 14 24 | vocab_answer_file: answers_vqa.txt 25 | vocab_question_file: vocabulary_vqa.txt 26 | exp_name: baseline 27 | loss: logitBCE 28 | model: 29 | classifier: 30 | method: logit_classifier 31 | par: 32 | img_hidden_dim: 5000 33 | txt_hidden_dim: 300 34 | image_embedding_models: 35 | - modal_combine: 36 | method: non_linear_elmt_multiply 37 | par: 38 | dropout: 0 39 | hidden_size: 5000 40 | normalization: softmax 41 | transform: 42 | method: linear_transform 43 | par: 44 | out_dim: 1 45 | image_feat_dim: 2048 46 | image_feature_encoding: 47 | - method: default_image 48 | par: {} 49 | modal_combine: 50 | method: non_linear_elmt_multiply 51 | par: 52 | dropout: 0 53 | hidden_size: 5000 54 | question_embedding: 55 | - method: att_que_embed 56 | par: 57 | LSTM_hidden_size: 1024 58 | LSTM_layer: 1 59 | conv1_out: 512 60 | conv2_out: 2 61 | dropout: 0 62 | embedding_dim: 300 63 | embedding_init_file: vqa2.0_glove.6B.300d.txt.npy 64 | kernel_size: 1 65 | padding: 0 66 | optimizer: 67 | method: Adamax 68 | par: 69 | eps: 1.0e-08 70 | lr: 0.01 71 | weight_decay: 0 72 | run: train+predict 73 | training_parameters: 74 | clip_norm_mode: all 75 | lr_ratio: 0.1 76 | lr_steps: 77 | - 5000 78 | - 7000 79 | - 9000 80 | - 11000 81 | max_grad_l2_norm: 0.25 82 | max_iter: 12000 83 | report_interval: 100 84 | snapshot_interval: 1000 85 | wu_factor: 0.2 86 | wu_iters: 1000 87 | -------------------------------------------------------------------------------- /pythia/legacy/data/demo/features/COCO_test2015_000000000001.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guanghuixu/AnchorCaptioner/3a49ce5de025087cbea00075ec0636aee0525382/pythia/legacy/data/demo/features/COCO_test2015_000000000001.npy -------------------------------------------------------------------------------- /pythia/legacy/data/demo/images/COCO_test2015_000000000001.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guanghuixu/AnchorCaptioner/3a49ce5de025087cbea00075ec0636aee0525382/pythia/legacy/data/demo/images/COCO_test2015_000000000001.jpg -------------------------------------------------------------------------------- /pythia/legacy/data/demo/imdb/imdb_demo.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guanghuixu/AnchorCaptioner/3a49ce5de025087cbea00075ec0636aee0525382/pythia/legacy/data/demo/imdb/imdb_demo.npy -------------------------------------------------------------------------------- /pythia/legacy/data_prep/data_preprocess.md: -------------------------------------------------------------------------------- 1 | #### VQA v2.0 2 | 3 | Download dataset 4 | ```bash 5 | cd ../ 6 | mkdir -p orig_data/vqa_v2.0 7 | cd orig_data/vqa_v2.0 8 | ./../../data_prep/vqa_v2.0/download_vqa_2.0.sh 9 | 10 | ``` 11 | 12 | Preprocess dataset 13 | ```bash 14 | cd ../../VQA_suite 15 | mkdir data 16 | 17 | 
export PYTHONPATH=. 18 | 19 | python data_prep/vqa_v2.0/extract_vocabulary.py \ 20 | --input_files ../orig_data/vqa_v2.0/v2_OpenEnded_mscoco_train2014_questions.json \ 21 | ../orig_data/vqa_v2.0/v2_OpenEnded_mscoco_val2014_questions.json \ 22 | ../orig_data/vqa_v2.0/v2_OpenEnded_mscoco_test2015_questions.json \ 23 | --out_dir data/ 24 | 25 | python data_prep/vqa_v2.0/process_answers.py \ 26 | --annotation_file ../orig_data/vqa_v2.0/v2_mscoco_train2014_annotations.json \ 27 | --val_annotation_file ../orig_data/vqa_v2.0/v2_mscoco_val2014_annotations.json \ 28 | --out_dir data/ --min_freq 9 29 | 30 | python data_prep/vqa_v2.0/extract_word_glove_embedding.py \ 31 | --vocabulary_file data/vocabulary_vqa.txt \ 32 | --glove_file ../orig_data/vqa_v2.0/glove/glove.6B.300d.txt \ 33 | --out_dir data/ 34 | 35 | python data_prep/vqa_v2.0/build_vqa_2.0_imdb.py --data_dir ../orig_data/vqa_v2.0/ --out_dir data/ 36 | 37 | ``` 38 | 39 | Download image features 40 | ```bash 41 | cd data/ 42 | wget https://dl.fbaipublicfiles.com/pythia/features/rcnn_10_100.tar.gz 43 | wget https://dl.fbaipublicfiles.com/pythia/features/detectron.tar.gz 44 | gunzip rcnn_10_100.tar.gz 45 | tar -xvf rcnn_10_100.tar 46 | rm -f rcnn_10_100.tar 47 | 48 | gunzip detectron.tar.gz 49 | tar -xvf detectron.tar 50 | rm -f detectron.tar 51 | ``` 52 | ### Extract Image Features 53 | 54 | We use detectron to extract image features. Set up [detectron](https://github.com/facebookresearch/Detectron) 55 | and copy [tools/extract_features.py](tools/extract_features.py) into detectron to extract the features. 56 | 57 | 58 | Feature extraction works best with commit #3a38b7b of [detectron](https://github.com/facebookresearch/Detectron) 59 | and commit #0dd3284 of [caffe2](https://github.com/caffe2/caffe2). 60 | 61 | 62 | Download the pretrained detectron model: 63 | ```bash 64 | wget https://dl.fbaipublicfiles.com/pythia/detectron_model/FAST_RCNN_MLP_DIM2048_FPN_DIM512.pkl 65 | wget https://dl.fbaipublicfiles.com/pythia/detectron_model/e2e_faster_rcnn_X-101-64x4d-FPN_1x_MLP_2048_FPN_512.yaml 66 | 67 | INPUT_DIR=/path/to/your/input  # an image file or a directory of images 68 | 69 | python extract_features.py --cfg e2e_faster_rcnn_X-101-64x4d-FPN_1x_MLP_2048_FPN_512.yaml \ 70 | --wts FAST_RCNN_MLP_DIM2048_FPN_DIM512.pkl \ 71 | --min_bboxes 100 --max_bboxes 100 \ 72 | --feat_name gpu_0/fc6 \ 73 | --output_dir ~/temp_out $INPUT_DIR 74 | ``` 75 | -------------------------------------------------------------------------------- /pythia/legacy/data_prep/vqa_v2.0/download_vqa_2.0.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | 4 | # GloVe Vectors 5 | wget http://nlp.stanford.edu/data/glove.6B.zip 6 | unzip glove.6B.zip -d glove 7 | rm glove.6B.zip 8 | 9 | 10 | ##VQA2.0 11 | 12 | wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Annotations_Train_mscoco.zip 13 | 14 | wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Annotations_Val_mscoco.zip 15 | 16 | wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_Train_mscoco.zip 17 | 18 | wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_Val_mscoco.zip 19 | 20 | wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_Test_mscoco.zip 21 | 22 | wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Complementary_Pairs_Train_mscoco.zip 23 | 24 | wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Complementary_Pairs_Val_mscoco.zip 25 | 26 | unzip v2_Annotations_Train_mscoco.zip 27 | rm v2_Annotations_Train_mscoco.zip 28 | 29 | unzip
v2_Annotations_Val_mscoco.zip 30 | rm v2_Annotations_Val_mscoco.zip 31 | 32 | unzip v2_Questions_Train_mscoco.zip 33 | rm v2_Questions_Train_mscoco.zip 34 | 35 | unzip v2_Questions_Val_mscoco.zip 36 | rm v2_Questions_Val_mscoco.zip 37 | 38 | unzip v2_Questions_Test_mscoco.zip 39 | rm v2_Questions_Test_mscoco.zip 40 | 41 | unzip v2_Complementary_Pairs_Train_mscoco.zip 42 | rm v2_Complementary_Pairs_Train_mscoco.zip 43 | 44 | unzip v2_Complementary_Pairs_Val_mscoco.zip 45 | rm v2_Complementary_Pairs_Val_mscoco.zip 46 | 47 | 48 | ### get minival and val2train 49 | wget https://dl.fbaipublicfiles.com/pythia/data/v2_OpenEnded_mscoco_minival2014_questions.json 50 | wget https://dl.fbaipublicfiles.com/pythia/data/v2_OpenEnded_mscoco_val2train2014_questions.json 51 | -------------------------------------------------------------------------------- /pythia/legacy/data_prep/vqa_v2.0/extract_ques_info.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | 9 | import argparse 10 | import json 11 | 12 | 13 | def extract_info(annotations, writer): 14 | for annotation in annotations: 15 | question_id = annotation["question_id"] 16 | answer_type = annotation["answer_type"] 17 | question_type = annotation["question_type"] 18 | multiple_choice_answer = annotation["multiple_choice_answer"] 19 | answers = [a["answer"] for a in annotation["answers"]] 20 | answers_out = "|".join([str(a) for a in answers]) 21 | confidences = [a["answer_confidence"] for a in annotation["answers"]] 22 | confidences_out = "|".join(str(a) for a in confidences) 23 | 24 | writer.write( 25 | str(question_id) 26 | + "\t" 27 | + question_type 28 | + "\t" 29 | + answer_type 30 | + "\t" 31 | + str(multiple_choice_answer) 32 | + "\t" 33 | + answers_out 34 | + "\t" 35 | + confidences_out 36 | + "\n" 37 | ) 38 | 39 | 40 | if __name__ == "__main__": 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument( 43 | "--annotation_files", 44 | nargs="+", 45 | required=True, 46 | help="input annotation json files, \ 47 | if more than 1, split by space", 48 | ) 49 | parser.add_argument("--out", type=str, required=True, help="out put files") 50 | 51 | args = parser.parse_args() 52 | out_writer = open(args.out, "w") 53 | 54 | for annotation_file in args.annotation_files: 55 | with open(annotation_file, "r") as f: 56 | annotations = json.load(f)["annotations"] 57 | extract_info(annotations, out_writer) 58 | 59 | out_writer.close() 60 | -------------------------------------------------------------------------------- /pythia/legacy/data_prep/vqa_v2.0/extract_vocabulary.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | 9 | import argparse 10 | import json 11 | import os 12 | from collections import Counter 13 | 14 | from dataset_utils.text_processing import tokenize 15 | 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument( 18 | "--input_files", 19 | nargs="+", 20 | required=True, 21 | help="input question json files, \ 22 | if more than 1, split by space", 23 | ) 24 | parser.add_argument( 25 | "--out_dir", 26 | type=str, 27 | default="./", 28 | help="output directory, default is current directory", 29 | ) 30 | parser.add_argument( 31 | "--min_freq", 32 | type=int, 33 | default=0, 34 | help="the minimum times of word occurrence \ 35 | to be included in vocabulary, default 0", 36 | ) 37 | 38 | args = parser.parse_args() 39 | 40 | input_files = args.input_files 41 | out_dir = args.out_dir 42 | min_freq = args.min_freq 43 | 44 | os.makedirs(out_dir, exist_ok=True) 45 | 46 | vocab_file_name = "vocabulary_vqa.txt" 47 | 48 | word_count = Counter() 49 | questions = [] 50 | 51 | for idx, input_file in enumerate(input_files): 52 | with open(input_file, "r") as f: 53 | questions += json.load(f)["questions"] 54 | 55 | question_length = [None] * len(questions) 56 | 57 | for inx, question in enumerate(questions): 58 | words = tokenize(question["question"]) 59 | question_length[inx] = len(words) 60 | word_count.update(words) 61 | 62 | vocabulary = [w[0] for w in word_count.items() if w[1] >= min_freq] 63 | vocabulary.sort() 64 | vocabulary = ["<unk>"] + vocabulary 65 | 66 | vocab_file = os.path.join(out_dir, vocab_file_name) 67 | with open(vocab_file, "w") as f: 68 | f.writelines([w + "\n" for w in vocabulary]) 69 | 70 | 71 | print("min question len=", min(question_length)) 72 | print("max question len=", max(question_length)) 73 | -------------------------------------------------------------------------------- /pythia/legacy/data_prep/vqa_v2.0/extract_word_glove_embedding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree.
6 | # 7 | 8 | 9 | import argparse 10 | import os 11 | 12 | import numpy as np 13 | 14 | from dataset_utils.text_processing import VocabDict 15 | 16 | 17 | def subset_weights(glove_file, vocabulary_file): 18 | with open(glove_file, "r") as f: 19 | entries = f.readlines() 20 | emb_dim = len(entries[0].split(" ")) - 1 21 | print("embedding dim is %d" % emb_dim) 22 | 23 | vocabulary = VocabDict(vocab_file=vocabulary_file) 24 | 25 | weights = np.zeros((vocabulary.num_vocab, emb_dim), dtype=np.float32) 26 | 27 | word2emb = {} 28 | for entry in entries: 29 | vals = entry.split(" ") 30 | word = vals[0] 31 | vals = np.array(list(map(float, vals[1:]))) 32 | word2emb[word] = np.array(vals) 33 | 34 | for word, idx in vocabulary.word2idx_dict.items(): 35 | if word not in word2emb: 36 | continue 37 | weights[idx] = word2emb[word] 38 | 39 | return weights 40 | 41 | 42 | if __name__ == "__main__": 43 | parser = argparse.ArgumentParser() 44 | parser.add_argument( 45 | "--vocabulary_file", 46 | type=str, 47 | required=True, 48 | help="input train annotationjson file", 49 | ) 50 | parser.add_argument( 51 | "--glove_file", 52 | type=str, 53 | required=True, 54 | help="glove files with the corresponding dim", 55 | ) 56 | parser.add_argument( 57 | "--out_dir", 58 | type=str, 59 | default="./", 60 | help="output directory, default is current directory", 61 | ) 62 | 63 | args = parser.parse_args() 64 | 65 | glove_file = args.glove_file 66 | vocabulary_file = args.vocabulary_file 67 | out_dir = args.out_dir 68 | 69 | os.makedirs(out_dir, exist_ok=True) 70 | emb_file_name = "vqa2.0_" + os.path.basename(glove_file) + ".npy" 71 | 72 | weights = subset_weights(glove_file, vocabulary_file) 73 | 74 | emb_file = os.path.join(out_dir, emb_file_name) 75 | np.save(emb_file, weights) 76 | -------------------------------------------------------------------------------- /pythia/legacy/data_prep/vqa_v2.0/genome_ids.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guanghuixu/AnchorCaptioner/3a49ce5de025087cbea00075ec0636aee0525382/pythia/legacy/data_prep/vqa_v2.0/genome_ids.pkl -------------------------------------------------------------------------------- /pythia/legacy/dataset_utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | -------------------------------------------------------------------------------- /pythia/legacy/dataset_utils/create_imdb_header.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | 9 | import datetime 10 | 11 | from global_variables.global_variables import imdb_version 12 | 13 | 14 | def create_header(dataset_name, has_answer, has_gt_layout): 15 | now = datetime.datetime.now() 16 | time = now.strftime("%Y-%m-%d %H:%M") 17 | version = imdb_version 18 | header = dict( 19 | create_time=time, 20 | dataset_name=dataset_name, 21 | version=version, 22 | has_answer=has_answer, 23 | has_gt_layout=has_gt_layout, 24 | ) 25 | return header 26 | -------------------------------------------------------------------------------- /pythia/legacy/dataset_utils/text_processing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | 9 | import re 10 | 11 | SENTENCE_SPLIT_REGEX = re.compile(r"(\W+)") 12 | 13 | 14 | def tokenize(sentence): 15 | sentence = sentence.lower() 16 | sentence = sentence.replace(",", "").replace("?", "").replace("'s", " 's") 17 | tokens = SENTENCE_SPLIT_REGEX.split(sentence) 18 | tokens = [t.strip() for t in tokens if len(t.strip()) > 0] 19 | return tokens 20 | 21 | 22 | def load_str_list(fname): 23 | with open(fname) as f: 24 | lines = f.readlines() 25 | lines = [l.strip() for l in lines] 26 | return lines 27 | 28 | 29 | class VocabDict: 30 | def __init__(self, vocab_file): 31 | self.word_list = load_str_list(vocab_file) 32 | self.word2idx_dict = {w: n_w for n_w, w in enumerate(self.word_list)} 33 | self.num_vocab = len(self.word_list) 34 | self.UNK_idx = ( 35 | self.word2idx_dict["<unk>"] if "<unk>" in self.word2idx_dict else None 36 | ) 37 | 38 | def idx2word(self, n_w): 39 | return self.word_list[n_w] 40 | 41 | def word2idx(self, w): 42 | if w in self.word2idx_dict: 43 | return self.word2idx_dict[w] 44 | elif self.UNK_idx is not None: 45 | return self.UNK_idx 46 | else: 47 | raise ValueError( 48 | "word %s not in dictionary \ 49 | (while dictionary does not contain <unk>)" 50 | % w 51 | ) 52 | 53 | def tokenize_and_index(self, sentence): 54 | inds = [self.word2idx(w) for w in tokenize(sentence)] 55 | return inds 56 | -------------------------------------------------------------------------------- /pythia/legacy/dataset_utils/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | 9 | import numpy as np 10 | 11 | 12 | def unique_columns(data): 13 | dt = np.dtype((np.void, data.dtype.itemsize * data.shape[0])) 14 | dataf = np.asfortranarray(data).view(dt) 15 | u, uind = np.unique(dataf, return_inverse=True) 16 | m = u.view(data.dtype).reshape(-1, data.shape[0]).T 17 | res = [np.where(uind == x)[0] for x in range(m.shape[1])] 18 | return res 19 | -------------------------------------------------------------------------------- /pythia/legacy/dataset_utils/vqa_collates.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree.
6 | # 7 | 8 | 9 | import numpy as np 10 | from torch.utils.data.dataloader import default_collate 11 | 12 | 13 | def filter_unk_collate(batch): 14 | batch = list(filter(lambda x: np.sum(x["ans_scores"]) > 0, batch)) 15 | return default_collate(batch) 16 | -------------------------------------------------------------------------------- /pythia/legacy/dataset_utils/vqa_concate_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | 9 | from torch.utils.data import ConcatDataset 10 | 11 | 12 | class vqa_concate_dataset(ConcatDataset): 13 | def __init__(self, datasets): 14 | super(vqa_concate_dataset, self).__init__(datasets) 15 | self.vocab_dict = datasets[0].vocab_dict 16 | self.answer_dict = datasets[0].answer_dict 17 | -------------------------------------------------------------------------------- /pythia/legacy/dataset_utils/vqa_html_writer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | 9 | html_header = """ 10 | 11 | W3.CSS Template 12 | 13 | 14 | 18 | 19 | 20 |
22 | 23 | """ 24 | 25 | html_footer = """ 26 | </body> 27 | </html> 28 | """ 29 | 30 | row_header = """ 31 | <div class="w3-row"> 32 | """ 33 | 34 | element_header = """ 35 | <div class="w3-quarter"> 36 | """ 37 | 38 | 39 | class vqa_html_writer: 40 | def __init__(self, file_path, elements_per_row=4): 41 | self._writer = open(file_path, "w") 42 | self._writer.write(html_header) 43 | self.count = 0 44 | self.elements_per_row = elements_per_row 45 | 46 | def write_element(self, image, **kwarg): 47 | if self.count % self.elements_per_row == 0: 48 | self._writer.write(row_header + "\n") 49 | self._writer.write(element_header) 50 | self._writer.write('<img src="%s">' % image) 51 | for key, value in kwarg.items(): 52 | self._writer.write("<div><b>%s</b> : %s</div>" % (key, value)) 53 | self._writer.write("</div>") 54 | self.count += 1 55 | if self.count % self.elements_per_row == 0 and self.count > 0: 56 | self._writer.write("</div>") 57 | 58 | def close(self): 59 | if self.count % self.elements_per_row != 0: 60 | self._writer.write("</div>
") 61 | self._writer.write(html_footer) 62 | self._writer.close() 63 | 64 | 65 | if __name__ == "__main__": 66 | html_writer = vqa_html_writer("/Users/tinayujiang/temp/test.html", 4) 67 | n = 10 68 | for i in range(10): 69 | image_path = ( 70 | "/Users/tinayujiang/work/VQA/data_analysis/val2014/" 71 | + "COCO_val2014_000000290951.jpg" 72 | ) 73 | info = {"question": "abcfs efc?", "answers": " wdds cdsde"} 74 | html_writer.write_element(image_path, **info) 75 | 76 | html_writer.close() 77 | -------------------------------------------------------------------------------- /pythia/legacy/ensemble.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | 9 | import argparse 10 | import glob 11 | import json 12 | 13 | import numpy as np 14 | 15 | import _pickle as pickle 16 | from train_model.helper import print_result 17 | 18 | 19 | def parse_args(): 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument("--out", type=str, required=True, help="output file name") 22 | parser.add_argument( 23 | "--res_dirs", 24 | nargs="+", 25 | help="directories for results, NOTE:" 26 | "all *.pkl file under these dirs will be ensembled", 27 | default=None, 28 | ) 29 | argments = parser.parse_args() 30 | 31 | return argments 32 | 33 | 34 | class answer_json: 35 | def __init__(self): 36 | self.answers = [] 37 | 38 | def add(self, ques_id, ans): 39 | res = {"question_id": ques_id, "answer": ans} 40 | self.answers.append(res) 41 | 42 | 43 | if __name__ == "__main__": 44 | 45 | args = parse_args() 46 | result_dirs = args.res_dirs 47 | out_file = args.out 48 | question_ids = None 49 | soft_max_result = None 50 | ans_dic = None 51 | cnt = 0 52 | for res_dir in result_dirs: 53 | for file in glob.glob(res_dir + "/**/*.pkl", recursive=True): 54 | with open(file, "rb") as f: 55 | cnt += 1 56 | sm = pickle.load(f) 57 | if soft_max_result is None: 58 | soft_max_result = sm 59 | question_ids = pickle.load(f) 60 | ans_dic = pickle.load(f) 61 | else: 62 | soft_max_result += sm 63 | 64 | print("ensemble total %d models" % cnt) 65 | 66 | predicted_answers = np.argmax(soft_max_result, axis=1) 67 | 68 | pkl_file = out_file + ".pkl" 69 | 70 | print_result(question_ids, soft_max_result, ans_dic, out_file, False, pkl_file) 71 | 72 | print("Done") 73 | -------------------------------------------------------------------------------- /pythia/legacy/eval_model/eval_demo.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | 9 | import json 10 | import sys 11 | 12 | from eval_model.vqaEval import VQAEval 13 | 14 | 15 | def parse_annotation(anno_file): 16 | with open(anno_file, "r") as f: 17 | annotations = json.load(f)["annotations"] 18 | 19 | q_2_anno = dict([(a["question_id"], a) for a in annotations]) 20 | return q_2_anno 21 | 22 | 23 | def parse_ans(answ_file): 24 | with open(answ_file, "r") as f: 25 | answers = json.load(f) 26 | 27 | q_2_answ = dict([(a["question_id"], a) for a in answers]) 28 | return q_2_answ 29 | 30 | 31 | if __name__ == "__main__": 32 | if len(sys.argv) < 3: 33 | exit( 34 | "USAGE: python eval_model/eval_demo.py \ 35 | annotation_json_file answer_json_file" 36 | ) 37 | 38 | anno_file = sys.argv[1] 39 | answ_file = sys.argv[2] 40 | 41 | q_2_anno = parse_annotation(anno_file) 42 | q_2_answ = parse_ans(answ_file) 43 | 44 | eval = VQAEval(q_2_anno, q_2_answ, 2) 45 | eval.evaluate() 46 | acc = eval.accuracy 47 | print( 48 | "overall: %.2f" % acc["overall"], 49 | "yes/no: %f" % acc["perAnswerType"]["yes/no"], 50 | "number: %.2f" % acc["perAnswerType"]["number"], 51 | "other: %.2f" % acc["perAnswerType"]["other"], 52 | ) 53 | -------------------------------------------------------------------------------- /pythia/legacy/global_variables/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | -------------------------------------------------------------------------------- /pythia/legacy/global_variables/global_variables.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | 9 | import torch 10 | 11 | imdb_version = 1 12 | use_cuda = torch.cuda.is_available() 13 | 14 | model_type_gt = "gt_layout" 15 | model_type_scratch = "scratch" 16 | model_type_gt_rl = "gt+rl" 17 | model_type_top_down_bottom_up = "top_down_bottom_up" 18 | 19 | 20 | topdown_concate_attention = "concate_attention" 21 | topdown_project_attention = "project_attention" 22 | -------------------------------------------------------------------------------- /pythia/legacy/info/code_structure_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guanghuixu/AnchorCaptioner/3a49ce5de025087cbea00075ec0636aee0525382/pythia/legacy/info/code_structure_plot.png -------------------------------------------------------------------------------- /pythia/legacy/info/pythia.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guanghuixu/AnchorCaptioner/3a49ce5de025087cbea00075ec0636aee0525382/pythia/legacy/info/pythia.jpg -------------------------------------------------------------------------------- /pythia/legacy/info/vqa_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guanghuixu/AnchorCaptioner/3a49ce5de025087cbea00075ec0636aee0525382/pythia/legacy/info/vqa_example.png -------------------------------------------------------------------------------- /pythia/legacy/install.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | conda create --name vqa python=3.6 4 | source activate vqa 5 | pip install demjson pyyaml 6 | 7 | pip install http://download.pytorch.org/whl/cu90/torch-0.3.0-cp36-cp36m-linux_x86_64.whl 8 | pip install torchvision 9 | pip install tensorboardX 10 | 11 | 12 | -------------------------------------------------------------------------------- /pythia/legacy/tools/convert_VG_to_COCO_qa.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | 9 | import json 10 | import string 11 | 12 | genome_data_file = "question_answers.json" 13 | genome_questions_file = "v2_OpenEnded_mscoco_genome_questions.json" 14 | genome_annotations_file = "v2_mscoco_genome_annotations.json" 15 | 16 | translator = str.maketrans("", "", string.punctuation) 17 | with open(genome_data_file, "r") as f: 18 | genome_data = json.load(f) 19 | 20 | genome_questions = [] 21 | genome_annotations = [] 22 | 23 | for data in genome_data: 24 | all_qas = data["qas"] 25 | for qas in all_qas: 26 | question = {} 27 | annotation = {} 28 | question["image_id"] = qas["image_id"] 29 | # assume unique question_id for every question answer pair 30 | question["question_id"] = qas["qa_id"] 31 | question["question"] = qas["question"] 32 | genome_questions.append(question) 33 | annotation["image_id"] = qas["image_id"] 34 | annotation["question_id"] = qas["qa_id"] 35 | answertxt = qas["answer"].translate(translator) 36 | answertxt = answertxt.lower() 37 | annotation["multiple_choice_answer"] = answertxt 38 | annotation["answers"] = [] 39 | for i in range(10): 40 | answer = {} 41 | answer["answer"] = answertxt 42 | answer["answer_confifence"] = "yes" 43 | answer["answer_id"] = i + 1 44 | annotation["answers"].append(answer) 45 | genome_annotations.append(annotation) 46 | 47 | genome_data = {} 48 | genome_data["questions"] = genome_questions 49 | 50 | with open(genome_questions_file, "w") as f: 51 | json.dump(genome_data, f) 52 | 53 | genome_data = {} 54 | genome_data["annotations"] = genome_annotations 55 | 56 | with open(genome_annotations_file, "w") as f: 57 | json.dump(genome_data, f) 58 | -------------------------------------------------------------------------------- /pythia/legacy/tools/convert_tsv_feature_to_indiv.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | 9 | import argparse 10 | import base64 11 | import csv 12 | import os 13 | import sys 14 | 15 | import numpy as np 16 | 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument("--infile", type=str, required=True, help="input file") 19 | parser.add_argument("--label", type=str, required=True, help="label for dataset") 20 | parser.add_argument("--out_dir", type=str, required=True, help="imdb output directory") 21 | args = parser.parse_args() 22 | 23 | out_dir = args.out_dir 24 | 25 | 26 | csv.field_size_limit(sys.maxsize) 27 | 28 | FIELDNAMES = ["image_id", "image_w", "image_h", "num_boxes", "boxes", "features"] 29 | infile = args.infile 30 | 31 | label = args.label 32 | 33 | out_dir = os.path.join(out_dir, label) 34 | 35 | os.makedirs(out_dir, exist_ok=True) 36 | 37 | print("reading tsv...") 38 | with open(infile, "r") as tsv_in_file: 39 | reader = csv.DictReader(tsv_in_file, delimiter="\t", fieldnames=FIELDNAMES) 40 | for item in reader: 41 | item["num_boxes"] = int(item["num_boxes"]) 42 | image_id = int(item["image_id"]) 43 | image_w = float(item["image_w"]) 44 | image_h = float(item["image_h"]) 45 | 46 | image_bboxes = np.frombuffer( 47 | base64.b64decode(item["boxes"]), dtype=np.float32 48 | ).reshape((item["num_boxes"], -1)) 49 | 50 | image_feat = np.frombuffer( 51 | base64.b64decode(item["features"]), dtype=np.float32 52 | ).reshape((item["num_boxes"], -1)) 53 | 54 | image_feat_and_boxes = {"image_bboxes": image_bboxes, "image_feat": image_feat} 55 | 56 | image_file_name = os.path.join( 57 | out_dir, "COCO_" + label + "_%012d.npy" % image_id 58 | ) 59 | np.save(image_file_name, image_feat_and_boxes) 60 | -------------------------------------------------------------------------------- /pythia/legacy/tools/eval_ensemble_on_val.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | 9 | import glob 10 | import sys 11 | 12 | import torch 13 | import yaml 14 | from torch.utils.data import DataLoader 15 | 16 | from train_model.dataset_utils import prepare_eval_data_set 17 | from train_model.helper import build_model, run_model 18 | 19 | CONFIG = "config.yaml" 20 | MODELNAME = "best_model.pth" 21 | 22 | if __name__ == "__main__": 23 | if len(sys.argv) < 2: 24 | exit( 25 | "USAGE: python tools/eval_ensemble_on_val.py parent_dir \ 26 | [ensemble sizes]" 27 | ) 28 | 29 | esbl_sizes = [int(a) for a in sys.argv[2:]] 30 | 31 | parent_dir = sys.argv[1] 32 | 33 | model_pths = [ 34 | file for file in glob.glob(parent_dir + "/**/" + MODELNAME, recursive=True) 35 | ] 36 | config_files = [c.replace(MODELNAME, CONFIG) for c in model_pths] 37 | 38 | if len(esbl_sizes) == 0: 39 | esbl_sizes = range(1, len(config_files) + 1) 40 | 41 | config_file = config_files[0] 42 | 43 | with open(config_file, "r") as f: 44 | config = yaml.load(f) 45 | 46 | batch_size = config["data"]["batch_size"] 47 | data_set_test = prepare_eval_data_set( 48 | **config["data"], **config["model"], verbose=True 49 | ) 50 | data_reader_test = DataLoader( 51 | data_set_test, shuffle=False, batch_size=batch_size, num_workers=5 52 | ) 53 | ans_dic = data_set_test.answer_dict 54 | 55 | accumulated_softmax = None 56 | final_result = {} 57 | n_model = 0 58 | for c_file, model_file in zip(config_files, model_pths): 59 | with open(c_file, "r") as f: 60 | config = yaml.load(f) 61 | 62 | myModel = build_model(config, data_set_test) 63 | myModel.load_state_dict(torch.load(model_file)["state_dict"]) 64 | 65 | question_ids, soft_max_result = run_model( 66 | myModel, data_reader_test, ans_dic.UNK_idx 67 | ) 68 | 69 | if n_model == 0: 70 | final_result = soft_max_result 71 | -------------------------------------------------------------------------------- /pythia/legacy/tools/extract_detectron_weights.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | 9 | import os 10 | import pickle 11 | import sys 12 | 13 | if len(sys.argv) < 4: 14 | exit( 15 | "USAGE: python tools/extract_detectron_weights.py \ 16 | weights_file out_dir feat_name [feat_name]" 17 | ) 18 | 19 | wgts_file = sys.argv[1] 20 | out_dir = sys.argv[2] 21 | 22 | with open(wgts_file, "rb") as f: 23 | wgts = pickle.load(f, encoding="latin1")["blobs"] 24 | 25 | for i in range(3, len(sys.argv)): 26 | feat_name = sys.argv[i] 27 | wgt = wgts[feat_name] 28 | out_file = os.path.join(out_dir, feat_name + ".pkl") 29 | with open(out_file, "wb") as w: 30 | pickle.dump(wgt, w) 31 | -------------------------------------------------------------------------------- /pythia/legacy/tools/extract_minival_ids.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | 9 | import json 10 | import pickle 11 | 12 | 13 | def extract_qid_imid(ques_json_file): 14 | with open(ques_json_file, "r") as f: 15 | info = json.load(f) 16 | questions = info["questions"] 17 | 18 | q_im_ids = [] 19 | for q in questions: 20 | im_id = q["image_id"] 21 | q_id = q["question_id"] 22 | q_im_ids.append((im_id, q_id)) 23 | 24 | return q_im_ids 25 | 26 | 27 | if __name__ == "__main__": 28 | minival_ques_file = "v2_OpenEnded_mscoco_minival2014_questions.json" 29 | 30 | val2train_ques_file = "v2_OpenEnded_mscoco_val2train2014_questions.json" 31 | 32 | minival_out_file = "data_prep/vqa_v2.0/minival_ids.pkl" 33 | val2train_out_file = "data_prep/vqa_v2.0/val2train_ids.pkl" 34 | 35 | minival_ids = extract_qid_imid(minival_ques_file) 36 | with open(minival_out_file, "wb") as w1: 37 | pickle.dump(minival_ids, w1) 38 | 39 | val2train_ids = extract_qid_imid(val2train_ques_file) 40 | with open(val2train_out_file, "wb") as w2: 41 | pickle.dump(val2train_ids, w2) 42 | -------------------------------------------------------------------------------- /pythia/legacy/tools/extract_visual_features_vgg_pool5.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | 9 | import argparse 10 | import os 11 | import sys 12 | from glob import glob 13 | 14 | import numpy as np 15 | import torch 16 | import torch.nn as nn 17 | import torchvision.models as models 18 | from torch.autograd import Variable 19 | 20 | import skimage.color 21 | import skimage.io 22 | from global_variables.global_variables import use_cuda 23 | 24 | parser = argparse.ArgumentParser() 25 | parser.add_argument("--gpu_id", type=int, default=0) 26 | parser.add_argument("--data_dir", type=str, required=True) 27 | parser.add_argument("--out_dir", type=str, required=True) 28 | 29 | args = parser.parse_args() 30 | gpu_id = args.gpu_id # set GPU id to use 31 | os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id) 32 | sys.path.append("../../") 33 | 34 | image_basedir = args.data_dir 35 | save_basedir = args.out_dir 36 | 37 | channel_mean = np.array([123.68, 116.779, 103.939], dtype=np.float32) 38 | 39 | 40 | class vgg16_feature_module(nn.Module): 41 | def __init__(self, vgg16_model): 42 | super(vgg16_feature_module, self).__init__() 43 | self.feature_module = nn.Sequential(*list(list(vgg16_model.children())[0])) 44 | 45 | def forward(self, x): 46 | return self.feature_module(x) 47 | 48 | 49 | vgg16 = models.vgg16(pretrained=True) 50 | vgg16_feature = vgg16_feature_module(vgg16) 51 | vgg16_feature = vgg16_feature.cuda() if use_cuda else vgg16_feature 52 | 53 | 54 | def extract_image_pool5(impath): 55 | im = skimage.io.imread(impath)[..., :3] 56 | im_val = im[np.newaxis, ...] 
- channel_mean 57 | 58 | # permute to get NCHW 59 | im_val = np.transpose(im_val, axes=(0, 3, 1, 2)) 60 | im_val_tensor = torch.FloatTensor(im_val) 61 | im_val_variable = Variable(im_val_tensor) 62 | im_val_variable = im_val_variable.cuda() if use_cuda else im_val_variable 63 | 64 | pool5_val = vgg16_feature(im_val_variable) 65 | return pool5_val.data.cpu().numpy() 66 | 67 | 68 | def extract_dataset_pool5(image_dir, save_dir, ext_filter="*.png"): 69 | image_list = glob(image_dir + "/" + ext_filter) 70 | os.makedirs(save_dir, exist_ok=True) 71 | 72 | for n_im, impath in enumerate(image_list): 73 | if (n_im + 1) % 100 == 0: 74 | print("processing %d / %d" % (n_im + 1, len(image_list))) 75 | image_name = os.path.basename(impath).split(".")[0] 76 | save_path = os.path.join(save_dir, image_name + ".npy") 77 | if not os.path.exists(save_path): 78 | pool5_val = extract_image_pool5(impath) 79 | np.save(save_path, pool5_val) 80 | 81 | 82 | for image_set in ["train", "val", "test"]: 83 | print("Extracting image set " + image_set) 84 | extract_dataset_pool5( 85 | os.path.join(image_basedir, image_set), os.path.join(save_basedir, image_set) 86 | ) 87 | print("Done.") 88 | -------------------------------------------------------------------------------- /pythia/legacy/tools/generate_minival_annotation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | 9 | import json 10 | import pickle 11 | 12 | if __name__ == "__main__": 13 | val_annotation_file = "v2_mscoco_val2014_annotations.json" 14 | minival_id_file = "data/vqa_v2.0/minival_ids.pkl" 15 | minival_annotation_file = "v2_mscoco_minival2014_annotations.json" 16 | 17 | with open(minival_id_file, "rb") as f: 18 | q_im_ids = pickle.load(f) 19 | 20 | minival_ids = [x[1] for x in q_im_ids] 21 | 22 | with open(val_annotation_file, "r") as f: 23 | file_info = json.load(f) 24 | annotations = file_info["annotations"] 25 | info = file_info["info"] 26 | data_subtype = file_info["data_subtype"] 27 | license_info = file_info["license"] 28 | 29 | minival_annotations = [a for a in annotations if a["question_id"] in minival_ids] 30 | 31 | minival_info = { 32 | "data_subtype": data_subtype, 33 | "license": license_info, 34 | "info": info, 35 | "annotations": minival_annotations, 36 | } 37 | 38 | with open(minival_annotation_file, "w") as w: 39 | json.dump(minival_info, w) 40 | -------------------------------------------------------------------------------- /pythia/legacy/tools/mirror_images.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | # All paths need to be updated 9 | 10 | import json 11 | import os 12 | from multiprocessing.dummy import Pool as ThreadPool 13 | 14 | from PIL import Image, ImageOps 15 | 16 | split = "val2014" 17 | image_paths = [] 18 | 19 | 20 | def mirror_image(image_path): 21 | img = Image.open(image_path) 22 | mirror_img = ImageOps.mirror(img) 23 | image_name = image_path.split("/")[-1] 24 | fh = "data/" + split 25 | fh = os.path.join(fh, image_name) 26 | mirror_img.save(fh, "JPEG") 27 | 28 | 29 | with open("./COCO/060817/annotations/instances_val2014.json") as f: 30 | data = json.load(f) 31 | for item in data["images"]: 32 | image_id = int(item["id"]) 33 | filepath = os.path.join("val2014/", item["file_name"]) 34 | image_paths.append(filepath) 35 | 36 | pool = ThreadPool(10) 37 | results = pool.map(mirror_image, image_paths) 38 | -------------------------------------------------------------------------------- /pythia/legacy/tools/model_path.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | 9 | ##vgg model from https://github.com/jcjohnson/pytorch-vgg 10 | 11 | 12 | vgg16_caffe2 = "https://s3-us-west-2.amazonaws.com/jcjohns-models/vgg16-00b39a1b.pth" 13 | vgg19_caffe2 = "https://s3-us-west-2.amazonaws.com/jcjohns-models/vgg19-d01eb7cb.pth" 14 | -------------------------------------------------------------------------------- /pythia/legacy/tools/rename_genome_file.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | 9 | import os 10 | import shutil 11 | import sys 12 | 13 | if len(sys.argv) != 3: 14 | exit("Usage: python tools/rename_genome_file.py [inDir] [outDir]") 15 | 16 | inDir = sys.argv[1] 17 | outDir = sys.argv[2] 18 | 19 | OUT_NAME = "COCO_genome_%012d.npy" 20 | 21 | os.makedirs(outDir, exist_ok=True) 22 | 23 | n = 0 24 | print("BEGIN.....") 25 | for file in os.listdir(inDir): 26 | if file.endswith(".npy"): 27 | n += 1 28 | if n % 5000 == 0: 29 | print("process %d files" % n) 30 | image_id = int(file.split(".")[0]) 31 | out_name = OUT_NAME % image_id 32 | in_file = os.path.join(inDir, file) 33 | out_file = os.path.join(outDir, out_name) 34 | shutil.copy(in_file, out_file) 35 | 36 | print("process total %d files" % n) 37 | print("DONE.....") 38 | -------------------------------------------------------------------------------- /pythia/legacy/tools/subset_val.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | 9 | import argparse 10 | import json 11 | import random 12 | 13 | 14 | def parse_args(): 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument("--ques_file", type=str) 17 | pass 18 | 19 | 20 | if __name__ == "__main__": 21 | val_json_file = "v2_OpenEnded_mscoco_val2014_questions.json" 22 | minival_json_file = "v2_OpenEnded_mscoco_minival2014_questions.json" 23 | val_as_train_json_file = "v2_OpenEnded_mscoco_val2train2014_questions.json" 24 | 25 | with open(val_json_file, "r") as f: 26 | file_info = json.load(f) 27 | questions = file_info["questions"] 28 | info = file_info["info"] 29 | task_type = file_info["task_type"] 30 | data_type = file_info["data_type"] 31 | license = file_info["license"] 32 | data_subtype = file_info["info"] 33 | 34 | # collect image_id 35 | image_ids = [] 36 | for q in questions: 37 | image_id = q["image_id"] 38 | image_ids.append(image_id) 39 | 40 | # divide image_ids to two parts 41 | random.shuffle(image_ids) 42 | minival_images = image_ids[:10000] 43 | other_images = image_ids[10000:] 44 | 45 | minival_ques = [] 46 | other_ques = [] 47 | 48 | total_minival = 0 49 | total_others = 0 50 | # seprate quesion_json_file 51 | for q in questions: 52 | image_id = q["image_id"] 53 | 54 | if image_id in minival_images: 55 | minival_ques.append(q) 56 | total_minival += 1 57 | else: 58 | other_ques.append(q) 59 | total_others += 1 60 | 61 | minival_json = { 62 | "info": info, 63 | "task_type": task_type, 64 | "data_type": data_type, 65 | "license": license, 66 | "data_subtype": "minival2014", 67 | "questions": minival_ques, 68 | } 69 | 70 | other_json = { 71 | "info": info, 72 | "task_type": task_type, 73 | "data_type": data_type, 74 | "license": license, 75 | "data_subtype": "val2train2014", 76 | "questions": other_ques, 77 | } 78 | 79 | with open(minival_json_file, "w") as w1: 80 | json.dump(minival_json, w1) 81 | 82 | with open(val_as_train_json_file, "w") as w2: 83 | json.dump(other_json, w2) 84 | 85 | print( 86 | "minival_questions: %d" % total_minival + "other_questions: %d" % total_others 87 | ) 88 | -------------------------------------------------------------------------------- /pythia/legacy/tools/timer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import timeit 3 | 4 | 5 | class Timer: 6 | def __init__(self, unit="s"): 7 | self.s_time = timeit.default_timer() 8 | self.unit = unit 9 | if self.unit != "s" and self.unit != "m" and self.unit != "h": 10 | raise NotImplementedError("unkown time unit, using s, m, h") 11 | 12 | def start(self): 13 | self.s_time = timeit.default_timer() 14 | 15 | def end(self): 16 | self.e_time = timeit.default_timer() 17 | period = self.e_time - self.s_time 18 | if self.unit == "s": 19 | return "%.1f s" % period 20 | elif self.unit == "m": 21 | return "%.2f min" % (period / 60) 22 | else: 23 | return "%.2f h" % (period / 3600) 24 | -------------------------------------------------------------------------------- /pythia/legacy/top_down_bottom_up/image_embedding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | 9 | import pickle 10 | 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | 15 | 16 | """ 17 | parameters: 18 | 19 | input: 20 | image_feat_variable: [batch_size, num_location, image_feat_dim] 21 | or a list of [num_location, image_feat_dim] 22 | when using adaptive number of objects 23 | question_embedding:[batch_size, txt_embeding_dim] 24 | 25 | output: 26 | image_embedding:[batch_size, image_feat_dim] 27 | 28 | 29 | """ 30 | 31 | 32 | class image_embedding(nn.Module): 33 | def __init__(self, image_attention_model): 34 | super(image_embedding, self).__init__() 35 | self.image_attention_model = image_attention_model 36 | self.out_dim = image_attention_model.out_dim 37 | 38 | def forward(self, image_feat_variable, question_embedding, image_dims): 39 | # N x K x n_att 40 | attention = self.image_attention_model( 41 | image_feat_variable, question_embedding, image_dims 42 | ) 43 | att_reshape = attention.permute(0, 2, 1) 44 | tmp_embedding = torch.bmm( 45 | att_reshape, image_feat_variable 46 | ) # N x n_att x image_dim 47 | batch_size = att_reshape.size(0) 48 | image_embedding = tmp_embedding.view(batch_size, -1) 49 | 50 | return image_embedding 51 | 52 | 53 | class image_finetune(nn.Module): 54 | def __init__(self, in_dim, weights_file, bias_file): 55 | super(image_finetune, self).__init__() 56 | with open(weights_file, "rb") as w: 57 | weights = pickle.load(w) 58 | with open(bias_file, "rb") as b: 59 | bias = pickle.load(b) 60 | out_dim = bias.shape[0] 61 | 62 | self.lc = nn.Linear(in_dim, out_dim) 63 | self.lc.weight.data.copy_(torch.from_numpy(weights)) 64 | self.lc.bias.data.copy_(torch.from_numpy(bias)) 65 | self.out_dim = out_dim 66 | 67 | def forward(self, image): 68 | i2 = self.lc(image) 69 | i3 = F.relu(i2) 70 | return i3 71 | -------------------------------------------------------------------------------- /pythia/legacy/top_down_bottom_up/image_feature_encoding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | 9 | import os 10 | import pickle 11 | 12 | import torch 13 | import torch.nn as nn 14 | import torch.nn.functional as F 15 | 16 | from config.config import cfg 17 | 18 | 19 | def build_image_feature_encoding(method, par, in_dim): 20 | if method == "default_image": 21 | return DefaultImageFeature(in_dim) 22 | elif method == "finetune_faster_rcnn_fpn_fc7": 23 | return FinetuneFasterRcnnFpnFc7(in_dim, **par) 24 | else: 25 | raise NotImplementedError("unknown image feature encoding %s" % method) 26 | 27 | 28 | class DefaultImageFeature(nn.Module): 29 | def __init__(self, in_dim): 30 | super(DefaultImageFeature, self).__init__() 31 | self.in_dim = in_dim 32 | self.out_dim = in_dim 33 | 34 | def forward(self, image): 35 | return image 36 | 37 | 38 | class FinetuneFasterRcnnFpnFc7(nn.Module): 39 | def __init__(self, in_dim, weights_file, bias_file): 40 | super(FinetuneFasterRcnnFpnFc7, self).__init__() 41 | if not os.path.isabs(weights_file): 42 | weights_file = os.path.join(cfg.data.data_root_dir, weights_file) 43 | if not os.path.isabs(bias_file): 44 | bias_file = os.path.join(cfg.data.data_root_dir, bias_file) 45 | with open(weights_file, "rb") as w: 46 | weights = pickle.load(w) 47 | with open(bias_file, "rb") as b: 48 | bias = pickle.load(b) 49 | out_dim = bias.shape[0] 50 | 51 | self.lc = nn.Linear(in_dim, out_dim) 52 | self.lc.weight.data.copy_(torch.from_numpy(weights)) 53 | self.lc.bias.data.copy_(torch.from_numpy(bias)) 54 | self.out_dim = out_dim 55 | 56 | def forward(self, image): 57 | i2 = self.lc(image) 58 | i3 = F.relu(i2) 59 | return i3 60 | -------------------------------------------------------------------------------- /pythia/legacy/top_down_bottom_up/intermediate_layer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | 9 | import torch.nn as nn 10 | 11 | 12 | class inter_layer(nn.Module): 13 | def __init__(self, dim, n_layer): 14 | super(inter_layer, self).__init__() 15 | layers = [] 16 | for i in range(n_layer): 17 | layers.append(nn.Linear(dim, dim)) 18 | layers.append(nn.ReLU()) 19 | 20 | self.main = nn.Sequential(*layers) 21 | 22 | def forward(self, x): 23 | return self.main(x) 24 | -------------------------------------------------------------------------------- /pythia/legacy/top_down_bottom_up/nonlinear_layer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | from torch.nn.utils.weight_norm import weight_norm 13 | 14 | 15 | """ 16 | nonlinear_layer: f_a : x \in R^m => y \in R^n 17 | \tilde{y} = tanh(Wx + b) 18 | g = sigmoid(W'x + b') 19 | y = \tilde{y} \circ g 20 | input (N, *, in_dim) 21 | output (N, *, out_dim) 22 | """ 23 | 24 | 25 | class nonlinear_layer_org(nn.Module): 26 | def __init__(self, in_dim, out_dim): 27 | super(nonlinear_layer_org, self).__init__() 28 | self.fc1 = nn.Linear(in_dim, out_dim) 29 | self.gate = nn.Linear(in_dim, out_dim) 30 | 31 | def forward(self, x): 32 | y_tilda = F.tanh(self.fc1(x)) 33 | g = F.sigmoid(self.gate(x)) 34 | y = y_tilda * g 35 | return y 36 | 37 | 38 | class FCNet(nn.Module): 39 | """Simple class for a non-linear fully connected network 40 | """ 41 | 42 | def __init__(self, dims): 43 | super(FCNet, self).__init__() 44 | 45 | layers = [] 46 | for i in range(len(dims) - 2): 47 | in_dim = dims[i] 48 | out_dim = dims[i + 1] 49 | layers.append(weight_norm(nn.Linear(in_dim, out_dim), dim=None)) 50 | layers.append(nn.ReLU()) 51 | layers.append(weight_norm(nn.Linear(dims[-2], dims[-1]), dim=None)) 52 | layers.append(nn.ReLU()) 53 | 54 | self.main = nn.Sequential(*layers) 55 | 56 | def forward(self, x): 57 | return self.main(x) 58 | 59 | 60 | class nonlinear_layer(nn.Module): 61 | """Simple class for a non-linear fully connected network 62 | """ 63 | 64 | def __init__(self, in_dim, out_dim): 65 | super(nonlinear_layer, self).__init__() 66 | 67 | layers = [] 68 | layers.append(weight_norm(nn.Linear(in_dim, out_dim), dim=None)) 69 | layers.append(nn.ReLU()) 70 | 71 | self.main = nn.Sequential(*layers) 72 | 73 | def forward(self, x): 74 | return self.main(x) 75 | -------------------------------------------------------------------------------- /pythia/legacy/top_down_bottom_up/post_combine_transform.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree.
6 | # 7 | 8 | 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | from torch.nn.utils.weight_norm import weight_norm 13 | 14 | 15 | def build_post_combine_transform(method, par, in_dim): 16 | if method == "linear_transform": 17 | return LinearTransform(in_dim, **par) 18 | elif method == "conv_transform": 19 | return ConvTransform(in_dim, **par) 20 | else: 21 | raise NotImplementedError("unknown post combine transform type %s" % method) 22 | 23 | 24 | class LinearTransform(nn.Module): 25 | def __init__(self, in_dim, **kwargs): 26 | super(LinearTransform, self).__init__() 27 | self.lc = weight_norm( 28 | nn.Linear(in_features=in_dim, out_features=kwargs["out_dim"]), dim=None 29 | ) 30 | self.out_dim = kwargs["out_dim"] 31 | 32 | def forward(self, x): 33 | return self.lc(x) 34 | 35 | 36 | class ConvTransform(nn.Module): 37 | def __init__(self, in_dim, **kwargs): 38 | super(ConvTransform, self).__init__() 39 | self.conv1 = nn.Conv2d( 40 | in_channels=in_dim, out_channels=kwargs["hidden_dim"], kernel_size=1 41 | ) 42 | self.conv2 = nn.Conv2d( 43 | in_channels=kwargs["hidden_dim"], 44 | out_channels=kwargs["out_dim"], 45 | kernel_size=1, 46 | ) 47 | self.out_dim = kwargs["out_dim"] 48 | 49 | def forward(self, x): 50 | if len(x.size()) == 3: # N x k x dim 51 | # N x dim x k x 1 52 | x_reshape = torch.unsqueeze(x.permute(0, 2, 1), 3) 53 | elif len(x.size()) == 2: # N x dim 54 | # N x dim x 1 x 1 55 | x_reshape = torch.unsqueeze(torch.unsqueeze(x, 2), 3) 56 | 57 | iatt_conv1 = self.conv1(x_reshape) # N x hidden_dim x * x 1 58 | iatt_relu = F.relu(iatt_conv1) 59 | iatt_conv2 = self.conv2(iatt_relu) # N x out_dim x * x 1 60 | 61 | if len(x.size()) == 3: 62 | iatt_conv3 = torch.squeeze(iatt_conv2, 3).permute(0, 2, 1) 63 | elif len(x.size()) == 2: 64 | iatt_conv3 = torch.squeeze(torch.squeeze(iatt_conv2, 3), 2) 65 | 66 | return iatt_conv3 67 | -------------------------------------------------------------------------------- /pythia/legacy/train_model/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | -------------------------------------------------------------------------------- /pythia/legacy/train_model/eval_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree.
6 | # 7 | 8 | 9 | import os 10 | 11 | import torch 12 | from torch.utils.data import DataLoader 13 | 14 | 15 | def get_final_validation(data_set_val, batch_size, snapshot_dir, eval_model): 16 | final_val_data_reader = DataLoader( 17 | data_set_val, shuffle=False, batch_size=batch_size 18 | ) 19 | 20 | files = [ 21 | os.path.join(snapshot_dir, file) 22 | for file in os.listdir(snapshot_dir) 23 | if file.startswith("model") 24 | ] 25 | 26 | for model_file in sorted(files, key=os.path.getctime, reverse=True): 27 | current_model = torch.load(model_file) 28 | total_sample = 0 29 | total_score = 0 30 | for i, batch in enumerate(final_val_data_reader): 31 | score, n_sample, _ = eval_model(batch, current_model) 32 | total_sample += n_sample 33 | total_score += score 34 | 35 | acc = total_score / total_sample 36 | print(model_file, ": %.6f" % acc) 37 | -------------------------------------------------------------------------------- /pythia/legacy/train_model/evaluate.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | 9 | import argparse 10 | import os 11 | 12 | import yaml 13 | from torch.utils.data import DataLoader 14 | 15 | from train_model.dataset_utils import prepare_eval_data_set 16 | from train_model.Engineer import one_stage_eval_model 17 | from train_model.eval_utils import get_final_validation 18 | from train_model.model_factory import is_one_stageModel 19 | 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument("--config", type=str, required=True, help="config yaml file") 22 | parser.add_argument("--out_dir", type=str, required=True, help="output directory") 23 | args = parser.parse_args() 24 | 25 | config_file = args.config 26 | out_dir = args.out_dir 27 | 28 | with open(config_file, "r") as f: 29 | config = yaml.load(f, Loader=yaml.FullLoader) 30 | 31 | # get the potential shared data_config info 32 | data_root_dir = config["data"]["data_root_dir"] 33 | batch_size = config["data"]["batch_size"] 34 | data_set_val = prepare_eval_data_set(**config["data"], **config["model"]) 35 | data_reader_val = DataLoader(data_set_val, shuffle=False, batch_size=batch_size) 36 | 37 | snapshot_dir = os.path.join(out_dir, config["output"]["exp_name"]) 38 | os.makedirs(snapshot_dir, exist_ok=True) 39 | 40 | model_type = config["model"]["model_type"] 41 | if is_one_stageModel(model_type): 42 | get_final_validation(data_set_val, batch_size, snapshot_dir, one_stage_eval_model) 43 | else: 44 | pass 45 | -------------------------------------------------------------------------------- /pythia/legacy/train_model/helper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree.
6 | # 7 | 8 | import json 9 | import sys 10 | import timeit 11 | 12 | import numpy as np 13 | 14 | import _pickle as pickle 15 | from train_model.Engineer import masked_unk_softmax, one_stage_run_model 16 | from train_model.model_factory import prepare_model 17 | 18 | 19 | class answer_json: 20 | def __init__(self): 21 | self.answers = [] 22 | 23 | def add(self, ques_id, ans): 24 | res = {"question_id": ques_id, "answer": ans} 25 | self.answers.append(res) 26 | 27 | 28 | def build_model(config, dataset): 29 | num_vocab_txt = dataset.vocab_dict.num_vocab 30 | num_choices = dataset.answer_dict.num_vocab 31 | 32 | num_image_feat = len(config["data"]["image_feat_train"][0].split(",")) 33 | my_model = prepare_model( 34 | num_vocab_txt, num_choices, **config["model"], num_image_feat=num_image_feat 35 | ) 36 | return my_model 37 | 38 | 39 | def run_model(current_model, data_reader, UNK_idx=0): 40 | softmax_tot = [] 41 | q_id_tot = [] 42 | 43 | start = timeit.default_timer() 44 | for i, batch in enumerate(data_reader): 45 | if (i + 1) % 100 == 0: 46 | end = timeit.default_timer() 47 | time = end - start 48 | start = timeit.default_timer() 49 | print(" process batch %d for test for %.1f s" % (i + 1, time)) 50 | sys.stdout.flush() 51 | 52 | verbose_info = batch["verbose_info"] 53 | q_ids = verbose_info["question_id"].cpu().numpy().tolist() 54 | logit_res = one_stage_run_model(batch, current_model, eval_mode=True) 55 | softmax_res = masked_unk_softmax(logit_res, dim=1, mask_idx=UNK_idx) 56 | softmax_res = softmax_res.data.cpu().numpy().astype(np.float16) 57 | q_id_tot += q_ids 58 | softmax_tot.append(softmax_res) 59 | softmax_result = np.vstack(softmax_tot) 60 | 61 | return q_id_tot, softmax_result 62 | 63 | 64 | def print_result( 65 | question_ids, soft_max_result, ans_dic, out_file, json_only=True, pkl_res_file=None 66 | ): 67 | predicted_answers = np.argmax(soft_max_result, axis=1) 68 | 69 | if not json_only: 70 | with open(pkl_res_file, "wb") as writeFile: 71 | pickle.dump(soft_max_result, writeFile) 72 | pickle.dump(question_ids, writeFile) 73 | pickle.dump(ans_dic, writeFile) 74 | 75 | ans_json_out = answer_json() 76 | for idx, pred_idx in enumerate(predicted_answers): 77 | question_id = question_ids[idx] 78 | pred_ans = ans_dic.idx2word(pred_idx) 79 | ans_json_out.add(question_id, pred_ans) 80 | 81 | with open(out_file, "w") as f: 82 | json.dump(ans_json_out.answers, f) 83 | -------------------------------------------------------------------------------- /pythia/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | __all__ = ["TopDownBottomUp", "Pythia", "LoRRA", "BAN"] 3 | 4 | from .top_down_bottom_up import TopDownBottomUp 5 | from .ban import BAN 6 | from .pythia import Pythia 7 | from .lorra import LoRRA 8 | -------------------------------------------------------------------------------- /pythia/models/lorra.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | import torch 3 | 4 | from pythia.common.registry import registry 5 | from pythia.models.pythia import Pythia 6 | from pythia.modules.layers import ClassifierLayer 7 | 8 | 9 | @registry.register_model("lorra") 10 | class LoRRA(Pythia): 11 | def __init__(self, config): 12 | super().__init__(config) 13 | 14 | def build(self): 15 | self._init_text_embeddings("text") 16 | # For LoRRA, context feature and text embeddings would be identity, 17 | # but to keep a unified API we initialize them as well. They need to 18 | # be built before Pythia's other modules, since some of those modules 19 | # require the context attributes to be set 20 | self._init_text_embeddings("context") 21 | self._init_feature_encoders("context") 22 | self._init_feature_embeddings("context") 23 | super().build() 24 | 25 | def get_optimizer_parameters(self, config): 26 | params = super().get_optimizer_parameters(config) 27 | params += [ 28 | {"params": self.context_feature_embeddings_list.parameters()}, 29 | {"params": self.context_embeddings.parameters()}, 30 | {"params": self.context_feature_encoders.parameters()}, 31 | ] 32 | 33 | return params 34 | 35 | def _get_classifier_input_dim(self): 36 | # Now, the classifier's input will be cat of image and context based 37 | # features 38 | return 2 * super()._get_classifier_input_dim() 39 | 40 | def forward(self, sample_list): 41 | sample_list.text = self.word_embedding(sample_list.text) 42 | text_embedding_total = self.process_text_embedding(sample_list) 43 | 44 | image_embedding_total, _ = self.process_feature_embedding( 45 | "image", sample_list, text_embedding_total 46 | ) 47 | 48 | context_embedding_total, _ = self.process_feature_embedding( 49 | "context", sample_list, text_embedding_total, ["order_vectors"] 50 | ) 51 | 52 | if self.inter_model is not None: 53 | image_embedding_total = self.inter_model(image_embedding_total) 54 | 55 | joint_embedding = self.combine_embeddings( 56 | ["image", "text"], 57 | [image_embedding_total, text_embedding_total, context_embedding_total], 58 | ) 59 | 60 | scores = self.calculate_logits(joint_embedding) 61 | 62 | return {"scores": scores} 63 | -------------------------------------------------------------------------------- /pythia/models/m4c_captioner.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from pythia.common.registry import registry 3 | from pythia.models.m4c import M4C 4 | 5 | 6 | @registry.register_model("m4c_captioner") 7 | class M4CCaptioner(M4C): 8 | def __init__(self, config): 9 | super().__init__(config) 10 | self.remove_unk_in_pred = self.config.remove_unk_in_pred 11 | 12 | def _forward_output(self, sample_list, fwd_results): 13 | super()._forward_output(sample_list, fwd_results) 14 | 15 | if (not self.training) and self.remove_unk_in_pred: 16 | # avoid outputting <unk> in the generated captions 17 | fwd_results["scores"][..., self.answer_processor.UNK_IDX] = -1e10 18 | 19 | return fwd_results 20 | -------------------------------------------------------------------------------- /pythia/models/top_down_bottom_up.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | import torch 3 | from torch import nn 4 | 5 | from pythia.common.registry import registry 6 | from pythia.models.base_model import BaseModel 7 | from pythia.modules.embeddings import (ImageEmbedding, PreExtractedEmbedding, 8 | TextEmbedding) 9 | from pythia.modules.encoders import ImageEncoder 10 | from pythia.modules.layers import (ClassifierLayer, Identity, 11 | ModalCombineLayer, ReLUWithWeightNormFC) 12 | 13 | 14 | # Note: Doesn't work currently. Needs to be migrated to new API 15 | @registry.register_model("top_down_bottom_up") 16 | class TopDownBottomUp(BaseModel): 17 | def __init__(self, image_attention_model, text_embedding_models, classifier): 18 | super().__init__() 19 | self.image_attention_model = image_attention_model 20 | self.text_embedding_models = text_embedding_models 21 | self.classifier = classifier 22 | text_lstm_dim = sum([q.text_out_dim for q in text_embedding_models]) 23 | joint_embedding_out_dim = classifier.input_dim 24 | image_feat_dim = image_attention_model.image_feat_dim 25 | self.non_linear_text = ReLUWithWeightNormFC( 26 | text_lstm_dim, joint_embedding_out_dim 27 | ) 28 | self.non_linear_image = ReLUWithWeightNormFC( 29 | image_feat_dim, joint_embedding_out_dim 30 | ) 31 | 32 | def build(self): 33 | return 34 | 35 | def forward( 36 | self, image_feat_variable, input_text_variable, input_answers=None, **kwargs 37 | ): 38 | text_embeddings = [] 39 | for q_model in self.text_embedding_models: 40 | q_embedding = q_model(input_text_variable) 41 | text_embeddings.append(q_embedding) 42 | text_embedding = torch.cat(text_embeddings, dim=1) 43 | 44 | if isinstance(image_feat_variable, list): 45 | image_embeddings = [] 46 | for idx, image_feat in enumerate(image_feat_variable): 47 | ques_embedding_each = torch.unsqueeze(text_embedding[idx, :], 0) 48 | image_feat_each = torch.unsqueeze(image_feat, dim=0) 49 | attention_each = self.image_attention_model( 50 | image_feat_each, ques_embedding_each 51 | ) 52 | image_embedding_each = torch.sum(attention_each * image_feat, dim=1) 53 | image_embeddings.append(image_embedding_each) 54 | image_embedding = torch.cat(image_embeddings, dim=0) 55 | else: 56 | attention = self.image_attention_model(image_feat_variable, text_embedding) 57 | image_embedding = torch.sum(attention * image_feat_variable, dim=1) 58 | 59 | joint_embedding = self.non_linear_text(text_embedding) * self.non_linear_image( 60 | image_embedding 61 | ) 62 | logit_res = self.classifier(joint_embedding) 63 | 64 | return logit_res 65 | -------------------------------------------------------------------------------- /pythia/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /pythia/modules/decoders.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | import torch 3 | from torch import nn 4 | from torch.nn.utils.weight_norm import weight_norm 5 | from pythia.common.registry import registry 6 | 7 | 8 | class VisDialDiscriminator(nn.Module): 9 | def __init__(self, config, embedding): 10 | super(VisDialDiscriminator, self).__init__() 11 | self.config = config 12 | self.embedding = embedding 13 | 14 | self.emb_out_dim = embedding.text_out_dim 15 | self.hidden_dim = self.config["hidden_dim"] 16 | 17 | self.projection_layer = nn.Linear(self.emb_out_dim, self.hidden_dim) 18 | 19 | def forward(self, encoder_output, batch): 20 | answer_options_len = batch["answer_options_len"] 21 | 22 | # BATCH_SIZE X DIALOGUES X 100 X SEQ_LEN 23 | answer_options = batch["answer_options"] 24 | 25 | max_seq_len = answer_options.size(-1) 26 | 27 | batch_size, ndialogues, noptions, seq_len = answer_options.size() 28 | 29 | # (B X D X 100) X SEQ_LEN 30 | answer_options = answer_options.view(-1, max_seq_len) 31 | answer_options_len = answer_options_len.view(-1) 32 | 33 | # (B x D x 100) x EMB_OUT_DIM 34 | answer_options = self.embedding(answer_options) 35 | 36 | # (B x D x 100) x HIDDEN_DIM 37 | answer_options = self.projection_layer(answer_options) 38 | 39 | # (B x D) x 100 x HIDDEN_DIM 40 | answer_options = answer_options.view( 41 | batch_size * ndialogues, noptions, self.hidden_dim 42 | ) 43 | 44 | # (B x D) x HIDDEN_DIM => (B x D) x 100 x HIDDEN_DIM 45 | encoder_output = encoder_output.unsqueeze(1).expand(-1, noptions, -1) 46 | 47 | # (B x D) x 100 x HIDDEN_DIM * (B x D) x 100 x HIDDEN_DIM = SAME THING 48 | # SUM => (B x D) x 100 49 | scores = torch.sum(answer_options * encoder_output, dim=2) 50 | 51 | return scores 52 | 53 | 54 | class LanguageDecoder(nn.Module): 55 | def __init__(self, in_dim, out_dim, **kwargs): 56 | super().__init__() 57 | 58 | self.language_lstm = nn.LSTMCell( 59 | in_dim + kwargs["hidden_dim"], kwargs["hidden_dim"], bias=True 60 | ) 61 | self.fc = weight_norm(nn.Linear(kwargs["hidden_dim"], out_dim)) 62 | self.dropout = nn.Dropout(p=kwargs["dropout"]) 63 | self.init_weights(kwargs["fc_bias_init"]) 64 | 65 | def init_weights(self, fc_bias_init): 66 | self.fc.bias.data.fill_(fc_bias_init) 67 | self.fc.weight.data.uniform_(-0.1, 0.1) 68 | 69 | def forward(self, weighted_attn): 70 | # Get LSTM state 71 | state = registry.get("{}_lstm_state".format(weighted_attn.device)) 72 | h1, c1 = state["td_hidden"] 73 | h2, c2 = state["lm_hidden"] 74 | 75 | # Language LSTM 76 | h2, c2 = self.language_lstm(torch.cat([weighted_attn, h1], dim=1), (h2, c2)) 77 | predictions = self.fc(self.dropout(h2)) 78 | 79 | # Update hidden state for t+1 80 | state["lm_hidden"] = (h2, c2) 81 | 82 | return predictions 83 | -------------------------------------------------------------------------------- /pythia/modules/encoders.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | import os 3 | import pickle 4 | 5 | import torch 6 | from torch import nn 7 | 8 | from pythia.modules.layers import Identity 9 | from pythia.utils.general import get_pythia_root 10 | 11 | 12 | class ImageEncoder(nn.Module): 13 | def __init__(self, encoder_type, in_dim, **kwargs): 14 | super(ImageEncoder, self).__init__() 15 | 16 | if encoder_type == "default": 17 | self.module = Identity() 18 | self.module.in_dim = in_dim 19 | self.module.out_dim = in_dim 20 | elif encoder_type == "finetune_faster_rcnn_fpn_fc7": 21 | self.module = FinetuneFasterRcnnFpnFc7(in_dim, **kwargs) 22 | else: 23 | raise NotImplementedError("Unknown Image Encoder: %s" % encoder_type) 24 | 25 | self.out_dim = self.module.out_dim 26 | 27 | def forward(self, *args, **kwargs): 28 | return self.module(*args, **kwargs) 29 | 30 | 31 | class FinetuneFasterRcnnFpnFc7(nn.Module): 32 | def __init__(self, in_dim, weights_file, bias_file, model_data_dir): 33 | super(FinetuneFasterRcnnFpnFc7, self).__init__() 34 | pythia_root = get_pythia_root() 35 | model_data_dir = os.path.join(pythia_root, model_data_dir) 36 | 37 | if not os.path.isabs(weights_file): 38 | weights_file = os.path.join(model_data_dir, weights_file) 39 | if not os.path.isabs(bias_file): 40 | bias_file = os.path.join(model_data_dir, bias_file) 41 | with open(weights_file, "rb") as w: 42 | weights = pickle.load(w) 43 | with open(bias_file, "rb") as b: 44 | bias = pickle.load(b) 45 | out_dim = bias.shape[0] 46 | 47 | self.lc = nn.Linear(in_dim, out_dim) 48 | self.lc.weight.data.copy_(torch.from_numpy(weights)) 49 | self.lc.bias.data.copy_(torch.from_numpy(bias)) 50 | self.out_dim = out_dim 51 | 52 | def forward(self, image): 53 | i2 = self.lc(image) 54 | i3 = nn.functional.relu(i2) 55 | return i3 56 | -------------------------------------------------------------------------------- /pythia/scripts/features/extract_features.md: -------------------------------------------------------------------------------- 1 | ## Extract Image Features 2 | 3 | We use Detectron to extract image features. Set up [Detectron](https://github.com/facebookresearch/Detectron) 4 | and copy [tools/extract_features.py](tools/extract_features.py) into the Detectron repository to extract features. 5 | 6 | 7 | Feature extraction works best with commit #3a38b7b of [Detectron](https://github.com/facebookresearch/Detectron) 8 | and #0dd3284 of [caffe2](https://github.com/caffe2/caffe2). 9 | 10 | 11 | Download the pretrained Detectron model: 12 | ```bash 13 | wget https://dl.fbaipublicfiles.com/pythia/detectron_model/FAST_RCNN_MLP_DIM2048_FPN_DIM512.pkl 14 | wget https://dl.fbaipublicfiles.com/pythia/detectron_model/e2e_faster_rcnn_X-101-64x4d-FPN_1x_MLP_2048_FPN_512.yaml 15 | 16 | INPUT_DIR=/path/to/your/input  # an image file or a directory of images 17 | 18 | python extract_features.py --cfg e2e_faster_rcnn_X-101-64x4d-FPN_1x_MLP_2048_FPN_512.yaml \ 19 | --wts FAST_RCNN_MLP_DIM2048_FPN_DIM512.pkl \ 20 | --min_bboxes 100 --max_bboxes 100 \ 21 | --feat_name gpu_0/fc6 \ 22 | --output_dir ~/temp_out $INPUT_DIR 23 | ``` -------------------------------------------------------------------------------- /pythia/scripts/gqa/README.md: -------------------------------------------------------------------------------- 1 | # Conversion of GQA to VQA format 2 | 3 | * Download the GQA datasets and store them in the format expected by the conversion script 4 | * Download the 300D GloVe embeddings file 5 | * Run the script from the root of the repo after changing the relevant paths: 6 | 7 | ``` 8 | PYTHONPATH=.
python ./pythia/scripts/gqa/convert_gqa_to_vqa.py \ 9 | --gqa_dir /checkpoint/meetshah/datasets/gqa/ \ 10 | --out_dir /checkpoint/meetshah/datasets/gqa_pp/ 11 | ``` 12 | -------------------------------------------------------------------------------- /pythia/trainers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | __all__ = ["BaseTrainer"] 3 | 4 | from .base_trainer import BaseTrainer 5 | -------------------------------------------------------------------------------- /pythia/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /pythia/utils/build_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import torch 3 | import warnings 4 | 5 | from pythia.utils.configuration import Configuration 6 | from pythia.common.registry import registry 7 | from pythia.utils.general import get_optimizer_parameters 8 | 9 | 10 | def build_trainer(args, *rest, **kwargs): 11 | configuration = Configuration(args.config) 12 | 13 | # Update with the config override if passed 14 | configuration.override_with_cmd_config(args.config_override) 15 | 16 | # Now, update with opts args that were passed 17 | configuration.override_with_cmd_opts(args.opts) 18 | 19 | # Finally, update with args that were specifically passed 20 | # as arguments 21 | configuration.update_with_args(args) 22 | configuration.freeze() 23 | 24 | config = configuration.get_config() 25 | registry.register("config", config) 26 | registry.register("configuration", configuration) 27 | 28 | trainer_type = config.training_parameters.trainer 29 | trainer_cls = registry.get_trainer_class(trainer_type) 30 | trainer_obj = trainer_cls(config) 31 | 32 | # Set args as an attribute for future use 33 | setattr(trainer_obj, 'args', args) 34 | 35 | return trainer_obj 36 | 37 | 38 | def build_model(config): 39 | model_name = config.model 40 | 41 | model_class = registry.get_model_class(model_name) 42 | 43 | if model_class is None: 44 | raise ValueError("No model registered for name: %s" % model_name) 45 | model = model_class(config) 46 | 47 | if hasattr(model, "build"): 48 | model.build() 49 | model.init_losses_and_metrics() 50 | 51 | return model 52 | 53 | 54 | def build_optimizer(model, config): 55 | optimizer_config = config.optimizer_attributes 56 | if not hasattr(optimizer_config, "type"): 57 | raise ValueError( 58 | "Optimizer attributes must have a 'type' key " 59 | "specifying the type of optimizer. " 60 | "(Custom or PyTorch)" 61 | ) 62 | optimizer_type = optimizer_config.type 63 | 64 | if not hasattr(optimizer_config, "params"): 65 | warnings.warn( 66 | "optimizer attributes has no params defined, defaulting to {}."
67 | ) 68 | 69 | params = getattr(optimizer_config, "params", {}) 70 | 71 | if hasattr(torch.optim, optimizer_type): 72 | optimizer_class = getattr(torch.optim, optimizer_type) 73 | else: 74 | optimizer_class = registry.get_optimizer_class(optimizer_type) 75 | if optimizer_class is None: 76 | raise ValueError( 77 | "No optimizer class of type {} present in " 78 | "either torch or registered to registry".format(optimizer_type) 79 | ) 80 | 81 | parameters = get_optimizer_parameters(model, config) 82 | optimizer = optimizer_class(parameters, **params) 83 | return optimizer 84 | -------------------------------------------------------------------------------- /pythia/utils/dataset_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import torch 3 | 4 | from pythia.common.sample import Sample 5 | 6 | 7 | def build_bbox_tensors(infos, max_length): 8 | num_bbox = min(max_length, len(infos)) 9 | 10 | # After num_bbox, everything else should be zero 11 | coord_tensor = torch.zeros((max_length, 4), dtype=torch.float) 12 | width_tensor = torch.zeros(max_length, dtype=torch.float) 13 | height_tensor = torch.zeros(max_length, dtype=torch.float) 14 | bbox_types = ["xyxy"] * max_length 15 | 16 | infos = infos[:num_bbox] 17 | sample = Sample() 18 | 19 | for idx, info in enumerate(infos): 20 | bbox = info["bounding_box"] 21 | x = bbox["top_left_x"] 22 | y = bbox["top_left_y"] 23 | width = bbox["width"] 24 | height = bbox["height"] 25 | 26 | coord_tensor[idx][0] = x 27 | coord_tensor[idx][1] = y 28 | coord_tensor[idx][2] = x + width 29 | coord_tensor[idx][3] = y + height 30 | 31 | width_tensor[idx] = width 32 | height_tensor[idx] = height 33 | sample.coordinates = coord_tensor 34 | sample.width = width_tensor 35 | sample.height = height_tensor 36 | sample.bbox_types = bbox_types 37 | 38 | return sample 39 | -------------------------------------------------------------------------------- /pythia/utils/objects_to_byte_tensor.py: -------------------------------------------------------------------------------- 1 | 2 | # Adapted from 3 | # https://github.com/pytorch/fairseq/blob/master/fairseq/distributed_utils.py 4 | 5 | import pickle 6 | import torch 7 | 8 | MAX_SIZE_LIMIT = 65533 9 | BYTE_SIZE = 256 10 | 11 | 12 | def enc_obj2bytes(obj, max_size=4094): 13 | """ 14 | Encode Python objects to PyTorch byte tensors 15 | """ 16 | assert max_size <= MAX_SIZE_LIMIT 17 | byte_tensor = torch.zeros(max_size, dtype=torch.uint8) 18 | 19 | obj_enc = pickle.dumps(obj) 20 | obj_size = len(obj_enc) 21 | if obj_size > max_size: 22 | raise Exception( 23 | 'objects too large: object size {}, max size {}'.format( 24 | obj_size, max_size 25 | ) 26 | ) 27 | 28 | byte_tensor[0] = obj_size // 256 29 | byte_tensor[1] = obj_size % 256 30 | byte_tensor[2:2+obj_size] = torch.ByteTensor(list(obj_enc)) 31 | return byte_tensor 32 | 33 | 34 | def dec_bytes2obj(byte_tensor, max_size=4094): 35 | """ 36 | Decode PyTorch byte tensors to Python objects 37 | """ 38 | assert max_size <= MAX_SIZE_LIMIT 39 | 40 | obj_size = byte_tensor[0].item() * 256 + byte_tensor[1].item() 41 | obj_enc = bytes(byte_tensor[2:2+obj_size].tolist()) 42 | obj = pickle.loads(obj_enc) 43 | return obj 44 | 45 | 46 | if __name__ == '__main__': 47 | test_obj = [1, '2', {3: 4}, [5]] 48 | test_obj_bytes = enc_obj2bytes(test_obj) 49 | test_obj_dec = dec_bytes2obj(test_obj_bytes) 50 | print(test_obj_dec == test_obj) 51 | --------------------------------------------------------------------------------
/pythia/utils/phoc/__init__.py: -------------------------------------------------------------------------------- 1 | from .build_phoc import build_phoc # NoQA 2 | -------------------------------------------------------------------------------- /pythia/utils/phoc/build_phoc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .cphoc import build_phoc as _build_phoc_raw 4 | 5 | 6 | _alphabet = {"a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z","0","1","2","3","4","5","6","7","8","9"} # NoQA 7 | 8 | 9 | def build_phoc(token): 10 | token = token.lower().strip() 11 | token = ''.join([c for c in token if c in _alphabet]) 12 | phoc = _build_phoc_raw(token) 13 | phoc = np.array(phoc, dtype=np.float32) 14 | return phoc 15 | -------------------------------------------------------------------------------- /pythia/utils/timer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import time 3 | 4 | 5 | class Timer: 6 | DEFAULT_TIME_FORMAT_DATE_TIME = "%Y/%m/%d %H:%M:%S" 7 | DEFAULT_TIME_FORMAT = ["%03dms", "%02ds", "%02dm", "%02dh"] 8 | 9 | def __init__(self): 10 | self.start = time.time() * 1000 11 | 12 | def get_current(self): 13 | return self.get_time_hhmmss(self.start) 14 | 15 | def reset(self): 16 | self.start = time.time() * 1000 17 | 18 | def get_time_since_start(self, format=None): 19 | return self.get_time_hhmmss(self.start, format) 20 | 21 | def get_time_hhmmss(self, start=None, end=None, gap=None, format=None): 22 | """ 23 | Calculates time since `start` and formats as a string. 24 | """ 25 | if start is None and gap is None: 26 | 27 | if format is None: 28 | format = self.DEFAULT_TIME_FORMAT_DATE_TIME 29 | 30 | return time.strftime(format) 31 | 32 | if end is None: 33 | end = time.time() * 1000 34 | if gap is None: 35 | gap = end - start 36 | 37 | s, ms = divmod(gap, 1000) 38 | m, s = divmod(s, 60) 39 | h, m = divmod(m, 60) 40 | 41 | if format is None: 42 | format = self.DEFAULT_TIME_FORMAT 43 | 44 | items = [ms, s, m, h] 45 | assert len(items) == len(format), "Format length should be same as items" 46 | 47 | time_str = "" 48 | for idx, item in enumerate(items): 49 | if item != 0: 50 | time_str = format[idx] % item + " " + time_str 51 | 52 | return time_str.strip() 53 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=1.2 2 | torchvision>0.2 3 | tensorboardX>=1.2 4 | numpy>=1.14 5 | tqdm>=4.19 6 | demjson>=2.2 7 | torchtext>=0.2 8 | GitPython>=2.1 9 | PyYAML>=3.11 10 | pytest==5.2.0 11 | requests==2.21.0 12 | fastText 13 | nltk==3.4.1 14 | pytorch-transformers==1.2.0 15 | editdistance 16 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=$1 python -m torch.distributed.launch --nproc_per_node $2 --master_port $4 tools/run.py --tasks captioning --datasets m4c_textcaps --model m4c_captioner \ 2 | --config configs/captioning/m4c_textcaps/m4c_captioner.yml \ 3 | --save_dir save/$3 --resume_file save/$3/m4c_textcaps_m4c_captioner_2021/best.ckpt \ 4 | training_parameters.distributed True 5 | -------------------------------------------------------------------------------- /setup.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright (c) Facebook, Inc. and its affiliates. 4 | import os.path 5 | import shutil 6 | from glob import glob 7 | import sys 8 | 9 | import setuptools 10 | from setuptools import Extension 11 | from setuptools.command.build_ext import build_ext 12 | 13 | sys.path.insert(0, os.path.join(os.path.dirname(__file__), "pythia")) 14 | 15 | with open("README.md", encoding="utf8") as f: 16 | readme = f.read() 17 | 18 | with open("LICENSE") as f: 19 | license = f.read() 20 | 21 | with open("requirements.txt") as f: 22 | reqs = f.read() 23 | 24 | DISTNAME = "pythia" 25 | DESCRIPTION = "pythia: a modular framework for vision and language multimodal \ 26 | research." 27 | LONG_DESCRIPTION = readme 28 | AUTHOR = "Facebook AI Research" 29 | LICENSE = license 30 | REQUIREMENTS = (reqs.strip().split("\n"),) 31 | 32 | ext_modules = [ 33 | Extension( 34 | 'cphoc', 35 | sources=['pythia/utils/phoc/src/cphoc.c'], 36 | language='c', 37 | libraries=["pthread", "dl", "util", "rt", "m"], 38 | extra_compile_args=["-O3"], 39 | ), 40 | ] 41 | 42 | 43 | class BuildExt(build_ext): 44 | def run(self): 45 | build_ext.run(self) 46 | cphoc_lib = glob('build/lib.*/cphoc.*.so')[0] 47 | shutil.copy(cphoc_lib, 'pythia/utils/phoc/cphoc.so') 48 | 49 | 50 | if __name__ == "__main__": 51 | setuptools.setup( 52 | name=DISTNAME, 53 | install_requires=REQUIREMENTS, 54 | packages=setuptools.find_packages(), 55 | ext_modules=ext_modules, 56 | cmdclass={'build_ext': BuildExt}, 57 | version="0.3", 58 | description=DESCRIPTION, 59 | long_description=LONG_DESCRIPTION, 60 | author=AUTHOR, 61 | license=LICENSE, 62 | setup_requires=["pytest-runner"], 63 | tests_require=["flake8", "pytest"], 64 | ) 65 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /tests/data/vocab.txt: -------------------------------------------------------------------------------- 1 | a 2 | man 3 | with 4 | red 5 | helmet 6 | on 7 | small 8 | moped 9 | dirt 10 | road 11 | riding 12 | motor 13 | bike 14 | the 15 | countryside 16 | back 17 | of 18 | motorcycle -------------------------------------------------------------------------------- /tests/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guanghuixu/AnchorCaptioner/3a49ce5de025087cbea00075ec0636aee0525382/tests/modules/__init__.py -------------------------------------------------------------------------------- /tests/modules/test_layers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | import unittest 3 | 4 | import torch 5 | import random 6 | import operator 7 | import functools 8 | import numpy as np 9 | 10 | import pythia.modules.layers as layers 11 | 12 | 13 | class TestModuleLayers(unittest.TestCase): 14 | def setUp(self): 15 | torch.manual_seed(1234) 16 | 17 | def test_conv_net(self): 18 | conv_net = layers.ConvNet(150, 75, 3) 19 | 20 | input_tensor = torch.randn(4, 150, 64, 64) 21 | output = conv_net(input_tensor) 22 | expected_size = torch.Size((4, 75, 32, 32)) 23 | self.assertEqual(output.size(), expected_size) 24 | # Since seed is fix we can check some of tensor values 25 | np.testing.assert_almost_equal(output[0][0][0][0].item(), 0.149190, decimal=5) 26 | np.testing.assert_almost_equal(output[3][74][31][31].item(), -0.25199, decimal=5) 27 | 28 | 29 | def test_flatten(self): 30 | flatten = layers.Flatten() 31 | 32 | # Test 3 dim 33 | input_tensor = torch.randn(5, 6, 10) 34 | expected_size = torch.Size((5, 60)) 35 | actual_size = flatten(input_tensor).size() 36 | self.assertEqual(actual_size, expected_size) 37 | 38 | # Test 1 dim 39 | input_tensor = torch.randn(5) 40 | expected_size = torch.Size((5,)) 41 | actual_size = flatten(input_tensor).size() 42 | self.assertEqual(actual_size, expected_size) 43 | 44 | # Test 6 dim 45 | size_list = [random.randint(2, 4) for _ in range(7)] 46 | expected_size = torch.Size((size_list[0], functools.reduce(operator.mul, size_list[1:]))) 47 | input_tensor = torch.randn(*size_list) 48 | actual_size = flatten(input_tensor).size() 49 | self.assertEqual(actual_size, expected_size) 50 | 51 | def test_unflatten(self): 52 | unflatten = layers.UnFlatten() 53 | 54 | # Test 2 dim to 3 dim 55 | input_tensor = torch.randn(5, 60) 56 | expected_size = torch.Size((5, 6, 10)) 57 | actual_size = unflatten(input_tensor, sizes=[6, 10]).size() 58 | self.assertEqual(actual_size, expected_size) 59 | 60 | # Test 1 dim 61 | input_tensor = torch.randn(5) 62 | expected_size = torch.Size((5,)) 63 | actual_size = unflatten(input_tensor, sizes=[]).size() 64 | self.assertEqual(expected_size, actual_size) 65 | -------------------------------------------------------------------------------- /tests/modules/test_losses.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import unittest 3 | 4 | import pythia.modules.losses as losses 5 | import torch 6 | 7 | 8 | class TestModuleLosses(unittest.TestCase): 9 | def test_caption_cross_entropy(self): 10 | caption_ce_loss = losses.CaptionCrossEntropyLoss() 11 | 12 | expected = dict() 13 | predicted = dict() 14 | 15 | # Test complete match 16 | expected["targets"] = torch.empty((1, 10), dtype=torch.long) 17 | expected["targets"].fill_(4) 18 | predicted["scores"] = torch.zeros((1, 10, 10)) 19 | predicted["scores"][:, :, 4] = 100.0 20 | 21 | self.assertEqual(caption_ce_loss(expected, predicted).item(), 0.0) 22 | 23 | # Test random initialized 24 | torch.manual_seed(1234) 25 | expected["targets"] = torch.randint(0, 9491, (5, 10)) 26 | predicted["scores"] = torch.rand((5, 10, 9491)) 27 | 28 | self.assertAlmostEqual(caption_ce_loss(expected, predicted).item(), 9.2507, 4) 29 | -------------------------------------------------------------------------------- /tests/modules/test_metrics.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | import os 3 | import unittest 4 | 5 | import yaml 6 | 7 | import pythia.modules.metrics as metrics 8 | import torch 9 | from pythia.common.registry import registry 10 | from pythia.common.sample import Sample 11 | from pythia.datasets.processors import CaptionProcessor 12 | from pythia.utils.configuration import ConfigNode 13 | 14 | 15 | class TestModuleMetrics(unittest.TestCase): 16 | def test_caption_bleu4(self): 17 | path = os.path.join( 18 | os.path.abspath(__file__), 19 | "../../../pythia/common/defaults/configs/datasets/captioning/coco.yml", 20 | ) 21 | with open(os.path.abspath(path)) as f: 22 | config = yaml.load(f, Loader=yaml.FullLoader) 23 | 24 | config = ConfigNode(config) 25 | captioning_config = config.dataset_attributes.coco 26 | caption_processor_config = captioning_config.processors.caption_processor 27 | vocab_path = os.path.join(os.path.abspath(__file__), "..", "..", "data", "vocab.txt") 28 | caption_processor_config.params.vocab.vocab_file = os.path.abspath(vocab_path) 29 | caption_processor = CaptionProcessor(caption_processor_config.params) 30 | registry.register("coco_caption_processor", caption_processor) 31 | 32 | caption_bleu4 = metrics.CaptionBleu4Metric() 33 | expected = Sample() 34 | predicted = dict() 35 | 36 | # Test complete match 37 | expected.answers = torch.empty((5, 5, 10)) 38 | expected.answers.fill_(4) 39 | predicted["scores"] = torch.zeros((5, 10, 19)) 40 | predicted["scores"][:, :, 4] = 1.0 41 | 42 | self.assertEqual(caption_bleu4.calculate(expected, predicted).item(), 1.0) 43 | 44 | # Test partial match 45 | expected.answers = torch.empty((5, 5, 10)) 46 | expected.answers.fill_(4) 47 | predicted["scores"] = torch.zeros((5, 10, 19)) 48 | predicted["scores"][:, 0:5, 4] = 1.0 49 | 50 | self.assertAlmostEqual( 51 | caption_bleu4.calculate(expected, predicted).item(), 0.3928, 4 52 | ) 53 | -------------------------------------------------------------------------------- /tests/tasks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guanghuixu/AnchorCaptioner/3a49ce5de025087cbea00075ec0636aee0525382/tests/tasks/__init__.py -------------------------------------------------------------------------------- /tests/tasks/test_base_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | import unittest 3 | import os 4 | 5 | from pythia.common.registry import registry 6 | from pythia.datasets.base_dataset import BaseDataset 7 | from pythia.utils.configuration import Configuration 8 | 9 | 10 | class TestBaseDataset(unittest.TestCase): 11 | def test_init_processors(self): 12 | path = os.path.join( 13 | os.path.abspath(__file__), 14 | "../../../pythia/common/defaults/configs/datasets/vqa/vqa2.yml" 15 | ) 16 | 17 | configuration = Configuration(os.path.abspath(path)) 18 | self._fix_configuration(configuration) 19 | configuration.freeze() 20 | 21 | base_dataset = BaseDataset( 22 | "vqa2", 23 | "train", 24 | configuration.get_config()["dataset_attributes"]["vqa2"], 25 | ) 26 | expected_processors = [ 27 | "answer_processor", 28 | "ocr_token_processor", 29 | "bbox_processor", 30 | ] 31 | 32 | # Check no processors are initialized before init_processors call 33 | self.assertFalse(any(hasattr(base_dataset, key) 34 | for key in expected_processors)) 35 | 36 | for processor in expected_processors: 37 | self.assertIsNone(registry.get("{}_{}".format("vqa2", processor))) 38 | 39 | # Check processors are initialized after init_processors 40 | base_dataset.init_processors() 41 | self.assertTrue(all(hasattr(base_dataset, key) 42 | for key in expected_processors)) 43 | for processor in expected_processors: 44 | self.assertIsNotNone(registry.get("{}_{}".format("vqa2", processor))) 45 | 46 | def _fix_configuration(self, configuration): 47 | vqa2_config = configuration.config['dataset_attributes']['vqa2'] 48 | processors = vqa2_config['processors'] 49 | processors.pop('text_processor') 50 | processors.pop('context_processor') 51 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def compare_tensors(a, b): 5 | return torch.all(a.eq(b)) 6 | -------------------------------------------------------------------------------- /tests/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /tests/utils/test_general.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | import unittest 3 | 4 | from pythia.utils.general import (dict_to_string, get_overlap_score) 5 | 6 | 7 | class TestUtilsGeneral(unittest.TestCase): 8 | def test_dict_to_string(self): 9 | dictionary = {"one": 1, "two": 2, "three": 3} 10 | expected = "one: 1.0000, two: 2.0000, three: 3.0000" 11 | 12 | self.assertEqual(dict_to_string(dictionary), expected) 13 | 14 | # TODO: Move later to configuration tests 15 | # def test_nested_dict_update(self): 16 | # # Updates value 17 | # dictionary = {"level1": {"level2": {"levelA": 0, "levelB": 1}}} 18 | # update = {"level1": {"level2": {"levelB": 10}}} 19 | # expected = {"level1": {"level2": {"levelA": 0, "levelB": 10}}} 20 | # 21 | # self.assertEqual(nested_dict_update(dictionary, update), expected) 22 | # 23 | # # Adds new value 24 | # dictionary = {"level1": {"level2": {"levelA": 0}}} 25 | # update = {"level1": {"level2": {"levelB": 10}}} 26 | # expected = {"level1": {"level2": {"levelA": 0, "levelB": 10}}} 27 | # 28 | # self.assertEqual(nested_dict_update(dictionary, update), expected) 29 | 30 | def test_get_overlap_score(self): 31 | # Full overlap 32 | candidate = "pythia" 33 | target = "pythia" 34 | self.assertEqual(get_overlap_score(candidate, target), 1.0) 35 | 36 | # Partial overlap 37 | candidate = "pythia" 38 | target = "python" 39 | self.assertEqual(get_overlap_score(candidate, target), 2 / 3) 40 | 41 | # No overlap 42 | candidate = "pythia" 43 | target = "vqa" 44 | self.assertEqual(get_overlap_score(candidate, target), 0.0) 45 | -------------------------------------------------------------------------------- /tests/utils/test_timer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import time 3 | import unittest 4 | 5 | from pythia.utils.timer import Timer 6 | 7 | 8 | class TestUtilsTimer(unittest.TestCase): 9 | def test_get_current(self): 10 | timer = Timer() 11 | expected = "000ms" 12 | 13 | self.assertEqual(timer.get_current(), expected) 14 | 15 | def test_reset(self): 16 | timer = Timer() 17 | time.sleep(2) 18 | timer.reset() 19 | expected = "000ms" 20 | 21 | self.assertEqual(timer.get_current(), expected) 22 | 23 | def test_get_time_since_start(self): 24 | timer = Timer() 25 | time.sleep(2) 26 | expected = "02s " 27 | 28 | self.assertTrue(expected in timer.get_time_since_start()) 29 | -------------------------------------------------------------------------------- /tools/bert/extract_bert.sh: -------------------------------------------------------------------------------- 1 | N_REM=`expr $3 - 1` 2 | 3 | for i in $(seq 0 $N_REM); do 4 | python tools/extract_bert_embeddings.py --imdb_path $1 --out_path $2 --group_id $i --n_groups $3 & 5 | done 6 | -------------------------------------------------------------------------------- /tools/bert/extract_bert_embeddings.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | import argparse 3 | import multiprocessing 4 | import os 5 | 6 | import numpy as np 7 | import torch 8 | from pytorch_pretrained_bert import BertModel, BertTokenizer 9 | from tqdm import tqdm 10 | 11 | 12 | class BertFeatExtractor(object): 13 | def __init__(self, model_name): 14 | self.tokenizer = BertTokenizer.from_pretrained(model_name) 15 | self.model = BertModel.from_pretrained(model_name).eval() 16 | self.model.cuda() 17 | 18 | def get_bert_embedding(self, text): 19 | tokenized_text = self.tokenizer.tokenize(text) 20 | tokenized_text = ["[CLS]"] + tokenized_text + ["[SEP]"] 21 | indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text) 22 | tokens_tensor = torch.Tensor([indexed_tokens]).long() 23 | segments_tensor = torch.Tensor([0] * len(tokenized_text)).long() 24 | with torch.no_grad(): 25 | encoded_layers, _ = self.model( 26 | tokens_tensor.cuda(), 27 | segments_tensor.cuda(), 28 | output_all_encoded_layers=False, 29 | ) 30 | return encoded_layers.squeeze()[0] 31 | 32 | 33 | def extract_bert(imdb_path, out_path, group_id=0, n_groups=1): 34 | imdb = np.load(imdb_path) 35 | 36 | feat_extractor = BertFeatExtractor("bert-base-uncased") 37 | 38 | if group_id == 0: 39 | iterator_obj = tqdm(imdb[1:]) 40 | else: 41 | iterator_obj = imdb[1:] 42 | 43 | for idx, el in enumerate(iterator_obj): 44 | if idx % n_groups != group_id: 45 | continue 46 | emb = feat_extractor.get_bert_embedding(el["question_str"]) 47 | save_path = out_path + str(el["question_id"]) 48 | np.save(save_path, emb.cpu().numpy()) 49 | 50 | 51 | if __name__ == "__main__": 52 | parser = argparse.ArgumentParser() 53 | parser.add_argument("--imdb_path", type=str, default=None) 54 | parser.add_argument("--out_path", type=str, default=None) 55 | parser.add_argument("--group_id", type=int, default=0) 56 | parser.add_argument("--n_groups", type=int, default=1) 57 | args = parser.parse_args() 58 | extract_bert(args.imdb_path, args.out_path, args.group_id, args.n_groups) 59 | -------------------------------------------------------------------------------- /val.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=$1 python tools/run.py --tasks captioning --datasets m4c_textcaps --model m4c_captioner \ 2 | --config configs/captioning/m4c_textcaps/m4c_captioner.yml \ 3 | --save_dir save/$2 \ 4 | --run_type $3 --resume_file $4 \ 5 | --evalai_inference 1 6 | --------------------------------------------------------------------------------
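Usage note (illustrative, not part of the repository sources above): run.sh and val.sh take positional arguments that fill in the GPU ids, the experiment directory name under save/, the run type, and the checkpoint to resume from. A minimal sketch of an evaluation call for val.sh, assuming a single GPU and a hypothetical experiment directory named my_m4c_captioner that already contains a trained checkpoint:

```bash
# Positional arguments: <gpu ids> <save dir name under save/> <run type> <resume checkpoint>
bash val.sh 0 my_m4c_captioner val save/my_m4c_captioner/m4c_textcaps_m4c_captioner_2021/best.ckpt
```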