├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── configs ├── __init__.py ├── captioning │ ├── coco │ │ ├── butd.yml │ │ └── butd_beam_search.yml │ └── m4c_textcaps │ │ ├── butd.yml │ │ ├── butd_beam_search.yml │ │ ├── butd_eval_pretrained_coco_model.yml │ │ ├── m4c_captioner.yml │ │ ├── m4c_captioner_coco.yml │ │ ├── m4c_captioner_coco_eval_on_textcaps.yml │ │ ├── m4c_captioner_coco_textcaps_joint.yml │ │ └── m4c_captioner_without_ocr.yml └── vqa │ ├── clevr │ └── cnn_lstm.yml │ ├── m4c_ocrvqa │ └── m4c.yml │ ├── m4c_stvqa │ └── m4c.yml │ ├── m4c_textvqa │ ├── m4c.yml │ ├── m4c_ocr_ml.yml │ └── m4c_with_stvqa.yml │ ├── textvqa │ ├── ban.yml │ ├── lorra.yml │ └── pythia.yml │ ├── visual_genome │ └── pythia.yml │ ├── vizwiz │ ├── ban.yml │ ├── lorra.yml │ └── pythia.yml │ └── vqa2 │ ├── ban.yml │ ├── lorra.yml │ ├── lorra_train_and_val.yml │ ├── pythia.yml │ ├── pythia_12k_iterations_no_resnet.yml │ └── pythia_train_and_val.yml ├── docs ├── Makefile ├── requirements.txt └── source │ ├── common │ ├── registry.rst │ └── sample.rst │ ├── conf.py │ ├── datasets │ ├── base_dataset.rst │ ├── base_dataset_builder.rst │ ├── base_task.rst │ └── processors.rst │ ├── index.rst │ ├── models │ └── base_model.rst │ ├── modules │ ├── losses.rst │ └── metrics.rst │ └── tutorials │ ├── challenge.md │ ├── concepts.md │ ├── dataset.rst │ ├── features.rst │ ├── pretrained_models.md │ └── quickstart.md ├── overview.png ├── projects ├── M4C │ ├── README.md │ └── scripts │ │ └── extract_ocr_frcn_feature.py ├── M4C_Captioner │ ├── README.md │ └── scripts │ │ ├── coco_eval.py │ │ └── textcaps_eval.py └── TextCap_CVPR.pdf ├── pythia ├── __init__.py ├── common │ ├── __init__.py │ ├── batch_collator.py │ ├── constants.py │ ├── dataset_loader.py │ ├── defaults │ │ ├── __init__.py │ │ └── configs │ │ │ ├── base.yml │ │ │ └── datasets │ │ │ ├── captioning │ │ │ ├── coco.yml │ │ │ └── m4c_textcaps.yml │ │ │ ├── dialog │ │ │ └── visual_dialog.yml │ │ │ └── vqa │ │ │ ├── clevr.yml │ │ │ ├── m4c_ocrvqa.yml │ │ │ ├── m4c_stvqa.yml │ │ │ ├── m4c_textvqa.yml │ │ │ ├── textvqa.yml │ │ │ ├── visual_genome.yml │ │ │ ├── vizwiz.yml │ │ │ └── vqa2.yml │ ├── meter.py │ ├── registry.py │ ├── report.py │ ├── sample.py │ └── test_reporter.py ├── datasets │ ├── __init__.py │ ├── base_dataset.py │ ├── base_dataset_builder.py │ ├── captioning │ │ ├── __init__.py │ │ ├── coco │ │ │ ├── __init__.py │ │ │ ├── builder.py │ │ │ └── dataset.py │ │ └── m4c_textcaps │ │ │ ├── __init__.py │ │ │ ├── builder.py │ │ │ └── dataset.py │ ├── concat_dataset.py │ ├── dialog │ │ ├── __init__.py │ │ ├── original.py │ │ └── visual_dialog │ │ │ ├── config.yml │ │ │ └── scripts │ │ │ ├── build_imdb.py │ │ │ └── extract_vocabulary.py │ ├── feature_readers.py │ ├── features_dataset.py │ ├── image_database.py │ ├── multi_dataset.py │ ├── processors.py │ ├── samplers.py │ ├── scene_graph_database.py │ └── vqa │ │ ├── __init__.py │ │ ├── clevr │ │ ├── __init__.py │ │ ├── builder.py │ │ └── dataset.py │ │ ├── m4c_ocrvqa │ │ ├── __init__.py │ │ ├── builder.py │ │ └── dataset.py │ │ ├── m4c_stvqa │ │ ├── __init__.py │ │ ├── builder.py │ │ └── dataset.py │ │ ├── m4c_textvqa │ │ ├── __init__.py │ │ ├── builder.py │ │ └── dataset.py │ │ ├── textvqa │ │ ├── __init__.py │ │ ├── builder.py │ │ └── dataset.py │ │ ├── visual_genome │ │ ├── builder.py │ │ └── dataset.py │ │ ├── vizwiz │ │ ├── __init__.py │ │ ├── builder.py │ │ └── dataset.py │ │ └── vqa2 │ │ ├── __init__.py │ │ ├── builder.py │ │ ├── dataset.py │ │ ├── ocr_builder.py │ │ └── 
ocr_dataset.py ├── legacy │ ├── best_model │ │ ├── best_model_predict_test.json │ │ └── config.yaml │ ├── config │ │ ├── collections.py │ │ ├── config.py │ │ ├── config_utils.py │ │ ├── demo │ │ │ └── config.yaml │ │ ├── function_config_lib.py │ │ ├── keep │ │ │ ├── MFH_ft.yaml │ │ │ └── detectron.yaml │ │ └── verbose │ │ │ ├── MFH_module.yaml │ │ │ ├── dectectron_finetune.yaml │ │ │ └── default.yaml │ ├── data │ │ └── demo │ │ │ ├── features │ │ │ └── COCO_test2015_000000000001.npy │ │ │ ├── images │ │ │ └── COCO_test2015_000000000001.jpg │ │ │ └── imdb │ │ │ └── imdb_demo.npy │ ├── data_prep │ │ ├── data_preprocess.md │ │ └── vqa_v2.0 │ │ │ ├── build_vqa_2.0_imdb.py │ │ │ ├── download_vqa_2.0.sh │ │ │ ├── extract_ques_info.py │ │ │ ├── extract_vocabulary.py │ │ │ ├── extract_word_glove_embedding.py │ │ │ ├── genome_ids.pkl │ │ │ ├── process_answers.py │ │ │ ├── train_ids.pkl │ │ │ └── val_ids.pkl │ ├── dataset_utils │ │ ├── __init__.py │ │ ├── create_imdb_header.py │ │ ├── dataSet.py │ │ ├── text_processing.py │ │ ├── utils.py │ │ ├── vqa_collates.py │ │ ├── vqa_concate_dataset.py │ │ └── vqa_html_writer.py │ ├── ensemble.py │ ├── eval_model │ │ ├── eval_demo.py │ │ └── vqaEval.py │ ├── global_variables │ │ ├── __init__.py │ │ └── global_variables.py │ ├── info │ │ ├── code_structure_plot.png │ │ ├── pythia.jpg │ │ └── vqa_example.png │ ├── install.sh │ ├── run_test.py │ ├── tools │ │ ├── convert_VD_to_COCO_qa.py │ │ ├── convert_VG_to_COCO.py │ │ ├── convert_VG_to_COCO_qa.py │ │ ├── convert_tsv_feature_to_indiv.py │ │ ├── eval_ensemble_on_val.py │ │ ├── extract_detectron_weights.py │ │ ├── extract_minival_ids.py │ │ ├── extract_visual_features_vgg_pool5.py │ │ ├── generate_minival_annotation.py │ │ ├── mirror_images.py │ │ ├── model_path.py │ │ ├── process_log.py │ │ ├── rename_genome_file.py │ │ ├── subset_val.py │ │ ├── timer.py │ │ └── visualize_bbox.py │ ├── top_down_bottom_up │ │ ├── classifier.py │ │ ├── image_attention.py │ │ ├── image_embedding.py │ │ ├── image_feature_encoding.py │ │ ├── intermediate_layer.py │ │ ├── multi_modal_combine.py │ │ ├── nonlinear_layer.py │ │ ├── post_combine_transform.py │ │ ├── question_embeding.py │ │ ├── top_down_bottom_up_model.py │ │ └── unittests.py │ ├── train.py │ ├── train_model │ │ ├── Engineer.py │ │ ├── Error_analysis.py │ │ ├── Loss.py │ │ ├── __init__.py │ │ ├── dataset_utils.py │ │ ├── eval_utils.py │ │ ├── evaluate.py │ │ ├── evaluate_with_ensemble.py │ │ ├── helper.py │ │ └── model_factory.py │ └── vqa_demo.ipynb ├── models │ ├── __init__.py │ ├── ban.py │ ├── base_model.py │ ├── butd.py │ ├── cnn_lstm.py │ ├── lorra.py │ ├── m4c.py │ ├── m4c_captioner.py │ ├── pythia.py │ ├── top_down_bottom_up.py │ └── visdial_multi_modal.py ├── modules │ ├── __init__.py │ ├── attention.py │ ├── decoders.py │ ├── embeddings.py │ ├── encoders.py │ ├── gpn.py │ ├── layers.py │ ├── losses.py │ ├── metrics.py │ └── refine_mmt.py ├── scripts │ ├── coco │ │ └── coco_caption_eval.py │ ├── extract_vocabulary.py │ ├── features │ │ ├── extract_features.md │ │ ├── extract_features.py │ │ ├── extract_features_vmb.py │ │ ├── extract_resnet152_feat.py │ │ └── extract_resnet_features.py │ └── gqa │ │ ├── README.md │ │ └── convert_gqa_to_vqa.py ├── trainers │ ├── __init__.py │ └── base_trainer.py └── utils │ ├── __init__.py │ ├── build_utils.py │ ├── checkpoint.py │ ├── configuration.py │ ├── dataset_utils.py │ ├── distributed_utils.py │ ├── early_stopping.py │ ├── flags.py │ ├── general.py │ ├── logger.py │ ├── m4c_evaluators.py │ ├── objects_to_byte_tensor.py │ 
├── phoc │ ├── __init__.py │ ├── build_phoc.py │ └── src │ │ └── cphoc.c │ ├── process_answers.py │ ├── text_utils.py │ ├── timer.py │ └── vocab.py ├── requirements.txt ├── run.sh ├── setup.py ├── tests ├── __init__.py ├── data │ └── vocab.txt ├── models │ └── test_cnn_lstm.py ├── modules │ ├── __init__.py │ ├── test_layers.py │ ├── test_losses.py │ └── test_metrics.py ├── tasks │ ├── __init__.py │ ├── test_base_dataset.py │ └── test_processors.py ├── test_utils.py └── utils │ ├── __init__.py │ ├── test_general.py │ ├── test_text_utils.py │ └── test_timer.py ├── textcap.yaml ├── tools ├── bert │ ├── extract_bert.sh │ └── extract_bert_embeddings.py └── run.py └── val.sh /CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | 4 | Facebook has adopted a Code of Conduct that we expect project participants to adhere to. 5 | Please read the [full text](https://code.fb.com/codeofconduct/) 6 | so that you can understand what actions will and will not be tolerated. 7 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | BSD License 2 | 3 | For Pythia software 4 | 5 | Copyright (c) Facebook, Inc. and its affiliates. All rights reserved. 6 | 7 | Redistribution and use in source and binary forms, with or without modification, 8 | are permitted provided that the following conditions are met: 9 | 10 | * Redistributions of source code must retain the above copyright notice, this 11 | list of conditions and the following disclaimer. 12 | 13 | * Redistributions in binary form must reproduce the above copyright notice, 14 | this list of conditions and the following disclaimer in the documentation 15 | and/or other materials provided with the distribution. 16 | 17 | * Neither the name Facebook nor the names of its contributors may be used to 18 | endorse or promote products derived from this software without specific 19 | prior written permission. 20 | 21 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND 22 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED 23 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE 24 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR 25 | ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES 26 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 27 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON 28 | ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT 29 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS 30 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 31 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Towards Accurate Text-based Image Captioning with Content Diversity Exploration 2 | 3 | Pytorch implementation for the CVPR 2021 paper: [Towards Accurate Text-based Image Captioning with Content Diversity Exploration](https://github.com/guanghuixu/AnchorCaptioner/blob/main/projects/TextCap_CVPR.pdf) 4 | 5 |

6 | ![Anchor Captioner](overview.png) 7 | 

8 | 9 | ## Install 10 | 11 | Clone this repository, and build it with the following command. 12 | 13 | ``` 14 | # activate your own conda environment 15 | # [Alternative] 16 | # conda env create -f textcap.yaml 17 | # conda activate textcap 18 | 19 | git clone git@github.com:guanghuixu/AnchorCaptioner.git 20 | cd AnchorCaptioner 21 | python setup.py build develop 22 | ``` 23 | 24 | ## Data and running scripts 25 | 26 | Some specific annotations required by our method are provided in [here](https://github.com/guanghuixu/AnchorCaptioner/releases/download/data/data.zip). More details please refer to [projects/M4C_Captioner/README.md](https://github.com/guanghuixu/AnchorCaptioner/blob/main/projects/M4C_Captioner/README.md) 27 | 28 | ## Citation 29 | 30 | If you use any part of our code in your research, please cite our paper: 31 | 32 | ```BibTex 33 | @InProceedings{xu2021textcap, 34 | title = {Towards Accurate Text-based Image Captioning with Content Diversity Exploration}, 35 | author = {Guanghui Xu and Mingkui Tan and Shuaicheng Niu and Yucheng Luo and Qing Du and Qi Wu}, 36 | booktitle = {The IEEE Conference on Computer Vision and Pattern Recognition}, 37 | year = {2021} 38 | } 39 | ``` 40 | 41 | ## Acknowledgment 42 | 43 | The code is greatly inspired by the [MMF](https://mmf.readthedocs.io/en/latest/) and [M4C-Captioner](https://github.com/ronghanghu/mmf). 44 | 45 | -------------------------------------------------------------------------------- /configs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guanghuixu/AnchorCaptioner/3a49ce5de025087cbea00075ec0636aee0525382/configs/__init__.py -------------------------------------------------------------------------------- /configs/captioning/coco/butd.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - common/defaults/configs/datasets/captioning/coco.yml 3 | model_attributes: 4 | butd: &butd 5 | model_data_dir: ../data/ 6 | metrics: 7 | - type: caption_bleu4 8 | losses: 9 | - type: caption_cross_entropy 10 | classifier: 11 | type: language_decoder 12 | params: 13 | dropout: 0.5 14 | hidden_dim: 1024 15 | feature_dim: 2048 16 | fc_bias_init: 0 17 | image_feature_embeddings: 18 | - modal_combine: 19 | type: top_down_attention_lstm 20 | params: 21 | dropout: 0.5 22 | hidden_dim: 1024 23 | attention_dim: 1024 24 | normalization: softmax 25 | transform: 26 | type: linear 27 | params: 28 | out_dim: 1 29 | image_feature_dim: 2048 30 | embedding_dim: 300 31 | image_feature_encodings: 32 | - type: finetune_faster_rcnn_fpn_fc7 33 | params: 34 | bias_file: detectron/fc6/fc7_b.pkl 35 | weights_file: detectron/fc6/fc7_w.pkl 36 | inference: 37 | type: greedy 38 | optimizer_attributes: 39 | type: Adamax 40 | params: 41 | eps: 1.0e-08 42 | lr: 0.01 43 | weight_decay: 0 44 | training_parameters: 45 | clip_norm_mode: all 46 | clip_gradients: true 47 | lr_ratio: 0.1 48 | lr_scheduler: true 49 | lr_steps: 50 | - 15000 51 | - 25000 52 | - 35000 53 | - 45000 54 | max_grad_l2_norm: 0.25 55 | max_iterations: 50000 56 | use_warmup: true 57 | warmup_factor: 0.2 58 | warmup_iterations: 1000 59 | patience: 4000 60 | batch_size: 256 61 | num_workers: 7 62 | task_size_proportional_sampling: true 63 | monitored_metric: coco/caption_bleu4 64 | metric_minimize: false 65 | -------------------------------------------------------------------------------- /configs/captioning/coco/butd_beam_search.yml: 
-------------------------------------------------------------------------------- 1 | includes: 2 | - ../configs/captioning/coco/butd.yml 3 | model_attributes: 4 | butd: &butd 5 | inference: 6 | type: beam_search 7 | params: 8 | beam_length: 5 9 | training_parameters: 10 | batch_size: 1 11 | -------------------------------------------------------------------------------- /configs/captioning/m4c_textcaps/butd_beam_search.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - ../configs/captioning/m4c_textcaps/butd.yml 3 | dataset_attributes: 4 | coco: 5 | imdb_files: 6 | val: 7 | - imdb/m4c_textcaps/imdb_val_filtered_by_image_id.npy 8 | model_attributes: 9 | butd: &butd 10 | inference: 11 | type: beam_search 12 | params: 13 | beam_length: 5 14 | training_parameters: 15 | batch_size: 1 16 | -------------------------------------------------------------------------------- /configs/captioning/m4c_textcaps/butd_eval_pretrained_coco_model.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - ../configs/captioning/m4c_textcaps/butd_beam_search.yml 3 | dataset_attributes: 4 | coco: 5 | processors: 6 | text_processor: 7 | params: 8 | vocab: 9 | vocab_file: vocabs/vocabulary_captioning_thresh5.txt 10 | caption_processor: 11 | params: 12 | vocab: 13 | vocab_file: vocabs/vocabulary_captioning_thresh5.txt 14 | -------------------------------------------------------------------------------- /configs/captioning/m4c_textcaps/m4c_captioner_coco_eval_on_textcaps.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - ../configs/captioning/m4c_textcaps/m4c_captioner_coco.yml 3 | dataset_attributes: 4 | m4c_textcaps: 5 | image_features: 6 | val: 7 | - open_images/detectron_fix_100/fc6/train,m4c_textvqa_ocr_en_frcn_features/train_images 8 | test: 9 | - open_images/detectron_fix_100/fc6/test,m4c_textvqa_ocr_en_frcn_features/test_images 10 | imdb_files: 11 | val: 12 | - imdb/m4c_textcaps/imdb_val_filtered_by_image_id.npy # only one sample per image_id 13 | test: 14 | - imdb/m4c_textcaps/imdb_test_filtered_by_image_id.npy # only one sample per image_id 15 | -------------------------------------------------------------------------------- /configs/vqa/clevr/cnn_lstm.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - common/defaults/configs/datasets/vqa/clevr.yml 3 | 4 | model_attributes: 5 | cnn_lstm: 6 | metrics: 7 | - type: accuracy 8 | losses: 9 | - type: logit_bce 10 | text_embedding: 11 | embedding_dim: 20 12 | lstm: 13 | input_size: 20 14 | hidden_size: 50 15 | bidirectional: true 16 | batch_first: true 17 | cnn: 18 | layers: 19 | input_dims: [3, 64, 128, 128, 64, 64] 20 | output_dims: [64, 128, 128, 64, 64, 10] 21 | kernel_sizes: [7, 5, 5, 5, 5, 1] 22 | classifier: 23 | input_dim: 450 24 | 25 | optimizer_attributes: 26 | type: Adamax 27 | params: 28 | eps: 1.0e-08 29 | lr: 0.01 30 | weight_decay: 0 31 | 32 | training_parameters: 33 | batch_size: 128 34 | snapshot_interval: 6000 35 | monitored_metric: clevr/accuracy 36 | metric_minimize: false 37 | -------------------------------------------------------------------------------- /configs/vqa/m4c_ocrvqa/m4c.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - common/defaults/configs/datasets/vqa/m4c_ocrvqa.yml 3 | # Use soft copy 4 | dataset_attributes: 5 | m4c_ocrvqa: 6 | image_features: 7 | train: 8 | - 
m4c_ocrvqa_obj_frcn_features/all,m4c_ocrvqa_ocr_en_frcn_features/all 9 | val: 10 | - m4c_ocrvqa_obj_frcn_features/all,m4c_ocrvqa_ocr_en_frcn_features/all 11 | test: 12 | - m4c_ocrvqa_obj_frcn_features/all,m4c_ocrvqa_ocr_en_frcn_features/all 13 | imdb_files: 14 | train: 15 | - imdb/m4c_ocrvqa/imdb_train.npy 16 | val: 17 | - imdb/m4c_ocrvqa/imdb_val.npy 18 | test: 19 | - imdb/m4c_ocrvqa/imdb_test.npy 20 | processors: 21 | text_processor: 22 | type: bert_tokenizer 23 | params: 24 | max_length: 20 25 | answer_processor: 26 | type: m4c_answer 27 | params: 28 | vocab_file: m4c_vocabs/ocrvqa/fixed_answer_vocab_ocrvqa_82.txt 29 | preprocessor: 30 | type: simple_word 31 | params: {} 32 | context_preprocessor: 33 | type: simple_word 34 | params: {} 35 | max_length: 50 36 | max_copy_steps: 12 37 | num_answers: 10 38 | copy_processor: 39 | type: copy 40 | params: 41 | max_length: 100 42 | phoc_processor: 43 | type: phoc 44 | params: 45 | max_length: 50 46 | model_attributes: 47 | m4c: 48 | lr_scale_frcn: 0.1 49 | lr_scale_text_bert: 0.1 50 | lr_scale_mmt: 1.0 # no scaling 51 | text_bert_init_from_bert_base: true 52 | text_bert: 53 | num_hidden_layers: 3 54 | obj: 55 | mmt_in_dim: 2048 56 | dropout_prob: 0.1 57 | ocr: 58 | mmt_in_dim: 3002 # 300 (FastText) + 604 (PHOC) + 2048 (Faster R-CNN) + 50 (all zeros; legacy) 59 | dropout_prob: 0.1 60 | mmt: 61 | hidden_size: 768 62 | num_hidden_layers: 4 63 | classifier: 64 | type: linear 65 | ocr_max_num: 50 66 | ocr_ptr_net: 67 | hidden_size: 768 68 | query_key_size: 768 69 | params: {} 70 | model_data_dir: ../data 71 | metrics: 72 | - type: ocrvqa_accuracy 73 | losses: 74 | - type: m4c_decoding_bce_with_mask 75 | optimizer_attributes: 76 | params: 77 | eps: 1.0e-08 78 | lr: 1e-4 79 | weight_decay: 0 80 | type: Adam 81 | training_parameters: 82 | clip_norm_mode: all 83 | clip_gradients: true 84 | max_grad_l2_norm: 0.25 85 | lr_scheduler: true 86 | lr_steps: 87 | - 28000 88 | - 38000 89 | lr_ratio: 0.1 90 | use_warmup: true 91 | warmup_factor: 0.2 92 | warmup_iterations: 1000 93 | max_iterations: 48000 94 | batch_size: 128 95 | num_workers: 8 96 | task_size_proportional_sampling: true 97 | monitored_metric: m4c_ocrvqa/ocrvqa_accuracy 98 | metric_minimize: false 99 | -------------------------------------------------------------------------------- /configs/vqa/m4c_stvqa/m4c.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - common/defaults/configs/datasets/vqa/m4c_stvqa.yml 3 | # Use soft copy 4 | dataset_attributes: 5 | m4c_stvqa: 6 | image_features: 7 | train: 8 | - m4c_stvqa_obj_frcn_features/train,m4c_stvqa_ocr_en_frcn_features/train 9 | val: 10 | - m4c_stvqa_obj_frcn_features/train,m4c_stvqa_ocr_en_frcn_features/train 11 | test: 12 | - m4c_stvqa_obj_frcn_features/test_task3,m4c_stvqa_ocr_en_frcn_features/test_task3 13 | imdb_files: 14 | train: 15 | - imdb/m4c_stvqa/imdb_subtrain.npy 16 | val: 17 | - imdb/m4c_stvqa/imdb_subval.npy 18 | test: 19 | - imdb/m4c_stvqa/imdb_test_task3.npy 20 | processors: 21 | text_processor: 22 | type: bert_tokenizer 23 | params: 24 | max_length: 20 25 | answer_processor: 26 | type: m4c_answer 27 | params: 28 | vocab_file: m4c_vocabs/stvqa/fixed_answer_vocab_stvqa_5k.txt 29 | preprocessor: 30 | type: simple_word 31 | params: {} 32 | context_preprocessor: 33 | type: simple_word 34 | params: {} 35 | max_length: 50 36 | max_copy_steps: 12 37 | num_answers: 10 38 | copy_processor: 39 | type: copy 40 | params: 41 | max_length: 100 42 | phoc_processor: 43 | type: phoc 44 | params: 
45 | max_length: 50 46 | model_attributes: 47 | m4c: 48 | lr_scale_frcn: 0.1 49 | lr_scale_text_bert: 0.1 50 | lr_scale_mmt: 1.0 # no scaling 51 | text_bert_init_from_bert_base: true 52 | text_bert: 53 | num_hidden_layers: 3 54 | obj: 55 | mmt_in_dim: 2048 56 | dropout_prob: 0.1 57 | ocr: 58 | mmt_in_dim: 3002 # 300 (FastText) + 604 (PHOC) + 2048 (Faster R-CNN) + 50 (all zeros; legacy) 59 | dropout_prob: 0.1 60 | mmt: 61 | hidden_size: 768 62 | num_hidden_layers: 4 63 | classifier: 64 | type: linear 65 | ocr_max_num: 50 66 | ocr_ptr_net: 67 | hidden_size: 768 68 | query_key_size: 768 69 | params: {} 70 | model_data_dir: ../data 71 | metrics: 72 | - type: stvqa_accuracy 73 | - type: stvqa_anls 74 | losses: 75 | - type: m4c_decoding_bce_with_mask 76 | optimizer_attributes: 77 | params: 78 | eps: 1.0e-08 79 | lr: 1e-4 80 | weight_decay: 0 81 | type: Adam 82 | training_parameters: 83 | clip_norm_mode: all 84 | clip_gradients: true 85 | max_grad_l2_norm: 0.25 86 | lr_scheduler: true 87 | lr_steps: 88 | - 14000 89 | - 19000 90 | lr_ratio: 0.1 91 | use_warmup: true 92 | warmup_factor: 0.2 93 | warmup_iterations: 1000 94 | max_iterations: 24000 95 | batch_size: 128 96 | num_workers: 8 97 | task_size_proportional_sampling: true 98 | monitored_metric: m4c_stvqa/stvqa_accuracy 99 | metric_minimize: false 100 | -------------------------------------------------------------------------------- /configs/vqa/m4c_textvqa/m4c.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - common/defaults/configs/datasets/vqa/m4c_textvqa.yml 3 | # Use soft copy 4 | dataset_attributes: 5 | m4c_textvqa: 6 | image_features: 7 | train: 8 | - open_images/detectron_fix_100/fc6/train,m4c_textvqa_ocr_en_frcn_features/train_images 9 | val: 10 | - open_images/detectron_fix_100/fc6/train,m4c_textvqa_ocr_en_frcn_features/train_images 11 | test: 12 | - open_images/detectron_fix_100/fc6/test,m4c_textvqa_ocr_en_frcn_features/test_images 13 | imdb_files: 14 | train: 15 | - imdb/m4c_textvqa/imdb_train_ocr_en.npy 16 | val: 17 | - imdb/m4c_textvqa/imdb_val_ocr_en.npy 18 | test: 19 | - imdb/m4c_textvqa/imdb_test_ocr_en.npy 20 | processors: 21 | text_processor: 22 | type: bert_tokenizer 23 | params: 24 | max_length: 20 25 | answer_processor: 26 | type: m4c_answer 27 | params: 28 | vocab_file: m4c_vocabs/textvqa/fixed_answer_vocab_textvqa_5k.txt 29 | preprocessor: 30 | type: simple_word 31 | params: {} 32 | context_preprocessor: 33 | type: simple_word 34 | params: {} 35 | max_length: 50 36 | max_copy_steps: 12 37 | num_answers: 10 38 | copy_processor: 39 | type: copy 40 | params: 41 | max_length: 100 42 | phoc_processor: 43 | type: phoc 44 | params: 45 | max_length: 50 46 | model_attributes: 47 | m4c: 48 | lr_scale_frcn: 0.1 49 | lr_scale_text_bert: 0.1 50 | lr_scale_mmt: 1.0 # no scaling 51 | text_bert_init_from_bert_base: true 52 | text_bert: 53 | num_hidden_layers: 3 54 | obj: 55 | mmt_in_dim: 2048 56 | dropout_prob: 0.1 57 | ocr: 58 | mmt_in_dim: 3002 # 300 (FastText) + 604 (PHOC) + 2048 (Faster R-CNN) + 50 (all zeros; legacy) 59 | dropout_prob: 0.1 60 | mmt: 61 | hidden_size: 768 62 | num_hidden_layers: 4 63 | classifier: 64 | type: linear 65 | ocr_max_num: 50 66 | ocr_ptr_net: 67 | hidden_size: 768 68 | query_key_size: 768 69 | params: {} 70 | model_data_dir: ../data 71 | metrics: 72 | - type: textvqa_accuracy 73 | losses: 74 | - type: m4c_decoding_bce_with_mask 75 | optimizer_attributes: 76 | params: 77 | eps: 1.0e-08 78 | lr: 1e-4 79 | weight_decay: 0 80 | type: Adam 81 | 
training_parameters: 82 | clip_norm_mode: all 83 | clip_gradients: true 84 | max_grad_l2_norm: 0.25 85 | lr_scheduler: true 86 | lr_steps: 87 | - 14000 88 | - 19000 89 | lr_ratio: 0.1 90 | use_warmup: true 91 | warmup_factor: 0.2 92 | warmup_iterations: 1000 93 | max_iterations: 24000 94 | batch_size: 128 95 | num_workers: 8 96 | task_size_proportional_sampling: true 97 | monitored_metric: m4c_textvqa/textvqa_accuracy 98 | metric_minimize: false 99 | -------------------------------------------------------------------------------- /configs/vqa/m4c_textvqa/m4c_ocr_ml.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - common/defaults/configs/datasets/vqa/m4c_textvqa.yml 3 | # Use soft copy 4 | dataset_attributes: 5 | m4c_textvqa: 6 | image_features: 7 | train: 8 | - open_images/detectron_fix_100/fc6/train,m4c_textvqa_ocr_ml_frcn_features/train_images 9 | val: 10 | - open_images/detectron_fix_100/fc6/train,m4c_textvqa_ocr_ml_frcn_features/train_images 11 | test: 12 | - open_images/detectron_fix_100/fc6/test,m4c_textvqa_ocr_ml_frcn_features/test_images 13 | imdb_files: 14 | train: 15 | - imdb/m4c_textvqa/imdb_train_ocr_ml.npy 16 | val: 17 | - imdb/m4c_textvqa/imdb_val_ocr_ml.npy 18 | test: 19 | - imdb/m4c_textvqa/imdb_test_ocr_ml.npy 20 | processors: 21 | text_processor: 22 | type: bert_tokenizer 23 | params: 24 | max_length: 20 25 | answer_processor: 26 | type: m4c_answer 27 | params: 28 | vocab_file: m4c_vocabs/textvqa/fixed_answer_vocab_textvqa_5k.txt 29 | preprocessor: 30 | type: simple_word 31 | params: {} 32 | context_preprocessor: 33 | type: simple_word 34 | params: {} 35 | max_length: 50 36 | max_copy_steps: 12 37 | num_answers: 10 38 | copy_processor: 39 | type: copy 40 | params: 41 | max_length: 100 42 | phoc_processor: 43 | type: phoc 44 | params: 45 | max_length: 50 46 | model_attributes: 47 | m4c: 48 | lr_scale_frcn: 0.1 49 | lr_scale_text_bert: 0.1 50 | lr_scale_mmt: 1.0 # no scaling 51 | text_bert_init_from_bert_base: true 52 | text_bert: 53 | num_hidden_layers: 3 54 | obj: 55 | mmt_in_dim: 2048 56 | dropout_prob: 0.1 57 | ocr: 58 | mmt_in_dim: 3002 # 300 (FastText) + 604 (PHOC) + 2048 (Faster R-CNN) + 50 (all zeros; legacy) 59 | dropout_prob: 0.1 60 | mmt: 61 | hidden_size: 768 62 | num_hidden_layers: 4 63 | classifier: 64 | type: linear 65 | ocr_max_num: 50 66 | ocr_ptr_net: 67 | hidden_size: 768 68 | query_key_size: 768 69 | params: {} 70 | model_data_dir: ../data 71 | metrics: 72 | - type: textvqa_accuracy 73 | losses: 74 | - type: m4c_decoding_bce_with_mask 75 | optimizer_attributes: 76 | params: 77 | eps: 1.0e-08 78 | lr: 1e-4 79 | weight_decay: 0 80 | type: Adam 81 | training_parameters: 82 | clip_norm_mode: all 83 | clip_gradients: true 84 | max_grad_l2_norm: 0.25 85 | lr_scheduler: true 86 | lr_steps: 87 | - 14000 88 | - 19000 89 | lr_ratio: 0.1 90 | use_warmup: true 91 | warmup_factor: 0.2 92 | warmup_iterations: 1000 93 | max_iterations: 24000 94 | batch_size: 128 95 | num_workers: 8 96 | task_size_proportional_sampling: true 97 | monitored_metric: m4c_textvqa/textvqa_accuracy 98 | metric_minimize: false 99 | -------------------------------------------------------------------------------- /configs/vqa/textvqa/ban.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - ../configs/vqa/textvqa/pythia.yml 3 | model_attributes: 4 | ban: 5 | metrics: 6 | - type: vqa_accuracy 7 | losses: 8 | - type: logit_bce 9 | text_embedding: 10 | num_hidden: 1280 11 | vocab_size: 1280 
12 | emb_size: 300 13 | num_layers: 1 14 | dropout: 0.0 15 | bidirectional: False 16 | rnn_type: 'GRU' 17 | bilinear_attention: 18 | bc_net: 19 | k: 1 20 | dropout: [0.2, 0.5] 21 | h_out: 22 | fc_net: 23 | dims: 600 24 | activation: 25 | dropout: 0.2 26 | gamma: 4 27 | visual_feat_dim: 2048 28 | classifier: 29 | # out dim will be taken from registry as set by dataset builder 30 | hidden_size: 600 31 | dropout: 0.5 32 | -------------------------------------------------------------------------------- /configs/vqa/textvqa/pythia.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - common/defaults/configs/datasets/vqa/textvqa.yml 3 | model_attributes: 4 | pythia: &pythia 5 | model_data_dir: ../data 6 | metrics: 7 | - type: vqa_accuracy 8 | losses: 9 | - type: logit_bce 10 | num_context_features: 1 11 | context_feature_dim: 300 12 | image_feature_dim: 2048 13 | context_max_len: 50 14 | classifier: 15 | type: logit 16 | params: 17 | img_hidden_dim: 5000 18 | text_hidden_dim: 300 19 | image_feature_embeddings: 20 | - modal_combine: 21 | type: non_linear_element_multiply 22 | params: 23 | dropout: 0 24 | hidden_dim: 5000 25 | normalization: softmax 26 | transform: 27 | type: linear 28 | params: 29 | out_dim: 1 30 | image_feature_encodings: 31 | - type: finetune_faster_rcnn_fpn_fc7 32 | params: 33 | bias_file: detectron/fc6/fc7_b.pkl 34 | weights_file: detectron/fc6/fc7_w.pkl 35 | - type: default 36 | params: {} 37 | context_feature_encodings: 38 | - type: default 39 | params: {} 40 | image_text_modal_combine: 41 | type: non_linear_element_multiply 42 | params: 43 | dropout: 0 44 | hidden_dim: 5000 45 | text_embeddings: 46 | - type: attention 47 | params: 48 | hidden_dim: 1024 49 | num_layers: 1 50 | conv1_out: 512 51 | conv2_out: 2 52 | dropout: 0 53 | embedding_dim: 300 54 | kernel_size: 1 55 | padding: 0 56 | pythia_image_only: *pythia 57 | pythia_question_only: *pythia 58 | optimizer_attributes: 59 | type: Adamax 60 | params: 61 | lr: 0.005 62 | training_parameters: 63 | clip_norm_mode: all 64 | clip_gradients: false 65 | max_grad_l2_norm: 0.25 66 | lr_scheduler: true 67 | lr_steps: 68 | - 14000 69 | lr_ratio: 0.01 70 | use_warmup: true 71 | warmup_factor: 0.2 72 | warmup_iterations: 1000 73 | max_iterations: 24000 74 | batch_size: 128 75 | num_workers: 7 76 | task_size_proportional_sampling: true 77 | monitored_metric: textvqa/vqa_accuracy 78 | pretrained_mapping: 79 | text_embeddings: text_embeddings 80 | image_feature_encoders: image_feature_encoders 81 | image_feature_embeddings_list: image_feature_embeddings_list 82 | image_text_multi_modal_combine_layer: image_text_multi_modal_combine_layer 83 | metric_minimize: false 84 | -------------------------------------------------------------------------------- /configs/vqa/visual_genome/pythia.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - ../configs/vqa/vqa2/pythia.yml 3 | - common/defaults/configs/datasets/vqa/visual_genome.yml 4 | dataset_attributes: 5 | visual_genome: 6 | return_scene_graph: false 7 | return_objects: false 8 | return_relationships: false 9 | return_info: false 10 | no_unk: true 11 | training_parameters: 12 | monitored_metric: vqa2/vqa_accuracy -------------------------------------------------------------------------------- /configs/vqa/vizwiz/ban.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - ../configs/vqa/vizwiz/pythia.yml 3 | model_attributes: 4 | ban: 5 
| metrics: 6 | - type: vqa_accuracy 7 | losses: 8 | - type: logit_bce 9 | text_embedding: 10 | num_hidden: 1280 11 | vocab_size: 1280 12 | emb_size: 300 13 | num_layers: 1 14 | dropout: 0.0 15 | bidirectional: False 16 | rnn_type: 'GRU' 17 | bilinear_attention: 18 | bc_net: 19 | k: 1 20 | dropout: [0.2, 0.5] 21 | h_out: 22 | fc_net: 23 | dims: 600 24 | activation: 25 | dropout: 0.2 26 | gamma: 4 27 | visual_feat_dim: 2048 28 | classifier: 29 | # out dim will be taken from registry as set by dataset builder 30 | hidden_size: 600 31 | dropout: 0.5 32 | -------------------------------------------------------------------------------- /configs/vqa/vizwiz/pythia.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - common/defaults/configs/datasets/vqa/vizwiz.yml 3 | model_attributes: 4 | pythia: &pythia 5 | model_data_dir: ../data/ 6 | metrics: 7 | - type: vqa_accuracy 8 | losses: 9 | - type: logit_bce 10 | classifier: 11 | type: logit 12 | params: 13 | img_hidden_dim: 5000 14 | text_hidden_dim: 300 15 | image_feature_embeddings: 16 | - modal_combine: 17 | type: non_linear_element_multiply 18 | params: 19 | dropout: 0 20 | hidden_dim: 5000 21 | normalization: softmax 22 | transform: 23 | type: linear 24 | params: 25 | out_dim: 1 26 | image_feature_dim: 2048 27 | image_feature_encodings: 28 | - type: finetune_faster_rcnn_fpn_fc7 29 | params: 30 | bias_file: detectron/fc6/fc7_b.pkl 31 | weights_file: detectron/fc6/fc7_w.pkl 32 | - type: default 33 | params: {} 34 | image_text_modal_combine: 35 | type: non_linear_element_multiply 36 | params: 37 | dropout: 0 38 | hidden_dim: 5000 39 | text_embeddings: 40 | - type: attention 41 | params: 42 | hidden_dim: 1024 43 | num_layers: 1 44 | conv1_out: 512 45 | conv2_out: 2 46 | dropout: 0 47 | embedding_dim: 300 48 | kernel_size: 1 49 | padding: 0 50 | pythia_image_only: *pythia 51 | pythia_question_only: *pythia 52 | optimizer_attributes: 53 | type: Adamax 54 | params: 55 | lr: 0.005 56 | training_parameters: 57 | clip_norm_mode: all 58 | clip_gradients: true 59 | max_grad_l2_norm: 0.25 60 | lr_scheduler: true 61 | lr_steps: 62 | - 14000 63 | lr_ratio: 0.01 64 | use_warmup: true 65 | warmup_factor: 0.2 66 | warmup_iterations: 1000 67 | max_iterations: 24000 68 | batch_size: 128 69 | num_workers: 7 70 | task_size_proportional_sampling: true 71 | monitored_metric: vizwiz/vqa_accuracy 72 | metric_minimize: false 73 | pretrained_mapping: 74 | word_embedding: word_embedding 75 | text_embeddings: text_embeddings 76 | image_feature_encoders: image_feature_encoders 77 | image_feature_embeddings_list: image_feature_embeddings_list 78 | image_text_multi_modal_combine_layer: image_text_multi_modal_combine_layer 79 | -------------------------------------------------------------------------------- /configs/vqa/vqa2/ban.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - common/defaults/configs/datasets/vqa/vqa2.yml 3 | model_attributes: 4 | ban: 5 | metrics: 6 | - vqa_accuracy 7 | losses: 8 | - type: logit_bce 9 | text_embedding: 10 | num_hidden: 1280 11 | vocab_size: 1280 12 | emb_size: 300 13 | num_layers: 1 14 | dropout: 0.0 15 | bidirectional: False 16 | rnn_type: 'GRU' 17 | bilinear_attention: 18 | bc_net: 19 | k: 1 20 | dropout: [0.2, 0.5] 21 | h_out: 22 | fc_net: 23 | dims: 600 24 | activation: 25 | dropout: 0.2 26 | gamma: 4 27 | visual_feat_dim: 2048 28 | classifier: 29 | # out dim will be taken from registry as set by dataset builder 30 | hidden_size: 600 
31 | dropout: 0.5 32 | optimizer_attributes: 33 | type: Adamax 34 | params: 35 | eps: 1.0e-08 36 | lr: 0.01 37 | weight_decay: 0 38 | training_parameters: 39 | clip_norm_mode: all 40 | clip_gradients: true 41 | lr_ratio: 0.1 42 | lr_scheduler: true 43 | lr_steps: 44 | - 15000 45 | - 18000 46 | - 20000 47 | - 21000 48 | max_grad_l2_norm: 0.25 49 | max_iterations: 22000 50 | use_warmup: true 51 | warmup_factor: 0.2 52 | warmup_iterations: 1000 53 | patience: 4000 54 | batch_size: 512 55 | num_workers: 7 56 | task_size_proportional_sampling: true 57 | monitored_metric: vqa2/vqa_accuracy 58 | metric_minimize: false 59 | -------------------------------------------------------------------------------- /configs/vqa/vqa2/lorra_train_and_val.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - ../configs/vqa/vqa2/lorra.yml 3 | task_attributes: 4 | vqa: 5 | dataset_attributes: 6 | vqa2: 7 | image_features: 8 | train: 9 | - coco/detectron_fix_100/fc6/train_val_2014,coco/resnet152/train_val_2014 10 | - coco/detectron_fix_100/fc6/train_val_2014,coco/resnet152/train_val_2014 11 | val: 12 | - coco/detectron_fix_100/fc6/train_val_2014,coco/resnet152/train_val_2014 13 | imdb_files: 14 | train: 15 | - imdb/vqa/imdb_train2014.npy 16 | - imdb/vqa/imdb_val2014.npy 17 | val: 18 | - imdb/vqa/imdb_minival2014.npy 19 | -------------------------------------------------------------------------------- /configs/vqa/vqa2/pythia.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - common/defaults/configs/datasets/vqa/vqa2.yml 3 | model_attributes: 4 | pythia: &pythia 5 | model_data_dir: ../data/ 6 | metrics: 7 | - type: vqa_accuracy 8 | losses: 9 | - type: logit_bce 10 | classifier: 11 | type: logit 12 | params: 13 | img_hidden_dim: 5000 14 | text_hidden_dim: 300 15 | image_feature_embeddings: 16 | - modal_combine: 17 | type: non_linear_element_multiply 18 | params: 19 | dropout: 0 20 | hidden_dim: 5000 21 | normalization: softmax 22 | transform: 23 | type: linear 24 | params: 25 | out_dim: 1 26 | image_feature_dim: 2048 27 | image_feature_encodings: 28 | - type: finetune_faster_rcnn_fpn_fc7 29 | params: 30 | bias_file: detectron/fc6/fc7_b.pkl 31 | weights_file: detectron/fc6/fc7_w.pkl 32 | - type: default 33 | params: {} 34 | image_text_modal_combine: 35 | type: non_linear_element_multiply 36 | params: 37 | dropout: 0 38 | hidden_dim: 5000 39 | text_embeddings: 40 | - type: attention 41 | params: 42 | hidden_dim: 1024 43 | num_layers: 1 44 | conv1_out: 512 45 | conv2_out: 2 46 | dropout: 0 47 | embedding_dim: 300 48 | kernel_size: 1 49 | padding: 0 50 | pythia_image_only: *pythia 51 | pythia_question_only: *pythia 52 | optimizer_attributes: 53 | type: Adamax 54 | params: 55 | eps: 1.0e-08 56 | lr: 0.01 57 | weight_decay: 0 58 | training_parameters: 59 | clip_norm_mode: all 60 | clip_gradients: true 61 | lr_ratio: 0.1 62 | lr_scheduler: true 63 | lr_steps: 64 | - 15000 65 | - 18000 66 | - 20000 67 | - 21000 68 | max_grad_l2_norm: 0.25 69 | max_iterations: 22000 70 | use_warmup: true 71 | warmup_factor: 0.2 72 | warmup_iterations: 1000 73 | patience: 4000 74 | batch_size: 512 75 | num_workers: 7 76 | task_size_proportional_sampling: true 77 | monitored_metric: vqa2/vqa_accuracy 78 | metric_minimize: false 79 | pretrained_mapping: 80 | word_embedding: word_embedding 81 | text_embeddings: text_embeddings 82 | image_feature_encoders: image_feature_encoders 83 | image_feature_embeddings_list: 
image_feature_embeddings_list 84 | image_text_multi_modal_combine_layer: image_text_multi_modal_combine_layer 85 | classifier: classifier 86 | -------------------------------------------------------------------------------- /configs/vqa/vqa2/pythia_12k_iterations_no_resnet.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - ../configs/vqa/vqa2/pythia.yml 3 | dataset_attributes: 4 | vqa2: 5 | image_features: 6 | train: 7 | - coco/detectron_fix_100/fc6/train_val_2014 8 | - coco/detectron_fix_100/fc6/train_val_2014 9 | val: 10 | - coco/detectron_fix_100/fc6/train_val_2014 11 | test: 12 | - coco/detectron_fix_100/fc6/test2015 13 | imdb_files: 14 | train: 15 | - imdb/vqa/imdb_train2014.npy 16 | - imdb/vqa/imdb_val2014.npy 17 | val: 18 | - imdb/vqa/imdb_minival2014.npy 19 | model_attributes: 20 | pythia: 21 | image_feature_encodings: 22 | - type: finetune_faster_rcnn_fpn_fc7 23 | params: 24 | bias_file: detectron/fc6/fc7_b.pkl 25 | weights_file: detectron/fc6/fc7_w.pkl 26 | training_parameters: 27 | max_iterations: 12000 28 | lr_steps: 29 | - 5000 30 | - 7000 31 | - 9000 32 | - 11000 33 | -------------------------------------------------------------------------------- /configs/vqa/vqa2/pythia_train_and_val.yml: -------------------------------------------------------------------------------- 1 | includes: 2 | - ../configs/vqa/vqa2/pythia.yml 3 | dataset_attributes: 4 | vqa2: 5 | image_features: 6 | train: 7 | - coco/detectron_fix_100/fc6/train_val_2014,coco/resnet152/train_val_2014 8 | - coco/detectron_fix_100/fc6/train_val_2014,coco/resnet152/train_val_2014 9 | val: 10 | - coco/detectron_fix_100/fc6/train_val_2014,coco/resnet152/train_val_2014 11 | test: 12 | - coco/detectron_fix_100/fc6/test2015,coco/resnet152/test2015 13 | imdb_files: 14 | train: 15 | - imdb/vqa/imdb_train2014.npy 16 | - imdb/vqa/imdb_val2014.npy 17 | val: 18 | - imdb/vqa/imdb_minival2014.npy 19 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = pythia 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | recommonmark==0.5.0 2 | sphinx 3 | sphinx_rtd_theme==0.4.3 4 | -------------------------------------------------------------------------------- /docs/source/common/registry.rst: -------------------------------------------------------------------------------- 1 | common.registry 2 | =============== 3 | 4 | .. 
automodule:: pythia.common.registry 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/source/common/sample.rst: -------------------------------------------------------------------------------- 1 | common.sample 2 | =============== 3 | 4 | .. automodule:: pythia.common.sample 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/source/datasets/base_dataset.rst: -------------------------------------------------------------------------------- 1 | datasets.base_dataset 2 | ================== 3 | 4 | .. automodule:: pythia.datasets.base_dataset 5 | :members: 6 | :private-members: 7 | -------------------------------------------------------------------------------- /docs/source/datasets/base_dataset_builder.rst: -------------------------------------------------------------------------------- 1 | datasets.base_dataset_builder 2 | ========================== 3 | 4 | .. automodule:: pythia.datasets.base_dataset_builder 5 | :members: 6 | :private-members: 7 | -------------------------------------------------------------------------------- /docs/source/datasets/base_task.rst: -------------------------------------------------------------------------------- 1 | datasets.base_task 2 | ========================== 3 | 4 | .. automodule:: pythia.datasets.base_task 5 | :members: 6 | :private-members: 7 | -------------------------------------------------------------------------------- /docs/source/datasets/processors.rst: -------------------------------------------------------------------------------- 1 | datasets.processors 2 | ========================== 3 | 4 | .. automodule:: pythia.datasets.processors 5 | :members: 6 | :private-members: 7 | -------------------------------------------------------------------------------- /docs/source/models/base_model.rst: -------------------------------------------------------------------------------- 1 | models.base_model 2 | ================= 3 | 4 | .. automodule:: pythia.models.base_model 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/source/modules/losses.rst: -------------------------------------------------------------------------------- 1 | modules.losses 2 | =============== 3 | 4 | .. automodule:: pythia.modules.losses 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/source/modules/metrics.rst: -------------------------------------------------------------------------------- 1 | modules.metrics 2 | =============== 3 | 4 | .. automodule:: pythia.modules.metrics 5 | :members: 6 | -------------------------------------------------------------------------------- /docs/source/tutorials/features.rst: -------------------------------------------------------------------------------- 1 | Features 2 | ======== 3 | 4 | Pythia features: 5 | 6 | - **Model Zoo**: Reference implementations for state-of-the-art vision and language model including 7 | LoRRA_ (SoTA on VQA and TextVQA), Pythia_ model (VQA 2018 challenge winner), BAN and BUTD_. 8 | - **Multi-Tasking**: Support for multi-tasking which allows training on 9 | multiple datasets together. 10 | - **Datasets**: Includes support for various datasets built-in including VQA, VizWiz, 11 | TextVQA, VisualDialog, MS COCO Captioning. 
12 | - **Modules**: Provides implementations for many commonly used layers 13 | in vision and language domain 14 | - **Distributed**: Support for distributed training based on DataParallel 15 | as well as DistributedDataParallel. 16 | - **Unopinionated**: Unopinionated about the dataset and model implementations 17 | built on top of it. 18 | - **Customization**: Custom losses, metrics, scheduling, optimizers, tensorboard; 19 | suits all your custom needs. 20 | 21 | You can use Pythia to **bootstrap** for your next vision and language multimodal 22 | research project. 23 | 24 | Pythia can also act as **starter codebase** for challenges around vision and 25 | language datasets (TextVQA challenge, VQA challenge). 26 | 27 | .. _lorra: https://arxiv.org/abs/1904.08920 28 | .. _pythia: https://arxiv.org/abs/1807.09956 29 | .. _butd: https://arxiv.org/abs/1707.07998 30 | -------------------------------------------------------------------------------- /overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guanghuixu/AnchorCaptioner/3a49ce5de025087cbea00075ec0636aee0525382/overview.png -------------------------------------------------------------------------------- /projects/M4C_Captioner/scripts/coco_eval.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | import numpy as np 4 | import os 5 | 6 | sys.path.append( 7 | os.path.join(os.path.dirname(__file__), '../../../pythia/scripts/coco/') 8 | ) 9 | import coco_caption_eval # NoQA 10 | 11 | 12 | def print_metrics(res_metrics): 13 | print(res_metrics) 14 | keys = ['Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4', 'METEOR', 'ROUGE_L', 'SPICE', 'CIDEr'] 15 | print('\n\n**********\nFinal model performance:\n**********') 16 | for k in keys: 17 | print(k, ': %.1f' % (res_metrics[k] * 100)) 18 | 19 | 20 | if __name__ == '__main__': 21 | import argparse 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument('--pred_file', type=str, required=True) 24 | parser.add_argument('--set', type=str, default='karpathy_val') 25 | args = parser.parse_args() 26 | 27 | with open(args.pred_file) as f: 28 | preds = json.load(f) 29 | imdb_file = os.path.join( 30 | os.path.dirname(__file__), 31 | '../../../data/imdb/m4c_coco/imdb_{}.npy'.format(args.set) 32 | ) 33 | imdb = np.load(imdb_file, allow_pickle=True) 34 | imdb = imdb[1:] 35 | 36 | gts = [ 37 | {'image_id': info['image_id'], 'caption': info['caption_str']} 38 | for info in imdb 39 | ] 40 | preds = [ 41 | {'image_id': int(p['image_id']), 'caption': p['caption']} 42 | for p in preds 43 | ] 44 | imgids = list(set(g['image_id'] for g in gts)) 45 | 46 | metrics = coco_caption_eval.calculate_metrics( 47 | imgids, {'annotations': gts}, {'annotations': preds} 48 | ) 49 | 50 | print_metrics(metrics) 51 | -------------------------------------------------------------------------------- /projects/M4C_Captioner/scripts/textcaps_eval.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | import numpy as np 4 | import os 5 | 6 | sys.path.append( 7 | os.path.join(os.path.dirname(__file__), '../../../pythia/scripts/coco/') 8 | ) 9 | import coco_caption_eval # NoQA 10 | 11 | 12 | def print_metrics(res_metrics): 13 | print(res_metrics) 14 | keys = ['Bleu_1', 'Bleu_2', 'Bleu_3', 'Bleu_4', 'METEOR', 'ROUGE_L', 'SPICE', 'CIDEr'] 15 | print('\n\n**********\nFinal model performance:\n**********') 16 | for k in keys: 17 | print(k, ': 
%.1f' % (res_metrics[k] * 100)) 18 | 19 | 20 | if __name__ == '__main__': 21 | import argparse 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument('--pred_file', type=str, required=True) 24 | parser.add_argument('--set', type=str, default='val') 25 | args = parser.parse_args() 26 | 27 | if args.set not in ['train', 'val']: 28 | raise Exception( 29 | 'this script only supports TextCaps train and val set. ' 30 | 'Please use the EvalAI server for test set evaluation' 31 | ) 32 | 33 | with open(args.pred_file) as f: 34 | preds = json.load(f) 35 | imdb_file = os.path.join( 36 | os.path.dirname(__file__), 37 | '../../../data/imdb/m4c_textcaps/imdb_{}.npy'.format(args.set) 38 | ) 39 | imdb = np.load(imdb_file, allow_pickle=True) 40 | imdb = imdb[1:] 41 | 42 | gts = [ 43 | {'image_id': info['image_id'], 'caption': info['caption_str']} 44 | for info in imdb 45 | ] 46 | preds = [ 47 | {'image_id': p['image_id'], 'caption': p['caption']} 48 | for p in preds 49 | ] 50 | imgids = list(set(g['image_id'] for g in gts)) 51 | 52 | metrics = coco_caption_eval.calculate_metrics( 53 | imgids, {'annotations': gts}, {'annotations': preds} 54 | ) 55 | 56 | print_metrics(metrics) 57 | -------------------------------------------------------------------------------- /projects/TextCap_CVPR.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guanghuixu/AnchorCaptioner/3a49ce5de025087cbea00075ec0636aee0525382/projects/TextCap_CVPR.pdf -------------------------------------------------------------------------------- /pythia/__init__.py: -------------------------------------------------------------------------------- 1 | __version__ = "0.3.0" 2 | -------------------------------------------------------------------------------- /pythia/common/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /pythia/common/batch_collator.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from pythia.common.sample import SampleList 3 | 4 | 5 | class BatchCollator: 6 | # TODO: Think more if there is a better way to do this 7 | _IDENTICAL_VALUE_KEYS = ["dataset_type", "dataset_name"] 8 | 9 | def __call__(self, batch): 10 | sample_list = SampleList(batch) 11 | for key in self._IDENTICAL_VALUE_KEYS: 12 | sample_list[key + "_"] = sample_list[key] 13 | sample_list[key] = sample_list[key][0] 14 | 15 | return sample_list 16 | -------------------------------------------------------------------------------- /pythia/common/constants.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | import os 3 | 4 | 5 | imdb_version = 1 6 | FASTTEXT_WIKI_URL = ( 7 | "https://dl.fbaipublicfiles.com/pythia/pretrained_models/fasttext/wiki.en.bin" 8 | ) 9 | 10 | CLEVR_DOWNLOAD_URL = ( 11 | "https://dl.fbaipublicfiles.com/clevr/CLEVR_v1.0.zip" 12 | ) 13 | 14 | VISUAL_GENOME_CONSTS = { 15 | "imdb_url": "https://dl.fbaipublicfiles.com/pythia/data/imdb/visual_genome.tar.gz", 16 | "features_url": "https://dl.fbaipublicfiles.com/pythia/features/visual_genome.tar.gz", 17 | "synset_file": "vg_synsets.txt", 18 | "vocabs": "https://dl.fbaipublicfiles.com/pythia/data/vocab.tar.gz" 19 | } 20 | 21 | VISUAL_DIALOG_CONSTS = { 22 | "imdb_url": { 23 | "train": "https://www.dropbox.com/s/ix8keeudqrd8hn8/visdial_1.0_train.zip?dl=1", 24 | "val": "https://www.dropbox.com/s/ibs3a0zhw74zisc/visdial_1.0_val.zip?dl=1", 25 | "test": "https://www.dropbox.com/s/ibs3a0zhw74zisc/visdial_1.0_test.zip?dl=1" 26 | }, 27 | "features_url": { 28 | "visual_dialog": "https://dl.fbaipublicfiles.com/pythia/features/visual_dialog.tar.gz", 29 | "coco": "https://dl.fbaipublicfiles.com/pythia/features/coco.tar.gz" 30 | }, 31 | "vocabs": "https://dl.fbaipublicfiles.com/pythia/data/vocab.tar.gz" 32 | } 33 | 34 | DOWNLOAD_CHUNK_SIZE = 1024 * 1024 35 | -------------------------------------------------------------------------------- /pythia/common/dataset_loader.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import os 3 | 4 | import yaml 5 | from torch.utils.data import DataLoader 6 | 7 | from pythia.common.batch_collator import BatchCollator 8 | from pythia.common.test_reporter import TestReporter 9 | from pythia.datasets.multi_dataset import MultiDataset 10 | from pythia.datasets.samplers import DistributedSampler 11 | from pythia.utils.general import get_batch_size 12 | 13 | 14 | class DatasetLoader: 15 | def __init__(self, config): 16 | self.config = config 17 | 18 | def load_datasets(self): 19 | self.train_dataset = MultiDataset("train") 20 | self.val_dataset = MultiDataset("val") 21 | self.test_dataset = MultiDataset("test") 22 | 23 | self.train_dataset.load(**self.config) 24 | self.val_dataset.load(**self.config) 25 | self.test_dataset.load(**self.config) 26 | 27 | if self.train_dataset.num_datasets == 1: 28 | self.train_loader = self.train_dataset.first_loader 29 | self.val_loader = self.val_dataset.first_loader 30 | self.test_loader = self.test_dataset.first_loader 31 | else: 32 | self.train_loader = self.train_dataset 33 | self.val_loader = self.val_dataset 34 | self.test_loader = self.test_dataset 35 | 36 | self.mapping = { 37 | "train": self.train_dataset, 38 | "val": self.val_dataset, 39 | "test": self.test_dataset, 40 | } 41 | 42 | self.test_reporter = None 43 | self.should_not_log = self.config.training_parameters.should_not_log 44 | 45 | @property 46 | def dataset_config(self): 47 | return self._dataset_config 48 | 49 | @dataset_config.setter 50 | def dataset_config(self, config): 51 | self._dataset_config = config 52 | 53 | def get_config(self): 54 | return self._dataset_config 55 | 56 | def get_test_reporter(self, dataset_type): 57 | dataset = getattr(self, "{}_dataset".format(dataset_type)) 58 | return TestReporter(dataset) 59 | 60 | def update_registry_for_model(self, config): 61 | self.train_dataset.update_registry_for_model(config) 62 | self.val_dataset.update_registry_for_model(config) 63 | self.test_dataset.update_registry_for_model(config) 64 | 65 | def clean_config(self, config): 66 | 
self.train_dataset.clean_config(config) 67 | self.val_dataset.clean_config(config) 68 | self.test_dataset.clean_config(config) 69 | 70 | def prepare_batch(self, batch, *args, **kwargs): 71 | return self.mapping[batch.dataset_type].prepare_batch(batch) 72 | 73 | def verbose_dump(self, report, *args, **kwargs): 74 | if self.config.training_parameters.verbose_dump: 75 | dataset_type = report.dataset_type 76 | self.mapping[dataset_type].verbose_dump(report, *args, **kwargs) 77 | 78 | def seed_sampler(self, dataset_type, seed): 79 | dataset = getattr(self, "{}_dataset".format(dataset_type)) 80 | dataset.seed_sampler(seed) 81 | -------------------------------------------------------------------------------- /pythia/common/defaults/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guanghuixu/AnchorCaptioner/3a49ce5de025087cbea00075ec0636aee0525382/pythia/common/defaults/__init__.py -------------------------------------------------------------------------------- /pythia/common/defaults/configs/datasets/captioning/coco.yml: -------------------------------------------------------------------------------- 1 | dataset_attributes: 2 | coco: 3 | data_root_dir: ../data 4 | image_depth_first: false 5 | fast_read: false 6 | image_features: 7 | train: 8 | - coco/detectron_fix_100/fc6/train_val_2014 9 | val: 10 | - coco/detectron_fix_100/fc6/train_val_2014 11 | test: 12 | - coco/detectron_fix_100/fc6/train_val_2014 13 | imdb_files: 14 | train: 15 | - imdb/coco_captions/imdb_karpathy_train.npy 16 | val: 17 | - imdb/coco_captions/imdb_karpathy_val.npy 18 | test: 19 | - imdb/coco_captions/imdb_karpathy_test.npy 20 | features_max_len: 100 21 | processors: 22 | text_processor: 23 | type: vocab 24 | params: 25 | max_length: 52 26 | vocab: 27 | type: intersected 28 | embedding_name: glove.6B.300d 29 | vocab_file: vocabs/vocabulary_captioning_thresh5.txt 30 | preprocessor: 31 | type: simple_sentence 32 | params: {} 33 | caption_processor: 34 | type: caption 35 | params: 36 | vocab: 37 | type: intersected 38 | embedding_name: glove.6B.300d 39 | vocab_file: vocabs/vocabulary_captioning_thresh5.txt 40 | min_captions_per_img: 5 41 | return_info: false 42 | # Return OCR information 43 | use_ocr: false 44 | # Return spatial information of OCR tokens if present 45 | use_ocr_info: false 46 | training_parameters: 47 | monitored_metric: coco/caption_bleu4 48 | metric_minimize: false 49 | -------------------------------------------------------------------------------- /pythia/common/defaults/configs/datasets/captioning/m4c_textcaps.yml: -------------------------------------------------------------------------------- 1 | dataset_attributes: 2 | m4c_textcaps: 3 | data_root_dir: ../data 4 | image_depth_first: false 5 | fast_read: false 6 | features_max_len: 100 7 | processors: 8 | context_processor: 9 | type: fasttext 10 | params: 11 | max_length: 50 12 | model_file: .vector_cache/wiki.en.bin 13 | ocr_token_processor: 14 | type: simple_word 15 | params: {} 16 | bbox_processor: 17 | type: bbox 18 | params: 19 | max_length: 50 20 | return_info: true 21 | use_ocr: true 22 | use_ocr_info: true -------------------------------------------------------------------------------- /pythia/common/defaults/configs/datasets/dialog/visual_dialog.yml: -------------------------------------------------------------------------------- 1 | dataset_attributes: 2 | visual_genome: 3 | data_root_dir: ../data 4 | image_depth_first: false 5 | fast_read: false 6 | image_features: 7 
| train: 8 | - coco/detectron_fix_100/fc6/train_val_2014,coco/resnet152/train_val_2014 9 | val: 10 | - visual_dialog/detectron_fix_100/fc6/val2018,visual_dialog/resnet152/ 11 | test: 12 | - visual_dialog/detectron_fix_100/fc6/test2018,visual_dialog/resnet152/ 13 | imdb_files: 14 | train: 15 | - imdb/visual_dialog/visdial_1.0_train.json 16 | val: 17 | - imdb/visual_dialog/visdial_1.0_val.json 18 | test: 19 | - imdb/visual_dialog/visdial_1.0_test.json 20 | features_max_len: 100 21 | processors: 22 | text_processor: 23 | type: vocab 24 | params: 25 | max_length: 14 26 | vocab: 27 | type: intersected 28 | embedding_name: glove.6B.300d 29 | vocab_file: vocabs/vocabulary_100k.txt 30 | preprocessor: 31 | type: simple_sentence 32 | params: {} 33 | answer_processor: 34 | type: vqa_answer 35 | params: 36 | num_answers: 1 37 | vocab_file: vocabs/answers_vqa.txt 38 | preprocessor: 39 | type: simple_word 40 | params: {} 41 | discriminative_answer_processor: 42 | type: vocab 43 | params: 44 | max_length: 1 45 | vocab: 46 | type: random 47 | vocab_file: vocabs/vocabulary_100k.txt 48 | vg_answer_preprocessor: 49 | type: simple_word 50 | params: {} 51 | history_processor: 52 | type: vocab 53 | params: 54 | max_length: 100 55 | vocab: 56 | type: intersected 57 | embedding_name: glove.6B.300d 58 | vocab_file: vocabs/vocabulary_100k.txt 59 | preprocessor: 60 | type: simple_sentence 61 | params: {} 62 | bbox_processor: 63 | type: bbox 64 | params: 65 | max_length: 50 66 | return_history: true 67 | # Means you have to rank 100 candidate answers 68 | discriminative: 69 | enabled: true 70 | # Only return answer indices, otherwise it will return 71 | # glove embeddings 72 | return_indices: true 73 | no_unk: false 74 | # Return OCR information 75 | use_ocr: false 76 | # Return spatial information of OCR tokens if present 77 | use_ocr_info: false 78 | training_parameters: 79 | monitored_metric: visual_dialog/r@1 80 | metric_minimize: false 81 | -------------------------------------------------------------------------------- /pythia/common/defaults/configs/datasets/vqa/clevr.yml: -------------------------------------------------------------------------------- 1 | dataset_attributes: 2 | clevr: 3 | data_root_dir: ../data 4 | data_folder: CLEVR_v1.0 5 | build_attributes: 6 | min_count: 1 7 | split_regex: " " 8 | keep: 9 | - ";" 10 | - "," 11 | remove: 12 | - "?" 13 | - "." 
14 | processors: 15 | text_processor: 16 | type: vocab 17 | params: 18 | max_length: 10 19 | vocab: 20 | type: random 21 | vocab_file: vocabs/clevr_question_vocab.txt 22 | preprocessor: 23 | type: simple_sentence 24 | params: {} 25 | answer_processor: 26 | type: multi_hot_answer_from_vocab 27 | params: 28 | num_answers: 1 29 | # Vocab file is relative to [data_root_dir]/[data_folder] 30 | vocab_file: vocabs/clevr_answer_vocab.txt 31 | preprocessor: 32 | type: simple_word 33 | params: {} -------------------------------------------------------------------------------- /pythia/common/defaults/configs/datasets/vqa/m4c_ocrvqa.yml: -------------------------------------------------------------------------------- 1 | dataset_attributes: 2 | m4c_ocrvqa: 3 | data_root_dir: ../data 4 | image_depth_first: false 5 | fast_read: false 6 | features_max_len: 100 7 | processors: 8 | context_processor: 9 | type: fasttext 10 | params: 11 | max_length: 50 12 | model_file: .vector_cache/wiki.en.bin 13 | ocr_token_processor: 14 | type: simple_word 15 | params: {} 16 | bbox_processor: 17 | type: bbox 18 | params: 19 | max_length: 50 20 | return_info: true 21 | use_ocr: true 22 | use_ocr_info: true -------------------------------------------------------------------------------- /pythia/common/defaults/configs/datasets/vqa/m4c_stvqa.yml: -------------------------------------------------------------------------------- 1 | dataset_attributes: 2 | m4c_stvqa: 3 | data_root_dir: ../data 4 | image_depth_first: false 5 | fast_read: false 6 | features_max_len: 100 7 | processors: 8 | context_processor: 9 | type: fasttext 10 | params: 11 | max_length: 50 12 | model_file: .vector_cache/wiki.en.bin 13 | ocr_token_processor: 14 | type: simple_word 15 | params: {} 16 | bbox_processor: 17 | type: bbox 18 | params: 19 | max_length: 50 20 | return_info: true 21 | use_ocr: true 22 | use_ocr_info: true -------------------------------------------------------------------------------- /pythia/common/defaults/configs/datasets/vqa/m4c_textvqa.yml: -------------------------------------------------------------------------------- 1 | dataset_attributes: 2 | m4c_textvqa: 3 | data_root_dir: ../data 4 | image_depth_first: false 5 | fast_read: false 6 | features_max_len: 100 7 | processors: 8 | context_processor: 9 | type: fasttext 10 | params: 11 | max_length: 50 12 | model_file: .vector_cache/wiki.en.bin 13 | ocr_token_processor: 14 | type: simple_word 15 | params: {} 16 | bbox_processor: 17 | type: bbox 18 | params: 19 | max_length: 50 20 | return_info: true 21 | use_ocr: true 22 | use_ocr_info: true -------------------------------------------------------------------------------- /pythia/common/defaults/configs/datasets/vqa/textvqa.yml: -------------------------------------------------------------------------------- 1 | dataset_attributes: 2 | textvqa: 3 | data_root_dir: ../data 4 | image_depth_first: false 5 | fast_read: false 6 | image_features: 7 | train: 8 | - open_images/detectron_fix_100/fc6/train,open_images/resnet152/train 9 | val: 10 | - open_images/detectron_fix_100/fc6/train,open_images/resnet152/train 11 | test: 12 | - open_images/detectron_fix_100/fc6/test,open_images/resnet152/test 13 | imdb_files: 14 | train: 15 | - imdb/textvqa_0.5/imdb_textvqa_train.npy 16 | val: 17 | - imdb/textvqa_0.5/imdb_textvqa_val.npy 18 | test: 19 | - imdb/textvqa_0.5/imdb_textvqa_test.npy 20 | features_max_len: 137 21 | processors: 22 | text_processor: 23 | type: vocab 24 | params: 25 | max_length: 14 26 | vocab: 27 | type: intersected 28 | 
embedding_name: glove.6B.300d 29 | vocab_file: vocabs/vocabulary_100k.txt 30 | preprocessor: 31 | type: simple_sentence 32 | params: {} 33 | answer_processor: 34 | type: vqa_answer 35 | params: 36 | vocab_file: vocabs/answers_textvqa_8k.txt 37 | preprocessor: 38 | type: simple_word 39 | params: {} 40 | num_answers: 10 41 | context_processor: 42 | type: fasttext 43 | params: 44 | max_length: 50 45 | model_file: .vector_cache/wiki.en.bin 46 | ocr_token_processor: 47 | type: simple_word 48 | params: {} 49 | bbox_processor: 50 | type: bbox 51 | params: 52 | max_length: 50 53 | return_info: true 54 | # Return OCR information 55 | use_ocr: true 56 | # Return spatial information of OCR tokens if present 57 | use_ocr_info: false 58 | training_parameters: 59 | monitored_metric: textvqa/vqa_accuracy 60 | metric_minimize: false 61 | -------------------------------------------------------------------------------- /pythia/common/defaults/configs/datasets/vqa/vizwiz.yml: -------------------------------------------------------------------------------- 1 | dataset_attributes: 2 | vizwiz: 3 | data_root_dir: ../data 4 | image_depth_first: false 5 | fast_read: false 6 | image_features: 7 | train: 8 | - vizwiz/detectron_fix_100/fc6/train,vizwiz/resnet152/train 9 | val: 10 | - vizwiz/detectron_fix_100/fc6/val,vizwiz/resnet152/val 11 | test: 12 | - vizwiz/detectron_fix_100/fc6/test,vizwiz/resnet152/test 13 | imdb_files: 14 | train: 15 | - imdb/vizwiz/imdb_vizwiz_train.npy 16 | val: 17 | - imdb/vizwiz/imdb_vizwiz_val.npy 18 | test: 19 | - imdb/vizwiz/imdb_vizwiz_test.npy 20 | features_max_len: 100 21 | processors: 22 | text_processor: 23 | type: vocab 24 | params: 25 | max_length: 14 26 | vocab: 27 | type: intersected 28 | embedding_name: glove.6B.300d 29 | vocab_file: vocabs/vocabulary_100k.txt 30 | preprocessor: 31 | type: simple_sentence 32 | params: {} 33 | answer_processor: 34 | type: vqa_answer 35 | params: 36 | vocab_file: vocabs/answers_vizwiz_7k.txt 37 | preprocessor: 38 | type: simple_word 39 | params: {} 40 | num_answers: 10 41 | context_processor: 42 | type: fasttext 43 | params: 44 | max_length: 50 45 | model_file: .vector_cache/wiki.en.bin 46 | ocr_token_processor: 47 | type: simple_word 48 | params: {} 49 | bbox_processor: 50 | type: bbox 51 | params: 52 | max_length: 50 53 | return_info: true 54 | # Return OCR information 55 | use_ocr: false 56 | # Return spatial information of OCR tokens if present 57 | use_ocr_info: false 58 | training_parameters: 59 | monitored_metric: vizwiz/vqa_accuracy 60 | metric_minimize: false 61 | -------------------------------------------------------------------------------- /pythia/common/defaults/configs/datasets/vqa/vqa2.yml: -------------------------------------------------------------------------------- 1 | dataset_attributes: 2 | vqa2: 3 | data_root_dir: ../data 4 | image_depth_first: false 5 | fast_read: false 6 | image_features: 7 | train: 8 | - coco/detectron_fix_100/fc6/train_val_2014,coco/resnet152/train_val_2014 9 | val: 10 | - coco/detectron_fix_100/fc6/train_val_2014,coco/resnet152/train_val_2014 11 | test: 12 | - coco/detectron_fix_100/fc6/test2015,coco/resnet152/test2015 13 | imdb_files: 14 | train: 15 | - imdb/vqa/imdb_train2014.npy 16 | val: 17 | - imdb/vqa/imdb_val2014.npy 18 | test: 19 | - imdb/vqa/imdb_test2015.npy 20 | features_max_len: 100 21 | processors: 22 | text_processor: 23 | type: vocab 24 | params: 25 | max_length: 14 26 | vocab: 27 | type: intersected 28 | embedding_name: glove.6B.300d 29 | vocab_file: vocabs/vocabulary_100k.txt 30 | 
preprocessor: 31 | type: simple_sentence 32 | params: {} 33 | answer_processor: 34 | type: vqa_answer 35 | params: 36 | num_answers: 10 37 | vocab_file: vocabs/answers_vqa.txt 38 | preprocessor: 39 | type: simple_word 40 | params: {} 41 | context_processor: 42 | type: fasttext 43 | params: 44 | download_initially: false 45 | max_length: 50 46 | model_file: .vector_cache/wiki.en.bin 47 | ocr_token_processor: 48 | type: simple_word 49 | params: {} 50 | bbox_processor: 51 | type: bbox 52 | params: 53 | max_length: 50 54 | return_info: true 55 | # Return OCR information 56 | use_ocr: false 57 | # Return spatial information of OCR tokens if present 58 | use_ocr_info: false 59 | training_parameters: 60 | monitored_metric: vqa2/vqa_accuracy 61 | metric_minimize: false 62 | -------------------------------------------------------------------------------- /pythia/common/meter.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # Inspired from maskrcnn benchmark 3 | from collections import defaultdict, deque 4 | 5 | import torch 6 | 7 | 8 | class SmoothedValue: 9 | """Track a series of values and provide access to smoothed values over a 10 | window or the global series average. 11 | """ 12 | 13 | def __init__(self, window_size=20): 14 | self.window_size = window_size 15 | self.reset() 16 | 17 | def reset(self): 18 | self.deque = deque(maxlen=self.window_size) 19 | self.series = [] 20 | self.total = 0.0 21 | self.count = 0 22 | 23 | def update(self, value): 24 | self.deque.append(value) 25 | self.series.append(value) 26 | self.count += 1 27 | self.total += value 28 | 29 | @property 30 | def median(self): 31 | d = torch.tensor(list(self.deque)) 32 | return d.median().item() 33 | 34 | @property 35 | def avg(self): 36 | d = torch.tensor(list(self.deque)) 37 | return d.mean().item() 38 | 39 | @property 40 | def global_avg(self): 41 | return self.total / self.count 42 | 43 | def get_latest(self): 44 | return self.deque[-1] 45 | 46 | 47 | class Meter: 48 | def __init__(self, delimiter=", "): 49 | self.meters = defaultdict(SmoothedValue) 50 | self.delimiter = delimiter 51 | 52 | def update(self, update_dict): 53 | for k, v in update_dict.items(): 54 | if isinstance(v, torch.Tensor): 55 | if v.dim() != 0: 56 | v = v.mean() 57 | v = v.item() 58 | assert isinstance(v, (float, int)) 59 | self.meters[k].update(v) 60 | 61 | def update_from_meter(self, meter): 62 | for key, value in meter.meters.items(): 63 | assert isinstance(value, SmoothedValue) 64 | self.meters[key] = value 65 | 66 | def __getattr__(self, attr): 67 | if attr in self.meters: 68 | return self.meters[attr] 69 | if attr in self.__dict__: 70 | return self.__dict__[attr] 71 | raise AttributeError( 72 | "'{}' object has no attribute '{}'".format(type(self).__name__, attr) 73 | ) 74 | 75 | def get_scalar_dict(self): 76 | scalar_dict = {} 77 | for k, v in self.meters.items(): 78 | scalar_dict[k] = v.get_latest() 79 | 80 | return scalar_dict 81 | 82 | def __str__(self): 83 | loss_str = [] 84 | for name, meter in self.meters.items(): 85 | if "train" in name: 86 | loss_str.append( 87 | "{}: {:.4f} ({:.4f})".format(name, meter.median, meter.global_avg) 88 | ) 89 | else: 90 | # In case of val print global avg 91 | loss_str.append("{}: {:.4f}".format(name, meter.global_avg)) 92 | 93 | return self.delimiter.join(loss_str) 94 | -------------------------------------------------------------------------------- /pythia/common/report.py: 
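A minimal usage sketch for the SmoothedValue/Meter classes defined in pythia/common/meter.py above. This snippet is not part of the repository; the metric keys are made up for illustration and it assumes the pythia package is importable.

import torch

from pythia.common.meter import Meter

meter = Meter()
for _ in range(5):
    # floats/ints are tracked directly; non-scalar tensors are averaged and .item() is taken
    meter.update({"train/total_loss": torch.rand(4), "val/vqa_accuracy": 0.5})

print(meter.get_scalar_dict())  # latest value recorded for every key
print(meter)                    # "train/*" keys print median (global avg); other keys print global avg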
-------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import collections 3 | import warnings 4 | from collections import OrderedDict 5 | 6 | from pythia.common.registry import registry 7 | 8 | 9 | class Report(OrderedDict): 10 | def __init__(self, batch, model_output={}, *args): 11 | super().__init__(self) 12 | if self._check_and_load_tuple(batch): 13 | return 14 | 15 | all_args = [batch, model_output] + [*args] 16 | for idx, arg in enumerate(all_args): 17 | if not isinstance(arg, collections.abc.Mapping): 18 | raise TypeError( 19 | "Argument {:d}, {} must be of instance of " 20 | "collections.abc.Mapping".format(idx, arg) 21 | ) 22 | 23 | self.writer = registry.get("writer") 24 | 25 | self.warning_string = ( 26 | "Updating forward report with key {}" 27 | "{}, but it already exists in {}. " 28 | "Please consider using a different key, " 29 | "as this can cause issues during loss and " 30 | "metric calculations." 31 | ) 32 | 33 | for idx, arg in enumerate(all_args): 34 | for key, item in arg.items(): 35 | if key in self and idx >= 2: 36 | log = self.warning_string.format( 37 | key, "", "in previous arguments to report" 38 | ) 39 | warnings.warn(log) 40 | self[key] = item 41 | 42 | def _check_and_load_tuple(self, batch): 43 | if isinstance(batch, collections.abc.Mapping): 44 | return False 45 | 46 | if isinstance(batch[0], (tuple, list)) and isinstance(batch[0][0], str): 47 | for kv_pair in batch: 48 | self[kv_pair[0]] = kv_pair[1] 49 | return True 50 | else: 51 | return False 52 | 53 | def __setattr__(self, key, value): 54 | self[key] = value 55 | 56 | def __getattr__(self, key): 57 | try: 58 | return self[key] 59 | except KeyError: 60 | raise AttributeError(key) 61 | 62 | def fields(self): 63 | return list(self.keys()) 64 | -------------------------------------------------------------------------------- /pythia/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .base_dataset_builder import BaseDatasetBuilder 3 | from .multi_dataset import MultiDataset 4 | from .base_dataset import BaseDataset 5 | 6 | __all__ = ["BaseDataset", "BaseDatasetBuilder", "MultiDataset"] 7 | -------------------------------------------------------------------------------- /pythia/datasets/captioning/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /pythia/datasets/captioning/coco/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | __all__ = ["COCOBuilder", "COCODataset"] 3 | 4 | from .builder import COCOBuilder 5 | from .dataset import COCODataset 6 | -------------------------------------------------------------------------------- /pythia/datasets/captioning/coco/builder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | from pythia.common.registry import registry 9 | from pythia.datasets.vqa.vqa2 import VQA2Builder 10 | 11 | from .dataset import COCODataset 12 | 13 | 14 | @registry.register_builder("coco") 15 | class COCOBuilder(VQA2Builder): 16 | def __init__(self): 17 | super().__init__() 18 | self.dataset_name = "coco" 19 | self.set_dataset_class(COCODataset) 20 | 21 | def update_registry_for_model(self, config): 22 | registry.register( 23 | self.dataset_name + "_text_vocab_size", 24 | self.dataset.text_processor.get_vocab_size(), 25 | ) 26 | -------------------------------------------------------------------------------- /pythia/datasets/captioning/coco/dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import torch 3 | 4 | from pythia.common.sample import Sample 5 | from pythia.datasets.vqa.vqa2 import VQA2Dataset 6 | 7 | 8 | class COCODataset(VQA2Dataset): 9 | def __init__(self, dataset_type, imdb_file_index, config, *args, **kwargs): 10 | super().__init__(dataset_type, imdb_file_index, config, *args, **kwargs) 11 | self._name = "coco" 12 | 13 | def load_item(self, idx): 14 | sample_info = self.imdb[idx] 15 | current_sample = Sample() 16 | 17 | if self._dataset_type != "test": 18 | text_processor_argument = {"tokens": sample_info["caption_tokens"]} 19 | processed_caption = self.text_processor(text_processor_argument) 20 | current_sample.text = processed_caption["text"] 21 | current_sample.caption_id = torch.tensor( 22 | sample_info["caption_id"], dtype=torch.int 23 | ) 24 | current_sample.caption_len = torch.tensor( 25 | len(sample_info["caption_tokens"]), dtype=torch.int 26 | ) 27 | 28 | if isinstance(sample_info["image_id"], int): 29 | current_sample.image_id = torch.tensor( 30 | sample_info["image_id"], dtype=torch.int 31 | ) 32 | else: 33 | current_sample.image_id = sample_info["image_id"] 34 | 35 | if self._use_features is True: 36 | features = self.features_db[idx] 37 | current_sample.update(features) 38 | 39 | # Add reference captions to sample 40 | current_sample = self.add_reference_caption(sample_info, current_sample) 41 | 42 | return current_sample 43 | 44 | def add_reference_caption(self, sample_info, sample): 45 | reference_list = [] 46 | for reference in sample_info["reference_tokens"]: 47 | text_processor_argument = {"tokens": reference} 48 | processed_reference = self.text_processor(text_processor_argument) 49 | reference_list.append(processed_reference["text"]) 50 | 51 | # Restrict to minimum reference captions available per image 52 | sample.answers = torch.stack(reference_list)[: self.config.min_captions_per_img] 53 | 54 | return sample 55 | 56 | def format_for_evalai(self, report): 57 | captions = report.captions.tolist() 58 | predictions = [] 59 | remove_unk_from_caption_prediction = getattr( 60 | self.config, 'remove_unk_from_caption_prediction', False 61 | ) 62 | for idx, image_id in enumerate(report.image_id): 63 | caption = self.caption_processor(captions[idx])["caption"] 64 | if remove_unk_from_caption_prediction: 65 | caption = caption.replace('<unk>', '') 66 | caption = caption.replace('  ', ' ').strip() 67 | if isinstance(image_id, torch.Tensor): 68 | image_id = image_id.item() 69 | predictions.append({"image_id": image_id, "caption": caption}) 70 | 71 | return predictions 72 | -------------------------------------------------------------------------------- /pythia/datasets/captioning/m4c_textcaps/__init__.py:
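The builder-registration pattern shown in COCOBuilder above is repeated for the m4c_* datasets that follow: each dataset registers a builder in the registry, inherits from an existing builder, and only swaps in its own dataset name and dataset class. A hedged sketch of how a new captioning dataset could plug in; the names my_caption_dataset/MyCaptionDataset are hypothetical and not part of the repository.

from pythia.common.registry import registry
from pythia.datasets.captioning.coco import COCOBuilder, COCODataset


class MyCaptionDataset(COCODataset):
    # Hypothetical dataset class: reuse COCODataset loading, change only the name
    def __init__(self, dataset_type, imdb_file_index, config, *args, **kwargs):
        super().__init__(dataset_type, imdb_file_index, config, *args, **kwargs)
        self._name = "my_caption_dataset"


@registry.register_builder("my_caption_dataset")
class MyCaptionBuilder(COCOBuilder):
    # Hypothetical builder: the registry key is the dataset name used under
    # dataset_attributes in the YAML configs above
    def __init__(self):
        super().__init__()
        self.dataset_name = "my_caption_dataset"
        self.set_dataset_class(MyCaptionDataset)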
-------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /pythia/datasets/captioning/m4c_textcaps/builder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from pythia.common.registry import Registry 3 | from pythia.datasets.captioning.m4c_textcaps.dataset import M4CTextCapsDataset 4 | from pythia.datasets.vqa.m4c_textvqa.builder import M4CTextVQABuilder 5 | 6 | 7 | @Registry.register_builder("m4c_textcaps") 8 | class M4CTextCapsBuilder(M4CTextVQABuilder): 9 | def __init__(self): 10 | super().__init__() 11 | self.dataset_name = "m4c_textcaps" 12 | self.set_dataset_class(M4CTextCapsDataset) 13 | -------------------------------------------------------------------------------- /pythia/datasets/captioning/m4c_textcaps/dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from pythia.datasets.vqa.m4c_textvqa.dataset import M4CTextVQADataset 3 | from pythia.utils.objects_to_byte_tensor import enc_obj2bytes 4 | 5 | 6 | class M4CTextCapsDataset(M4CTextVQADataset): 7 | def __init__(self, dataset_type, imdb_file_index, config, *args, **kwargs): 8 | super().__init__( 9 | dataset_type, imdb_file_index, config, *args, **kwargs 10 | ) 11 | self._name = "m4c_textcaps" 12 | 13 | def preprocess_sample_info(self, sample_info): 14 | # add dummy questions to train with M4C (for TextVQA) 15 | sample_info['question_str'] = '' # empty question 16 | sample_info['question_id'] = sample_info['caption_id'] 17 | return sample_info 18 | 19 | def postprocess_evalai_entry(self, entry): 20 | new_entry = { 21 | 'caption_id': entry['question_id'], 22 | 'image_id': entry['image_id'], 23 | 'caption': entry['answer'], 24 | 'pred_source': entry['pred_source'], 25 | } 26 | return new_entry 27 | 28 | def add_answer_info(self, sample_info, sample): 29 | sample_has_caption = ('caption_str' in sample_info) 30 | if sample_has_caption: 31 | sample_info['answers'] = [sample_info['caption_str']] 32 | 33 | sample = super().add_answer_info(sample_info, sample) 34 | 35 | if sample_has_caption: 36 | sample.caption_str = enc_obj2bytes(sample_info['caption_str']) 37 | sample.ref_strs = enc_obj2bytes(sample_info['reference_strs']) 38 | sample.pop('gt_answers_enc') 39 | 40 | return sample 41 | -------------------------------------------------------------------------------- /pythia/datasets/concat_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | import functools 3 | import types 4 | 5 | from torch.utils.data import ConcatDataset 6 | 7 | 8 | class PythiaConcatDataset(ConcatDataset): 9 | # These functions should only be called once even if they return nothing 10 | _SINGLE_CALL_FUNCS = [] 11 | 12 | def __init__(self, datasets): 13 | super().__init__(datasets) 14 | self._dir_representation = dir(self) 15 | 16 | def __getattr__(self, name): 17 | if name in self._dir_representation: 18 | return getattr(self, name) 19 | elif hasattr(self.datasets[0], name): 20 | attr = getattr(self.datasets[0], name) 21 | # Check if the current attribute is class method function 22 | if isinstance(attr, types.MethodType): 23 | # if it is the, we to call this function for 24 | # each of the child datasets 25 | attr = functools.partial(self._call_all_datasets_func, name) 26 | return attr 27 | else: 28 | raise AttributeError(name) 29 | 30 | def _get_single_call_funcs(self): 31 | return PythiaConcatDataset._SINGLE_CALL_FUNCS 32 | 33 | def _call_all_datasets_func(self, name, *args, **kwargs): 34 | for dataset in self.datasets: 35 | value = getattr(dataset, name)(*args, **kwargs) 36 | if value is not None: 37 | # TODO: Log a warning here 38 | return value 39 | # raise RuntimeError("Functions returning values can't be " 40 | # "called through PythiaConcatDataset") 41 | if ( 42 | hasattr(dataset, "get_single_call_funcs") 43 | and name in dataset.get_single_call_funcs() 44 | ): 45 | return 46 | -------------------------------------------------------------------------------- /pythia/datasets/dialog/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /pythia/datasets/dialog/visual_dialog/config.yml: -------------------------------------------------------------------------------- 1 | task_attributes: 2 | data_root_dir: data 3 | batch_size: 10 4 | vocab_file: visdial/visdial_vocabulary.txt 5 | max_seq_len: 20 6 | max_history_len: 300 7 | embedding_name: glove.6B.300d 8 | image_depth_first: false 9 | image_fast_reader: false 10 | image_feat_test: 11 | - /checkpoint02/tinayujiang/features/visdial/detectron_23/fc6/ 12 | image_feat_train: 13 | - detec/detectron/fc6/vqa/train+val2014 14 | image_feat_val: 15 | - /checkpoint02/tinayujiang/features/visdial/detectron_23/fc6/ 16 | image_max_loc: 100 17 | imdb_file_test: 18 | - visdial/visdial_1.0_val_imdb.json 19 | imdb_file_train: 20 | - visdial/visdial_1.0_train_imdb.json 21 | imdb_file_val: 22 | - visdial/visdial_1.0_val_imdb.json 23 | num_workers: 12 24 | enforce_slow_reader: false 25 | metrics: 26 | - r@1 27 | - r@5 28 | - r@10 29 | - mean_r 30 | - mean_rr 31 | monitored_metric: 0 32 | metric_minimize: False 33 | should_early_stop: True 34 | exp_name: baseline 35 | loss: logit_bce 36 | lr_scheduler: true 37 | model_attributes: 38 | visdial_top_down_bottom_up: 39 | classifier: 40 | type: logit 41 | params: 42 | img_hidden_dim: 5000 43 | text_hidden_dim: 300 44 | image_embeddings: 45 | - modal_combine: 46 | type: non_linear_element_multiply 47 | params: 48 | dropout: 0 49 | hidden_dim: 5000 50 | normalization: softmax 51 | transform: 52 | type: linear 53 | params: 54 | out_dim: 1 55 | image_feature_dim: 2048 56 | image_feature_encodings: 57 | - type: finetune_faster_rcnn_fpn_fc7 58 | params: 59 | bias_file: detec/detectron/fc6/fc7_b.pkl 60 | weights_file: detec/detectron/fc6/fc7_w.pkl 61 | modal_combine: 62 | type: 
non_linear_element_multiply 63 | params: 64 | dropout: 0 65 | hidden_dim: 5000 66 | text_embeddings: 67 | - type: attention 68 | params: 69 | hidden_dim: 1024 70 | num_layers: 1 71 | conv1_out: 512 72 | conv2_out: 2 73 | dropout: 0 74 | embedding_dim: 300 75 | embedding_init_file: vqa2.0_glove.6B.300d.txt.npy 76 | kernel_size: 1 77 | padding: 0 78 | optimizer_attributes: 79 | type: Adamax 80 | params: 81 | eps: 1.0e-08 82 | lr: 0.01 83 | weight_decay: 0 84 | run: train+predict 85 | training_parameters: 86 | clip_norm_mode: all 87 | clip_gradients: true 88 | lr_ratio: 0.1 89 | lr_steps: 90 | - 15000 91 | - 18000 92 | - 20000 93 | - 21000 94 | max_grad_l2_norm: 0.25 95 | max_iterations: 22000 96 | log_interval: 100 97 | snapshot_interval: 3000 98 | wu_factor: 0.2 99 | wu_iters: 1000 100 | patience: 3500 101 | -------------------------------------------------------------------------------- /pythia/datasets/dialog/visual_dialog/scripts/extract_vocabulary.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import json 3 | 4 | from pythia.scripts.extract_vocabulary import ExtractVocabulary 5 | 6 | 7 | class ExtractVisdialVocabulary(ExtractVocabulary): 8 | def __init__(self): 9 | super(ExtractVisdialVocabulary, self).__init__() 10 | 11 | def get_text(self): 12 | text = [] 13 | 14 | for input_file in self.input_files: 15 | with open(input_file, "r") as f: 16 | f_json = json.load(f) 17 | # Add 'questions' from visdial 18 | text += f_json["data"]["questions"] 19 | # Add 'answers' from visdial 20 | text += f_json["data"]["answers"] 21 | 22 | for dialog in f_json["data"]["dialogs"]: 23 | text += [dialog["caption"]] 24 | return text 25 | 26 | 27 | if __name__ == "__main__": 28 | extractor = ExtractVisdialVocabulary() 29 | extractor.extract() 30 | -------------------------------------------------------------------------------- /pythia/datasets/samplers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved. 2 | # Code is copy-pasted exactly as in torch.utils.data.distributed. 3 | # FIXME remove this once c10d fixes the bug it has 4 | import math 5 | import torch 6 | import torch.distributed as dist 7 | from torch.utils.data.sampler import Sampler 8 | 9 | 10 | class DistributedSampler(Sampler): 11 | """Sampler that restricts data loading to a subset of the dataset. 12 | It is especially useful in conjunction with 13 | :class:`torch.nn.parallel.DistributedDataParallel`. In such case, each 14 | process can pass a DistributedSampler instance as a DataLoader sampler, 15 | and load a subset of the original dataset that is exclusive to it. 16 | .. note:: 17 | Dataset is assumed to be of constant size. 18 | Arguments: 19 | dataset: Dataset used for sampling. 20 | num_replicas (optional): Number of processes participating in 21 | distributed training. 22 | rank (optional): Rank of the current process within num_replicas. 
23 | """ 24 | 25 | def __init__(self, dataset, num_replicas=None, rank=None, shuffle=True): 26 | if num_replicas is None: 27 | if not dist.is_available(): 28 | raise RuntimeError("Requires distributed package to be available") 29 | num_replicas = dist.get_world_size() 30 | if rank is None: 31 | if not dist.is_available(): 32 | raise RuntimeError("Requires distributed package to be available") 33 | rank = dist.get_rank() 34 | self.dataset = dataset 35 | self.num_replicas = num_replicas 36 | self.rank = rank 37 | self.epoch = 0 38 | self.num_samples = int(math.ceil(len(self.dataset) * 1.0 / self.num_replicas)) 39 | self.total_size = self.num_samples * self.num_replicas 40 | self.shuffle = shuffle 41 | 42 | def __iter__(self): 43 | if self.shuffle: 44 | # deterministically shuffle based on epoch 45 | g = torch.Generator() 46 | g.manual_seed(self.epoch) 47 | indices = torch.randperm(len(self.dataset), generator=g).tolist() 48 | else: 49 | indices = torch.arange(len(self.dataset)).tolist() 50 | 51 | # add extra samples to make it evenly divisible 52 | indices += indices[: (self.total_size - len(indices))] 53 | assert len(indices) == self.total_size 54 | 55 | # subsample 56 | offset = self.num_samples * self.rank 57 | indices = indices[offset : offset + self.num_samples] 58 | assert len(indices) == self.num_samples 59 | 60 | return iter(indices) 61 | 62 | def __len__(self): 63 | return self.num_samples 64 | 65 | def set_epoch(self, epoch): 66 | self.epoch = epoch 67 | -------------------------------------------------------------------------------- /pythia/datasets/scene_graph_database.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from pythia.datasets.image_database import ImageDatabase 3 | 4 | 5 | class SceneGraphDatabase(ImageDatabase): 6 | def __init__(self, scene_graph_path): 7 | super().__init__(scene_graph_path) 8 | self.data_dict = {} 9 | for item in self.data: 10 | self.data_dict[item["image_id"]] = item 11 | 12 | def __getitem__(self, idx): 13 | return self.data_dict[idx] 14 | -------------------------------------------------------------------------------- /pythia/datasets/vqa/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | -------------------------------------------------------------------------------- /pythia/datasets/vqa/clevr/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guanghuixu/AnchorCaptioner/3a49ce5de025087cbea00075ec0636aee0525382/pythia/datasets/vqa/clevr/__init__.py -------------------------------------------------------------------------------- /pythia/datasets/vqa/clevr/builder.py: -------------------------------------------------------------------------------- 1 | import json 2 | import math 3 | import os 4 | import zipfile 5 | from collections import Counter 6 | 7 | from pythia.common.registry import registry 8 | from pythia.common.constants import CLEVR_DOWNLOAD_URL 9 | from pythia.datasets.base_dataset_builder import BaseDatasetBuilder 10 | from pythia.datasets.vqa.clevr.dataset import CLEVRDataset 11 | from pythia.utils.general import download_file, get_pythia_root 12 | 13 | 14 | @registry.register_builder("clevr") 15 | class CLEVRBuilder(BaseDatasetBuilder): 16 | def __init__(self): 17 | super().__init__("clevr") 18 | self.writer = registry.get("writer") 19 | self.dataset_class = CLEVRDataset 20 | 21 | def _build(self, dataset_type, config): 22 | download_folder = os.path.join(get_pythia_root(), config.data_root_dir, config.data_folder) 23 | 24 | file_name = CLEVR_DOWNLOAD_URL.split("/")[-1] 25 | local_filename = os.path.join(download_folder, file_name) 26 | 27 | extraction_folder = os.path.join(download_folder, ".".join(file_name.split(".")[:-1])) 28 | self.data_folder = extraction_folder 29 | 30 | # Either if the zip file is already present or if there are some 31 | # files inside the folder we don't continue download process 32 | if os.path.exists(local_filename): 33 | self.writer.write("CLEVR dataset is already present. Skipping download.") 34 | return 35 | 36 | if os.path.exists(extraction_folder) and \ 37 | len(os.listdir(extraction_folder)) != 0: 38 | return 39 | 40 | self.writer.write("Downloading the CLEVR dataset now") 41 | download_file(CLEVR_DOWNLOAD_URL, output_dir=download_folder) 42 | 43 | self.writer.write("Downloaded. Extracting now. This can take time.") 44 | with zipfile.ZipFile(local_filename, "r") as zip_ref: 45 | zip_ref.extractall(download_folder) 46 | 47 | 48 | def _load(self, dataset_type, config, *args, **kwargs): 49 | self.dataset = CLEVRDataset( 50 | dataset_type, config, data_folder=self.data_folder 51 | ) 52 | return self.dataset 53 | 54 | def update_registry_for_model(self, config): 55 | registry.register( 56 | self.dataset_name + "_text_vocab_size", 57 | self.dataset.text_processor.get_vocab_size(), 58 | ) 59 | registry.register( 60 | self.dataset_name + "_num_final_outputs", 61 | self.dataset.answer_processor.get_vocab_size(), 62 | ) 63 | -------------------------------------------------------------------------------- /pythia/datasets/vqa/m4c_ocrvqa/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /pythia/datasets/vqa/m4c_ocrvqa/builder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | from pythia.common.registry import Registry 3 | from pythia.datasets.vqa.m4c_ocrvqa.dataset import M4COCRVQADataset 4 | from pythia.datasets.vqa.m4c_textvqa.builder import M4CTextVQABuilder 5 | 6 | 7 | @Registry.register_builder("m4c_ocrvqa") 8 | class M4COCRVQABuilder(M4CTextVQABuilder): 9 | def __init__(self): 10 | super().__init__() 11 | self.dataset_name = "m4c_ocrvqa" 12 | self.set_dataset_class(M4COCRVQADataset) 13 | -------------------------------------------------------------------------------- /pythia/datasets/vqa/m4c_ocrvqa/dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from pythia.datasets.vqa.m4c_textvqa.dataset import M4CTextVQADataset 3 | 4 | 5 | class M4COCRVQADataset(M4CTextVQADataset): 6 | def __init__(self, dataset_type, imdb_file_index, config, *args, **kwargs): 7 | super().__init__( 8 | dataset_type, imdb_file_index, config, *args, **kwargs 9 | ) 10 | self._name = "m4c_ocrvqa" 11 | -------------------------------------------------------------------------------- /pythia/datasets/vqa/m4c_stvqa/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /pythia/datasets/vqa/m4c_stvqa/builder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from pythia.common.registry import Registry 3 | from pythia.datasets.vqa.m4c_stvqa.dataset import M4CSTVQADataset 4 | from pythia.datasets.vqa.m4c_textvqa.builder import M4CTextVQABuilder 5 | 6 | 7 | @Registry.register_builder("m4c_stvqa") 8 | class M4CSTVQABuilder(M4CTextVQABuilder): 9 | def __init__(self): 10 | super().__init__() 11 | self.dataset_name = "m4c_stvqa" 12 | self.set_dataset_class(M4CSTVQADataset) 13 | -------------------------------------------------------------------------------- /pythia/datasets/vqa/m4c_stvqa/dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from pythia.datasets.vqa.m4c_textvqa.dataset import M4CTextVQADataset 3 | 4 | 5 | class M4CSTVQADataset(M4CTextVQADataset): 6 | def __init__(self, dataset_type, imdb_file_index, config, *args, **kwargs): 7 | super().__init__( 8 | dataset_type, imdb_file_index, config, *args, **kwargs 9 | ) 10 | self._name = "m4c_stvqa" 11 | -------------------------------------------------------------------------------- /pythia/datasets/vqa/m4c_textvqa/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /pythia/datasets/vqa/m4c_textvqa/builder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | from pythia.common.registry import Registry 3 | from pythia.datasets.vqa.m4c_textvqa.dataset import M4CTextVQADataset 4 | from pythia.datasets.vqa.textvqa.builder import TextVQABuilder 5 | 6 | 7 | @Registry.register_builder("m4c_textvqa") 8 | class M4CTextVQABuilder(TextVQABuilder): 9 | def __init__(self): 10 | super().__init__() 11 | self.dataset_name = "m4c_textvqa" 12 | self.set_dataset_class(M4CTextVQADataset) 13 | -------------------------------------------------------------------------------- /pythia/datasets/vqa/textvqa/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /pythia/datasets/vqa/textvqa/builder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from pythia.common.registry import Registry 3 | from pythia.datasets.vqa.textvqa.dataset import TextVQADataset 4 | from pythia.datasets.vqa.vizwiz import VizWizBuilder 5 | 6 | 7 | @Registry.register_builder("textvqa") 8 | class TextVQABuilder(VizWizBuilder): 9 | def __init__(self): 10 | super().__init__() 11 | self.dataset_name = "textvqa" 12 | self.set_dataset_class(TextVQADataset) 13 | -------------------------------------------------------------------------------- /pythia/datasets/vqa/textvqa/dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from pythia.datasets.vqa.vizwiz import VizWizDataset 3 | from pythia.utils.text_utils import word_tokenize 4 | 5 | 6 | class TextVQADataset(VizWizDataset): 7 | def __init__(self, dataset_type, imdb_file_index, config, *args, **kwargs): 8 | super().__init__(dataset_type, imdb_file_index, config, *args, **kwargs) 9 | self._name = "textvqa" 10 | 11 | def format_for_evalai(self, report): 12 | answers = report.scores.argmax(dim=1) 13 | 14 | predictions = [] 15 | answer_space_size = self.answer_processor.get_true_vocab_size() 16 | 17 | for idx, question_id in enumerate(report.question_id): 18 | answer_id = answers[idx].item() 19 | print(answer_id, idx, len(answers), len(report.question_id), len(report.context_tokens)) 20 | if answer_id >= answer_space_size: 21 | answer_id -= answer_space_size 22 | answer = word_tokenize(report.context_tokens[idx][answer_id]) 23 | else: 24 | answer = self.answer_processor.idx2word(answer_id) 25 | 26 | predictions.append({"question_id": question_id.item(), "answer": answer}) 27 | return predictions 28 | -------------------------------------------------------------------------------- /pythia/datasets/vqa/vizwiz/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from .builder import VizWizBuilder 3 | from .dataset import VizWizDataset 4 | 5 | 6 | __all__ = ["VizWizBuilder", "VizWizDataset"] 7 | -------------------------------------------------------------------------------- /pythia/datasets/vqa/vizwiz/builder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | from pythia.common.registry import registry 3 | from pythia.datasets.vqa.vizwiz.dataset import VizWizDataset 4 | from pythia.datasets.vqa.vqa2 import VQA2Builder 5 | 6 | 7 | @registry.register_builder("vizwiz") 8 | class VizWizBuilder(VQA2Builder): 9 | def __init__(self): 10 | super().__init__() 11 | self.dataset_name = "vizwiz" 12 | self.set_dataset_class(VizWizDataset) 13 | 14 | def update_registry_for_model(self, config): 15 | super().update_registry_for_model(config) 16 | -------------------------------------------------------------------------------- /pythia/datasets/vqa/vizwiz/dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import torch 3 | 4 | from pythia.common.sample import Sample 5 | from pythia.datasets.vqa.vqa2 import VQA2Dataset 6 | 7 | 8 | class VizWizDataset(VQA2Dataset): 9 | def __init__(self, dataset_type, imdb_file_index, config, *args, **kwargs): 10 | super().__init__(dataset_type, imdb_file_index, config, *args, **kwargs) 11 | 12 | # Update name as default would be 'vqa2' due to inheritance 13 | self._name = "vizwiz" 14 | 15 | def load_item(self, idx): 16 | sample = super().load_item(idx) 17 | 18 | sample_info = self.imdb[idx] 19 | 20 | if "image_name" in sample_info: 21 | sample.image_id = sample_info["image_name"] 22 | 23 | return sample 24 | 25 | def format_for_evalai(self, report): 26 | answers = report.scores.argmax(dim=1) 27 | 28 | predictions = [] 29 | answer_space_size = self.answer_processor.get_true_vocab_size() 30 | 31 | for idx, image_id in enumerate(report.image_id): 32 | answer_id = answers[idx].item() 33 | 34 | if answer_id >= answer_space_size: 35 | answer_id -= answer_space_size 36 | answer = report.context_tokens[idx][answer_id] 37 | else: 38 | answer = self.answer_processor.idx2word(answer_id) 39 | if answer == self.context_processor.PAD_TOKEN: 40 | answer = "unanswerable" 41 | predictions.append( 42 | { 43 | "image": "_".join(["VizWiz"] + image_id.split("_")[2:]) + ".jpg", 44 | "answer": answer, 45 | } 46 | ) 47 | 48 | return predictions 49 | -------------------------------------------------------------------------------- /pythia/datasets/vqa/vqa2/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | __all__ = ["VQA2Builder", "VQA2Dataset"] 3 | 4 | from .builder import VQA2Builder 5 | from .dataset import VQA2Dataset 6 | -------------------------------------------------------------------------------- /pythia/datasets/vqa/vqa2/builder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | 9 | import os 10 | import warnings 11 | 12 | from pythia.common.registry import registry 13 | from pythia.datasets.base_dataset_builder import BaseDatasetBuilder 14 | from pythia.datasets.concat_dataset import PythiaConcatDataset 15 | from pythia.datasets.vqa.vqa2.dataset import VQA2Dataset 16 | 17 | 18 | @registry.register_builder("vqa2") 19 | class VQA2Builder(BaseDatasetBuilder): 20 | def __init__(self): 21 | super().__init__("vqa2") 22 | self.dataset_class = VQA2Dataset 23 | 24 | def _load(self, dataset_type, config, *args, **kwargs): 25 | self.config = config 26 | 27 | image_features = config["image_features"]["train"][0].split(",") 28 | self.num_image_features = len(image_features) 29 | 30 | registry.register("num_image_features", self.num_image_features) 31 | 32 | self.dataset = self.prepare_data_set(dataset_type, config) 33 | 34 | return self.dataset 35 | 36 | def _build(self, dataset_type, config): 37 | # TODO: Build actually here 38 | return 39 | 40 | def update_registry_for_model(self, config): 41 | registry.register( 42 | self.dataset_name + "_text_vocab_size", 43 | self.dataset.text_processor.get_vocab_size(), 44 | ) 45 | registry.register( 46 | self.dataset_name + "_num_final_outputs", 47 | self.dataset.answer_processor.get_vocab_size(), 48 | ) 49 | 50 | def init_args(self, parser): 51 | parser.add_argument_group("VQA2 task specific arguments") 52 | parser.add_argument( 53 | "--data_root_dir", 54 | type=str, 55 | default="../data", 56 | help="Root directory for data", 57 | ) 58 | parser.add_argument( 59 | "-nfr", 60 | "--fast_read", 61 | type=bool, 62 | default=None, 63 | help="Disable fast read and load features on fly", 64 | ) 65 | 66 | def set_dataset_class(self, cls): 67 | self.dataset_class = cls 68 | 69 | def prepare_data_set(self, dataset_type, config): 70 | if dataset_type not in config.imdb_files: 71 | warnings.warn( 72 | "Dataset type {} is not present in " 73 | "imdb_files of dataset config. Returning None. " 74 | "This dataset won't be used.".format(dataset_type) 75 | ) 76 | return None 77 | 78 | imdb_files = config["imdb_files"][dataset_type] 79 | 80 | datasets = [] 81 | 82 | for imdb_idx in range(len(imdb_files)): 83 | cls = self.dataset_class 84 | dataset = cls(dataset_type, imdb_idx, config) 85 | datasets.append(dataset) 86 | 87 | dataset = PythiaConcatDataset(datasets) 88 | 89 | return dataset 90 | -------------------------------------------------------------------------------- /pythia/datasets/vqa/vqa2/ocr_builder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from pythia.common.registry import Registry 3 | from pythia.datasets.vqa.vizwiz import VizWizBuilder 4 | from pythia.datasets.vqa.vqa2.ocr_dataset import VQA2OCRDataset 5 | 6 | 7 | @Registry.register_builder("vqa2_ocr") 8 | class TextVQABuilder(VizWizBuilder): 9 | def __init__(self): 10 | super().__init__() 11 | self.dataset_name = "VQA2_OCR" 12 | self.set_dataset_class(VQA2OCRDataset) 13 | -------------------------------------------------------------------------------- /pythia/datasets/vqa/vqa2/ocr_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | from pythia.datasets.vqa.vizwiz import VizWizDataset 3 | from pythia.utils.text_utils import word_tokenize 4 | 5 | 6 | class VQA2OCRDataset(VizWizDataset): 7 | def __init__(self, imdb_file, image_feat_directories, verbose=False, **data_params): 8 | super(VQA2OCRDataset, self).__init__( 9 | imdb_file, image_feat_directories, verbose, **data_params 10 | ) 11 | self.name = "vqa2_ocr" 12 | 13 | def format_for_evalai(self, batch, answers): 14 | answers = answers.argmax(dim=1) 15 | 16 | predictions = [] 17 | for idx, question_id in enumerate(batch["question_id"]): 18 | answer_id = answers[idx] 19 | 20 | if answer_id >= self.answer_space_size: 21 | answer_id -= self.answer_space_size 22 | answer = word_tokenize(batch["ocr_tokens"][answer_id][idx]) 23 | else: 24 | answer = self.answer_dict.idx2word(answer_id) 25 | predictions.append({"question_id": question_id.item(), "answer": answer}) 26 | 27 | return predictions 28 | 29 | def __getitem__(self, idx): 30 | sample = super(VQA2OCRDataset, self).__getitem__(idx) 31 | 32 | if sample["question_id"] is None: 33 | sample["question_id"] = -1 34 | return sample 35 | -------------------------------------------------------------------------------- /pythia/legacy/best_model/config.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | batch_size: 512 3 | data_root_dir: data/ 4 | dataset: vqa_2.0 5 | image_depth_first: false 6 | image_fast_reader: false 7 | image_feat_test: 8 | - rcnn_10_100/test2015 9 | image_feat_train: 10 | - rcnn_10_100/train2014 11 | - rcnn_10_100/val2014 12 | image_feat_val: 13 | - rcnn_10_100/val2014 14 | image_max_loc: 100 15 | imdb_file_test: 16 | - imdb/imdb_test2015.npy 17 | imdb_file_train: 18 | - imdb/imdb_train2014.npy 19 | - imdb/imdb_val2014.npy 20 | imdb_file_val: 21 | - imdb/imdb_minival2014.npy 22 | num_workers: 5 23 | question_max_len: 14 24 | vocab_answer_file: answers_vqa.txt 25 | vocab_question_file: large_vocabulary_vqa.txt 26 | exp_name: baseline 27 | loss: logitBCE 28 | model: 29 | classifier: 30 | method: logit_classifier 31 | par: 32 | img_hidden_dim: 5000 33 | txt_hidden_dim: 300 34 | image_embedding_models: 35 | - modal_combine: 36 | method: non_linear_elmt_multiply 37 | par: 38 | dropout: 0 39 | hidden_size: 5000 40 | normalization: softmax 41 | transform: 42 | method: linear_transform 43 | par: 44 | out_dim: 1 45 | image_feat_dim: 2048 46 | image_feature_encoding: 47 | - method: default_image 48 | par: {} 49 | modal_combine: 50 | method: non_linear_elmt_multiply 51 | par: 52 | dropout: 0 53 | hidden_size: 5000 54 | question_embedding: 55 | - method: att_que_embed 56 | par: 57 | LSTM_hidden_size: 1024 58 | LSTM_layer: 1 59 | conv1_out: 512 60 | conv2_out: 2 61 | dropout: 0 62 | embedding_dim: 300 63 | embedding_init_file: large_vqa2.0_glove.6B.300d.txt.npy 64 | kernel_size: 1 65 | padding: 0 66 | optimizer: 67 | method: Adamax 68 | par: 69 | eps: 1.0e-08 70 | lr: 0.01 71 | weight_decay: 0 72 | run: train+predict 73 | training_parameters: 74 | clip_norm_mode: all 75 | lr_ratio: 0.1 76 | lr_steps: 77 | - 15000 78 | - 18000 79 | - 20000 80 | - 21000 81 | max_grad_l2_norm: 0.25 82 | max_iter: 22000 83 | report_interval: 100 84 | snapshot_interval: 1000 85 | wu_factor: 0.2 86 | wu_iters: 1000 87 | -------------------------------------------------------------------------------- /pythia/legacy/config/collections.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | 9 | """A simple attribute dictionary used for representing configuration \ 10 | options.""" 11 | 12 | 13 | class AttrDict(dict): 14 | 15 | IMMUTABLE = "__immutable__" 16 | 17 | def __init__(self, *args, **kwargs): 18 | super(AttrDict, self).__init__(*args, **kwargs) 19 | self.__dict__[AttrDict.IMMUTABLE] = False 20 | 21 | def __getattr__(self, name): 22 | if name in self.__dict__: 23 | return self.__dict__[name] 24 | elif name in self: 25 | return self[name] 26 | else: 27 | raise AttributeError(name) 28 | 29 | def __setattr__(self, name, value): 30 | if not self.__dict__[AttrDict.IMMUTABLE]: 31 | if name in self.__dict__: 32 | self.__dict__[name] = value 33 | else: 34 | self[name] = value 35 | else: 36 | raise AttributeError( 37 | 'Attempted to set "{}" to "{}", but AttrDict is immutable'.format( 38 | name, value 39 | ) 40 | ) 41 | 42 | def immutable(self, is_immutable): 43 | """Set immutability to is_immutable and recursively apply the setting 44 | to all nested AttrDicts. 45 | """ 46 | self.__dict__[AttrDict.IMMUTABLE] = is_immutable 47 | # Recursively set immutable state 48 | for v in self.__dict__.values(): 49 | if isinstance(v, AttrDict): 50 | v.immutable(is_immutable) 51 | for v in self.values(): 52 | if isinstance(v, AttrDict): 53 | v.immutable(is_immutable) 54 | 55 | def is_immutable(self): 56 | return self.__dict__[AttrDict.IMMUTABLE] 57 | -------------------------------------------------------------------------------- /pythia/legacy/config/demo/config.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | batch_size: 512 3 | data_root_dir: data/ 4 | dataset: vqa_2.0 5 | image_depth_first: false 6 | image_fast_reader: false 7 | image_feat_test: 8 | - demo/features/ 9 | image_feat_train: 10 | - rcnn_10_100/train2014 11 | - rcnn_10_100/val2014 12 | image_feat_val: 13 | - rcnn_10_100/val2014 14 | image_max_loc: 100 15 | imdb_file_test: 16 | - demo/imdb/imdb_demo.npy 17 | imdb_file_train: 18 | - imdb/imdb_train2014.npy 19 | - imdb/imdb_val2014.npy 20 | imdb_file_val: 21 | - imdb/imdb_minival2014.npy 22 | num_workers: 5 23 | question_max_len: 14 24 | vocab_answer_file: answers_vqa.txt 25 | vocab_question_file: large_vocabulary_vqa.txt 26 | exp_name: baseline 27 | loss: logitBCE 28 | model: 29 | classifier: 30 | method: logit_classifier 31 | par: 32 | img_hidden_dim: 5000 33 | txt_hidden_dim: 300 34 | image_embedding_models: 35 | - modal_combine: 36 | method: non_linear_elmt_multiply 37 | par: 38 | dropout: 0 39 | hidden_size: 5000 40 | normalization: softmax 41 | transform: 42 | method: linear_transform 43 | par: 44 | out_dim: 1 45 | image_feat_dim: 2048 46 | image_feature_encoding: 47 | - method: default_image 48 | par: {} 49 | modal_combine: 50 | method: non_linear_elmt_multiply 51 | par: 52 | dropout: 0 53 | hidden_size: 5000 54 | question_embedding: 55 | - method: att_que_embed 56 | par: 57 | LSTM_hidden_size: 1024 58 | LSTM_layer: 1 59 | conv1_out: 512 60 | conv2_out: 2 61 | dropout: 0 62 | embedding_dim: 300 63 | embedding_init_file: large_vqa2.0_glove.6B.300d.txt.npy 64 | kernel_size: 1 65 | padding: 0 66 | optimizer: 67 | method: Adamax 68 | par: 69 | eps: 1.0e-08 70 | lr: 0.01 71 | weight_decay: 0 72 | run: train+predict 73 | training_parameters: 74 | clip_norm_mode: all 75 | lr_ratio: 0.1 76 | lr_steps: 77 | - 15000 78 | - 18000 79 | - 20000 80 | - 21000 81 | 
max_grad_l2_norm: 0.25 82 | max_iter: 22000 83 | report_interval: 100 84 | snapshot_interval: 1000 85 | wu_factor: 0.2 86 | wu_iters: 1000 87 | -------------------------------------------------------------------------------- /pythia/legacy/config/keep/MFH_ft.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | data_root_dir: data 3 | image_feat_test: 4 | - detectron/fc6/vqa/test2015 5 | image_feat_train: 6 | - detectron_23/fc6/vqa/train2014 7 | - detectron_23/fc6/vqa/val2014 8 | image_feat_val: 9 | - detectron_23/fc6/vqa/val2014 10 | image_max_loc: 100 11 | imdb_file_test: 12 | - imdb/imdb_test2015.npy 13 | imdb_file_train: 14 | - imdb/imdb_train2014.npy 15 | - imdb/imdb_val2train2014.npy 16 | imdb_file_val: 17 | - imdb/imdb_minival2014.npy 18 | batch_size: 512 19 | loss: softmaxKL 20 | model: 21 | image_feature_encoding: 22 | - method: finetune_faster_rcnn_fpn_fc7 23 | par: 24 | weights_file: detectron/fc6/fc7_w.pkl 25 | bias_file: detectron/fc6/fc7_b.pkl 26 | classifier: 27 | method: linear_classifier 28 | image_embedding_models: 29 | - modal_combine: 30 | method: MFH 31 | par: 32 | order: 1 33 | hidden_sizes: 34 | - 5000 35 | dropout: 0.1 36 | pool_size: 5 37 | normalization: softmax 38 | transform: 39 | method: conv_transform 40 | modal_combine: 41 | method: MFH 42 | par: 43 | order: 2 44 | hidden_sizes: 45 | - 5000 46 | - 5000 47 | dropout: 0.1 48 | pool_size: 5 49 | -------------------------------------------------------------------------------- /pythia/legacy/config/keep/detectron.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | data_root_dir: data 3 | image_feat_test: 4 | - detectron/fc6/vqa/test2015 5 | image_feat_train: 6 | - detectron/fc6/vqa/train2014 7 | - detectron/fc6/vqa/val2014 8 | image_feat_val: 9 | - detectron/fc6/vqa/val2014 10 | image_max_loc: 100 11 | imdb_file_test: 12 | - imdb/imdb_test2015.npy 13 | imdb_file_train: 14 | - imdb/imdb_train2014.npy 15 | - imdb/imdb_val2train2014.npy 16 | imdb_file_val: 17 | - imdb/imdb_minival2014.npy 18 | model: 19 | image_feature_encoding: 20 | - method: finetune_faster_rcnn_fpn_fc7 21 | par: 22 | weights_file: detectron/fc6/fc7_w.pkl 23 | bias_file: detectron/fc6/fc7_b.pkl 24 | -------------------------------------------------------------------------------- /pythia/legacy/config/verbose/MFH_module.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | batch_size: 512 3 | data_root_dir: data 4 | dataset: vqa_2.0 5 | image_depth_first: false 6 | image_fast_reader: true 7 | image_feat_test: 8 | - detectron/fc6/vqa/test2015 9 | image_feat_train: 10 | - detectron/fc6/vqa/train2014 11 | - detectron/fc6/vqa/val2014 12 | image_feat_val: 13 | - detectron/fc6/vqa/val2014 14 | image_max_loc: 100 15 | imdb_file_test: 16 | - imdb/imdb_test2015.npy 17 | imdb_file_train: 18 | - imdb/imdb_train2014.npy 19 | - imdb/imdb_val2train2014.npy 20 | imdb_file_val: 21 | - imdb/imdb_minival2014.npy 22 | num_workers: 5 23 | question_max_len: 14 24 | vocab_answer_file: answers_vqa.txt 25 | vocab_question_file: vocabulary_vqa.txt 26 | exp_name: baseline 27 | loss: softmaxKL 28 | model: 29 | classifier: 30 | method: linear_classifier 31 | par: {} 32 | image_embedding_models: 33 | - modal_combine: 34 | method: MFH 35 | par: 36 | dropout: 0.1 37 | hidden_sizes: 38 | - 5000 39 | - 5000 40 | order: 2 41 | pool_size: 5 42 | normalization: softmax 43 | transform: 44 | method: conv_transform 45 | par: 46 | hidden_dim: 512 
47 | out_dim: 2 48 | image_feat_dim: 2048 49 | image_feature_encoding: 50 | - method: finetune_faster_rcnn_fpn_fc7 51 | par: 52 | bias_file: detectron/fc6/fc7_b.pkl 53 | weights_file: detectron/fc6/fc7_w.pkl 54 | modal_combine: 55 | method: MFH 56 | par: 57 | dropout: 0.1 58 | hidden_sizes: 59 | - 5000 60 | - 5000 61 | order: 2 62 | pool_size: 5 63 | question_embedding: 64 | - method: att_que_embed 65 | par: 66 | LSTM_hidden_size: 1024 67 | LSTM_layer: 1 68 | conv1_out: 512 69 | conv2_out: 2 70 | dropout: 0 71 | embedding_dim: 300 72 | embedding_init_file: vqa2.0_glove.6B.300d.txt.npy 73 | kernel_size: 1 74 | padding: 0 75 | optimizer: 76 | method: Adamax 77 | par: 78 | eps: 1.0e-08 79 | lr: 0.01 80 | weight_decay: 0 81 | run: train+predict 82 | training_parameters: 83 | clip_norm_mode: all 84 | lr_ratio: 0.1 85 | lr_steps: 86 | - 5000 87 | - 7000 88 | - 9000 89 | - 11000 90 | max_grad_l2_norm: 0.25 91 | max_iter: 12000 92 | report_interval: 100 93 | snapshot_interval: 1000 94 | wu_factor: 0.2 95 | wu_iters: 1000 96 | -------------------------------------------------------------------------------- /pythia/legacy/config/verbose/dectectron_finetune.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | batch_size: 512 3 | data_root_dir: data 4 | dataset: vqa_2.0 5 | image_depth_first: false 6 | image_fast_reader: true 7 | image_feat_test: 8 | - detectron/fc6/vqa/test2015 9 | image_feat_train: 10 | - detectron/fc6/vqa/train2014 11 | - detectron/fc6/vqa/val2014 12 | image_feat_val: 13 | - detectron/fc6/vqa/val2014 14 | image_max_loc: 100 15 | imdb_file_test: 16 | - imdb/imdb_test2015.npy 17 | imdb_file_train: 18 | - imdb/imdb_train2014.npy 19 | - imdb/imdb_val2train2014.npy 20 | imdb_file_val: 21 | - imdb/imdb_minival2014.npy 22 | num_workers: 5 23 | question_max_len: 14 24 | vocab_answer_file: answers_vqa.txt 25 | vocab_question_file: vocabulary_vqa.txt 26 | exp_name: baseline 27 | loss: logitBCE 28 | model: 29 | classifier: 30 | method: logit_classifier 31 | par: 32 | img_hidden_dim: 5000 33 | txt_hidden_dim: 300 34 | image_embedding_models: 35 | - modal_combine: 36 | method: non_linear_elmt_multiply 37 | par: 38 | dropout: 0 39 | hidden_size: 5000 40 | normalization: softmax 41 | transform: 42 | method: linear_transform 43 | par: 44 | out_dim: 1 45 | image_feat_dim: 2048 46 | image_feature_encoding: 47 | - method: finetune_faster_rcnn_fpn_fc7 48 | par: 49 | bias_file: detectron/fc6/fc7_b.pkl 50 | weights_file: detectron/fc6/fc7_w.pkl 51 | modal_combine: 52 | method: non_linear_elmt_multiply 53 | par: 54 | dropout: 0 55 | hidden_size: 5000 56 | question_embedding: 57 | - method: att_que_embed 58 | par: 59 | LSTM_hidden_size: 1024 60 | LSTM_layer: 1 61 | conv1_out: 512 62 | conv2_out: 2 63 | dropout: 0 64 | embedding_dim: 300 65 | embedding_init_file: vqa2.0_glove.6B.300d.txt.npy 66 | kernel_size: 1 67 | padding: 0 68 | optimizer: 69 | method: Adamax 70 | par: 71 | eps: 1.0e-08 72 | lr: 0.01 73 | weight_decay: 0 74 | run: train+predict 75 | training_parameters: 76 | clip_norm_mode: all 77 | lr_ratio: 0.1 78 | lr_steps: 79 | - 5000 80 | - 7000 81 | - 9000 82 | - 11000 83 | max_grad_l2_norm: 0.25 84 | max_iter: 12000 85 | report_interval: 100 86 | snapshot_interval: 1000 87 | wu_factor: 0.2 88 | wu_iters: 1000 89 | -------------------------------------------------------------------------------- /pythia/legacy/config/verbose/default.yaml: -------------------------------------------------------------------------------- 1 | data: 2 | batch_size: 512 3 
| data_root_dir: data 4 | dataset: vqa_2.0 5 | image_depth_first: false 6 | image_fast_reader: true 7 | image_feat_test: 8 | - rcnn_10_100/vqa/test2015 9 | image_feat_train: 10 | - rcnn_10_100/vqa/train2014 11 | - rcnn_10_100/vqa/val2014 12 | image_feat_val: 13 | - rcnn_10_100/vqa/val2014 14 | image_max_loc: 100 15 | imdb_file_test: 16 | - imdb/imdb_test2015.npy 17 | imdb_file_train: 18 | - imdb/imdb_train2014.npy 19 | - imdb/imdb_val2train2014.npy 20 | imdb_file_val: 21 | - imdb/imdb_minival2014.npy 22 | num_workers: 5 23 | question_max_len: 14 24 | vocab_answer_file: answers_vqa.txt 25 | vocab_question_file: vocabulary_vqa.txt 26 | exp_name: baseline 27 | loss: logitBCE 28 | model: 29 | classifier: 30 | method: logit_classifier 31 | par: 32 | img_hidden_dim: 5000 33 | txt_hidden_dim: 300 34 | image_embedding_models: 35 | - modal_combine: 36 | method: non_linear_elmt_multiply 37 | par: 38 | dropout: 0 39 | hidden_size: 5000 40 | normalization: softmax 41 | transform: 42 | method: linear_transform 43 | par: 44 | out_dim: 1 45 | image_feat_dim: 2048 46 | image_feature_encoding: 47 | - method: default_image 48 | par: {} 49 | modal_combine: 50 | method: non_linear_elmt_multiply 51 | par: 52 | dropout: 0 53 | hidden_size: 5000 54 | question_embedding: 55 | - method: att_que_embed 56 | par: 57 | LSTM_hidden_size: 1024 58 | LSTM_layer: 1 59 | conv1_out: 512 60 | conv2_out: 2 61 | dropout: 0 62 | embedding_dim: 300 63 | embedding_init_file: vqa2.0_glove.6B.300d.txt.npy 64 | kernel_size: 1 65 | padding: 0 66 | optimizer: 67 | method: Adamax 68 | par: 69 | eps: 1.0e-08 70 | lr: 0.01 71 | weight_decay: 0 72 | run: train+predict 73 | training_parameters: 74 | clip_norm_mode: all 75 | lr_ratio: 0.1 76 | lr_steps: 77 | - 5000 78 | - 7000 79 | - 9000 80 | - 11000 81 | max_grad_l2_norm: 0.25 82 | max_iter: 12000 83 | report_interval: 100 84 | snapshot_interval: 1000 85 | wu_factor: 0.2 86 | wu_iters: 1000 87 | -------------------------------------------------------------------------------- /pythia/legacy/data/demo/features/COCO_test2015_000000000001.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guanghuixu/AnchorCaptioner/3a49ce5de025087cbea00075ec0636aee0525382/pythia/legacy/data/demo/features/COCO_test2015_000000000001.npy -------------------------------------------------------------------------------- /pythia/legacy/data/demo/images/COCO_test2015_000000000001.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guanghuixu/AnchorCaptioner/3a49ce5de025087cbea00075ec0636aee0525382/pythia/legacy/data/demo/images/COCO_test2015_000000000001.jpg -------------------------------------------------------------------------------- /pythia/legacy/data/demo/imdb/imdb_demo.npy: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guanghuixu/AnchorCaptioner/3a49ce5de025087cbea00075ec0636aee0525382/pythia/legacy/data/demo/imdb/imdb_demo.npy -------------------------------------------------------------------------------- /pythia/legacy/data_prep/data_preprocess.md: -------------------------------------------------------------------------------- 1 | #### VQA v2.0 2 | 3 | Download dataset 4 | ```bash 5 | cd ../ 6 | mkdir -p orig_data/vqa_v2.0 7 | cd orig_data/vqa_v2.0 8 | ./../../data_prep/vqa_v2.0/download_vqa_2.0.sh 9 | 10 | ``` 11 | 12 | Preprocess dataset 13 | ```bash 14 | cd ../../VQA_suite 15 | mkdir data 16 | 17 | 
export PYTHONPATH=. 18 | 19 | python data_prep/vqa_v2.0/extract_vocabulary.py \ 20 | --input_files ../orig_data/vqa_v2.0/v2_OpenEnded_mscoco_train2014_questions.json \ 21 | ../orig_data/vqa_v2.0/v2_OpenEnded_mscoco_val2014_questions.json \ 22 | ../orig_data/vqa_v2.0/v2_OpenEnded_mscoco_test2015_questions.json \ 23 | --out_dir data/ 24 | 25 | python data_prep/vqa_v2.0/process_answers.py \ 26 | --annotation_file ../orig_data/vqa_v2.0/v2_mscoco_train2014_annotations.json \ 27 | --val_annotation_file ../orig_data/vqa_v2.0/v2_mscoco_val2014_annotations.json \ 28 | --out_dir data/ --min_freq 9 29 | 30 | python data_prep/vqa_v2.0/extract_word_glove_embedding.py \ 31 | --vocabulary_file data/vocabulary_vqa.txt \ 32 | --glove_file ../orig_data/vqa_v2.0/glove/glove.6B.300d.txt \ 33 | --out_dir data/ 34 | 35 | python data_prep/vqa_v2.0/build_vqa_2.0_imdb.py --data_dir ../orig_data/vqa_v2.0/ --out_dir data/ 36 | 37 | ``` 38 | 39 | Download image features 40 | ```bash 41 | cd data/ 42 | wget https://dl.fbaipublicfiles.com/pythia/features/rcnn_10_100.tar.gz 43 | wget https://dl.fbaipublicfiles.com/pythia/features/detectron.tar.gz 44 | gunzip rcnn_10_100.tar.gz 45 | tar -xvf rcnn_10_100.tar 46 | rm -f rcnn_10_100.tar 47 | 48 | gunzip detectron.tar.gz 49 | tar -xvf detectron.tar 50 | rm -f detectron.tar 51 | ``` 52 | ### Extract Image Features 53 | 54 | We use detectron to extract image features. Set up [detectron](https://github.com/facebookresearch/Detectron) 55 | and copy [tools/extract_features.py](tools/extract_features.py) into detectron to extract the features. 56 | 57 | 58 | Feature extraction works best with commit #3a38b7b of [detectron](https://github.com/facebookresearch/Detectron) 59 | and commit #0dd3284 of [caffe2](https://github.com/caffe2/caffe2). 60 | 61 | 62 | Download the pretrained detectron model: 63 | ```bash 64 | wget https://dl.fbaipublicfiles.com/pythia/detectron_model/FAST_RCNN_MLP_DIM2048_FPN_DIM512.pkl 65 | wget https://dl.fbaipublicfiles.com/pythia/detectron_model/e2e_faster_rcnn_X-101-64x4d-FPN_1x_MLP_2048_FPN_512.yaml 66 | 67 | INPUT_DIR=/path/to/your/input  # an image file or a directory of images 68 | 69 | python extract_features.py --cfg e2e_faster_rcnn_X-101-64x4d-FPN_1x_MLP_2048_FPN_512.yaml \ 70 | --wts FAST_RCNN_MLP_DIM2048_FPN_DIM512.pkl \ 71 | --min_bboxes 100 --max_bboxes 100 \ 72 | --feat_name gpu_0/fc6 \ 73 | --output_dir ~/temp_out $INPUT_DIR 74 | ``` 75 | -------------------------------------------------------------------------------- /pythia/legacy/data_prep/vqa_v2.0/download_vqa_2.0.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | 4 | # GloVe Vectors 5 | wget http://nlp.stanford.edu/data/glove.6B.zip 6 | unzip glove.6B.zip -d glove 7 | rm glove.6B.zip 8 | 9 | 10 | ##VQA2.0 11 | 12 | wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Annotations_Train_mscoco.zip 13 | 14 | wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Annotations_Val_mscoco.zip 15 | 16 | wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_Train_mscoco.zip 17 | 18 | wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_Val_mscoco.zip 19 | 20 | wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Questions_Test_mscoco.zip 21 | 22 | wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Complementary_Pairs_Train_mscoco.zip 23 | 24 | wget https://s3.amazonaws.com/cvmlp/vqa/mscoco/vqa/v2_Complementary_Pairs_Val_mscoco.zip 25 | 26 | unzip v2_Annotations_Train_mscoco.zip 27 | rm v2_Annotations_Train_mscoco.zip 28 | 29 | unzip
v2_Annotations_Val_mscoco.zip 30 | rm v2_Annotations_Val_mscoco.zip 31 | 32 | unzip v2_Questions_Train_mscoco.zip 33 | rm v2_Questions_Train_mscoco.zip 34 | 35 | unzip v2_Questions_Val_mscoco.zip 36 | rm v2_Questions_Val_mscoco.zip 37 | 38 | unzip v2_Questions_Test_mscoco.zip 39 | rm v2_Questions_Test_mscoco.zip 40 | 41 | unzip v2_Complementary_Pairs_Train_mscoco.zip 42 | rm v2_Complementary_Pairs_Train_mscoco.zip 43 | 44 | unzip v2_Complementary_Pairs_Val_mscoco.zip 45 | rm v2_Complementary_Pairs_Val_mscoco.zip 46 | 47 | 48 | ### get minival and val2train 49 | wget https://dl.fbaipublicfiles.com/pythia/data/v2_OpenEnded_mscoco_minival2014_questions.json 50 | wget https://dl.fbaipublicfiles.com/pythia/data/v2_OpenEnded_mscoco_val2train2014_questions.json 51 | -------------------------------------------------------------------------------- /pythia/legacy/data_prep/vqa_v2.0/extract_ques_info.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | 9 | import argparse 10 | import json 11 | 12 | 13 | def extract_info(annotations, writer): 14 | for annotation in annotations: 15 | question_id = annotation["question_id"] 16 | answer_type = annotation["answer_type"] 17 | question_type = annotation["question_type"] 18 | multiple_choice_answer = annotation["multiple_choice_answer"] 19 | answers = [a["answer"] for a in annotation["answers"]] 20 | answers_out = "|".join([str(a) for a in answers]) 21 | confidences = [a["answer_confidence"] for a in annotation["answers"]] 22 | confidences_out = "|".join(str(a) for a in confidences) 23 | 24 | writer.write( 25 | str(question_id) 26 | + "\t" 27 | + question_type 28 | + "\t" 29 | + answer_type 30 | + "\t" 31 | + str(multiple_choice_answer) 32 | + "\t" 33 | + answers_out 34 | + "\t" 35 | + confidences_out 36 | + "\n" 37 | ) 38 | 39 | 40 | if __name__ == "__main__": 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument( 43 | "--annotation_files", 44 | nargs="+", 45 | required=True, 46 | help="input annotation json files, \ 47 | if more than 1, split by space", 48 | ) 49 | parser.add_argument("--out", type=str, required=True, help="out put files") 50 | 51 | args = parser.parse_args() 52 | out_writer = open(args.out, "w") 53 | 54 | for annotation_file in args.annotation_files: 55 | with open(annotation_file, "r") as f: 56 | annotations = json.load(f)["annotations"] 57 | extract_info(annotations, out_writer) 58 | 59 | out_writer.close() 60 | -------------------------------------------------------------------------------- /pythia/legacy/data_prep/vqa_v2.0/extract_vocabulary.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | 9 | import argparse 10 | import json 11 | import os 12 | from collections import Counter 13 | 14 | from dataset_utils.text_processing import tokenize 15 | 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument( 18 | "--input_files", 19 | nargs="+", 20 | required=True, 21 | help="input question json files, \ 22 | if more than 1, split by space", 23 | ) 24 | parser.add_argument( 25 | "--out_dir", 26 | type=str, 27 | default="./", 28 | help="output directory, default is current directory", 29 | ) 30 | parser.add_argument( 31 | "--min_freq", 32 | type=int, 33 | default=0, 34 | help="the minimum times of word occurrence \ 35 | to be included in vocabulary, default 0", 36 | ) 37 | 38 | args = parser.parse_args() 39 | 40 | input_files = args.input_files 41 | out_dir = args.out_dir 42 | min_freq = args.min_freq 43 | 44 | os.makedirs(out_dir, exist_ok=True) 45 | 46 | vocab_file_name = "vocabulary_vqa.txt" 47 | 48 | word_count = Counter() 49 | questions = [] 50 | 51 | for idx, input_file in enumerate(input_files): 52 | with open(input_file, "r") as f: 53 | questions += json.load(f)["questions"] 54 | 55 | question_length = [None] * len(questions) 56 | 57 | for inx, question in enumerate(questions): 58 | words = tokenize(question["question"]) 59 | question_length[inx] = len(words) 60 | word_count.update(words) 61 | 62 | vocabulary = [w[0] for w in word_count.items() if w[1] >= min_freq] 63 | vocabulary.sort() 64 | vocabulary = ["<unk>"] + vocabulary 65 | 66 | vocab_file = os.path.join(out_dir, vocab_file_name) 67 | with open(vocab_file, "w") as f: 68 | f.writelines([w + "\n" for w in vocabulary]) 69 | 70 | 71 | print("min question len=", min(question_length)) 72 | print("max question len=", max(question_length)) 73 | -------------------------------------------------------------------------------- /pythia/legacy/data_prep/vqa_v2.0/extract_word_glove_embedding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree.
6 | # 7 | 8 | 9 | import argparse 10 | import os 11 | 12 | import numpy as np 13 | 14 | from dataset_utils.text_processing import VocabDict 15 | 16 | 17 | def subset_weights(glove_file, vocabulary_file): 18 | with open(glove_file, "r") as f: 19 | entries = f.readlines() 20 | emb_dim = len(entries[0].split(" ")) - 1 21 | print("embedding dim is %d" % emb_dim) 22 | 23 | vocabulary = VocabDict(vocab_file=vocabulary_file) 24 | 25 | weights = np.zeros((vocabulary.num_vocab, emb_dim), dtype=np.float32) 26 | 27 | word2emb = {} 28 | for entry in entries: 29 | vals = entry.split(" ") 30 | word = vals[0] 31 | vals = np.array(list(map(float, vals[1:]))) 32 | word2emb[word] = np.array(vals) 33 | 34 | for word, idx in vocabulary.word2idx_dict.items(): 35 | if word not in word2emb: 36 | continue 37 | weights[idx] = word2emb[word] 38 | 39 | return weights 40 | 41 | 42 | if __name__ == "__main__": 43 | parser = argparse.ArgumentParser() 44 | parser.add_argument( 45 | "--vocabulary_file", 46 | type=str, 47 | required=True, 48 | help="input train annotationjson file", 49 | ) 50 | parser.add_argument( 51 | "--glove_file", 52 | type=str, 53 | required=True, 54 | help="glove files with the corresponding dim", 55 | ) 56 | parser.add_argument( 57 | "--out_dir", 58 | type=str, 59 | default="./", 60 | help="output directory, default is current directory", 61 | ) 62 | 63 | args = parser.parse_args() 64 | 65 | glove_file = args.glove_file 66 | vocabulary_file = args.vocabulary_file 67 | out_dir = args.out_dir 68 | 69 | os.makedirs(out_dir, exist_ok=True) 70 | emb_file_name = "vqa2.0_" + os.path.basename(glove_file) + ".npy" 71 | 72 | weights = subset_weights(glove_file, vocabulary_file) 73 | 74 | emb_file = os.path.join(out_dir, emb_file_name) 75 | np.save(emb_file, weights) 76 | -------------------------------------------------------------------------------- /pythia/legacy/data_prep/vqa_v2.0/genome_ids.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guanghuixu/AnchorCaptioner/3a49ce5de025087cbea00075ec0636aee0525382/pythia/legacy/data_prep/vqa_v2.0/genome_ids.pkl -------------------------------------------------------------------------------- /pythia/legacy/dataset_utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | -------------------------------------------------------------------------------- /pythia/legacy/dataset_utils/create_imdb_header.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | 9 | import datetime 10 | 11 | from global_variables.global_variables import imdb_version 12 | 13 | 14 | def create_header(dataset_name, has_answer, has_gt_layout): 15 | now = datetime.datetime.now() 16 | time = now.strftime("%Y-%m-%d %H:%M") 17 | version = imdb_version 18 | header = dict( 19 | create_time=time, 20 | dataset_name=dataset_name, 21 | version=version, 22 | has_answer=has_answer, 23 | has_gt_layout=has_gt_layout, 24 | ) 25 | return header 26 | -------------------------------------------------------------------------------- /pythia/legacy/dataset_utils/text_processing.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | 9 | import re 10 | 11 | SENTENCE_SPLIT_REGEX = re.compile(r"(\W+)") 12 | 13 | 14 | def tokenize(sentence): 15 | sentence = sentence.lower() 16 | sentence = sentence.replace(",", "").replace("?", "").replace("'s", " 's") 17 | tokens = SENTENCE_SPLIT_REGEX.split(sentence) 18 | tokens = [t.strip() for t in tokens if len(t.strip()) > 0] 19 | return tokens 20 | 21 | 22 | def load_str_list(fname): 23 | with open(fname) as f: 24 | lines = f.readlines() 25 | lines = [l.strip() for l in lines] 26 | return lines 27 | 28 | 29 | class VocabDict: 30 | def __init__(self, vocab_file): 31 | self.word_list = load_str_list(vocab_file) 32 | self.word2idx_dict = {w: n_w for n_w, w in enumerate(self.word_list)} 33 | self.num_vocab = len(self.word_list) 34 | self.UNK_idx = ( 35 | self.word2idx_dict["<unk>"] if "<unk>" in self.word2idx_dict else None 36 | ) 37 | 38 | def idx2word(self, n_w): 39 | return self.word_list[n_w] 40 | 41 | def word2idx(self, w): 42 | if w in self.word2idx_dict: 43 | return self.word2idx_dict[w] 44 | elif self.UNK_idx is not None: 45 | return self.UNK_idx 46 | else: 47 | raise ValueError( 48 | "word %s not in dictionary \ 49 | (while dictionary does not contain <unk>)" 50 | % w 51 | ) 52 | 53 | def tokenize_and_index(self, sentence): 54 | inds = [self.word2idx(w) for w in tokenize(sentence)] 55 | return inds 56 | -------------------------------------------------------------------------------- /pythia/legacy/dataset_utils/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | 9 | import numpy as np 10 | 11 | 12 | def unique_columns(data): 13 | dt = np.dtype((np.void, data.dtype.itemsize * data.shape[0])) 14 | dataf = np.asfortranarray(data).view(dt) 15 | u, uind = np.unique(dataf, return_inverse=True) 16 | m = u.view(data.dtype).reshape(-1, data.shape[0]).T 17 | res = [np.where(uind == x)[0] for x in range(m.shape[1])] 18 | return res 19 | -------------------------------------------------------------------------------- /pythia/legacy/dataset_utils/vqa_collates.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree.
6 | # 7 | 8 | 9 | import numpy as np 10 | from torch.utils.data.dataloader import default_collate 11 | 12 | 13 | def filter_unk_collate(batch): 14 | batch = list(filter(lambda x: np.sum(x["ans_scores"]) > 0, batch)) 15 | return default_collate(batch) 16 | -------------------------------------------------------------------------------- /pythia/legacy/dataset_utils/vqa_concate_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | 9 | from torch.utils.data import ConcatDataset 10 | 11 | 12 | class vqa_concate_dataset(ConcatDataset): 13 | def __init__(self, datasets): 14 | super(vqa_concate_dataset, self).__init__(datasets) 15 | self.vocab_dict = datasets[0].vocab_dict 16 | self.answer_dict = datasets[0].answer_dict 17 | -------------------------------------------------------------------------------- /pythia/legacy/dataset_utils/vqa_html_writer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | 9 | html_header = """ 10 | 11 | W3.CSS Template 12 | 13 | 14 | 18 | 19 | 20 |
22 | 23 | """ 24 | 25 | html_footer = """ 26 | </body> 27 | </html> 28 | """ 29 | 30 | row_header = """ 31 | <div class="w3-row"> 32 | """ 33 | 34 | element_header = """ 35 | <div class="w3-quarter"> 36 | """ 37 | 38 | 39 | class vqa_html_writer: 40 | def __init__(self, file_path, elements_per_row=4): 41 | self._writer = open(file_path, "w") 42 | self._writer.write(html_header) 43 | self.count = 0 44 | self.elements_per_row = elements_per_row 45 | 46 | def write_element(self, image, **kwarg): 47 | if self.count % self.elements_per_row == 0: 48 | self._writer.write(row_header + "\n") 49 | self._writer.write(element_header) 50 | self._writer.write('<img src="%s">' % image) 51 | for key, value in kwarg.items(): 52 | self._writer.write("<div><b>%s</b> : %s</div>" % (key, value)) 53 | self._writer.write("</div>") 54 | self.count += 1 55 | if self.count % self.elements_per_row == 0 and self.count > 0: 56 | self._writer.write("</div>") 57 | 58 | def close(self): 59 | if self.count % self.elements_per_row != 0: 60 | self._writer.write("</div>
") 61 | self._writer.write(html_footer) 62 | self._writer.close() 63 | 64 | 65 | if __name__ == "__main__": 66 | html_writer = vqa_html_writer("/Users/tinayujiang/temp/test.html", 4) 67 | n = 10 68 | for i in range(10): 69 | image_path = ( 70 | "/Users/tinayujiang/work/VQA/data_analysis/val2014/" 71 | + "COCO_val2014_000000290951.jpg" 72 | ) 73 | info = {"question": "abcfs efc?", "answers": " wdds cdsde"} 74 | html_writer.write_element(image_path, **info) 75 | 76 | html_writer.close() 77 | -------------------------------------------------------------------------------- /pythia/legacy/ensemble.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | 9 | import argparse 10 | import glob 11 | import json 12 | 13 | import numpy as np 14 | 15 | import _pickle as pickle 16 | from train_model.helper import print_result 17 | 18 | 19 | def parse_args(): 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument("--out", type=str, required=True, help="output file name") 22 | parser.add_argument( 23 | "--res_dirs", 24 | nargs="+", 25 | help="directories for results, NOTE:" 26 | "all *.pkl file under these dirs will be ensembled", 27 | default=None, 28 | ) 29 | argments = parser.parse_args() 30 | 31 | return argments 32 | 33 | 34 | class answer_json: 35 | def __init__(self): 36 | self.answers = [] 37 | 38 | def add(self, ques_id, ans): 39 | res = {"question_id": ques_id, "answer": ans} 40 | self.answers.append(res) 41 | 42 | 43 | if __name__ == "__main__": 44 | 45 | args = parse_args() 46 | result_dirs = args.res_dirs 47 | out_file = args.out 48 | question_ids = None 49 | soft_max_result = None 50 | ans_dic = None 51 | cnt = 0 52 | for res_dir in result_dirs: 53 | for file in glob.glob(res_dir + "/**/*.pkl", recursive=True): 54 | with open(file, "rb") as f: 55 | cnt += 1 56 | sm = pickle.load(f) 57 | if soft_max_result is None: 58 | soft_max_result = sm 59 | question_ids = pickle.load(f) 60 | ans_dic = pickle.load(f) 61 | else: 62 | soft_max_result += sm 63 | 64 | print("ensemble total %d models" % cnt) 65 | 66 | predicted_answers = np.argmax(soft_max_result, axis=1) 67 | 68 | pkl_file = out_file + ".pkl" 69 | 70 | print_result(question_ids, soft_max_result, ans_dic, out_file, False, pkl_file) 71 | 72 | print("Done") 73 | -------------------------------------------------------------------------------- /pythia/legacy/eval_model/eval_demo.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | 9 | import json 10 | import sys 11 | 12 | from eval_model.vqaEval import VQAEval 13 | 14 | 15 | def parse_annotation(anno_file): 16 | with open(anno_file, "r") as f: 17 | annotations = json.load(f)["annotations"] 18 | 19 | q_2_anno = dict([(a["question_id"], a) for a in annotations]) 20 | return q_2_anno 21 | 22 | 23 | def parse_ans(answ_file): 24 | with open(answ_file, "r") as f: 25 | answers = json.load(f) 26 | 27 | q_2_answ = dict([(a["question_id"], a) for a in answers]) 28 | return q_2_answ 29 | 30 | 31 | if __name__ == "__main__": 32 | if len(sys.argv) < 3: 33 | exit( 34 | "USAGE: python eval_model/eval_demo.py \ 35 | annotation_json_file answer_json_file" 36 | ) 37 | 38 | anno_file = sys.argv[1] 39 | answ_file = sys.argv[2] 40 | 41 | q_2_anno = parse_annotation(anno_file) 42 | q_2_answ = parse_ans(answ_file) 43 | 44 | eval = VQAEval(q_2_anno, q_2_answ, 2) 45 | eval.evaluate() 46 | acc = eval.accuracy 47 | print( 48 | "overall: %.2f" % acc["overall"], 49 | "yes/no: %f" % acc["perAnswerType"]["yes/no"], 50 | "number: %.2f" % acc["perAnswerType"]["number"], 51 | "other: %.2f" % acc["perAnswerType"]["other"], 52 | ) 53 | -------------------------------------------------------------------------------- /pythia/legacy/global_variables/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | -------------------------------------------------------------------------------- /pythia/legacy/global_variables/global_variables.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | 9 | import torch 10 | 11 | imdb_version = 1 12 | use_cuda = torch.cuda.is_available() 13 | 14 | model_type_gt = "gt_layout" 15 | model_type_scratch = "scratch" 16 | model_type_gt_rl = "gt+rl" 17 | model_type_top_down_bottom_up = "top_down_bottom_up" 18 | 19 | 20 | topdown_concate_attention = "concate_attention" 21 | topdown_project_attention = "project_attention" 22 | -------------------------------------------------------------------------------- /pythia/legacy/info/code_structure_plot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guanghuixu/AnchorCaptioner/3a49ce5de025087cbea00075ec0636aee0525382/pythia/legacy/info/code_structure_plot.png -------------------------------------------------------------------------------- /pythia/legacy/info/pythia.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guanghuixu/AnchorCaptioner/3a49ce5de025087cbea00075ec0636aee0525382/pythia/legacy/info/pythia.jpg -------------------------------------------------------------------------------- /pythia/legacy/info/vqa_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guanghuixu/AnchorCaptioner/3a49ce5de025087cbea00075ec0636aee0525382/pythia/legacy/info/vqa_example.png -------------------------------------------------------------------------------- /pythia/legacy/install.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | conda create --name vqa python=3.6 4 | source activate vqa 5 | pip install demjson pyyaml 6 | 7 | pip install http://download.pytorch.org/whl/cu90/torch-0.3.0-cp36-cp36m-linux_x86_64.whl 8 | pip install torchvision 9 | pip install tensorboardX 10 | 11 | 12 | -------------------------------------------------------------------------------- /pythia/legacy/tools/convert_VG_to_COCO_qa.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | 9 | import json 10 | import string 11 | 12 | genome_data_file = "question_answers.json" 13 | genome_questions_file = "v2_OpenEnded_mscoco_genome_questions.json" 14 | genome_annotations_file = "v2_mscoco_genome_annotations.json" 15 | 16 | translator = str.maketrans("", "", string.punctuation) 17 | with open(genome_data_file, "r") as f: 18 | genome_data = json.load(f) 19 | 20 | genome_questions = [] 21 | genome_annotations = [] 22 | 23 | for data in genome_data: 24 | all_qas = data["qas"] 25 | for qas in all_qas: 26 | question = {} 27 | annotation = {} 28 | question["image_id"] = qas["image_id"] 29 | # assume unique question_id for every question answer pair 30 | question["question_id"] = qas["qa_id"] 31 | question["question"] = qas["question"] 32 | genome_questions.append(question) 33 | annotation["image_id"] = qas["image_id"] 34 | annotation["question_id"] = qas["qa_id"] 35 | answertxt = qas["answer"].translate(translator) 36 | answertxt = answertxt.lower() 37 | annotation["multiple_choice_answer"] = answertxt 38 | annotation["answers"] = [] 39 | for i in range(10): 40 | answer = {} 41 | answer["answer"] = answertxt 42 | answer["answer_confifence"] = "yes" 43 | answer["answer_id"] = i + 1 44 | annotation["answers"].append(answer) 45 | genome_annotations.append(annotation) 46 | 47 | genome_data = {} 48 | genome_data["questions"] = genome_questions 49 | 50 | with open(genome_questions_file, "w") as f: 51 | json.dump(genome_data, f) 52 | 53 | genome_data = {} 54 | genome_data["annotations"] = genome_annotations 55 | 56 | with open(genome_annotations_file, "w") as f: 57 | json.dump(genome_data, f) 58 | -------------------------------------------------------------------------------- /pythia/legacy/tools/convert_tsv_feature_to_indiv.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | 9 | import argparse 10 | import base64 11 | import csv 12 | import os 13 | import sys 14 | 15 | import numpy as np 16 | 17 | parser = argparse.ArgumentParser() 18 | parser.add_argument("--infile", type=str, required=True, help="input file") 19 | parser.add_argument("--label", type=str, required=True, help="label for dataset") 20 | parser.add_argument("--out_dir", type=str, required=True, help="imdb output directory") 21 | args = parser.parse_args() 22 | 23 | out_dir = args.out_dir 24 | 25 | 26 | csv.field_size_limit(sys.maxsize) 27 | 28 | FIELDNAMES = ["image_id", "image_w", "image_h", "num_boxes", "boxes", "features"] 29 | infile = args.infile 30 | 31 | label = args.label 32 | 33 | out_dir = os.path.join(out_dir, label) 34 | 35 | os.makedirs(out_dir, exist_ok=True) 36 | 37 | print("reading tsv...") 38 | with open(infile, "r") as tsv_in_file: 39 | reader = csv.DictReader(tsv_in_file, delimiter="\t", fieldnames=FIELDNAMES) 40 | for item in reader: 41 | item["num_boxes"] = int(item["num_boxes"]) 42 | image_id = int(item["image_id"]) 43 | image_w = float(item["image_w"]) 44 | image_h = float(item["image_h"]) 45 | 46 | image_bboxes = np.frombuffer( 47 | base64.b64decode(item["boxes"]), dtype=np.float32 48 | ).reshape((item["num_boxes"], -1)) 49 | 50 | image_feat = np.frombuffer( 51 | base64.b64decode(item["features"]), dtype=np.float32 52 | ).reshape((item["num_boxes"], -1)) 53 | 54 | image_feat_and_boxes = {"image_bboxes": image_bboxes, "image_feat": image_feat} 55 | 56 | image_file_name = os.path.join( 57 | out_dir, "COCO_" + label + "_%012d.npy" % image_id 58 | ) 59 | np.save(image_file_name, image_feat_and_boxes) 60 | -------------------------------------------------------------------------------- /pythia/legacy/tools/eval_ensemble_on_val.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | 9 | import glob 10 | import sys 11 | 12 | import torch 13 | import yaml 14 | from torch.utils.data import DataLoader 15 | 16 | from train_model.dataset_utils import prepare_eval_data_set 17 | from train_model.helper import build_model, run_model 18 | 19 | CONFIG = "config.yaml" 20 | MODELNAME = "best_model.pth" 21 | 22 | if __name__ == "__main__": 23 | if len(sys.argv) < 2: 24 | exit( 25 | "USAGE: python tools/eval_ensemble_on_val.py parent_dir \ 26 | [ensemble sizes]" 27 | ) 28 | 29 | esbl_sizes = [int(a) for a in sys.argv[2:]] 30 | 31 | parent_dir = sys.argv[1] 32 | 33 | model_pths = [ 34 | file for file in glob.glob(parent_dir + "/**/" + MODELNAME, recursive=True) 35 | ] 36 | config_files = [c.replace(MODELNAME, CONFIG) for c in model_pths] 37 | 38 | if len(esbl_sizes) == 0: 39 | esbl_sizes = range(1, len(config_files) + 1) 40 | 41 | config_file = config_files[0] 42 | 43 | with open(config_file, "r") as f: 44 | config = yaml.load(f) 45 | 46 | batch_size = config["data"]["batch_size"] 47 | data_set_test = prepare_eval_data_set( 48 | **config["data"], **config["model"], verbose=True 49 | ) 50 | data_reader_test = DataLoader( 51 | data_set_test, shuffle=False, batch_size=batch_size, num_workers=5 52 | ) 53 | ans_dic = data_set_test.answer_dict 54 | 55 | accumulated_softmax = None 56 | final_result = {} 57 | n_model = 0 58 | for c_file, model_file in zip(config_files, model_pths): 59 | with open(c_file, "r") as f: 60 | config = yaml.load(f) 61 | 62 | myModel = build_model(config, data_set_test) 63 | myModel.load_state_dict(torch.load(model_file)["state_dict"]) 64 | 65 | question_ids, soft_max_result = run_model( 66 | myModel, data_reader_test, ans_dic.UNK_idx 67 | ) 68 | 69 | if n_model == 0: 70 | final_result = soft_max_result 71 | -------------------------------------------------------------------------------- /pythia/legacy/tools/extract_detectron_weights.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | 9 | import os 10 | import pickle 11 | import sys 12 | 13 | if len(sys.argv) < 4: 14 | exit( 15 | "USAGE: python tools/extract_detectron_weights.py \ 16 | weights_file out_dir feat_name [feat_name]" 17 | ) 18 | 19 | wgts_file = sys.argv[1] 20 | out_dir = sys.argv[2] 21 | 22 | with open(wgts_file, "rb") as f: 23 | wgts = pickle.load(f, encoding="latin1")["blobs"] 24 | 25 | for i in range(3, len(sys.argv)): 26 | feat_name = sys.argv[i] 27 | wgt = wgts[feat_name] 28 | out_file = os.path.join(out_dir, feat_name + ".pkl") 29 | with open(out_file, "wb") as w: 30 | pickle.dump(wgt, w) 31 | -------------------------------------------------------------------------------- /pythia/legacy/tools/extract_minival_ids.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | 9 | import json 10 | import pickle 11 | 12 | 13 | def extract_qid_imid(ques_json_file): 14 | with open(ques_json_file, "r") as f: 15 | info = json.load(f) 16 | questions = info["questions"] 17 | 18 | q_im_ids = [] 19 | for q in questions: 20 | im_id = q["image_id"] 21 | q_id = q["question_id"] 22 | q_im_ids.append((im_id, q_id)) 23 | 24 | return q_im_ids 25 | 26 | 27 | if __name__ == "__main__": 28 | minival_ques_file = "v2_OpenEnded_mscoco_minival2014_questions.json" 29 | 30 | val2train_ques_file = "v2_OpenEnded_mscoco_val2train2014_questions.json" 31 | 32 | minival_out_file = "data_prep/vqa_v2.0/minival_ids.pkl" 33 | val2train_out_file = "data_prep/vqa_v2.0/val2train_ids.pkl" 34 | 35 | minival_ids = extract_qid_imid(minival_ques_file) 36 | with open(minival_out_file, "wb") as w1: 37 | pickle.dump(minival_ids, w1) 38 | 39 | val2train_ids = extract_qid_imid(val2train_ques_file) 40 | with open(val2train_out_file, "wb") as w2: 41 | pickle.dump(val2train_ids, w2) 42 | -------------------------------------------------------------------------------- /pythia/legacy/tools/extract_visual_features_vgg_pool5.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | 9 | import argparse 10 | import os 11 | import sys 12 | from glob import glob 13 | 14 | import numpy as np 15 | import torch 16 | import torch.nn as nn 17 | import torchvision.models as models 18 | from torch.autograd import Variable 19 | 20 | import skimage.color 21 | import skimage.io 22 | from global_variables.global_variables import use_cuda 23 | 24 | parser = argparse.ArgumentParser() 25 | parser.add_argument("--gpu_id", type=int, default=0) 26 | parser.add_argument("--data_dir", type=str, required=True) 27 | parser.add_argument("--out_dir", type=str, required=True) 28 | 29 | args = parser.parse_args() 30 | gpu_id = args.gpu_id # set GPU id to use 31 | os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id) 32 | sys.path.append("../../") 33 | 34 | image_basedir = args.data_dir 35 | save_basedir = args.out_dir 36 | 37 | channel_mean = np.array([123.68, 116.779, 103.939], dtype=np.float32) 38 | 39 | 40 | class vgg16_feature_module(nn.Module): 41 | def __init__(self, vgg16_model): 42 | super(vgg16_feature_module, self).__init__() 43 | self.feature_module = nn.Sequential(*list(list(vgg16_model.children())[0])) 44 | 45 | def forward(self, x): 46 | return self.feature_module(x) 47 | 48 | 49 | vgg16 = models.vgg16(pretrained=True) 50 | vgg16_feature = vgg16_feature_module(vgg16) 51 | vgg16_feature = vgg16_feature.cuda() if use_cuda else vgg16_feature 52 | 53 | 54 | def extract_image_pool5(impath): 55 | im = skimage.io.imread(impath)[..., :3] 56 | im_val = im[np.newaxis, ...] 
- channel_mean 57 | 58 | # permute to get NCHW 59 | im_val = np.transpose(im_val, axes=(0, 3, 1, 2)) 60 | im_val_tensor = torch.FloatTensor(im_val) 61 | im_val_variable = Variable(im_val_tensor) 62 | im_val_variable = im_val_variable.cuda() if use_cuda else im_val_variable 63 | 64 | pool5_val = vgg16_feature(im_val_variable) 65 | return pool5_val.data.cpu().numpy() 66 | 67 | 68 | def extract_dataset_pool5(image_dir, save_dir, ext_filter="*.png"): 69 | image_list = glob(image_dir + "/" + ext_filter) 70 | os.makedirs(save_dir, exist_ok=True) 71 | 72 | for n_im, impath in enumerate(image_list): 73 | if (n_im + 1) % 100 == 0: 74 | print("processing %d / %d" % (n_im + 1, len(image_list))) 75 | image_name = os.path.basename(impath).split(".")[0] 76 | save_path = os.path.join(save_dir, image_name + ".npy") 77 | if not os.path.exists(save_path): 78 | pool5_val = extract_image_pool5(impath) 79 | np.save(save_path, pool5_val) 80 | 81 | 82 | for image_set in ["train", "val", "test"]: 83 | print("Extracting image set " + image_set) 84 | extract_dataset_pool5( 85 | os.path.join(image_basedir, image_set), os.path.join(save_basedir, image_set) 86 | ) 87 | print("Done.") 88 | -------------------------------------------------------------------------------- /pythia/legacy/tools/generate_minival_annotation.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | 9 | import json 10 | import pickle 11 | 12 | if __name__ == "__main__": 13 | val_annotation_file = "v2_mscoco_val2014_annotations.json" 14 | minival_id_file = "data/vqa_v2.0/minival_ids.pkl" 15 | minival_annotation_file = "v2_mscoco_minival2014_annotations.json" 16 | 17 | with open(minival_id_file, "rb") as f: 18 | q_im_ids = pickle.load(f) 19 | 20 | minival_ids = [x[1] for x in q_im_ids] 21 | 22 | with open(val_annotation_file, "r") as f: 23 | file_info = json.load(f) 24 | annotations = file_info["annotations"] 25 | info = file_info["info"] 26 | data_subtype = file_info["data_subtype"] 27 | license_info = file_info["license"] 28 | 29 | minival_annotations = [a for a in annotations if a["question_id"] in minival_ids] 30 | 31 | minival_info = { 32 | "data_subtype": data_subtype, 33 | "license": license_info, 34 | "info": info, 35 | "annotations": minival_annotations, 36 | } 37 | 38 | with open(minival_annotation_file, "w") as w: 39 | json.dump(minival_info, w) 40 | -------------------------------------------------------------------------------- /pythia/legacy/tools/mirror_images.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | # All paths need to be updated 9 | 10 | import json 11 | import os 12 | from multiprocessing.dummy import Pool as ThreadPool 13 | 14 | from PIL import Image, ImageOps 15 | 16 | split = "val2014" 17 | image_paths = [] 18 | 19 | 20 | def mirror_image(image_path): 21 | img = Image.open(image_path) 22 | mirror_img = ImageOps.mirror(img) 23 | image_name = image_path.split("/")[-1] 24 | fh = "data/" + split 25 | fh = os.path.join(fh, image_name) 26 | mirror_img.save(fh, "JPEG") 27 | 28 | 29 | with open("./COCO/060817/annotations/instances_val2014.json") as f: 30 | data = json.load(f) 31 | for item in data["images"]: 32 | image_id = int(item["id"]) 33 | filepath = os.path.join("val2014/", item["file_name"]) 34 | image_paths.append(filepath) 35 | 36 | pool = ThreadPool(10) 37 | results = pool.map(mirror_image, image_paths) 38 | -------------------------------------------------------------------------------- /pythia/legacy/tools/model_path.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | 9 | ##vgg model from https://github.com/jcjohnson/pytorch-vgg 10 | 11 | 12 | vgg16_caffe2 = "https://s3-us-west-2.amazonaws.com/jcjohns-models/vgg16-00b39a1b.pth" 13 | vgg19_caffe2 = "https://s3-us-west-2.amazonaws.com/jcjohns-models/vgg19-d01eb7cb.pth" 14 | -------------------------------------------------------------------------------- /pythia/legacy/tools/rename_genome_file.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | 9 | import os 10 | import shutil 11 | import sys 12 | 13 | if len(sys.argv) != 3: 14 | exit("Usage: python tools/rename_genome_file.py [inDir] [outDir]") 15 | 16 | inDir = sys.argv[1] 17 | outDir = sys.argv[2] 18 | 19 | OUT_NAME = "COCO_genome_%012d.npy" 20 | 21 | os.makedirs(outDir, exist_ok=True) 22 | 23 | n = 0 24 | print("BEGIN.....") 25 | for file in os.listdir(inDir): 26 | if file.endswith(".npy"): 27 | n += 1 28 | if n % 5000 == 0: 29 | print("process %d files" % n) 30 | image_id = int(file.split(".")[0]) 31 | out_name = OUT_NAME % image_id 32 | in_file = os.path.join(inDir, file) 33 | out_file = os.path.join(outDir, out_name) 34 | shutil.copy(in_file, out_file) 35 | 36 | print("process total %d files" % n) 37 | print("DONE.....") 38 | -------------------------------------------------------------------------------- /pythia/legacy/tools/subset_val.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | 9 | import argparse 10 | import json 11 | import random 12 | 13 | 14 | def parse_args(): 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument("--ques_file", type=str) 17 | pass 18 | 19 | 20 | if __name__ == "__main__": 21 | val_json_file = "v2_OpenEnded_mscoco_val2014_questions.json" 22 | minival_json_file = "v2_OpenEnded_mscoco_minival2014_questions.json" 23 | val_as_train_json_file = "v2_OpenEnded_mscoco_val2train2014_questions.json" 24 | 25 | with open(val_json_file, "r") as f: 26 | file_info = json.load(f) 27 | questions = file_info["questions"] 28 | info = file_info["info"] 29 | task_type = file_info["task_type"] 30 | data_type = file_info["data_type"] 31 | license = file_info["license"] 32 | data_subtype = file_info["info"] 33 | 34 | # collect image_id 35 | image_ids = [] 36 | for q in questions: 37 | image_id = q["image_id"] 38 | image_ids.append(image_id) 39 | 40 | # divide image_ids to two parts 41 | random.shuffle(image_ids) 42 | minival_images = image_ids[:10000] 43 | other_images = image_ids[10000:] 44 | 45 | minival_ques = [] 46 | other_ques = [] 47 | 48 | total_minival = 0 49 | total_others = 0 50 | # seprate quesion_json_file 51 | for q in questions: 52 | image_id = q["image_id"] 53 | 54 | if image_id in minival_images: 55 | minival_ques.append(q) 56 | total_minival += 1 57 | else: 58 | other_ques.append(q) 59 | total_others += 1 60 | 61 | minival_json = { 62 | "info": info, 63 | "task_type": task_type, 64 | "data_type": data_type, 65 | "license": license, 66 | "data_subtype": "minival2014", 67 | "questions": minival_ques, 68 | } 69 | 70 | other_json = { 71 | "info": info, 72 | "task_type": task_type, 73 | "data_type": data_type, 74 | "license": license, 75 | "data_subtype": "val2train2014", 76 | "questions": other_ques, 77 | } 78 | 79 | with open(minival_json_file, "w") as w1: 80 | json.dump(minival_json, w1) 81 | 82 | with open(val_as_train_json_file, "w") as w2: 83 | json.dump(other_json, w2) 84 | 85 | print( 86 | "minival_questions: %d" % total_minival + "other_questions: %d" % total_others 87 | ) 88 | -------------------------------------------------------------------------------- /pythia/legacy/tools/timer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import timeit 3 | 4 | 5 | class Timer: 6 | def __init__(self, unit="s"): 7 | self.s_time = timeit.default_timer() 8 | self.unit = unit 9 | if self.unit != "s" and self.unit != "m" and self.unit != "h": 10 | raise NotImplementedError("unkown time unit, using s, m, h") 11 | 12 | def start(self): 13 | self.s_time = timeit.default_timer() 14 | 15 | def end(self): 16 | self.e_time = timeit.default_timer() 17 | period = self.e_time - self.s_time 18 | if self.unit == "s": 19 | return "%.1f s" % period 20 | elif self.unit == "m": 21 | return "%.2f min" % (period / 60) 22 | else: 23 | return "%.2f h" % (period / 3600) 24 | -------------------------------------------------------------------------------- /pythia/legacy/top_down_bottom_up/image_embedding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | 9 | import pickle 10 | 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | 15 | 16 | """ 17 | parameters: 18 | 19 | input: 20 | image_feat_variable: [batch_size, num_location, image_feat_dim] 21 | or a list of [num_location, image_feat_dim] 22 | when using adaptive number of objects 23 | question_embedding:[batch_size, txt_embeding_dim] 24 | 25 | output: 26 | image_embedding:[batch_size, image_feat_dim] 27 | 28 | 29 | """ 30 | 31 | 32 | class image_embedding(nn.Module): 33 | def __init__(self, image_attention_model): 34 | super(image_embedding, self).__init__() 35 | self.image_attention_model = image_attention_model 36 | self.out_dim = image_attention_model.out_dim 37 | 38 | def forward(self, image_feat_variable, question_embedding, image_dims): 39 | # N x K x n_att 40 | attention = self.image_attention_model( 41 | image_feat_variable, question_embedding, image_dims 42 | ) 43 | att_reshape = attention.permute(0, 2, 1) 44 | tmp_embedding = torch.bmm( 45 | att_reshape, image_feat_variable 46 | ) # N x n_att x image_dim 47 | batch_size = att_reshape.size(0) 48 | image_embedding = tmp_embedding.view(batch_size, -1) 49 | 50 | return image_embedding 51 | 52 | 53 | class image_finetune(nn.Module): 54 | def __init__(self, in_dim, weights_file, bias_file): 55 | super(image_finetune, self).__init__() 56 | with open(weights_file, "rb") as w: 57 | weights = pickle.load(w) 58 | with open(bias_file, "rb") as b: 59 | bias = pickle.load(b) 60 | out_dim = bias.shape[0] 61 | 62 | self.lc = nn.Linear(in_dim, out_dim) 63 | self.lc.weight.data.copy_(torch.from_numpy(weights)) 64 | self.lc.bias.data.copy_(torch.from_numpy(bias)) 65 | self.out_dim = out_dim 66 | 67 | def forward(self, image): 68 | i2 = self.lc(image) 69 | i3 = F.relu(i2) 70 | return i3 71 | -------------------------------------------------------------------------------- /pythia/legacy/top_down_bottom_up/image_feature_encoding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | 9 | import os 10 | import pickle 11 | 12 | import torch 13 | import torch.nn as nn 14 | import torch.nn.functional as F 15 | 16 | from config.config import cfg 17 | 18 | 19 | def build_image_feature_encoding(method, par, in_dim): 20 | if method == "default_image": 21 | return DefaultImageFeature(in_dim) 22 | elif method == "finetune_faster_rcnn_fpn_fc7": 23 | return FinetuneFasterRcnnFpnFc7(in_dim, **par) 24 | else: 25 | raise NotImplementedError("unknown image feature encoding %s" % method) 26 | 27 | 28 | class DefaultImageFeature(nn.Module): 29 | def __init__(self, in_dim): 30 | super(DefaultImageFeature, self).__init__() 31 | self.in_dim = in_dim 32 | self.out_dim = in_dim 33 | 34 | def forward(self, image): 35 | return image 36 | 37 | 38 | class FinetuneFasterRcnnFpnFc7(nn.Module): 39 | def __init__(self, in_dim, weights_file, bias_file): 40 | super(FinetuneFasterRcnnFpnFc7, self).__init__() 41 | if not os.path.isabs(weights_file): 42 | weights_file = os.path.join(cfg.data.data_root_dir, weights_file) 43 | if not os.path.isabs(bias_file): 44 | bias_file = os.path.join(cfg.data.data_root_dir, bias_file) 45 | with open(weights_file, "rb") as w: 46 | weights = pickle.load(w) 47 | with open(bias_file, "rb") as b: 48 | bias = pickle.load(b) 49 | out_dim = bias.shape[0] 50 | 51 | self.lc = nn.Linear(in_dim, out_dim) 52 | self.lc.weight.data.copy_(torch.from_numpy(weights)) 53 | self.lc.bias.data.copy_(torch.from_numpy(bias)) 54 | self.out_dim = out_dim 55 | 56 | def forward(self, image): 57 | i2 = self.lc(image) 58 | i3 = F.relu(i2) 59 | return i3 60 | -------------------------------------------------------------------------------- /pythia/legacy/top_down_bottom_up/intermediate_layer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | 9 | import torch.nn as nn 10 | 11 | 12 | class inter_layer(nn.Module): 13 | def __init__(self, dim, n_layer): 14 | super(inter_layer, self).__init__() 15 | layers = [] 16 | for i in range(n_layer): 17 | layers.append(nn.Linear(dim, dim)) 18 | layers.append(nn.ReLU()) 19 | 20 | self.main = nn.Sequential(*layers) 21 | 22 | def forward(self, x): 23 | return self.main(x) 24 | -------------------------------------------------------------------------------- /pythia/legacy/top_down_bottom_up/nonlinear_layer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | # 7 | 8 | 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | from torch.nn.utils.weight_norm import weight_norm 13 | 14 | 15 | """ 16 | nonlinear_layer: f_a : x \in R^m => y \in R^n 17 | \tilde{y} = tanh(Wx + b) 18 | g = sigmoid(W'x + b') 19 | y = \tilde{y} \circ g 20 | input (N, *, in_dim) 21 | output (N, *, out_dim) 22 | """ 23 | 24 | 25 | class nonlinear_layer_org(nn.Module): 26 | def __init__(self, in_dim, out_dim): 27 | super(nonlinear_layer_org, self).__init__() 28 | self.fc1 = nn.Linear(in_dim, out_dim) 29 | self.gate = nn.Linear(in_dim, out_dim) 30 | 31 | def forward(self, x): 32 | y_tilda = F.tanh(self.fc1(x)) 33 | g = F.sigmoid(self.gate(x)) 34 | y = y_tilda * g 35 | return y 36 | 37 | 38 | class FCNet(nn.Module): 39 | """Simple class for a non-linear fully connected network 40 | """ 41 | 42 | def __init__(self, dims): 43 | super(FCNet, self).__init__() 44 | 45 | layers = [] 46 | for i in range(len(dims) - 2): 47 | in_dim = dims[i] 48 | out_dim = dims[i + 1] 49 | layers.append(weight_norm(nn.Linear(in_dim, out_dim), dim=None)) 50 | layers.append(nn.ReLU()) 51 | layers.append(weight_norm(nn.Linear(dims[-2], dims[-1]), dim=None)) 52 | layers.append(nn.ReLU()) 53 | 54 | self.main = nn.Sequential(*layers) 55 | 56 | def forward(self, x): 57 | return self.main(x) 58 | 59 | 60 | class nonlinear_layer(nn.Module): 61 | """Simple class for a non-linear fully connected network 62 | """ 63 | 64 | def __init__(self, in_dim, out_dim): 65 | super(nonlinear_layer, self).__init__() 66 | 67 | layers = [] 68 | layers.append(weight_norm(nn.Linear(in_dim, out_dim), dim=None)) 69 | layers.append(nn.ReLU()) 70 | 71 | self.main = nn.Sequential(*layers) 72 | 73 | def forward(self, x): 74 | return self.main(x) 75 | -------------------------------------------------------------------------------- /pythia/legacy/top_down_bottom_up/post_combine_transform.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree.
6 | # 7 | 8 | 9 | import torch 10 | import torch.nn as nn 11 | import torch.nn.functional as F 12 | from torch.nn.utils.weight_norm import weight_norm 13 | 14 | 15 | def build_post_combine_transform(method, par, in_dim): 16 | if method == "linear_transform": 17 | return LinearTransform(in_dim, **par) 18 | elif method == "conv_transform": 19 | return ConvTransform(in_dim, **par) 20 | else: 21 | raise NotImplementedError("unknown post combine transform type %s" % method) 22 | 23 | 24 | class LinearTransform(nn.Module): 25 | def __init__(self, in_dim, **kwargs): 26 | super(LinearTransform, self).__init__() 27 | self.lc = weight_norm( 28 | nn.Linear(in_features=in_dim, out_features=kwargs["out_dim"]), dim=None 29 | ) 30 | self.out_dim = kwargs["out_dim"] 31 | 32 | def forward(self, x): 33 | return self.lc(x) 34 | 35 | 36 | class ConvTransform(nn.Module): 37 | def __init__(self, in_dim, **kwargs): 38 | super(ConvTransform, self).__init__() 39 | self.conv1 = nn.Conv2d( 40 | in_channels=in_dim, out_channels=kwargs["hidden_dim"], kernel_size=1 41 | ) 42 | self.conv2 = nn.Conv2d( 43 | in_channels=kwargs["hidden_dim"], 44 | out_channels=kwargs["out_dim"], 45 | kernel_size=1, 46 | ) 47 | self.out_dim = kwargs["out_dim"] 48 | 49 | def forward(self, x): 50 | if len(x.size()) == 3: # N x k x dim 51 | # N x dim x k x 1 52 | x_reshape = torch.unsqueeze(x.permute(0, 2, 1), 3) 53 | elif len(x.size()) == 2: # N x dim 54 | # N x dim x 1 x 1 55 | x_reshape = torch.unsqueeze(torch.unsqueeze(x, 2), 3) 56 | 57 | iatt_conv1 = self.conv1(x_reshape) # N x hidden_dim x * x 1 58 | iatt_relu = F.relu(iatt_conv1) 59 | iatt_conv2 = self.conv2(iatt_relu) # N x out_dim x * x 1 60 | 61 | if len(x.size()) == 3: 62 | iatt_conv3 = torch.squeeze(iatt_conv2, 3).permute(0, 2, 1) 63 | elif len(x.size()) == 2: 64 | iatt_conv3 = torch.squeeze(torch.squeeze(iatt_conv2, 3), 2) 65 | 66 | return iatt_conv3 67 | -------------------------------------------------------------------------------- /pythia/legacy/train_model/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | -------------------------------------------------------------------------------- /pythia/legacy/train_model/eval_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree.
6 | # 7 | 8 | 9 | import os 10 | 11 | import torch 12 | from torch.utils.data import DataLoader 13 | 14 | 15 | def get_final_validation(data_set_val, batch_size, snapshot_dir, eval_model): 16 | final_val_data_reader = DataLoader( 17 | data_set_val, shuffle=False, batch_size=batch_size 18 | ) 19 | 20 | files = [ 21 | os.path.join(snapshot_dir, file) 22 | for file in os.listdir(snapshot_dir) 23 | if file.startswith("model") 24 | ] 25 | 26 | for model_file in sorted(files, key=os.path.getctime, reverse=True): 27 | current_model = torch.load(model_file) 28 | total_sample = 0 29 | total_score = 0 30 | for i, batch in enumerate(final_val_data_reader): 31 | score, n_sample, _ = eval_model(batch, current_model) 32 | total_sample += n_sample 33 | total_score += score 34 | 35 | acc = total_score / total_sample 36 | print(model_file, ": %.6f" % acc) 37 | -------------------------------------------------------------------------------- /pythia/legacy/train_model/evaluate.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | # 7 | 8 | 9 | import argparse 10 | import os 11 | 12 | import yaml 13 | from torch.utils.data import DataLoader 14 | 15 | from train_model.dataset_utils import prepare_eval_data_set 16 | from train_model.Engineer import one_stage_eval_model 17 | from train_model.eval_utils import get_final_validation 18 | from train_model.model_factory import is_one_stageModel 19 | 20 | parser = argparse.ArgumentParser() 21 | parser.add_argument("--config", type=str, required=True, help="config yaml file") 22 | parser.add_argument("--out_dir", type=str, required=True, help="output directory") 23 | args = parser.parse_args() 24 | 25 | config_file = args.config 26 | out_dir = args.out_dir 27 | 28 | with open(config_file, "r") as f: 29 | config = yaml.load(f, Loader=yaml.FullLoader) 30 | 31 | # get the potential shared data_config info 32 | data_root_dir = config["data"]["data_root_dir"] 33 | batch_size = config["data"]["batch_size"] 34 | data_set_val = prepare_eval_data_set(**config["data"], **config["model"]) 35 | data_reader_val = DataLoader(data_set_val, shuffle=False, batch_size=batch_size) 36 | 37 | snapshot_dir = os.path.join(out_dir, config["output"]["exp_name"]) 38 | os.makedirs(snapshot_dir, exist_ok=True) 39 | 40 | model_type = config["model"]["model_type"] 41 | if is_one_stageModel(model_type): 42 | get_final_validation(data_set_val, batch_size, snapshot_dir, one_stage_eval_model) 43 | else: 44 | pass 45 | -------------------------------------------------------------------------------- /pythia/legacy/train_model/helper.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | # All rights reserved. 3 | # 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree.
6 | # 7 | 8 | import json 9 | import sys 10 | import timeit 11 | 12 | import numpy as np 13 | 14 | import _pickle as pickle 15 | from train_model.Engineer import masked_unk_softmax, one_stage_run_model 16 | from train_model.model_factory import prepare_model 17 | 18 | 19 | class answer_json: 20 | def __init__(self): 21 | self.answers = [] 22 | 23 | def add(self, ques_id, ans): 24 | res = {"question_id": ques_id, "answer": ans} 25 | self.answers.append(res) 26 | 27 | 28 | def build_model(config, dataset): 29 | num_vocab_txt = dataset.vocab_dict.num_vocab 30 | num_choices = dataset.answer_dict.num_vocab 31 | 32 | num_image_feat = len(config["data"]["image_feat_train"][0].split(",")) 33 | my_model = prepare_model( 34 | num_vocab_txt, num_choices, **config["model"], num_image_feat=num_image_feat 35 | ) 36 | return my_model 37 | 38 | 39 | def run_model(current_model, data_reader, UNK_idx=0): 40 | softmax_tot = [] 41 | q_id_tot = [] 42 | 43 | start = timeit.default_timer() 44 | for i, batch in enumerate(data_reader): 45 | if (i + 1) % 100 == 0: 46 | end = timeit.default_timer() 47 | time = end - start 48 | start = timeit.default_timer() 49 | print(" process batch %d for test for %.1f s" % (i + 1, time)) 50 | sys.stdout.flush() 51 | 52 | verbose_info = batch["verbose_info"] 53 | q_ids = verbose_info["question_id"].cpu().numpy().tolist() 54 | logit_res = one_stage_run_model(batch, current_model, eval_mode=True) 55 | softmax_res = masked_unk_softmax(logit_res, dim=1, mask_idx=UNK_idx) 56 | softmax_res = softmax_res.data.cpu().numpy().astype(np.float16) 57 | q_id_tot += q_ids 58 | softmax_tot.append(softmax_res) 59 | softmax_result = np.vstack(softmax_tot) 60 | 61 | return q_id_tot, softmax_result 62 | 63 | 64 | def print_result( 65 | question_ids, soft_max_result, ans_dic, out_file, json_only=True, pkl_res_file=None 66 | ): 67 | predicted_answers = np.argmax(soft_max_result, axis=1) 68 | 69 | if not json_only: 70 | with open(pkl_res_file, "wb") as writeFile: 71 | pickle.dump(soft_max_result, writeFile) 72 | pickle.dump(question_ids, writeFile) 73 | pickle.dump(ans_dic, writeFile) 74 | 75 | ans_json_out = answer_json() 76 | for idx, pred_idx in enumerate(predicted_answers): 77 | question_id = question_ids[idx] 78 | pred_ans = ans_dic.idx2word(pred_idx) 79 | ans_json_out.add(question_id, pred_ans) 80 | 81 | with open(out_file, "w") as f: 82 | json.dump(ans_json_out.answers, f) 83 | -------------------------------------------------------------------------------- /pythia/models/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | __all__ = ["TopDownBottomUp", "Pythia", "LoRRA", "BAN"] 3 | 4 | from .top_down_bottom_up import TopDownBottomUp 5 | from .ban import BAN 6 | from .pythia import Pythia 7 | from .lorra import LoRRA 8 | -------------------------------------------------------------------------------- /pythia/models/lorra.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | import torch 3 | 4 | from pythia.common.registry import registry 5 | from pythia.models.pythia import Pythia 6 | from pythia.modules.layers import ClassifierLayer 7 | 8 | 9 | @registry.register_model("lorra") 10 | class LoRRA(Pythia): 11 | def __init__(self, config): 12 | super().__init__(config) 13 | 14 | def build(self): 15 | self._init_text_embeddings("text") 16 | # For LoRRA, context feature and text embeddings would be identity, 17 | # but to keep a unified API we initialize them as well. They need to 18 | # be built before Pythia's other modules, since some of those modules 19 | # require the context attributes to be set 20 | self._init_text_embeddings("context") 21 | self._init_feature_encoders("context") 22 | self._init_feature_embeddings("context") 23 | super().build() 24 | 25 | def get_optimizer_parameters(self, config): 26 | params = super().get_optimizer_parameters(config) 27 | params += [ 28 | {"params": self.context_feature_embeddings_list.parameters()}, 29 | {"params": self.context_embeddings.parameters()}, 30 | {"params": self.context_feature_encoders.parameters()}, 31 | ] 32 | 33 | return params 34 | 35 | def _get_classifier_input_dim(self): 36 | # Now, the classifier's input will be cat of image and context based 37 | # features 38 | return 2 * super()._get_classifier_input_dim() 39 | 40 | def forward(self, sample_list): 41 | sample_list.text = self.word_embedding(sample_list.text) 42 | text_embedding_total = self.process_text_embedding(sample_list) 43 | 44 | image_embedding_total, _ = self.process_feature_embedding( 45 | "image", sample_list, text_embedding_total 46 | ) 47 | 48 | context_embedding_total, _ = self.process_feature_embedding( 49 | "context", sample_list, text_embedding_total, ["order_vectors"] 50 | ) 51 | 52 | if self.inter_model is not None: 53 | image_embedding_total = self.inter_model(image_embedding_total) 54 | 55 | joint_embedding = self.combine_embeddings( 56 | ["image", "text"], 57 | [image_embedding_total, text_embedding_total, context_embedding_total], 58 | ) 59 | 60 | scores = self.calculate_logits(joint_embedding) 61 | 62 | return {"scores": scores} 63 | -------------------------------------------------------------------------------- /pythia/models/m4c_captioner.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | from pythia.common.registry import registry 3 | from pythia.models.m4c import M4C 4 | 5 | 6 | @registry.register_model("m4c_captioner") 7 | class M4CCaptioner(M4C): 8 | def __init__(self, config): 9 | super().__init__(config) 10 | self.remove_unk_in_pred = self.config.remove_unk_in_pred 11 | 12 | def _forward_output(self, sample_list, fwd_results): 13 | super()._forward_output(sample_list, fwd_results) 14 | 15 | if (not self.training) and self.remove_unk_in_pred: 16 | # avoid outputting <unk> in the generated captions 17 | fwd_results["scores"][..., self.answer_processor.UNK_IDX] = -1e10 18 | 19 | return fwd_results 20 | -------------------------------------------------------------------------------- /pythia/models/top_down_bottom_up.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 | import torch 3 | from torch import nn 4 | 5 | from pythia.common.registry import registry 6 | from pythia.models.base_model import BaseModel 7 | from pythia.modules.embeddings import (ImageEmbedding, PreExtractedEmbedding, 8 | TextEmbedding) 9 | from pythia.modules.encoders import ImageEncoder 10 | from pythia.modules.layers import (ClassifierLayer, Identity, 11 | ModalCombineLayer, ReLUWithWeightNormFC) 12 | 13 | 14 | # Note: Doesn't work currently. Needs to be migrated to new API 15 | @registry.register_model("top_down_bottom_up") 16 | class TopDownBottomUp(BaseModel): 17 | def __init__(self, image_attention_model, text_embedding_models, classifier): 18 | super().__init__() 19 | self.image_attention_model = image_attention_model 20 | self.text_embedding_models = text_embedding_models 21 | self.classifier = classifier 22 | text_lstm_dim = sum([q.text_out_dim for q in text_embedding_models]) 23 | joint_embedding_out_dim = classifier.input_dim 24 | image_feat_dim = image_attention_model.image_feat_dim 25 | self.non_linear_text = ReLUWithWeightNormFC( 26 | text_lstm_dim, joint_embedding_out_dim 27 | ) 28 | self.non_linear_image = ReLUWithWeightNormFC( 29 | image_feat_dim, joint_embedding_out_dim 30 | ) 31 | 32 | def build(self): 33 | return 34 | 35 | def forward( 36 | self, image_feat_variable, input_text_variable, input_answers=None, **kwargs 37 | ): 38 | text_embeddings = [] 39 | for q_model in self.text_embedding_models: 40 | q_embedding = q_model(input_text_variable) 41 | text_embeddings.append(q_embedding) 42 | text_embedding = torch.cat(text_embeddings, dim=1) 43 | 44 | if isinstance(image_feat_variable, list): 45 | image_embeddings = [] 46 | for idx, image_feat in enumerate(image_feat_variable): 47 | ques_embedding_each = torch.unsqueeze(text_embedding[idx, :], 0) 48 | image_feat_each = torch.unsqueeze(image_feat, dim=0) 49 | attention_each = self.image_attention_model( 50 | image_feat_each, ques_embedding_each 51 | ) 52 | image_embedding_each = torch.sum(attention_each * image_feat, dim=1) 53 | image_embeddings.append(image_embedding_each) 54 | image_embedding = torch.cat(image_embeddings, dim=0) 55 | else: 56 | attention = self.image_attention_model(image_feat_variable, text_embedding) 57 | image_embedding = torch.sum(attention * image_feat_variable, dim=1) 58 | 59 | joint_embedding = self.non_linear_text(text_embedding) * self.non_linear_image( 60 | image_embedding 61 | ) 62 | logit_res = self.classifier(joint_embedding) 63 | 64 | return logit_res 65 | -------------------------------------------------------------------------------- /pythia/modules/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /pythia/modules/decoders.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | import torch 3 | from torch import nn 4 | from torch.nn.utils.weight_norm import weight_norm 5 | from pythia.common.registry import registry 6 | 7 | 8 | class VisDialDiscriminator(nn.Module): 9 | def __init__(self, config, embedding): 10 | super(VisDialDiscriminator, self).__init__() 11 | self.config = config 12 | self.embedding = embedding 13 | 14 | self.emb_out_dim = embedding.text_out_dim 15 | self.hidden_dim = self.config["hidden_dim"] 16 | 17 | self.projection_layer = nn.Linear(self.emb_out_dim, self.hidden_dim) 18 | 19 | def forward(self, encoder_output, batch): 20 | answer_options_len = batch["answer_options_len"] 21 | 22 | # BATCH_SIZE X DIALOGUES X 100 X SEQ_LEN 23 | answer_options = batch["answer_options"] 24 | 25 | max_seq_len = answer_options.size(-1) 26 | 27 | batch_size, ndialogues, noptions, seq_len = answer_options.size() 28 | 29 | # (B X D X 100) X SEQ_LEN 30 | answer_options = answer_options.view(-1, max_seq_len) 31 | answer_options_len = answer_options_len.view(-1) 32 | 33 | # (B x D x 100) x EMB_OUT_DIM 34 | answer_options = self.embedding(answer_options) 35 | 36 | # (B x D x 100) x HIDDEN_DIM 37 | answer_options = self.projection_layer(answer_options) 38 | 39 | # (B x D) x 100 x HIDDEN_DIM 40 | answer_options = answer_options.view( 41 | batch_size * ndialogues, noptions, self.hidden_dim 42 | ) 43 | 44 | # (B x D) x HIDDEN_DIM => (B x D) x 100 x HIDDEN_DIM 45 | encoder_output = encoder_output.unsqueeze(1).expand(-1, noptions, -1) 46 | 47 | # (B x D) x 100 x HIDDEN_DIM * (B x D) x 100 x HIDDEN_DIM = SAME THING 48 | # SUM => (B x D) x 100 49 | scores = torch.sum(answer_options * encoder_output, dim=2) 50 | 51 | return scores 52 | 53 | 54 | class LanguageDecoder(nn.Module): 55 | def __init__(self, in_dim, out_dim, **kwargs): 56 | super().__init__() 57 | 58 | self.language_lstm = nn.LSTMCell( 59 | in_dim + kwargs["hidden_dim"], kwargs["hidden_dim"], bias=True 60 | ) 61 | self.fc = weight_norm(nn.Linear(kwargs["hidden_dim"], out_dim)) 62 | self.dropout = nn.Dropout(p=kwargs["dropout"]) 63 | self.init_weights(kwargs["fc_bias_init"]) 64 | 65 | def init_weights(self, fc_bias_init): 66 | self.fc.bias.data.fill_(fc_bias_init) 67 | self.fc.weight.data.uniform_(-0.1, 0.1) 68 | 69 | def forward(self, weighted_attn): 70 | # Get LSTM state 71 | state = registry.get("{}_lstm_state".format(weighted_attn.device)) 72 | h1, c1 = state["td_hidden"] 73 | h2, c2 = state["lm_hidden"] 74 | 75 | # Language LSTM 76 | h2, c2 = self.language_lstm(torch.cat([weighted_attn, h1], dim=1), (h2, c2)) 77 | predictions = self.fc(self.dropout(h2)) 78 | 79 | # Update hidden state for t+1 80 | state["lm_hidden"] = (h2, c2) 81 | 82 | return predictions 83 | -------------------------------------------------------------------------------- /pythia/modules/encoders.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | import os 3 | import pickle 4 | 5 | import torch 6 | from torch import nn 7 | 8 | from pythia.modules.layers import Identity 9 | from pythia.utils.general import get_pythia_root 10 | 11 | 12 | class ImageEncoder(nn.Module): 13 | def __init__(self, encoder_type, in_dim, **kwargs): 14 | super(ImageEncoder, self).__init__() 15 | 16 | if encoder_type == "default": 17 | self.module = Identity() 18 | self.module.in_dim = in_dim 19 | self.module.out_dim = in_dim 20 | elif encoder_type == "finetune_faster_rcnn_fpn_fc7": 21 | self.module = FinetuneFasterRcnnFpnFc7(in_dim, **kwargs) 22 | else: 23 | raise NotImplementedError("Unknown Image Encoder: %s" % encoder_type) 24 | 25 | self.out_dim = self.module.out_dim 26 | 27 | def forward(self, *args, **kwargs): 28 | return self.module(*args, **kwargs) 29 | 30 | 31 | class FinetuneFasterRcnnFpnFc7(nn.Module): 32 | def __init__(self, in_dim, weights_file, bias_file, model_data_dir): 33 | super(FinetuneFasterRcnnFpnFc7, self).__init__() 34 | pythia_root = get_pythia_root() 35 | model_data_dir = os.path.join(pythia_root, model_data_dir) 36 | 37 | if not os.path.isabs(weights_file): 38 | weights_file = os.path.join(model_data_dir, weights_file) 39 | if not os.path.isabs(bias_file): 40 | bias_file = os.path.join(model_data_dir, bias_file) 41 | with open(weights_file, "rb") as w: 42 | weights = pickle.load(w) 43 | with open(bias_file, "rb") as b: 44 | bias = pickle.load(b) 45 | out_dim = bias.shape[0] 46 | 47 | self.lc = nn.Linear(in_dim, out_dim) 48 | self.lc.weight.data.copy_(torch.from_numpy(weights)) 49 | self.lc.bias.data.copy_(torch.from_numpy(bias)) 50 | self.out_dim = out_dim 51 | 52 | def forward(self, image): 53 | i2 = self.lc(image) 54 | i3 = nn.functional.relu(i2) 55 | return i3 56 | -------------------------------------------------------------------------------- /pythia/scripts/features/extract_features.md: -------------------------------------------------------------------------------- 1 | ## Extract Image Features 2 | 3 | We use Detectron to extract image features. Set up [Detectron](https://github.com/facebookresearch/Detectron) 4 | and copy [tools/extract_features.py](tools/extract_features.py) into the Detectron repository to extract features. 5 | 6 | 7 | Feature extraction works best with commit #3a38b7b of [Detectron](https://github.com/facebookresearch/Detectron) 8 | and #0dd3284 of [caffe2](https://github.com/caffe2/caffe2). 9 | 10 | 11 | Download the pretrained Detectron model: 12 | ```bash 13 | wget https://dl.fbaipublicfiles.com/pythia/detectron_model/FAST_RCNN_MLP_DIM2048_FPN_DIM512.pkl 14 | wget https://dl.fbaipublicfiles.com/pythia/detectron_model/e2e_faster_rcnn_X-101-64x4d-FPN_1x_MLP_2048_FPN_512.yaml 15 | 16 | INPUT_DIR=/path/to/your/input  # an image file or a directory of images 17 | 18 | python extract_features.py --cfg e2e_faster_rcnn_X-101-64x4d-FPN_1x_MLP_2048_FPN_512.yaml \ 19 | --wts FAST_RCNN_MLP_DIM2048_FPN_DIM512.pkl \ 20 | --min_bboxes 100 --max_bboxes 100 \ 21 | --feat_name gpu_0/fc6 \ 22 | --output_dir ~/temp_out $INPUT_DIR 23 | ``` -------------------------------------------------------------------------------- /pythia/scripts/gqa/README.md: -------------------------------------------------------------------------------- 1 | # Conversion of GQA to VQA format 2 | 3 | * Download the GQA datasets and store them in the format expected by the conversion script 4 | * Download the 300D GloVe embeddings file 5 | * Run the script from the root of the repo after changing the relevant paths: 6 | 7 | ``` 8 | PYTHONPATH=.
python ./pythia/scripts/gqa/convert_gqa_to_vqa.py \ 9 | --gqa_dir /checkpoint/meetshah/datasets/gqa/ \ 10 | --out_dir /checkpoint/meetshah/datasets/gqa_pp/ 11 | ``` 12 | -------------------------------------------------------------------------------- /pythia/trainers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | __all__ = ["BaseTrainer"] 3 | 4 | from .base_trainer import BaseTrainer 5 | -------------------------------------------------------------------------------- /pythia/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /pythia/utils/build_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import torch 3 | import warnings 4 | 5 | from pythia.utils.configuration import Configuration 6 | from pythia.common.registry import registry 7 | from pythia.utils.general import get_optimizer_parameters 8 | 9 | 10 | def build_trainer(args, *rest, **kwargs): 11 | configuration = Configuration(args.config) 12 | 13 | # Update with the config override if passed 14 | configuration.override_with_cmd_config(args.config_override) 15 | 16 | # Now, update with opts args that were passed 17 | configuration.override_with_cmd_opts(args.opts) 18 | 19 | # Finally, update with args that were specifically passed 20 | # as arguments 21 | configuration.update_with_args(args) 22 | configuration.freeze() 23 | 24 | config = configuration.get_config() 25 | registry.register("config", config) 26 | registry.register("configuration", configuration) 27 | 28 | trainer_type = config.training_parameters.trainer 29 | trainer_cls = registry.get_trainer_class(trainer_type) 30 | trainer_obj = trainer_cls(config) 31 | 32 | # Set args as an attribute for future use 33 | setattr(trainer_obj, 'args', args) 34 | 35 | return trainer_obj 36 | 37 | 38 | def build_model(config): 39 | model_name = config.model 40 | 41 | model_class = registry.get_model_class(model_name) 42 | 43 | if model_class is None: 44 | raise ValueError("No model registered for name: %s" % model_name) 45 | model = model_class(config) 46 | 47 | if hasattr(model, "build"): 48 | model.build() 49 | model.init_losses_and_metrics() 50 | 51 | return model 52 | 53 | 54 | def build_optimizer(model, config): 55 | optimizer_config = config.optimizer_attributes 56 | if not hasattr(optimizer_config, "type"): 57 | raise ValueError( 58 | "Optimizer attributes must have a 'type' key " 59 | "specifying the type of optimizer. " 60 | "(Custom or PyTorch)" 61 | ) 62 | optimizer_type = optimizer_config.type 63 | 64 | if not hasattr(optimizer_config, "params"): 65 | warnings.warn( 66 | "optimizer attributes has no params defined, defaulting to {}."
67 | ) 68 | 69 | params = getattr(optimizer_config, "params", {}) 70 | 71 | if hasattr(torch.optim, optimizer_type): 72 | optimizer_class = getattr(torch.optim, optimizer_type) 73 | else: 74 | optimizer_class = registry.get_optimizer_class(optimizer_type) 75 | if optimizer_class is None: 76 | raise ValueError( 77 | "No optimizer class of type {} present in " 78 | "either torch or registered to registry".format(optimizer_type) 79 | ) 80 | 81 | parameters = get_optimizer_parameters(model, config) 82 | optimizer = optimizer_class(parameters, **params) 83 | return optimizer 84 | -------------------------------------------------------------------------------- /pythia/utils/dataset_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import torch 3 | 4 | from pythia.common.sample import Sample 5 | 6 | 7 | def build_bbox_tensors(infos, max_length): 8 | num_bbox = min(max_length, len(infos)) 9 | 10 | # After num_bbox, everything else should be zero 11 | coord_tensor = torch.zeros((max_length, 4), dtype=torch.float) 12 | width_tensor = torch.zeros(max_length, dtype=torch.float) 13 | height_tensor = torch.zeros(max_length, dtype=torch.float) 14 | bbox_types = ["xyxy"] * max_length 15 | 16 | infos = infos[:num_bbox] 17 | sample = Sample() 18 | 19 | for idx, info in enumerate(infos): 20 | bbox = info["bounding_box"] 21 | x = bbox["top_left_x"] 22 | y = bbox["top_left_y"] 23 | width = bbox["width"] 24 | height = bbox["height"] 25 | 26 | coord_tensor[idx][0] = x 27 | coord_tensor[idx][1] = y 28 | coord_tensor[idx][2] = x + width 29 | coord_tensor[idx][3] = y + height 30 | 31 | width_tensor[idx] = width 32 | height_tensor[idx] = height 33 | sample.coordinates = coord_tensor 34 | sample.width = width_tensor 35 | sample.height = height_tensor 36 | sample.bbox_types = bbox_types 37 | 38 | return sample 39 | -------------------------------------------------------------------------------- /pythia/utils/objects_to_byte_tensor.py: -------------------------------------------------------------------------------- 1 | 2 | # Adapted from 3 | # https://github.com/pytorch/fairseq/blob/master/fairseq/distributed_utils.py 4 | 5 | import pickle 6 | import torch 7 | 8 | MAX_SIZE_LIMIT = 65533 9 | BYTE_SIZE = 256 10 | 11 | 12 | def enc_obj2bytes(obj, max_size=4094): 13 | """ 14 | Encode Python objects to PyTorch byte tensors 15 | """ 16 | assert max_size <= MAX_SIZE_LIMIT 17 | byte_tensor = torch.zeros(max_size, dtype=torch.uint8) 18 | 19 | obj_enc = pickle.dumps(obj) 20 | obj_size = len(obj_enc) 21 | if obj_size > max_size: 22 | raise Exception( 23 | 'objects too large: object size {}, max size {}'.format( 24 | obj_size, max_size 25 | ) 26 | ) 27 | 28 | byte_tensor[0] = obj_size // 256 29 | byte_tensor[1] = obj_size % 256 30 | byte_tensor[2:2+obj_size] = torch.ByteTensor(list(obj_enc)) 31 | return byte_tensor 32 | 33 | 34 | def dec_bytes2obj(byte_tensor, max_size=4094): 35 | """ 36 | Decode PyTorch byte tensors to Python objects 37 | """ 38 | assert max_size <= MAX_SIZE_LIMIT 39 | 40 | obj_size = byte_tensor[0].item() * 256 + byte_tensor[1].item() 41 | obj_enc = bytes(byte_tensor[2:2+obj_size].tolist()) 42 | obj = pickle.loads(obj_enc) 43 | return obj 44 | 45 | 46 | if __name__ == '__main__': 47 | test_obj = [1, '2', {3: 4}, [5]] 48 | test_obj_bytes = enc_obj2bytes(test_obj) 49 | test_obj_dec = dec_bytes2obj(test_obj_bytes) 50 | print(test_obj_dec == test_obj) 51 | --------------------------------------------------------------------------------
/pythia/utils/phoc/__init__.py: -------------------------------------------------------------------------------- 1 | from .build_phoc import build_phoc # NoQA 2 | -------------------------------------------------------------------------------- /pythia/utils/phoc/build_phoc.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from .cphoc import build_phoc as _build_phoc_raw 4 | 5 | 6 | _alphabet = {"a","b","c","d","e","f","g","h","i","j","k","l","m","n","o","p","q","r","s","t","u","v","w","x","y","z","0","1","2","3","4","5","6","7","8","9"} # NoQA 7 | 8 | 9 | def build_phoc(token): 10 | token = token.lower().strip() 11 | token = ''.join([c for c in token if c in _alphabet]) 12 | phoc = _build_phoc_raw(token) 13 | phoc = np.array(phoc, dtype=np.float32) 14 | return phoc 15 | -------------------------------------------------------------------------------- /pythia/utils/timer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import time 3 | 4 | 5 | class Timer: 6 | DEFAULT_TIME_FORMAT_DATE_TIME = "%Y/%m/%d %H:%M:%S" 7 | DEFAULT_TIME_FORMAT = ["%03dms", "%02ds", "%02dm", "%02dh"] 8 | 9 | def __init__(self): 10 | self.start = time.time() * 1000 11 | 12 | def get_current(self): 13 | return self.get_time_hhmmss(self.start) 14 | 15 | def reset(self): 16 | self.start = time.time() * 1000 17 | 18 | def get_time_since_start(self, format=None): 19 | return self.get_time_hhmmss(self.start, format) 20 | 21 | def get_time_hhmmss(self, start=None, end=None, gap=None, format=None): 22 | """ 23 | Calculates time since `start` and formats as a string. 24 | """ 25 | if start is None and gap is None: 26 | 27 | if format is None: 28 | format = self.DEFAULT_TIME_FORMAT_DATE_TIME 29 | 30 | return time.strftime(format) 31 | 32 | if end is None: 33 | end = time.time() * 1000 34 | if gap is None: 35 | gap = end - start 36 | 37 | s, ms = divmod(gap, 1000) 38 | m, s = divmod(s, 60) 39 | h, m = divmod(m, 60) 40 | 41 | if format is None: 42 | format = self.DEFAULT_TIME_FORMAT 43 | 44 | items = [ms, s, m, h] 45 | assert len(items) == len(format), "Format length should be same as items" 46 | 47 | time_str = "" 48 | for idx, item in enumerate(items): 49 | if item != 0: 50 | time_str = format[idx] % item + " " + time_str 51 | 52 | return time_str.strip() 53 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch>=1.2 2 | torchvision>0.2 3 | tensorboardX>=1.2 4 | numpy>=1.14 5 | tqdm>=4.19 6 | demjson>=2.2 7 | torchtext>=0.2 8 | GitPython>=2.1 9 | PyYAML>=3.11 10 | pytest==5.2.0 11 | requests==2.21.0 12 | fastText 13 | nltk==3.4.1 14 | pytorch-transformers==1.2.0 15 | editdistance 16 | -------------------------------------------------------------------------------- /run.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=$1 python -m torch.distributed.launch --nproc_per_node $2 --master_port $4 tools/run.py --tasks captioning --datasets m4c_textcaps --model m4c_captioner \ 2 | --config configs/captioning/m4c_textcaps/m4c_captioner.yml \ 3 | --save_dir save/$3 --resume_file save/$3/m4c_textcaps_m4c_captioner_2021/best.ckpt \ 4 | training_parameters.distributed True 5 | -------------------------------------------------------------------------------- /setup.py: 
-------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | 3 | # Copyright (c) Facebook, Inc. and its affiliates. 4 | import os.path 5 | import shutil 6 | from glob import glob 7 | import sys 8 | 9 | import setuptools 10 | from setuptools import Extension 11 | from setuptools.command.build_ext import build_ext 12 | 13 | sys.path.insert(0, os.path.join(os.path.dirname(__file__), "pythia")) 14 | 15 | with open("README.md", encoding="utf8") as f: 16 | readme = f.read() 17 | 18 | with open("LICENSE") as f: 19 | license = f.read() 20 | 21 | with open("requirements.txt") as f: 22 | reqs = f.read() 23 | 24 | DISTNAME = "pythia" 25 | DESCRIPTION = "pythia: a modular framework for vision and language multimodal \ 26 | research." 27 | LONG_DESCRIPTION = readme 28 | AUTHOR = "Facebook AI Research" 29 | LICENSE = license 30 | REQUIREMENTS = (reqs.strip().split("\n"),) 31 | 32 | ext_modules = [ 33 | Extension( 34 | 'cphoc', 35 | sources=['pythia/utils/phoc/src/cphoc.c'], 36 | language='c', 37 | libraries=["pthread", "dl", "util", "rt", "m"], 38 | extra_compile_args=["-O3"], 39 | ), 40 | ] 41 | 42 | 43 | class BuildExt(build_ext): 44 | def run(self): 45 | build_ext.run(self) 46 | cphoc_lib = glob('build/lib.*/cphoc.*.so')[0] 47 | shutil.copy(cphoc_lib, 'pythia/utils/phoc/cphoc.so') 48 | 49 | 50 | if __name__ == "__main__": 51 | setuptools.setup( 52 | name=DISTNAME, 53 | install_requires=REQUIREMENTS, 54 | packages=setuptools.find_packages(), 55 | ext_modules=ext_modules, 56 | cmdclass={'build_ext': BuildExt}, 57 | version="0.3", 58 | description=DESCRIPTION, 59 | long_description=LONG_DESCRIPTION, 60 | author=AUTHOR, 61 | license=LICENSE, 62 | setup_requires=["pytest-runner"], 63 | tests_require=["flake8", "pytest"], 64 | ) 65 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /tests/data/vocab.txt: -------------------------------------------------------------------------------- 1 | a 2 | man 3 | with 4 | red 5 | helmet 6 | on 7 | small 8 | moped 9 | dirt 10 | road 11 | riding 12 | motor 13 | bike 14 | the 15 | countryside 16 | back 17 | of 18 | motorcycle -------------------------------------------------------------------------------- /tests/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guanghuixu/AnchorCaptioner/3a49ce5de025087cbea00075ec0636aee0525382/tests/modules/__init__.py -------------------------------------------------------------------------------- /tests/modules/test_layers.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | import unittest 3 | 4 | import torch 5 | import random 6 | import operator 7 | import functools 8 | import numpy as np 9 | 10 | import pythia.modules.layers as layers 11 | 12 | 13 | class TestModuleLayers(unittest.TestCase): 14 | def setUp(self): 15 | torch.manual_seed(1234) 16 | 17 | def test_conv_net(self): 18 | conv_net = layers.ConvNet(150, 75, 3) 19 | 20 | input_tensor = torch.randn(4, 150, 64, 64) 21 | output = conv_net(input_tensor) 22 | expected_size = torch.Size((4, 75, 32, 32)) 23 | self.assertEqual(output.size(), expected_size) 24 | # Since seed is fix we can check some of tensor values 25 | np.testing.assert_almost_equal(output[0][0][0][0].item(), 0.149190, decimal=5) 26 | np.testing.assert_almost_equal(output[3][74][31][31].item(), -0.25199, decimal=5) 27 | 28 | 29 | def test_flatten(self): 30 | flatten = layers.Flatten() 31 | 32 | # Test 3 dim 33 | input_tensor = torch.randn(5, 6, 10) 34 | expected_size = torch.Size((5, 60)) 35 | actual_size = flatten(input_tensor).size() 36 | self.assertEqual(actual_size, expected_size) 37 | 38 | # Test 1 dim 39 | input_tensor = torch.randn(5) 40 | expected_size = torch.Size((5,)) 41 | actual_size = flatten(input_tensor).size() 42 | self.assertEqual(actual_size, expected_size) 43 | 44 | # Test 6 dim 45 | size_list = [random.randint(2, 4) for _ in range(7)] 46 | expected_size = torch.Size((size_list[0], functools.reduce(operator.mul, size_list[1:]))) 47 | input_tensor = torch.randn(*size_list) 48 | actual_size = flatten(input_tensor).size() 49 | self.assertEqual(actual_size, expected_size) 50 | 51 | def test_unflatten(self): 52 | unflatten = layers.UnFlatten() 53 | 54 | # Test 2 dim to 3 dim 55 | input_tensor = torch.randn(5, 60) 56 | expected_size = torch.Size((5, 6, 10)) 57 | actual_size = unflatten(input_tensor, sizes=[6, 10]).size() 58 | self.assertEqual(actual_size, expected_size) 59 | 60 | # Test 1 dim 61 | input_tensor = torch.randn(5) 62 | expected_size = torch.Size((5,)) 63 | actual_size = unflatten(input_tensor, sizes=[]).size() 64 | self.assertEqual(expected_size, actual_size) 65 | -------------------------------------------------------------------------------- /tests/modules/test_losses.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import unittest 3 | 4 | import pythia.modules.losses as losses 5 | import torch 6 | 7 | 8 | class TestModuleLosses(unittest.TestCase): 9 | def test_caption_cross_entropy(self): 10 | caption_ce_loss = losses.CaptionCrossEntropyLoss() 11 | 12 | expected = dict() 13 | predicted = dict() 14 | 15 | # Test complete match 16 | expected["targets"] = torch.empty((1, 10), dtype=torch.long) 17 | expected["targets"].fill_(4) 18 | predicted["scores"] = torch.zeros((1, 10, 10)) 19 | predicted["scores"][:, :, 4] = 100.0 20 | 21 | self.assertEqual(caption_ce_loss(expected, predicted).item(), 0.0) 22 | 23 | # Test random initialized 24 | torch.manual_seed(1234) 25 | expected["targets"] = torch.randint(0, 9491, (5, 10)) 26 | predicted["scores"] = torch.rand((5, 10, 9491)) 27 | 28 | self.assertAlmostEqual(caption_ce_loss(expected, predicted).item(), 9.2507, 4) 29 | -------------------------------------------------------------------------------- /tests/modules/test_metrics.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | import os 3 | import unittest 4 | 5 | import yaml 6 | 7 | import pythia.modules.metrics as metrics 8 | import torch 9 | from pythia.common.registry import registry 10 | from pythia.common.sample import Sample 11 | from pythia.datasets.processors import CaptionProcessor 12 | from pythia.utils.configuration import ConfigNode 13 | 14 | 15 | class TestModuleMetrics(unittest.TestCase): 16 | def test_caption_bleu4(self): 17 | path = os.path.join( 18 | os.path.abspath(__file__), 19 | "../../../pythia/common/defaults/configs/datasets/captioning/coco.yml", 20 | ) 21 | with open(os.path.abspath(path)) as f: 22 | config = yaml.load(f, Loader=yaml.FullLoader) 23 | 24 | config = ConfigNode(config) 25 | captioning_config = config.dataset_attributes.coco 26 | caption_processor_config = captioning_config.processors.caption_processor 27 | vocab_path = os.path.join(os.path.abspath(__file__), "..", "..", "data", "vocab.txt") 28 | caption_processor_config.params.vocab.vocab_file = os.path.abspath(vocab_path) 29 | caption_processor = CaptionProcessor(caption_processor_config.params) 30 | registry.register("coco_caption_processor", caption_processor) 31 | 32 | caption_bleu4 = metrics.CaptionBleu4Metric() 33 | expected = Sample() 34 | predicted = dict() 35 | 36 | # Test complete match 37 | expected.answers = torch.empty((5, 5, 10)) 38 | expected.answers.fill_(4) 39 | predicted["scores"] = torch.zeros((5, 10, 19)) 40 | predicted["scores"][:, :, 4] = 1.0 41 | 42 | self.assertEqual(caption_bleu4.calculate(expected, predicted).item(), 1.0) 43 | 44 | # Test partial match 45 | expected.answers = torch.empty((5, 5, 10)) 46 | expected.answers.fill_(4) 47 | predicted["scores"] = torch.zeros((5, 10, 19)) 48 | predicted["scores"][:, 0:5, 4] = 1.0 49 | 50 | self.assertAlmostEqual( 51 | caption_bleu4.calculate(expected, predicted).item(), 0.3928, 4 52 | ) 53 | -------------------------------------------------------------------------------- /tests/tasks/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/guanghuixu/AnchorCaptioner/3a49ce5de025087cbea00075ec0636aee0525382/tests/tasks/__init__.py -------------------------------------------------------------------------------- /tests/tasks/test_base_dataset.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | import unittest 3 | import os 4 | 5 | from pythia.common.registry import registry 6 | from pythia.datasets.base_dataset import BaseDataset 7 | from pythia.utils.configuration import Configuration 8 | 9 | 10 | class TestBaseDataset(unittest.TestCase): 11 | def test_init_processors(self): 12 | path = os.path.join( 13 | os.path.abspath(__file__), 14 | "../../../pythia/common/defaults/configs/datasets/vqa/vqa2.yml" 15 | ) 16 | 17 | configuration = Configuration(os.path.abspath(path)) 18 | self._fix_configuration(configuration) 19 | configuration.freeze() 20 | 21 | base_dataset = BaseDataset( 22 | "vqa2", 23 | "train", 24 | configuration.get_config()["dataset_attributes"]["vqa2"], 25 | ) 26 | expected_processors = [ 27 | "answer_processor", 28 | "ocr_token_processor", 29 | "bbox_processor", 30 | ] 31 | 32 | # Check no processors are initialized before init_processors call 33 | self.assertFalse(any(hasattr(base_dataset, key) 34 | for key in expected_processors)) 35 | 36 | for processor in expected_processors: 37 | self.assertIsNone(registry.get("{}_{}".format("vqa2", processor))) 38 | 39 | # Check processors are initialized after init_processors 40 | base_dataset.init_processors() 41 | self.assertTrue(all(hasattr(base_dataset, key) 42 | for key in expected_processors)) 43 | for processor in expected_processors: 44 | self.assertIsNotNone(registry.get("{}_{}".format("vqa2", processor))) 45 | 46 | def _fix_configuration(self, configuration): 47 | vqa2_config = configuration.config['dataset_attributes']['vqa2'] 48 | processors = vqa2_config['processors'] 49 | processors.pop('text_processor') 50 | processors.pop('context_processor') 51 | -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | def compare_tensors(a, b): 5 | return torch.all(a.eq(b)) 6 | -------------------------------------------------------------------------------- /tests/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | -------------------------------------------------------------------------------- /tests/utils/test_general.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | import unittest 3 | 4 | from pythia.utils.general import (dict_to_string, get_overlap_score) 5 | 6 | 7 | class TestUtilsGeneral(unittest.TestCase): 8 | def test_dict_to_string(self): 9 | dictionary = {"one": 1, "two": 2, "three": 3} 10 | expected = "one: 1.0000, two: 2.0000, three: 3.0000" 11 | 12 | self.assertEqual(dict_to_string(dictionary), expected) 13 | 14 | # TODO: Move later to configuration tests 15 | # def test_nested_dict_update(self): 16 | # # Updates value 17 | # dictionary = {"level1": {"level2": {"levelA": 0, "levelB": 1}}} 18 | # update = {"level1": {"level2": {"levelB": 10}}} 19 | # expected = {"level1": {"level2": {"levelA": 0, "levelB": 10}}} 20 | # 21 | # self.assertEqual(nested_dict_update(dictionary, update), expected) 22 | # 23 | # # Adds new value 24 | # dictionary = {"level1": {"level2": {"levelA": 0}}} 25 | # update = {"level1": {"level2": {"levelB": 10}}} 26 | # expected = {"level1": {"level2": {"levelA": 0, "levelB": 10}}} 27 | # 28 | # self.assertEqual(nested_dict_update(dictionary, update), expected) 29 | 30 | def test_get_overlap_score(self): 31 | # Full overlap 32 | candidate = "pythia" 33 | target = "pythia" 34 | self.assertEqual(get_overlap_score(candidate, target), 1.0) 35 | 36 | # Partial overlap 37 | candidate = "pythia" 38 | target = "python" 39 | self.assertEqual(get_overlap_score(candidate, target), 2 / 3) 40 | 41 | # No overlap 42 | candidate = "pythia" 43 | target = "vqa" 44 | self.assertEqual(get_overlap_score(candidate, target), 0.0) 45 | -------------------------------------------------------------------------------- /tests/utils/test_timer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | import time 3 | import unittest 4 | 5 | from pythia.utils.timer import Timer 6 | 7 | 8 | class TestUtilsTimer(unittest.TestCase): 9 | def test_get_current(self): 10 | timer = Timer() 11 | expected = "000ms" 12 | 13 | self.assertEqual(timer.get_current(), expected) 14 | 15 | def test_reset(self): 16 | timer = Timer() 17 | time.sleep(2) 18 | timer.reset() 19 | expected = "000ms" 20 | 21 | self.assertEqual(timer.get_current(), expected) 22 | 23 | def test_get_time_since_start(self): 24 | timer = Timer() 25 | time.sleep(2) 26 | expected = "02s " 27 | 28 | self.assertTrue(expected in timer.get_time_since_start()) 29 | -------------------------------------------------------------------------------- /tools/bert/extract_bert.sh: -------------------------------------------------------------------------------- 1 | N_REM=`expr $3 - 1` 2 | 3 | for i in $(seq 0 $N_REM); do 4 | python tools/extract_bert_embeddings.py --imdb_path $1 --out_path $2 --group_id $i --n_groups $3 & 5 | done 6 | -------------------------------------------------------------------------------- /tools/bert/extract_bert_embeddings.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 
2 | import argparse 3 | import multiprocessing 4 | import os 5 | 6 | import numpy as np 7 | import torch 8 | from pytorch_pretrained_bert import BertModel, BertTokenizer 9 | from tqdm import tqdm 10 | 11 | 12 | class BertFeatExtractor(object): 13 | def __init__(self, model_name): 14 | self.tokenizer = BertTokenizer.from_pretrained(model_name) 15 | self.model = BertModel.from_pretrained(model_name).eval() 16 | self.model.cuda() 17 | 18 | def get_bert_embedding(self, text): 19 | tokenized_text = self.tokenizer.tokenize(text) 20 | tokenized_text = ["[CLS]"] + tokenized_text + ["[SEP]"] 21 | indexed_tokens = self.tokenizer.convert_tokens_to_ids(tokenized_text) 22 | tokens_tensor = torch.Tensor([indexed_tokens]).long() 23 | segments_tensor = torch.Tensor([0] * len(tokenized_text)).long() 24 | with torch.no_grad(): 25 | encoded_layers, _ = self.model( 26 | tokens_tensor.cuda(), 27 | segments_tensor.cuda(), 28 | output_all_encoded_layers=False, 29 | ) 30 | return encoded_layers.squeeze()[0] 31 | 32 | 33 | def extract_bert(imdb_path, out_path, group_id=0, n_groups=1): 34 | imdb = np.load(imdb_path) 35 | 36 | feat_extractor = BertFeatExtractor("bert-base-uncased") 37 | 38 | if group_id == 0: 39 | iterator_obj = tqdm(imdb[1:]) 40 | else: 41 | iterator_obj = imdb[1:] 42 | 43 | for idx, el in enumerate(iterator_obj): 44 | if idx % n_groups != group_id: 45 | continue 46 | emb = feat_extractor.get_bert_embedding(el["question_str"]) 47 | save_path = out_path + str(el["question_id"]) 48 | np.save(save_path, emb.cpu().numpy()) 49 | 50 | 51 | if __name__ == "__main__": 52 | parser = argparse.ArgumentParser() 53 | parser.add_argument("--imdb_path", type=str, default=None) 54 | parser.add_argument("--out_path", type=str, default=None) 55 | parser.add_argument("--group_id", type=int, default=0) 56 | parser.add_argument("--n_groups", type=int, default=1) 57 | args = parser.parse_args() 58 | extract_bert(args.imdb_path, args.out_path, args.group_id, args.n_groups) 59 | -------------------------------------------------------------------------------- /val.sh: -------------------------------------------------------------------------------- 1 | CUDA_VISIBLE_DEVICES=$1 python tools/run.py --tasks captioning --datasets m4c_textcaps --model m4c_captioner \ 2 | --config configs/captioning/m4c_textcaps/m4c_captioner.yml \ 3 | --save_dir save/$2 \ 4 | --run_type $3 --resume_file $4 \ 5 | --evalai_inference 1 6 | --------------------------------------------------------------------------------
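Usage note (illustrative, not part of the repository sources above): run.sh and val.sh take positional arguments that fill in the GPU ids, the experiment directory name under save/, the run type, and the checkpoint to resume from. A minimal sketch of an evaluation call for val.sh, assuming a single GPU and a hypothetical experiment directory named my_m4c_captioner that already contains a trained checkpoint:

```bash
# Positional arguments: <gpu ids> <save dir name under save/> <run type> <resume checkpoint>
bash val.sh 0 my_m4c_captioner val save/my_m4c_captioner/m4c_textcaps_m4c_captioner_2021/best.ckpt
```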