├── .github └── workflows │ └── docs.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── CODEOWNERS ├── CODE_OF_CONDUCT.md ├── LICENSE.txt ├── MANIFEST.in ├── README.md ├── SECURITY.md ├── app ├── __init__.py ├── calculate_coco_features.py ├── caption.py ├── classification.py ├── dataset_browser.py ├── image_text_match.py ├── main.py ├── multimodal_search.py ├── multipage.py ├── text_localization.py ├── utils.py └── vqa.py ├── assets └── demo-6.png ├── dataset_card ├── avsd_dialogue.md ├── coco_caption.md ├── coco_retrieval.md ├── conceptual_captions.md ├── didemo_retrieval.md ├── flickr_retrieval.md ├── gqa.md ├── imgs │ ├── NLVR2.png │ ├── avsd_dialogue.png │ ├── coco_caption.png │ ├── conceptual_captions.png │ ├── didemo.png │ ├── flickr30k.png │ ├── gqa.png │ ├── msrvtt.png │ ├── msrvtt_qa.png │ ├── msvd_qa.png │ ├── nocaps.png │ ├── sbu_caption.png │ ├── snli_ve.png │ └── vqav2.png ├── msrvtt_qa.md ├── msrvtt_retrieval.md ├── msvd_qa.md ├── nlvr2.md ├── nocaps.md ├── sbu_caption.md ├── snli_visual_entailment.md └── vqav2.md ├── docs ├── Makefile ├── _static │ ├── Confusing-Pictures.jpg │ ├── architecture.png │ ├── logo_final.png │ └── merlion.png ├── benchmark.rst ├── build_docs.sh ├── conf.py ├── getting_started.rst ├── index.rst ├── intro.rst ├── make.bat ├── requirements.txt ├── tutorial.configs.rst ├── tutorial.datasets.rst ├── tutorial.evaluation.rst ├── tutorial.models.rst ├── tutorial.processors.rst ├── tutorial.rst ├── tutorial.tasks.rst └── tutorial.training-example.rst ├── evaluate.py ├── examples ├── albef_feature_extraction.ipynb ├── albef_vqa.ipynb ├── albef_zero_shot_classification.ipynb ├── blip2_feature_extraction.ipynb ├── blip2_image_text_matching.ipynb ├── blip2_instructed_generation.ipynb ├── blip_feature_extraction.ipynb ├── blip_image_captioning.ipynb ├── blip_image_text_matching.ipynb ├── blip_text_localization.ipynb ├── blip_vqa.ipynb ├── blip_zero_shot_classification.ipynb ├── clip_feature_extraction.ipynb └── clip_zero_shot_classification.ipynb ├── lavis ├── __init__.py ├── common │ ├── annotator │ │ ├── canny │ │ │ └── __init__.py │ │ ├── ckpts │ │ │ └── download.sh │ │ ├── hed │ │ │ └── __init__.py │ │ ├── midas │ │ │ ├── __init__.py │ │ │ ├── api.py │ │ │ ├── midas │ │ │ │ ├── __init__.py │ │ │ │ ├── base_model.py │ │ │ │ ├── blocks.py │ │ │ │ ├── dpt_depth.py │ │ │ │ ├── midas_net.py │ │ │ │ ├── midas_net_custom.py │ │ │ │ ├── transforms.py │ │ │ │ └── vit.py │ │ │ └── utils.py │ │ ├── mlsd │ │ │ ├── __init__.py │ │ │ ├── models │ │ │ │ ├── mbv2_mlsd_large.py │ │ │ │ └── mbv2_mlsd_tiny.py │ │ │ └── utils.py │ │ ├── openpose │ │ │ ├── __init__.py │ │ │ ├── body.py │ │ │ ├── hand.py │ │ │ ├── model.py │ │ │ └── util.py │ │ ├── uniformer │ │ │ ├── __init__.py │ │ │ ├── configs │ │ │ │ └── _base_ │ │ │ │ │ ├── datasets │ │ │ │ │ ├── ade20k.py │ │ │ │ │ ├── chase_db1.py │ │ │ │ │ ├── cityscapes.py │ │ │ │ │ ├── cityscapes_769x769.py │ │ │ │ │ ├── drive.py │ │ │ │ │ ├── hrf.py │ │ │ │ │ ├── pascal_context.py │ │ │ │ │ ├── pascal_context_59.py │ │ │ │ │ ├── pascal_voc12.py │ │ │ │ │ ├── pascal_voc12_aug.py │ │ │ │ │ └── stare.py │ │ │ │ │ ├── default_runtime.py │ │ │ │ │ ├── models │ │ │ │ │ ├── ann_r50-d8.py │ │ │ │ │ ├── apcnet_r50-d8.py │ │ │ │ │ ├── ccnet_r50-d8.py │ │ │ │ │ ├── cgnet.py │ │ │ │ │ ├── danet_r50-d8.py │ │ │ │ │ ├── deeplabv3_r50-d8.py │ │ │ │ │ ├── deeplabv3_unet_s5-d16.py │ │ │ │ │ ├── deeplabv3plus_r50-d8.py │ │ │ │ │ ├── dmnet_r50-d8.py │ │ │ │ │ ├── dnl_r50-d8.py │ │ │ │ │ ├── emanet_r50-d8.py │ │ │ │ │ ├── encnet_r50-d8.py │ │ │ │ │ ├── 
fast_scnn.py │ │ │ │ │ ├── fcn_hr18.py │ │ │ │ │ ├── fcn_r50-d8.py │ │ │ │ │ ├── fcn_unet_s5-d16.py │ │ │ │ │ ├── fpn_r50.py │ │ │ │ │ ├── fpn_uniformer.py │ │ │ │ │ ├── gcnet_r50-d8.py │ │ │ │ │ ├── lraspp_m-v3-d8.py │ │ │ │ │ ├── nonlocal_r50-d8.py │ │ │ │ │ ├── ocrnet_hr18.py │ │ │ │ │ ├── ocrnet_r50-d8.py │ │ │ │ │ ├── pointrend_r50.py │ │ │ │ │ ├── psanet_r50-d8.py │ │ │ │ │ ├── pspnet_r50-d8.py │ │ │ │ │ ├── pspnet_unet_s5-d16.py │ │ │ │ │ ├── upernet_r50.py │ │ │ │ │ └── upernet_uniformer.py │ │ │ │ │ └── schedules │ │ │ │ │ ├── schedule_160k.py │ │ │ │ │ ├── schedule_20k.py │ │ │ │ │ ├── schedule_40k.py │ │ │ │ │ └── schedule_80k.py │ │ │ ├── exp │ │ │ │ └── upernet_global_small │ │ │ │ │ ├── config.py │ │ │ │ │ ├── run.sh │ │ │ │ │ ├── test.sh │ │ │ │ │ ├── test_config_g.py │ │ │ │ │ ├── test_config_h32.py │ │ │ │ │ └── test_config_w32.py │ │ │ ├── mmcv │ │ │ │ ├── __init__.py │ │ │ │ ├── arraymisc │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── quantization.py │ │ │ │ ├── cnn │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── alexnet.py │ │ │ │ │ ├── bricks │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── activation.py │ │ │ │ │ │ ├── context_block.py │ │ │ │ │ │ ├── conv.py │ │ │ │ │ │ ├── conv2d_adaptive_padding.py │ │ │ │ │ │ ├── conv_module.py │ │ │ │ │ │ ├── conv_ws.py │ │ │ │ │ │ ├── depthwise_separable_conv_module.py │ │ │ │ │ │ ├── drop.py │ │ │ │ │ │ ├── generalized_attention.py │ │ │ │ │ │ ├── hsigmoid.py │ │ │ │ │ │ ├── hswish.py │ │ │ │ │ │ ├── non_local.py │ │ │ │ │ │ ├── norm.py │ │ │ │ │ │ ├── padding.py │ │ │ │ │ │ ├── plugin.py │ │ │ │ │ │ ├── registry.py │ │ │ │ │ │ ├── scale.py │ │ │ │ │ │ ├── swish.py │ │ │ │ │ │ ├── transformer.py │ │ │ │ │ │ ├── upsample.py │ │ │ │ │ │ └── wrappers.py │ │ │ │ │ ├── builder.py │ │ │ │ │ ├── resnet.py │ │ │ │ │ ├── utils │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── flops_counter.py │ │ │ │ │ │ ├── fuse_conv_bn.py │ │ │ │ │ │ ├── sync_bn.py │ │ │ │ │ │ └── weight_init.py │ │ │ │ │ └── vgg.py │ │ │ │ ├── engine │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── test.py │ │ │ │ ├── fileio │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── file_client.py │ │ │ │ │ ├── handlers │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── base.py │ │ │ │ │ │ ├── json_handler.py │ │ │ │ │ │ ├── pickle_handler.py │ │ │ │ │ │ └── yaml_handler.py │ │ │ │ │ ├── io.py │ │ │ │ │ └── parse.py │ │ │ │ ├── image │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── colorspace.py │ │ │ │ │ ├── geometric.py │ │ │ │ │ ├── io.py │ │ │ │ │ ├── misc.py │ │ │ │ │ └── photometric.py │ │ │ │ ├── model_zoo │ │ │ │ │ ├── deprecated.json │ │ │ │ │ ├── mmcls.json │ │ │ │ │ └── open_mmlab.json │ │ │ │ ├── ops │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── assign_score_withk.py │ │ │ │ │ ├── ball_query.py │ │ │ │ │ ├── bbox.py │ │ │ │ │ ├── border_align.py │ │ │ │ │ ├── box_iou_rotated.py │ │ │ │ │ ├── carafe.py │ │ │ │ │ ├── cc_attention.py │ │ │ │ │ ├── contour_expand.py │ │ │ │ │ ├── corner_pool.py │ │ │ │ │ ├── correlation.py │ │ │ │ │ ├── deform_conv.py │ │ │ │ │ ├── deform_roi_pool.py │ │ │ │ │ ├── deprecated_wrappers.py │ │ │ │ │ ├── focal_loss.py │ │ │ │ │ ├── furthest_point_sample.py │ │ │ │ │ ├── fused_bias_leakyrelu.py │ │ │ │ │ ├── gather_points.py │ │ │ │ │ ├── group_points.py │ │ │ │ │ ├── info.py │ │ │ │ │ ├── iou3d.py │ │ │ │ │ ├── knn.py │ │ │ │ │ ├── masked_conv.py │ │ │ │ │ ├── merge_cells.py │ │ │ │ │ ├── modulated_deform_conv.py │ │ │ │ │ ├── multi_scale_deform_attn.py │ │ │ │ │ ├── nms.py │ │ │ │ │ ├── pixel_group.py │ │ │ │ │ ├── point_sample.py │ │ │ │ │ ├── points_in_boxes.py │ │ │ │ │ ├── points_sampler.py │ │ │ │ 
│ ├── psa_mask.py │ │ │ │ │ ├── roi_align.py │ │ │ │ │ ├── roi_align_rotated.py │ │ │ │ │ ├── roi_pool.py │ │ │ │ │ ├── roiaware_pool3d.py │ │ │ │ │ ├── roipoint_pool3d.py │ │ │ │ │ ├── saconv.py │ │ │ │ │ ├── scatter_points.py │ │ │ │ │ ├── sync_bn.py │ │ │ │ │ ├── three_interpolate.py │ │ │ │ │ ├── three_nn.py │ │ │ │ │ ├── tin_shift.py │ │ │ │ │ ├── upfirdn2d.py │ │ │ │ │ └── voxelize.py │ │ │ │ ├── parallel │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── _functions.py │ │ │ │ │ ├── collate.py │ │ │ │ │ ├── data_container.py │ │ │ │ │ ├── data_parallel.py │ │ │ │ │ ├── distributed.py │ │ │ │ │ ├── distributed_deprecated.py │ │ │ │ │ ├── registry.py │ │ │ │ │ ├── scatter_gather.py │ │ │ │ │ └── utils.py │ │ │ │ ├── runner │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base_module.py │ │ │ │ │ ├── base_runner.py │ │ │ │ │ ├── builder.py │ │ │ │ │ ├── checkpoint.py │ │ │ │ │ ├── default_constructor.py │ │ │ │ │ ├── dist_utils.py │ │ │ │ │ ├── epoch_based_runner.py │ │ │ │ │ ├── fp16_utils.py │ │ │ │ │ ├── hooks │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── checkpoint.py │ │ │ │ │ │ ├── closure.py │ │ │ │ │ │ ├── ema.py │ │ │ │ │ │ ├── evaluation.py │ │ │ │ │ │ ├── hook.py │ │ │ │ │ │ ├── iter_timer.py │ │ │ │ │ │ ├── logger │ │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ │ ├── base.py │ │ │ │ │ │ │ ├── dvclive.py │ │ │ │ │ │ │ ├── mlflow.py │ │ │ │ │ │ │ ├── neptune.py │ │ │ │ │ │ │ ├── pavi.py │ │ │ │ │ │ │ ├── tensorboard.py │ │ │ │ │ │ │ ├── text.py │ │ │ │ │ │ │ └── wandb.py │ │ │ │ │ │ ├── lr_updater.py │ │ │ │ │ │ ├── memory.py │ │ │ │ │ │ ├── momentum_updater.py │ │ │ │ │ │ ├── optimizer.py │ │ │ │ │ │ ├── profiler.py │ │ │ │ │ │ ├── sampler_seed.py │ │ │ │ │ │ └── sync_buffer.py │ │ │ │ │ ├── iter_based_runner.py │ │ │ │ │ ├── log_buffer.py │ │ │ │ │ ├── optimizer │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── builder.py │ │ │ │ │ │ └── default_constructor.py │ │ │ │ │ ├── priority.py │ │ │ │ │ └── utils.py │ │ │ │ ├── utils │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── config.py │ │ │ │ │ ├── env.py │ │ │ │ │ ├── ext_loader.py │ │ │ │ │ ├── logging.py │ │ │ │ │ ├── misc.py │ │ │ │ │ ├── parrots_jit.py │ │ │ │ │ ├── parrots_wrapper.py │ │ │ │ │ ├── path.py │ │ │ │ │ ├── progressbar.py │ │ │ │ │ ├── registry.py │ │ │ │ │ ├── testing.py │ │ │ │ │ ├── timer.py │ │ │ │ │ ├── trace.py │ │ │ │ │ └── version_utils.py │ │ │ │ ├── version.py │ │ │ │ ├── video │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── io.py │ │ │ │ │ ├── optflow.py │ │ │ │ │ └── processing.py │ │ │ │ └── visualization │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── color.py │ │ │ │ │ ├── image.py │ │ │ │ │ └── optflow.py │ │ │ ├── mmcv_custom │ │ │ │ ├── __init__.py │ │ │ │ └── checkpoint.py │ │ │ └── mmseg │ │ │ │ ├── apis │ │ │ │ ├── __init__.py │ │ │ │ ├── inference.py │ │ │ │ ├── test.py │ │ │ │ └── train.py │ │ │ │ ├── core │ │ │ │ ├── __init__.py │ │ │ │ ├── evaluation │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── class_names.py │ │ │ │ │ ├── eval_hooks.py │ │ │ │ │ └── metrics.py │ │ │ │ ├── seg │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── builder.py │ │ │ │ │ └── sampler │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── base_pixel_sampler.py │ │ │ │ │ │ └── ohem_pixel_sampler.py │ │ │ │ └── utils │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── misc.py │ │ │ │ ├── datasets │ │ │ │ ├── __init__.py │ │ │ │ ├── ade.py │ │ │ │ ├── builder.py │ │ │ │ ├── chase_db1.py │ │ │ │ ├── cityscapes.py │ │ │ │ ├── custom.py │ │ │ │ ├── dataset_wrappers.py │ │ │ │ ├── drive.py │ │ │ │ ├── hrf.py │ │ │ │ ├── pascal_context.py │ │ │ │ ├── pipelines │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── compose.py │ 
│ │ │ │ ├── formating.py │ │ │ │ │ ├── loading.py │ │ │ │ │ ├── test_time_aug.py │ │ │ │ │ └── transforms.py │ │ │ │ ├── stare.py │ │ │ │ └── voc.py │ │ │ │ ├── models │ │ │ │ ├── __init__.py │ │ │ │ ├── backbones │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── cgnet.py │ │ │ │ │ ├── fast_scnn.py │ │ │ │ │ ├── hrnet.py │ │ │ │ │ ├── mobilenet_v2.py │ │ │ │ │ ├── mobilenet_v3.py │ │ │ │ │ ├── resnest.py │ │ │ │ │ ├── resnet.py │ │ │ │ │ ├── resnext.py │ │ │ │ │ ├── unet.py │ │ │ │ │ ├── uniformer.py │ │ │ │ │ └── vit.py │ │ │ │ ├── builder.py │ │ │ │ ├── decode_heads │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── ann_head.py │ │ │ │ │ ├── apc_head.py │ │ │ │ │ ├── aspp_head.py │ │ │ │ │ ├── cascade_decode_head.py │ │ │ │ │ ├── cc_head.py │ │ │ │ │ ├── da_head.py │ │ │ │ │ ├── decode_head.py │ │ │ │ │ ├── dm_head.py │ │ │ │ │ ├── dnl_head.py │ │ │ │ │ ├── ema_head.py │ │ │ │ │ ├── enc_head.py │ │ │ │ │ ├── fcn_head.py │ │ │ │ │ ├── fpn_head.py │ │ │ │ │ ├── gc_head.py │ │ │ │ │ ├── lraspp_head.py │ │ │ │ │ ├── nl_head.py │ │ │ │ │ ├── ocr_head.py │ │ │ │ │ ├── point_head.py │ │ │ │ │ ├── psa_head.py │ │ │ │ │ ├── psp_head.py │ │ │ │ │ ├── sep_aspp_head.py │ │ │ │ │ ├── sep_fcn_head.py │ │ │ │ │ └── uper_head.py │ │ │ │ ├── losses │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── accuracy.py │ │ │ │ │ ├── cross_entropy_loss.py │ │ │ │ │ ├── dice_loss.py │ │ │ │ │ ├── lovasz_loss.py │ │ │ │ │ └── utils.py │ │ │ │ ├── necks │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── fpn.py │ │ │ │ │ └── multilevel_neck.py │ │ │ │ ├── segmentors │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── cascade_encoder_decoder.py │ │ │ │ │ └── encoder_decoder.py │ │ │ │ └── utils │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── drop.py │ │ │ │ │ ├── inverted_residual.py │ │ │ │ │ ├── make_divisible.py │ │ │ │ │ ├── res_layer.py │ │ │ │ │ ├── se_layer.py │ │ │ │ │ ├── self_attention_block.py │ │ │ │ │ ├── up_conv_block.py │ │ │ │ │ └── weight_init.py │ │ │ │ ├── ops │ │ │ │ ├── __init__.py │ │ │ │ ├── encoding.py │ │ │ │ └── wrappers.py │ │ │ │ └── utils │ │ │ │ ├── __init__.py │ │ │ │ ├── collect_env.py │ │ │ │ └── logger.py │ │ └── util.py │ ├── config.py │ ├── dist_utils.py │ ├── gradcam.py │ ├── logger.py │ ├── optims.py │ ├── registry.py │ ├── utils.py │ └── vqa_tools │ │ ├── __init__.py │ │ ├── vqa.py │ │ └── vqa_eval.py ├── configs │ ├── datasets │ │ ├── aokvqa │ │ │ ├── defaults.yaml │ │ │ └── defaults_instruct.yaml │ │ ├── audiocaps │ │ │ ├── defaults_mm_cap.yaml │ │ │ ├── defaults_mm_cap_instruct.yaml │ │ │ └── defaults_mm_qa.yaml │ │ ├── audioset │ │ │ ├── defaults_mm_cap.yaml │ │ │ └── defaults_mm_cap_instruct.yaml │ │ ├── avsd │ │ │ ├── defaults_dial.yaml │ │ │ └── defaults_mm_dial_instruct.yaml │ │ ├── blip_diffusion_datasets │ │ │ └── defaults.yaml │ │ ├── capfilt14m │ │ │ ├── defaults_cap.yaml │ │ │ └── defaults_cap_instruct.yaml │ │ ├── charade │ │ │ ├── defaults_cap.yaml │ │ │ └── defaults_cap_instruct.yaml │ │ ├── clotho │ │ │ ├── defaults_mm_cap.yaml │ │ │ ├── defaults_mm_cap_instruct.yaml │ │ │ └── defaults_mm_qa.yaml │ │ ├── coco │ │ │ ├── defaults_cap.yaml │ │ │ ├── defaults_cap_instruct.yaml │ │ │ ├── defaults_ret.yaml │ │ │ ├── defaults_vqa.yaml │ │ │ ├── defaults_vqa_instruct.yaml │ │ │ └── eval_vqa.yaml │ │ ├── coin │ │ │ ├── defaults_cap.yaml │ │ │ └── defaults_cap_instruct.yaml │ │ ├── conceptual_caption │ │ │ ├── defaults_12m.yaml │ │ │ ├── defaults_12m_instruct.yaml │ │ │ ├── defaults_3m.yaml │ │ │ └── defaults_3m_instruct.yaml │ │ ├── didemo │ │ │ └── defaults_ret.yaml │ │ ├── discriminatory_reasoning │ │ │ ├── 
defaults_mm_audio_video.yaml │ │ │ ├── defaults_mm_image_pc.yaml │ │ │ └── discriminatory_dataset │ │ │ │ ├── audiocaps_discrn.json │ │ │ │ └── objaverse_discrn.json │ │ ├── esc50 │ │ │ └── defaults_mm_cls.yaml │ │ ├── flickr30k │ │ │ ├── defaults.yaml │ │ │ ├── defaults_cap.yaml │ │ │ └── defaults_cap_instruct.yaml │ │ ├── gqa │ │ │ ├── balanced_testdev.yaml │ │ │ ├── balanced_testdev_instruct.yaml │ │ │ ├── balanced_val.yaml │ │ │ ├── balanced_val_instruct.yaml │ │ │ ├── defaults.yaml │ │ │ └── defaults_instruct.yaml │ │ ├── iconqa │ │ │ ├── defaults.yaml │ │ │ └── defaults_instruct.yaml │ │ ├── imagenet │ │ │ └── defaults.yaml │ │ ├── laion │ │ │ ├── defaults_2B_multi.yaml │ │ │ ├── defaults_400M.yaml │ │ │ └── defaults_400M_instruct.yaml │ │ ├── llava150k │ │ │ └── defaults_dial.yaml │ │ ├── modelnet40 │ │ │ └── defaults_cls.yaml │ │ ├── msrvtt │ │ │ ├── defaults_cap.yaml │ │ │ ├── defaults_cap_instruct.yaml │ │ │ ├── defaults_qa.yaml │ │ │ ├── defaults_qa_instruct.yaml │ │ │ └── defaults_ret.yaml │ │ ├── msvd │ │ │ ├── defaults_cap.yaml │ │ │ ├── defaults_cap_instruct.yaml │ │ │ ├── defaults_qa.yaml │ │ │ └── defaults_qa_instruct.yaml │ │ ├── music_avqa │ │ │ ├── defaults_mm_qa.yaml │ │ │ └── defaults_mm_qa_instruct.yaml │ │ ├── nlvr │ │ │ └── defaults.yaml │ │ ├── nocaps │ │ │ └── defaults.yaml │ │ ├── objaverse │ │ │ ├── defaults_mm_cap.yaml │ │ │ ├── defaults_mm_cap_instruct.yaml │ │ │ └── defaults_mm_qa.yaml │ │ ├── ocrvqa │ │ │ ├── defaults.yaml │ │ │ └── defaults_instruct.yaml │ │ ├── okvqa │ │ │ ├── defaults.yaml │ │ │ └── defaults_instruct.yaml │ │ ├── sbu_caption │ │ │ ├── defaults.yaml │ │ │ └── defaults_instruct.yaml │ │ ├── scienceqa │ │ │ ├── defaults.yaml │ │ │ └── defaults_instruct.yaml │ │ ├── shapenet │ │ │ ├── defaults_mm_cap.yaml │ │ │ └── defaults_mm_cap_instruct.yaml │ │ ├── snli_ve │ │ │ ├── defaults.yaml │ │ │ └── defaults_instruct.yaml │ │ ├── textcaps │ │ │ ├── defaults.yaml │ │ │ └── defaults_instruct.yaml │ │ ├── valor │ │ │ ├── defaults_mm_cap.yaml │ │ │ └── defaults_mm_cap_instruct.yaml │ │ ├── vatex │ │ │ ├── defaults_cap.yaml │ │ │ └── defaults_cap_instruct.yaml │ │ ├── vg │ │ │ ├── defaults_caption.yaml │ │ │ ├── defaults_caption_instruct.yaml │ │ │ ├── defaults_vqa.yaml │ │ │ └── defaults_vqa_instruct.yaml │ │ ├── violin │ │ │ ├── defaults_cap.yaml │ │ │ ├── defaults_cap_instruct.yaml │ │ │ ├── defaults_entail.yaml │ │ │ └── defaults_entail_instruct.yaml │ │ ├── visdial │ │ │ ├── defaults_dial.yaml │ │ │ └── defaults_dial_instruct.yaml │ │ ├── vizwiz │ │ │ └── defaults.yaml │ │ ├── vlep │ │ │ ├── defaults_cap.yaml │ │ │ └── defaults_cap_instruct.yaml │ │ ├── vsr │ │ │ ├── defaults.yaml │ │ │ ├── defaults_classification.yaml │ │ │ ├── defaults_classification_instruct.yaml │ │ │ └── defaults_instruct.yaml │ │ ├── wavcaps │ │ │ ├── defaults_mm_cap.yaml │ │ │ └── defaults_mm_cap_instruct.yaml │ │ ├── webvid │ │ │ ├── defaults_cap.yaml │ │ │ └── defaults_cap_instruct.yaml │ │ ├── youcook │ │ │ ├── defaults_cap.yaml │ │ │ └── defaults_cap_instruct.yaml │ │ └── yt8m │ │ │ └── defaults_mm_dial.yaml │ ├── default.yaml │ └── models │ │ ├── albef_classification_ve.yaml │ │ ├── albef_feature_extractor.yaml │ │ ├── albef_nlvr.yaml │ │ ├── albef_pretrain_base.yaml │ │ ├── albef_retrieval_coco.yaml │ │ ├── albef_retrieval_flickr.yaml │ │ ├── albef_vqav2.yaml │ │ ├── alpro_qa_msrvtt.yaml │ │ ├── alpro_qa_msvd.yaml │ │ ├── alpro_retrieval_didemo.yaml │ │ ├── alpro_retrieval_msrvtt.yaml │ │ ├── bert_config.json │ │ ├── bert_config_alpro.json │ │ ├── blip-diffusion │ │ 
├── blip_diffusion_base.yaml │ │ ├── blip_diffusion_controlnet_canny.yaml │ │ ├── blip_diffusion_controlnet_depth.yaml │ │ └── blip_diffusion_controlnet_hed.yaml │ │ ├── blip2 │ │ ├── blip2_caption_flant5xl.yaml │ │ ├── blip2_caption_opt2.7b.yaml │ │ ├── blip2_caption_opt6.7b.yaml │ │ ├── blip2_coco.yaml │ │ ├── blip2_instruct_flant5xl.yaml │ │ ├── blip2_instruct_flant5xxl.yaml │ │ ├── blip2_instruct_vicuna13b.yaml │ │ ├── blip2_instruct_vicuna7b.yaml │ │ ├── blip2_pretrain.yaml │ │ ├── blip2_pretrain_flant5xl.yaml │ │ ├── blip2_pretrain_flant5xl_vitL.yaml │ │ ├── blip2_pretrain_flant5xxl.yaml │ │ ├── blip2_pretrain_llama7b.yaml │ │ ├── blip2_pretrain_opt2.7b.yaml │ │ ├── blip2_pretrain_opt6.7b.yaml │ │ ├── blip2_pretrain_vitL.yaml │ │ ├── blip2_xinstruct_vicuna13b.yaml │ │ └── blip2_xinstruct_vicuna7b.yaml │ │ ├── blip_caption_base_coco.yaml │ │ ├── blip_caption_large_coco.yaml │ │ ├── blip_classification_base.yaml │ │ ├── blip_feature_extractor_base.yaml │ │ ├── blip_itm_base.yaml │ │ ├── blip_itm_large.yaml │ │ ├── blip_nlvr.yaml │ │ ├── blip_pretrain_base.yaml │ │ ├── blip_pretrain_large.yaml │ │ ├── blip_retrieval_coco.yaml │ │ ├── blip_retrieval_flickr.yaml │ │ ├── blip_vqa_aokvqa.yaml │ │ ├── blip_vqa_okvqa.yaml │ │ ├── blip_vqav2.yaml │ │ ├── clip │ │ ├── RN101-quickgelu.json │ │ ├── RN101.json │ │ ├── RN50-quickgelu.json │ │ ├── RN50.json │ │ ├── RN50x16.json │ │ ├── RN50x4.json │ │ ├── ViT-B-16-plus-240.json │ │ ├── ViT-B-16-plus.json │ │ ├── ViT-B-16.json │ │ ├── ViT-B-32-plus-256.json │ │ ├── ViT-B-32-quickgelu.json │ │ ├── ViT-B-32.json │ │ ├── ViT-H-14.json │ │ ├── ViT-H-16.json │ │ ├── ViT-L-14-280.json │ │ ├── ViT-L-14-336.json │ │ ├── ViT-L-14.json │ │ ├── ViT-L-16-320.json │ │ ├── ViT-L-16.json │ │ ├── ViT-g-14.json │ │ ├── timm-efficientnetv2_rw_s.json │ │ ├── timm-resnet50d.json │ │ ├── timm-resnetaa50d.json │ │ ├── timm-resnetblur50.json │ │ ├── timm-swin_base_patch4_window7_224.json │ │ ├── timm-vit_base_patch16_224.json │ │ ├── timm-vit_base_patch32_224.json │ │ └── timm-vit_small_patch16_224.json │ │ ├── clip_resnet50.yaml │ │ ├── clip_vit_base16.yaml │ │ ├── clip_vit_base32.yaml │ │ ├── clip_vit_large14.yaml │ │ ├── clip_vit_large14_336.yaml │ │ ├── gpt_dialogue_base.yaml │ │ ├── img2prompt-vqa │ │ └── img2prompt_vqa_base.yaml │ │ ├── med_config.json │ │ ├── med_config_albef.json │ │ ├── med_large_config.json │ │ └── pnp-vqa │ │ ├── pnp_vqa_3b.yaml │ │ ├── pnp_vqa_base.yaml │ │ ├── pnp_vqa_large.yaml │ │ ├── unifiedqav2_3b_config.json │ │ ├── unifiedqav2_base_config.json │ │ └── unifiedqav2_large_config.json ├── datasets │ ├── builders │ │ ├── __init__.py │ │ ├── audio_caption_builder.py │ │ ├── audio_qa_builder.py │ │ ├── base_dataset_builder.py │ │ ├── caption_builder.py │ │ ├── classification_builder.py │ │ ├── dialogue_builder.py │ │ ├── discrn_builders.py │ │ ├── image_text_pair_builder.py │ │ ├── imagefolder_builder.py │ │ ├── object3d_caption_builder.py │ │ ├── object3d_classification_builder.py │ │ ├── object3d_qa_builder.py │ │ ├── retrieval_builder.py │ │ ├── text_to_image_generation_builder.py │ │ ├── video_qa_builder.py │ │ └── vqa_builder.py │ ├── data_utils.py │ ├── datasets │ │ ├── aok_vqa_datasets.py │ │ ├── audio_captioning_datasets.py │ │ ├── audio_classification_datasets.py │ │ ├── audio_qa_datasets.py │ │ ├── avsd_dialogue_datasets.py │ │ ├── base_dataset.py │ │ ├── capfilt_dataset.py │ │ ├── caption_datasets.py │ │ ├── coco_caption_datasets.py │ │ ├── coco_vqa_datasets.py │ │ ├── dataloader_utils.py │ │ ├── dialogue_datasets.py │ │ ├── 
discriminatory_reasoning_datasets.py │ │ ├── gqa_datasets.py │ │ ├── iconqa_datasets.py │ │ ├── image_text_pair_datasets.py │ │ ├── imagefolder_dataset.py │ │ ├── laion_dataset.py │ │ ├── llava150k_dataset.py │ │ ├── multimodal_classification_datasets.py │ │ ├── music_avqa.py │ │ ├── nlvr_datasets.py │ │ ├── object3d_captioning_datasets.py │ │ ├── object3d_classification_datasets.py │ │ ├── object3d_qa_datasets.py │ │ ├── ocr_datasets.py │ │ ├── retrieval_datasets.py │ │ ├── snli_ve_datasets.py │ │ ├── subject_driven_t2i_dataset.py │ │ ├── textcaps_datasets.py │ │ ├── valor_caption.py │ │ ├── vatex_captioning_datasets.py │ │ ├── vg_vqa_datasets.py │ │ ├── video_caption_datasets.py │ │ ├── video_vqa_datasets.py │ │ ├── violin_dataset.py │ │ ├── visdial_dialogue_datasets.py │ │ ├── vizwiz_vqa_datasets.py │ │ ├── vlep_dataset.py │ │ ├── vqa_datasets.py │ │ ├── vsr_datasets.py │ │ └── yt8m_video_dialogue_datasets.py │ └── download_scripts │ │ ├── DownloadConceptualCaptions │ │ ├── LICENSE │ │ ├── README.md │ │ ├── create_annotation_12m.ipynb │ │ ├── create_annotation_3m.ipynb │ │ ├── download_data_cc12m.py │ │ └── download_data_cc3m.py │ │ ├── download_charade.py │ │ ├── download_coco.py │ │ ├── download_coin.py │ │ ├── download_didemo.py │ │ ├── download_flickr.py │ │ ├── download_gqa.py │ │ ├── download_iconqa.py │ │ ├── download_msrvtt.py │ │ ├── download_msvd.py │ │ ├── download_nocaps.py │ │ ├── download_sbu.py │ │ ├── download_vg.py │ │ └── download_violin.py ├── models │ ├── __init__.py │ ├── albef_models │ │ ├── __init__.py │ │ ├── albef_classification.py │ │ ├── albef_feature_extractor.py │ │ ├── albef_nlvr.py │ │ ├── albef_outputs.py │ │ ├── albef_pretrain.py │ │ ├── albef_retrieval.py │ │ └── albef_vqa.py │ ├── alpro_models │ │ ├── __init__.py │ │ ├── alpro_outputs.py │ │ ├── alpro_qa.py │ │ └── alpro_retrieval.py │ ├── base_model.py │ ├── beats │ │ ├── BEATs.py │ │ ├── LICENSE_BEATs.txt │ │ ├── README.md │ │ ├── Tokenizers.py │ │ ├── backbone.py │ │ ├── modules.py │ │ └── quantizer.py │ ├── beats_encoder.py │ ├── blip2_models │ │ ├── Qformer.py │ │ ├── __init__.py │ │ ├── blip2.py │ │ ├── blip2_image_text_matching.py │ │ ├── blip2_opt.py │ │ ├── blip2_qformer.py │ │ ├── blip2_t5.py │ │ ├── blip2_t5_instruct.py │ │ ├── blip2_vicuna_instruct.py │ │ ├── blip2_vicuna_xinstruct.py │ │ ├── modeling_llama.py │ │ ├── modeling_opt.py │ │ └── modeling_t5.py │ ├── blip_diffusion_models │ │ ├── __init__.py │ │ ├── blip_diffusion.py │ │ ├── modeling_ctx_clip.py │ │ ├── ptp_utils.py │ │ └── utils.py │ ├── blip_models │ │ ├── __init__.py │ │ ├── blip.py │ │ ├── blip_caption.py │ │ ├── blip_classification.py │ │ ├── blip_feature_extractor.py │ │ ├── blip_image_text_matching.py │ │ ├── blip_nlvr.py │ │ ├── blip_outputs.py │ │ ├── blip_pretrain.py │ │ ├── blip_retrieval.py │ │ ├── blip_vqa.py │ │ └── nlvr_encoder.py │ ├── clip_models │ │ ├── __init__.py │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ ├── clip_outputs.py │ │ ├── loss.py │ │ ├── model.py │ │ ├── pics │ │ │ └── CLIP.png │ │ ├── pretrained.py │ │ ├── timm_model.py │ │ ├── tokenizer.py │ │ ├── transform.py │ │ └── utils.py │ ├── clip_vit.py │ ├── eva_vit.py │ ├── gpt_models │ │ └── gpt_dialogue.py │ ├── img2prompt_models │ │ ├── __init__.py │ │ └── img2prompt_vqa.py │ ├── med.py │ ├── pnp_vqa_models │ │ ├── __init__.py │ │ ├── pnp_unifiedqav2_fid.py │ │ └── pnp_vqa.py │ ├── timesformer │ │ ├── __init__.py │ │ ├── conv2d_same.py │ │ ├── features.py │ │ ├── helpers.py │ │ ├── linear.py │ │ ├── vit.py │ │ └── vit_utils.py │ ├── ulip_models │ │ ├── 
ULIP_models.py │ │ ├── losses.py │ │ ├── pointbert │ │ │ ├── PointTransformer_8192point.yaml │ │ │ ├── checkpoint.py │ │ │ ├── dvae.py │ │ │ ├── logger.py │ │ │ ├── misc.py │ │ │ └── point_encoder.py │ │ ├── ulip_scaled_up_config.yaml │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── build.py │ │ │ ├── config.py │ │ │ ├── io.py │ │ │ ├── logger.py │ │ │ ├── registry.py │ │ │ ├── tokenizer.py │ │ │ └── utils.py │ └── vit.py ├── processors │ ├── __init__.py │ ├── alpro_processors.py │ ├── audio_processors.py │ ├── base_processor.py │ ├── blip_diffusion_processors.py │ ├── blip_processors.py │ ├── clip_processors.py │ ├── functional_video.py │ ├── gpt_processors.py │ ├── instruction_text_processors.py │ ├── randaugment.py │ ├── transforms_video.py │ └── ulip_processors.py ├── projects │ ├── albef │ │ ├── eval │ │ │ ├── nlvr_eval.yaml │ │ │ ├── ret_coco_eval.yaml │ │ │ ├── ret_flickr30k_eval.yaml │ │ │ ├── snli_ve_eval.yaml │ │ │ ├── vqa_test.yaml │ │ │ └── vqa_val.yaml │ │ └── train │ │ │ ├── aokvqa_ft.yaml │ │ │ ├── nlvr_ft.yaml │ │ │ ├── okvqa_ft.yaml │ │ │ ├── pretrain.yaml │ │ │ ├── ret_coco_ft.yaml │ │ │ ├── ret_flickr30k_ft.yaml │ │ │ ├── snli_ve_ft.yaml │ │ │ └── vqa_ft.yaml │ ├── alpro │ │ ├── eval │ │ │ ├── didemo_ret_eval.yaml │ │ │ ├── msrvtt_qa_eval.yaml │ │ │ ├── msrvtt_ret_eval.yaml │ │ │ └── msvd_qa_eval.yaml │ │ └── train │ │ │ ├── didemo_ret_ft.yaml │ │ │ ├── msrvtt_qa_ft.yaml │ │ │ ├── msrvtt_retrieval_ft.yaml │ │ │ └── msvd_qa_ft.yaml │ ├── blip │ │ ├── coco_cap_ft_iter.yaml │ │ ├── eval │ │ │ ├── aokvqa_eval.yaml │ │ │ ├── caption_coco_eval.yaml │ │ │ ├── caption_coco_eval_large.yaml │ │ │ ├── nlvr_eval.yaml │ │ │ ├── nocaps_eval.yaml │ │ │ ├── okvqa_eval.yaml │ │ │ ├── ret_coco_eval.yaml │ │ │ ├── ret_flickr_eval.yaml │ │ │ └── vqav2_eval.yaml │ │ └── train │ │ │ ├── aokvqa_ft.yaml │ │ │ ├── caption_coco_ft.yaml │ │ │ ├── caption_coco_large_ft.yaml │ │ │ ├── nlvr_ft.yaml │ │ │ ├── okvqa_ft.yaml │ │ │ ├── pretrain_14m.yaml │ │ │ ├── retrieval_coco_ft.yaml │ │ │ ├── retrieval_flickr_ft.yaml │ │ │ └── vqav2_ft.yaml │ ├── blip2 │ │ ├── eval │ │ │ ├── caption_coco_flant5xl_eval.yaml │ │ │ ├── caption_coco_opt2.7b_eval.yaml │ │ │ ├── caption_coco_opt6.7b_eval.yaml │ │ │ ├── caption_nocaps_out_domain_flant5xl_eval.yaml │ │ │ ├── caption_nocaps_out_domain_flant5xxl_eval.yaml │ │ │ ├── gqa_zeroshot_flant5xl_eval.yaml │ │ │ ├── okvqa_zeroshot_flant5xl_eval.yaml │ │ │ ├── ret_coco_eval.yaml │ │ │ ├── ret_flickr_eval.yaml │ │ │ ├── vqav2_zeroshot_flant5xl_eval.yaml │ │ │ └── vqav2_zeroshot_opt_eval.yaml │ │ └── train │ │ │ ├── caption_coco_ft.yaml │ │ │ ├── pretrain_stage1.yaml │ │ │ ├── pretrain_stage2.yaml │ │ │ └── retrieval_coco_ft.yaml │ ├── blip_diffusion │ │ ├── finetune-db-dog.yaml │ │ ├── finetune-db-pink-dress.yaml │ │ ├── finetune-db-shein-jacket.yaml │ │ └── finetune-db-template.yaml │ ├── clip │ │ ├── exp_coco_ret_eval.yaml │ │ ├── exp_flickr_ret_eval.yaml │ │ └── exp_imnet_zs_eval.yaml │ ├── gpt │ │ ├── eval │ │ │ └── dialogue_avsd_eval.yaml │ │ └── train │ │ │ └── dialogue_avsd_ft.yaml │ ├── instructblip │ │ ├── caption_coco_flant5xl_eval_test.yaml │ │ ├── caption_coco_flant5xl_eval_val.yaml │ │ ├── caption_coco_flant5xxl_eval_test.yaml │ │ ├── caption_coco_flant5xxl_eval_val.yaml │ │ ├── caption_coco_vicuna13b_eval_test.yaml │ │ ├── caption_coco_vicuna13b_eval_val.yaml │ │ ├── caption_coco_vicuna7b_eval_test.yaml │ │ ├── caption_coco_vicuna7b_eval_val.yaml │ │ ├── caption_msrvtt_flant5xl_eval_test.yaml │ │ ├── caption_msrvtt_flant5xl_eval_val.yaml │ │ ├── 
caption_msrvtt_flant5xxl_eval_test.yaml │ │ ├── caption_msrvtt_flant5xxl_eval_val.yaml │ │ ├── caption_msrvtt_vicuna13b_eval_test.yaml │ │ ├── caption_msrvtt_vicuna13b_eval_val.yaml │ │ ├── caption_msrvtt_vicuna7b_eval_test.yaml │ │ ├── caption_msrvtt_vicuna7b_eval_val.yaml │ │ ├── caption_msvd_flant5xl_eval.yaml │ │ ├── caption_msvd_flant5xxl_eval.yaml │ │ ├── caption_msvd_vicuna13b_eval.yaml │ │ ├── caption_msvd_vicuna7b_eval.yaml │ │ ├── caption_nocaps_out_domain_flant5xl_eval.yaml │ │ ├── caption_nocaps_out_domain_flant5xxl_eval.yaml │ │ ├── caption_nocaps_out_domain_vicuna13b_eval.yaml │ │ ├── caption_nocaps_out_domain_vicuna7b_eval.yaml │ │ ├── caption_vatex_flant5xl_eval.yaml │ │ ├── caption_vatex_flant5xxl_eval.yaml │ │ ├── caption_vatex_vicuna13b_eval.yaml │ │ ├── caption_vatex_vicuna7b_eval.yaml │ │ ├── classification_modelnet40_vicuna13b.yaml │ │ ├── classification_modelnet40_vicuna7b.yaml │ │ ├── classification_snlive_flant5xl.yaml │ │ ├── classification_snlive_flant5xxl.yaml │ │ ├── classification_snlive_vicuna13b.yaml │ │ ├── classification_snlive_vicuna13b_test.yaml │ │ ├── classification_snlive_vicuna7b_test.yaml │ │ ├── classification_snlive_vicuna7b_val.yaml │ │ ├── completion_modelnet40_vicuna13b.yaml │ │ ├── completion_modelnet40_vicuna7b.yaml │ │ ├── qa_msrvtt_flant5xl_eval_test.yaml │ │ ├── qa_msrvtt_flant5xxl_eval_test.yaml │ │ ├── qa_msrvtt_vicuna13b_eval_test.yaml │ │ ├── qa_msrvtt_vicuna7b_eval_test.yaml │ │ ├── qa_msvd_flant5xl_eval.yaml │ │ ├── qa_msvd_flant5xxl_eval.yaml │ │ ├── qa_msvd_vicuna13b_eval.yaml │ │ ├── qa_msvd_vicuna7b_eval.yaml │ │ ├── qa_okvqa_flant5xl_eval.yaml │ │ ├── qa_okvqa_flant5xxl_eval.yaml │ │ ├── qa_okvqa_vicuna13b_eval.yaml │ │ └── qa_okvqa_vicuna7b_eval.yaml │ ├── pnp-vqa │ │ └── eval │ │ │ ├── gqa_eval.yaml │ │ │ ├── gqa_eval_3b.yaml │ │ │ ├── gqa_eval_large.yaml │ │ │ ├── okvqa_eval.yaml │ │ │ ├── okvqa_eval_3b.yaml │ │ │ ├── okvqa_eval_large.yaml │ │ │ ├── vqav2_eval.yaml │ │ │ ├── vqav2_eval_3b.yaml │ │ │ ├── vqav2_eval_large.yaml │ │ │ ├── vqav2_test_eval.yaml │ │ │ ├── vqav2_test_eval_3b.yaml │ │ │ └── vqav2_test_eval_large.yaml │ └── xinstruct_blip │ │ ├── eval │ │ ├── discrn │ │ │ ├── audio_video_caption.yaml │ │ │ ├── audio_video_caption_13b.yaml │ │ │ ├── audio_video_describe.yaml │ │ │ ├── audio_video_describe_13b.yaml │ │ │ ├── audio_video_describe_nocue.yaml │ │ │ ├── audio_video_describe_proj copy.yaml │ │ │ ├── audio_video_describe_proj.yaml │ │ │ ├── audio_video_describe_rand_init.yaml │ │ │ ├── image_3d_caption.yaml │ │ │ ├── image_3d_caption_13b.yaml │ │ │ ├── image_3d_describe.yaml │ │ │ ├── image_3d_describe_13b.yaml │ │ │ ├── image_3d_describe_no_init.yaml │ │ │ ├── image_3d_describe_nocue.yaml │ │ │ └── image_3d_describe_proj.yaml │ │ ├── vicuna13b │ │ │ ├── audio │ │ │ │ ├── audiocaps_captioning_qa.yaml │ │ │ │ ├── audiocaps_captioning_test.yaml │ │ │ │ ├── audiocaps_captioning_val.yaml │ │ │ │ ├── clothoQA_captioning.yaml │ │ │ │ ├── clothov1_captioning.yaml │ │ │ │ ├── clothov2_captioning.yaml │ │ │ │ ├── esc50_classification.yaml │ │ │ │ └── esc50_classification_completion.yaml │ │ │ ├── crossmodal │ │ │ │ ├── musicavqa │ │ │ │ │ ├── musicavqa_audio_eval.yaml │ │ │ │ │ ├── musicavqa_joint_eval.yaml │ │ │ │ │ └── musicavqa_video_eval.yaml │ │ │ │ └── vatex │ │ │ │ │ ├── vatex_audio_captioning.yaml │ │ │ │ │ ├── vatex_captioning.yaml │ │ │ │ │ ├── vatex_joint_captioning.yaml │ │ │ │ │ └── vatex_joint_captioning_interleave.yaml │ │ │ ├── image │ │ │ │ ├── coco_captioning_test.yaml │ │ │ │ ├── 
coco_captioning_val.yaml │ │ │ │ ├── flickr30k_captioning.yaml │ │ │ │ ├── gqa_qa.yaml │ │ │ │ ├── nocaps_captioning.yaml │ │ │ │ ├── nocaps_out_domain_captioning.yaml │ │ │ │ ├── okvqa_qa.yaml │ │ │ │ ├── snlive_classification_test.yaml │ │ │ │ ├── snlive_classification_val.yaml │ │ │ │ └── vizwiz_qa.yaml │ │ │ ├── image_with_coco │ │ │ │ ├── coco_captioning_test.yaml │ │ │ │ ├── coco_captioning_val.yaml │ │ │ │ ├── flickr30k_captioning.yaml │ │ │ │ ├── gqa_qa.yaml │ │ │ │ ├── nocaps_captioning.yaml │ │ │ │ ├── nocaps_out_domain_captioning.yaml │ │ │ │ ├── okvqa_qa.yaml │ │ │ │ ├── snlive_classification_test.yaml │ │ │ │ ├── snlive_classification_val.yaml │ │ │ │ └── vizwiz_qa.yaml │ │ │ ├── pc │ │ │ │ ├── modelnet40_classification.yaml │ │ │ │ ├── modelnet40_completion.yaml │ │ │ │ ├── objaverse_captioning.yaml │ │ │ │ └── objaverse_qa.yaml │ │ │ ├── video │ │ │ │ ├── msrvtt_captioning.yaml │ │ │ │ ├── msrvtt_captioning_test.yaml │ │ │ │ ├── msrvtt_captioning_val.yaml │ │ │ │ ├── msrvtt_qa_test.yaml │ │ │ │ ├── msrvtt_qa_val.yaml │ │ │ │ ├── msvd_captioning.yaml │ │ │ │ ├── msvd_qa.yaml │ │ │ │ ├── vatex_audio_captioning.yaml │ │ │ │ ├── vatex_captioning.yaml │ │ │ │ ├── vatex_joint_captioning.yaml │ │ │ │ └── vatex_joint_captioning_interleave.yaml │ │ │ └── video_image │ │ │ │ ├── msvd_captioning.yaml │ │ │ │ ├── msvd_qa.yaml │ │ │ │ └── vatex_captioning.yaml │ │ ├── vicuna7b │ │ │ ├── audio │ │ │ │ ├── audiocaps_captioning_qa.yaml │ │ │ │ ├── audiocaps_captioning_test.yaml │ │ │ │ ├── audiocaps_captioning_val.yaml │ │ │ │ ├── clothoQA_captioning.yaml │ │ │ │ ├── clothov1_captioning.yaml │ │ │ │ ├── clothov2_captioning.yaml │ │ │ │ ├── esc50_classification.yaml │ │ │ │ └── esc50_classification_completion.yaml │ │ │ ├── audio_no_init │ │ │ │ ├── audiocaps_captioning_qa.yaml │ │ │ │ ├── audiocaps_captioning_test.yaml │ │ │ │ ├── audiocaps_captioning_val.yaml │ │ │ │ ├── clothoQA_captioning.yaml │ │ │ │ ├── clothov1_captioning.yaml │ │ │ │ ├── clothov2_captioning.yaml │ │ │ │ ├── esc50_classification.yaml │ │ │ │ └── esc50_classification_completion.yaml │ │ │ ├── audio_projection_only │ │ │ │ ├── audiocaps_captioning_qa.yaml │ │ │ │ ├── audiocaps_captioning_test.yaml │ │ │ │ ├── audiocaps_captioning_val.yaml │ │ │ │ ├── clothoQA_captioning.yaml │ │ │ │ ├── clothov1_captioning.yaml │ │ │ │ ├── clothov2_captioning.yaml │ │ │ │ ├── esc50_classification.yaml │ │ │ │ └── esc50_classification_completion.yaml │ │ │ ├── audio_projection_only_nocue │ │ │ │ ├── audiocaps_captioning_qa.yaml │ │ │ │ ├── audiocaps_captioning_test.yaml │ │ │ │ ├── audiocaps_captioning_val.yaml │ │ │ │ ├── clothoQA_captioning.yaml │ │ │ │ ├── clothov1_captioning.yaml │ │ │ │ ├── clothov2_captioning.yaml │ │ │ │ ├── esc50_classification.yaml │ │ │ │ └── esc50_classification_completion.yaml │ │ │ ├── crossmodal │ │ │ │ ├── musicavqa │ │ │ │ │ ├── musicavqa_audio_eval.yaml │ │ │ │ │ ├── musicavqa_joint_eval.yaml │ │ │ │ │ └── musicavqa_video_eval.yaml │ │ │ │ └── vatex │ │ │ │ │ ├── vatex_audio_captioning.yaml │ │ │ │ │ ├── vatex_captioning.yaml │ │ │ │ │ ├── vatex_joint_captioning.yaml │ │ │ │ │ └── vatex_joint_captioning_interleave.yaml │ │ │ ├── image │ │ │ │ ├── coco_captioning_test.yaml │ │ │ │ ├── coco_captioning_val.yaml │ │ │ │ ├── flickr30k_captioning.yaml │ │ │ │ ├── gqa_qa.yaml │ │ │ │ ├── gqa_qa_val.yaml │ │ │ │ ├── nocaps_captioning.yaml │ │ │ │ ├── nocaps_out_domain_captioning.yaml │ │ │ │ ├── okvqa_qa.yaml │ │ │ │ ├── snlive_classification_test.yaml │ │ │ │ ├── snlive_classification_val.yaml │ │ │ │ └── 
vizwiz_qa.yaml │ │ │ ├── image_full_init │ │ │ │ ├── coco_captioning_test.yaml │ │ │ │ ├── coco_captioning_val.yaml │ │ │ │ ├── flickr30k_captioning.yaml │ │ │ │ ├── gqa_qa.yaml │ │ │ │ ├── gqa_qa_val.yaml │ │ │ │ ├── nocaps_captioning.yaml │ │ │ │ ├── nocaps_out_domain_captioning.yaml │ │ │ │ ├── okvqa_qa.yaml │ │ │ │ ├── snlive_classification_test.yaml │ │ │ │ ├── snlive_classification_val.yaml │ │ │ │ └── vizwiz_qa.yaml │ │ │ ├── image_no_init │ │ │ │ ├── coco_captioning_test.yaml │ │ │ │ ├── coco_captioning_val.yaml │ │ │ │ ├── flickr30k_captioning.yaml │ │ │ │ ├── gqa_qa.yaml │ │ │ │ ├── gqa_qa_val.yaml │ │ │ │ ├── nocaps_captioning.yaml │ │ │ │ ├── nocaps_out_domain_captioning.yaml │ │ │ │ ├── okvqa_qa.yaml │ │ │ │ ├── snlive_classification_test.yaml │ │ │ │ ├── snlive_classification_val.yaml │ │ │ │ └── vizwiz_qa.yaml │ │ │ ├── image_pre_coco │ │ │ │ ├── coco_captioning_test.yaml │ │ │ │ ├── coco_captioning_val.yaml │ │ │ │ ├── flickr30k_captioning.yaml │ │ │ │ ├── gqa_qa.yaml │ │ │ │ ├── nocaps_captioning.yaml │ │ │ │ ├── nocaps_out_domain_captioning.yaml │ │ │ │ ├── okvqa_qa.yaml │ │ │ │ ├── snlive_classification_test.yaml │ │ │ │ ├── snlive_classification_val.yaml │ │ │ │ └── vizwiz_qa.yaml │ │ │ ├── image_projection_only │ │ │ │ ├── coco_captioning_test.yaml │ │ │ │ ├── coco_captioning_val.yaml │ │ │ │ ├── flickr30k_captioning.yaml │ │ │ │ ├── gqa_qa.yaml │ │ │ │ ├── gqa_qa_val.yaml │ │ │ │ ├── nocaps_captioning.yaml │ │ │ │ ├── nocaps_out_domain_captioning.yaml │ │ │ │ ├── okvqa_qa.yaml │ │ │ │ ├── snlive_classification_test.yaml │ │ │ │ ├── snlive_classification_val.yaml │ │ │ │ └── vizwiz_qa.yaml │ │ │ ├── pc │ │ │ │ ├── modelnet40_classification.yaml │ │ │ │ ├── modelnet40_completion.yaml │ │ │ │ ├── objaverse_captioning.yaml │ │ │ │ └── objaverse_qa.yaml │ │ │ ├── pc_no_init │ │ │ │ ├── modelnet40_classification.yaml │ │ │ │ ├── modelnet40_completion.yaml │ │ │ │ ├── objaverse_captioning.yaml │ │ │ │ └── objaverse_qa.yaml │ │ │ ├── pc_projection_only │ │ │ │ ├── modelnet40_classification.yaml │ │ │ │ ├── modelnet40_completion.yaml │ │ │ │ ├── objaverse_captioning.yaml │ │ │ │ └── objaverse_qa.yaml │ │ │ ├── pc_ulip1 │ │ │ │ ├── modelnet40_classification.yaml │ │ │ │ ├── modelnet40_completion.yaml │ │ │ │ ├── objaverse_captioning.yaml │ │ │ │ └── objaverse_qa.yaml │ │ │ ├── pc_ulip2_scaled_up │ │ │ │ ├── modelnet40_classification.yaml │ │ │ │ ├── modelnet40_completion.yaml │ │ │ │ ├── objaverse_captioning.yaml │ │ │ │ └── objaverse_qa.yaml │ │ │ ├── pc_ulip_objaverse │ │ │ │ ├── modelnet40_classification.yaml │ │ │ │ ├── modelnet40_completion.yaml │ │ │ │ ├── objaverse_captioning.yaml │ │ │ │ └── objaverse_qa.yaml │ │ │ ├── pc_ulip_objaverse_shapenet │ │ │ │ ├── modelnet40_classification.yaml │ │ │ │ ├── modelnet40_completion.yaml │ │ │ │ ├── objaverse_captioning.yaml │ │ │ │ └── objaverse_qa.yaml │ │ │ ├── pc_ulip_shapenet │ │ │ │ ├── modelnet40_classification.yaml │ │ │ │ ├── modelnet40_completion.yaml │ │ │ │ ├── objaverse_captioning.yaml │ │ │ │ └── objaverse_qa.yaml │ │ │ ├── video │ │ │ │ ├── msrvtt_captioning_test.yaml │ │ │ │ ├── msrvtt_captioning_val.yaml │ │ │ │ ├── msrvtt_qa_test.yaml │ │ │ │ ├── msrvtt_qa_val.yaml │ │ │ │ ├── msvd_captioning.yaml │ │ │ │ ├── msvd_qa.yaml │ │ │ │ └── vatex_captioning.yaml │ │ │ ├── video_image │ │ │ │ ├── msvd_captioning.yaml │ │ │ │ ├── msvd_qa.yaml │ │ │ │ └── vatex_captioning.yaml │ │ │ ├── video_image_pre_coco │ │ │ │ ├── msvd_captioning.yaml │ │ │ │ ├── msvd_qa.yaml │ │ │ │ └── vatex_captioning.yaml │ │ │ └── 
video_no_upsample │ │ │ │ ├── msrvtt_captioning_test.yaml │ │ │ │ ├── msrvtt_captioning_val.yaml │ │ │ │ ├── msrvtt_qa_test.yaml │ │ │ │ ├── msrvtt_qa_val.yaml │ │ │ │ ├── msvd_captioning.yaml │ │ │ │ ├── msvd_captioning_up.yaml │ │ │ │ ├── msvd_qa.yaml │ │ │ │ ├── msvd_qa_up.yaml │ │ │ │ ├── vatex_captioning.yaml │ │ │ │ └── vatex_captioning_up.yaml │ │ └── vicuna7b_nocue │ │ │ ├── audio │ │ │ ├── audiocaps_captioning_qa.yaml │ │ │ ├── audiocaps_captioning_test.yaml │ │ │ ├── audiocaps_captioning_val.yaml │ │ │ ├── clothoQA_captioning.yaml │ │ │ ├── clothov1_captioning.yaml │ │ │ ├── clothov2_captioning.yaml │ │ │ ├── esc50_classification.yaml │ │ │ └── esc50_classification_completion.yaml │ │ │ ├── crossmodal │ │ │ ├── musicavqa │ │ │ │ ├── musicavqa_audio_eval.yaml │ │ │ │ ├── musicavqa_joint_eval.yaml │ │ │ │ └── musicavqa_video_eval.yaml │ │ │ └── vatex │ │ │ │ ├── vatex_audio_captioning.yaml │ │ │ │ ├── vatex_captioning.yaml │ │ │ │ └── vatex_joint_captioning.yaml │ │ │ ├── image │ │ │ ├── coco_captioning_test.yaml │ │ │ ├── coco_captioning_val.yaml │ │ │ ├── flickr30k_captioning.yaml │ │ │ ├── gqa_qa.yaml │ │ │ ├── nocaps_captioning.yaml │ │ │ ├── nocaps_out_domain_captioning.yaml │ │ │ ├── okvqa_qa.yaml │ │ │ ├── snlive_classification_test.yaml │ │ │ ├── snlive_classification_val.yaml │ │ │ └── vizwiz_qa.yaml │ │ │ ├── pc │ │ │ ├── modelnet40_classification.yaml │ │ │ ├── modelnet40_completion.yaml │ │ │ ├── objaverse_captioning.yaml │ │ │ └── objaverse_qa.yaml │ │ │ ├── video │ │ │ ├── msrvtt_captioning_test.yaml │ │ │ ├── msrvtt_captioning_val.yaml │ │ │ ├── msrvtt_qa_test.yaml │ │ │ ├── msrvtt_qa_val.yaml │ │ │ ├── msvd_captioning.yaml │ │ │ ├── msvd_qa.yaml │ │ │ └── vatex_captioning.yaml │ │ │ └── video_image │ │ │ ├── msvd_captioning.yaml │ │ │ ├── msvd_qa.yaml │ │ │ └── vatex_captioning.yaml │ │ ├── prompt_variation │ │ └── nocaps │ │ │ ├── instructblip │ │ │ ├── original.yaml │ │ │ ├── template_1.yaml │ │ │ ├── template_2.yaml │ │ │ ├── template_3.yaml │ │ │ ├── template_4.yaml │ │ │ └── template_5.yaml │ │ │ └── xinstructblip │ │ │ ├── template_1.yaml │ │ │ ├── template_2.yaml │ │ │ ├── template_3.yaml │ │ │ ├── template_4.yaml │ │ │ └── template_5.yaml │ │ └── train │ │ ├── vicuna13b │ │ ├── audio_training.yaml │ │ ├── audio_training_continue.yaml │ │ ├── image_train.yaml │ │ ├── image_train_continue.yaml │ │ ├── pc_training.yaml │ │ └── video_training.yaml │ │ ├── vicuna7b │ │ ├── audio_training.yaml │ │ ├── audio_training_improved.yaml │ │ ├── audio_training_no_init.yaml │ │ ├── audio_training_projection_only.yaml │ │ ├── audio_training_projection_only_nocue.yaml │ │ ├── image_train.yaml │ │ ├── image_train_improved.yaml │ │ ├── image_train_no_init.yaml │ │ ├── image_train_projection_only.yaml │ │ ├── lora_training.yaml │ │ ├── pc_training.yaml │ │ ├── pc_training_improved.yaml │ │ ├── pc_training_no_init.yaml │ │ ├── pc_training_projection_only.yaml │ │ ├── pc_training_projection_only_nocue.yaml │ │ ├── pc_training_scaled_up.yaml │ │ ├── pc_training_ulip1.yaml │ │ ├── pc_training_ulip2_objaverse_shapenet_k_1.yaml │ │ ├── pc_training_ulip_objaverse.yaml │ │ ├── pc_training_ulip_shapenet.yaml │ │ ├── video_training.yaml │ │ └── video_training_no_msrvtt_upsample.yaml │ │ └── vicuna7b_nocue │ │ ├── audio_training.yaml │ │ ├── image_train.yaml │ │ ├── pc_training.yaml │ │ └── video_training.yaml ├── runners │ ├── __init__.py │ ├── runner_base.py │ └── runner_iter.py └── tasks │ ├── __init__.py │ ├── base_task.py │ ├── captioning.py │ ├── dialogue.py │ ├── 
image_text_pretrain.py │ ├── multimodal_classification.py │ ├── retrieval.py │ ├── text_to_image_generation.py │ ├── vqa.py │ └── vqa_reading_comprehension.py ├── projects ├── blip-diffusion │ ├── README.md │ ├── images │ │ ├── black-cat.png │ │ ├── cat-sofa.png │ │ ├── dog.png │ │ ├── dog2.png │ │ ├── dreambooth │ │ │ ├── dog │ │ │ │ ├── 00.jpg │ │ │ │ ├── 01.jpg │ │ │ │ ├── 02.jpg │ │ │ │ ├── 03.jpg │ │ │ │ └── 04.jpg │ │ │ └── dog8 │ │ │ │ ├── 00.jpg │ │ │ │ ├── 01.jpg │ │ │ │ ├── 02.jpg │ │ │ │ ├── 03.jpg │ │ │ │ └── 04.jpg │ │ ├── dress-model.png │ │ ├── flower.jpg │ │ ├── green-skirt.png │ │ ├── jacket-letter-s │ │ │ └── jacket-letter-s.png │ │ ├── kettle.jpg │ │ ├── pink-dress.png │ │ ├── pink-dress │ │ │ └── pink-dress.png │ │ └── shein-jacket │ │ │ └── shein-jacket.jpg │ ├── notebooks │ │ ├── editing_real_finetuned.ipynb │ │ ├── editing_real_zeroshot.ipynb │ │ ├── editing_synthetic_zeroshot.ipynb │ │ ├── editing_tryon_zeroshot.ipynb │ │ ├── generation_finetuned_dog.ipynb │ │ ├── generation_zeroshot.ipynb │ │ └── stylization.ipynb │ └── teaser-website.png ├── blip2 │ ├── README.md │ └── blip2_illustration.png ├── img2llm-vqa │ ├── Caption.png │ ├── Illustration.png │ ├── QuestionGeneration.png │ ├── README.md │ ├── demo.png │ ├── img2llm_vqa.ipynb │ └── img2llm_vqa.py ├── img2prompt-vqa │ └── README.md ├── instructblip │ ├── README.md │ ├── comparison.png │ ├── run_demo.py │ └── showcase.png ├── pnp-vqa │ ├── README.md │ ├── pnp_vqa.ipynb │ └── pnp_vqa.png └── xinstructblip │ ├── README.md │ ├── assets │ ├── architecture.png │ └── data.png │ ├── data_aug │ ├── 3d_qa_data_generation.py │ └── audio_qa_data_generation.py │ ├── demo │ ├── configs │ │ ├── vicuna13b.yaml │ │ ├── vicuna7b.yaml │ │ ├── vicuna7b_blip_init.yaml │ │ ├── vicuna7b_no_init.yaml │ │ ├── vicuna7b_nocue.yaml │ │ ├── vicuna7b_projection.yaml │ │ ├── vicuna7b_rand.yaml │ │ └── vicuna7b_v2.yaml │ ├── demo.ipynb │ ├── examples │ │ ├── audio │ │ │ ├── 110714_wren.wav │ │ │ └── Group_of_Dogs_Barking.wav │ │ └── point_cloud │ │ │ └── banana.glb │ └── run_demo.py │ ├── discrn │ ├── caption_baseline │ │ ├── predict_audio.py │ │ ├── predict_image.py │ │ ├── predict_pc.py │ │ ├── predict_video.py │ │ └── render_images.py │ └── data_generation │ │ ├── audiocaps_video_audio.py │ │ └── objaverse_img_3d.py │ └── modelnet_baseline │ └── render_images.py ├── pyproject.toml ├── requirements.txt ├── run_scripts ├── albef │ ├── eval │ │ ├── eval_albef_nlvr.sh │ │ ├── eval_albef_ve.sh │ │ ├── eval_coco_retrieval.sh │ │ ├── eval_flickr30k_retrieval.sh │ │ ├── test_albef_vqa.sh │ │ └── val_albef_vqa.sh │ └── train │ │ ├── pretrain.sh │ │ ├── train_aokvqa_albef.sh │ │ ├── train_coco_retrieval_albef.sh │ │ ├── train_flickr30k_retrieval_albef.sh │ │ ├── train_nlvr_albef.sh │ │ ├── train_okvqa_albef.sh │ │ ├── train_ve_albef.sh │ │ └── train_vqa_albef.sh ├── alpro │ ├── eval │ │ ├── eval_didemo_ret.sh │ │ ├── eval_msrvtt_qa.sh │ │ ├── eval_msrvtt_ret.sh │ │ └── eval_msvd_qa.sh │ └── train │ │ ├── train_didemo_ret.sh │ │ ├── train_msrvtt_qa.sh │ │ ├── train_msrvtt_ret.sh │ │ └── train_msvd_qa.sh ├── blip-diffusion │ ├── train_db.sh │ ├── train_db_dog.sh │ ├── train_db_jacket_s.sh │ ├── train_db_pink_dress.sh │ └── train_db_shein_jacket.sh ├── blip │ ├── eval │ │ ├── eval_aokvqa.sh │ │ ├── eval_coco_cap.sh │ │ ├── eval_coco_cap_large.sh │ │ ├── eval_nlvr.sh │ │ ├── eval_nocaps.sh │ │ ├── eval_okvqa.sh │ │ ├── eval_ret_coco.sh │ │ ├── eval_ret_flickr.sh │ │ └── validate_vqa.sh │ └── train │ │ ├── pretrain.sh │ │ ├── train_aokvqa.sh │ │ ├── 
train_caption_coco.sh │ │ ├── train_caption_coco_large.sh │ │ ├── train_caption_coco_large_iters.sh │ │ ├── train_nlvr.sh │ │ ├── train_okvqa.sh │ │ ├── train_retrieval_coco.sh │ │ ├── train_retrieval_flickr.sh │ │ └── train_vqa.sh ├── blip2 │ ├── eval │ │ ├── eval_cap_coco_flant5xl.sh │ │ ├── eval_cap_coco_opt2.7b.sh │ │ ├── eval_cap_coco_opt6.7b.sh │ │ ├── eval_gqa_zeroshot_flant5xl.sh │ │ ├── eval_okvqa_zeroshot_flant5xl.sh │ │ ├── eval_ret_coco.sh │ │ ├── eval_ret_flickr.sh │ │ ├── validate_vqa_zeroshot_flant5xl.sh │ │ └── validate_vqa_zeroshot_opt.sh │ └── train │ │ ├── pretrain_stage1.sh │ │ ├── pretrain_stage2.sh │ │ ├── train_caption_coco.sh │ │ └── train_retrieval_coco.sh ├── clip │ └── eval │ │ ├── eval_clip_ret_coco.sh │ │ ├── eval_clip_ret_flickr.sh │ │ └── eval_clip_zs_imnet.sh ├── gpt │ ├── eval │ │ └── eval_video_dialogue_avsd.sh │ └── train │ │ └── train_video_dialogue_avsd.sh ├── pnp-vqa │ └── eval │ │ ├── eval_gqa.sh │ │ ├── eval_gqa_3b.sh │ │ ├── eval_gqa_large.sh │ │ ├── eval_okvqa.sh │ │ ├── eval_okvqa_3b.sh │ │ ├── eval_okvqa_large.sh │ │ ├── eval_vqav2.sh │ │ ├── eval_vqav2_3b.sh │ │ ├── eval_vqav2_large.sh │ │ ├── eval_vqav2_test.sh │ │ ├── eval_vqav2_test_3b.sh │ │ └── eval_vqav2_test_large.sh ├── run_browser.sh └── run_demo.sh ├── setup.py ├── tests └── models │ ├── test_albef.py │ ├── test_blip.py │ ├── test_blip2.py │ └── test_pnp_vqa.py └── train.py

/.github/workflows/docs.yaml:
--------------------------------------------------------------------------------
name: docs

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]
  release:
    types: [ published ]

jobs:
  build:

    runs-on: ubuntu-18.04

    steps:
      - uses: actions/checkout@v2
        with:
          fetch-depth: 0
      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          python-version: '3.8'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip setuptools wheel
          sudo apt-get update
          sudo apt-get install openjdk-11-jdk
          sudo apt-get install pandoc
      - name: Build Sphinx docs
        run: |
          docs/build_docs.sh
      - name: Deploy to gh-pages
        uses: peaceiris/actions-gh-pages@v3
        if: ${{ github.ref == 'refs/heads/main' || github.event_name == 'release' }}
        with:
          github_token: ${{ secrets.GITHUB_TOKEN }}
          publish_dir: docs/_build/html
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.1.0
    hooks:
      - id: trailing-whitespace
      - id: check-ast
      - id: no-commit-to-branch
        args: ['--branch=main']
      - id: check-added-large-files
        args: ['--maxkb=5000']
      - id: end-of-file-fixer

  - repo: https://github.com/psf/black
    rev: stable
    hooks:
      - id: black
        language_version: python3.8

  - repo: https://github.com/PyCQA/flake8
    rev: 3.9.2
    hooks:
      - id: flake8
        args: [
          # only error for syntax errors and undefined names
          "--select=E9,F63,F7,F82",
        ]
--------------------------------------------------------------------------------
/CODEOWNERS:
--------------------------------------------------------------------------------
# Comment line immediately above ownership line is reserved for related gus information. Please be careful while editing.
#ECCN:Open Source
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
recursive-include lavis/configs *.yaml *.json
recursive-include lavis/projects *.yaml *.json

recursive-exclude lavis/datasets/download_scripts *
recursive-exclude lavis/output *

include requirements.txt
include lavis/models/clip_models/bpe_simple_vocab_16e6.txt.gz
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
## Security

Please report any security issue to [security@salesforce.com](mailto:security@salesforce.com)
as soon as it is discovered. This library limits its runtime dependencies in
order to reduce the total cost of ownership as much as possible, but all consumers
should remain vigilant and have their security stakeholders review all third-party
products (3PP) like this one and their dependencies.
--------------------------------------------------------------------------------
/app/__init__.py:
--------------------------------------------------------------------------------
"""
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""

from PIL import Image
import requests

import streamlit as st
import torch


@st.cache()
def load_demo_image():
    img_url = (
        "https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg"
    )
    raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB")
    return raw_image


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

cache_root = "/export/home/.cache/lavis/"
--------------------------------------------------------------------------------
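
The module above exposes the state shared by the Streamlit demo pages: a cached `load_demo_image()` download, the `device` selection, and the LAVIS cache root. A minimal sketch of how a demo page might combine these helpers with LAVIS's documented `load_model_and_preprocess` API is shown below; the page body itself is illustrative and not taken from the repository.

```python
# Illustrative sketch of a demo page built on the shared app helpers; not the repo's actual page code.
import streamlit as st

from app import device, load_demo_image
from lavis.models import load_model_and_preprocess


def app():
    st.markdown("<h1>Image Captioning</h1>", unsafe_allow_html=True)

    raw_image = load_demo_image()  # cached download of the BLIP demo image
    st.image(raw_image, use_column_width=True)

    # Model and type names follow the LAVIS model zoo (BLIP captioning here).
    model, vis_processors, _ = load_model_and_preprocess(
        name="blip_caption", model_type="base_coco", is_eval=True, device=device
    )

    image = vis_processors["eval"](raw_image).unsqueeze(0).to(device)
    st.write(model.generate({"image": image})[0])
```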
/app/main.py:
--------------------------------------------------------------------------------
"""
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""

from app.multipage import MultiPage
from app import vqa, caption
from app import image_text_match as itm
from app import text_localization as tl
from app import multimodal_search as ms
from app import classification as cl


if __name__ == "__main__":
    app = MultiPage()

    app.add_page("Image Description Generation", caption.app)
    app.add_page("Multimodal Search", ms.app)
    app.add_page("Visual Question Answering", vqa.app)
    app.add_page("Image Text Matching", itm.app)
    app.add_page("Text Localization", tl.app)
    app.add_page("Classification", cl.app)
    app.run()
--------------------------------------------------------------------------------
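
`app/multipage.py` itself is not reproduced in this dump. For orientation, a minimal `MultiPage` registry compatible with the `add_page`/`run` calls above could look like the following sketch; the sidebar selectbox and the internal field names are assumptions, not the repository's actual implementation.

```python
# Hypothetical sketch of a MultiPage helper; the real app/multipage.py may differ.
import streamlit as st


class MultiPage:
    """Registers (title, render function) pairs and renders the selected one."""

    def __init__(self):
        self.pages = []

    def add_page(self, title, func):
        # func is a zero-argument callable that draws one Streamlit page.
        self.pages.append({"title": title, "function": func})

    def run(self):
        # Let the user pick a page from the sidebar, then render it.
        page = st.sidebar.selectbox(
            "Navigation", self.pages, format_func=lambda p: p["title"]
        )
        page["function"]()
```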
/assets/demo-6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/assets/demo-6.png
--------------------------------------------------------------------------------
/dataset_card/gqa.md:
--------------------------------------------------------------------------------
![From https://arxiv.org/abs/1902.09506.](imgs/gqa.png)

# GQA Dataset

## Description
(from https://cs.stanford.edu/people/dorarad/gqa/about.html)

GQA is a VQA dataset for real-world images which requires visual, spatial and compositional reasoning.
It consists of 22M questions and 110K images.

## Task
(from https://arxiv.org/abs/1902.09506)

Given an image and a question, the model is required to output a correct answer.
GQA questions require spatial understanding, multiple reasoning skills and multi-step inference.

## Metrics

The metrics are accuracy, consistency, validity and plausibility. The most commonly reported metric is accuracy.

## Leaderboard

TBD

## Auto-Downloading

```
cd lavis/datasets/download_scripts && python download_gqa.py
```

## References

"GQA: A New Dataset for Real-World Visual Reasoning and Compositional Question Answering", Drew A. Hudson, Christopher D. Manning
--------------------------------------------------------------------------------
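
Once the download script has populated the local cache, GQA can typically be loaded through LAVIS's dataset builders. The snippet below is a sketch that assumes the builder is registered under the name `gqa` (matching `lavis/configs/datasets/gqa`); the exact sample fields depend on the dataset class.

```python
# Sketch: load GQA through LAVIS's dataset zoo after running download_gqa.py.
from lavis.datasets.builders import load_dataset

gqa = load_dataset("gqa")   # returns a dict of splits, e.g. train / val
sample = gqa["val"][0]      # one image-question pair
print(sample.keys())        # field names depend on the GQA dataset class
```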
-------------------------------------------------------------------------------- /dataset_card/imgs/sbu_caption.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/dataset_card/imgs/sbu_caption.png -------------------------------------------------------------------------------- /dataset_card/imgs/snli_ve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/dataset_card/imgs/snli_ve.png -------------------------------------------------------------------------------- /dataset_card/imgs/vqav2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/dataset_card/imgs/vqav2.png -------------------------------------------------------------------------------- /dataset_card/sbu_caption.md: -------------------------------------------------------------------------------- 1 | ![sbu caption](imgs/sbu_caption.png) 2 | (image credit: http://tamaraberg.com/papers/generation_nips2011.pdf) 3 | 4 | # SBU Caption Dataset 5 | (from http://tamaraberg.com/papers/generation_nips2011.pdf) 6 | 7 | SBU caption dataset is a new dataset, collected by performing Flickr queries and 8 | then filtering the noisy results down to 1 million images with associated visually 9 | relevant captions. 10 | 11 | ## Auto-Downloading 12 | ``` 13 | cd lavis/datasets/download_scripts && python download_sbu.py 14 | ``` 15 | ## References 16 | ```bibtex 17 | @inproceedings{Ordonez:2011:im2text, 18 | Author = {Vicente Ordonez and Girish Kulkarni and Tamara L. Berg}, 19 | Title = {Im2Text: Describing Images Using 1 Million Captioned Photographs}, 20 | Booktitle = {Neural Information Processing Systems ({NIPS})}, 21 | Year = {2011}, 22 | } 23 | ``` 24 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/_static/Confusing-Pictures.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/docs/_static/Confusing-Pictures.jpg -------------------------------------------------------------------------------- /docs/_static/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/docs/_static/architecture.png -------------------------------------------------------------------------------- /docs/_static/logo_final.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/docs/_static/logo_final.png -------------------------------------------------------------------------------- /docs/_static/merlion.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/docs/_static/merlion.png -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. LAVIS documentation master file, created by 2 | sphinx-quickstart on Sun Jul 31 10:32:27 2022. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to LAVIS's documentation! 7 | ================================= 8 | 9 | .. toctree:: 10 | :maxdepth: 1 11 | :caption: Introduction 12 | 13 | intro 14 | 15 | 16 | .. toctree:: 17 | :maxdepth: 1 18 | :caption: Getting Started 19 | 20 | getting_started 21 | 22 | 23 | .. :maxdepth: 1 24 | .. :caption: Advanced Training 25 | 26 | .. advanced_training 27 | 28 | 29 | .. toctree:: 30 | :maxdepth: 2 31 | :caption: Advanced Usage 32 | 33 | benchmark 34 | tutorial 35 | 36 | 37 | .. Documentations 38 | .. =================== 39 | 40 | 41 | Indices and tables 42 | ================== 43 | 44 | * :ref:`genindex` 45 | * :ref:`modindex` 46 | * :ref:`search` 47 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | GitPython 2 | ipykernel 3 | nbsphinx==0.8.7 4 | pandoc 5 | sphinx 6 | sphinx_autodoc_typehints 7 | sphinx_rtd_theme -------------------------------------------------------------------------------- /docs/tutorial.rst: -------------------------------------------------------------------------------- 1 | Tutorials 2 | ============================== 3 | 4 | .. toctree:: 5 | :maxdepth: 1 6 | 7 | tutorial.evaluation 8 | tutorial.training-example 9 | tutorial.configs 10 | tutorial.datasets 11 | tutorial.processors 12 | tutorial.models 13 | tutorial.tasks 14 | -------------------------------------------------------------------------------- /lavis/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import os 9 | import sys 10 | 11 | from omegaconf import OmegaConf 12 | 13 | from lavis.common.registry import registry 14 | 15 | from lavis.datasets.builders import * 16 | from lavis.models import * 17 | from lavis.processors import * 18 | from lavis.tasks import * 19 | 20 | 21 | root_dir = os.path.dirname(os.path.abspath(__file__)) 22 | default_cfg = OmegaConf.load(os.path.join(root_dir, "configs/default.yaml")) 23 | 24 | registry.register_path("library_root", root_dir) 25 | repo_root = os.path.join(root_dir, "..") 26 | registry.register_path("repo_root", repo_root) 27 | cache_root = os.path.join(repo_root, default_cfg.env.cache_root) 28 | registry.register_path("cache_root", cache_root) 29 | 30 | registry.register("MAX_INT", sys.maxsize) 31 | registry.register("SPLIT_NAMES", ["train", "val", "test"]) 32 | -------------------------------------------------------------------------------- /lavis/common/annotator/canny/__init__.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | 3 | 4 | class CannyDetector: 5 | def __call__(self, img, low_threshold, high_threshold): 6 | return cv2.Canny(img, low_threshold, high_threshold) 7 | -------------------------------------------------------------------------------- /lavis/common/annotator/ckpts/download.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | wget https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/dpt_hybrid-midas-501f0c75.pt 4 | wget https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/network-bsds500.pth 5 | 6 | -------------------------------------------------------------------------------- /lavis/common/annotator/midas/midas/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/lavis/common/annotator/midas/midas/__init__.py -------------------------------------------------------------------------------- /lavis/common/annotator/midas/midas/base_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class BaseModel(torch.nn.Module): 5 | def load(self, path): 6 | """Load model from file. 7 | 8 | Args: 9 | path (str): file path 10 | """ 11 | parameters = torch.load(path, map_location=torch.device('cpu')) 12 | 13 | if "optimizer" in parameters: 14 | parameters = parameters["model"] 15 | 16 | self.load_state_dict(parameters) 17 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from annotator.uniformer.mmseg.apis import init_segmentor, inference_segmentor, show_result_pyplot 4 | from annotator.uniformer.mmseg.core.evaluation import get_palette 5 | from annotator.util import annotator_ckpts_path 6 | 7 | 8 | checkpoint_file = "https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/upernet_global_small.pth" 9 | 10 | 11 | class UniformerDetector: 12 | def __init__(self): 13 | modelpath = os.path.join(annotator_ckpts_path, "upernet_global_small.pth") 14 | if not os.path.exists(modelpath): 15 | from basicsr.utils.download_util import load_file_from_url 16 | load_file_from_url(checkpoint_file, model_dir=annotator_ckpts_path) 17 | config_file = os.path.join(os.path.dirname(annotator_ckpts_path), "uniformer", "exp", "upernet_global_small", "config.py") 18 | self.model = init_segmentor(config_file, modelpath).cuda() 19 | 20 | def __call__(self, img): 21 | result = inference_segmentor(self.model, img) 22 | res_img = show_result_pyplot(self.model, img, result, get_palette('ade'), opacity=1) 23 | return res_img 24 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/configs/_base_/datasets/pascal_voc12_aug.py: -------------------------------------------------------------------------------- 1 | _base_ = './pascal_voc12.py' 2 | # dataset settings 3 | data = dict( 4 | train=dict( 5 | ann_dir=['SegmentationClass', 'SegmentationClassAug'], 6 | split=[ 7 | 'ImageSets/Segmentation/train.txt', 8 | 'ImageSets/Segmentation/aug.txt' 9 | ])) 10 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/configs/_base_/default_runtime.py: -------------------------------------------------------------------------------- 1 | # yapf:disable 2 | log_config = dict( 3 | interval=50, 4 | hooks=[ 5 | dict(type='TextLoggerHook', by_epoch=False), 6 | # dict(type='TensorboardLoggerHook') 7 | ]) 8 | # yapf:enable 9 | dist_params = dict(backend='nccl') 10 | log_level = 'INFO' 11 | load_from = None 12 | resume_from = None 13 | workflow = [('train', 1)] 14 | cudnn_benchmark = True 15 | 
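Relating back to the annotator wrappers above (ckpts/download.sh and the CannyDetector / UniformerDetector classes): a minimal sketch of how one of these callables might be used. This is illustrative only; the image path and thresholds are hypothetical, OpenCV must be installed, the `lavis.common.annotator.canny` package path is assumed importable from the repository root, and UniformerDetector additionally requires a CUDA device plus the checkpoint fetched by the download script.

```python
# Illustrative sketch (not repository code) exercising the CannyDetector wrapper
# defined in lavis/common/annotator/canny/__init__.py above.
import cv2

from lavis.common.annotator.canny import CannyDetector

img = cv2.imread("example.jpg")  # hypothetical input; any BGR image readable by OpenCV
edges = CannyDetector()(img, low_threshold=100, high_threshold=200)  # thresholds are illustrative
cv2.imwrite("example_edges.png", edges)  # edge map usable as a ControlNet-style conditioning hint
```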
-------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/configs/_base_/models/fpn_uniformer.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | backbone=dict( 6 | type='UniFormer', 7 | embed_dim=[64, 128, 320, 512], 8 | layers=[3, 4, 8, 3], 9 | head_dim=64, 10 | mlp_ratio=4., 11 | qkv_bias=True, 12 | drop_rate=0., 13 | attn_drop_rate=0., 14 | drop_path_rate=0.1), 15 | neck=dict( 16 | type='FPN', 17 | in_channels=[64, 128, 320, 512], 18 | out_channels=256, 19 | num_outs=4), 20 | decode_head=dict( 21 | type='FPNHead', 22 | in_channels=[256, 256, 256, 256], 23 | in_index=[0, 1, 2, 3], 24 | feature_strides=[4, 8, 16, 32], 25 | channels=128, 26 | dropout_ratio=0.1, 27 | num_classes=150, 28 | norm_cfg=norm_cfg, 29 | align_corners=False, 30 | loss_decode=dict( 31 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 32 | # model training and testing settings 33 | train_cfg=dict(), 34 | test_cfg=dict(mode='whole') 35 | ) 36 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/configs/_base_/models/lraspp_m-v3-d8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', eps=0.001, requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | backbone=dict( 6 | type='MobileNetV3', 7 | arch='large', 8 | out_indices=(1, 3, 16), 9 | norm_cfg=norm_cfg), 10 | decode_head=dict( 11 | type='LRASPPHead', 12 | in_channels=(16, 24, 960), 13 | in_index=(0, 1, 2), 14 | channels=128, 15 | input_transform='multiple_select', 16 | dropout_ratio=0.1, 17 | num_classes=19, 18 | norm_cfg=norm_cfg, 19 | act_cfg=dict(type='ReLU'), 20 | align_corners=False, 21 | loss_decode=dict( 22 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 23 | # model training and testing settings 24 | train_cfg=dict(), 25 | test_cfg=dict(mode='whole')) 26 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/configs/_base_/schedules/schedule_160k.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) 3 | optimizer_config = dict() 4 | # learning policy 5 | lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) 6 | # runtime settings 7 | runner = dict(type='IterBasedRunner', max_iters=160000) 8 | checkpoint_config = dict(by_epoch=False, interval=16000) 9 | evaluation = dict(interval=16000, metric='mIoU') 10 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/configs/_base_/schedules/schedule_20k.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) 3 | optimizer_config = dict() 4 | # learning policy 5 | lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) 6 | # runtime settings 7 | runner = dict(type='IterBasedRunner', max_iters=20000) 8 | checkpoint_config = dict(by_epoch=False, interval=2000) 9 | evaluation = dict(interval=2000, metric='mIoU') 10 | -------------------------------------------------------------------------------- 
/lavis/common/annotator/uniformer/configs/_base_/schedules/schedule_40k.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) 3 | optimizer_config = dict() 4 | # learning policy 5 | lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) 6 | # runtime settings 7 | runner = dict(type='IterBasedRunner', max_iters=40000) 8 | checkpoint_config = dict(by_epoch=False, interval=4000) 9 | evaluation = dict(interval=4000, metric='mIoU') 10 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/configs/_base_/schedules/schedule_80k.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) 3 | optimizer_config = dict() 4 | # learning policy 5 | lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) 6 | # runtime settings 7 | runner = dict(type='IterBasedRunner', max_iters=80000) 8 | checkpoint_config = dict(by_epoch=False, interval=8000) 9 | evaluation = dict(interval=8000, metric='mIoU') 10 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/exp/upernet_global_small/run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | work_path=$(dirname $0) 4 | PYTHONPATH="$(dirname $0)/../../":$PYTHONPATH \ 5 | python -m torch.distributed.launch --nproc_per_node=8 \ 6 | tools/train.py ${work_path}/config.py \ 7 | --launcher pytorch \ 8 | --options model.backbone.pretrained_path='your_model_path/uniformer_small_in1k.pth' \ 9 | --work-dir ${work_path}/ckpt \ 10 | 2>&1 | tee -a ${work_path}/log.txt 11 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/exp/upernet_global_small/test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | work_path=$(dirname $0) 4 | PYTHONPATH="$(dirname $0)/../../":$PYTHONPATH \ 5 | python -m torch.distributed.launch --nproc_per_node=8 \ 6 | tools/test.py ${work_path}/test_config_h32.py \ 7 | ${work_path}/ckpt/latest.pth \ 8 | --launcher pytorch \ 9 | --eval mIoU \ 10 | 2>&1 | tee -a ${work_path}/log.txt 11 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | # flake8: noqa 3 | from .arraymisc import * 4 | from .fileio import * 5 | from .image import * 6 | from .utils import * 7 | from .version import * 8 | from .video import * 9 | from .visualization import * 10 | 11 | # The following modules are not imported to this level, so mmcv may be used 12 | # without PyTorch. 13 | # - runner 14 | # - parallel 15 | # - op 16 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/arraymisc/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | from .quantization import dequantize, quantize 3 | 4 | __all__ = ['quantize', 'dequantize'] 5 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/cnn/bricks/hswish.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import torch.nn as nn 3 | 4 | from .registry import ACTIVATION_LAYERS 5 | 6 | 7 | @ACTIVATION_LAYERS.register_module() 8 | class HSwish(nn.Module): 9 | """Hard Swish Module. 10 | 11 | This module applies the hard swish function: 12 | 13 | .. math:: 14 | Hswish(x) = x * ReLU6(x + 3) / 6 15 | 16 | Args: 17 | inplace (bool): can optionally do the operation in-place. 18 | Default: False. 19 | 20 | Returns: 21 | Tensor: The output tensor. 22 | """ 23 | 24 | def __init__(self, inplace=False): 25 | super(HSwish, self).__init__() 26 | self.act = nn.ReLU6(inplace) 27 | 28 | def forward(self, x): 29 | return x * self.act(x + 3) / 6 30 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/cnn/bricks/registry.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from annotator.uniformer.mmcv.utils import Registry 3 | 4 | CONV_LAYERS = Registry('conv layer') 5 | NORM_LAYERS = Registry('norm layer') 6 | ACTIVATION_LAYERS = Registry('activation layer') 7 | PADDING_LAYERS = Registry('padding layer') 8 | UPSAMPLE_LAYERS = Registry('upsample layer') 9 | PLUGIN_LAYERS = Registry('plugin layer') 10 | 11 | DROPOUT_LAYERS = Registry('drop out layers') 12 | POSITIONAL_ENCODING = Registry('position encoding') 13 | ATTENTION = Registry('attention') 14 | FEEDFORWARD_NETWORK = Registry('feed-forward Network') 15 | TRANSFORMER_LAYER = Registry('transformerLayer') 16 | TRANSFORMER_LAYER_SEQUENCE = Registry('transformer-layers sequence') 17 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/cnn/bricks/scale.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import torch 3 | import torch.nn as nn 4 | 5 | 6 | class Scale(nn.Module): 7 | """A learnable scale parameter. 8 | 9 | This layer scales the input by a learnable factor. It multiplies a 10 | learnable scale parameter of shape (1,) with input of any shape. 11 | 12 | Args: 13 | scale (float): Initial value of scale factor. Default: 1.0 14 | """ 15 | 16 | def __init__(self, scale=1.0): 17 | super(Scale, self).__init__() 18 | self.scale = nn.Parameter(torch.tensor(scale, dtype=torch.float)) 19 | 20 | def forward(self, x): 21 | return x * self.scale 22 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/cnn/bricks/swish.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import torch 3 | import torch.nn as nn 4 | 5 | from .registry import ACTIVATION_LAYERS 6 | 7 | 8 | @ACTIVATION_LAYERS.register_module() 9 | class Swish(nn.Module): 10 | """Swish Module. 11 | 12 | This module applies the swish function: 13 | 14 | .. math:: 15 | Swish(x) = x * Sigmoid(x) 16 | 17 | Returns: 18 | Tensor: The output tensor. 
19 | """ 20 | 21 | def __init__(self): 22 | super(Swish, self).__init__() 23 | 24 | def forward(self, x): 25 | return x * torch.sigmoid(x) 26 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/cnn/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .flops_counter import get_model_complexity_info 3 | from .fuse_conv_bn import fuse_conv_bn 4 | from .sync_bn import revert_sync_batchnorm 5 | from .weight_init import (INITIALIZERS, Caffe2XavierInit, ConstantInit, 6 | KaimingInit, NormalInit, PretrainedInit, 7 | TruncNormalInit, UniformInit, XavierInit, 8 | bias_init_with_prob, caffe2_xavier_init, 9 | constant_init, initialize, kaiming_init, normal_init, 10 | trunc_normal_init, uniform_init, xavier_init) 11 | 12 | __all__ = [ 13 | 'get_model_complexity_info', 'bias_init_with_prob', 'caffe2_xavier_init', 14 | 'constant_init', 'kaiming_init', 'normal_init', 'trunc_normal_init', 15 | 'uniform_init', 'xavier_init', 'fuse_conv_bn', 'initialize', 16 | 'INITIALIZERS', 'ConstantInit', 'XavierInit', 'NormalInit', 17 | 'TruncNormalInit', 'UniformInit', 'KaimingInit', 'PretrainedInit', 18 | 'Caffe2XavierInit', 'revert_sync_batchnorm' 19 | ] 20 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/engine/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .test import (collect_results_cpu, collect_results_gpu, multi_gpu_test, 3 | single_gpu_test) 4 | 5 | __all__ = [ 6 | 'collect_results_cpu', 'collect_results_gpu', 'multi_gpu_test', 7 | 'single_gpu_test' 8 | ] 9 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/fileio/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .file_client import BaseStorageBackend, FileClient 3 | from .handlers import BaseFileHandler, JsonHandler, PickleHandler, YamlHandler 4 | from .io import dump, load, register_handler 5 | from .parse import dict_from_file, list_from_file 6 | 7 | __all__ = [ 8 | 'BaseStorageBackend', 'FileClient', 'load', 'dump', 'register_handler', 9 | 'BaseFileHandler', 'JsonHandler', 'PickleHandler', 'YamlHandler', 10 | 'list_from_file', 'dict_from_file' 11 | ] 12 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/fileio/handlers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .base import BaseFileHandler 3 | from .json_handler import JsonHandler 4 | from .pickle_handler import PickleHandler 5 | from .yaml_handler import YamlHandler 6 | 7 | __all__ = ['BaseFileHandler', 'JsonHandler', 'PickleHandler', 'YamlHandler'] 8 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/fileio/handlers/base.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | from abc import ABCMeta, abstractmethod 3 | 4 | 5 | class BaseFileHandler(metaclass=ABCMeta): 6 | # `str_like` is a flag to indicate whether the type of file object is 7 | # str-like object or bytes-like object. Pickle only processes bytes-like 8 | # objects but json only processes str-like object. If it is str-like 9 | # object, `StringIO` will be used to process the buffer. 10 | str_like = True 11 | 12 | @abstractmethod 13 | def load_from_fileobj(self, file, **kwargs): 14 | pass 15 | 16 | @abstractmethod 17 | def dump_to_fileobj(self, obj, file, **kwargs): 18 | pass 19 | 20 | @abstractmethod 21 | def dump_to_str(self, obj, **kwargs): 22 | pass 23 | 24 | def load_from_path(self, filepath, mode='r', **kwargs): 25 | with open(filepath, mode) as f: 26 | return self.load_from_fileobj(f, **kwargs) 27 | 28 | def dump_to_path(self, obj, filepath, mode='w', **kwargs): 29 | with open(filepath, mode) as f: 30 | self.dump_to_fileobj(obj, f, **kwargs) 31 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/fileio/handlers/pickle_handler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import pickle 3 | 4 | from .base import BaseFileHandler 5 | 6 | 7 | class PickleHandler(BaseFileHandler): 8 | 9 | str_like = False 10 | 11 | def load_from_fileobj(self, file, **kwargs): 12 | return pickle.load(file, **kwargs) 13 | 14 | def load_from_path(self, filepath, **kwargs): 15 | return super(PickleHandler, self).load_from_path( 16 | filepath, mode='rb', **kwargs) 17 | 18 | def dump_to_str(self, obj, **kwargs): 19 | kwargs.setdefault('protocol', 2) 20 | return pickle.dumps(obj, **kwargs) 21 | 22 | def dump_to_fileobj(self, obj, file, **kwargs): 23 | kwargs.setdefault('protocol', 2) 24 | pickle.dump(obj, file, **kwargs) 25 | 26 | def dump_to_path(self, obj, filepath, **kwargs): 27 | super(PickleHandler, self).dump_to_path( 28 | obj, filepath, mode='wb', **kwargs) 29 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/fileio/handlers/yaml_handler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | import yaml 3 | 4 | try: 5 | from yaml import CLoader as Loader, CDumper as Dumper 6 | except ImportError: 7 | from yaml import Loader, Dumper 8 | 9 | from .base import BaseFileHandler # isort:skip 10 | 11 | 12 | class YamlHandler(BaseFileHandler): 13 | 14 | def load_from_fileobj(self, file, **kwargs): 15 | kwargs.setdefault('Loader', Loader) 16 | return yaml.load(file, **kwargs) 17 | 18 | def dump_to_fileobj(self, obj, file, **kwargs): 19 | kwargs.setdefault('Dumper', Dumper) 20 | yaml.dump(obj, file, **kwargs) 21 | 22 | def dump_to_str(self, obj, **kwargs): 23 | kwargs.setdefault('Dumper', Dumper) 24 | return yaml.dump(obj, **kwargs) 25 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/model_zoo/deprecated.json: -------------------------------------------------------------------------------- 1 | { 2 | "resnet50_caffe": "detectron/resnet50_caffe", 3 | "resnet50_caffe_bgr": "detectron2/resnet50_caffe_bgr", 4 | "resnet101_caffe": "detectron/resnet101_caffe", 5 | "resnet101_caffe_bgr": "detectron2/resnet101_caffe_bgr" 6 | } 7 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/ops/info.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import glob 3 | import os 4 | 5 | import torch 6 | 7 | if torch.__version__ == 'parrots': 8 | import parrots 9 | 10 | def get_compiler_version(): 11 | return 'GCC ' + parrots.version.compiler 12 | 13 | def get_compiling_cuda_version(): 14 | return parrots.version.cuda 15 | else: 16 | from ..utils import ext_loader 17 | ext_module = ext_loader.load_ext( 18 | '_ext', ['get_compiler_version', 'get_compiling_cuda_version']) 19 | 20 | def get_compiler_version(): 21 | return ext_module.get_compiler_version() 22 | 23 | def get_compiling_cuda_version(): 24 | return ext_module.get_compiling_cuda_version() 25 | 26 | 27 | def get_onnxruntime_op_path(): 28 | wildcard = os.path.join( 29 | os.path.abspath(os.path.dirname(os.path.dirname(__file__))), 30 | '_ext_ort.*.so') 31 | 32 | paths = glob.glob(wildcard) 33 | if len(paths) > 0: 34 | return paths[0] 35 | else: 36 | return '' 37 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/parallel/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .collate import collate 3 | from .data_container import DataContainer 4 | from .data_parallel import MMDataParallel 5 | from .distributed import MMDistributedDataParallel 6 | from .registry import MODULE_WRAPPERS 7 | from .scatter_gather import scatter, scatter_kwargs 8 | from .utils import is_module_wrapper 9 | 10 | __all__ = [ 11 | 'collate', 'DataContainer', 'MMDataParallel', 'MMDistributedDataParallel', 12 | 'scatter', 'scatter_kwargs', 'is_module_wrapper', 'MODULE_WRAPPERS' 13 | ] 14 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/parallel/registry.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | from torch.nn.parallel import DataParallel, DistributedDataParallel 3 | 4 | from annotator.uniformer.mmcv.utils import Registry 5 | 6 | MODULE_WRAPPERS = Registry('module wrapper') 7 | MODULE_WRAPPERS.register_module(module=DataParallel) 8 | MODULE_WRAPPERS.register_module(module=DistributedDataParallel) 9 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/parallel/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .registry import MODULE_WRAPPERS 3 | 4 | 5 | def is_module_wrapper(module): 6 | """Check if a module is a module wrapper. 7 | 8 | The following 3 modules in MMCV (and their subclasses) are regarded as 9 | module wrappers: DataParallel, DistributedDataParallel, 10 | MMDistributedDataParallel (the deprecated version). You may add you own 11 | module wrapper by registering it to mmcv.parallel.MODULE_WRAPPERS. 12 | 13 | Args: 14 | module (nn.Module): The module to be checked. 15 | 16 | Returns: 17 | bool: True if the input module is a module wrapper. 18 | """ 19 | module_wrappers = tuple(MODULE_WRAPPERS.module_dict.values()) 20 | return isinstance(module, module_wrappers) 21 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/runner/builder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import copy 3 | 4 | from ..utils import Registry 5 | 6 | RUNNERS = Registry('runner') 7 | RUNNER_BUILDERS = Registry('runner builder') 8 | 9 | 10 | def build_runner_constructor(cfg): 11 | return RUNNER_BUILDERS.build(cfg) 12 | 13 | 14 | def build_runner(cfg, default_args=None): 15 | runner_cfg = copy.deepcopy(cfg) 16 | constructor_type = runner_cfg.pop('constructor', 17 | 'DefaultRunnerConstructor') 18 | runner_constructor = build_runner_constructor( 19 | dict( 20 | type=constructor_type, 21 | runner_cfg=runner_cfg, 22 | default_args=default_args)) 23 | runner = runner_constructor() 24 | return runner 25 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/runner/hooks/closure.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .hook import HOOKS, Hook 3 | 4 | 5 | @HOOKS.register_module() 6 | class ClosureHook(Hook): 7 | 8 | def __init__(self, fn_name, fn): 9 | assert hasattr(self, fn_name) 10 | assert callable(fn) 11 | setattr(self, fn_name, fn) 12 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/runner/hooks/iter_timer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | import time 3 | 4 | from .hook import HOOKS, Hook 5 | 6 | 7 | @HOOKS.register_module() 8 | class IterTimerHook(Hook): 9 | 10 | def before_epoch(self, runner): 11 | self.t = time.time() 12 | 13 | def before_iter(self, runner): 14 | runner.log_buffer.update({'data_time': time.time() - self.t}) 15 | 16 | def after_iter(self, runner): 17 | runner.log_buffer.update({'time': time.time() - self.t}) 18 | self.t = time.time() 19 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/runner/hooks/logger/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .base import LoggerHook 3 | from .dvclive import DvcliveLoggerHook 4 | from .mlflow import MlflowLoggerHook 5 | from .neptune import NeptuneLoggerHook 6 | from .pavi import PaviLoggerHook 7 | from .tensorboard import TensorboardLoggerHook 8 | from .text import TextLoggerHook 9 | from .wandb import WandbLoggerHook 10 | 11 | __all__ = [ 12 | 'LoggerHook', 'MlflowLoggerHook', 'PaviLoggerHook', 13 | 'TensorboardLoggerHook', 'TextLoggerHook', 'WandbLoggerHook', 14 | 'NeptuneLoggerHook', 'DvcliveLoggerHook' 15 | ] 16 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/runner/hooks/memory.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import torch 3 | 4 | from .hook import HOOKS, Hook 5 | 6 | 7 | @HOOKS.register_module() 8 | class EmptyCacheHook(Hook): 9 | 10 | def __init__(self, before_epoch=False, after_epoch=True, after_iter=False): 11 | self._before_epoch = before_epoch 12 | self._after_epoch = after_epoch 13 | self._after_iter = after_iter 14 | 15 | def after_iter(self, runner): 16 | if self._after_iter: 17 | torch.cuda.empty_cache() 18 | 19 | def before_epoch(self, runner): 20 | if self._before_epoch: 21 | torch.cuda.empty_cache() 22 | 23 | def after_epoch(self, runner): 24 | if self._after_epoch: 25 | torch.cuda.empty_cache() 26 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/runner/hooks/sampler_seed.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .hook import HOOKS, Hook 3 | 4 | 5 | @HOOKS.register_module() 6 | class DistSamplerSeedHook(Hook): 7 | """Data-loading sampler for distributed training. 8 | 9 | When distributed training, it is only useful in conjunction with 10 | :obj:`EpochBasedRunner`, while :obj:`IterBasedRunner` achieves the same 11 | purpose with :obj:`IterLoader`. 12 | """ 13 | 14 | def before_epoch(self, runner): 15 | if hasattr(runner.data_loader.sampler, 'set_epoch'): 16 | # in case the data loader uses `SequentialSampler` in Pytorch 17 | runner.data_loader.sampler.set_epoch(runner.epoch) 18 | elif hasattr(runner.data_loader.batch_sampler.sampler, 'set_epoch'): 19 | # batch sampler in pytorch warps the sampler as its attributes. 20 | runner.data_loader.batch_sampler.sampler.set_epoch(runner.epoch) 21 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/runner/hooks/sync_buffer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | from ..dist_utils import allreduce_params 3 | from .hook import HOOKS, Hook 4 | 5 | 6 | @HOOKS.register_module() 7 | class SyncBuffersHook(Hook): 8 | """Synchronize model buffers such as running_mean and running_var in BN at 9 | the end of each epoch. 10 | 11 | Args: 12 | distributed (bool): Whether distributed training is used. It is 13 | effective only for distributed training. Defaults to True. 14 | """ 15 | 16 | def __init__(self, distributed=True): 17 | self.distributed = distributed 18 | 19 | def after_epoch(self, runner): 20 | """All-reduce model buffers at the end of each epoch.""" 21 | if self.distributed: 22 | allreduce_params(runner.model.buffers()) 23 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/runner/optimizer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .builder import (OPTIMIZER_BUILDERS, OPTIMIZERS, build_optimizer, 3 | build_optimizer_constructor) 4 | from .default_constructor import DefaultOptimizerConstructor 5 | 6 | __all__ = [ 7 | 'OPTIMIZER_BUILDERS', 'OPTIMIZERS', 'DefaultOptimizerConstructor', 8 | 'build_optimizer', 'build_optimizer_constructor' 9 | ] 10 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/utils/parrots_jit.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import os 3 | 4 | from .parrots_wrapper import TORCH_VERSION 5 | 6 | parrots_jit_option = os.getenv('PARROTS_JIT_OPTION') 7 | 8 | if TORCH_VERSION == 'parrots' and parrots_jit_option == 'ON': 9 | from parrots.jit import pat as jit 10 | else: 11 | 12 | def jit(func=None, 13 | check_input=None, 14 | full_shape=True, 15 | derivate=False, 16 | coderize=False, 17 | optimize=False): 18 | 19 | def wrapper(func): 20 | 21 | def wrapper_inner(*args, **kargs): 22 | return func(*args, **kargs) 23 | 24 | return wrapper_inner 25 | 26 | if func is None: 27 | return wrapper 28 | else: 29 | return func 30 | 31 | 32 | if TORCH_VERSION == 'parrots': 33 | from parrots.utils.tester import skip_no_elena 34 | else: 35 | 36 | def skip_no_elena(func): 37 | 38 | def wrapper(*args, **kargs): 39 | return func(*args, **kargs) 40 | 41 | return wrapper 42 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/utils/trace.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | import torch 4 | 5 | from annotator.uniformer.mmcv.utils import digit_version 6 | 7 | 8 | def is_jit_tracing() -> bool: 9 | if (torch.__version__ != 'parrots' 10 | and digit_version(torch.__version__) >= digit_version('1.6.0')): 11 | on_trace = torch.jit.is_tracing() 12 | # In PyTorch 1.6, torch.jit.is_tracing has a bug. 13 | # Refers to https://github.com/pytorch/pytorch/issues/42448 14 | if isinstance(on_trace, bool): 15 | return on_trace 16 | else: 17 | return torch._C._is_tracing() 18 | else: 19 | warnings.warn( 20 | 'torch.jit.is_tracing is only supported after v1.6.0. ' 21 | 'Therefore is_tracing returns False automatically. 
Please ' 22 | 'set on_trace manually if you are using trace.', UserWarning) 23 | return False 24 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/video/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .io import Cache, VideoReader, frames2video 3 | from .optflow import (dequantize_flow, flow_from_bytes, flow_warp, flowread, 4 | flowwrite, quantize_flow, sparse_flow_from_bytes) 5 | from .processing import concat_video, convert_video, cut_video, resize_video 6 | 7 | __all__ = [ 8 | 'Cache', 'VideoReader', 'frames2video', 'convert_video', 'resize_video', 9 | 'cut_video', 'concat_video', 'flowread', 'flowwrite', 'quantize_flow', 10 | 'dequantize_flow', 'flow_warp', 'flow_from_bytes', 'sparse_flow_from_bytes' 11 | ] 12 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/visualization/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .color import Color, color_val 3 | from .image import imshow, imshow_bboxes, imshow_det_bboxes 4 | from .optflow import flow2rgb, flowshow, make_color_wheel 5 | 6 | __all__ = [ 7 | 'Color', 'color_val', 'imshow', 'imshow_bboxes', 'imshow_det_bboxes', 8 | 'flowshow', 'flow2rgb', 'make_color_wheel' 9 | ] 10 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv_custom/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from .checkpoint import load_checkpoint 4 | 5 | __all__ = ['load_checkpoint'] -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmseg/apis/__init__.py: -------------------------------------------------------------------------------- 1 | from .inference import inference_segmentor, init_segmentor, show_result_pyplot 2 | from .test import multi_gpu_test, single_gpu_test 3 | from .train import get_root_logger, set_random_seed, train_segmentor 4 | 5 | __all__ = [ 6 | 'get_root_logger', 'set_random_seed', 'train_segmentor', 'init_segmentor', 7 | 'inference_segmentor', 'multi_gpu_test', 'single_gpu_test', 8 | 'show_result_pyplot' 9 | ] 10 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmseg/core/__init__.py: -------------------------------------------------------------------------------- 1 | from .evaluation import * # noqa: F401, F403 2 | from .seg import * # noqa: F401, F403 3 | from .utils import * # noqa: F401, F403 4 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmseg/core/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from .class_names import get_classes, get_palette 2 | from .eval_hooks import DistEvalHook, EvalHook 3 | from .metrics import eval_metrics, mean_dice, mean_fscore, mean_iou 4 | 5 | __all__ = [ 6 | 'EvalHook', 'DistEvalHook', 'mean_dice', 'mean_iou', 'mean_fscore', 7 | 'eval_metrics', 'get_classes', 'get_palette' 8 | ] 9 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmseg/core/seg/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .builder import build_pixel_sampler 2 | from .sampler import BasePixelSampler, OHEMPixelSampler 3 | 4 | __all__ = ['build_pixel_sampler', 'BasePixelSampler', 'OHEMPixelSampler'] 5 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmseg/core/seg/builder.py: -------------------------------------------------------------------------------- 1 | from annotator.uniformer.mmcv.utils import Registry, build_from_cfg 2 | 3 | PIXEL_SAMPLERS = Registry('pixel sampler') 4 | 5 | 6 | def build_pixel_sampler(cfg, **default_args): 7 | """Build pixel sampler for segmentation map.""" 8 | return build_from_cfg(cfg, PIXEL_SAMPLERS, default_args) 9 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmseg/core/seg/sampler/__init__.py: -------------------------------------------------------------------------------- 1 | from .base_pixel_sampler import BasePixelSampler 2 | from .ohem_pixel_sampler import OHEMPixelSampler 3 | 4 | __all__ = ['BasePixelSampler', 'OHEMPixelSampler'] 5 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmseg/core/seg/sampler/base_pixel_sampler.py: -------------------------------------------------------------------------------- 1 | from abc import ABCMeta, abstractmethod 2 | 3 | 4 | class BasePixelSampler(metaclass=ABCMeta): 5 | """Base class of pixel sampler.""" 6 | 7 | def __init__(self, **kwargs): 8 | pass 9 | 10 | @abstractmethod 11 | def sample(self, seg_logit, seg_label): 12 | """Placeholder for sample function.""" 13 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmseg/core/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .misc import add_prefix 2 | 3 | __all__ = ['add_prefix'] 4 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmseg/core/utils/misc.py: -------------------------------------------------------------------------------- 1 | def add_prefix(inputs, prefix): 2 | """Add prefix for dict. 3 | 4 | Args: 5 | inputs (dict): The input dict with str keys. 6 | prefix (str): The prefix to add. 7 | 8 | Returns: 9 | 10 | dict: The dict with keys updated with ``prefix``. 
11 | """ 12 | 13 | outputs = dict() 14 | for name, value in inputs.items(): 15 | outputs[f'{prefix}.{name}'] = value 16 | 17 | return outputs 18 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmseg/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .ade import ADE20KDataset 2 | from .builder import DATASETS, PIPELINES, build_dataloader, build_dataset 3 | from .chase_db1 import ChaseDB1Dataset 4 | from .cityscapes import CityscapesDataset 5 | from .custom import CustomDataset 6 | from .dataset_wrappers import ConcatDataset, RepeatDataset 7 | from .drive import DRIVEDataset 8 | from .hrf import HRFDataset 9 | from .pascal_context import PascalContextDataset, PascalContextDataset59 10 | from .stare import STAREDataset 11 | from .voc import PascalVOCDataset 12 | 13 | __all__ = [ 14 | 'CustomDataset', 'build_dataloader', 'ConcatDataset', 'RepeatDataset', 15 | 'DATASETS', 'build_dataset', 'PIPELINES', 'CityscapesDataset', 16 | 'PascalVOCDataset', 'ADE20KDataset', 'PascalContextDataset', 17 | 'PascalContextDataset59', 'ChaseDB1Dataset', 'DRIVEDataset', 'HRFDataset', 18 | 'STAREDataset' 19 | ] 20 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmseg/datasets/chase_db1.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | 3 | from .builder import DATASETS 4 | from .custom import CustomDataset 5 | 6 | 7 | @DATASETS.register_module() 8 | class ChaseDB1Dataset(CustomDataset): 9 | """Chase_db1 dataset. 10 | 11 | In segmentation map annotation for Chase_db1, 0 stands for background, 12 | which is included in 2 categories. ``reduce_zero_label`` is fixed to False. 13 | The ``img_suffix`` is fixed to '.png' and ``seg_map_suffix`` is fixed to 14 | '_1stHO.png'. 15 | """ 16 | 17 | CLASSES = ('background', 'vessel') 18 | 19 | PALETTE = [[120, 120, 120], [6, 230, 230]] 20 | 21 | def __init__(self, **kwargs): 22 | super(ChaseDB1Dataset, self).__init__( 23 | img_suffix='.png', 24 | seg_map_suffix='_1stHO.png', 25 | reduce_zero_label=False, 26 | **kwargs) 27 | assert osp.exists(self.img_dir) 28 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmseg/datasets/drive.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | 3 | from .builder import DATASETS 4 | from .custom import CustomDataset 5 | 6 | 7 | @DATASETS.register_module() 8 | class DRIVEDataset(CustomDataset): 9 | """DRIVE dataset. 10 | 11 | In segmentation map annotation for DRIVE, 0 stands for background, which is 12 | included in 2 categories. ``reduce_zero_label`` is fixed to False. The 13 | ``img_suffix`` is fixed to '.png' and ``seg_map_suffix`` is fixed to 14 | '_manual1.png'. 
15 | """ 16 | 17 | CLASSES = ('background', 'vessel') 18 | 19 | PALETTE = [[120, 120, 120], [6, 230, 230]] 20 | 21 | def __init__(self, **kwargs): 22 | super(DRIVEDataset, self).__init__( 23 | img_suffix='.png', 24 | seg_map_suffix='_manual1.png', 25 | reduce_zero_label=False, 26 | **kwargs) 27 | assert osp.exists(self.img_dir) 28 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmseg/datasets/hrf.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | 3 | from .builder import DATASETS 4 | from .custom import CustomDataset 5 | 6 | 7 | @DATASETS.register_module() 8 | class HRFDataset(CustomDataset): 9 | """HRF dataset. 10 | 11 | In segmentation map annotation for HRF, 0 stands for background, which is 12 | included in 2 categories. ``reduce_zero_label`` is fixed to False. The 13 | ``img_suffix`` is fixed to '.png' and ``seg_map_suffix`` is fixed to 14 | '.png'. 15 | """ 16 | 17 | CLASSES = ('background', 'vessel') 18 | 19 | PALETTE = [[120, 120, 120], [6, 230, 230]] 20 | 21 | def __init__(self, **kwargs): 22 | super(HRFDataset, self).__init__( 23 | img_suffix='.png', 24 | seg_map_suffix='.png', 25 | reduce_zero_label=False, 26 | **kwargs) 27 | assert osp.exists(self.img_dir) 28 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmseg/datasets/pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | from .compose import Compose 2 | from .formating import (Collect, ImageToTensor, ToDataContainer, ToTensor, 3 | Transpose, to_tensor) 4 | from .loading import LoadAnnotations, LoadImageFromFile 5 | from .test_time_aug import MultiScaleFlipAug 6 | from .transforms import (CLAHE, AdjustGamma, Normalize, Pad, 7 | PhotoMetricDistortion, RandomCrop, RandomFlip, 8 | RandomRotate, Rerange, Resize, RGB2Gray, SegRescale) 9 | 10 | __all__ = [ 11 | 'Compose', 'to_tensor', 'ToTensor', 'ImageToTensor', 'ToDataContainer', 12 | 'Transpose', 'Collect', 'LoadAnnotations', 'LoadImageFromFile', 13 | 'MultiScaleFlipAug', 'Resize', 'RandomFlip', 'Pad', 'RandomCrop', 14 | 'Normalize', 'SegRescale', 'PhotoMetricDistortion', 'RandomRotate', 15 | 'AdjustGamma', 'CLAHE', 'Rerange', 'RGB2Gray' 16 | ] 17 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmseg/datasets/stare.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | 3 | from .builder import DATASETS 4 | from .custom import CustomDataset 5 | 6 | 7 | @DATASETS.register_module() 8 | class STAREDataset(CustomDataset): 9 | """STARE dataset. 10 | 11 | In segmentation map annotation for STARE, 0 stands for background, which is 12 | included in 2 categories. ``reduce_zero_label`` is fixed to False. The 13 | ``img_suffix`` is fixed to '.png' and ``seg_map_suffix`` is fixed to 14 | '.ah.png'. 
15 | """ 16 | 17 | CLASSES = ('background', 'vessel') 18 | 19 | PALETTE = [[120, 120, 120], [6, 230, 230]] 20 | 21 | def __init__(self, **kwargs): 22 | super(STAREDataset, self).__init__( 23 | img_suffix='.png', 24 | seg_map_suffix='.ah.png', 25 | reduce_zero_label=False, 26 | **kwargs) 27 | assert osp.exists(self.img_dir) 28 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmseg/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .backbones import * # noqa: F401,F403 2 | from .builder import (BACKBONES, HEADS, LOSSES, SEGMENTORS, build_backbone, 3 | build_head, build_loss, build_segmentor) 4 | from .decode_heads import * # noqa: F401,F403 5 | from .losses import * # noqa: F401,F403 6 | from .necks import * # noqa: F401,F403 7 | from .segmentors import * # noqa: F401,F403 8 | 9 | __all__ = [ 10 | 'BACKBONES', 'HEADS', 'LOSSES', 'SEGMENTORS', 'build_backbone', 11 | 'build_head', 'build_loss', 'build_segmentor' 12 | ] 13 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmseg/models/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | from .cgnet import CGNet 2 | # from .fast_scnn import FastSCNN 3 | from .hrnet import HRNet 4 | from .mobilenet_v2 import MobileNetV2 5 | from .mobilenet_v3 import MobileNetV3 6 | from .resnest import ResNeSt 7 | from .resnet import ResNet, ResNetV1c, ResNetV1d 8 | from .resnext import ResNeXt 9 | from .unet import UNet 10 | from .vit import VisionTransformer 11 | from .uniformer import UniFormer 12 | 13 | __all__ = [ 14 | 'ResNet', 'ResNetV1c', 'ResNetV1d', 'ResNeXt', 'HRNet', 15 | 'ResNeSt', 'MobileNetV2', 'UNet', 'CGNet', 'MobileNetV3', 16 | 'VisionTransformer', 'UniFormer' 17 | ] 18 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmseg/models/decode_heads/__init__.py: -------------------------------------------------------------------------------- 1 | from .ann_head import ANNHead 2 | from .apc_head import APCHead 3 | from .aspp_head import ASPPHead 4 | from .cc_head import CCHead 5 | from .da_head import DAHead 6 | from .dm_head import DMHead 7 | from .dnl_head import DNLHead 8 | from .ema_head import EMAHead 9 | from .enc_head import EncHead 10 | from .fcn_head import FCNHead 11 | from .fpn_head import FPNHead 12 | from .gc_head import GCHead 13 | from .lraspp_head import LRASPPHead 14 | from .nl_head import NLHead 15 | from .ocr_head import OCRHead 16 | # from .point_head import PointHead 17 | from .psa_head import PSAHead 18 | from .psp_head import PSPHead 19 | from .sep_aspp_head import DepthwiseSeparableASPPHead 20 | from .sep_fcn_head import DepthwiseSeparableFCNHead 21 | from .uper_head import UPerHead 22 | 23 | __all__ = [ 24 | 'FCNHead', 'PSPHead', 'ASPPHead', 'PSAHead', 'NLHead', 'GCHead', 'CCHead', 25 | 'UPerHead', 'DepthwiseSeparableASPPHead', 'ANNHead', 'DAHead', 'OCRHead', 26 | 'EncHead', 'DepthwiseSeparableFCNHead', 'FPNHead', 'EMAHead', 'DNLHead', 27 | 'APCHead', 'DMHead', 'LRASPPHead' 28 | ] 29 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmseg/models/losses/__init__.py: -------------------------------------------------------------------------------- 1 | from .accuracy import Accuracy, accuracy 2 | from .cross_entropy_loss import (CrossEntropyLoss, 
binary_cross_entropy, 3 | cross_entropy, mask_cross_entropy) 4 | from .dice_loss import DiceLoss 5 | from .lovasz_loss import LovaszLoss 6 | from .utils import reduce_loss, weight_reduce_loss, weighted_loss 7 | 8 | __all__ = [ 9 | 'accuracy', 'Accuracy', 'cross_entropy', 'binary_cross_entropy', 10 | 'mask_cross_entropy', 'CrossEntropyLoss', 'reduce_loss', 11 | 'weight_reduce_loss', 'weighted_loss', 'LovaszLoss', 'DiceLoss' 12 | ] 13 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmseg/models/necks/__init__.py: -------------------------------------------------------------------------------- 1 | from .fpn import FPN 2 | from .multilevel_neck import MultiLevelNeck 3 | 4 | __all__ = ['FPN', 'MultiLevelNeck'] 5 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmseg/models/segmentors/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import BaseSegmentor 2 | from .cascade_encoder_decoder import CascadeEncoderDecoder 3 | from .encoder_decoder import EncoderDecoder 4 | 5 | __all__ = ['BaseSegmentor', 'EncoderDecoder', 'CascadeEncoderDecoder'] 6 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmseg/models/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .drop import DropPath 2 | from .inverted_residual import InvertedResidual, InvertedResidualV3 3 | from .make_divisible import make_divisible 4 | from .res_layer import ResLayer 5 | from .se_layer import SELayer 6 | from .self_attention_block import SelfAttentionBlock 7 | from .up_conv_block import UpConvBlock 8 | from .weight_init import trunc_normal_ 9 | 10 | __all__ = [ 11 | 'ResLayer', 'SelfAttentionBlock', 'make_divisible', 'InvertedResidual', 12 | 'UpConvBlock', 'InvertedResidualV3', 'SELayer', 'DropPath', 'trunc_normal_' 13 | ] 14 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmseg/models/utils/drop.py: -------------------------------------------------------------------------------- 1 | """Modified from https://github.com/rwightman/pytorch-image- 2 | models/blob/master/timm/models/layers/drop.py.""" 3 | 4 | import torch 5 | from torch import nn 6 | 7 | 8 | class DropPath(nn.Module): 9 | """Drop paths (Stochastic Depth) per sample (when applied in main path of 10 | residual blocks). 11 | 12 | Args: 13 | drop_prob (float): Drop rate for paths of model. Dropout rate has 14 | to be between 0 and 1. Default: 0. 15 | """ 16 | 17 | def __init__(self, drop_prob=0.): 18 | super(DropPath, self).__init__() 19 | self.drop_prob = drop_prob 20 | self.keep_prob = 1 - drop_prob 21 | 22 | def forward(self, x): 23 | if self.drop_prob == 0. 
or not self.training: 24 | return x 25 | shape = (x.shape[0], ) + (1, ) * ( 26 | x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets 27 | random_tensor = self.keep_prob + torch.rand( 28 | shape, dtype=x.dtype, device=x.device) 29 | random_tensor.floor_() # binarize 30 | output = x.div(self.keep_prob) * random_tensor 31 | return output 32 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmseg/ops/__init__.py: -------------------------------------------------------------------------------- 1 | from .encoding import Encoding 2 | from .wrappers import Upsample, resize 3 | 4 | __all__ = ['Upsample', 'resize', 'Encoding'] 5 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmseg/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .collect_env import collect_env 2 | from .logger import get_root_logger 3 | 4 | __all__ = ['get_root_logger', 'collect_env'] 5 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmseg/utils/collect_env.py: -------------------------------------------------------------------------------- 1 | from annotator.uniformer.mmcv.utils import collect_env as collect_base_env 2 | from annotator.uniformer.mmcv.utils import get_git_hash 3 | 4 | import annotator.uniformer.mmseg as mmseg 5 | 6 | 7 | def collect_env(): 8 | """Collect the information of the running environments.""" 9 | env_info = collect_base_env() 10 | env_info['MMSegmentation'] = f'{mmseg.__version__}+{get_git_hash()[:7]}' 11 | 12 | return env_info 13 | 14 | 15 | if __name__ == '__main__': 16 | for name, val in collect_env().items(): 17 | print('{}: {}'.format(name, val)) 18 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmseg/utils/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from annotator.uniformer.mmcv.utils import get_logger 4 | 5 | 6 | def get_root_logger(log_file=None, log_level=logging.INFO): 7 | """Get the root logger. 8 | 9 | The logger will be initialized if it has not been initialized. By default a 10 | StreamHandler will be added. If `log_file` is specified, a FileHandler will 11 | also be added. The name of the root logger is the top-level package name, 12 | e.g., "mmseg". 13 | 14 | Args: 15 | log_file (str | None): The log filename. If specified, a FileHandler 16 | will be added to the root logger. 17 | log_level (int): The root logger level. Note that only the process of 18 | rank 0 is affected, while other processes will set the level to 19 | "Error" and be silent most of the time. 20 | 21 | Returns: 22 | logging.Logger: The root logger. 
23 | """ 24 | 25 | logger = get_logger(name='mmseg', log_file=log_file, log_level=log_level) 26 | 27 | return logger 28 | -------------------------------------------------------------------------------- /lavis/common/annotator/util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cv2 3 | import os 4 | 5 | 6 | annotator_ckpts_path = os.path.join(os.path.dirname(__file__), 'ckpts') 7 | 8 | 9 | def HWC3(x): 10 | assert x.dtype == np.uint8 11 | if x.ndim == 2: 12 | x = x[:, :, None] 13 | assert x.ndim == 3 14 | H, W, C = x.shape 15 | assert C == 1 or C == 3 or C == 4 16 | if C == 3: 17 | return x 18 | if C == 1: 19 | return np.concatenate([x, x, x], axis=2) 20 | if C == 4: 21 | color = x[:, :, 0:3].astype(np.float32) 22 | alpha = x[:, :, 3:4].astype(np.float32) / 255.0 23 | y = color * alpha + 255.0 * (1.0 - alpha) 24 | y = y.clip(0, 255).astype(np.uint8) 25 | return y 26 | 27 | 28 | def resize_image(input_image, resolution): 29 | H, W, C = input_image.shape 30 | H = float(H) 31 | W = float(W) 32 | k = float(resolution) / min(H, W) 33 | H *= k 34 | W *= k 35 | H = int(np.round(H / 64.0)) * 64 36 | W = int(np.round(W / 64.0)) * 64 37 | img = cv2.resize(input_image, (W, H), interpolation=cv2.INTER_LANCZOS4 if k > 1 else cv2.INTER_AREA) 38 | return img 39 | -------------------------------------------------------------------------------- /lavis/common/gradcam.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from matplotlib import pyplot as plt 3 | from scipy.ndimage import filters 4 | from skimage import transform as skimage_transform 5 | 6 | 7 | def getAttMap(img, attMap, blur=True, overlap=True): 8 | attMap -= attMap.min() 9 | if attMap.max() > 0: 10 | attMap /= attMap.max() 11 | attMap = skimage_transform.resize(attMap, (img.shape[:2]), order=3, mode="constant") 12 | if blur: 13 | attMap = filters.gaussian_filter(attMap, 0.02 * max(img.shape[:2])) 14 | attMap -= attMap.min() 15 | attMap /= attMap.max() 16 | cmap = plt.get_cmap("jet") 17 | attMapV = cmap(attMap) 18 | attMapV = np.delete(attMapV, 3, 2) 19 | if overlap: 20 | attMap = ( 21 | 1 * (1 - attMap**0.7).reshape(attMap.shape + (1,)) * img 22 | + (attMap**0.7).reshape(attMap.shape + (1,)) * attMapV 23 | ) 24 | return attMap 25 | -------------------------------------------------------------------------------- /lavis/common/vqa_tools/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | __author__ = "aagrawal" 9 | -------------------------------------------------------------------------------- /lavis/configs/datasets/blip_diffusion_datasets/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | blip_diffusion_finetune: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | images: 14 | storage: "" 15 | -------------------------------------------------------------------------------- /lavis/configs/datasets/conceptual_caption/defaults_12m.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | conceptual_caption_12m: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | - /export/home/workspace/datasets/cc12m.json 17 | storage: 18 | - conceptual_caption/annotations/cc12m.json 19 | images: 20 | storage: conceptual_caption/images_12m 21 | -------------------------------------------------------------------------------- /lavis/configs/datasets/conceptual_caption/defaults_3m.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | conceptual_caption_3m: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | - /export/home/workspace/datasets/cc3m.json 17 | storage: 18 | - conceptual_caption/annotations/cc3m.json 19 | images: 20 | storage: conceptual_caption/images 21 | -------------------------------------------------------------------------------- /lavis/configs/datasets/flickr30k/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | flickr30k: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images 10 | 11 | build_info: 12 | annotations: 13 | train: 14 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_train.json 15 | storage: flickr30k/annotations/train.json 16 | val: 17 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_val.json 18 | storage: flickr30k/annotations/val.json 19 | test: 20 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_test.json 21 | storage: flickr30k/annotations/test.json 22 | images: 23 | storage: flickr30k/images 24 | # storage: /export/share/datasets/vision/flickr30k 25 | -------------------------------------------------------------------------------- /lavis/configs/datasets/imagenet/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | imagenet: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | splits: ["val"] 14 | images: 15 | storage: /export/share/datasets/vision/imagenet 16 | -------------------------------------------------------------------------------- /lavis/configs/datasets/laion/defaults_2B_multi.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | laion2B_multi: 8 | 9 | data_type: images 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | storage: /export/laion/laion2B-multi/part-00000/{00000..01743}.tar 14 | -------------------------------------------------------------------------------- /lavis/configs/datasets/laion/defaults_400M.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | laion400M: 8 | 9 | data_type: images 10 | 11 | text_processor: 12 | train: 13 | name: blip_caption 14 | eval: 15 | name: blip_caption 16 | 17 | build_info: 18 | # Be careful not to append minus sign (-) before split to avoid itemizing 19 | storage: /export/laion400m-data-ssd/laion115m_capfilt_20220817/{part0/part0,part1/part1,part2/part2}_node{00..15}_shard{000000..000118}.tar 20 | # storage: /export/laion/laion2B-multi/part-00000/{00000..01743}.tar 21 | -------------------------------------------------------------------------------- /lavis/configs/datasets/laion/defaults_400M_instruct.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | laion400M_instruct: 8 | 9 | data_type: images 10 | 11 | vis_processor: 12 | train: 13 | name: "clip_image_train" 14 | image_size: 224 15 | eval: 16 | name: "clip_image_eval" 17 | image_size: 224 18 | 19 | 20 | text_processor: 21 | train: 22 | name: blip_instruction 23 | modality: image 24 | task: caption 25 | eval: 26 | name: blip_caption 27 | 28 | build_info: 29 | # Be careful not to append minus sign (-) before split to avoid itemizing 30 | storage: /export/laion400m-data-ssd/laion115m_capfilt_20220817/{part0/part0,part1/part1,part2/part2}_node{00..15}_shard{000000..000118}.tar 31 | # storage: /export/laion/laion2B-multi/part-00000/{00000..01743}.tar 32 | -------------------------------------------------------------------------------- /lavis/configs/datasets/llava150k/defaults_dial.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | llava150k_dialogue_instruct: #394276 train examples 8 | 9 | data_type: images 10 | 11 | vis_processor: 12 | train: 13 | name: "clip_image_train" 14 | image_size: 224 15 | eval: 16 | name: "clip_image_eval" 17 | image_size: 224 18 | 19 | text_processor: 20 | train: 21 | name: "blip_caption" 22 | 23 | build_info: 24 | annotations: 25 | train: 26 | url: 27 | - https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/resolve/main/llava_instruct_150k.json 28 | storage: 29 | - LLaVA-Instruct-150K/annotations/lava_instruct_150k.json 30 | # Be careful not to append minus sign (-) before split to avoid itemizing 31 | images: 32 | storage: /export/share/datasets/vision/coco/images/train2017 33 | -------------------------------------------------------------------------------- /lavis/configs/datasets/msrvtt/defaults_cap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | msrvtt_cap: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_train.json 16 | storage: msrvtt/annotations/cap_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_val.json 19 | storage: msrvtt/annotations/cap_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_test.json 22 | storage: msrvtt/annotations/cap_test.json 23 | videos: 24 | storage: msrvtt/videos 25 | -------------------------------------------------------------------------------- /lavis/configs/datasets/msvd/defaults_cap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 
2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | msvd_cap: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_train.json 16 | storage: msvd/annotations/cap_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_val.json 19 | storage: msvd/annotations/cap_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_test.json 22 | storage: msvd/annotations/cap_test.json 23 | videos: 24 | storage: msvd/videos 25 | -------------------------------------------------------------------------------- /lavis/configs/datasets/nlvr/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | nlvr: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_train.json 16 | storage: nlvr/annotations/train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_dev.json 19 | storage: nlvr/annotations/dev.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_dev.json 22 | storage: nlvr/annotations/test.json 23 | images: 24 | storage: /export/share/datasets/vision/NLVR2/ 25 | -------------------------------------------------------------------------------- /lavis/configs/datasets/nocaps/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | nocaps: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | val: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/nocaps_val.json 16 | storage: nocaps/annotations/nocaps_val.json 17 | test: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/nocaps_test.json 19 | storage: nocaps/annotations/nocaps_test.json 20 | images: 21 | storage: nocaps/images 22 | # storage: /export/share/datasets/vision/nocaps/ 23 | -------------------------------------------------------------------------------- /lavis/configs/datasets/sbu_caption/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | sbu_caption: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/sbu/sbu.json 17 | # - /export/share/dongxuli/data/lavis/sbu/annotation/sbu.json 18 | storage: 19 | - sbu_captions/annotations/sbu.json 20 | images: 21 | storage: sbu_captions/images 22 | # storage: /export/share/datasets/vision_language/sbu_resize 23 | -------------------------------------------------------------------------------- /lavis/configs/datasets/snli_ve/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | snli_ve: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: /export/share/dongxuli/data/lavis/snli/annotation/ve_train.json 16 | storage: snli/annotations/ve_train.json 17 | val: 18 | url: /export/share/dongxuli/data/lavis/snli/annotation/ve_dev.json 19 | storage: snli/annotations/ve_dev.json 20 | test: 21 | url: /export/share/dongxuli/data/lavis/snli/annotation/ve_test.json 22 | storage: snli/annotations/ve_test.json 23 | images: 24 | storage: flickr30k/images/flickr30k-images 25 | # storage: /export/share/datasets/vision/flickr30k/flickr30k-images 26 | -------------------------------------------------------------------------------- /lavis/configs/datasets/vg/defaults_caption.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | vg_caption: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/visual_genome/vg_caption.json 16 | storage: vg/annotations/vg_caption.json 17 | images: 18 | storage: vg/images/ 19 | -------------------------------------------------------------------------------- /lavis/configs/datasets/vg/defaults_vqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | vg_vqa: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/visual_genome/vg_qa.json 16 | storage: vg/annotations/vg_qa.json 17 | images: 18 | storage: vg/images/ 19 | -------------------------------------------------------------------------------- /lavis/configs/default.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | env: 7 | # For default users 8 | # cache_root: "cache" 9 | # For internal use with persistent storage 10 | cache_root: "/export/home/.cache/lavis" 11 | -------------------------------------------------------------------------------- /lavis/configs/models/albef_classification_ve.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_classification 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_snli_ve_lavis.pt" 11 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 12 | 13 | num_classes: 3 14 | 15 | use_distill: True 16 | momentum: 0.995 17 | alpha: 0.4 18 | 19 | # vit encoder 20 | vit_type: "base" 21 | vit_grad_ckpt: False 22 | vit_ckpt_layer: 0 23 | vit_layer_norm_epsilon: 1e-6 24 | 25 | image_size: 384 26 | 27 | # bert config 28 | med_config_path: "configs/models/med_config_albef.json" 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | eval: 35 | name: "blip_image_eval" 36 | text_processor: 37 | train: 38 | name: "blip_caption" 39 | eval: 40 | name: "blip_caption" 41 | -------------------------------------------------------------------------------- /lavis/configs/models/albef_feature_extractor.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_pretrain 8 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 9 | 10 | # vit encoder 11 | vit_type: "base" 12 | image_size: 224 13 | vit_ckpt_layer: 0 14 | vit_drop_path_rate: 0 15 | vit_layer_norm_epsilon: 1e-6 16 | vit_grad_ckpt: False 17 | 18 | # bert config 19 | med_config_path: "configs/models/med_config_albef.json" 20 | 21 | embed_dim: 256 22 | 23 | preprocess: 24 | vis_processor: 25 | eval: 26 | name: "blip_image_eval" 27 | image_size: 224 28 | text_processor: 29 | eval: 30 | name: "blip_caption" 31 | -------------------------------------------------------------------------------- /lavis/configs/models/albef_pretrain_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_pretrain 8 | 9 | load_pretrained: True 10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 11 | 12 | # vit encoder 13 | vit_type: "base" 14 | image_size: 224 15 | vit_ckpt_layer: 0 16 | vit_drop_path_rate: 0 17 | vit_layer_norm_epsilon: 1e-6 18 | vit_grad_ckpt: False 19 | 20 | # bert config 21 | med_config_path: "configs/models/med_config_albef.json" 22 | mlm_mask_prob: 0.15 23 | 24 | embed_dim: 256 25 | momentum: 0.995 26 | alpha: 0.4 27 | temp: 0.07 28 | 29 | max_txt_len: 30 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip_image_train" 35 | image_size: 256 36 | text_processor: 37 | train: 38 | name: "blip_caption" 39 | -------------------------------------------------------------------------------- /lavis/configs/models/alpro_retrieval_didemo.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_retrieval 8 | 9 | load_finetuned: True 10 | 11 | finetuned: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_didemo_retrieval.pt 12 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" 13 | 14 | timesformer: 15 | n_frms: 8 16 | image_size: 224 17 | 18 | patch_size: 16 19 | attn_drop_rate: 0. 20 | drop_rate: 0. 21 | drop_path_rate: 0.1 22 | use_grad_ckpt: False 23 | 24 | # bert config 25 | med_config_path: "configs/models/bert_config_alpro.json" 26 | 27 | preprocess: 28 | vis_processor: 29 | eval: 30 | name: "alpro_video_eval" 31 | n_frms: 8 32 | image_size: 224 33 | text_processor: 34 | eval: 35 | name: "blip_caption" 36 | -------------------------------------------------------------------------------- /lavis/configs/models/bert_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30522, 19 | "encoder_width": 768, 20 | "add_cross_attention": true 21 | } -------------------------------------------------------------------------------- /lavis/configs/models/bert_config_alpro.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": true, 18 | "type_vocab_size": 2, 19 | "vocab_size": 30522, 20 | "encoder_width": 768, 21 | "add_cross_attention": false, 22 | "fusion_layer": 6 23 | } -------------------------------------------------------------------------------- /lavis/configs/models/blip-diffusion/blip_diffusion_base.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | vit_model: "clip_L" 3 | 4 | qformer_num_query_token: 16 5 | qformer_cross_attention_freq: 1 6 | 7 | sd_train_text_encoder: False 8 | sd_pretrained_model_name_or_path: "runwayml/stable-diffusion-v1-5" 9 | 10 | load_finetuned: False 11 | load_pretrained: True 12 | # pretrained: "/export/share/dongxuli/zerobooth/500000-renamed/" 13 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP-Diffusion/blip-diffusion.tar.gz" 14 | 15 | preprocess: 16 | vis_processor: 17 | train: 18 | name: "blip_diffusion_inp_image_eval" 19 | eval: 20 | name: "blip_diffusion_inp_image_eval" 21 | text_processor: 22 | train: 23 | name: "blip_caption" 24 | eval: 25 | name: "blip_caption" 26 | -------------------------------------------------------------------------------- 
/lavis/configs/models/blip-diffusion/blip_diffusion_controlnet_canny.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | vit_model: "clip_L" 3 | 4 | qformer_num_query_token: 16 5 | qformer_cross_attention_freq: 1 6 | 7 | sd_train_text_encoder: False 8 | sd_pretrained_model_name_or_path: "runwayml/stable-diffusion-v1-5" 9 | 10 | load_finetuned: False 11 | load_pretrained: True 12 | # pretrained: "/export/share/dongxuli/zerobooth/500000-renamed/" 13 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP-Diffusion/blip-diffusion.tar.gz" 14 | 15 | controlnet_pretrained_model_name_or_path: "lllyasviel/sd-controlnet-canny" 16 | 17 | preprocess: 18 | vis_processor: 19 | train: 20 | name: "blip_diffusion_inp_image_eval" 21 | eval: 22 | name: "blip_diffusion_inp_image_eval" 23 | text_processor: 24 | train: 25 | name: "blip_caption" 26 | eval: 27 | name: "blip_caption" 28 | -------------------------------------------------------------------------------- /lavis/configs/models/blip-diffusion/blip_diffusion_controlnet_depth.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | vit_model: "clip_L" 3 | 4 | qformer_num_query_token: 16 5 | qformer_cross_attention_freq: 1 6 | 7 | sd_train_text_encoder: False 8 | sd_pretrained_model_name_or_path: "runwayml/stable-diffusion-v1-5" 9 | 10 | load_finetuned: False 11 | load_pretrained: True 12 | # pretrained: "/export/share/dongxuli/zerobooth/500000-renamed/" 13 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP-Diffusion/blip-diffusion-openimage.tar.gz" 14 | 15 | controlnet_pretrained_model_name_or_path: "lllyasviel/sd-controlnet-depth" 16 | 17 | preprocess: 18 | vis_processor: 19 | train: 20 | name: "blip_diffusion_inp_image_eval" 21 | eval: 22 | name: "blip_diffusion_inp_image_eval" 23 | text_processor: 24 | train: 25 | name: "blip_caption" 26 | eval: 27 | name: "blip_caption" 28 | -------------------------------------------------------------------------------- /lavis/configs/models/blip-diffusion/blip_diffusion_controlnet_hed.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | vit_model: "clip_L" 3 | 4 | qformer_num_query_token: 16 5 | qformer_cross_attention_freq: 1 6 | 7 | sd_train_text_encoder: False 8 | sd_pretrained_model_name_or_path: "runwayml/stable-diffusion-v1-5" 9 | 10 | load_finetuned: False 11 | load_pretrained: True 12 | # pretrained: "/export/share/dongxuli/zerobooth/500000-renamed/" 13 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP-Diffusion/blip-diffusion-openimage.tar.gz" 14 | 15 | controlnet_pretrained_model_name_or_path: "lllyasviel/sd-controlnet-hed" 16 | 17 | preprocess: 18 | vis_processor: 19 | train: 20 | name: "blip_diffusion_inp_image_eval" 21 | eval: 22 | name: "blip_diffusion_inp_image_eval" 23 | text_processor: 24 | train: 25 | name: "blip_caption" 26 | eval: 27 | name: "blip_caption" 28 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: coco 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_finetune_coco.pth" 12 | 13 | # vit encoder 14 | image_size: 364 15 | drop_path_rate: 0 16 | use_grad_checkpoint: True 17 | vit_precision: "fp32" 18 | freeze_vit: False 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 364 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 364 32 | text_processor: 33 | train: 34 | name: "blip_caption" 35 | eval: 36 | name: "blip_caption" 37 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_pretrain.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 224 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 224 32 | text_processor: 33 | train: 34 | name: "blip_caption" 35 | eval: 36 | name: "blip_caption" 37 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_flant5xl 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xl.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # T5 24 | t5_model: "google/flan-t5-xl" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 
2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_flant5xxl 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xxl.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # T5 24 | t5_model: "google/flan-t5-xxl" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_opt2.7b 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt2.7b.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-2.7b" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_opt6.7b 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt6.7b.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-6.7b" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_pretrain_vitL.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_vitL.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | vit_model: "clip_L" 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | 25 | preprocess: 26 | vis_processor: 27 | train: 28 | name: "blip_image_train" 29 | image_size: 224 30 | eval: 31 | name: "blip_image_eval" 32 | image_size: 224 33 | text_processor: 34 | train: 35 | name: "blip_caption" 36 | eval: 37 | name: "blip_caption" 38 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_caption_large_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption.pth" 12 | 13 | vit_type: "large" 14 | vit_grad_ckpt: True 15 | vit_ckpt_layer: 5 16 | 17 | image_size: 384 18 | 19 | # bert config 20 | med_config_path: "configs/models/med_large_config.json" 21 | 22 | # generation configs 23 | prompt: "a picture of " 24 | 25 | 26 | preprocess: 27 | vis_processor: 28 | train: 29 | name: "blip_image_train" 30 | eval: 31 | name: "blip_image_eval" 32 | text_processor: 33 | train: 34 | name: "blip_caption" 35 | prompt: "a picture of " 36 | eval: 37 | name: "blip_caption" 38 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_classification_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_classification 8 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth" 9 | 10 | use_distill: True 11 | momentum: 0.995 12 | alpha: 0.4 13 | 14 | # vit encoder 15 | vit_type: "base" 16 | vit_grad_ckpt: False 17 | vit_ckpt_layer: 0 18 | 19 | image_size: 384 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_feature_extractor_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_pretrain 8 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 9 | 10 | # vit encoder 11 | vit_type: "base" 12 | vit_grad_ckpt: False 13 | vit_ckpt_layer: 0 14 | 15 | image_size: 224 16 | 17 | # bert config 18 | med_config_path: "configs/models/med_config.json" 19 | 20 | embed_dim: 256 21 | 22 | preprocess: 23 | vis_processor: 24 | eval: 25 | name: "blip_image_eval" 26 | image_size: 224 27 | text_processor: 28 | eval: 29 | name: "blip_caption" 30 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_itm_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_image_text_matching 8 | 9 | load_finetuned: True 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_retrieval_coco.pth" 11 | 12 | # vit encoder 13 | vit_type: "base" 14 | vit_grad_ckpt: False 15 | vit_ckpt_layer: 0 16 | 17 | image_size: 384 18 | 19 | # bert config 20 | med_config_path: "configs/models/med_config.json" 21 | 22 | embed_dim: 256 23 | 24 | preprocess: 25 | vis_processor: 26 | eval: 27 | name: "blip_image_eval" 28 | image_size: 384 29 | text_processor: 30 | eval: 31 | name: "blip_caption" 32 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_itm_large.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_image_text_matching 8 | 9 | load_finetuned: True 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco.pth" 11 | 12 | # vit encoder 13 | vit_type: "large" 14 | vit_grad_ckpt: False 15 | vit_ckpt_layer: 0 16 | 17 | image_size: 384 18 | 19 | # bert config 20 | med_config_path: "configs/models/med_large_config.json" 21 | 22 | embed_dim: 256 23 | 24 | preprocess: 25 | vis_processor: 26 | eval: 27 | name: "blip_image_eval" 28 | image_size: 384 29 | text_processor: 30 | eval: 31 | name: "blip_caption" 32 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_pretrain_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_pretrain 8 | 9 | load_pretrained: True 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 11 | 12 | # vit encoder 13 | vit_type: "base" 14 | vit_grad_ckpt: False 15 | vit_ckpt_layer: 0 16 | 17 | image_size: 224 18 | alpha: 0.4 19 | 20 | # bert config 21 | med_config_path: "configs/models/bert_config.json" 22 | 23 | embed_dim: 256 24 | 25 | # generation configs 26 | prompt: "a picture of " 27 | 28 | preprocess: 29 | vis_processor: 30 | train: 31 | name: "blip_image_train" 32 | image_size: 224 33 | text_processor: 34 | train: 35 | name: "blip_caption" 36 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_pretrain_large.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_pretrain 8 | 9 | # vit encoder 10 | vit_type: "large" 11 | vit_grad_ckpt: True 12 | vit_ckpt_layer: 5 13 | 14 | image_size: 224 15 | 16 | # bert config 17 | med_config_path: "configs/models/med_large_config.json" 18 | 19 | embed_dim: 256 20 | 21 | # generation configs 22 | prompt: "a picture of " 23 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_retrieval_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_retrieval 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_coco_retrieval.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 12 | 13 | queue_size: 57600 14 | 15 | # vit encoder 16 | vit_type: "base" 17 | vit_grad_ckpt: True 18 | vit_ckpt_layer: 4 19 | 20 | image_size: 384 21 | 22 | # bert config 23 | med_config_path: "configs/models/med_config.json" 24 | 25 | embed_dim: 256 26 | 27 | preprocess: 28 | vis_processor: 29 | train: 30 | name: "blip_image_train" 31 | image_size: 384 32 | eval: 33 | name: "blip_image_eval" 34 | image_size: 384 35 | text_processor: 36 | train: 37 | name: "blip_caption" 38 | eval: 39 | name: "blip_caption" 40 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_vqa_aokvqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_aokvqa.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | vit_drop_path_rate: 0.1 18 | 19 | image_size: 480 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 480 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 480 32 | text_processor: 33 | train: 34 | name: "blip_question" 35 | eval: 36 | name: "blip_question" 37 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_vqa_okvqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_okvqa.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | vit_drop_path_rate: 0.1 18 | 19 | image_size: 480 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 480 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 480 32 | text_processor: 33 | train: 34 | name: "blip_question" 35 | eval: 36 | name: "blip_question" 37 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_vqav2.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | vit_drop_path_rate: 0.1 18 | 19 | image_size: 480 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 480 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 480 32 | text_processor: 33 | train: 34 | name: "blip_question" 35 | eval: 36 | name: "blip_question" 37 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/RN101-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 23, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/RN101.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 23, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/RN50-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | 
"image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 6, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/RN50.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 6, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/RN50x16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 384, 5 | "layers": [ 6 | 6, 7 | 8, 8 | 18, 9 | 8 10 | ], 11 | "width": 96, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 768, 18 | "heads": 12, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/RN50x4.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 288, 5 | "layers": [ 6 | 4, 7 | 6, 8 | 10, 9 | 6 10 | ], 11 | "width": 80, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 640, 18 | "heads": 10, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-B-16-plus-240.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 240, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-B-16-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-B-32-plus-256.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 
| "image_size": 256, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-B-32-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": 12, 7 | "width": 768, 8 | "patch_size": 32 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-H-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 14 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-H-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 16 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-L-14-280.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 280, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-L-14-336.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 336, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-L-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | 
"patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-L-16-320.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 320, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-L-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-g-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 40, 6 | "width": 1408, 7 | "head_width": 88, 8 | "mlp_ratio": 4.3637, 9 | "patch_size": 14 10 | }, 11 | "text_cfg": { 12 | "context_length": 77, 13 | "vocab_size": 49408, 14 | "width": 1024, 15 | "heads": 16, 16 | "layers": 24 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/timm-efficientnetv2_rw_s.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "timm_model_name": "efficientnetv2_rw_s", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 288 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 768, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/timm-resnet50d.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "resnet50d", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/timm-resnetaa50d.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "resnetaa50d", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/timm-resnetblur50.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "resnetblur50", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/timm-swin_base_patch4_window7_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "swin_base_patch4_window7_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/timm-vit_base_patch16_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_base_patch16_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/timm-vit_base_patch32_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_base_patch32_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/timm-vit_small_patch16_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_small_patch16_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip_resnet50.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: RN50 10 | 11 | pretrained: openai 12 | -------------------------------------------------------------------------------- /lavis/configs/models/clip_vit_base16.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
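Editor's note: the CLIP JSON files above are open_clip-style architecture descriptions: embed_dim is the joint embedding size, vision_cfg describes either a ResNet (layers as a four-stage list) or a ViT (layers as an integer plus patch_size) or a timm backbone, and text_cfg fixes the text transformer. An illustrative, standard-library-only way to inspect one (path relative to the repository root):

import json

with open("lavis/configs/models/clip/ViT-B-16.json") as f:   # file shown above
    cfg = json.load(f)

print(cfg["embed_dim"])                    # 512: joint image-text embedding size
print(cfg["vision_cfg"]["patch_size"])     # 16: ViT patch size
print(cfg["text_cfg"]["context_length"])   # 77: maximum number of text tokens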
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-B-16 10 | 11 | pretrained: openai 12 | 13 | preprocess: 14 | vis_processor: 15 | eval: 16 | name: "clip_image_eval" 17 | image_size: 224 18 | -------------------------------------------------------------------------------- /lavis/configs/models/gpt_dialogue_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: gpt_dialogue 8 | # pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth" 9 | # pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 10 | 11 | len_tokenizer: 50264 # 50257 tokens from gpt2 default tokenizer + additional special tokens 12 | 13 | len_video_ft: 4224 # i3d_rgb: 2048 i3d_flow: 2048 vggish: 128 14 | 15 | preprocess: 16 | vis_processor: 17 | train: 18 | name: "gpt_video_ft" 19 | eval: 20 | name: "gpt_video_ft" 21 | text_processor: 22 | train: 23 | name: "gpt_dialogue" 24 | eval: 25 | name: "gpt_dialogue" -------------------------------------------------------------------------------- /lavis/configs/models/med_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30524, 19 | "encoder_width": 768, 20 | "add_cross_attention": true 21 | } -------------------------------------------------------------------------------- /lavis/configs/models/med_config_albef.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30522, 19 | "encoder_width": 768, 20 | "add_cross_attention": true, 21 | "fusion_layer": 6 22 | } -------------------------------------------------------------------------------- /lavis/configs/models/med_large_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | 
"num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30524, 19 | "encoder_width": 1024, 20 | "add_cross_attention": true 21 | } -------------------------------------------------------------------------------- /lavis/datasets/builders/audio_qa_builder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2023, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.common.registry import registry 9 | from lavis.datasets.builders.audio_caption_builder import AudioCapBuilder 10 | from lavis.datasets.datasets.audio_qa_datasets import AudioCapsQADataset, ClothoQADataset 11 | 12 | @registry.register_builder("audiocaps_mm_qa") 13 | class AudioCapsQABuilder(AudioCapBuilder): 14 | train_dataset_cls = AudioCapsQADataset 15 | eval_dataset_cls = AudioCapsQADataset 16 | 17 | DATASET_CONFIG_DICT = { 18 | "default": "configs/datasets/audiocaps/defaults_mm_qa.yaml", 19 | } 20 | 21 | @registry.register_builder("clotho_qa") 22 | class ClothoQABuilder(AudioCapBuilder): 23 | train_dataset_cls = ClothoQADataset 24 | eval_dataset_cls = ClothoQADataset 25 | 26 | DATASET_CONFIG_DICT = { 27 | "default": "configs/datasets/clotho/defaults_mm_qa.yaml", 28 | } -------------------------------------------------------------------------------- /lavis/datasets/builders/discrn_builders.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.common.registry import registry 9 | from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder, MultiModalDatasetBuilder 10 | from lavis.datasets.datasets.discriminatory_reasoning_datasets import DisCRnDataset 11 | 12 | 13 | 14 | @registry.register_builder("image_pc_discrn") 15 | class DiscrnImagePcBuilder(MultiModalDatasetBuilder): 16 | eval_dataset_cls = DisCRnDataset 17 | 18 | DATASET_CONFIG_DICT = { 19 | "default": "configs/datasets/discriminatory_reasoning/defaults_mm_image_pc.yaml", 20 | } 21 | 22 | @registry.register_builder("audio_video_discrn") 23 | class DiscrnAudioVideoBuilder(MultiModalDatasetBuilder): 24 | eval_dataset_cls = DisCRnDataset 25 | 26 | DATASET_CONFIG_DICT = { 27 | "default": "configs/datasets/discriminatory_reasoning/defaults_mm_audio_video.yaml", 28 | } 29 | -------------------------------------------------------------------------------- /lavis/datasets/builders/object3d_classification_builder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.common.registry import registry 9 | from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder, MultiModalDatasetBuilder 10 | from lavis.datasets.datasets.object3d_classification_datasets import ModelNetClassificationDataset 11 | 12 | @registry.register_builder("modelnet40_cls") 13 | class ModelNetClassificationBuilder(MultiModalDatasetBuilder): 14 | train_dataset_cls = ModelNetClassificationDataset 15 | eval_dataset_cls = ModelNetClassificationDataset 16 | 17 | DATASET_CONFIG_DICT = { 18 | "default": "configs/datasets/modelnet40/defaults_cls.yaml", 19 | } -------------------------------------------------------------------------------- /lavis/datasets/builders/object3d_qa_builder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.common.registry import registry 9 | from lavis.datasets.builders.object3d_caption_builder import ObjaverseCaptionBuilder 10 | from lavis.datasets.datasets.object3d_qa_datasets import ObjaverseQADataset 11 | 12 | @registry.register_builder("objaverse_mm_qa") 13 | class ObjaverseQABuilder(ObjaverseCaptionBuilder): 14 | train_dataset_cls = ObjaverseQADataset 15 | eval_dataset_cls = ObjaverseQADataset 16 | 17 | DATASET_CONFIG_DICT = { 18 | "default": "configs/datasets/objaverse/defaults_mm_qa.yaml", 19 | } -------------------------------------------------------------------------------- /lavis/datasets/datasets/multimodal_classification_datasets.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from abc import abstractmethod 9 | from lavis.datasets.datasets.base_dataset import BaseDataset 10 | 11 | 12 | class MultimodalClassificationDataset(BaseDataset): 13 | def __init__(self, vis_processor, text_processor, vis_root, ann_paths): 14 | super().__init__(vis_processor, text_processor, vis_root, ann_paths) 15 | 16 | self.class_labels = None 17 | 18 | @abstractmethod 19 | def _build_class_labels(self): 20 | pass 21 | -------------------------------------------------------------------------------- /lavis/datasets/download_scripts/download_charade.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2023, salesforce.com, inc. 3 | All rights reserved. 
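Editor's note: the dataset builders above follow one pattern: subclass a builder, register it under a string key with @registry.register_builder, and map that key to a default dataset YAML via DATASET_CONFIG_DICT. A hedged sketch of resolving a registered builder; get_builder_class and the no-argument constructor falling back to the default config are assumptions about the upstream registry and BaseDatasetBuilder, and build_datasets() may attempt to download data.

from lavis.common.registry import registry
import lavis.datasets.builders.object3d_classification_builder  # runs the decorator above  # noqa: F401

builder_cls = registry.get_builder_class("modelnet40_cls")   # key registered above
builder = builder_cls()                 # assumed to fall back to the "default" YAML
datasets = builder.build_datasets()     # dict of split name -> dataset; may download data
print(list(datasets.keys()))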
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import json 9 | from tqdm import tqdm 10 | 11 | train_file = './train.jsonl' 12 | test_file = './test.jsonl' 13 | 14 | train_data = [json.loads(l.strip()) for l in open(train_file).readlines()] 15 | test_data = [json.loads(l.strip()) for l in open(test_file).readlines()] 16 | 17 | for d in tqdm(train_data): 18 | d['video_path'] = d['video_id'] + '.mp4' 19 | d['ts'] = [float(d['start']), float(d['end'])] 20 | 21 | for d in tqdm(test_data): 22 | d['video_path'] = d['video_id'] + '.mp4' 23 | d['ts'] = [float(d['start']), float(d['end'])] 24 | 25 | json.dump(train_data, open('train_lavis.json', 'w')) 26 | json.dump(test_data, open('test_lavis.json', 'w')) -------------------------------------------------------------------------------- /lavis/datasets/download_scripts/download_violin.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2023, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import json 9 | import os 10 | 11 | json_path = './violin_annotation.json' 12 | 13 | ## convert annotations 14 | all_json = json.load(open(json_path)) 15 | train_data = [v for v in all_json.values() if 'split' in v and v['split'] == 'train'] 16 | test_data = [v for v in all_json.values() if 'split' in v and v['split'] == 'test'] 17 | 18 | json.dump(train_data, open('train.json', 'w')) 19 | json.dump(test_data, open('test.json', 'w')) -------------------------------------------------------------------------------- /lavis/models/beats/LICENSE_BEATs.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) Microsoft Corporation 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
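Editor's note: the Charades and VIOLIN scripts above are annotation converters rather than downloaders: they reshape existing JSON/JSONL records into the layout LAVIS expects. A self-contained illustration of the per-record transformation the Charades script applies; only video_id, start and end are used by the script, the extra query field is hypothetical.

import json

# One input row in the shape the Charades converter above expects.
row = {"video_id": "ABC12", "start": "3.5", "end": "11.0", "query": "person opens a door"}

# The same transformation applied to every train/test record:
row["video_path"] = row["video_id"] + ".mp4"
row["ts"] = [float(row["start"]), float(row["end"])]

print(json.dumps(row, indent=2))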
-------------------------------------------------------------------------------- /lavis/models/blip2_models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/lavis/models/blip2_models/__init__.py -------------------------------------------------------------------------------- /lavis/models/blip_diffusion_models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/lavis/models/blip_diffusion_models/__init__.py -------------------------------------------------------------------------------- /lavis/models/clip_models/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | 7 | Based on https://github.com/mlfoundations/open_clip 8 | """ 9 | 10 | """ OpenAI pretrained model functions 11 | Adapted from https://github.com/mlfoundations/open_clip and https://github.com/openai/CLIP. 12 | 13 | Originally MIT License, Copyright (c) 2021 OpenAI. 14 | """ 15 | -------------------------------------------------------------------------------- /lavis/models/clip_models/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/lavis/models/clip_models/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /lavis/models/clip_models/pics/CLIP.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/lavis/models/clip_models/pics/CLIP.png -------------------------------------------------------------------------------- /lavis/models/img2prompt_models/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import torch 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /lavis/models/timesformer/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | 7 | Based on https://github.com/facebookresearch/TimeSformer 8 | """ 9 | -------------------------------------------------------------------------------- /lavis/models/timesformer/linear.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | """ Linear layer (alternate definition) 9 | """ 10 | import torch 11 | import torch.nn.functional as F 12 | from torch import nn as nn 13 | 14 | 15 | class Linear(nn.Linear): 16 | def forward(self, input: torch.Tensor) -> torch.Tensor: 17 | if torch.jit.is_scripting(): 18 | bias = self.bias.to(dtype=input.dtype) if self.bias is not None else None 19 | return F.linear(input, self.weight.to(dtype=input.dtype), bias=bias) 20 | else: 21 | return F.linear(input, self.weight, self.bias) 22 | -------------------------------------------------------------------------------- /lavis/models/ulip_models/pointbert/PointTransformer_8192point.yaml: -------------------------------------------------------------------------------- 1 | optimizer : { 2 | type: AdamW, 3 | kwargs: { 4 | lr : 0.0005, 5 | weight_decay : 0.05 6 | }} 7 | 8 | scheduler: { 9 | type: CosLR, 10 | kwargs: { 11 | epochs: 300, 12 | initial_epochs : 10 13 | }} 14 | 15 | model : { 16 | NAME: PointTransformer, 17 | trans_dim: 384, 18 | depth: 12, 19 | drop_path_rate: 0.1, 20 | cls_dim: 40, 21 | num_heads: 6, 22 | group_size: 32, 23 | num_group: 512, 24 | encoder_dims: 256, 25 | } 26 | npoints: 8192 27 | total_bs : 32 28 | step_per_update : 1 29 | max_epoch : 300 30 | grad_norm_clip : 10 31 | 32 | consider_metric: CDL1 -------------------------------------------------------------------------------- /lavis/models/ulip_models/ulip_scaled_up_config.yaml: -------------------------------------------------------------------------------- 1 | optimizer : { 2 | type: AdamW, 3 | kwargs: { 4 | lr : 0.0005, 5 | weight_decay : 0.05 6 | }} 7 | 8 | scheduler: { 9 | type: CosLR, 10 | kwargs: { 11 | epochs: 300, 12 | initial_epochs : 10 13 | }} 14 | 15 | model : { 16 | NAME: PointTransformer, 17 | trans_dim: 384, 18 | depth: 18, 19 | drop_path_rate: 0.1, 20 | cls_dim: 40, 21 | num_heads: 6, 22 | group_size: 32, 23 | num_group: 512, 24 | encoder_dims: 256, 25 | } 26 | npoints: 8192 27 | total_bs : 32 28 | step_per_update : 1 29 | max_epoch : 300 30 | grad_norm_clip : 10 31 | 32 | consider_metric: CDL1 -------------------------------------------------------------------------------- /lavis/models/ulip_models/utils/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | * Copyright (c) 2023, salesforce.com, inc. 3 | * All rights reserved. 4 | * SPDX-License-Identifier: BSD-3-Clause 5 | * For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | * By Le Xue 7 | ''' 8 | -------------------------------------------------------------------------------- /lavis/models/ulip_models/utils/build.py: -------------------------------------------------------------------------------- 1 | from utils import registry 2 | 3 | 4 | DATASETS = registry.Registry('dataset') 5 | 6 | 7 | def build_dataset_from_cfg(cfg, default_args = None): 8 | """ 9 | Build a dataset, defined by `dataset_name`. 10 | Args: 11 | cfg (eDICT): 12 | Returns: 13 | Dataset: a constructed dataset specified by dataset_name. 
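Editor's note: the Linear class above is a drop-in replacement for torch.nn.Linear that casts its weight and bias to the input dtype when running under TorchScript; outside scripting it behaves exactly like the standard layer. A tiny usage check:

import torch
from lavis.models.timesformer.linear import Linear

layer = Linear(16, 8)          # drop-in for nn.Linear
x = torch.randn(4, 16)
print(layer(x).shape)          # torch.Size([4, 8])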
14 | """ 15 | return DATASETS.build(cfg, default_args = default_args) 16 | 17 | 18 | -------------------------------------------------------------------------------- /lavis/processors/base_processor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from omegaconf import OmegaConf 9 | 10 | 11 | class BaseProcessor: 12 | def __init__(self): 13 | self.transform = lambda x: x 14 | return 15 | 16 | def __call__(self, item): 17 | return self.transform(item) 18 | 19 | @classmethod 20 | def from_config(cls, cfg=None): 21 | return cls() 22 | 23 | def build(self, **kwargs): 24 | cfg = OmegaConf.create(kwargs) 25 | 26 | return self.from_config(cfg) 27 | -------------------------------------------------------------------------------- /lavis/projects/albef/eval/nlvr_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_nlvr 8 | model_type: nlvr 9 | 10 | datasets: 11 | nlvr: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | image_size: 384 16 | text_processor: 17 | eval: 18 | name: "blip_caption" 19 | 20 | run: 21 | task: multimodal_classification 22 | 23 | batch_size_train: 16 24 | batch_size_eval: 64 25 | num_workers: 4 26 | 27 | seed: 42 28 | output_dir: "output/ALBEF/NLVR" 29 | 30 | evaluate: True 31 | test_splits: ["val", "test"] 32 | 33 | # distribution-specific 34 | device: "cuda" 35 | world_size: 1 36 | dist_url: "env://" 37 | distributed: True 38 | -------------------------------------------------------------------------------- /lavis/projects/albef/eval/ret_coco_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_retrieval 8 | model_type: coco 9 | 10 | datasets: 11 | coco_retrieval: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | image_size: 384 16 | text_processor: 17 | eval: 18 | name: "blip_caption" 19 | 20 | run: 21 | task: retrieval 22 | 23 | # dataloading 24 | num_workers: 4 25 | batch_size_train: 32 26 | batch_size_eval: 64 27 | 28 | test_splits: ["test"] 29 | 30 | # distribution 31 | device: "cuda" 32 | world_size: 1 33 | dist_url: "env://" 34 | distributed: True 35 | use_dist_eval_sampler: False 36 | 37 | # model specific 38 | k_test: 128 39 | 40 | # misc 41 | seed: 42 42 | output_dir: "output/ALBEF/Retrieval_COCO" 43 | 44 | evaluate: True 45 | -------------------------------------------------------------------------------- /lavis/projects/albef/eval/ret_flickr30k_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
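Editor's note: BaseProcessor above is the identity fallback behind the processor names referenced in the preprocess / vis_processor / text_processor sections of the YAMLs; concrete processors override from_config and __call__. A small example using only the class shown:

from lavis.processors.base_processor import BaseProcessor

proc = BaseProcessor()                  # identity transform by default
print(proc("an unchanged caption"))     # -> "an unchanged caption"

# build() round-trips keyword arguments through OmegaConf and hands them to
# from_config(); for the base class this still yields an identity processor.
proc2 = BaseProcessor().build()
print(proc2("still unchanged"))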
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_retrieval 8 | model_type: flickr 9 | 10 | datasets: 11 | flickr30k: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | image_size: 384 16 | text_processor: 17 | eval: 18 | name: "blip_caption" 19 | 20 | run: 21 | task: retrieval 22 | 23 | # dataloading 24 | num_workers: 4 25 | batch_size_train: 32 26 | batch_size_eval: 64 27 | 28 | test_splits: ["test"] 29 | 30 | # distribution 31 | device: "cuda" 32 | world_size: 1 33 | dist_url: "env://" 34 | distributed: True 35 | use_dist_eval_sampler: False 36 | 37 | # model specific 38 | k_test: 128 39 | 40 | # misc 41 | seed: 42 42 | output_dir: "output/ALBEF/Retrieval_Flickr30k" 43 | 44 | evaluate: True 45 | -------------------------------------------------------------------------------- /lavis/projects/albef/eval/snli_ve_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_classification 8 | model_type: ve 9 | 10 | datasets: 11 | snli_ve: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | text_processor: 16 | eval: 17 | name: "blip_caption" 18 | 19 | run: 20 | task: multimodal_classification 21 | # optimization-specific 22 | batch_size_train: 32 23 | batch_size_eval: 64 24 | num_workers: 4 25 | 26 | seed: 42 27 | output_dir: "output/ALBEF/SNLI_VE" 28 | 29 | evaluate: True 30 | test_splits: ["val", "test"] 31 | 32 | # distribution-specific 33 | device: "cuda" 34 | world_size: 1 35 | dist_url: "env://" 36 | distributed: True 37 | -------------------------------------------------------------------------------- /lavis/projects/albef/eval/vqa_test.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_vqa 8 | model_type: vqav2 9 | 10 | image_size: 384 11 | 12 | 13 | datasets: 14 | coco_vqa: # name of the dataset builder 15 | vis_processor: 16 | eval: 17 | name: "blip_image_eval" 18 | image_size: 384 19 | text_processor: 20 | eval: 21 | name: "blip_question" 22 | 23 | run: 24 | task: vqa 25 | 26 | # optimization-specific 27 | batch_size_train: 16 28 | batch_size_eval: 64 29 | num_workers: 4 30 | 31 | # inference-specific 32 | max_len: 10 33 | min_len: 1 34 | num_beams: 3 35 | num_ans_candidates: 128 36 | inference_method: "rank" 37 | 38 | seed: 42 39 | output_dir: "output/ALBEF/VQA" 40 | 41 | evaluate: True 42 | train_splits: ["train"] 43 | test_splits: ["test"] 44 | 45 | # distribution-specific 46 | device: "cuda" 47 | world_size: 1 48 | dist_url: "env://" 49 | distributed: True 50 | -------------------------------------------------------------------------------- /lavis/projects/albef/eval/vqa_val.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_vqa 8 | model_type: vqav2 9 | 10 | image_size: 384 11 | 12 | datasets: 13 | coco_vqa: # name of the dataset builder 14 | type: eval 15 | vis_processor: 16 | eval: 17 | name: "blip_image_eval" 18 | image_size: 384 19 | text_processor: 20 | eval: 21 | name: "blip_question" 22 | 23 | run: 24 | task: vqa 25 | 26 | # optimization-specific 27 | batch_size_train: 16 28 | batch_size_eval: 64 29 | num_workers: 4 30 | 31 | # inference-specific 32 | max_len: 10 33 | min_len: 1 34 | num_beams: 3 35 | num_ans_candidates: 128 36 | inference_method: "rank" 37 | 38 | seed: 42 39 | output_dir: "output/ALBEF/VQA" 40 | 41 | evaluate: True 42 | test_splits: ["val"] 43 | 44 | # distribution-specific 45 | device: "cuda" 46 | world_size: 1 47 | dist_url: "env://" 48 | distributed: True 49 | -------------------------------------------------------------------------------- /lavis/projects/alpro/eval/msrvtt_qa_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_qa 8 | model_type: msrvtt 9 | 10 | datasets: 11 | msrvtt_qa: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "alpro_video_eval" 15 | n_frms: 16 16 | image_size: 224 17 | text_processor: 18 | eval: 19 | name: "blip_caption" 20 | 21 | run: 22 | task: multimodal_classification 23 | # optimization-specific 24 | batch_size_train: 32 25 | batch_size_eval: 64 26 | num_workers: 4 27 | 28 | seed: 42 29 | output_dir: "output/ALPRO/msrvtt_qa" 30 | 31 | evaluate: True 32 | valid_splits: ["val"] 33 | test_splits: ["test"] 34 | 35 | # distribution-specific 36 | device: "cuda" 37 | world_size: 1 38 | dist_url: "env://" 39 | distributed: True 40 | -------------------------------------------------------------------------------- /lavis/projects/alpro/eval/msrvtt_ret_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_retrieval 8 | model_type: msrvtt 9 | 10 | datasets: 11 | msrvtt_retrieval: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "alpro_video_eval" 15 | n_frms: 8 16 | image_size: 224 17 | text_processor: 18 | eval: 19 | name: "blip_caption" 20 | 21 | run: 22 | task: retrieval 23 | # optimization-specific 24 | batch_size_train: 24 25 | batch_size_eval: 64 26 | num_workers: 4 27 | 28 | # k_test: 256 29 | k_test: 1000 30 | 31 | seed: 42 32 | output_dir: "output/ALPRO/msrvtt_retrieval" 33 | 34 | evaluate: True 35 | test_splits: ["test"] 36 | 37 | # distribution-specific 38 | device: "cuda" 39 | world_size: 1 40 | dist_url: "env://" 41 | distributed: True 42 | use_dist_eval_sampler: False 43 | -------------------------------------------------------------------------------- /lavis/projects/alpro/eval/msvd_qa_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 
2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_qa 8 | model_type: msvd 9 | 10 | datasets: 11 | msvd_qa: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "alpro_video_eval" 15 | n_frms: 16 16 | image_size: 224 17 | text_processor: 18 | train: 19 | name: "blip_caption" 20 | eval: 21 | name: "blip_caption" 22 | 23 | run: 24 | task: multimodal_classification 25 | # optimization-specific 26 | batch_size_train: 24 27 | batch_size_eval: 64 28 | num_workers: 4 29 | 30 | seed: 42 31 | output_dir: "output/ALPRO/msvd_qa" 32 | 33 | evaluate: True 34 | test_splits: ["test"] 35 | 36 | # distribution-specific 37 | device: "cuda" 38 | world_size: 1 39 | dist_url: "env://" 40 | distributed: True 41 | -------------------------------------------------------------------------------- /lavis/projects/blip/eval/aokvqa_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | model_type: aokvqa 9 | image_size: 480 10 | 11 | datasets: 12 | aok_vqa: # name of the dataset builder 13 | vis_processor: 14 | eval: 15 | name: "blip_image_eval" 16 | image_size: 480 17 | text_processor: 18 | eval: 19 | name: "blip_question" 20 | 21 | run: 22 | task: aok_vqa 23 | # optimization-specific 24 | batch_size_train: 64 25 | batch_size_eval: 64 26 | num_workers: 4 27 | 28 | # inference-specific 29 | max_len: 10 30 | min_len: 1 31 | num_beams: 3 32 | num_ans_candidates: 128 33 | inference_method: "rank" 34 | 35 | seed: 42 36 | output_dir: "output/BLIP/AOKVQA" 37 | 38 | evaluate: True 39 | test_splits: ["val", "test"] 40 | 41 | # distribution-specific 42 | device: "cuda" 43 | world_size: 1 44 | dist_url: "env://" 45 | distributed: True 46 | -------------------------------------------------------------------------------- /lavis/projects/blip/eval/caption_coco_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | model_type: base_coco 9 | 10 | datasets: 11 | coco_caption: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | text_processor: 16 | eval: 17 | name: "blip_caption" 18 | 19 | run: 20 | # task: retrieval 21 | task: captioning 22 | # optimizer 23 | batch_size_train: 32 24 | batch_size_eval: 64 25 | num_workers: 4 26 | 27 | max_len: 20 28 | min_len: 5 29 | num_beams: 3 30 | 31 | seed: 42 32 | output_dir: "output/BLIP/Caption_coco" 33 | 34 | evaluate: True 35 | test_splits: ["test"] 36 | 37 | device: "cuda" 38 | world_size: 1 39 | dist_url: "env://" 40 | distributed: True 41 | -------------------------------------------------------------------------------- /lavis/projects/blip/eval/caption_coco_eval_large.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
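Editor's note: the project YAMLs above are ordinary OmegaConf files grouping a model, datasets and run section; the repository's evaluate.py reads one of them to drive an evaluation run. A hedged sketch inspecting the captioning config shown above; the --cfg-path flag named in the comment is an assumption about the upstream script.

from omegaconf import OmegaConf

# The usual entry point is: python evaluate.py --cfg-path <one of these YAMLs>
cfg = OmegaConf.load("lavis/projects/blip/eval/caption_coco_eval.yaml")

print(cfg.model.arch)            # "blip_caption"
print(cfg.run.task)              # "captioning"
print(cfg.run.batch_size_eval)   # 64
print(cfg.run.test_splits)       # ["test"]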
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | model_type: large_coco 9 | 10 | datasets: 11 | coco_caption: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | text_processor: 16 | eval: 17 | name: "blip_caption" 18 | 19 | run: 20 | # task: retrieval 21 | task: captioning 22 | # optimizer 23 | batch_size_train: 32 24 | batch_size_eval: 64 25 | num_workers: 4 26 | 27 | max_len: 20 28 | min_len: 5 29 | num_beams: 3 30 | 31 | seed: 42 32 | output_dir: "output/BLIP/Caption_coco" 33 | 34 | evaluate: True 35 | test_splits: ["test"] 36 | 37 | device: "cuda" 38 | world_size: 1 39 | dist_url: "env://" 40 | distributed: True 41 | -------------------------------------------------------------------------------- /lavis/projects/blip/eval/nlvr_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_nlvr 8 | model_type: nlvr 9 | 10 | datasets: 11 | nlvr: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | image_size: 384 16 | text_processor: 17 | eval: 18 | name: "blip_caption" 19 | 20 | run: 21 | task: multimodal_classification 22 | 23 | batch_size_train: 16 24 | batch_size_eval: 64 25 | num_workers: 4 26 | 27 | seed: 42 28 | output_dir: "output/BLIP/NLVR" 29 | 30 | evaluate: True 31 | test_splits: ["val", "test"] 32 | 33 | # distribution-specific 34 | device: "cuda" 35 | world_size: 1 36 | dist_url: "env://" 37 | distributed: True 38 | -------------------------------------------------------------------------------- /lavis/projects/blip/eval/okvqa_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | model_type: okvqa 9 | image_size: 480 10 | 11 | datasets: 12 | ok_vqa: # name of the dataset builder 13 | vis_processor: 14 | eval: 15 | name: "blip_image_eval" 16 | image_size: 480 17 | text_processor: 18 | eval: 19 | name: "blip_question" 20 | 21 | run: 22 | task: vqa 23 | # optimization-specific 24 | batch_size_train: 16 25 | batch_size_eval: 16 26 | num_workers: 4 27 | 28 | # inference-specific 29 | max_len: 10 30 | min_len: 1 31 | num_beams: 3 32 | num_ans_candidates: 128 33 | inference_method: "rank" 34 | 35 | seed: 42 36 | output_dir: "output/BLIP/OKVQA" 37 | 38 | evaluate: True 39 | test_splits: ["test"] 40 | 41 | # distribution-specific 42 | device: "cuda" 43 | world_size: 1 44 | dist_url: "env://" 45 | distributed: True 46 | -------------------------------------------------------------------------------- /lavis/projects/blip/eval/ret_flickr_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_retrieval 8 | model_type: flickr 9 | 10 | datasets: 11 | flickr30k: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | image_size: 384 16 | text_processor: 17 | eval: 18 | name: "blip_caption" 19 | 20 | run: 21 | task: retrieval 22 | 23 | # dataloading 24 | num_workers: 4 25 | batch_size_train: 32 26 | batch_size_eval: 64 27 | 28 | test_splits: ["test"] 29 | 30 | # distribution 31 | device: "cuda" 32 | world_size: 1 33 | dist_url: "env://" 34 | distributed: True 35 | use_dist_eval_sampler: False 36 | 37 | # model specific 38 | k_test: 128 39 | 40 | # misc 41 | seed: 42 42 | output_dir: "output/Retrieval_Flickr30k" 43 | 44 | evaluate: True 45 | -------------------------------------------------------------------------------- /lavis/projects/blip/eval/vqav2_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | model_type: vqav2 9 | image_size: 480 10 | 11 | datasets: 12 | coco_vqa: # name of the dataset builder 13 | type: eval 14 | vis_processor: 15 | eval: 16 | name: "blip_image_eval" 17 | image_size: 480 18 | text_processor: 19 | eval: 20 | name: "blip_question" 21 | 22 | run: 23 | task: vqa 24 | # optimization-specific 25 | batch_size_train: 16 26 | batch_size_eval: 64 27 | num_workers: 4 28 | 29 | # inference-specific 30 | max_len: 10 31 | min_len: 1 32 | num_beams: 3 33 | num_ans_candidates: 128 34 | inference_method: "rank" 35 | 36 | seed: 42 37 | output_dir: "output/BLIP/VQA" 38 | 39 | evaluate: True 40 | test_splits: ["val"] 41 | 42 | # distribution-specific 43 | device: "cuda" 44 | world_size: 1 45 | dist_url: "env://" 46 | distributed: True 47 | -------------------------------------------------------------------------------- /lavis/projects/blip2/eval/caption_coco_flant5xl_eval.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/lavis/projects/blip2/eval/caption_coco_flant5xl_eval.yaml -------------------------------------------------------------------------------- /lavis/projects/blip2/eval/ret_flickr_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip2 8 | model_type: coco 9 | use_grad_checkpoint: False 10 | 11 | datasets: 12 | flickr30k: # name of the dataset builder 13 | vis_processor: 14 | eval: 15 | name: "blip_image_eval" 16 | image_size: 364 17 | text_processor: 18 | eval: 19 | name: "blip_caption" 20 | 21 | run: 22 | task: retrieval 23 | 24 | # dataloading 25 | num_workers: 4 26 | batch_size_train: 16 27 | batch_size_eval: 32 28 | 29 | test_splits: ["test"] 30 | 31 | # distribution 32 | device: "cuda" 33 | world_size: 1 34 | dist_url: "env://" 35 | distributed: True 36 | use_dist_eval_sampler: False 37 | 38 | # model specific 39 | k_test: 128 40 | 41 | # misc 42 | seed: 42 43 | output_dir: "output/BLIP2/Retrieval_Flickr30k" 44 | 45 | evaluate: True -------------------------------------------------------------------------------- /lavis/projects/clip/exp_coco_ret_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-L-14-336 10 | 11 | datasets: 12 | coco_retrieval: # name of the dataset builder 13 | vis_processor: 14 | train: 15 | name: "clip_image_train" 16 | image_size: 336 17 | eval: 18 | name: "clip_image_eval" 19 | image_size: 336 20 | text_processor: 21 | train: 22 | name: "blip_caption" 23 | eval: 24 | name: "blip_caption" 25 | 26 | run: 27 | task: retrieval 28 | 29 | # dataloading 30 | num_workers: 4 31 | batch_size_train: 32 32 | batch_size_eval: 128 33 | 34 | test_splits: ["test"] 35 | 36 | # distribution 37 | device: "cuda" 38 | world_size: 1 39 | dist_url: "env://" 40 | distributed: True 41 | use_dist_eval_sampler: True 42 | 43 | # misc 44 | seed: 42 45 | output_dir: "output/clip/Retrieval_COCO" 46 | 47 | evaluate: True 48 | -------------------------------------------------------------------------------- /lavis/projects/clip/exp_flickr_ret_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-L-14-336 10 | 11 | datasets: 12 | flickr30k: # name of the dataset builder 13 | vis_processor: 14 | train: 15 | name: "clip_image_train" 16 | image_size: 336 17 | eval: 18 | name: "clip_image_eval" 19 | image_size: 336 20 | text_processor: 21 | train: 22 | name: "blip_caption" 23 | eval: 24 | name: "blip_caption" 25 | 26 | run: 27 | task: retrieval 28 | 29 | # dataloading 30 | num_workers: 4 31 | batch_size_train: 32 32 | batch_size_eval: 128 33 | 34 | test_splits: ["test"] 35 | 36 | # distribution 37 | device: "cuda" 38 | world_size: 1 39 | dist_url: "env://" 40 | distributed: True 41 | use_dist_eval_sampler: True 42 | 43 | # misc 44 | seed: 42 45 | output_dir: "output/clip/Retrieval_Flickr" 46 | 47 | evaluate: True 48 | -------------------------------------------------------------------------------- /lavis/projects/clip/exp_imnet_zs_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-L-14-336 10 | 11 | datasets: 12 | imagenet: # name of the dataset builder 13 | vis_processor: 14 | eval: 15 | name: "clip_image_eval" 16 | # image_size: 224 17 | image_size: 336 18 | 19 | run: 20 | task: multimodal_classification 21 | 22 | # dataloading 23 | num_workers: 4 24 | batch_size_train: 32 25 | batch_size_eval: 128 26 | 27 | test_splits: ["val"] 28 | 29 | # distribution 30 | device: "cuda" 31 | world_size: 1 32 | dist_url: "env://" 33 | distributed: True 34 | 35 | # misc 36 | seed: 42 37 | output_dir: "output/clip/zs_imnet" 38 | 39 | evaluate: True 40 | -------------------------------------------------------------------------------- /lavis/projects/gpt/eval/dialogue_avsd_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: gpt_dialogue 8 | model_type: base 9 | 10 | datasets: 11 | avsd_dialogue: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "gpt_video_ft" 15 | visual_ft: ["i3d_flow", "i3d_rgb"] 16 | audio_ft: ["vggish"] 17 | text_processor: 18 | eval: 19 | name: "gpt_dialogue" 20 | max_turns: 3 21 | use_caption: True 22 | 23 | run: 24 | task: dialogue 25 | # optimizer 26 | batch_size_train: 16 27 | batch_size_eval: 16 28 | num_workers: 0 29 | 30 | max_len: 20 31 | min_len: 5 32 | num_beams: 5 33 | 34 | seed: 42 35 | output_dir: "output/gpt2/dialogue_avsd" 36 | 37 | evaluate: True 38 | valid_splits: ["test"] 39 | 40 | device: "cuda" 41 | world_size: 1 42 | dist_url: "env://" 43 | distributed: True 44 | -------------------------------------------------------------------------------- /lavis/runners/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.runners.runner_base import RunnerBase 9 | from lavis.runners.runner_iter import RunnerIter 10 | 11 | __all__ = ["RunnerBase", "RunnerIter"] 12 | -------------------------------------------------------------------------------- /lavis/tasks/image_text_pretrain.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.common.registry import registry 9 | from lavis.tasks.base_task import BaseTask 10 | 11 | 12 | @registry.register_task("image_text_pretrain") 13 | class ImageTextPretrainTask(BaseTask): 14 | def __init__(self): 15 | super().__init__() 16 | 17 | def evaluation(self, model, data_loader, cuda_enabled=True): 18 | pass 19 | -------------------------------------------------------------------------------- /lavis/tasks/text_to_image_generation.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.tasks import BaseTask 9 | from lavis.common.registry import registry 10 | 11 | 12 | @registry.register_task("text-to-image-generation") 13 | class TextToImageGenerationTask(BaseTask): 14 | def __init__(self, cfg): 15 | super().__init__() 16 | 17 | self.cfg = cfg 18 | 19 | @classmethod 20 | def setup_task(cls, cfg): 21 | run_cfg = cfg.run_cfg 22 | 23 | return cls(cfg=run_cfg) 24 | -------------------------------------------------------------------------------- /projects/blip-diffusion/images/black-cat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/blip-diffusion/images/black-cat.png -------------------------------------------------------------------------------- /projects/blip-diffusion/images/cat-sofa.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/blip-diffusion/images/cat-sofa.png -------------------------------------------------------------------------------- /projects/blip-diffusion/images/dog.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/blip-diffusion/images/dog.png -------------------------------------------------------------------------------- /projects/blip-diffusion/images/dog2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/blip-diffusion/images/dog2.png -------------------------------------------------------------------------------- /projects/blip-diffusion/images/dreambooth/dog/00.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/blip-diffusion/images/dreambooth/dog/00.jpg -------------------------------------------------------------------------------- /projects/blip-diffusion/images/dreambooth/dog/01.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/blip-diffusion/images/dreambooth/dog/01.jpg -------------------------------------------------------------------------------- /projects/blip-diffusion/images/dreambooth/dog/02.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/blip-diffusion/images/dreambooth/dog/02.jpg -------------------------------------------------------------------------------- /projects/blip-diffusion/images/dreambooth/dog/03.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/blip-diffusion/images/dreambooth/dog/03.jpg -------------------------------------------------------------------------------- /projects/blip-diffusion/images/dreambooth/dog/04.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/blip-diffusion/images/dreambooth/dog/04.jpg -------------------------------------------------------------------------------- /projects/blip-diffusion/images/dreambooth/dog8/00.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/blip-diffusion/images/dreambooth/dog8/00.jpg -------------------------------------------------------------------------------- /projects/blip-diffusion/images/dreambooth/dog8/01.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/blip-diffusion/images/dreambooth/dog8/01.jpg -------------------------------------------------------------------------------- /projects/blip-diffusion/images/dreambooth/dog8/02.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/blip-diffusion/images/dreambooth/dog8/02.jpg -------------------------------------------------------------------------------- /projects/blip-diffusion/images/dreambooth/dog8/03.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/blip-diffusion/images/dreambooth/dog8/03.jpg -------------------------------------------------------------------------------- /projects/blip-diffusion/images/dreambooth/dog8/04.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/blip-diffusion/images/dreambooth/dog8/04.jpg -------------------------------------------------------------------------------- /projects/blip-diffusion/images/dress-model.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/blip-diffusion/images/dress-model.png -------------------------------------------------------------------------------- /projects/blip-diffusion/images/flower.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/blip-diffusion/images/flower.jpg -------------------------------------------------------------------------------- /projects/blip-diffusion/images/green-skirt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/blip-diffusion/images/green-skirt.png -------------------------------------------------------------------------------- /projects/blip-diffusion/images/jacket-letter-s/jacket-letter-s.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/blip-diffusion/images/jacket-letter-s/jacket-letter-s.png -------------------------------------------------------------------------------- /projects/blip-diffusion/images/kettle.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/blip-diffusion/images/kettle.jpg -------------------------------------------------------------------------------- /projects/blip-diffusion/images/pink-dress.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/blip-diffusion/images/pink-dress.png -------------------------------------------------------------------------------- /projects/blip-diffusion/images/pink-dress/pink-dress.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/blip-diffusion/images/pink-dress/pink-dress.png -------------------------------------------------------------------------------- /projects/blip-diffusion/images/shein-jacket/shein-jacket.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/blip-diffusion/images/shein-jacket/shein-jacket.jpg -------------------------------------------------------------------------------- /projects/blip-diffusion/teaser-website.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/blip-diffusion/teaser-website.png -------------------------------------------------------------------------------- /projects/blip2/blip2_illustration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/blip2/blip2_illustration.png -------------------------------------------------------------------------------- /projects/img2llm-vqa/Caption.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/img2llm-vqa/Caption.png -------------------------------------------------------------------------------- /projects/img2llm-vqa/Illustration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/img2llm-vqa/Illustration.png -------------------------------------------------------------------------------- /projects/img2llm-vqa/QuestionGeneration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/img2llm-vqa/QuestionGeneration.png -------------------------------------------------------------------------------- /projects/img2llm-vqa/demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/img2llm-vqa/demo.png -------------------------------------------------------------------------------- /projects/img2prompt-vqa/README.md: -------------------------------------------------------------------------------- 1 | ## From Images to Textual Prompts: Zero-shot VQA with Frozen Large Language Models 2 | 3 | This is the official code for the Img2LLM-VQA paper. 4 | 5 | We have renamed **Img2Prompt-VQA** to **Img2LLM-VQA**. See the [new project page](https://github.com/salesforce/LAVIS/tree/main/projects/img2llm-vqa) for details. 6 | 7 | ### Citation 8 | If you find this code useful for your research, please consider citing:
9 | ```bibtex 10 | @misc{guo2023from, 11 | title={From Images to Textual Prompts: Zero-shot {VQA} with Frozen Large Language Models}, 12 | author={Jiaxian Guo and Junnan Li and Dongxu Li and Anthony Tiong and Boyang Li and Dacheng Tao and Steven HOI}, 13 | year={2023}, 14 | url={https://openreview.net/forum?id=Ck1UtnVukP8} 15 | } 16 | ``` 17 | -------------------------------------------------------------------------------- /projects/instructblip/comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/instructblip/comparison.png -------------------------------------------------------------------------------- /projects/instructblip/showcase.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/instructblip/showcase.png -------------------------------------------------------------------------------- /projects/pnp-vqa/pnp_vqa.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/pnp-vqa/pnp_vqa.png -------------------------------------------------------------------------------- /projects/xinstructblip/assets/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/xinstructblip/assets/architecture.png -------------------------------------------------------------------------------- /projects/xinstructblip/assets/data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/xinstructblip/assets/data.png -------------------------------------------------------------------------------- /projects/xinstructblip/demo/examples/audio/110714_wren.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/xinstructblip/demo/examples/audio/110714_wren.wav -------------------------------------------------------------------------------- /projects/xinstructblip/demo/examples/audio/Group_of_Dogs_Barking.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/xinstructblip/demo/examples/audio/Group_of_Dogs_Barking.wav -------------------------------------------------------------------------------- /projects/xinstructblip/demo/examples/point_cloud/banana.glb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/xinstructblip/demo/examples/point_cloud/banana.glb -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0.0", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | -------------------------------------------------------------------------------- /requirements.txt: 
-------------------------------------------------------------------------------- 1 | contexttimer 2 | decord 3 | diffusers<=0.16.0 4 | einops>=0.4.1 5 | fairscale==0.4.4 6 | ftfy 7 | iopath 8 | ipython 9 | omegaconf 10 | opencv-python-headless==4.5.5.64 11 | opendatasets 12 | packaging 13 | pandas 14 | plotly 15 | pre-commit 16 | pycocoevalcap 17 | pycocotools 18 | python-magic 19 | scikit-image 20 | sentencepiece 21 | spacy 22 | streamlit 23 | timm==0.4.12 24 | torch>=1.10.0 25 | torchvision 26 | tqdm 27 | transformers==4.33.2 28 | webdataset 29 | wheel 30 | torchaudio 31 | soundfile 32 | moviepy 33 | nltk 34 | peft 35 | 36 | easydict==1.9 37 | pyyaml_env_tag==0.1 38 | open3d==0.13.0 39 | h5py 40 | -------------------------------------------------------------------------------- /run_scripts/albef/eval/eval_albef_nlvr.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/albef/eval/nlvr_eval.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/albef/eval/eval_albef_ve.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=16 evaluate.py --cfg-path lavis/projects/albef/eval/snli_ve_eval.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/albef/eval/eval_coco_retrieval.sh: -------------------------------------------------------------------------------- 1 | # python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/albef/eval/coco_retrieval_eval.yaml 2 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/albef/eval/ret_coco_eval.yaml 3 | -------------------------------------------------------------------------------- /run_scripts/albef/eval/eval_flickr30k_retrieval.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/albef/eval/ret_flickr30k_eval.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/albef/eval/test_albef_vqa.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/albef/eval/vqa_test.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/albef/eval/val_albef_vqa.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 --master_port 2345 evaluate.py --cfg-path lavis/projects/albef/eval/vqa_val.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/albef/train/pretrain.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=16 train.py --cfg-path lavis/projects/albef/train/pretrain.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/albef/train/train_aokvqa_albef.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 train.py --cfg-path lavis/projects/albef/train/aokvqa_ft.yaml 2 | 
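The ALBEF scripts above (and the BLIP, BLIP-2, and CLIP scripts that follow) all wrap the same launcher pattern: `python -m torch.distributed.run --nproc_per_node=N {train,evaluate}.py --cfg-path <project yaml>`. Below is a minimal single-GPU sketch, not a script from this listing: it reuses a config path that already appears above and assumes `evaluate.py` accepts the same `--options key=value` overrides that `train.py` takes in `run_scripts/blip-diffusion/train_db.sh` further down, with keys taken from the retrieval configs shown earlier.

```bash
# Hypothetical single-GPU run (not part of run_scripts/): ALBEF Flickr30k
# retrieval evaluation with a smaller eval batch and no dataloader workers.
python -m torch.distributed.run --nproc_per_node=1 evaluate.py \
    --cfg-path lavis/projects/albef/eval/ret_flickr30k_eval.yaml \
    --options run.batch_size_eval=16 run.num_workers=0
```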
-------------------------------------------------------------------------------- /run_scripts/albef/train/train_coco_retrieval_albef.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 train.py --cfg-path lavis/projects/albef/train/ret_coco_ft.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/albef/train/train_flickr30k_retrieval_albef.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 train.py --cfg-path lavis/projects/albef/train/ret_flickr30k_ft.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/albef/train/train_nlvr_albef.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 train.py --cfg-path lavis/projects/albef/train/nlvr_ft.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/albef/train/train_okvqa_albef.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 train.py --cfg-path lavis/projects/albef/train/okvqa_ft.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/albef/train/train_ve_albef.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 train.py --cfg-path lavis/projects/albef/train/snli_ve_ft.yaml 2 | # CUDA_VISIBLE_DEVICES=8,9,10,11,12,13,14,15 python -m torch.distributed.run --nproc_per_node=8 --master_port 47770 train.py --cfg-path lavis/projects/albef/train/snli_ve_ft.yaml 3 | -------------------------------------------------------------------------------- /run_scripts/albef/train/train_vqa_albef.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 train.py --cfg-path lavis/projects/albef/train/vqa_ft.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/alpro/eval/eval_didemo_ret.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/alpro/eval/didemo_ret_eval.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/alpro/eval/eval_msrvtt_qa.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/alpro/eval/msrvtt_qa_eval.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/alpro/eval/eval_msrvtt_ret.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/alpro/eval/msrvtt_ret_eval.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/alpro/eval/eval_msvd_qa.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=16 evaluate.py --cfg-path lavis/projects/alpro/eval/msvd_qa_eval.yaml 2 | 
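The commented-out line in `train_ve_albef.sh` above shows the usual pattern for sharing one node between launchers: restrict each job to its own GPUs with `CUDA_VISIBLE_DEVICES` and give each a distinct `--master_port` so the rendezvous endpoints do not collide. A sketch of running two of the listed configs side by side follows; the GPU indices and port numbers are placeholder choices, not values from the repo.

```bash
# Two concurrent 4-GPU jobs on one 8-GPU node; ports are arbitrary free ports.
CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.run --nproc_per_node=4 \
    --master_port 29501 train.py --cfg-path lavis/projects/albef/train/ret_coco_ft.yaml &
CUDA_VISIBLE_DEVICES=4,5,6,7 python -m torch.distributed.run --nproc_per_node=4 \
    --master_port 29502 train.py --cfg-path lavis/projects/alpro/train/msrvtt_qa_ft.yaml &
wait  # block until both background jobs finish
```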
-------------------------------------------------------------------------------- /run_scripts/alpro/train/train_didemo_ret.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 train.py --cfg-path lavis/projects/alpro/train/didemo_ret_ft.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/alpro/train/train_msrvtt_qa.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=16 train.py --cfg-path lavis/projects/alpro/train/msrvtt_qa_ft.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/alpro/train/train_msrvtt_ret.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 train.py --cfg-path lavis/projects/alpro/train/msrvtt_retrieval_ft.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/alpro/train/train_msvd_qa.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=16 train.py --cfg-path lavis/projects/alpro/train/msvd_qa_ft.yaml -------------------------------------------------------------------------------- /run_scripts/blip-diffusion/train_db.sh: -------------------------------------------------------------------------------- 1 | SUBJECT_TEXT="dog" 2 | IMAGE_STORAGE="/export/home/workspace/LAVIS-Diffusion/LAVIS/projects/blip-diffusion/images/dreambooth/dog" 3 | MAX_ITERS=40 4 | ITERS_PER_INNER_EPOCH=40 # number of iterations before saving a checkpoint 5 | BATCH_SIZE=3 6 | LR=5e-6 7 | WEIGHT_DECAY=0.01 8 | OUTPUT_DIR="output/debug/BLIP-diffusion/finetune/dog" 9 | 10 | python -m torch.distributed.run \ 11 | --nproc_per_node=1 train.py \ 12 | --cfg-path lavis/projects/blip_diffusion/finetune-db-template.yaml \ 13 | --options datasets.blip_diffusion_finetune.build_info.subject_text=$SUBJECT_TEXT \ 14 | datasets.blip_diffusion_finetune.build_info.images.storage=$IMAGE_STORAGE \ 15 | run.max_iters=$MAX_ITERS \ 16 | run.iters_per_inner_epoch=$ITERS_PER_INNER_EPOCH \ 17 | run.output_dir=$OUTPUT_DIR \ 18 | run.init_lr=$LR \ 19 | run.weight_decay=$WEIGHT_DECAY \ 20 | run.batch_size_train=$BATCH_SIZE 21 | -------------------------------------------------------------------------------- /run_scripts/blip-diffusion/train_db_dog.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=1 train.py --cfg-path lavis/projects/blip_diffusion/finetune-db-dog.yaml -------------------------------------------------------------------------------- /run_scripts/blip-diffusion/train_db_jacket_s.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=1 train.py --cfg-path lavis/projects/blip_diffusion/finetune-db-jacket-s.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/blip-diffusion/train_db_pink_dress.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=1 train.py --cfg-path lavis/projects/blip_diffusion/finetune-db-pink-dress.yaml 2 | -------------------------------------------------------------------------------- 
/run_scripts/blip-diffusion/train_db_shein_jacket.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=1 train.py --cfg-path lavis/projects/blip_diffusion/finetune-db-shein-jacket.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/blip/eval/eval_aokvqa.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=1 evaluate.py --cfg-path lavis/projects/blip/eval/aokvqa_eval.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/blip/eval/eval_coco_cap.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/blip/eval/caption_coco_eval.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/blip/eval/eval_coco_cap_large.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/blip/eval/caption_coco_eval_large.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/blip/eval/eval_nlvr.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/blip/eval/nlvr_eval.yaml 2 | # python -m torch.distributed.run --nproc_per_node=1 evaluate.py --cfg-path lavis/projects/blip/eval/nlvr_eval.yaml 3 | -------------------------------------------------------------------------------- /run_scripts/blip/eval/eval_nocaps.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/blip/eval/nocaps_eval.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/blip/eval/eval_okvqa.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/blip/eval/okvqa_eval.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/blip/eval/eval_ret_coco.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/blip/eval/ret_coco_eval.yaml 2 | # python -m torch.distributed.run --nproc_per_node=16 evaluate.py --cfg-path lavis/projects/blip/eval/ret_coco_eval.yaml 3 | -------------------------------------------------------------------------------- /run_scripts/blip/eval/eval_ret_flickr.sh: -------------------------------------------------------------------------------- 1 | # python -m torch.distributed.run --nproc_per_node=16 evaluate.py --cfg-path lavis/projects/blip/eval/ret_flickr_eval.yaml 2 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/blip/eval/ret_flickr_eval.yaml 3 | -------------------------------------------------------------------------------- /run_scripts/blip/eval/validate_vqa.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py 
--cfg-path lavis/projects/blip/eval/vqav2_eval.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/blip/train/pretrain.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=16 train.py --cfg-path lavis/projects/blip/train/pretrain_14m.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/blip/train/train_aokvqa.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 train.py --cfg-path lavis/projects/blip/train/aokvqa_ft.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/blip/train/train_caption_coco.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 train.py --cfg-path lavis/projects/blip/train/caption_coco_ft.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/blip/train/train_caption_coco_large.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 train.py --cfg-path lavis/projects/blip/train/caption_coco_large_ft.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/blip/train/train_caption_coco_large_iters.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 train.py --cfg-path lavis/projects/blip/coco_cap_ft_iter.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/blip/train/train_nlvr.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=16 train.py --cfg-path lavis/projects/blip/train/nlvr_ft.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/blip/train/train_okvqa.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=16 train.py --cfg-path lavis/projects/blip/train/okvqa_ft.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/blip/train/train_retrieval_coco.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 train.py --cfg-path lavis/projects/blip/train/retrieval_coco_ft.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/blip/train/train_retrieval_flickr.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 train.py --cfg-path lavis/projects/blip/train/retrieval_flickr_ft.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/blip/train/train_vqa.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=16 train.py --cfg-path lavis/projects/blip/vqav2_ft.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/blip2/eval/eval_cap_coco_flant5xl.sh: 
-------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/blip2/eval/caption_coco_flant5xl_eval.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/blip2/eval/eval_cap_coco_opt2.7b.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/blip2/eval/caption_coco_opt2.7b_eval.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/blip2/eval/eval_cap_coco_opt6.7b.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/blip2/eval/caption_coco_opt6.7b_eval.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/blip2/eval/eval_gqa_zeroshot_flant5xl.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/blip2/eval/gqa_zeroshot_flant5xl_eval.yaml -------------------------------------------------------------------------------- /run_scripts/blip2/eval/eval_okvqa_zeroshot_flant5xl.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/blip2/eval/okvqa_zeroshot_flant5xl_eval.yaml -------------------------------------------------------------------------------- /run_scripts/blip2/eval/eval_ret_coco.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/blip2/eval/ret_coco_eval.yaml 2 | # python -m torch.distributed.run --nproc_per_node=16 evaluate.py --cfg-path lavis/projects/blip2/eval/ret_coco_eval.yaml 3 | -------------------------------------------------------------------------------- /run_scripts/blip2/eval/eval_ret_flickr.sh: -------------------------------------------------------------------------------- 1 | # python -m torch.distributed.run --nproc_per_node=16 evaluate.py --cfg-path lavis/projects/blip2/eval/ret_flickr_eval.yaml 2 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/blip2/eval/ret_flickr_eval.yaml 3 | -------------------------------------------------------------------------------- /run_scripts/blip2/eval/validate_vqa_zeroshot_flant5xl.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/blip2/eval/vqav2_zeroshot_flant5xl_eval.yaml -------------------------------------------------------------------------------- /run_scripts/blip2/eval/validate_vqa_zeroshot_opt.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/blip2/eval/vqav2_zeroshot_opt_eval.yaml -------------------------------------------------------------------------------- /run_scripts/blip2/train/pretrain_stage1.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=16 train.py --cfg-path 
lavis/projects/blip2/train/pretrain_stage1.yaml -------------------------------------------------------------------------------- /run_scripts/blip2/train/pretrain_stage2.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=16 train.py --cfg-path lavis/projects/blip2/train/pretrain_stage2.yaml -------------------------------------------------------------------------------- /run_scripts/blip2/train/train_caption_coco.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=16 train.py --cfg-path lavis/projects/blip2/train/caption_coco_ft.yaml -------------------------------------------------------------------------------- /run_scripts/blip2/train/train_retrieval_coco.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=16 train.py --cfg-path lavis/projects/blip2/train/retrieval_coco_ft.yaml -------------------------------------------------------------------------------- /run_scripts/clip/eval/eval_clip_ret_coco.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=1 evaluate.py --cfg-path lavis/projects/clip/exp_coco_ret_eval.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/clip/eval/eval_clip_ret_flickr.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=1 evaluate.py --cfg-path lavis/projects/clip/exp_flickr_ret_eval.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/clip/eval/eval_clip_zs_imnet.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/clip/exp_imnet_zs_eval.yaml # --options run.num_workers=0 2 | -------------------------------------------------------------------------------- /run_scripts/gpt/eval/eval_video_dialogue_avsd.sh: -------------------------------------------------------------------------------- 1 | python evaluate.py --cfg-path lavis/projects/gpt/train/dialogue_avsd_ft.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/gpt/train/train_video_dialogue_avsd.sh: -------------------------------------------------------------------------------- 1 | python train.py --cfg-path lavis/projects/gpt/train/dialogue_avsd_ft.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/pnp-vqa/eval/eval_gqa.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/pnp-vqa/eval/gqa_eval.yaml -------------------------------------------------------------------------------- /run_scripts/pnp-vqa/eval/eval_gqa_3b.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/pnp-vqa/eval/gqa_eval_3b.yaml -------------------------------------------------------------------------------- /run_scripts/pnp-vqa/eval/eval_gqa_large.sh: -------------------------------------------------------------------------------- 1 | python -m 
torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/pnp-vqa/eval/gqa_eval_large.yaml -------------------------------------------------------------------------------- /run_scripts/pnp-vqa/eval/eval_okvqa.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/pnp-vqa/eval/okvqa_eval.yaml -------------------------------------------------------------------------------- /run_scripts/pnp-vqa/eval/eval_okvqa_3b.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/pnp-vqa/eval/okvqa_eval_3b.yaml -------------------------------------------------------------------------------- /run_scripts/pnp-vqa/eval/eval_okvqa_large.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/pnp-vqa/eval/okvqa_eval_large.yaml -------------------------------------------------------------------------------- /run_scripts/pnp-vqa/eval/eval_vqav2.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/pnp-vqa/eval/vqav2_eval.yaml -------------------------------------------------------------------------------- /run_scripts/pnp-vqa/eval/eval_vqav2_3b.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/pnp-vqa/eval/vqav2_eval_3b.yaml -------------------------------------------------------------------------------- /run_scripts/pnp-vqa/eval/eval_vqav2_large.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/pnp-vqa/eval/vqav2_eval_large.yaml -------------------------------------------------------------------------------- /run_scripts/pnp-vqa/eval/eval_vqav2_test.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/pnp-vqa/eval/vqav2_test_eval.yaml -------------------------------------------------------------------------------- /run_scripts/pnp-vqa/eval/eval_vqav2_test_3b.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/pnp-vqa/eval/vqav2_test_eval_3b.yaml -------------------------------------------------------------------------------- /run_scripts/pnp-vqa/eval/eval_vqav2_test_large.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/pnp-vqa/eval/vqav2_test_eval_large.yaml -------------------------------------------------------------------------------- /run_scripts/run_browser.sh: -------------------------------------------------------------------------------- 1 | streamlit run app/dataset_browser.py --server.fileWatcherType none 2 | -------------------------------------------------------------------------------- /run_scripts/run_demo.sh: 
-------------------------------------------------------------------------------- 1 | streamlit run app/main.py --server.fileWatcherType none 2 | --------------------------------------------------------------------------------
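The final two scripts start the Streamlit demo and dataset browser from the `app/` package. A hedged sketch for exposing the demo on a chosen interface and port is shown below; `--server.address` and `--server.port` are standard Streamlit flags rather than anything defined by this repo, and the values are placeholders.

```bash
# Serve the demo on all interfaces at port 8501, keeping the file watcher
# disabled as in run_scripts/run_demo.sh.
streamlit run app/main.py --server.fileWatcherType none \
    --server.address 0.0.0.0 --server.port 8501
```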