├── .github └── workflows │ └── docs.yaml ├── .gitignore ├── .pre-commit-config.yaml ├── CODEOWNERS ├── CODE_OF_CONDUCT.md ├── LICENSE.txt ├── MANIFEST.in ├── README.md ├── SECURITY.md ├── app ├── __init__.py ├── calculate_coco_features.py ├── caption.py ├── classification.py ├── dataset_browser.py ├── image_text_match.py ├── main.py ├── multimodal_search.py ├── multipage.py ├── text_localization.py ├── utils.py └── vqa.py ├── assets └── demo-6.png ├── dataset_card ├── avsd_dialogue.md ├── coco_caption.md ├── coco_retrieval.md ├── conceptual_captions.md ├── didemo_retrieval.md ├── flickr_retrieval.md ├── gqa.md ├── imgs │ ├── NLVR2.png │ ├── avsd_dialogue.png │ ├── coco_caption.png │ ├── conceptual_captions.png │ ├── didemo.png │ ├── flickr30k.png │ ├── gqa.png │ ├── msrvtt.png │ ├── msrvtt_qa.png │ ├── msvd_qa.png │ ├── nocaps.png │ ├── sbu_caption.png │ ├── snli_ve.png │ └── vqav2.png ├── msrvtt_qa.md ├── msrvtt_retrieval.md ├── msvd_qa.md ├── nlvr2.md ├── nocaps.md ├── sbu_caption.md ├── snli_visual_entailment.md └── vqav2.md ├── docs ├── Makefile ├── _static │ ├── Confusing-Pictures.jpg │ ├── architecture.png │ ├── logo_final.png │ └── merlion.png ├── benchmark.rst ├── build_docs.sh ├── conf.py ├── getting_started.rst ├── index.rst ├── intro.rst ├── make.bat ├── requirements.txt ├── tutorial.configs.rst ├── tutorial.datasets.rst ├── tutorial.evaluation.rst ├── tutorial.models.rst ├── tutorial.processors.rst ├── tutorial.rst ├── tutorial.tasks.rst └── tutorial.training-example.rst ├── evaluate.py ├── examples ├── albef_feature_extraction.ipynb ├── albef_vqa.ipynb ├── albef_zero_shot_classification.ipynb ├── blip2_feature_extraction.ipynb ├── blip2_image_text_matching.ipynb ├── blip2_instructed_generation.ipynb ├── blip_feature_extraction.ipynb ├── blip_image_captioning.ipynb ├── blip_image_text_matching.ipynb ├── blip_text_localization.ipynb ├── blip_vqa.ipynb ├── blip_zero_shot_classification.ipynb ├── clip_feature_extraction.ipynb └── clip_zero_shot_classification.ipynb ├── lavis ├── __init__.py ├── common │ ├── annotator │ │ ├── canny │ │ │ └── __init__.py │ │ ├── ckpts │ │ │ └── download.sh │ │ ├── hed │ │ │ └── __init__.py │ │ ├── midas │ │ │ ├── __init__.py │ │ │ ├── api.py │ │ │ ├── midas │ │ │ │ ├── __init__.py │ │ │ │ ├── base_model.py │ │ │ │ ├── blocks.py │ │ │ │ ├── dpt_depth.py │ │ │ │ ├── midas_net.py │ │ │ │ ├── midas_net_custom.py │ │ │ │ ├── transforms.py │ │ │ │ └── vit.py │ │ │ └── utils.py │ │ ├── mlsd │ │ │ ├── __init__.py │ │ │ ├── models │ │ │ │ ├── mbv2_mlsd_large.py │ │ │ │ └── mbv2_mlsd_tiny.py │ │ │ └── utils.py │ │ ├── openpose │ │ │ ├── __init__.py │ │ │ ├── body.py │ │ │ ├── hand.py │ │ │ ├── model.py │ │ │ └── util.py │ │ ├── uniformer │ │ │ ├── __init__.py │ │ │ ├── configs │ │ │ │ └── _base_ │ │ │ │ │ ├── datasets │ │ │ │ │ ├── ade20k.py │ │ │ │ │ ├── chase_db1.py │ │ │ │ │ ├── cityscapes.py │ │ │ │ │ ├── cityscapes_769x769.py │ │ │ │ │ ├── drive.py │ │ │ │ │ ├── hrf.py │ │ │ │ │ ├── pascal_context.py │ │ │ │ │ ├── pascal_context_59.py │ │ │ │ │ ├── pascal_voc12.py │ │ │ │ │ ├── pascal_voc12_aug.py │ │ │ │ │ └── stare.py │ │ │ │ │ ├── default_runtime.py │ │ │ │ │ ├── models │ │ │ │ │ ├── ann_r50-d8.py │ │ │ │ │ ├── apcnet_r50-d8.py │ │ │ │ │ ├── ccnet_r50-d8.py │ │ │ │ │ ├── cgnet.py │ │ │ │ │ ├── danet_r50-d8.py │ │ │ │ │ ├── deeplabv3_r50-d8.py │ │ │ │ │ ├── deeplabv3_unet_s5-d16.py │ │ │ │ │ ├── deeplabv3plus_r50-d8.py │ │ │ │ │ ├── dmnet_r50-d8.py │ │ │ │ │ ├── dnl_r50-d8.py │ │ │ │ │ ├── emanet_r50-d8.py │ │ │ │ │ ├── encnet_r50-d8.py │ │ │ │ │ ├── 
fast_scnn.py │ │ │ │ │ ├── fcn_hr18.py │ │ │ │ │ ├── fcn_r50-d8.py │ │ │ │ │ ├── fcn_unet_s5-d16.py │ │ │ │ │ ├── fpn_r50.py │ │ │ │ │ ├── fpn_uniformer.py │ │ │ │ │ ├── gcnet_r50-d8.py │ │ │ │ │ ├── lraspp_m-v3-d8.py │ │ │ │ │ ├── nonlocal_r50-d8.py │ │ │ │ │ ├── ocrnet_hr18.py │ │ │ │ │ ├── ocrnet_r50-d8.py │ │ │ │ │ ├── pointrend_r50.py │ │ │ │ │ ├── psanet_r50-d8.py │ │ │ │ │ ├── pspnet_r50-d8.py │ │ │ │ │ ├── pspnet_unet_s5-d16.py │ │ │ │ │ ├── upernet_r50.py │ │ │ │ │ └── upernet_uniformer.py │ │ │ │ │ └── schedules │ │ │ │ │ ├── schedule_160k.py │ │ │ │ │ ├── schedule_20k.py │ │ │ │ │ ├── schedule_40k.py │ │ │ │ │ └── schedule_80k.py │ │ │ ├── exp │ │ │ │ └── upernet_global_small │ │ │ │ │ ├── config.py │ │ │ │ │ ├── run.sh │ │ │ │ │ ├── test.sh │ │ │ │ │ ├── test_config_g.py │ │ │ │ │ ├── test_config_h32.py │ │ │ │ │ └── test_config_w32.py │ │ │ ├── mmcv │ │ │ │ ├── __init__.py │ │ │ │ ├── arraymisc │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── quantization.py │ │ │ │ ├── cnn │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── alexnet.py │ │ │ │ │ ├── bricks │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── activation.py │ │ │ │ │ │ ├── context_block.py │ │ │ │ │ │ ├── conv.py │ │ │ │ │ │ ├── conv2d_adaptive_padding.py │ │ │ │ │ │ ├── conv_module.py │ │ │ │ │ │ ├── conv_ws.py │ │ │ │ │ │ ├── depthwise_separable_conv_module.py │ │ │ │ │ │ ├── drop.py │ │ │ │ │ │ ├── generalized_attention.py │ │ │ │ │ │ ├── hsigmoid.py │ │ │ │ │ │ ├── hswish.py │ │ │ │ │ │ ├── non_local.py │ │ │ │ │ │ ├── norm.py │ │ │ │ │ │ ├── padding.py │ │ │ │ │ │ ├── plugin.py │ │ │ │ │ │ ├── registry.py │ │ │ │ │ │ ├── scale.py │ │ │ │ │ │ ├── swish.py │ │ │ │ │ │ ├── transformer.py │ │ │ │ │ │ ├── upsample.py │ │ │ │ │ │ └── wrappers.py │ │ │ │ │ ├── builder.py │ │ │ │ │ ├── resnet.py │ │ │ │ │ ├── utils │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── flops_counter.py │ │ │ │ │ │ ├── fuse_conv_bn.py │ │ │ │ │ │ ├── sync_bn.py │ │ │ │ │ │ └── weight_init.py │ │ │ │ │ └── vgg.py │ │ │ │ ├── engine │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── test.py │ │ │ │ ├── fileio │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── file_client.py │ │ │ │ │ ├── handlers │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── base.py │ │ │ │ │ │ ├── json_handler.py │ │ │ │ │ │ ├── pickle_handler.py │ │ │ │ │ │ └── yaml_handler.py │ │ │ │ │ ├── io.py │ │ │ │ │ └── parse.py │ │ │ │ ├── image │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── colorspace.py │ │ │ │ │ ├── geometric.py │ │ │ │ │ ├── io.py │ │ │ │ │ ├── misc.py │ │ │ │ │ └── photometric.py │ │ │ │ ├── model_zoo │ │ │ │ │ ├── deprecated.json │ │ │ │ │ ├── mmcls.json │ │ │ │ │ └── open_mmlab.json │ │ │ │ ├── ops │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── assign_score_withk.py │ │ │ │ │ ├── ball_query.py │ │ │ │ │ ├── bbox.py │ │ │ │ │ ├── border_align.py │ │ │ │ │ ├── box_iou_rotated.py │ │ │ │ │ ├── carafe.py │ │ │ │ │ ├── cc_attention.py │ │ │ │ │ ├── contour_expand.py │ │ │ │ │ ├── corner_pool.py │ │ │ │ │ ├── correlation.py │ │ │ │ │ ├── deform_conv.py │ │ │ │ │ ├── deform_roi_pool.py │ │ │ │ │ ├── deprecated_wrappers.py │ │ │ │ │ ├── focal_loss.py │ │ │ │ │ ├── furthest_point_sample.py │ │ │ │ │ ├── fused_bias_leakyrelu.py │ │ │ │ │ ├── gather_points.py │ │ │ │ │ ├── group_points.py │ │ │ │ │ ├── info.py │ │ │ │ │ ├── iou3d.py │ │ │ │ │ ├── knn.py │ │ │ │ │ ├── masked_conv.py │ │ │ │ │ ├── merge_cells.py │ │ │ │ │ ├── modulated_deform_conv.py │ │ │ │ │ ├── multi_scale_deform_attn.py │ │ │ │ │ ├── nms.py │ │ │ │ │ ├── pixel_group.py │ │ │ │ │ ├── point_sample.py │ │ │ │ │ ├── points_in_boxes.py │ │ │ │ │ ├── points_sampler.py │ │ │ │ 
│ ├── psa_mask.py │ │ │ │ │ ├── roi_align.py │ │ │ │ │ ├── roi_align_rotated.py │ │ │ │ │ ├── roi_pool.py │ │ │ │ │ ├── roiaware_pool3d.py │ │ │ │ │ ├── roipoint_pool3d.py │ │ │ │ │ ├── saconv.py │ │ │ │ │ ├── scatter_points.py │ │ │ │ │ ├── sync_bn.py │ │ │ │ │ ├── three_interpolate.py │ │ │ │ │ ├── three_nn.py │ │ │ │ │ ├── tin_shift.py │ │ │ │ │ ├── upfirdn2d.py │ │ │ │ │ └── voxelize.py │ │ │ │ ├── parallel │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── _functions.py │ │ │ │ │ ├── collate.py │ │ │ │ │ ├── data_container.py │ │ │ │ │ ├── data_parallel.py │ │ │ │ │ ├── distributed.py │ │ │ │ │ ├── distributed_deprecated.py │ │ │ │ │ ├── registry.py │ │ │ │ │ ├── scatter_gather.py │ │ │ │ │ └── utils.py │ │ │ │ ├── runner │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base_module.py │ │ │ │ │ ├── base_runner.py │ │ │ │ │ ├── builder.py │ │ │ │ │ ├── checkpoint.py │ │ │ │ │ ├── default_constructor.py │ │ │ │ │ ├── dist_utils.py │ │ │ │ │ ├── epoch_based_runner.py │ │ │ │ │ ├── fp16_utils.py │ │ │ │ │ ├── hooks │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── checkpoint.py │ │ │ │ │ │ ├── closure.py │ │ │ │ │ │ ├── ema.py │ │ │ │ │ │ ├── evaluation.py │ │ │ │ │ │ ├── hook.py │ │ │ │ │ │ ├── iter_timer.py │ │ │ │ │ │ ├── logger │ │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ │ ├── base.py │ │ │ │ │ │ │ ├── dvclive.py │ │ │ │ │ │ │ ├── mlflow.py │ │ │ │ │ │ │ ├── neptune.py │ │ │ │ │ │ │ ├── pavi.py │ │ │ │ │ │ │ ├── tensorboard.py │ │ │ │ │ │ │ ├── text.py │ │ │ │ │ │ │ └── wandb.py │ │ │ │ │ │ ├── lr_updater.py │ │ │ │ │ │ ├── memory.py │ │ │ │ │ │ ├── momentum_updater.py │ │ │ │ │ │ ├── optimizer.py │ │ │ │ │ │ ├── profiler.py │ │ │ │ │ │ ├── sampler_seed.py │ │ │ │ │ │ └── sync_buffer.py │ │ │ │ │ ├── iter_based_runner.py │ │ │ │ │ ├── log_buffer.py │ │ │ │ │ ├── optimizer │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── builder.py │ │ │ │ │ │ └── default_constructor.py │ │ │ │ │ ├── priority.py │ │ │ │ │ └── utils.py │ │ │ │ ├── utils │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── config.py │ │ │ │ │ ├── env.py │ │ │ │ │ ├── ext_loader.py │ │ │ │ │ ├── logging.py │ │ │ │ │ ├── misc.py │ │ │ │ │ ├── parrots_jit.py │ │ │ │ │ ├── parrots_wrapper.py │ │ │ │ │ ├── path.py │ │ │ │ │ ├── progressbar.py │ │ │ │ │ ├── registry.py │ │ │ │ │ ├── testing.py │ │ │ │ │ ├── timer.py │ │ │ │ │ ├── trace.py │ │ │ │ │ └── version_utils.py │ │ │ │ ├── version.py │ │ │ │ ├── video │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── io.py │ │ │ │ │ ├── optflow.py │ │ │ │ │ └── processing.py │ │ │ │ └── visualization │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── color.py │ │ │ │ │ ├── image.py │ │ │ │ │ └── optflow.py │ │ │ ├── mmcv_custom │ │ │ │ ├── __init__.py │ │ │ │ └── checkpoint.py │ │ │ └── mmseg │ │ │ │ ├── apis │ │ │ │ ├── __init__.py │ │ │ │ ├── inference.py │ │ │ │ ├── test.py │ │ │ │ └── train.py │ │ │ │ ├── core │ │ │ │ ├── __init__.py │ │ │ │ ├── evaluation │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── class_names.py │ │ │ │ │ ├── eval_hooks.py │ │ │ │ │ └── metrics.py │ │ │ │ ├── seg │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── builder.py │ │ │ │ │ └── sampler │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── base_pixel_sampler.py │ │ │ │ │ │ └── ohem_pixel_sampler.py │ │ │ │ └── utils │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── misc.py │ │ │ │ ├── datasets │ │ │ │ ├── __init__.py │ │ │ │ ├── ade.py │ │ │ │ ├── builder.py │ │ │ │ ├── chase_db1.py │ │ │ │ ├── cityscapes.py │ │ │ │ ├── custom.py │ │ │ │ ├── dataset_wrappers.py │ │ │ │ ├── drive.py │ │ │ │ ├── hrf.py │ │ │ │ ├── pascal_context.py │ │ │ │ ├── pipelines │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── compose.py │ 
│ │ │ │ ├── formating.py │ │ │ │ │ ├── loading.py │ │ │ │ │ ├── test_time_aug.py │ │ │ │ │ └── transforms.py │ │ │ │ ├── stare.py │ │ │ │ └── voc.py │ │ │ │ ├── models │ │ │ │ ├── __init__.py │ │ │ │ ├── backbones │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── cgnet.py │ │ │ │ │ ├── fast_scnn.py │ │ │ │ │ ├── hrnet.py │ │ │ │ │ ├── mobilenet_v2.py │ │ │ │ │ ├── mobilenet_v3.py │ │ │ │ │ ├── resnest.py │ │ │ │ │ ├── resnet.py │ │ │ │ │ ├── resnext.py │ │ │ │ │ ├── unet.py │ │ │ │ │ ├── uniformer.py │ │ │ │ │ └── vit.py │ │ │ │ ├── builder.py │ │ │ │ ├── decode_heads │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── ann_head.py │ │ │ │ │ ├── apc_head.py │ │ │ │ │ ├── aspp_head.py │ │ │ │ │ ├── cascade_decode_head.py │ │ │ │ │ ├── cc_head.py │ │ │ │ │ ├── da_head.py │ │ │ │ │ ├── decode_head.py │ │ │ │ │ ├── dm_head.py │ │ │ │ │ ├── dnl_head.py │ │ │ │ │ ├── ema_head.py │ │ │ │ │ ├── enc_head.py │ │ │ │ │ ├── fcn_head.py │ │ │ │ │ ├── fpn_head.py │ │ │ │ │ ├── gc_head.py │ │ │ │ │ ├── lraspp_head.py │ │ │ │ │ ├── nl_head.py │ │ │ │ │ ├── ocr_head.py │ │ │ │ │ ├── point_head.py │ │ │ │ │ ├── psa_head.py │ │ │ │ │ ├── psp_head.py │ │ │ │ │ ├── sep_aspp_head.py │ │ │ │ │ ├── sep_fcn_head.py │ │ │ │ │ └── uper_head.py │ │ │ │ ├── losses │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── accuracy.py │ │ │ │ │ ├── cross_entropy_loss.py │ │ │ │ │ ├── dice_loss.py │ │ │ │ │ ├── lovasz_loss.py │ │ │ │ │ └── utils.py │ │ │ │ ├── necks │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── fpn.py │ │ │ │ │ └── multilevel_neck.py │ │ │ │ ├── segmentors │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── cascade_encoder_decoder.py │ │ │ │ │ └── encoder_decoder.py │ │ │ │ └── utils │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── drop.py │ │ │ │ │ ├── inverted_residual.py │ │ │ │ │ ├── make_divisible.py │ │ │ │ │ ├── res_layer.py │ │ │ │ │ ├── se_layer.py │ │ │ │ │ ├── self_attention_block.py │ │ │ │ │ ├── up_conv_block.py │ │ │ │ │ └── weight_init.py │ │ │ │ ├── ops │ │ │ │ ├── __init__.py │ │ │ │ ├── encoding.py │ │ │ │ └── wrappers.py │ │ │ │ └── utils │ │ │ │ ├── __init__.py │ │ │ │ ├── collect_env.py │ │ │ │ └── logger.py │ │ └── util.py │ ├── config.py │ ├── dist_utils.py │ ├── gradcam.py │ ├── logger.py │ ├── optims.py │ ├── registry.py │ ├── utils.py │ └── vqa_tools │ │ ├── __init__.py │ │ ├── vqa.py │ │ └── vqa_eval.py ├── configs │ ├── datasets │ │ ├── aokvqa │ │ │ ├── defaults.yaml │ │ │ └── defaults_instruct.yaml │ │ ├── audiocaps │ │ │ ├── defaults_mm_cap.yaml │ │ │ ├── defaults_mm_cap_instruct.yaml │ │ │ └── defaults_mm_qa.yaml │ │ ├── audioset │ │ │ ├── defaults_mm_cap.yaml │ │ │ └── defaults_mm_cap_instruct.yaml │ │ ├── avsd │ │ │ ├── defaults_dial.yaml │ │ │ └── defaults_mm_dial_instruct.yaml │ │ ├── blip_diffusion_datasets │ │ │ └── defaults.yaml │ │ ├── capfilt14m │ │ │ ├── defaults_cap.yaml │ │ │ └── defaults_cap_instruct.yaml │ │ ├── charade │ │ │ ├── defaults_cap.yaml │ │ │ └── defaults_cap_instruct.yaml │ │ ├── clotho │ │ │ ├── defaults_mm_cap.yaml │ │ │ ├── defaults_mm_cap_instruct.yaml │ │ │ └── defaults_mm_qa.yaml │ │ ├── coco │ │ │ ├── defaults_cap.yaml │ │ │ ├── defaults_cap_instruct.yaml │ │ │ ├── defaults_ret.yaml │ │ │ ├── defaults_vqa.yaml │ │ │ ├── defaults_vqa_instruct.yaml │ │ │ └── eval_vqa.yaml │ │ ├── coin │ │ │ ├── defaults_cap.yaml │ │ │ └── defaults_cap_instruct.yaml │ │ ├── conceptual_caption │ │ │ ├── defaults_12m.yaml │ │ │ ├── defaults_12m_instruct.yaml │ │ │ ├── defaults_3m.yaml │ │ │ └── defaults_3m_instruct.yaml │ │ ├── didemo │ │ │ └── defaults_ret.yaml │ │ ├── discriminatory_reasoning │ │ │ ├── 
defaults_mm_audio_video.yaml │ │ │ ├── defaults_mm_image_pc.yaml │ │ │ └── discriminatory_dataset │ │ │ │ ├── audiocaps_discrn.json │ │ │ │ └── objaverse_discrn.json │ │ ├── esc50 │ │ │ └── defaults_mm_cls.yaml │ │ ├── flickr30k │ │ │ ├── defaults.yaml │ │ │ ├── defaults_cap.yaml │ │ │ └── defaults_cap_instruct.yaml │ │ ├── gqa │ │ │ ├── balanced_testdev.yaml │ │ │ ├── balanced_testdev_instruct.yaml │ │ │ ├── balanced_val.yaml │ │ │ ├── balanced_val_instruct.yaml │ │ │ ├── defaults.yaml │ │ │ └── defaults_instruct.yaml │ │ ├── iconqa │ │ │ ├── defaults.yaml │ │ │ └── defaults_instruct.yaml │ │ ├── imagenet │ │ │ └── defaults.yaml │ │ ├── laion │ │ │ ├── defaults_2B_multi.yaml │ │ │ ├── defaults_400M.yaml │ │ │ └── defaults_400M_instruct.yaml │ │ ├── llava150k │ │ │ └── defaults_dial.yaml │ │ ├── modelnet40 │ │ │ └── defaults_cls.yaml │ │ ├── msrvtt │ │ │ ├── defaults_cap.yaml │ │ │ ├── defaults_cap_instruct.yaml │ │ │ ├── defaults_qa.yaml │ │ │ ├── defaults_qa_instruct.yaml │ │ │ └── defaults_ret.yaml │ │ ├── msvd │ │ │ ├── defaults_cap.yaml │ │ │ ├── defaults_cap_instruct.yaml │ │ │ ├── defaults_qa.yaml │ │ │ └── defaults_qa_instruct.yaml │ │ ├── music_avqa │ │ │ ├── defaults_mm_qa.yaml │ │ │ └── defaults_mm_qa_instruct.yaml │ │ ├── nlvr │ │ │ └── defaults.yaml │ │ ├── nocaps │ │ │ └── defaults.yaml │ │ ├── objaverse │ │ │ ├── defaults_mm_cap.yaml │ │ │ ├── defaults_mm_cap_instruct.yaml │ │ │ └── defaults_mm_qa.yaml │ │ ├── ocrvqa │ │ │ ├── defaults.yaml │ │ │ └── defaults_instruct.yaml │ │ ├── okvqa │ │ │ ├── defaults.yaml │ │ │ └── defaults_instruct.yaml │ │ ├── sbu_caption │ │ │ ├── defaults.yaml │ │ │ └── defaults_instruct.yaml │ │ ├── scienceqa │ │ │ ├── defaults.yaml │ │ │ └── defaults_instruct.yaml │ │ ├── shapenet │ │ │ ├── defaults_mm_cap.yaml │ │ │ └── defaults_mm_cap_instruct.yaml │ │ ├── snli_ve │ │ │ ├── defaults.yaml │ │ │ └── defaults_instruct.yaml │ │ ├── textcaps │ │ │ ├── defaults.yaml │ │ │ └── defaults_instruct.yaml │ │ ├── valor │ │ │ ├── defaults_mm_cap.yaml │ │ │ └── defaults_mm_cap_instruct.yaml │ │ ├── vatex │ │ │ ├── defaults_cap.yaml │ │ │ └── defaults_cap_instruct.yaml │ │ ├── vg │ │ │ ├── defaults_caption.yaml │ │ │ ├── defaults_caption_instruct.yaml │ │ │ ├── defaults_vqa.yaml │ │ │ └── defaults_vqa_instruct.yaml │ │ ├── violin │ │ │ ├── defaults_cap.yaml │ │ │ ├── defaults_cap_instruct.yaml │ │ │ ├── defaults_entail.yaml │ │ │ └── defaults_entail_instruct.yaml │ │ ├── visdial │ │ │ ├── defaults_dial.yaml │ │ │ └── defaults_dial_instruct.yaml │ │ ├── vizwiz │ │ │ └── defaults.yaml │ │ ├── vlep │ │ │ ├── defaults_cap.yaml │ │ │ └── defaults_cap_instruct.yaml │ │ ├── vsr │ │ │ ├── defaults.yaml │ │ │ ├── defaults_classification.yaml │ │ │ ├── defaults_classification_instruct.yaml │ │ │ └── defaults_instruct.yaml │ │ ├── wavcaps │ │ │ ├── defaults_mm_cap.yaml │ │ │ └── defaults_mm_cap_instruct.yaml │ │ ├── webvid │ │ │ ├── defaults_cap.yaml │ │ │ └── defaults_cap_instruct.yaml │ │ ├── youcook │ │ │ ├── defaults_cap.yaml │ │ │ └── defaults_cap_instruct.yaml │ │ └── yt8m │ │ │ └── defaults_mm_dial.yaml │ ├── default.yaml │ └── models │ │ ├── albef_classification_ve.yaml │ │ ├── albef_feature_extractor.yaml │ │ ├── albef_nlvr.yaml │ │ ├── albef_pretrain_base.yaml │ │ ├── albef_retrieval_coco.yaml │ │ ├── albef_retrieval_flickr.yaml │ │ ├── albef_vqav2.yaml │ │ ├── alpro_qa_msrvtt.yaml │ │ ├── alpro_qa_msvd.yaml │ │ ├── alpro_retrieval_didemo.yaml │ │ ├── alpro_retrieval_msrvtt.yaml │ │ ├── bert_config.json │ │ ├── bert_config_alpro.json │ │ ├── blip-diffusion │ │ 
├── blip_diffusion_base.yaml │ │ ├── blip_diffusion_controlnet_canny.yaml │ │ ├── blip_diffusion_controlnet_depth.yaml │ │ └── blip_diffusion_controlnet_hed.yaml │ │ ├── blip2 │ │ ├── blip2_caption_flant5xl.yaml │ │ ├── blip2_caption_opt2.7b.yaml │ │ ├── blip2_caption_opt6.7b.yaml │ │ ├── blip2_coco.yaml │ │ ├── blip2_instruct_flant5xl.yaml │ │ ├── blip2_instruct_flant5xxl.yaml │ │ ├── blip2_instruct_vicuna13b.yaml │ │ ├── blip2_instruct_vicuna7b.yaml │ │ ├── blip2_pretrain.yaml │ │ ├── blip2_pretrain_flant5xl.yaml │ │ ├── blip2_pretrain_flant5xl_vitL.yaml │ │ ├── blip2_pretrain_flant5xxl.yaml │ │ ├── blip2_pretrain_llama7b.yaml │ │ ├── blip2_pretrain_opt2.7b.yaml │ │ ├── blip2_pretrain_opt6.7b.yaml │ │ ├── blip2_pretrain_vitL.yaml │ │ ├── blip2_xinstruct_vicuna13b.yaml │ │ └── blip2_xinstruct_vicuna7b.yaml │ │ ├── blip_caption_base_coco.yaml │ │ ├── blip_caption_large_coco.yaml │ │ ├── blip_classification_base.yaml │ │ ├── blip_feature_extractor_base.yaml │ │ ├── blip_itm_base.yaml │ │ ├── blip_itm_large.yaml │ │ ├── blip_nlvr.yaml │ │ ├── blip_pretrain_base.yaml │ │ ├── blip_pretrain_large.yaml │ │ ├── blip_retrieval_coco.yaml │ │ ├── blip_retrieval_flickr.yaml │ │ ├── blip_vqa_aokvqa.yaml │ │ ├── blip_vqa_okvqa.yaml │ │ ├── blip_vqav2.yaml │ │ ├── clip │ │ ├── RN101-quickgelu.json │ │ ├── RN101.json │ │ ├── RN50-quickgelu.json │ │ ├── RN50.json │ │ ├── RN50x16.json │ │ ├── RN50x4.json │ │ ├── ViT-B-16-plus-240.json │ │ ├── ViT-B-16-plus.json │ │ ├── ViT-B-16.json │ │ ├── ViT-B-32-plus-256.json │ │ ├── ViT-B-32-quickgelu.json │ │ ├── ViT-B-32.json │ │ ├── ViT-H-14.json │ │ ├── ViT-H-16.json │ │ ├── ViT-L-14-280.json │ │ ├── ViT-L-14-336.json │ │ ├── ViT-L-14.json │ │ ├── ViT-L-16-320.json │ │ ├── ViT-L-16.json │ │ ├── ViT-g-14.json │ │ ├── timm-efficientnetv2_rw_s.json │ │ ├── timm-resnet50d.json │ │ ├── timm-resnetaa50d.json │ │ ├── timm-resnetblur50.json │ │ ├── timm-swin_base_patch4_window7_224.json │ │ ├── timm-vit_base_patch16_224.json │ │ ├── timm-vit_base_patch32_224.json │ │ └── timm-vit_small_patch16_224.json │ │ ├── clip_resnet50.yaml │ │ ├── clip_vit_base16.yaml │ │ ├── clip_vit_base32.yaml │ │ ├── clip_vit_large14.yaml │ │ ├── clip_vit_large14_336.yaml │ │ ├── gpt_dialogue_base.yaml │ │ ├── img2prompt-vqa │ │ └── img2prompt_vqa_base.yaml │ │ ├── med_config.json │ │ ├── med_config_albef.json │ │ ├── med_large_config.json │ │ └── pnp-vqa │ │ ├── pnp_vqa_3b.yaml │ │ ├── pnp_vqa_base.yaml │ │ ├── pnp_vqa_large.yaml │ │ ├── unifiedqav2_3b_config.json │ │ ├── unifiedqav2_base_config.json │ │ └── unifiedqav2_large_config.json ├── datasets │ ├── builders │ │ ├── __init__.py │ │ ├── audio_caption_builder.py │ │ ├── audio_qa_builder.py │ │ ├── base_dataset_builder.py │ │ ├── caption_builder.py │ │ ├── classification_builder.py │ │ ├── dialogue_builder.py │ │ ├── discrn_builders.py │ │ ├── image_text_pair_builder.py │ │ ├── imagefolder_builder.py │ │ ├── object3d_caption_builder.py │ │ ├── object3d_classification_builder.py │ │ ├── object3d_qa_builder.py │ │ ├── retrieval_builder.py │ │ ├── text_to_image_generation_builder.py │ │ ├── video_qa_builder.py │ │ └── vqa_builder.py │ ├── data_utils.py │ ├── datasets │ │ ├── aok_vqa_datasets.py │ │ ├── audio_captioning_datasets.py │ │ ├── audio_classification_datasets.py │ │ ├── audio_qa_datasets.py │ │ ├── avsd_dialogue_datasets.py │ │ ├── base_dataset.py │ │ ├── capfilt_dataset.py │ │ ├── caption_datasets.py │ │ ├── coco_caption_datasets.py │ │ ├── coco_vqa_datasets.py │ │ ├── dataloader_utils.py │ │ ├── dialogue_datasets.py │ │ ├── 
discriminatory_reasoning_datasets.py │ │ ├── gqa_datasets.py │ │ ├── iconqa_datasets.py │ │ ├── image_text_pair_datasets.py │ │ ├── imagefolder_dataset.py │ │ ├── laion_dataset.py │ │ ├── llava150k_dataset.py │ │ ├── multimodal_classification_datasets.py │ │ ├── music_avqa.py │ │ ├── nlvr_datasets.py │ │ ├── object3d_captioning_datasets.py │ │ ├── object3d_classification_datasets.py │ │ ├── object3d_qa_datasets.py │ │ ├── ocr_datasets.py │ │ ├── retrieval_datasets.py │ │ ├── snli_ve_datasets.py │ │ ├── subject_driven_t2i_dataset.py │ │ ├── textcaps_datasets.py │ │ ├── valor_caption.py │ │ ├── vatex_captioning_datasets.py │ │ ├── vg_vqa_datasets.py │ │ ├── video_caption_datasets.py │ │ ├── video_vqa_datasets.py │ │ ├── violin_dataset.py │ │ ├── visdial_dialogue_datasets.py │ │ ├── vizwiz_vqa_datasets.py │ │ ├── vlep_dataset.py │ │ ├── vqa_datasets.py │ │ ├── vsr_datasets.py │ │ └── yt8m_video_dialogue_datasets.py │ └── download_scripts │ │ ├── DownloadConceptualCaptions │ │ ├── LICENSE │ │ ├── README.md │ │ ├── create_annotation_12m.ipynb │ │ ├── create_annotation_3m.ipynb │ │ ├── download_data_cc12m.py │ │ └── download_data_cc3m.py │ │ ├── download_charade.py │ │ ├── download_coco.py │ │ ├── download_coin.py │ │ ├── download_didemo.py │ │ ├── download_flickr.py │ │ ├── download_gqa.py │ │ ├── download_iconqa.py │ │ ├── download_msrvtt.py │ │ ├── download_msvd.py │ │ ├── download_nocaps.py │ │ ├── download_sbu.py │ │ ├── download_vg.py │ │ └── download_violin.py ├── models │ ├── __init__.py │ ├── albef_models │ │ ├── __init__.py │ │ ├── albef_classification.py │ │ ├── albef_feature_extractor.py │ │ ├── albef_nlvr.py │ │ ├── albef_outputs.py │ │ ├── albef_pretrain.py │ │ ├── albef_retrieval.py │ │ └── albef_vqa.py │ ├── alpro_models │ │ ├── __init__.py │ │ ├── alpro_outputs.py │ │ ├── alpro_qa.py │ │ └── alpro_retrieval.py │ ├── base_model.py │ ├── beats │ │ ├── BEATs.py │ │ ├── LICENSE_BEATs.txt │ │ ├── README.md │ │ ├── Tokenizers.py │ │ ├── backbone.py │ │ ├── modules.py │ │ └── quantizer.py │ ├── beats_encoder.py │ ├── blip2_models │ │ ├── Qformer.py │ │ ├── __init__.py │ │ ├── blip2.py │ │ ├── blip2_image_text_matching.py │ │ ├── blip2_opt.py │ │ ├── blip2_qformer.py │ │ ├── blip2_t5.py │ │ ├── blip2_t5_instruct.py │ │ ├── blip2_vicuna_instruct.py │ │ ├── blip2_vicuna_xinstruct.py │ │ ├── modeling_llama.py │ │ ├── modeling_opt.py │ │ └── modeling_t5.py │ ├── blip_diffusion_models │ │ ├── __init__.py │ │ ├── blip_diffusion.py │ │ ├── modeling_ctx_clip.py │ │ ├── ptp_utils.py │ │ └── utils.py │ ├── blip_models │ │ ├── __init__.py │ │ ├── blip.py │ │ ├── blip_caption.py │ │ ├── blip_classification.py │ │ ├── blip_feature_extractor.py │ │ ├── blip_image_text_matching.py │ │ ├── blip_nlvr.py │ │ ├── blip_outputs.py │ │ ├── blip_pretrain.py │ │ ├── blip_retrieval.py │ │ ├── blip_vqa.py │ │ └── nlvr_encoder.py │ ├── clip_models │ │ ├── __init__.py │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ ├── clip_outputs.py │ │ ├── loss.py │ │ ├── model.py │ │ ├── pics │ │ │ └── CLIP.png │ │ ├── pretrained.py │ │ ├── timm_model.py │ │ ├── tokenizer.py │ │ ├── transform.py │ │ └── utils.py │ ├── clip_vit.py │ ├── eva_vit.py │ ├── gpt_models │ │ └── gpt_dialogue.py │ ├── img2prompt_models │ │ ├── __init__.py │ │ └── img2prompt_vqa.py │ ├── med.py │ ├── pnp_vqa_models │ │ ├── __init__.py │ │ ├── pnp_unifiedqav2_fid.py │ │ └── pnp_vqa.py │ ├── timesformer │ │ ├── __init__.py │ │ ├── conv2d_same.py │ │ ├── features.py │ │ ├── helpers.py │ │ ├── linear.py │ │ ├── vit.py │ │ └── vit_utils.py │ ├── ulip_models │ │ ├── 
ULIP_models.py │ │ ├── losses.py │ │ ├── pointbert │ │ │ ├── PointTransformer_8192point.yaml │ │ │ ├── checkpoint.py │ │ │ ├── dvae.py │ │ │ ├── logger.py │ │ │ ├── misc.py │ │ │ └── point_encoder.py │ │ ├── ulip_scaled_up_config.yaml │ │ └── utils │ │ │ ├── __init__.py │ │ │ ├── build.py │ │ │ ├── config.py │ │ │ ├── io.py │ │ │ ├── logger.py │ │ │ ├── registry.py │ │ │ ├── tokenizer.py │ │ │ └── utils.py │ └── vit.py ├── processors │ ├── __init__.py │ ├── alpro_processors.py │ ├── audio_processors.py │ ├── base_processor.py │ ├── blip_diffusion_processors.py │ ├── blip_processors.py │ ├── clip_processors.py │ ├── functional_video.py │ ├── gpt_processors.py │ ├── instruction_text_processors.py │ ├── randaugment.py │ ├── transforms_video.py │ └── ulip_processors.py ├── projects │ ├── albef │ │ ├── eval │ │ │ ├── nlvr_eval.yaml │ │ │ ├── ret_coco_eval.yaml │ │ │ ├── ret_flickr30k_eval.yaml │ │ │ ├── snli_ve_eval.yaml │ │ │ ├── vqa_test.yaml │ │ │ └── vqa_val.yaml │ │ └── train │ │ │ ├── aokvqa_ft.yaml │ │ │ ├── nlvr_ft.yaml │ │ │ ├── okvqa_ft.yaml │ │ │ ├── pretrain.yaml │ │ │ ├── ret_coco_ft.yaml │ │ │ ├── ret_flickr30k_ft.yaml │ │ │ ├── snli_ve_ft.yaml │ │ │ └── vqa_ft.yaml │ ├── alpro │ │ ├── eval │ │ │ ├── didemo_ret_eval.yaml │ │ │ ├── msrvtt_qa_eval.yaml │ │ │ ├── msrvtt_ret_eval.yaml │ │ │ └── msvd_qa_eval.yaml │ │ └── train │ │ │ ├── didemo_ret_ft.yaml │ │ │ ├── msrvtt_qa_ft.yaml │ │ │ ├── msrvtt_retrieval_ft.yaml │ │ │ └── msvd_qa_ft.yaml │ ├── blip │ │ ├── coco_cap_ft_iter.yaml │ │ ├── eval │ │ │ ├── aokvqa_eval.yaml │ │ │ ├── caption_coco_eval.yaml │ │ │ ├── caption_coco_eval_large.yaml │ │ │ ├── nlvr_eval.yaml │ │ │ ├── nocaps_eval.yaml │ │ │ ├── okvqa_eval.yaml │ │ │ ├── ret_coco_eval.yaml │ │ │ ├── ret_flickr_eval.yaml │ │ │ └── vqav2_eval.yaml │ │ └── train │ │ │ ├── aokvqa_ft.yaml │ │ │ ├── caption_coco_ft.yaml │ │ │ ├── caption_coco_large_ft.yaml │ │ │ ├── nlvr_ft.yaml │ │ │ ├── okvqa_ft.yaml │ │ │ ├── pretrain_14m.yaml │ │ │ ├── retrieval_coco_ft.yaml │ │ │ ├── retrieval_flickr_ft.yaml │ │ │ └── vqav2_ft.yaml │ ├── blip2 │ │ ├── eval │ │ │ ├── caption_coco_flant5xl_eval.yaml │ │ │ ├── caption_coco_opt2.7b_eval.yaml │ │ │ ├── caption_coco_opt6.7b_eval.yaml │ │ │ ├── caption_nocaps_out_domain_flant5xl_eval.yaml │ │ │ ├── caption_nocaps_out_domain_flant5xxl_eval.yaml │ │ │ ├── gqa_zeroshot_flant5xl_eval.yaml │ │ │ ├── okvqa_zeroshot_flant5xl_eval.yaml │ │ │ ├── ret_coco_eval.yaml │ │ │ ├── ret_flickr_eval.yaml │ │ │ ├── vqav2_zeroshot_flant5xl_eval.yaml │ │ │ └── vqav2_zeroshot_opt_eval.yaml │ │ └── train │ │ │ ├── caption_coco_ft.yaml │ │ │ ├── pretrain_stage1.yaml │ │ │ ├── pretrain_stage2.yaml │ │ │ └── retrieval_coco_ft.yaml │ ├── blip_diffusion │ │ ├── finetune-db-dog.yaml │ │ ├── finetune-db-pink-dress.yaml │ │ ├── finetune-db-shein-jacket.yaml │ │ └── finetune-db-template.yaml │ ├── clip │ │ ├── exp_coco_ret_eval.yaml │ │ ├── exp_flickr_ret_eval.yaml │ │ └── exp_imnet_zs_eval.yaml │ ├── gpt │ │ ├── eval │ │ │ └── dialogue_avsd_eval.yaml │ │ └── train │ │ │ └── dialogue_avsd_ft.yaml │ ├── instructblip │ │ ├── caption_coco_flant5xl_eval_test.yaml │ │ ├── caption_coco_flant5xl_eval_val.yaml │ │ ├── caption_coco_flant5xxl_eval_test.yaml │ │ ├── caption_coco_flant5xxl_eval_val.yaml │ │ ├── caption_coco_vicuna13b_eval_test.yaml │ │ ├── caption_coco_vicuna13b_eval_val.yaml │ │ ├── caption_coco_vicuna7b_eval_test.yaml │ │ ├── caption_coco_vicuna7b_eval_val.yaml │ │ ├── caption_msrvtt_flant5xl_eval_test.yaml │ │ ├── caption_msrvtt_flant5xl_eval_val.yaml │ │ ├── 
caption_msrvtt_flant5xxl_eval_test.yaml │ │ ├── caption_msrvtt_flant5xxl_eval_val.yaml │ │ ├── caption_msrvtt_vicuna13b_eval_test.yaml │ │ ├── caption_msrvtt_vicuna13b_eval_val.yaml │ │ ├── caption_msrvtt_vicuna7b_eval_test.yaml │ │ ├── caption_msrvtt_vicuna7b_eval_val.yaml │ │ ├── caption_msvd_flant5xl_eval.yaml │ │ ├── caption_msvd_flant5xxl_eval.yaml │ │ ├── caption_msvd_vicuna13b_eval.yaml │ │ ├── caption_msvd_vicuna7b_eval.yaml │ │ ├── caption_nocaps_out_domain_flant5xl_eval.yaml │ │ ├── caption_nocaps_out_domain_flant5xxl_eval.yaml │ │ ├── caption_nocaps_out_domain_vicuna13b_eval.yaml │ │ ├── caption_nocaps_out_domain_vicuna7b_eval.yaml │ │ ├── caption_vatex_flant5xl_eval.yaml │ │ ├── caption_vatex_flant5xxl_eval.yaml │ │ ├── caption_vatex_vicuna13b_eval.yaml │ │ ├── caption_vatex_vicuna7b_eval.yaml │ │ ├── classification_modelnet40_vicuna13b.yaml │ │ ├── classification_modelnet40_vicuna7b.yaml │ │ ├── classification_snlive_flant5xl.yaml │ │ ├── classification_snlive_flant5xxl.yaml │ │ ├── classification_snlive_vicuna13b.yaml │ │ ├── classification_snlive_vicuna13b_test.yaml │ │ ├── classification_snlive_vicuna7b_test.yaml │ │ ├── classification_snlive_vicuna7b_val.yaml │ │ ├── completion_modelnet40_vicuna13b.yaml │ │ ├── completion_modelnet40_vicuna7b.yaml │ │ ├── qa_msrvtt_flant5xl_eval_test.yaml │ │ ├── qa_msrvtt_flant5xxl_eval_test.yaml │ │ ├── qa_msrvtt_vicuna13b_eval_test.yaml │ │ ├── qa_msrvtt_vicuna7b_eval_test.yaml │ │ ├── qa_msvd_flant5xl_eval.yaml │ │ ├── qa_msvd_flant5xxl_eval.yaml │ │ ├── qa_msvd_vicuna13b_eval.yaml │ │ ├── qa_msvd_vicuna7b_eval.yaml │ │ ├── qa_okvqa_flant5xl_eval.yaml │ │ ├── qa_okvqa_flant5xxl_eval.yaml │ │ ├── qa_okvqa_vicuna13b_eval.yaml │ │ └── qa_okvqa_vicuna7b_eval.yaml │ ├── pnp-vqa │ │ └── eval │ │ │ ├── gqa_eval.yaml │ │ │ ├── gqa_eval_3b.yaml │ │ │ ├── gqa_eval_large.yaml │ │ │ ├── okvqa_eval.yaml │ │ │ ├── okvqa_eval_3b.yaml │ │ │ ├── okvqa_eval_large.yaml │ │ │ ├── vqav2_eval.yaml │ │ │ ├── vqav2_eval_3b.yaml │ │ │ ├── vqav2_eval_large.yaml │ │ │ ├── vqav2_test_eval.yaml │ │ │ ├── vqav2_test_eval_3b.yaml │ │ │ └── vqav2_test_eval_large.yaml │ └── xinstruct_blip │ │ ├── eval │ │ ├── discrn │ │ │ ├── audio_video_caption.yaml │ │ │ ├── audio_video_caption_13b.yaml │ │ │ ├── audio_video_describe.yaml │ │ │ ├── audio_video_describe_13b.yaml │ │ │ ├── audio_video_describe_nocue.yaml │ │ │ ├── audio_video_describe_proj copy.yaml │ │ │ ├── audio_video_describe_proj.yaml │ │ │ ├── audio_video_describe_rand_init.yaml │ │ │ ├── image_3d_caption.yaml │ │ │ ├── image_3d_caption_13b.yaml │ │ │ ├── image_3d_describe.yaml │ │ │ ├── image_3d_describe_13b.yaml │ │ │ ├── image_3d_describe_no_init.yaml │ │ │ ├── image_3d_describe_nocue.yaml │ │ │ └── image_3d_describe_proj.yaml │ │ ├── vicuna13b │ │ │ ├── audio │ │ │ │ ├── audiocaps_captioning_qa.yaml │ │ │ │ ├── audiocaps_captioning_test.yaml │ │ │ │ ├── audiocaps_captioning_val.yaml │ │ │ │ ├── clothoQA_captioning.yaml │ │ │ │ ├── clothov1_captioning.yaml │ │ │ │ ├── clothov2_captioning.yaml │ │ │ │ ├── esc50_classification.yaml │ │ │ │ └── esc50_classification_completion.yaml │ │ │ ├── crossmodal │ │ │ │ ├── musicavqa │ │ │ │ │ ├── musicavqa_audio_eval.yaml │ │ │ │ │ ├── musicavqa_joint_eval.yaml │ │ │ │ │ └── musicavqa_video_eval.yaml │ │ │ │ └── vatex │ │ │ │ │ ├── vatex_audio_captioning.yaml │ │ │ │ │ ├── vatex_captioning.yaml │ │ │ │ │ ├── vatex_joint_captioning.yaml │ │ │ │ │ └── vatex_joint_captioning_interleave.yaml │ │ │ ├── image │ │ │ │ ├── coco_captioning_test.yaml │ │ │ │ ├── 
coco_captioning_val.yaml │ │ │ │ ├── flickr30k_captioning.yaml │ │ │ │ ├── gqa_qa.yaml │ │ │ │ ├── nocaps_captioning.yaml │ │ │ │ ├── nocaps_out_domain_captioning.yaml │ │ │ │ ├── okvqa_qa.yaml │ │ │ │ ├── snlive_classification_test.yaml │ │ │ │ ├── snlive_classification_val.yaml │ │ │ │ └── vizwiz_qa.yaml │ │ │ ├── image_with_coco │ │ │ │ ├── coco_captioning_test.yaml │ │ │ │ ├── coco_captioning_val.yaml │ │ │ │ ├── flickr30k_captioning.yaml │ │ │ │ ├── gqa_qa.yaml │ │ │ │ ├── nocaps_captioning.yaml │ │ │ │ ├── nocaps_out_domain_captioning.yaml │ │ │ │ ├── okvqa_qa.yaml │ │ │ │ ├── snlive_classification_test.yaml │ │ │ │ ├── snlive_classification_val.yaml │ │ │ │ └── vizwiz_qa.yaml │ │ │ ├── pc │ │ │ │ ├── modelnet40_classification.yaml │ │ │ │ ├── modelnet40_completion.yaml │ │ │ │ ├── objaverse_captioning.yaml │ │ │ │ └── objaverse_qa.yaml │ │ │ ├── video │ │ │ │ ├── msrvtt_captioning.yaml │ │ │ │ ├── msrvtt_captioning_test.yaml │ │ │ │ ├── msrvtt_captioning_val.yaml │ │ │ │ ├── msrvtt_qa_test.yaml │ │ │ │ ├── msrvtt_qa_val.yaml │ │ │ │ ├── msvd_captioning.yaml │ │ │ │ ├── msvd_qa.yaml │ │ │ │ ├── vatex_audio_captioning.yaml │ │ │ │ ├── vatex_captioning.yaml │ │ │ │ ├── vatex_joint_captioning.yaml │ │ │ │ └── vatex_joint_captioning_interleave.yaml │ │ │ └── video_image │ │ │ │ ├── msvd_captioning.yaml │ │ │ │ ├── msvd_qa.yaml │ │ │ │ └── vatex_captioning.yaml │ │ ├── vicuna7b │ │ │ ├── audio │ │ │ │ ├── audiocaps_captioning_qa.yaml │ │ │ │ ├── audiocaps_captioning_test.yaml │ │ │ │ ├── audiocaps_captioning_val.yaml │ │ │ │ ├── clothoQA_captioning.yaml │ │ │ │ ├── clothov1_captioning.yaml │ │ │ │ ├── clothov2_captioning.yaml │ │ │ │ ├── esc50_classification.yaml │ │ │ │ └── esc50_classification_completion.yaml │ │ │ ├── audio_no_init │ │ │ │ ├── audiocaps_captioning_qa.yaml │ │ │ │ ├── audiocaps_captioning_test.yaml │ │ │ │ ├── audiocaps_captioning_val.yaml │ │ │ │ ├── clothoQA_captioning.yaml │ │ │ │ ├── clothov1_captioning.yaml │ │ │ │ ├── clothov2_captioning.yaml │ │ │ │ ├── esc50_classification.yaml │ │ │ │ └── esc50_classification_completion.yaml │ │ │ ├── audio_projection_only │ │ │ │ ├── audiocaps_captioning_qa.yaml │ │ │ │ ├── audiocaps_captioning_test.yaml │ │ │ │ ├── audiocaps_captioning_val.yaml │ │ │ │ ├── clothoQA_captioning.yaml │ │ │ │ ├── clothov1_captioning.yaml │ │ │ │ ├── clothov2_captioning.yaml │ │ │ │ ├── esc50_classification.yaml │ │ │ │ └── esc50_classification_completion.yaml │ │ │ ├── audio_projection_only_nocue │ │ │ │ ├── audiocaps_captioning_qa.yaml │ │ │ │ ├── audiocaps_captioning_test.yaml │ │ │ │ ├── audiocaps_captioning_val.yaml │ │ │ │ ├── clothoQA_captioning.yaml │ │ │ │ ├── clothov1_captioning.yaml │ │ │ │ ├── clothov2_captioning.yaml │ │ │ │ ├── esc50_classification.yaml │ │ │ │ └── esc50_classification_completion.yaml │ │ │ ├── crossmodal │ │ │ │ ├── musicavqa │ │ │ │ │ ├── musicavqa_audio_eval.yaml │ │ │ │ │ ├── musicavqa_joint_eval.yaml │ │ │ │ │ └── musicavqa_video_eval.yaml │ │ │ │ └── vatex │ │ │ │ │ ├── vatex_audio_captioning.yaml │ │ │ │ │ ├── vatex_captioning.yaml │ │ │ │ │ ├── vatex_joint_captioning.yaml │ │ │ │ │ └── vatex_joint_captioning_interleave.yaml │ │ │ ├── image │ │ │ │ ├── coco_captioning_test.yaml │ │ │ │ ├── coco_captioning_val.yaml │ │ │ │ ├── flickr30k_captioning.yaml │ │ │ │ ├── gqa_qa.yaml │ │ │ │ ├── gqa_qa_val.yaml │ │ │ │ ├── nocaps_captioning.yaml │ │ │ │ ├── nocaps_out_domain_captioning.yaml │ │ │ │ ├── okvqa_qa.yaml │ │ │ │ ├── snlive_classification_test.yaml │ │ │ │ ├── snlive_classification_val.yaml │ │ │ │ └── 
vizwiz_qa.yaml │ │ │ ├── image_full_init │ │ │ │ ├── coco_captioning_test.yaml │ │ │ │ ├── coco_captioning_val.yaml │ │ │ │ ├── flickr30k_captioning.yaml │ │ │ │ ├── gqa_qa.yaml │ │ │ │ ├── gqa_qa_val.yaml │ │ │ │ ├── nocaps_captioning.yaml │ │ │ │ ├── nocaps_out_domain_captioning.yaml │ │ │ │ ├── okvqa_qa.yaml │ │ │ │ ├── snlive_classification_test.yaml │ │ │ │ ├── snlive_classification_val.yaml │ │ │ │ └── vizwiz_qa.yaml │ │ │ ├── image_no_init │ │ │ │ ├── coco_captioning_test.yaml │ │ │ │ ├── coco_captioning_val.yaml │ │ │ │ ├── flickr30k_captioning.yaml │ │ │ │ ├── gqa_qa.yaml │ │ │ │ ├── gqa_qa_val.yaml │ │ │ │ ├── nocaps_captioning.yaml │ │ │ │ ├── nocaps_out_domain_captioning.yaml │ │ │ │ ├── okvqa_qa.yaml │ │ │ │ ├── snlive_classification_test.yaml │ │ │ │ ├── snlive_classification_val.yaml │ │ │ │ └── vizwiz_qa.yaml │ │ │ ├── image_pre_coco │ │ │ │ ├── coco_captioning_test.yaml │ │ │ │ ├── coco_captioning_val.yaml │ │ │ │ ├── flickr30k_captioning.yaml │ │ │ │ ├── gqa_qa.yaml │ │ │ │ ├── nocaps_captioning.yaml │ │ │ │ ├── nocaps_out_domain_captioning.yaml │ │ │ │ ├── okvqa_qa.yaml │ │ │ │ ├── snlive_classification_test.yaml │ │ │ │ ├── snlive_classification_val.yaml │ │ │ │ └── vizwiz_qa.yaml │ │ │ ├── image_projection_only │ │ │ │ ├── coco_captioning_test.yaml │ │ │ │ ├── coco_captioning_val.yaml │ │ │ │ ├── flickr30k_captioning.yaml │ │ │ │ ├── gqa_qa.yaml │ │ │ │ ├── gqa_qa_val.yaml │ │ │ │ ├── nocaps_captioning.yaml │ │ │ │ ├── nocaps_out_domain_captioning.yaml │ │ │ │ ├── okvqa_qa.yaml │ │ │ │ ├── snlive_classification_test.yaml │ │ │ │ ├── snlive_classification_val.yaml │ │ │ │ └── vizwiz_qa.yaml │ │ │ ├── pc │ │ │ │ ├── modelnet40_classification.yaml │ │ │ │ ├── modelnet40_completion.yaml │ │ │ │ ├── objaverse_captioning.yaml │ │ │ │ └── objaverse_qa.yaml │ │ │ ├── pc_no_init │ │ │ │ ├── modelnet40_classification.yaml │ │ │ │ ├── modelnet40_completion.yaml │ │ │ │ ├── objaverse_captioning.yaml │ │ │ │ └── objaverse_qa.yaml │ │ │ ├── pc_projection_only │ │ │ │ ├── modelnet40_classification.yaml │ │ │ │ ├── modelnet40_completion.yaml │ │ │ │ ├── objaverse_captioning.yaml │ │ │ │ └── objaverse_qa.yaml │ │ │ ├── pc_ulip1 │ │ │ │ ├── modelnet40_classification.yaml │ │ │ │ ├── modelnet40_completion.yaml │ │ │ │ ├── objaverse_captioning.yaml │ │ │ │ └── objaverse_qa.yaml │ │ │ ├── pc_ulip2_scaled_up │ │ │ │ ├── modelnet40_classification.yaml │ │ │ │ ├── modelnet40_completion.yaml │ │ │ │ ├── objaverse_captioning.yaml │ │ │ │ └── objaverse_qa.yaml │ │ │ ├── pc_ulip_objaverse │ │ │ │ ├── modelnet40_classification.yaml │ │ │ │ ├── modelnet40_completion.yaml │ │ │ │ ├── objaverse_captioning.yaml │ │ │ │ └── objaverse_qa.yaml │ │ │ ├── pc_ulip_objaverse_shapenet │ │ │ │ ├── modelnet40_classification.yaml │ │ │ │ ├── modelnet40_completion.yaml │ │ │ │ ├── objaverse_captioning.yaml │ │ │ │ └── objaverse_qa.yaml │ │ │ ├── pc_ulip_shapenet │ │ │ │ ├── modelnet40_classification.yaml │ │ │ │ ├── modelnet40_completion.yaml │ │ │ │ ├── objaverse_captioning.yaml │ │ │ │ └── objaverse_qa.yaml │ │ │ ├── video │ │ │ │ ├── msrvtt_captioning_test.yaml │ │ │ │ ├── msrvtt_captioning_val.yaml │ │ │ │ ├── msrvtt_qa_test.yaml │ │ │ │ ├── msrvtt_qa_val.yaml │ │ │ │ ├── msvd_captioning.yaml │ │ │ │ ├── msvd_qa.yaml │ │ │ │ └── vatex_captioning.yaml │ │ │ ├── video_image │ │ │ │ ├── msvd_captioning.yaml │ │ │ │ ├── msvd_qa.yaml │ │ │ │ └── vatex_captioning.yaml │ │ │ ├── video_image_pre_coco │ │ │ │ ├── msvd_captioning.yaml │ │ │ │ ├── msvd_qa.yaml │ │ │ │ └── vatex_captioning.yaml │ │ │ └── 
video_no_upsample │ │ │ │ ├── msrvtt_captioning_test.yaml │ │ │ │ ├── msrvtt_captioning_val.yaml │ │ │ │ ├── msrvtt_qa_test.yaml │ │ │ │ ├── msrvtt_qa_val.yaml │ │ │ │ ├── msvd_captioning.yaml │ │ │ │ ├── msvd_captioning_up.yaml │ │ │ │ ├── msvd_qa.yaml │ │ │ │ ├── msvd_qa_up.yaml │ │ │ │ ├── vatex_captioning.yaml │ │ │ │ └── vatex_captioning_up.yaml │ │ └── vicuna7b_nocue │ │ │ ├── audio │ │ │ ├── audiocaps_captioning_qa.yaml │ │ │ ├── audiocaps_captioning_test.yaml │ │ │ ├── audiocaps_captioning_val.yaml │ │ │ ├── clothoQA_captioning.yaml │ │ │ ├── clothov1_captioning.yaml │ │ │ ├── clothov2_captioning.yaml │ │ │ ├── esc50_classification.yaml │ │ │ └── esc50_classification_completion.yaml │ │ │ ├── crossmodal │ │ │ ├── musicavqa │ │ │ │ ├── musicavqa_audio_eval.yaml │ │ │ │ ├── musicavqa_joint_eval.yaml │ │ │ │ └── musicavqa_video_eval.yaml │ │ │ └── vatex │ │ │ │ ├── vatex_audio_captioning.yaml │ │ │ │ ├── vatex_captioning.yaml │ │ │ │ └── vatex_joint_captioning.yaml │ │ │ ├── image │ │ │ ├── coco_captioning_test.yaml │ │ │ ├── coco_captioning_val.yaml │ │ │ ├── flickr30k_captioning.yaml │ │ │ ├── gqa_qa.yaml │ │ │ ├── nocaps_captioning.yaml │ │ │ ├── nocaps_out_domain_captioning.yaml │ │ │ ├── okvqa_qa.yaml │ │ │ ├── snlive_classification_test.yaml │ │ │ ├── snlive_classification_val.yaml │ │ │ └── vizwiz_qa.yaml │ │ │ ├── pc │ │ │ ├── modelnet40_classification.yaml │ │ │ ├── modelnet40_completion.yaml │ │ │ ├── objaverse_captioning.yaml │ │ │ └── objaverse_qa.yaml │ │ │ ├── video │ │ │ ├── msrvtt_captioning_test.yaml │ │ │ ├── msrvtt_captioning_val.yaml │ │ │ ├── msrvtt_qa_test.yaml │ │ │ ├── msrvtt_qa_val.yaml │ │ │ ├── msvd_captioning.yaml │ │ │ ├── msvd_qa.yaml │ │ │ └── vatex_captioning.yaml │ │ │ └── video_image │ │ │ ├── msvd_captioning.yaml │ │ │ ├── msvd_qa.yaml │ │ │ └── vatex_captioning.yaml │ │ ├── prompt_variation │ │ └── nocaps │ │ │ ├── instructblip │ │ │ ├── original.yaml │ │ │ ├── template_1.yaml │ │ │ ├── template_2.yaml │ │ │ ├── template_3.yaml │ │ │ ├── template_4.yaml │ │ │ └── template_5.yaml │ │ │ └── xinstructblip │ │ │ ├── template_1.yaml │ │ │ ├── template_2.yaml │ │ │ ├── template_3.yaml │ │ │ ├── template_4.yaml │ │ │ └── template_5.yaml │ │ └── train │ │ ├── vicuna13b │ │ ├── audio_training.yaml │ │ ├── audio_training_continue.yaml │ │ ├── image_train.yaml │ │ ├── image_train_continue.yaml │ │ ├── pc_training.yaml │ │ └── video_training.yaml │ │ ├── vicuna7b │ │ ├── audio_training.yaml │ │ ├── audio_training_improved.yaml │ │ ├── audio_training_no_init.yaml │ │ ├── audio_training_projection_only.yaml │ │ ├── audio_training_projection_only_nocue.yaml │ │ ├── image_train.yaml │ │ ├── image_train_improved.yaml │ │ ├── image_train_no_init.yaml │ │ ├── image_train_projection_only.yaml │ │ ├── lora_training.yaml │ │ ├── pc_training.yaml │ │ ├── pc_training_improved.yaml │ │ ├── pc_training_no_init.yaml │ │ ├── pc_training_projection_only.yaml │ │ ├── pc_training_projection_only_nocue.yaml │ │ ├── pc_training_scaled_up.yaml │ │ ├── pc_training_ulip1.yaml │ │ ├── pc_training_ulip2_objaverse_shapenet_k_1.yaml │ │ ├── pc_training_ulip_objaverse.yaml │ │ ├── pc_training_ulip_shapenet.yaml │ │ ├── video_training.yaml │ │ └── video_training_no_msrvtt_upsample.yaml │ │ └── vicuna7b_nocue │ │ ├── audio_training.yaml │ │ ├── image_train.yaml │ │ ├── pc_training.yaml │ │ └── video_training.yaml ├── runners │ ├── __init__.py │ ├── runner_base.py │ └── runner_iter.py └── tasks │ ├── __init__.py │ ├── base_task.py │ ├── captioning.py │ ├── dialogue.py │ ├── 
image_text_pretrain.py │ ├── multimodal_classification.py │ ├── retrieval.py │ ├── text_to_image_generation.py │ ├── vqa.py │ └── vqa_reading_comprehension.py ├── projects ├── blip-diffusion │ ├── README.md │ ├── images │ │ ├── black-cat.png │ │ ├── cat-sofa.png │ │ ├── dog.png │ │ ├── dog2.png │ │ ├── dreambooth │ │ │ ├── dog │ │ │ │ ├── 00.jpg │ │ │ │ ├── 01.jpg │ │ │ │ ├── 02.jpg │ │ │ │ ├── 03.jpg │ │ │ │ └── 04.jpg │ │ │ └── dog8 │ │ │ │ ├── 00.jpg │ │ │ │ ├── 01.jpg │ │ │ │ ├── 02.jpg │ │ │ │ ├── 03.jpg │ │ │ │ └── 04.jpg │ │ ├── dress-model.png │ │ ├── flower.jpg │ │ ├── green-skirt.png │ │ ├── jacket-letter-s │ │ │ └── jacket-letter-s.png │ │ ├── kettle.jpg │ │ ├── pink-dress.png │ │ ├── pink-dress │ │ │ └── pink-dress.png │ │ └── shein-jacket │ │ │ └── shein-jacket.jpg │ ├── notebooks │ │ ├── editing_real_finetuned.ipynb │ │ ├── editing_real_zeroshot.ipynb │ │ ├── editing_synthetic_zeroshot.ipynb │ │ ├── editing_tryon_zeroshot.ipynb │ │ ├── generation_finetuned_dog.ipynb │ │ ├── generation_zeroshot.ipynb │ │ └── stylization.ipynb │ └── teaser-website.png ├── blip2 │ ├── README.md │ └── blip2_illustration.png ├── img2llm-vqa │ ├── Caption.png │ ├── Illustration.png │ ├── QuestionGeneration.png │ ├── README.md │ ├── demo.png │ ├── img2llm_vqa.ipynb │ └── img2llm_vqa.py ├── img2prompt-vqa │ └── README.md ├── instructblip │ ├── README.md │ ├── comparison.png │ ├── run_demo.py │ └── showcase.png ├── pnp-vqa │ ├── README.md │ ├── pnp_vqa.ipynb │ └── pnp_vqa.png └── xinstructblip │ ├── README.md │ ├── assets │ ├── architecture.png │ └── data.png │ ├── data_aug │ ├── 3d_qa_data_generation.py │ └── audio_qa_data_generation.py │ ├── demo │ ├── configs │ │ ├── vicuna13b.yaml │ │ ├── vicuna7b.yaml │ │ ├── vicuna7b_blip_init.yaml │ │ ├── vicuna7b_no_init.yaml │ │ ├── vicuna7b_nocue.yaml │ │ ├── vicuna7b_projection.yaml │ │ ├── vicuna7b_rand.yaml │ │ └── vicuna7b_v2.yaml │ ├── demo.ipynb │ ├── examples │ │ ├── audio │ │ │ ├── 110714_wren.wav │ │ │ └── Group_of_Dogs_Barking.wav │ │ └── point_cloud │ │ │ └── banana.glb │ └── run_demo.py │ ├── discrn │ ├── caption_baseline │ │ ├── predict_audio.py │ │ ├── predict_image.py │ │ ├── predict_pc.py │ │ ├── predict_video.py │ │ └── render_images.py │ └── data_generation │ │ ├── audiocaps_video_audio.py │ │ └── objaverse_img_3d.py │ └── modelnet_baseline │ └── render_images.py ├── pyproject.toml ├── requirements.txt ├── run_scripts ├── albef │ ├── eval │ │ ├── eval_albef_nlvr.sh │ │ ├── eval_albef_ve.sh │ │ ├── eval_coco_retrieval.sh │ │ ├── eval_flickr30k_retrieval.sh │ │ ├── test_albef_vqa.sh │ │ └── val_albef_vqa.sh │ └── train │ │ ├── pretrain.sh │ │ ├── train_aokvqa_albef.sh │ │ ├── train_coco_retrieval_albef.sh │ │ ├── train_flickr30k_retrieval_albef.sh │ │ ├── train_nlvr_albef.sh │ │ ├── train_okvqa_albef.sh │ │ ├── train_ve_albef.sh │ │ └── train_vqa_albef.sh ├── alpro │ ├── eval │ │ ├── eval_didemo_ret.sh │ │ ├── eval_msrvtt_qa.sh │ │ ├── eval_msrvtt_ret.sh │ │ └── eval_msvd_qa.sh │ └── train │ │ ├── train_didemo_ret.sh │ │ ├── train_msrvtt_qa.sh │ │ ├── train_msrvtt_ret.sh │ │ └── train_msvd_qa.sh ├── blip-diffusion │ ├── train_db.sh │ ├── train_db_dog.sh │ ├── train_db_jacket_s.sh │ ├── train_db_pink_dress.sh │ └── train_db_shein_jacket.sh ├── blip │ ├── eval │ │ ├── eval_aokvqa.sh │ │ ├── eval_coco_cap.sh │ │ ├── eval_coco_cap_large.sh │ │ ├── eval_nlvr.sh │ │ ├── eval_nocaps.sh │ │ ├── eval_okvqa.sh │ │ ├── eval_ret_coco.sh │ │ ├── eval_ret_flickr.sh │ │ └── validate_vqa.sh │ └── train │ │ ├── pretrain.sh │ │ ├── train_aokvqa.sh │ │ ├── 
train_caption_coco.sh │ │ ├── train_caption_coco_large.sh │ │ ├── train_caption_coco_large_iters.sh │ │ ├── train_nlvr.sh │ │ ├── train_okvqa.sh │ │ ├── train_retrieval_coco.sh │ │ ├── train_retrieval_flickr.sh │ │ └── train_vqa.sh ├── blip2 │ ├── eval │ │ ├── eval_cap_coco_flant5xl.sh │ │ ├── eval_cap_coco_opt2.7b.sh │ │ ├── eval_cap_coco_opt6.7b.sh │ │ ├── eval_gqa_zeroshot_flant5xl.sh │ │ ├── eval_okvqa_zeroshot_flant5xl.sh │ │ ├── eval_ret_coco.sh │ │ ├── eval_ret_flickr.sh │ │ ├── validate_vqa_zeroshot_flant5xl.sh │ │ └── validate_vqa_zeroshot_opt.sh │ └── train │ │ ├── pretrain_stage1.sh │ │ ├── pretrain_stage2.sh │ │ ├── train_caption_coco.sh │ │ └── train_retrieval_coco.sh ├── clip │ └── eval │ │ ├── eval_clip_ret_coco.sh │ │ ├── eval_clip_ret_flickr.sh │ │ └── eval_clip_zs_imnet.sh ├── gpt │ ├── eval │ │ └── eval_video_dialogue_avsd.sh │ └── train │ │ └── train_video_dialogue_avsd.sh ├── pnp-vqa │ └── eval │ │ ├── eval_gqa.sh │ │ ├── eval_gqa_3b.sh │ │ ├── eval_gqa_large.sh │ │ ├── eval_okvqa.sh │ │ ├── eval_okvqa_3b.sh │ │ ├── eval_okvqa_large.sh │ │ ├── eval_vqav2.sh │ │ ├── eval_vqav2_3b.sh │ │ ├── eval_vqav2_large.sh │ │ ├── eval_vqav2_test.sh │ │ ├── eval_vqav2_test_3b.sh │ │ └── eval_vqav2_test_large.sh ├── run_browser.sh └── run_demo.sh ├── setup.py ├── tests └── models │ ├── test_albef.py │ ├── test_blip.py │ ├── test_blip2.py │ └── test_pnp_vqa.py └── train.py

/.github/workflows/docs.yaml:
--------------------------------------------------------------------------------
name: docs

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]
  release:
    types: [ published ]

jobs:
  build:

    runs-on: ubuntu-18.04

    steps:
      - uses: actions/checkout@v2
        with:
          fetch-depth: 0
      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          python-version: '3.8'
      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip setuptools wheel
          sudo apt-get update
          sudo apt-get install openjdk-11-jdk
          sudo apt-get install pandoc
      - name: Build Sphinx docs
        run: |
          docs/build_docs.sh
      - name: Deploy to gh-pages
        uses: peaceiris/actions-gh-pages@v3
        if: ${{ github.ref == 'refs/heads/main' || github.event_name == 'release' }}
        with:
          github_token: ${{ secrets.GITHUB_TOKEN }}
          publish_dir: docs/_build/html
--------------------------------------------------------------------------------
/.pre-commit-config.yaml:
--------------------------------------------------------------------------------
repos:
  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v4.1.0
    hooks:
      - id: trailing-whitespace
      - id: check-ast
      - id: no-commit-to-branch
        args: ['--branch=main']
      - id: check-added-large-files
        args: ['--maxkb=5000']
      - id: end-of-file-fixer

  - repo: https://github.com/psf/black
    rev: stable
    hooks:
      - id: black
        language_version: python3.8

  - repo: https://github.com/PyCQA/flake8
    rev: 3.9.2
    hooks:
      - id: flake8
        args: [
          # only error for syntax errors and undefined names
          "--select=E9,F63,F7,F82",
        ]
--------------------------------------------------------------------------------
/CODEOWNERS:
--------------------------------------------------------------------------------
# Comment line immediately above ownership line is reserved for related gus information. Please be careful while editing.
#ECCN:Open Source
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
recursive-include lavis/configs *.yaml *.json
recursive-include lavis/projects *.yaml *.json

recursive-exclude lavis/datasets/download_scripts *
recursive-exclude lavis/output *

include requirements.txt
include lavis/models/clip_models/bpe_simple_vocab_16e6.txt.gz
--------------------------------------------------------------------------------
/SECURITY.md:
--------------------------------------------------------------------------------
## Security

Please report any security issue to [security@salesforce.com](mailto:security@salesforce.com)
as soon as it is discovered. This library limits its runtime dependencies in
order to reduce the total cost of ownership as much as possible, but all consumers
should remain vigilant and have their security stakeholders review all third-party
products (3PP) like this one and their dependencies.
--------------------------------------------------------------------------------
/app/__init__.py:
--------------------------------------------------------------------------------
"""
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""

from PIL import Image
import requests

import streamlit as st
import torch


@st.cache()
def load_demo_image():
    img_url = (
        "https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg"
    )
    raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB")
    return raw_image


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

cache_root = "/export/home/.cache/lavis/"
--------------------------------------------------------------------------------
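
The module above exposes the state shared by the Streamlit demo pages: a cached `load_demo_image()` download, the `device` selection, and the LAVIS cache root. A minimal sketch of how a demo page might combine these helpers with LAVIS's documented `load_model_and_preprocess` API is shown below; the page body itself is illustrative and not taken from the repository.

```python
# Illustrative sketch of a demo page built on the shared app helpers; not the repo's actual page code.
import streamlit as st

from app import device, load_demo_image
from lavis.models import load_model_and_preprocess


def app():
    st.markdown("<h1>Image Captioning</h1>", unsafe_allow_html=True)

    raw_image = load_demo_image()  # cached download of the BLIP demo image
    st.image(raw_image, use_column_width=True)

    # Model and type names follow the LAVIS model zoo (BLIP captioning here).
    model, vis_processors, _ = load_model_and_preprocess(
        name="blip_caption", model_type="base_coco", is_eval=True, device=device
    )

    image = vis_processors["eval"](raw_image).unsqueeze(0).to(device)
    st.write(model.generate({"image": image})[0])
```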
/app/main.py:
--------------------------------------------------------------------------------
"""
# Copyright (c) 2022, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""

from app.multipage import MultiPage
from app import vqa, caption
from app import image_text_match as itm
from app import text_localization as tl
from app import multimodal_search as ms
from app import classification as cl


if __name__ == "__main__":
    app = MultiPage()

    app.add_page("Image Description Generation", caption.app)
    app.add_page("Multimodal Search", ms.app)
    app.add_page("Visual Question Answering", vqa.app)
    app.add_page("Image Text Matching", itm.app)
    app.add_page("Text Localization", tl.app)
    app.add_page("Classification", cl.app)
    app.run()
--------------------------------------------------------------------------------
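
`app/multipage.py` itself is not reproduced in this dump. For orientation, a minimal `MultiPage` registry compatible with the `add_page`/`run` calls above could look like the following sketch; the sidebar selectbox and the internal field names are assumptions, not the repository's actual implementation.

```python
# Hypothetical sketch of a MultiPage helper; the real app/multipage.py may differ.
import streamlit as st


class MultiPage:
    """Registers (title, render function) pairs and renders the selected one."""

    def __init__(self):
        self.pages = []

    def add_page(self, title, func):
        # func is a zero-argument callable that draws one Streamlit page.
        self.pages.append({"title": title, "function": func})

    def run(self):
        # Let the user pick a page from the sidebar, then render it.
        page = st.sidebar.selectbox(
            "Navigation", self.pages, format_func=lambda p: p["title"]
        )
        page["function"]()
```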
/assets/demo-6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/assets/demo-6.png
--------------------------------------------------------------------------------
/dataset_card/gqa.md:
--------------------------------------------------------------------------------
![From https://arxiv.org/abs/1902.09506.](imgs/gqa.png)

# GQA Dataset

## Description
(from https://cs.stanford.edu/people/dorarad/gqa/about.html)

GQA is a VQA dataset for real-world images which requires visual, spatial and compositional reasoning.
It consists of 22M questions and 110K images.

## Task
(from https://arxiv.org/abs/1902.09506)

Given an image and a question, the model is required to output a correct answer.
GQA questions require spatial understanding, multiple reasoning skills and multi-step inference.

## Metrics

The metrics are accuracy, consistency, validity and plausibility. The most commonly reported metric is accuracy.

## Leaderboard

TBD

## Auto-Downloading

```
cd lavis/datasets/download_scripts && python download_gqa.py
```

## References

"GQA: A New Dataset for Real-World Visual Reasoning and Compositional Question Answering", Drew A. Hudson, Christopher D. Manning
--------------------------------------------------------------------------------
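
Once the download script has populated the local cache, GQA can typically be loaded through LAVIS's dataset builders. The snippet below is a sketch that assumes the builder is registered under the name `gqa` (matching `lavis/configs/datasets/gqa`); the exact sample fields depend on the dataset class.

```python
# Sketch: load GQA through LAVIS's dataset zoo after running download_gqa.py.
from lavis.datasets.builders import load_dataset

gqa = load_dataset("gqa")   # returns a dict of splits, e.g. train / val
sample = gqa["val"][0]      # one image-question pair
print(sample.keys())        # field names depend on the GQA dataset class
```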
-------------------------------------------------------------------------------- /dataset_card/imgs/sbu_caption.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/dataset_card/imgs/sbu_caption.png -------------------------------------------------------------------------------- /dataset_card/imgs/snli_ve.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/dataset_card/imgs/snli_ve.png -------------------------------------------------------------------------------- /dataset_card/imgs/vqav2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/dataset_card/imgs/vqav2.png -------------------------------------------------------------------------------- /dataset_card/sbu_caption.md: -------------------------------------------------------------------------------- 1 | ![sbu caption](imgs/sbu_caption.png) 2 | (image credit: http://tamaraberg.com/papers/generation_nips2011.pdf) 3 | 4 | # SBU Caption Dataset 5 | (from http://tamaraberg.com/papers/generation_nips2011.pdf) 6 | 7 | SBU caption dataset is a new dataset, collected by performing Flickr queries and 8 | then filtering the noisy results down to 1 million images with associated visually 9 | relevant captions. 10 | 11 | ## Auto-Downloading 12 | ``` 13 | cd lavis/datasets/download_scripts && python download_sbu.py 14 | ``` 15 | ## References 16 | ```bibtex 17 | @inproceedings{Ordonez:2011:im2text, 18 | Author = {Vicente Ordonez and Girish Kulkarni and Tamara L. Berg}, 19 | Title = {Im2Text: Describing Images Using 1 Million Captioned Photographs}, 20 | Booktitle = {Neural Information Processing Systems ({NIPS})}, 21 | Year = {2011}, 22 | } 23 | ``` 24 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /docs/_static/Confusing-Pictures.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/docs/_static/Confusing-Pictures.jpg -------------------------------------------------------------------------------- /docs/_static/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/docs/_static/architecture.png -------------------------------------------------------------------------------- /docs/_static/logo_final.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/docs/_static/logo_final.png -------------------------------------------------------------------------------- /docs/_static/merlion.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/docs/_static/merlion.png -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | .. LAVIS documentation master file, created by 2 | sphinx-quickstart on Sun Jul 31 10:32:27 2022. 3 | You can adapt this file completely to your liking, but it should at least 4 | contain the root `toctree` directive. 5 | 6 | Welcome to LAVIS's documentation! 7 | ================================= 8 | 9 | .. toctree:: 10 | :maxdepth: 1 11 | :caption: Introduction 12 | 13 | intro 14 | 15 | 16 | .. toctree:: 17 | :maxdepth: 1 18 | :caption: Getting Started 19 | 20 | getting_started 21 | 22 | 23 | .. :maxdepth: 1 24 | .. :caption: Advanced Training 25 | 26 | .. advanced_training 27 | 28 | 29 | .. toctree:: 30 | :maxdepth: 2 31 | :caption: Advanced Usage 32 | 33 | benchmark 34 | tutorial 35 | 36 | 37 | .. Documentations 38 | .. =================== 39 | 40 | 41 | Indices and tables 42 | ================== 43 | 44 | * :ref:`genindex` 45 | * :ref:`modindex` 46 | * :ref:`search` 47 | -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=source 11 | set BUILDDIR=build 12 | 13 | if "%1" == "" goto help 14 | 15 | %SPHINXBUILD% >NUL 2>NUL 16 | if errorlevel 9009 ( 17 | echo. 18 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 19 | echo.installed, then set the SPHINXBUILD environment variable to point 20 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 21 | echo.may add the Sphinx directory to PATH. 22 | echo. 
23 | echo.If you don't have Sphinx installed, grab it from 24 | echo.http://sphinx-doc.org/ 25 | exit /b 1 26 | ) 27 | 28 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 29 | goto end 30 | 31 | :help 32 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% 33 | 34 | :end 35 | popd 36 | -------------------------------------------------------------------------------- /docs/requirements.txt: -------------------------------------------------------------------------------- 1 | GitPython 2 | ipykernel 3 | nbsphinx==0.8.7 4 | pandoc 5 | sphinx 6 | sphinx_autodoc_typehints 7 | sphinx_rtd_theme -------------------------------------------------------------------------------- /docs/tutorial.rst: -------------------------------------------------------------------------------- 1 | Tutorials 2 | ============================== 3 | 4 | .. toctree:: 5 | :maxdepth: 1 6 | 7 | tutorial.evaluation 8 | tutorial.training-example 9 | tutorial.configs 10 | tutorial.datasets 11 | tutorial.processors 12 | tutorial.models 13 | tutorial.tasks 14 | -------------------------------------------------------------------------------- /lavis/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import os 9 | import sys 10 | 11 | from omegaconf import OmegaConf 12 | 13 | from lavis.common.registry import registry 14 | 15 | from lavis.datasets.builders import * 16 | from lavis.models import * 17 | from lavis.processors import * 18 | from lavis.tasks import * 19 | 20 | 21 | root_dir = os.path.dirname(os.path.abspath(__file__)) 22 | default_cfg = OmegaConf.load(os.path.join(root_dir, "configs/default.yaml")) 23 | 24 | registry.register_path("library_root", root_dir) 25 | repo_root = os.path.join(root_dir, "..") 26 | registry.register_path("repo_root", repo_root) 27 | cache_root = os.path.join(repo_root, default_cfg.env.cache_root) 28 | registry.register_path("cache_root", cache_root) 29 | 30 | registry.register("MAX_INT", sys.maxsize) 31 | registry.register("SPLIT_NAMES", ["train", "val", "test"]) 32 | -------------------------------------------------------------------------------- /lavis/common/annotator/canny/__init__.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | 3 | 4 | class CannyDetector: 5 | def __call__(self, img, low_threshold, high_threshold): 6 | return cv2.Canny(img, low_threshold, high_threshold) 7 | -------------------------------------------------------------------------------- /lavis/common/annotator/ckpts/download.sh: -------------------------------------------------------------------------------- 1 | #! 
/bin/bash 2 | 3 | wget https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/dpt_hybrid-midas-501f0c75.pt 4 | wget https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/network-bsds500.pth 5 | 6 | -------------------------------------------------------------------------------- /lavis/common/annotator/midas/midas/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/lavis/common/annotator/midas/midas/__init__.py -------------------------------------------------------------------------------- /lavis/common/annotator/midas/midas/base_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class BaseModel(torch.nn.Module): 5 | def load(self, path): 6 | """Load model from file. 7 | 8 | Args: 9 | path (str): file path 10 | """ 11 | parameters = torch.load(path, map_location=torch.device('cpu')) 12 | 13 | if "optimizer" in parameters: 14 | parameters = parameters["model"] 15 | 16 | self.load_state_dict(parameters) 17 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from annotator.uniformer.mmseg.apis import init_segmentor, inference_segmentor, show_result_pyplot 4 | from annotator.uniformer.mmseg.core.evaluation import get_palette 5 | from annotator.util import annotator_ckpts_path 6 | 7 | 8 | checkpoint_file = "https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/upernet_global_small.pth" 9 | 10 | 11 | class UniformerDetector: 12 | def __init__(self): 13 | modelpath = os.path.join(annotator_ckpts_path, "upernet_global_small.pth") 14 | if not os.path.exists(modelpath): 15 | from basicsr.utils.download_util import load_file_from_url 16 | load_file_from_url(checkpoint_file, model_dir=annotator_ckpts_path) 17 | config_file = os.path.join(os.path.dirname(annotator_ckpts_path), "uniformer", "exp", "upernet_global_small", "config.py") 18 | self.model = init_segmentor(config_file, modelpath).cuda() 19 | 20 | def __call__(self, img): 21 | result = inference_segmentor(self.model, img) 22 | res_img = show_result_pyplot(self.model, img, result, get_palette('ade'), opacity=1) 23 | return res_img 24 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/configs/_base_/datasets/pascal_voc12_aug.py: -------------------------------------------------------------------------------- 1 | _base_ = './pascal_voc12.py' 2 | # dataset settings 3 | data = dict( 4 | train=dict( 5 | ann_dir=['SegmentationClass', 'SegmentationClassAug'], 6 | split=[ 7 | 'ImageSets/Segmentation/train.txt', 8 | 'ImageSets/Segmentation/aug.txt' 9 | ])) 10 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/configs/_base_/default_runtime.py: -------------------------------------------------------------------------------- 1 | # yapf:disable 2 | log_config = dict( 3 | interval=50, 4 | hooks=[ 5 | dict(type='TextLoggerHook', by_epoch=False), 6 | # dict(type='TensorboardLoggerHook') 7 | ]) 8 | # yapf:enable 9 | dist_params = dict(backend='nccl') 10 | log_level = 'INFO' 11 | load_from = None 12 | resume_from = None 13 | workflow = [('train', 1)] 14 | cudnn_benchmark = True 15 | 
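Relating back to the annotator wrappers above (ckpts/download.sh and the CannyDetector / UniformerDetector classes): a minimal sketch of how one of these callables might be used. This is illustrative only; the image path and thresholds are hypothetical, OpenCV must be installed, the `lavis.common.annotator.canny` package path is assumed importable from the repository root, and UniformerDetector additionally requires a CUDA device plus the checkpoint fetched by the download script.

```python
# Illustrative sketch (not repository code) exercising the CannyDetector wrapper
# defined in lavis/common/annotator/canny/__init__.py above.
import cv2

from lavis.common.annotator.canny import CannyDetector

img = cv2.imread("example.jpg")  # hypothetical input; any BGR image readable by OpenCV
edges = CannyDetector()(img, low_threshold=100, high_threshold=200)  # thresholds are illustrative
cv2.imwrite("example_edges.png", edges)  # edge map usable as a ControlNet-style conditioning hint
```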
-------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/configs/_base_/models/fpn_uniformer.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | backbone=dict( 6 | type='UniFormer', 7 | embed_dim=[64, 128, 320, 512], 8 | layers=[3, 4, 8, 3], 9 | head_dim=64, 10 | mlp_ratio=4., 11 | qkv_bias=True, 12 | drop_rate=0., 13 | attn_drop_rate=0., 14 | drop_path_rate=0.1), 15 | neck=dict( 16 | type='FPN', 17 | in_channels=[64, 128, 320, 512], 18 | out_channels=256, 19 | num_outs=4), 20 | decode_head=dict( 21 | type='FPNHead', 22 | in_channels=[256, 256, 256, 256], 23 | in_index=[0, 1, 2, 3], 24 | feature_strides=[4, 8, 16, 32], 25 | channels=128, 26 | dropout_ratio=0.1, 27 | num_classes=150, 28 | norm_cfg=norm_cfg, 29 | align_corners=False, 30 | loss_decode=dict( 31 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 32 | # model training and testing settings 33 | train_cfg=dict(), 34 | test_cfg=dict(mode='whole') 35 | ) 36 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/configs/_base_/models/lraspp_m-v3-d8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', eps=0.001, requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | backbone=dict( 6 | type='MobileNetV3', 7 | arch='large', 8 | out_indices=(1, 3, 16), 9 | norm_cfg=norm_cfg), 10 | decode_head=dict( 11 | type='LRASPPHead', 12 | in_channels=(16, 24, 960), 13 | in_index=(0, 1, 2), 14 | channels=128, 15 | input_transform='multiple_select', 16 | dropout_ratio=0.1, 17 | num_classes=19, 18 | norm_cfg=norm_cfg, 19 | act_cfg=dict(type='ReLU'), 20 | align_corners=False, 21 | loss_decode=dict( 22 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 23 | # model training and testing settings 24 | train_cfg=dict(), 25 | test_cfg=dict(mode='whole')) 26 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/configs/_base_/schedules/schedule_160k.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) 3 | optimizer_config = dict() 4 | # learning policy 5 | lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) 6 | # runtime settings 7 | runner = dict(type='IterBasedRunner', max_iters=160000) 8 | checkpoint_config = dict(by_epoch=False, interval=16000) 9 | evaluation = dict(interval=16000, metric='mIoU') 10 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/configs/_base_/schedules/schedule_20k.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) 3 | optimizer_config = dict() 4 | # learning policy 5 | lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) 6 | # runtime settings 7 | runner = dict(type='IterBasedRunner', max_iters=20000) 8 | checkpoint_config = dict(by_epoch=False, interval=2000) 9 | evaluation = dict(interval=2000, metric='mIoU') 10 | -------------------------------------------------------------------------------- 
/lavis/common/annotator/uniformer/configs/_base_/schedules/schedule_40k.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) 3 | optimizer_config = dict() 4 | # learning policy 5 | lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) 6 | # runtime settings 7 | runner = dict(type='IterBasedRunner', max_iters=40000) 8 | checkpoint_config = dict(by_epoch=False, interval=4000) 9 | evaluation = dict(interval=4000, metric='mIoU') 10 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/configs/_base_/schedules/schedule_80k.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) 3 | optimizer_config = dict() 4 | # learning policy 5 | lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) 6 | # runtime settings 7 | runner = dict(type='IterBasedRunner', max_iters=80000) 8 | checkpoint_config = dict(by_epoch=False, interval=8000) 9 | evaluation = dict(interval=8000, metric='mIoU') 10 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/exp/upernet_global_small/run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | work_path=$(dirname $0) 4 | PYTHONPATH="$(dirname $0)/../../":$PYTHONPATH \ 5 | python -m torch.distributed.launch --nproc_per_node=8 \ 6 | tools/train.py ${work_path}/config.py \ 7 | --launcher pytorch \ 8 | --options model.backbone.pretrained_path='your_model_path/uniformer_small_in1k.pth' \ 9 | --work-dir ${work_path}/ckpt \ 10 | 2>&1 | tee -a ${work_path}/log.txt 11 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/exp/upernet_global_small/test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | work_path=$(dirname $0) 4 | PYTHONPATH="$(dirname $0)/../../":$PYTHONPATH \ 5 | python -m torch.distributed.launch --nproc_per_node=8 \ 6 | tools/test.py ${work_path}/test_config_h32.py \ 7 | ${work_path}/ckpt/latest.pth \ 8 | --launcher pytorch \ 9 | --eval mIoU \ 10 | 2>&1 | tee -a ${work_path}/log.txt 11 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | # flake8: noqa 3 | from .arraymisc import * 4 | from .fileio import * 5 | from .image import * 6 | from .utils import * 7 | from .version import * 8 | from .video import * 9 | from .visualization import * 10 | 11 | # The following modules are not imported to this level, so mmcv may be used 12 | # without PyTorch. 13 | # - runner 14 | # - parallel 15 | # - op 16 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/arraymisc/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | from .quantization import dequantize, quantize 3 | 4 | __all__ = ['quantize', 'dequantize'] 5 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/cnn/bricks/hswish.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import torch.nn as nn 3 | 4 | from .registry import ACTIVATION_LAYERS 5 | 6 | 7 | @ACTIVATION_LAYERS.register_module() 8 | class HSwish(nn.Module): 9 | """Hard Swish Module. 10 | 11 | This module applies the hard swish function: 12 | 13 | .. math:: 14 | Hswish(x) = x * ReLU6(x + 3) / 6 15 | 16 | Args: 17 | inplace (bool): can optionally do the operation in-place. 18 | Default: False. 19 | 20 | Returns: 21 | Tensor: The output tensor. 22 | """ 23 | 24 | def __init__(self, inplace=False): 25 | super(HSwish, self).__init__() 26 | self.act = nn.ReLU6(inplace) 27 | 28 | def forward(self, x): 29 | return x * self.act(x + 3) / 6 30 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/cnn/bricks/registry.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from annotator.uniformer.mmcv.utils import Registry 3 | 4 | CONV_LAYERS = Registry('conv layer') 5 | NORM_LAYERS = Registry('norm layer') 6 | ACTIVATION_LAYERS = Registry('activation layer') 7 | PADDING_LAYERS = Registry('padding layer') 8 | UPSAMPLE_LAYERS = Registry('upsample layer') 9 | PLUGIN_LAYERS = Registry('plugin layer') 10 | 11 | DROPOUT_LAYERS = Registry('drop out layers') 12 | POSITIONAL_ENCODING = Registry('position encoding') 13 | ATTENTION = Registry('attention') 14 | FEEDFORWARD_NETWORK = Registry('feed-forward Network') 15 | TRANSFORMER_LAYER = Registry('transformerLayer') 16 | TRANSFORMER_LAYER_SEQUENCE = Registry('transformer-layers sequence') 17 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/cnn/bricks/scale.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import torch 3 | import torch.nn as nn 4 | 5 | 6 | class Scale(nn.Module): 7 | """A learnable scale parameter. 8 | 9 | This layer scales the input by a learnable factor. It multiplies a 10 | learnable scale parameter of shape (1,) with input of any shape. 11 | 12 | Args: 13 | scale (float): Initial value of scale factor. Default: 1.0 14 | """ 15 | 16 | def __init__(self, scale=1.0): 17 | super(Scale, self).__init__() 18 | self.scale = nn.Parameter(torch.tensor(scale, dtype=torch.float)) 19 | 20 | def forward(self, x): 21 | return x * self.scale 22 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/cnn/bricks/swish.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import torch 3 | import torch.nn as nn 4 | 5 | from .registry import ACTIVATION_LAYERS 6 | 7 | 8 | @ACTIVATION_LAYERS.register_module() 9 | class Swish(nn.Module): 10 | """Swish Module. 11 | 12 | This module applies the swish function: 13 | 14 | .. math:: 15 | Swish(x) = x * Sigmoid(x) 16 | 17 | Returns: 18 | Tensor: The output tensor. 
19 | """ 20 | 21 | def __init__(self): 22 | super(Swish, self).__init__() 23 | 24 | def forward(self, x): 25 | return x * torch.sigmoid(x) 26 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/cnn/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .flops_counter import get_model_complexity_info 3 | from .fuse_conv_bn import fuse_conv_bn 4 | from .sync_bn import revert_sync_batchnorm 5 | from .weight_init import (INITIALIZERS, Caffe2XavierInit, ConstantInit, 6 | KaimingInit, NormalInit, PretrainedInit, 7 | TruncNormalInit, UniformInit, XavierInit, 8 | bias_init_with_prob, caffe2_xavier_init, 9 | constant_init, initialize, kaiming_init, normal_init, 10 | trunc_normal_init, uniform_init, xavier_init) 11 | 12 | __all__ = [ 13 | 'get_model_complexity_info', 'bias_init_with_prob', 'caffe2_xavier_init', 14 | 'constant_init', 'kaiming_init', 'normal_init', 'trunc_normal_init', 15 | 'uniform_init', 'xavier_init', 'fuse_conv_bn', 'initialize', 16 | 'INITIALIZERS', 'ConstantInit', 'XavierInit', 'NormalInit', 17 | 'TruncNormalInit', 'UniformInit', 'KaimingInit', 'PretrainedInit', 18 | 'Caffe2XavierInit', 'revert_sync_batchnorm' 19 | ] 20 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/engine/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .test import (collect_results_cpu, collect_results_gpu, multi_gpu_test, 3 | single_gpu_test) 4 | 5 | __all__ = [ 6 | 'collect_results_cpu', 'collect_results_gpu', 'multi_gpu_test', 7 | 'single_gpu_test' 8 | ] 9 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/fileio/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .file_client import BaseStorageBackend, FileClient 3 | from .handlers import BaseFileHandler, JsonHandler, PickleHandler, YamlHandler 4 | from .io import dump, load, register_handler 5 | from .parse import dict_from_file, list_from_file 6 | 7 | __all__ = [ 8 | 'BaseStorageBackend', 'FileClient', 'load', 'dump', 'register_handler', 9 | 'BaseFileHandler', 'JsonHandler', 'PickleHandler', 'YamlHandler', 10 | 'list_from_file', 'dict_from_file' 11 | ] 12 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/fileio/handlers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .base import BaseFileHandler 3 | from .json_handler import JsonHandler 4 | from .pickle_handler import PickleHandler 5 | from .yaml_handler import YamlHandler 6 | 7 | __all__ = ['BaseFileHandler', 'JsonHandler', 'PickleHandler', 'YamlHandler'] 8 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/fileio/handlers/base.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | from abc import ABCMeta, abstractmethod 3 | 4 | 5 | class BaseFileHandler(metaclass=ABCMeta): 6 | # `str_like` is a flag to indicate whether the type of file object is 7 | # str-like object or bytes-like object. Pickle only processes bytes-like 8 | # objects but json only processes str-like object. If it is str-like 9 | # object, `StringIO` will be used to process the buffer. 10 | str_like = True 11 | 12 | @abstractmethod 13 | def load_from_fileobj(self, file, **kwargs): 14 | pass 15 | 16 | @abstractmethod 17 | def dump_to_fileobj(self, obj, file, **kwargs): 18 | pass 19 | 20 | @abstractmethod 21 | def dump_to_str(self, obj, **kwargs): 22 | pass 23 | 24 | def load_from_path(self, filepath, mode='r', **kwargs): 25 | with open(filepath, mode) as f: 26 | return self.load_from_fileobj(f, **kwargs) 27 | 28 | def dump_to_path(self, obj, filepath, mode='w', **kwargs): 29 | with open(filepath, mode) as f: 30 | self.dump_to_fileobj(obj, f, **kwargs) 31 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/fileio/handlers/pickle_handler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import pickle 3 | 4 | from .base import BaseFileHandler 5 | 6 | 7 | class PickleHandler(BaseFileHandler): 8 | 9 | str_like = False 10 | 11 | def load_from_fileobj(self, file, **kwargs): 12 | return pickle.load(file, **kwargs) 13 | 14 | def load_from_path(self, filepath, **kwargs): 15 | return super(PickleHandler, self).load_from_path( 16 | filepath, mode='rb', **kwargs) 17 | 18 | def dump_to_str(self, obj, **kwargs): 19 | kwargs.setdefault('protocol', 2) 20 | return pickle.dumps(obj, **kwargs) 21 | 22 | def dump_to_fileobj(self, obj, file, **kwargs): 23 | kwargs.setdefault('protocol', 2) 24 | pickle.dump(obj, file, **kwargs) 25 | 26 | def dump_to_path(self, obj, filepath, **kwargs): 27 | super(PickleHandler, self).dump_to_path( 28 | obj, filepath, mode='wb', **kwargs) 29 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/fileio/handlers/yaml_handler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | import yaml 3 | 4 | try: 5 | from yaml import CLoader as Loader, CDumper as Dumper 6 | except ImportError: 7 | from yaml import Loader, Dumper 8 | 9 | from .base import BaseFileHandler # isort:skip 10 | 11 | 12 | class YamlHandler(BaseFileHandler): 13 | 14 | def load_from_fileobj(self, file, **kwargs): 15 | kwargs.setdefault('Loader', Loader) 16 | return yaml.load(file, **kwargs) 17 | 18 | def dump_to_fileobj(self, obj, file, **kwargs): 19 | kwargs.setdefault('Dumper', Dumper) 20 | yaml.dump(obj, file, **kwargs) 21 | 22 | def dump_to_str(self, obj, **kwargs): 23 | kwargs.setdefault('Dumper', Dumper) 24 | return yaml.dump(obj, **kwargs) 25 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/model_zoo/deprecated.json: -------------------------------------------------------------------------------- 1 | { 2 | "resnet50_caffe": "detectron/resnet50_caffe", 3 | "resnet50_caffe_bgr": "detectron2/resnet50_caffe_bgr", 4 | "resnet101_caffe": "detectron/resnet101_caffe", 5 | "resnet101_caffe_bgr": "detectron2/resnet101_caffe_bgr" 6 | } 7 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/ops/info.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import glob 3 | import os 4 | 5 | import torch 6 | 7 | if torch.__version__ == 'parrots': 8 | import parrots 9 | 10 | def get_compiler_version(): 11 | return 'GCC ' + parrots.version.compiler 12 | 13 | def get_compiling_cuda_version(): 14 | return parrots.version.cuda 15 | else: 16 | from ..utils import ext_loader 17 | ext_module = ext_loader.load_ext( 18 | '_ext', ['get_compiler_version', 'get_compiling_cuda_version']) 19 | 20 | def get_compiler_version(): 21 | return ext_module.get_compiler_version() 22 | 23 | def get_compiling_cuda_version(): 24 | return ext_module.get_compiling_cuda_version() 25 | 26 | 27 | def get_onnxruntime_op_path(): 28 | wildcard = os.path.join( 29 | os.path.abspath(os.path.dirname(os.path.dirname(__file__))), 30 | '_ext_ort.*.so') 31 | 32 | paths = glob.glob(wildcard) 33 | if len(paths) > 0: 34 | return paths[0] 35 | else: 36 | return '' 37 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/parallel/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .collate import collate 3 | from .data_container import DataContainer 4 | from .data_parallel import MMDataParallel 5 | from .distributed import MMDistributedDataParallel 6 | from .registry import MODULE_WRAPPERS 7 | from .scatter_gather import scatter, scatter_kwargs 8 | from .utils import is_module_wrapper 9 | 10 | __all__ = [ 11 | 'collate', 'DataContainer', 'MMDataParallel', 'MMDistributedDataParallel', 12 | 'scatter', 'scatter_kwargs', 'is_module_wrapper', 'MODULE_WRAPPERS' 13 | ] 14 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/parallel/registry.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | from torch.nn.parallel import DataParallel, DistributedDataParallel 3 | 4 | from annotator.uniformer.mmcv.utils import Registry 5 | 6 | MODULE_WRAPPERS = Registry('module wrapper') 7 | MODULE_WRAPPERS.register_module(module=DataParallel) 8 | MODULE_WRAPPERS.register_module(module=DistributedDataParallel) 9 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/parallel/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .registry import MODULE_WRAPPERS 3 | 4 | 5 | def is_module_wrapper(module): 6 | """Check if a module is a module wrapper. 7 | 8 | The following 3 modules in MMCV (and their subclasses) are regarded as 9 | module wrappers: DataParallel, DistributedDataParallel, 10 | MMDistributedDataParallel (the deprecated version). You may add you own 11 | module wrapper by registering it to mmcv.parallel.MODULE_WRAPPERS. 12 | 13 | Args: 14 | module (nn.Module): The module to be checked. 15 | 16 | Returns: 17 | bool: True if the input module is a module wrapper. 18 | """ 19 | module_wrappers = tuple(MODULE_WRAPPERS.module_dict.values()) 20 | return isinstance(module, module_wrappers) 21 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/runner/builder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import copy 3 | 4 | from ..utils import Registry 5 | 6 | RUNNERS = Registry('runner') 7 | RUNNER_BUILDERS = Registry('runner builder') 8 | 9 | 10 | def build_runner_constructor(cfg): 11 | return RUNNER_BUILDERS.build(cfg) 12 | 13 | 14 | def build_runner(cfg, default_args=None): 15 | runner_cfg = copy.deepcopy(cfg) 16 | constructor_type = runner_cfg.pop('constructor', 17 | 'DefaultRunnerConstructor') 18 | runner_constructor = build_runner_constructor( 19 | dict( 20 | type=constructor_type, 21 | runner_cfg=runner_cfg, 22 | default_args=default_args)) 23 | runner = runner_constructor() 24 | return runner 25 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/runner/hooks/closure.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .hook import HOOKS, Hook 3 | 4 | 5 | @HOOKS.register_module() 6 | class ClosureHook(Hook): 7 | 8 | def __init__(self, fn_name, fn): 9 | assert hasattr(self, fn_name) 10 | assert callable(fn) 11 | setattr(self, fn_name, fn) 12 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/runner/hooks/iter_timer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | import time 3 | 4 | from .hook import HOOKS, Hook 5 | 6 | 7 | @HOOKS.register_module() 8 | class IterTimerHook(Hook): 9 | 10 | def before_epoch(self, runner): 11 | self.t = time.time() 12 | 13 | def before_iter(self, runner): 14 | runner.log_buffer.update({'data_time': time.time() - self.t}) 15 | 16 | def after_iter(self, runner): 17 | runner.log_buffer.update({'time': time.time() - self.t}) 18 | self.t = time.time() 19 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/runner/hooks/logger/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .base import LoggerHook 3 | from .dvclive import DvcliveLoggerHook 4 | from .mlflow import MlflowLoggerHook 5 | from .neptune import NeptuneLoggerHook 6 | from .pavi import PaviLoggerHook 7 | from .tensorboard import TensorboardLoggerHook 8 | from .text import TextLoggerHook 9 | from .wandb import WandbLoggerHook 10 | 11 | __all__ = [ 12 | 'LoggerHook', 'MlflowLoggerHook', 'PaviLoggerHook', 13 | 'TensorboardLoggerHook', 'TextLoggerHook', 'WandbLoggerHook', 14 | 'NeptuneLoggerHook', 'DvcliveLoggerHook' 15 | ] 16 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/runner/hooks/memory.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import torch 3 | 4 | from .hook import HOOKS, Hook 5 | 6 | 7 | @HOOKS.register_module() 8 | class EmptyCacheHook(Hook): 9 | 10 | def __init__(self, before_epoch=False, after_epoch=True, after_iter=False): 11 | self._before_epoch = before_epoch 12 | self._after_epoch = after_epoch 13 | self._after_iter = after_iter 14 | 15 | def after_iter(self, runner): 16 | if self._after_iter: 17 | torch.cuda.empty_cache() 18 | 19 | def before_epoch(self, runner): 20 | if self._before_epoch: 21 | torch.cuda.empty_cache() 22 | 23 | def after_epoch(self, runner): 24 | if self._after_epoch: 25 | torch.cuda.empty_cache() 26 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/runner/hooks/sampler_seed.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .hook import HOOKS, Hook 3 | 4 | 5 | @HOOKS.register_module() 6 | class DistSamplerSeedHook(Hook): 7 | """Data-loading sampler for distributed training. 8 | 9 | When distributed training, it is only useful in conjunction with 10 | :obj:`EpochBasedRunner`, while :obj:`IterBasedRunner` achieves the same 11 | purpose with :obj:`IterLoader`. 12 | """ 13 | 14 | def before_epoch(self, runner): 15 | if hasattr(runner.data_loader.sampler, 'set_epoch'): 16 | # in case the data loader uses `SequentialSampler` in Pytorch 17 | runner.data_loader.sampler.set_epoch(runner.epoch) 18 | elif hasattr(runner.data_loader.batch_sampler.sampler, 'set_epoch'): 19 | # batch sampler in pytorch warps the sampler as its attributes. 20 | runner.data_loader.batch_sampler.sampler.set_epoch(runner.epoch) 21 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/runner/hooks/sync_buffer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | from ..dist_utils import allreduce_params 3 | from .hook import HOOKS, Hook 4 | 5 | 6 | @HOOKS.register_module() 7 | class SyncBuffersHook(Hook): 8 | """Synchronize model buffers such as running_mean and running_var in BN at 9 | the end of each epoch. 10 | 11 | Args: 12 | distributed (bool): Whether distributed training is used. It is 13 | effective only for distributed training. Defaults to True. 14 | """ 15 | 16 | def __init__(self, distributed=True): 17 | self.distributed = distributed 18 | 19 | def after_epoch(self, runner): 20 | """All-reduce model buffers at the end of each epoch.""" 21 | if self.distributed: 22 | allreduce_params(runner.model.buffers()) 23 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/runner/optimizer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .builder import (OPTIMIZER_BUILDERS, OPTIMIZERS, build_optimizer, 3 | build_optimizer_constructor) 4 | from .default_constructor import DefaultOptimizerConstructor 5 | 6 | __all__ = [ 7 | 'OPTIMIZER_BUILDERS', 'OPTIMIZERS', 'DefaultOptimizerConstructor', 8 | 'build_optimizer', 'build_optimizer_constructor' 9 | ] 10 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/utils/parrots_jit.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import os 3 | 4 | from .parrots_wrapper import TORCH_VERSION 5 | 6 | parrots_jit_option = os.getenv('PARROTS_JIT_OPTION') 7 | 8 | if TORCH_VERSION == 'parrots' and parrots_jit_option == 'ON': 9 | from parrots.jit import pat as jit 10 | else: 11 | 12 | def jit(func=None, 13 | check_input=None, 14 | full_shape=True, 15 | derivate=False, 16 | coderize=False, 17 | optimize=False): 18 | 19 | def wrapper(func): 20 | 21 | def wrapper_inner(*args, **kargs): 22 | return func(*args, **kargs) 23 | 24 | return wrapper_inner 25 | 26 | if func is None: 27 | return wrapper 28 | else: 29 | return func 30 | 31 | 32 | if TORCH_VERSION == 'parrots': 33 | from parrots.utils.tester import skip_no_elena 34 | else: 35 | 36 | def skip_no_elena(func): 37 | 38 | def wrapper(*args, **kargs): 39 | return func(*args, **kargs) 40 | 41 | return wrapper 42 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/utils/trace.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | import torch 4 | 5 | from annotator.uniformer.mmcv.utils import digit_version 6 | 7 | 8 | def is_jit_tracing() -> bool: 9 | if (torch.__version__ != 'parrots' 10 | and digit_version(torch.__version__) >= digit_version('1.6.0')): 11 | on_trace = torch.jit.is_tracing() 12 | # In PyTorch 1.6, torch.jit.is_tracing has a bug. 13 | # Refers to https://github.com/pytorch/pytorch/issues/42448 14 | if isinstance(on_trace, bool): 15 | return on_trace 16 | else: 17 | return torch._C._is_tracing() 18 | else: 19 | warnings.warn( 20 | 'torch.jit.is_tracing is only supported after v1.6.0. ' 21 | 'Therefore is_tracing returns False automatically. 
Please ' 22 | 'set on_trace manually if you are using trace.', UserWarning) 23 | return False 24 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/video/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .io import Cache, VideoReader, frames2video 3 | from .optflow import (dequantize_flow, flow_from_bytes, flow_warp, flowread, 4 | flowwrite, quantize_flow, sparse_flow_from_bytes) 5 | from .processing import concat_video, convert_video, cut_video, resize_video 6 | 7 | __all__ = [ 8 | 'Cache', 'VideoReader', 'frames2video', 'convert_video', 'resize_video', 9 | 'cut_video', 'concat_video', 'flowread', 'flowwrite', 'quantize_flow', 10 | 'dequantize_flow', 'flow_warp', 'flow_from_bytes', 'sparse_flow_from_bytes' 11 | ] 12 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv/visualization/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .color import Color, color_val 3 | from .image import imshow, imshow_bboxes, imshow_det_bboxes 4 | from .optflow import flow2rgb, flowshow, make_color_wheel 5 | 6 | __all__ = [ 7 | 'Color', 'color_val', 'imshow', 'imshow_bboxes', 'imshow_det_bboxes', 8 | 'flowshow', 'flow2rgb', 'make_color_wheel' 9 | ] 10 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmcv_custom/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from .checkpoint import load_checkpoint 4 | 5 | __all__ = ['load_checkpoint'] -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmseg/apis/__init__.py: -------------------------------------------------------------------------------- 1 | from .inference import inference_segmentor, init_segmentor, show_result_pyplot 2 | from .test import multi_gpu_test, single_gpu_test 3 | from .train import get_root_logger, set_random_seed, train_segmentor 4 | 5 | __all__ = [ 6 | 'get_root_logger', 'set_random_seed', 'train_segmentor', 'init_segmentor', 7 | 'inference_segmentor', 'multi_gpu_test', 'single_gpu_test', 8 | 'show_result_pyplot' 9 | ] 10 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmseg/core/__init__.py: -------------------------------------------------------------------------------- 1 | from .evaluation import * # noqa: F401, F403 2 | from .seg import * # noqa: F401, F403 3 | from .utils import * # noqa: F401, F403 4 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmseg/core/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from .class_names import get_classes, get_palette 2 | from .eval_hooks import DistEvalHook, EvalHook 3 | from .metrics import eval_metrics, mean_dice, mean_fscore, mean_iou 4 | 5 | __all__ = [ 6 | 'EvalHook', 'DistEvalHook', 'mean_dice', 'mean_iou', 'mean_fscore', 7 | 'eval_metrics', 'get_classes', 'get_palette' 8 | ] 9 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmseg/core/seg/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .builder import build_pixel_sampler 2 | from .sampler import BasePixelSampler, OHEMPixelSampler 3 | 4 | __all__ = ['build_pixel_sampler', 'BasePixelSampler', 'OHEMPixelSampler'] 5 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmseg/core/seg/builder.py: -------------------------------------------------------------------------------- 1 | from annotator.uniformer.mmcv.utils import Registry, build_from_cfg 2 | 3 | PIXEL_SAMPLERS = Registry('pixel sampler') 4 | 5 | 6 | def build_pixel_sampler(cfg, **default_args): 7 | """Build pixel sampler for segmentation map.""" 8 | return build_from_cfg(cfg, PIXEL_SAMPLERS, default_args) 9 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmseg/core/seg/sampler/__init__.py: -------------------------------------------------------------------------------- 1 | from .base_pixel_sampler import BasePixelSampler 2 | from .ohem_pixel_sampler import OHEMPixelSampler 3 | 4 | __all__ = ['BasePixelSampler', 'OHEMPixelSampler'] 5 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmseg/core/seg/sampler/base_pixel_sampler.py: -------------------------------------------------------------------------------- 1 | from abc import ABCMeta, abstractmethod 2 | 3 | 4 | class BasePixelSampler(metaclass=ABCMeta): 5 | """Base class of pixel sampler.""" 6 | 7 | def __init__(self, **kwargs): 8 | pass 9 | 10 | @abstractmethod 11 | def sample(self, seg_logit, seg_label): 12 | """Placeholder for sample function.""" 13 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmseg/core/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .misc import add_prefix 2 | 3 | __all__ = ['add_prefix'] 4 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmseg/core/utils/misc.py: -------------------------------------------------------------------------------- 1 | def add_prefix(inputs, prefix): 2 | """Add prefix for dict. 3 | 4 | Args: 5 | inputs (dict): The input dict with str keys. 6 | prefix (str): The prefix to add. 7 | 8 | Returns: 9 | 10 | dict: The dict with keys updated with ``prefix``. 
11 | """ 12 | 13 | outputs = dict() 14 | for name, value in inputs.items(): 15 | outputs[f'{prefix}.{name}'] = value 16 | 17 | return outputs 18 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmseg/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .ade import ADE20KDataset 2 | from .builder import DATASETS, PIPELINES, build_dataloader, build_dataset 3 | from .chase_db1 import ChaseDB1Dataset 4 | from .cityscapes import CityscapesDataset 5 | from .custom import CustomDataset 6 | from .dataset_wrappers import ConcatDataset, RepeatDataset 7 | from .drive import DRIVEDataset 8 | from .hrf import HRFDataset 9 | from .pascal_context import PascalContextDataset, PascalContextDataset59 10 | from .stare import STAREDataset 11 | from .voc import PascalVOCDataset 12 | 13 | __all__ = [ 14 | 'CustomDataset', 'build_dataloader', 'ConcatDataset', 'RepeatDataset', 15 | 'DATASETS', 'build_dataset', 'PIPELINES', 'CityscapesDataset', 16 | 'PascalVOCDataset', 'ADE20KDataset', 'PascalContextDataset', 17 | 'PascalContextDataset59', 'ChaseDB1Dataset', 'DRIVEDataset', 'HRFDataset', 18 | 'STAREDataset' 19 | ] 20 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmseg/datasets/chase_db1.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | 3 | from .builder import DATASETS 4 | from .custom import CustomDataset 5 | 6 | 7 | @DATASETS.register_module() 8 | class ChaseDB1Dataset(CustomDataset): 9 | """Chase_db1 dataset. 10 | 11 | In segmentation map annotation for Chase_db1, 0 stands for background, 12 | which is included in 2 categories. ``reduce_zero_label`` is fixed to False. 13 | The ``img_suffix`` is fixed to '.png' and ``seg_map_suffix`` is fixed to 14 | '_1stHO.png'. 15 | """ 16 | 17 | CLASSES = ('background', 'vessel') 18 | 19 | PALETTE = [[120, 120, 120], [6, 230, 230]] 20 | 21 | def __init__(self, **kwargs): 22 | super(ChaseDB1Dataset, self).__init__( 23 | img_suffix='.png', 24 | seg_map_suffix='_1stHO.png', 25 | reduce_zero_label=False, 26 | **kwargs) 27 | assert osp.exists(self.img_dir) 28 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmseg/datasets/drive.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | 3 | from .builder import DATASETS 4 | from .custom import CustomDataset 5 | 6 | 7 | @DATASETS.register_module() 8 | class DRIVEDataset(CustomDataset): 9 | """DRIVE dataset. 10 | 11 | In segmentation map annotation for DRIVE, 0 stands for background, which is 12 | included in 2 categories. ``reduce_zero_label`` is fixed to False. The 13 | ``img_suffix`` is fixed to '.png' and ``seg_map_suffix`` is fixed to 14 | '_manual1.png'. 
15 | """ 16 | 17 | CLASSES = ('background', 'vessel') 18 | 19 | PALETTE = [[120, 120, 120], [6, 230, 230]] 20 | 21 | def __init__(self, **kwargs): 22 | super(DRIVEDataset, self).__init__( 23 | img_suffix='.png', 24 | seg_map_suffix='_manual1.png', 25 | reduce_zero_label=False, 26 | **kwargs) 27 | assert osp.exists(self.img_dir) 28 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmseg/datasets/hrf.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | 3 | from .builder import DATASETS 4 | from .custom import CustomDataset 5 | 6 | 7 | @DATASETS.register_module() 8 | class HRFDataset(CustomDataset): 9 | """HRF dataset. 10 | 11 | In segmentation map annotation for HRF, 0 stands for background, which is 12 | included in 2 categories. ``reduce_zero_label`` is fixed to False. The 13 | ``img_suffix`` is fixed to '.png' and ``seg_map_suffix`` is fixed to 14 | '.png'. 15 | """ 16 | 17 | CLASSES = ('background', 'vessel') 18 | 19 | PALETTE = [[120, 120, 120], [6, 230, 230]] 20 | 21 | def __init__(self, **kwargs): 22 | super(HRFDataset, self).__init__( 23 | img_suffix='.png', 24 | seg_map_suffix='.png', 25 | reduce_zero_label=False, 26 | **kwargs) 27 | assert osp.exists(self.img_dir) 28 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmseg/datasets/pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | from .compose import Compose 2 | from .formating import (Collect, ImageToTensor, ToDataContainer, ToTensor, 3 | Transpose, to_tensor) 4 | from .loading import LoadAnnotations, LoadImageFromFile 5 | from .test_time_aug import MultiScaleFlipAug 6 | from .transforms import (CLAHE, AdjustGamma, Normalize, Pad, 7 | PhotoMetricDistortion, RandomCrop, RandomFlip, 8 | RandomRotate, Rerange, Resize, RGB2Gray, SegRescale) 9 | 10 | __all__ = [ 11 | 'Compose', 'to_tensor', 'ToTensor', 'ImageToTensor', 'ToDataContainer', 12 | 'Transpose', 'Collect', 'LoadAnnotations', 'LoadImageFromFile', 13 | 'MultiScaleFlipAug', 'Resize', 'RandomFlip', 'Pad', 'RandomCrop', 14 | 'Normalize', 'SegRescale', 'PhotoMetricDistortion', 'RandomRotate', 15 | 'AdjustGamma', 'CLAHE', 'Rerange', 'RGB2Gray' 16 | ] 17 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmseg/datasets/stare.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | 3 | from .builder import DATASETS 4 | from .custom import CustomDataset 5 | 6 | 7 | @DATASETS.register_module() 8 | class STAREDataset(CustomDataset): 9 | """STARE dataset. 10 | 11 | In segmentation map annotation for STARE, 0 stands for background, which is 12 | included in 2 categories. ``reduce_zero_label`` is fixed to False. The 13 | ``img_suffix`` is fixed to '.png' and ``seg_map_suffix`` is fixed to 14 | '.ah.png'. 
15 | """ 16 | 17 | CLASSES = ('background', 'vessel') 18 | 19 | PALETTE = [[120, 120, 120], [6, 230, 230]] 20 | 21 | def __init__(self, **kwargs): 22 | super(STAREDataset, self).__init__( 23 | img_suffix='.png', 24 | seg_map_suffix='.ah.png', 25 | reduce_zero_label=False, 26 | **kwargs) 27 | assert osp.exists(self.img_dir) 28 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmseg/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .backbones import * # noqa: F401,F403 2 | from .builder import (BACKBONES, HEADS, LOSSES, SEGMENTORS, build_backbone, 3 | build_head, build_loss, build_segmentor) 4 | from .decode_heads import * # noqa: F401,F403 5 | from .losses import * # noqa: F401,F403 6 | from .necks import * # noqa: F401,F403 7 | from .segmentors import * # noqa: F401,F403 8 | 9 | __all__ = [ 10 | 'BACKBONES', 'HEADS', 'LOSSES', 'SEGMENTORS', 'build_backbone', 11 | 'build_head', 'build_loss', 'build_segmentor' 12 | ] 13 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmseg/models/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | from .cgnet import CGNet 2 | # from .fast_scnn import FastSCNN 3 | from .hrnet import HRNet 4 | from .mobilenet_v2 import MobileNetV2 5 | from .mobilenet_v3 import MobileNetV3 6 | from .resnest import ResNeSt 7 | from .resnet import ResNet, ResNetV1c, ResNetV1d 8 | from .resnext import ResNeXt 9 | from .unet import UNet 10 | from .vit import VisionTransformer 11 | from .uniformer import UniFormer 12 | 13 | __all__ = [ 14 | 'ResNet', 'ResNetV1c', 'ResNetV1d', 'ResNeXt', 'HRNet', 15 | 'ResNeSt', 'MobileNetV2', 'UNet', 'CGNet', 'MobileNetV3', 16 | 'VisionTransformer', 'UniFormer' 17 | ] 18 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmseg/models/decode_heads/__init__.py: -------------------------------------------------------------------------------- 1 | from .ann_head import ANNHead 2 | from .apc_head import APCHead 3 | from .aspp_head import ASPPHead 4 | from .cc_head import CCHead 5 | from .da_head import DAHead 6 | from .dm_head import DMHead 7 | from .dnl_head import DNLHead 8 | from .ema_head import EMAHead 9 | from .enc_head import EncHead 10 | from .fcn_head import FCNHead 11 | from .fpn_head import FPNHead 12 | from .gc_head import GCHead 13 | from .lraspp_head import LRASPPHead 14 | from .nl_head import NLHead 15 | from .ocr_head import OCRHead 16 | # from .point_head import PointHead 17 | from .psa_head import PSAHead 18 | from .psp_head import PSPHead 19 | from .sep_aspp_head import DepthwiseSeparableASPPHead 20 | from .sep_fcn_head import DepthwiseSeparableFCNHead 21 | from .uper_head import UPerHead 22 | 23 | __all__ = [ 24 | 'FCNHead', 'PSPHead', 'ASPPHead', 'PSAHead', 'NLHead', 'GCHead', 'CCHead', 25 | 'UPerHead', 'DepthwiseSeparableASPPHead', 'ANNHead', 'DAHead', 'OCRHead', 26 | 'EncHead', 'DepthwiseSeparableFCNHead', 'FPNHead', 'EMAHead', 'DNLHead', 27 | 'APCHead', 'DMHead', 'LRASPPHead' 28 | ] 29 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmseg/models/losses/__init__.py: -------------------------------------------------------------------------------- 1 | from .accuracy import Accuracy, accuracy 2 | from .cross_entropy_loss import (CrossEntropyLoss, 
binary_cross_entropy, 3 | cross_entropy, mask_cross_entropy) 4 | from .dice_loss import DiceLoss 5 | from .lovasz_loss import LovaszLoss 6 | from .utils import reduce_loss, weight_reduce_loss, weighted_loss 7 | 8 | __all__ = [ 9 | 'accuracy', 'Accuracy', 'cross_entropy', 'binary_cross_entropy', 10 | 'mask_cross_entropy', 'CrossEntropyLoss', 'reduce_loss', 11 | 'weight_reduce_loss', 'weighted_loss', 'LovaszLoss', 'DiceLoss' 12 | ] 13 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmseg/models/necks/__init__.py: -------------------------------------------------------------------------------- 1 | from .fpn import FPN 2 | from .multilevel_neck import MultiLevelNeck 3 | 4 | __all__ = ['FPN', 'MultiLevelNeck'] 5 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmseg/models/segmentors/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import BaseSegmentor 2 | from .cascade_encoder_decoder import CascadeEncoderDecoder 3 | from .encoder_decoder import EncoderDecoder 4 | 5 | __all__ = ['BaseSegmentor', 'EncoderDecoder', 'CascadeEncoderDecoder'] 6 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmseg/models/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .drop import DropPath 2 | from .inverted_residual import InvertedResidual, InvertedResidualV3 3 | from .make_divisible import make_divisible 4 | from .res_layer import ResLayer 5 | from .se_layer import SELayer 6 | from .self_attention_block import SelfAttentionBlock 7 | from .up_conv_block import UpConvBlock 8 | from .weight_init import trunc_normal_ 9 | 10 | __all__ = [ 11 | 'ResLayer', 'SelfAttentionBlock', 'make_divisible', 'InvertedResidual', 12 | 'UpConvBlock', 'InvertedResidualV3', 'SELayer', 'DropPath', 'trunc_normal_' 13 | ] 14 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmseg/models/utils/drop.py: -------------------------------------------------------------------------------- 1 | """Modified from https://github.com/rwightman/pytorch-image- 2 | models/blob/master/timm/models/layers/drop.py.""" 3 | 4 | import torch 5 | from torch import nn 6 | 7 | 8 | class DropPath(nn.Module): 9 | """Drop paths (Stochastic Depth) per sample (when applied in main path of 10 | residual blocks). 11 | 12 | Args: 13 | drop_prob (float): Drop rate for paths of model. Dropout rate has 14 | to be between 0 and 1. Default: 0. 15 | """ 16 | 17 | def __init__(self, drop_prob=0.): 18 | super(DropPath, self).__init__() 19 | self.drop_prob = drop_prob 20 | self.keep_prob = 1 - drop_prob 21 | 22 | def forward(self, x): 23 | if self.drop_prob == 0. 
or not self.training: 24 | return x 25 | shape = (x.shape[0], ) + (1, ) * ( 26 | x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets 27 | random_tensor = self.keep_prob + torch.rand( 28 | shape, dtype=x.dtype, device=x.device) 29 | random_tensor.floor_() # binarize 30 | output = x.div(self.keep_prob) * random_tensor 31 | return output 32 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmseg/ops/__init__.py: -------------------------------------------------------------------------------- 1 | from .encoding import Encoding 2 | from .wrappers import Upsample, resize 3 | 4 | __all__ = ['Upsample', 'resize', 'Encoding'] 5 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmseg/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .collect_env import collect_env 2 | from .logger import get_root_logger 3 | 4 | __all__ = ['get_root_logger', 'collect_env'] 5 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmseg/utils/collect_env.py: -------------------------------------------------------------------------------- 1 | from annotator.uniformer.mmcv.utils import collect_env as collect_base_env 2 | from annotator.uniformer.mmcv.utils import get_git_hash 3 | 4 | import annotator.uniformer.mmseg as mmseg 5 | 6 | 7 | def collect_env(): 8 | """Collect the information of the running environments.""" 9 | env_info = collect_base_env() 10 | env_info['MMSegmentation'] = f'{mmseg.__version__}+{get_git_hash()[:7]}' 11 | 12 | return env_info 13 | 14 | 15 | if __name__ == '__main__': 16 | for name, val in collect_env().items(): 17 | print('{}: {}'.format(name, val)) 18 | -------------------------------------------------------------------------------- /lavis/common/annotator/uniformer/mmseg/utils/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from annotator.uniformer.mmcv.utils import get_logger 4 | 5 | 6 | def get_root_logger(log_file=None, log_level=logging.INFO): 7 | """Get the root logger. 8 | 9 | The logger will be initialized if it has not been initialized. By default a 10 | StreamHandler will be added. If `log_file` is specified, a FileHandler will 11 | also be added. The name of the root logger is the top-level package name, 12 | e.g., "mmseg". 13 | 14 | Args: 15 | log_file (str | None): The log filename. If specified, a FileHandler 16 | will be added to the root logger. 17 | log_level (int): The root logger level. Note that only the process of 18 | rank 0 is affected, while other processes will set the level to 19 | "Error" and be silent most of the time. 20 | 21 | Returns: 22 | logging.Logger: The root logger. 
23 | """ 24 | 25 | logger = get_logger(name='mmseg', log_file=log_file, log_level=log_level) 26 | 27 | return logger 28 | -------------------------------------------------------------------------------- /lavis/common/annotator/util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cv2 3 | import os 4 | 5 | 6 | annotator_ckpts_path = os.path.join(os.path.dirname(__file__), 'ckpts') 7 | 8 | 9 | def HWC3(x): 10 | assert x.dtype == np.uint8 11 | if x.ndim == 2: 12 | x = x[:, :, None] 13 | assert x.ndim == 3 14 | H, W, C = x.shape 15 | assert C == 1 or C == 3 or C == 4 16 | if C == 3: 17 | return x 18 | if C == 1: 19 | return np.concatenate([x, x, x], axis=2) 20 | if C == 4: 21 | color = x[:, :, 0:3].astype(np.float32) 22 | alpha = x[:, :, 3:4].astype(np.float32) / 255.0 23 | y = color * alpha + 255.0 * (1.0 - alpha) 24 | y = y.clip(0, 255).astype(np.uint8) 25 | return y 26 | 27 | 28 | def resize_image(input_image, resolution): 29 | H, W, C = input_image.shape 30 | H = float(H) 31 | W = float(W) 32 | k = float(resolution) / min(H, W) 33 | H *= k 34 | W *= k 35 | H = int(np.round(H / 64.0)) * 64 36 | W = int(np.round(W / 64.0)) * 64 37 | img = cv2.resize(input_image, (W, H), interpolation=cv2.INTER_LANCZOS4 if k > 1 else cv2.INTER_AREA) 38 | return img 39 | -------------------------------------------------------------------------------- /lavis/common/gradcam.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from matplotlib import pyplot as plt 3 | from scipy.ndimage import filters 4 | from skimage import transform as skimage_transform 5 | 6 | 7 | def getAttMap(img, attMap, blur=True, overlap=True): 8 | attMap -= attMap.min() 9 | if attMap.max() > 0: 10 | attMap /= attMap.max() 11 | attMap = skimage_transform.resize(attMap, (img.shape[:2]), order=3, mode="constant") 12 | if blur: 13 | attMap = filters.gaussian_filter(attMap, 0.02 * max(img.shape[:2])) 14 | attMap -= attMap.min() 15 | attMap /= attMap.max() 16 | cmap = plt.get_cmap("jet") 17 | attMapV = cmap(attMap) 18 | attMapV = np.delete(attMapV, 3, 2) 19 | if overlap: 20 | attMap = ( 21 | 1 * (1 - attMap**0.7).reshape(attMap.shape + (1,)) * img 22 | + (attMap**0.7).reshape(attMap.shape + (1,)) * attMapV 23 | ) 24 | return attMap 25 | -------------------------------------------------------------------------------- /lavis/common/vqa_tools/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | __author__ = "aagrawal" 9 | -------------------------------------------------------------------------------- /lavis/configs/datasets/blip_diffusion_datasets/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | blip_diffusion_finetune: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | images: 14 | storage: "" 15 | -------------------------------------------------------------------------------- /lavis/configs/datasets/conceptual_caption/defaults_12m.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | conceptual_caption_12m: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | - /export/home/workspace/datasets/cc12m.json 17 | storage: 18 | - conceptual_caption/annotations/cc12m.json 19 | images: 20 | storage: conceptual_caption/images_12m 21 | -------------------------------------------------------------------------------- /lavis/configs/datasets/conceptual_caption/defaults_3m.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | conceptual_caption_3m: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | - /export/home/workspace/datasets/cc3m.json 17 | storage: 18 | - conceptual_caption/annotations/cc3m.json 19 | images: 20 | storage: conceptual_caption/images 21 | -------------------------------------------------------------------------------- /lavis/configs/datasets/flickr30k/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | flickr30k: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images 10 | 11 | build_info: 12 | annotations: 13 | train: 14 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_train.json 15 | storage: flickr30k/annotations/train.json 16 | val: 17 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_val.json 18 | storage: flickr30k/annotations/val.json 19 | test: 20 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_test.json 21 | storage: flickr30k/annotations/test.json 22 | images: 23 | storage: flickr30k/images 24 | # storage: /export/share/datasets/vision/flickr30k 25 | -------------------------------------------------------------------------------- /lavis/configs/datasets/imagenet/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | imagenet: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | splits: ["val"] 14 | images: 15 | storage: /export/share/datasets/vision/imagenet 16 | -------------------------------------------------------------------------------- /lavis/configs/datasets/laion/defaults_2B_multi.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | laion2B_multi: 8 | 9 | data_type: images 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | storage: /export/laion/laion2B-multi/part-00000/{00000..01743}.tar 14 | -------------------------------------------------------------------------------- /lavis/configs/datasets/laion/defaults_400M.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | laion400M: 8 | 9 | data_type: images 10 | 11 | text_processor: 12 | train: 13 | name: blip_caption 14 | eval: 15 | name: blip_caption 16 | 17 | build_info: 18 | # Be careful not to append minus sign (-) before split to avoid itemizing 19 | storage: /export/laion400m-data-ssd/laion115m_capfilt_20220817/{part0/part0,part1/part1,part2/part2}_node{00..15}_shard{000000..000118}.tar 20 | # storage: /export/laion/laion2B-multi/part-00000/{00000..01743}.tar 21 | -------------------------------------------------------------------------------- /lavis/configs/datasets/laion/defaults_400M_instruct.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | laion400M_instruct: 8 | 9 | data_type: images 10 | 11 | vis_processor: 12 | train: 13 | name: "clip_image_train" 14 | image_size: 224 15 | eval: 16 | name: "clip_image_eval" 17 | image_size: 224 18 | 19 | 20 | text_processor: 21 | train: 22 | name: blip_instruction 23 | modality: image 24 | task: caption 25 | eval: 26 | name: blip_caption 27 | 28 | build_info: 29 | # Be careful not to append minus sign (-) before split to avoid itemizing 30 | storage: /export/laion400m-data-ssd/laion115m_capfilt_20220817/{part0/part0,part1/part1,part2/part2}_node{00..15}_shard{000000..000118}.tar 31 | # storage: /export/laion/laion2B-multi/part-00000/{00000..01743}.tar 32 | -------------------------------------------------------------------------------- /lavis/configs/datasets/llava150k/defaults_dial.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | llava150k_dialogue_instruct: #394276 train examples 8 | 9 | data_type: images 10 | 11 | vis_processor: 12 | train: 13 | name: "clip_image_train" 14 | image_size: 224 15 | eval: 16 | name: "clip_image_eval" 17 | image_size: 224 18 | 19 | text_processor: 20 | train: 21 | name: "blip_caption" 22 | 23 | build_info: 24 | annotations: 25 | train: 26 | url: 27 | - https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K/resolve/main/llava_instruct_150k.json 28 | storage: 29 | - LLaVA-Instruct-150K/annotations/lava_instruct_150k.json 30 | # Be careful not to append minus sign (-) before split to avoid itemizing 31 | images: 32 | storage: /export/share/datasets/vision/coco/images/train2017 33 | -------------------------------------------------------------------------------- /lavis/configs/datasets/msrvtt/defaults_cap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | msrvtt_cap: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_train.json 16 | storage: msrvtt/annotations/cap_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_val.json 19 | storage: msrvtt/annotations/cap_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_test.json 22 | storage: msrvtt/annotations/cap_test.json 23 | videos: 24 | storage: msrvtt/videos 25 | -------------------------------------------------------------------------------- /lavis/configs/datasets/msvd/defaults_cap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 
2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | msvd_cap: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_train.json 16 | storage: msvd/annotations/cap_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_val.json 19 | storage: msvd/annotations/cap_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_test.json 22 | storage: msvd/annotations/cap_test.json 23 | videos: 24 | storage: msvd/videos 25 | -------------------------------------------------------------------------------- /lavis/configs/datasets/nlvr/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | nlvr: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_train.json 16 | storage: nlvr/annotations/train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_dev.json 19 | storage: nlvr/annotations/dev.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_dev.json 22 | storage: nlvr/annotations/test.json 23 | images: 24 | storage: /export/share/datasets/vision/NLVR2/ 25 | -------------------------------------------------------------------------------- /lavis/configs/datasets/nocaps/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | nocaps: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | val: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/nocaps_val.json 16 | storage: nocaps/annotations/nocaps_val.json 17 | test: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/nocaps_test.json 19 | storage: nocaps/annotations/nocaps_test.json 20 | images: 21 | storage: nocaps/images 22 | # storage: /export/share/datasets/vision/nocaps/ 23 | -------------------------------------------------------------------------------- /lavis/configs/datasets/sbu_caption/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | sbu_caption: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/sbu/sbu.json 17 | # - /export/share/dongxuli/data/lavis/sbu/annotation/sbu.json 18 | storage: 19 | - sbu_captions/annotations/sbu.json 20 | images: 21 | storage: sbu_captions/images 22 | # storage: /export/share/datasets/vision_language/sbu_resize 23 | -------------------------------------------------------------------------------- /lavis/configs/datasets/snli_ve/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | snli_ve: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: /export/share/dongxuli/data/lavis/snli/annotation/ve_train.json 16 | storage: snli/annotations/ve_train.json 17 | val: 18 | url: /export/share/dongxuli/data/lavis/snli/annotation/ve_dev.json 19 | storage: snli/annotations/ve_dev.json 20 | test: 21 | url: /export/share/dongxuli/data/lavis/snli/annotation/ve_test.json 22 | storage: snli/annotations/ve_test.json 23 | images: 24 | storage: flickr30k/images/flickr30k-images 25 | # storage: /export/share/datasets/vision/flickr30k/flickr30k-images 26 | -------------------------------------------------------------------------------- /lavis/configs/datasets/vg/defaults_caption.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | vg_caption: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/visual_genome/vg_caption.json 16 | storage: vg/annotations/vg_caption.json 17 | images: 18 | storage: vg/images/ 19 | -------------------------------------------------------------------------------- /lavis/configs/datasets/vg/defaults_vqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | vg_vqa: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/visual_genome/vg_qa.json 16 | storage: vg/annotations/vg_qa.json 17 | images: 18 | storage: vg/images/ 19 | -------------------------------------------------------------------------------- /lavis/configs/default.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | env: 7 | # For default users 8 | # cache_root: "cache" 9 | # For internal use with persistent storage 10 | cache_root: "/export/home/.cache/lavis" 11 | -------------------------------------------------------------------------------- /lavis/configs/models/albef_classification_ve.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_classification 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_snli_ve_lavis.pt" 11 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 12 | 13 | num_classes: 3 14 | 15 | use_distill: True 16 | momentum: 0.995 17 | alpha: 0.4 18 | 19 | # vit encoder 20 | vit_type: "base" 21 | vit_grad_ckpt: False 22 | vit_ckpt_layer: 0 23 | vit_layer_norm_epsilon: 1e-6 24 | 25 | image_size: 384 26 | 27 | # bert config 28 | med_config_path: "configs/models/med_config_albef.json" 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | eval: 35 | name: "blip_image_eval" 36 | text_processor: 37 | train: 38 | name: "blip_caption" 39 | eval: 40 | name: "blip_caption" 41 | -------------------------------------------------------------------------------- /lavis/configs/models/albef_feature_extractor.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_pretrain 8 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 9 | 10 | # vit encoder 11 | vit_type: "base" 12 | image_size: 224 13 | vit_ckpt_layer: 0 14 | vit_drop_path_rate: 0 15 | vit_layer_norm_epsilon: 1e-6 16 | vit_grad_ckpt: False 17 | 18 | # bert config 19 | med_config_path: "configs/models/med_config_albef.json" 20 | 21 | embed_dim: 256 22 | 23 | preprocess: 24 | vis_processor: 25 | eval: 26 | name: "blip_image_eval" 27 | image_size: 224 28 | text_processor: 29 | eval: 30 | name: "blip_caption" 31 | -------------------------------------------------------------------------------- /lavis/configs/models/albef_pretrain_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_pretrain 8 | 9 | load_pretrained: True 10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 11 | 12 | # vit encoder 13 | vit_type: "base" 14 | image_size: 224 15 | vit_ckpt_layer: 0 16 | vit_drop_path_rate: 0 17 | vit_layer_norm_epsilon: 1e-6 18 | vit_grad_ckpt: False 19 | 20 | # bert config 21 | med_config_path: "configs/models/med_config_albef.json" 22 | mlm_mask_prob: 0.15 23 | 24 | embed_dim: 256 25 | momentum: 0.995 26 | alpha: 0.4 27 | temp: 0.07 28 | 29 | max_txt_len: 30 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip_image_train" 35 | image_size: 256 36 | text_processor: 37 | train: 38 | name: "blip_caption" 39 | -------------------------------------------------------------------------------- /lavis/configs/models/alpro_retrieval_didemo.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_retrieval 8 | 9 | load_finetuned: True 10 | 11 | finetuned: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_didemo_retrieval.pt 12 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" 13 | 14 | timesformer: 15 | n_frms: 8 16 | image_size: 224 17 | 18 | patch_size: 16 19 | attn_drop_rate: 0. 20 | drop_rate: 0. 21 | drop_path_rate: 0.1 22 | use_grad_ckpt: False 23 | 24 | # bert config 25 | med_config_path: "configs/models/bert_config_alpro.json" 26 | 27 | preprocess: 28 | vis_processor: 29 | eval: 30 | name: "alpro_video_eval" 31 | n_frms: 8 32 | image_size: 224 33 | text_processor: 34 | eval: 35 | name: "blip_caption" 36 | -------------------------------------------------------------------------------- /lavis/configs/models/bert_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30522, 19 | "encoder_width": 768, 20 | "add_cross_attention": true 21 | } -------------------------------------------------------------------------------- /lavis/configs/models/bert_config_alpro.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": true, 18 | "type_vocab_size": 2, 19 | "vocab_size": 30522, 20 | "encoder_width": 768, 21 | "add_cross_attention": false, 22 | "fusion_layer": 6 23 | } -------------------------------------------------------------------------------- /lavis/configs/models/blip-diffusion/blip_diffusion_base.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | vit_model: "clip_L" 3 | 4 | qformer_num_query_token: 16 5 | qformer_cross_attention_freq: 1 6 | 7 | sd_train_text_encoder: False 8 | sd_pretrained_model_name_or_path: "runwayml/stable-diffusion-v1-5" 9 | 10 | load_finetuned: False 11 | load_pretrained: True 12 | # pretrained: "/export/share/dongxuli/zerobooth/500000-renamed/" 13 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP-Diffusion/blip-diffusion.tar.gz" 14 | 15 | preprocess: 16 | vis_processor: 17 | train: 18 | name: "blip_diffusion_inp_image_eval" 19 | eval: 20 | name: "blip_diffusion_inp_image_eval" 21 | text_processor: 22 | train: 23 | name: "blip_caption" 24 | eval: 25 | name: "blip_caption" 26 | -------------------------------------------------------------------------------- 
/lavis/configs/models/blip-diffusion/blip_diffusion_controlnet_canny.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | vit_model: "clip_L" 3 | 4 | qformer_num_query_token: 16 5 | qformer_cross_attention_freq: 1 6 | 7 | sd_train_text_encoder: False 8 | sd_pretrained_model_name_or_path: "runwayml/stable-diffusion-v1-5" 9 | 10 | load_finetuned: False 11 | load_pretrained: True 12 | # pretrained: "/export/share/dongxuli/zerobooth/500000-renamed/" 13 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP-Diffusion/blip-diffusion.tar.gz" 14 | 15 | controlnet_pretrained_model_name_or_path: "lllyasviel/sd-controlnet-canny" 16 | 17 | preprocess: 18 | vis_processor: 19 | train: 20 | name: "blip_diffusion_inp_image_eval" 21 | eval: 22 | name: "blip_diffusion_inp_image_eval" 23 | text_processor: 24 | train: 25 | name: "blip_caption" 26 | eval: 27 | name: "blip_caption" 28 | -------------------------------------------------------------------------------- /lavis/configs/models/blip-diffusion/blip_diffusion_controlnet_depth.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | vit_model: "clip_L" 3 | 4 | qformer_num_query_token: 16 5 | qformer_cross_attention_freq: 1 6 | 7 | sd_train_text_encoder: False 8 | sd_pretrained_model_name_or_path: "runwayml/stable-diffusion-v1-5" 9 | 10 | load_finetuned: False 11 | load_pretrained: True 12 | # pretrained: "/export/share/dongxuli/zerobooth/500000-renamed/" 13 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP-Diffusion/blip-diffusion-openimage.tar.gz" 14 | 15 | controlnet_pretrained_model_name_or_path: "lllyasviel/sd-controlnet-depth" 16 | 17 | preprocess: 18 | vis_processor: 19 | train: 20 | name: "blip_diffusion_inp_image_eval" 21 | eval: 22 | name: "blip_diffusion_inp_image_eval" 23 | text_processor: 24 | train: 25 | name: "blip_caption" 26 | eval: 27 | name: "blip_caption" 28 | -------------------------------------------------------------------------------- /lavis/configs/models/blip-diffusion/blip_diffusion_controlnet_hed.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | vit_model: "clip_L" 3 | 4 | qformer_num_query_token: 16 5 | qformer_cross_attention_freq: 1 6 | 7 | sd_train_text_encoder: False 8 | sd_pretrained_model_name_or_path: "runwayml/stable-diffusion-v1-5" 9 | 10 | load_finetuned: False 11 | load_pretrained: True 12 | # pretrained: "/export/share/dongxuli/zerobooth/500000-renamed/" 13 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP-Diffusion/blip-diffusion-openimage.tar.gz" 14 | 15 | controlnet_pretrained_model_name_or_path: "lllyasviel/sd-controlnet-hed" 16 | 17 | preprocess: 18 | vis_processor: 19 | train: 20 | name: "blip_diffusion_inp_image_eval" 21 | eval: 22 | name: "blip_diffusion_inp_image_eval" 23 | text_processor: 24 | train: 25 | name: "blip_caption" 26 | eval: 27 | name: "blip_caption" 28 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: coco 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_finetune_coco.pth" 12 | 13 | # vit encoder 14 | image_size: 364 15 | drop_path_rate: 0 16 | use_grad_checkpoint: True 17 | vit_precision: "fp32" 18 | freeze_vit: False 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 364 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 364 32 | text_processor: 33 | train: 34 | name: "blip_caption" 35 | eval: 36 | name: "blip_caption" 37 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_pretrain.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 224 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 224 32 | text_processor: 33 | train: 34 | name: "blip_caption" 35 | eval: 36 | name: "blip_caption" 37 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_flant5xl 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xl.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # T5 24 | t5_model: "google/flan-t5-xl" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 
2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_flant5xxl 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xxl.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # T5 24 | t5_model: "google/flan-t5-xxl" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_opt2.7b 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt2.7b.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-2.7b" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_opt6.7b 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt6.7b.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-6.7b" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /lavis/configs/models/blip2/blip2_pretrain_vitL.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_vitL.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | vit_model: "clip_L" 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | 25 | preprocess: 26 | vis_processor: 27 | train: 28 | name: "blip_image_train" 29 | image_size: 224 30 | eval: 31 | name: "blip_image_eval" 32 | image_size: 224 33 | text_processor: 34 | train: 35 | name: "blip_caption" 36 | eval: 37 | name: "blip_caption" 38 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_caption_large_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption.pth" 12 | 13 | vit_type: "large" 14 | vit_grad_ckpt: True 15 | vit_ckpt_layer: 5 16 | 17 | image_size: 384 18 | 19 | # bert config 20 | med_config_path: "configs/models/med_large_config.json" 21 | 22 | # generation configs 23 | prompt: "a picture of " 24 | 25 | 26 | preprocess: 27 | vis_processor: 28 | train: 29 | name: "blip_image_train" 30 | eval: 31 | name: "blip_image_eval" 32 | text_processor: 33 | train: 34 | name: "blip_caption" 35 | prompt: "a picture of " 36 | eval: 37 | name: "blip_caption" 38 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_classification_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_classification 8 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth" 9 | 10 | use_distill: True 11 | momentum: 0.995 12 | alpha: 0.4 13 | 14 | # vit encoder 15 | vit_type: "base" 16 | vit_grad_ckpt: False 17 | vit_ckpt_layer: 0 18 | 19 | image_size: 384 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_feature_extractor_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_pretrain 8 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 9 | 10 | # vit encoder 11 | vit_type: "base" 12 | vit_grad_ckpt: False 13 | vit_ckpt_layer: 0 14 | 15 | image_size: 224 16 | 17 | # bert config 18 | med_config_path: "configs/models/med_config.json" 19 | 20 | embed_dim: 256 21 | 22 | preprocess: 23 | vis_processor: 24 | eval: 25 | name: "blip_image_eval" 26 | image_size: 224 27 | text_processor: 28 | eval: 29 | name: "blip_caption" 30 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_itm_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_image_text_matching 8 | 9 | load_finetuned: True 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_retrieval_coco.pth" 11 | 12 | # vit encoder 13 | vit_type: "base" 14 | vit_grad_ckpt: False 15 | vit_ckpt_layer: 0 16 | 17 | image_size: 384 18 | 19 | # bert config 20 | med_config_path: "configs/models/med_config.json" 21 | 22 | embed_dim: 256 23 | 24 | preprocess: 25 | vis_processor: 26 | eval: 27 | name: "blip_image_eval" 28 | image_size: 384 29 | text_processor: 30 | eval: 31 | name: "blip_caption" 32 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_itm_large.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_image_text_matching 8 | 9 | load_finetuned: True 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco.pth" 11 | 12 | # vit encoder 13 | vit_type: "large" 14 | vit_grad_ckpt: False 15 | vit_ckpt_layer: 0 16 | 17 | image_size: 384 18 | 19 | # bert config 20 | med_config_path: "configs/models/med_large_config.json" 21 | 22 | embed_dim: 256 23 | 24 | preprocess: 25 | vis_processor: 26 | eval: 27 | name: "blip_image_eval" 28 | image_size: 384 29 | text_processor: 30 | eval: 31 | name: "blip_caption" 32 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_pretrain_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_pretrain 8 | 9 | load_pretrained: True 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 11 | 12 | # vit encoder 13 | vit_type: "base" 14 | vit_grad_ckpt: False 15 | vit_ckpt_layer: 0 16 | 17 | image_size: 224 18 | alpha: 0.4 19 | 20 | # bert config 21 | med_config_path: "configs/models/bert_config.json" 22 | 23 | embed_dim: 256 24 | 25 | # generation configs 26 | prompt: "a picture of " 27 | 28 | preprocess: 29 | vis_processor: 30 | train: 31 | name: "blip_image_train" 32 | image_size: 224 33 | text_processor: 34 | train: 35 | name: "blip_caption" 36 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_pretrain_large.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_pretrain 8 | 9 | # vit encoder 10 | vit_type: "large" 11 | vit_grad_ckpt: True 12 | vit_ckpt_layer: 5 13 | 14 | image_size: 224 15 | 16 | # bert config 17 | med_config_path: "configs/models/med_large_config.json" 18 | 19 | embed_dim: 256 20 | 21 | # generation configs 22 | prompt: "a picture of " 23 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_retrieval_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_retrieval 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_coco_retrieval.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 12 | 13 | queue_size: 57600 14 | 15 | # vit encoder 16 | vit_type: "base" 17 | vit_grad_ckpt: True 18 | vit_ckpt_layer: 4 19 | 20 | image_size: 384 21 | 22 | # bert config 23 | med_config_path: "configs/models/med_config.json" 24 | 25 | embed_dim: 256 26 | 27 | preprocess: 28 | vis_processor: 29 | train: 30 | name: "blip_image_train" 31 | image_size: 384 32 | eval: 33 | name: "blip_image_eval" 34 | image_size: 384 35 | text_processor: 36 | train: 37 | name: "blip_caption" 38 | eval: 39 | name: "blip_caption" 40 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_vqa_aokvqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_aokvqa.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | vit_drop_path_rate: 0.1 18 | 19 | image_size: 480 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 480 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 480 32 | text_processor: 33 | train: 34 | name: "blip_question" 35 | eval: 36 | name: "blip_question" 37 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_vqa_okvqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_okvqa.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | vit_drop_path_rate: 0.1 18 | 19 | image_size: 480 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 480 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 480 32 | text_processor: 33 | train: 34 | name: "blip_question" 35 | eval: 36 | name: "blip_question" 37 | -------------------------------------------------------------------------------- /lavis/configs/models/blip_vqav2.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | vit_drop_path_rate: 0.1 18 | 19 | image_size: 480 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 480 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 480 32 | text_processor: 33 | train: 34 | name: "blip_question" 35 | eval: 36 | name: "blip_question" 37 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/RN101-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 23, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/RN101.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 23, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/RN50-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | 
"image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 6, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/RN50.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 6, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/RN50x16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 384, 5 | "layers": [ 6 | 6, 7 | 8, 8 | 18, 9 | 8 10 | ], 11 | "width": 96, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 768, 18 | "heads": 12, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/RN50x4.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 288, 5 | "layers": [ 6 | 4, 7 | 6, 8 | 10, 9 | 6 10 | ], 11 | "width": 80, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 640, 18 | "heads": 10, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-B-16-plus-240.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 240, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-B-16-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-B-32-plus-256.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 
| "image_size": 256, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-B-32-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": 12, 7 | "width": 768, 8 | "patch_size": 32 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-H-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 14 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-H-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 16 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-L-14-280.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 280, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-L-14-336.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 336, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-L-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | 
"patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-L-16-320.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 320, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-L-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/ViT-g-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 40, 6 | "width": 1408, 7 | "head_width": 88, 8 | "mlp_ratio": 4.3637, 9 | "patch_size": 14 10 | }, 11 | "text_cfg": { 12 | "context_length": 77, 13 | "vocab_size": 49408, 14 | "width": 1024, 15 | "heads": 16, 16 | "layers": 24 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/timm-efficientnetv2_rw_s.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "timm_model_name": "efficientnetv2_rw_s", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 288 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 768, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/timm-resnet50d.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "resnet50d", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/timm-resnetaa50d.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "resnetaa50d", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/timm-resnetblur50.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "resnetblur50", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/timm-swin_base_patch4_window7_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "swin_base_patch4_window7_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/timm-vit_base_patch16_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_base_patch16_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/timm-vit_base_patch32_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_base_patch32_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip/timm-vit_small_patch16_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_small_patch16_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /lavis/configs/models/clip_resnet50.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: RN50 10 | 11 | pretrained: openai 12 | -------------------------------------------------------------------------------- /lavis/configs/models/clip_vit_base16.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
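Editor's note: the CLIP JSON files above are open_clip-style architecture descriptions: embed_dim is the joint embedding size, vision_cfg describes either a ResNet (layers as a four-stage list) or a ViT (layers as an integer plus patch_size) or a timm backbone, and text_cfg fixes the text transformer. An illustrative, standard-library-only way to inspect one (path relative to the repository root):

import json

with open("lavis/configs/models/clip/ViT-B-16.json") as f:   # file shown above
    cfg = json.load(f)

print(cfg["embed_dim"])                    # 512: joint image-text embedding size
print(cfg["vision_cfg"]["patch_size"])     # 16: ViT patch size
print(cfg["text_cfg"]["context_length"])   # 77: maximum number of text tokens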
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-B-16 10 | 11 | pretrained: openai 12 | 13 | preprocess: 14 | vis_processor: 15 | eval: 16 | name: "clip_image_eval" 17 | image_size: 224 18 | -------------------------------------------------------------------------------- /lavis/configs/models/gpt_dialogue_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: gpt_dialogue 8 | # pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth" 9 | # pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 10 | 11 | len_tokenizer: 50264 # 50257 tokens from gpt2 default tokenizer + additional special tokens 12 | 13 | len_video_ft: 4224 # i3d_rgb: 2048 i3d_flow: 2048 vggish: 128 14 | 15 | preprocess: 16 | vis_processor: 17 | train: 18 | name: "gpt_video_ft" 19 | eval: 20 | name: "gpt_video_ft" 21 | text_processor: 22 | train: 23 | name: "gpt_dialogue" 24 | eval: 25 | name: "gpt_dialogue" -------------------------------------------------------------------------------- /lavis/configs/models/med_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30524, 19 | "encoder_width": 768, 20 | "add_cross_attention": true 21 | } -------------------------------------------------------------------------------- /lavis/configs/models/med_config_albef.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30522, 19 | "encoder_width": 768, 20 | "add_cross_attention": true, 21 | "fusion_layer": 6 22 | } -------------------------------------------------------------------------------- /lavis/configs/models/med_large_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | 
"num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30524, 19 | "encoder_width": 1024, 20 | "add_cross_attention": true 21 | } -------------------------------------------------------------------------------- /lavis/datasets/builders/audio_qa_builder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2023, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.common.registry import registry 9 | from lavis.datasets.builders.audio_caption_builder import AudioCapBuilder 10 | from lavis.datasets.datasets.audio_qa_datasets import AudioCapsQADataset, ClothoQADataset 11 | 12 | @registry.register_builder("audiocaps_mm_qa") 13 | class AudioCapsQABuilder(AudioCapBuilder): 14 | train_dataset_cls = AudioCapsQADataset 15 | eval_dataset_cls = AudioCapsQADataset 16 | 17 | DATASET_CONFIG_DICT = { 18 | "default": "configs/datasets/audiocaps/defaults_mm_qa.yaml", 19 | } 20 | 21 | @registry.register_builder("clotho_qa") 22 | class ClothoQABuilder(AudioCapBuilder): 23 | train_dataset_cls = ClothoQADataset 24 | eval_dataset_cls = ClothoQADataset 25 | 26 | DATASET_CONFIG_DICT = { 27 | "default": "configs/datasets/clotho/defaults_mm_qa.yaml", 28 | } -------------------------------------------------------------------------------- /lavis/datasets/builders/discrn_builders.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.common.registry import registry 9 | from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder, MultiModalDatasetBuilder 10 | from lavis.datasets.datasets.discriminatory_reasoning_datasets import DisCRnDataset 11 | 12 | 13 | 14 | @registry.register_builder("image_pc_discrn") 15 | class DiscrnImagePcBuilder(MultiModalDatasetBuilder): 16 | eval_dataset_cls = DisCRnDataset 17 | 18 | DATASET_CONFIG_DICT = { 19 | "default": "configs/datasets/discriminatory_reasoning/defaults_mm_image_pc.yaml", 20 | } 21 | 22 | @registry.register_builder("audio_video_discrn") 23 | class DiscrnAudioVideoBuilder(MultiModalDatasetBuilder): 24 | eval_dataset_cls = DisCRnDataset 25 | 26 | DATASET_CONFIG_DICT = { 27 | "default": "configs/datasets/discriminatory_reasoning/defaults_mm_audio_video.yaml", 28 | } 29 | -------------------------------------------------------------------------------- /lavis/datasets/builders/object3d_classification_builder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.common.registry import registry 9 | from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder, MultiModalDatasetBuilder 10 | from lavis.datasets.datasets.object3d_classification_datasets import ModelNetClassificationDataset 11 | 12 | @registry.register_builder("modelnet40_cls") 13 | class ModelNetClassificationBuilder(MultiModalDatasetBuilder): 14 | train_dataset_cls = ModelNetClassificationDataset 15 | eval_dataset_cls = ModelNetClassificationDataset 16 | 17 | DATASET_CONFIG_DICT = { 18 | "default": "configs/datasets/modelnet40/defaults_cls.yaml", 19 | } -------------------------------------------------------------------------------- /lavis/datasets/builders/object3d_qa_builder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.common.registry import registry 9 | from lavis.datasets.builders.object3d_caption_builder import ObjaverseCaptionBuilder 10 | from lavis.datasets.datasets.object3d_qa_datasets import ObjaverseQADataset 11 | 12 | @registry.register_builder("objaverse_mm_qa") 13 | class ObjaverseQABuilder(ObjaverseCaptionBuilder): 14 | train_dataset_cls = ObjaverseQADataset 15 | eval_dataset_cls = ObjaverseQADataset 16 | 17 | DATASET_CONFIG_DICT = { 18 | "default": "configs/datasets/objaverse/defaults_mm_qa.yaml", 19 | } -------------------------------------------------------------------------------- /lavis/datasets/datasets/multimodal_classification_datasets.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from abc import abstractmethod 9 | from lavis.datasets.datasets.base_dataset import BaseDataset 10 | 11 | 12 | class MultimodalClassificationDataset(BaseDataset): 13 | def __init__(self, vis_processor, text_processor, vis_root, ann_paths): 14 | super().__init__(vis_processor, text_processor, vis_root, ann_paths) 15 | 16 | self.class_labels = None 17 | 18 | @abstractmethod 19 | def _build_class_labels(self): 20 | pass 21 | -------------------------------------------------------------------------------- /lavis/datasets/download_scripts/download_charade.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2023, salesforce.com, inc. 3 | All rights reserved. 
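Editor's note: the dataset builders above follow one pattern: subclass a builder, register it under a string key with @registry.register_builder, and map that key to a default dataset YAML via DATASET_CONFIG_DICT. A hedged sketch of resolving a registered builder; get_builder_class and the no-argument constructor falling back to the default config are assumptions about the upstream registry and BaseDatasetBuilder, and build_datasets() may attempt to download data.

from lavis.common.registry import registry
import lavis.datasets.builders.object3d_classification_builder  # runs the decorator above  # noqa: F401

builder_cls = registry.get_builder_class("modelnet40_cls")   # key registered above
builder = builder_cls()                 # assumed to fall back to the "default" YAML
datasets = builder.build_datasets()     # dict of split name -> dataset; may download data
print(list(datasets.keys()))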
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import json 9 | from tqdm import tqdm 10 | 11 | train_file = './train.jsonl' 12 | test_file = './test.jsonl' 13 | 14 | train_data = [json.loads(l.strip()) for l in open(train_file).readlines()] 15 | test_data = [json.loads(l.strip()) for l in open(test_file).readlines()] 16 | 17 | for d in tqdm(train_data): 18 | d['video_path'] = d['video_id'] + '.mp4' 19 | d['ts'] = [float(d['start']), float(d['end'])] 20 | 21 | for d in tqdm(test_data): 22 | d['video_path'] = d['video_id'] + '.mp4' 23 | d['ts'] = [float(d['start']), float(d['end'])] 24 | 25 | json.dump(train_data, open('train_lavis.json', 'w')) 26 | json.dump(test_data, open('test_lavis.json', 'w')) -------------------------------------------------------------------------------- /lavis/datasets/download_scripts/download_violin.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2023, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import json 9 | import os 10 | 11 | json_path = './violin_annotation.json' 12 | 13 | ## convert annotations 14 | all_json = json.load(open(json_path)) 15 | train_data = [v for v in all_json.values() if 'split' in v and v['split'] == 'train'] 16 | test_data = [v for v in all_json.values() if 'split' in v and v['split'] == 'test'] 17 | 18 | json.dump(train_data, open('train.json', 'w')) 19 | json.dump(test_data, open('test.json', 'w')) -------------------------------------------------------------------------------- /lavis/models/beats/LICENSE_BEATs.txt: -------------------------------------------------------------------------------- 1 | The MIT License (MIT) 2 | 3 | Copyright (c) Microsoft Corporation 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
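Editor's note: the Charades and VIOLIN scripts above are annotation converters rather than downloaders: they reshape existing JSON/JSONL records into the layout LAVIS expects. A self-contained illustration of the per-record transformation the Charades script applies; only video_id, start and end are used by the script, the extra query field is hypothetical.

import json

# One input row in the shape the Charades converter above expects.
row = {"video_id": "ABC12", "start": "3.5", "end": "11.0", "query": "person opens a door"}

# The same transformation applied to every train/test record:
row["video_path"] = row["video_id"] + ".mp4"
row["ts"] = [float(row["start"]), float(row["end"])]

print(json.dumps(row, indent=2))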
-------------------------------------------------------------------------------- /lavis/models/blip2_models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/lavis/models/blip2_models/__init__.py -------------------------------------------------------------------------------- /lavis/models/blip_diffusion_models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/lavis/models/blip_diffusion_models/__init__.py -------------------------------------------------------------------------------- /lavis/models/clip_models/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | 7 | Based on https://github.com/mlfoundations/open_clip 8 | """ 9 | 10 | """ OpenAI pretrained model functions 11 | Adapted from https://github.com/mlfoundations/open_clip and https://github.com/openai/CLIP. 12 | 13 | Originally MIT License, Copyright (c) 2021 OpenAI. 14 | """ 15 | -------------------------------------------------------------------------------- /lavis/models/clip_models/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/lavis/models/clip_models/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /lavis/models/clip_models/pics/CLIP.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/lavis/models/clip_models/pics/CLIP.png -------------------------------------------------------------------------------- /lavis/models/img2prompt_models/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import torch 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /lavis/models/timesformer/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | 7 | Based on https://github.com/facebookresearch/TimeSformer 8 | """ 9 | -------------------------------------------------------------------------------- /lavis/models/timesformer/linear.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | """ Linear layer (alternate definition) 9 | """ 10 | import torch 11 | import torch.nn.functional as F 12 | from torch import nn as nn 13 | 14 | 15 | class Linear(nn.Linear): 16 | def forward(self, input: torch.Tensor) -> torch.Tensor: 17 | if torch.jit.is_scripting(): 18 | bias = self.bias.to(dtype=input.dtype) if self.bias is not None else None 19 | return F.linear(input, self.weight.to(dtype=input.dtype), bias=bias) 20 | else: 21 | return F.linear(input, self.weight, self.bias) 22 | -------------------------------------------------------------------------------- /lavis/models/ulip_models/pointbert/PointTransformer_8192point.yaml: -------------------------------------------------------------------------------- 1 | optimizer : { 2 | type: AdamW, 3 | kwargs: { 4 | lr : 0.0005, 5 | weight_decay : 0.05 6 | }} 7 | 8 | scheduler: { 9 | type: CosLR, 10 | kwargs: { 11 | epochs: 300, 12 | initial_epochs : 10 13 | }} 14 | 15 | model : { 16 | NAME: PointTransformer, 17 | trans_dim: 384, 18 | depth: 12, 19 | drop_path_rate: 0.1, 20 | cls_dim: 40, 21 | num_heads: 6, 22 | group_size: 32, 23 | num_group: 512, 24 | encoder_dims: 256, 25 | } 26 | npoints: 8192 27 | total_bs : 32 28 | step_per_update : 1 29 | max_epoch : 300 30 | grad_norm_clip : 10 31 | 32 | consider_metric: CDL1 -------------------------------------------------------------------------------- /lavis/models/ulip_models/ulip_scaled_up_config.yaml: -------------------------------------------------------------------------------- 1 | optimizer : { 2 | type: AdamW, 3 | kwargs: { 4 | lr : 0.0005, 5 | weight_decay : 0.05 6 | }} 7 | 8 | scheduler: { 9 | type: CosLR, 10 | kwargs: { 11 | epochs: 300, 12 | initial_epochs : 10 13 | }} 14 | 15 | model : { 16 | NAME: PointTransformer, 17 | trans_dim: 384, 18 | depth: 18, 19 | drop_path_rate: 0.1, 20 | cls_dim: 40, 21 | num_heads: 6, 22 | group_size: 32, 23 | num_group: 512, 24 | encoder_dims: 256, 25 | } 26 | npoints: 8192 27 | total_bs : 32 28 | step_per_update : 1 29 | max_epoch : 300 30 | grad_norm_clip : 10 31 | 32 | consider_metric: CDL1 -------------------------------------------------------------------------------- /lavis/models/ulip_models/utils/__init__.py: -------------------------------------------------------------------------------- 1 | ''' 2 | * Copyright (c) 2023, salesforce.com, inc. 3 | * All rights reserved. 4 | * SPDX-License-Identifier: BSD-3-Clause 5 | * For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | * By Le Xue 7 | ''' 8 | -------------------------------------------------------------------------------- /lavis/models/ulip_models/utils/build.py: -------------------------------------------------------------------------------- 1 | from utils import registry 2 | 3 | 4 | DATASETS = registry.Registry('dataset') 5 | 6 | 7 | def build_dataset_from_cfg(cfg, default_args = None): 8 | """ 9 | Build a dataset, defined by `dataset_name`. 10 | Args: 11 | cfg (eDICT): 12 | Returns: 13 | Dataset: a constructed dataset specified by dataset_name. 
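Editor's note: the Linear class above is a drop-in replacement for torch.nn.Linear that casts its weight and bias to the input dtype when running under TorchScript; outside scripting it behaves exactly like the standard layer. A tiny usage check:

import torch
from lavis.models.timesformer.linear import Linear

layer = Linear(16, 8)          # drop-in for nn.Linear
x = torch.randn(4, 16)
print(layer(x).shape)          # torch.Size([4, 8])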
14 | """ 15 | return DATASETS.build(cfg, default_args = default_args) 16 | 17 | 18 | -------------------------------------------------------------------------------- /lavis/processors/base_processor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from omegaconf import OmegaConf 9 | 10 | 11 | class BaseProcessor: 12 | def __init__(self): 13 | self.transform = lambda x: x 14 | return 15 | 16 | def __call__(self, item): 17 | return self.transform(item) 18 | 19 | @classmethod 20 | def from_config(cls, cfg=None): 21 | return cls() 22 | 23 | def build(self, **kwargs): 24 | cfg = OmegaConf.create(kwargs) 25 | 26 | return self.from_config(cfg) 27 | -------------------------------------------------------------------------------- /lavis/projects/albef/eval/nlvr_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_nlvr 8 | model_type: nlvr 9 | 10 | datasets: 11 | nlvr: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | image_size: 384 16 | text_processor: 17 | eval: 18 | name: "blip_caption" 19 | 20 | run: 21 | task: multimodal_classification 22 | 23 | batch_size_train: 16 24 | batch_size_eval: 64 25 | num_workers: 4 26 | 27 | seed: 42 28 | output_dir: "output/ALBEF/NLVR" 29 | 30 | evaluate: True 31 | test_splits: ["val", "test"] 32 | 33 | # distribution-specific 34 | device: "cuda" 35 | world_size: 1 36 | dist_url: "env://" 37 | distributed: True 38 | -------------------------------------------------------------------------------- /lavis/projects/albef/eval/ret_coco_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_retrieval 8 | model_type: coco 9 | 10 | datasets: 11 | coco_retrieval: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | image_size: 384 16 | text_processor: 17 | eval: 18 | name: "blip_caption" 19 | 20 | run: 21 | task: retrieval 22 | 23 | # dataloading 24 | num_workers: 4 25 | batch_size_train: 32 26 | batch_size_eval: 64 27 | 28 | test_splits: ["test"] 29 | 30 | # distribution 31 | device: "cuda" 32 | world_size: 1 33 | dist_url: "env://" 34 | distributed: True 35 | use_dist_eval_sampler: False 36 | 37 | # model specific 38 | k_test: 128 39 | 40 | # misc 41 | seed: 42 42 | output_dir: "output/ALBEF/Retrieval_COCO" 43 | 44 | evaluate: True 45 | -------------------------------------------------------------------------------- /lavis/projects/albef/eval/ret_flickr30k_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
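Editor's note: BaseProcessor above is the identity fallback behind the processor names referenced in the preprocess / vis_processor / text_processor sections of the YAMLs; concrete processors override from_config and __call__. A small example using only the class shown:

from lavis.processors.base_processor import BaseProcessor

proc = BaseProcessor()                  # identity transform by default
print(proc("an unchanged caption"))     # -> "an unchanged caption"

# build() round-trips keyword arguments through OmegaConf and hands them to
# from_config(); for the base class this still yields an identity processor.
proc2 = BaseProcessor().build()
print(proc2("still unchanged"))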
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_retrieval 8 | model_type: flickr 9 | 10 | datasets: 11 | flickr30k: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | image_size: 384 16 | text_processor: 17 | eval: 18 | name: "blip_caption" 19 | 20 | run: 21 | task: retrieval 22 | 23 | # dataloading 24 | num_workers: 4 25 | batch_size_train: 32 26 | batch_size_eval: 64 27 | 28 | test_splits: ["test"] 29 | 30 | # distribution 31 | device: "cuda" 32 | world_size: 1 33 | dist_url: "env://" 34 | distributed: True 35 | use_dist_eval_sampler: False 36 | 37 | # model specific 38 | k_test: 128 39 | 40 | # misc 41 | seed: 42 42 | output_dir: "output/ALBEF/Retrieval_Flickr30k" 43 | 44 | evaluate: True 45 | -------------------------------------------------------------------------------- /lavis/projects/albef/eval/snli_ve_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_classification 8 | model_type: ve 9 | 10 | datasets: 11 | snli_ve: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | text_processor: 16 | eval: 17 | name: "blip_caption" 18 | 19 | run: 20 | task: multimodal_classification 21 | # optimization-specific 22 | batch_size_train: 32 23 | batch_size_eval: 64 24 | num_workers: 4 25 | 26 | seed: 42 27 | output_dir: "output/ALBEF/SNLI_VE" 28 | 29 | evaluate: True 30 | test_splits: ["val", "test"] 31 | 32 | # distribution-specific 33 | device: "cuda" 34 | world_size: 1 35 | dist_url: "env://" 36 | distributed: True 37 | -------------------------------------------------------------------------------- /lavis/projects/albef/eval/vqa_test.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_vqa 8 | model_type: vqav2 9 | 10 | image_size: 384 11 | 12 | 13 | datasets: 14 | coco_vqa: # name of the dataset builder 15 | vis_processor: 16 | eval: 17 | name: "blip_image_eval" 18 | image_size: 384 19 | text_processor: 20 | eval: 21 | name: "blip_question" 22 | 23 | run: 24 | task: vqa 25 | 26 | # optimization-specific 27 | batch_size_train: 16 28 | batch_size_eval: 64 29 | num_workers: 4 30 | 31 | # inference-specific 32 | max_len: 10 33 | min_len: 1 34 | num_beams: 3 35 | num_ans_candidates: 128 36 | inference_method: "rank" 37 | 38 | seed: 42 39 | output_dir: "output/ALBEF/VQA" 40 | 41 | evaluate: True 42 | train_splits: ["train"] 43 | test_splits: ["test"] 44 | 45 | # distribution-specific 46 | device: "cuda" 47 | world_size: 1 48 | dist_url: "env://" 49 | distributed: True 50 | -------------------------------------------------------------------------------- /lavis/projects/albef/eval/vqa_val.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_vqa 8 | model_type: vqav2 9 | 10 | image_size: 384 11 | 12 | datasets: 13 | coco_vqa: # name of the dataset builder 14 | type: eval 15 | vis_processor: 16 | eval: 17 | name: "blip_image_eval" 18 | image_size: 384 19 | text_processor: 20 | eval: 21 | name: "blip_question" 22 | 23 | run: 24 | task: vqa 25 | 26 | # optimization-specific 27 | batch_size_train: 16 28 | batch_size_eval: 64 29 | num_workers: 4 30 | 31 | # inference-specific 32 | max_len: 10 33 | min_len: 1 34 | num_beams: 3 35 | num_ans_candidates: 128 36 | inference_method: "rank" 37 | 38 | seed: 42 39 | output_dir: "output/ALBEF/VQA" 40 | 41 | evaluate: True 42 | test_splits: ["val"] 43 | 44 | # distribution-specific 45 | device: "cuda" 46 | world_size: 1 47 | dist_url: "env://" 48 | distributed: True 49 | -------------------------------------------------------------------------------- /lavis/projects/alpro/eval/msrvtt_qa_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_qa 8 | model_type: msrvtt 9 | 10 | datasets: 11 | msrvtt_qa: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "alpro_video_eval" 15 | n_frms: 16 16 | image_size: 224 17 | text_processor: 18 | eval: 19 | name: "blip_caption" 20 | 21 | run: 22 | task: multimodal_classification 23 | # optimization-specific 24 | batch_size_train: 32 25 | batch_size_eval: 64 26 | num_workers: 4 27 | 28 | seed: 42 29 | output_dir: "output/ALPRO/msrvtt_qa" 30 | 31 | evaluate: True 32 | valid_splits: ["val"] 33 | test_splits: ["test"] 34 | 35 | # distribution-specific 36 | device: "cuda" 37 | world_size: 1 38 | dist_url: "env://" 39 | distributed: True 40 | -------------------------------------------------------------------------------- /lavis/projects/alpro/eval/msrvtt_ret_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_retrieval 8 | model_type: msrvtt 9 | 10 | datasets: 11 | msrvtt_retrieval: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "alpro_video_eval" 15 | n_frms: 8 16 | image_size: 224 17 | text_processor: 18 | eval: 19 | name: "blip_caption" 20 | 21 | run: 22 | task: retrieval 23 | # optimization-specific 24 | batch_size_train: 24 25 | batch_size_eval: 64 26 | num_workers: 4 27 | 28 | # k_test: 256 29 | k_test: 1000 30 | 31 | seed: 42 32 | output_dir: "output/ALPRO/msrvtt_retrieval" 33 | 34 | evaluate: True 35 | test_splits: ["test"] 36 | 37 | # distribution-specific 38 | device: "cuda" 39 | world_size: 1 40 | dist_url: "env://" 41 | distributed: True 42 | use_dist_eval_sampler: False 43 | -------------------------------------------------------------------------------- /lavis/projects/alpro/eval/msvd_qa_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 
2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_qa 8 | model_type: msvd 9 | 10 | datasets: 11 | msvd_qa: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "alpro_video_eval" 15 | n_frms: 16 16 | image_size: 224 17 | text_processor: 18 | train: 19 | name: "blip_caption" 20 | eval: 21 | name: "blip_caption" 22 | 23 | run: 24 | task: multimodal_classification 25 | # optimization-specific 26 | batch_size_train: 24 27 | batch_size_eval: 64 28 | num_workers: 4 29 | 30 | seed: 42 31 | output_dir: "output/ALPRO/msvd_qa" 32 | 33 | evaluate: True 34 | test_splits: ["test"] 35 | 36 | # distribution-specific 37 | device: "cuda" 38 | world_size: 1 39 | dist_url: "env://" 40 | distributed: True 41 | -------------------------------------------------------------------------------- /lavis/projects/blip/eval/aokvqa_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | model_type: aokvqa 9 | image_size: 480 10 | 11 | datasets: 12 | aok_vqa: # name of the dataset builder 13 | vis_processor: 14 | eval: 15 | name: "blip_image_eval" 16 | image_size: 480 17 | text_processor: 18 | eval: 19 | name: "blip_question" 20 | 21 | run: 22 | task: aok_vqa 23 | # optimization-specific 24 | batch_size_train: 64 25 | batch_size_eval: 64 26 | num_workers: 4 27 | 28 | # inference-specific 29 | max_len: 10 30 | min_len: 1 31 | num_beams: 3 32 | num_ans_candidates: 128 33 | inference_method: "rank" 34 | 35 | seed: 42 36 | output_dir: "output/BLIP/AOKVQA" 37 | 38 | evaluate: True 39 | test_splits: ["val", "test"] 40 | 41 | # distribution-specific 42 | device: "cuda" 43 | world_size: 1 44 | dist_url: "env://" 45 | distributed: True 46 | -------------------------------------------------------------------------------- /lavis/projects/blip/eval/caption_coco_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | model_type: base_coco 9 | 10 | datasets: 11 | coco_caption: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | text_processor: 16 | eval: 17 | name: "blip_caption" 18 | 19 | run: 20 | # task: retrieval 21 | task: captioning 22 | # optimizer 23 | batch_size_train: 32 24 | batch_size_eval: 64 25 | num_workers: 4 26 | 27 | max_len: 20 28 | min_len: 5 29 | num_beams: 3 30 | 31 | seed: 42 32 | output_dir: "output/BLIP/Caption_coco" 33 | 34 | evaluate: True 35 | test_splits: ["test"] 36 | 37 | device: "cuda" 38 | world_size: 1 39 | dist_url: "env://" 40 | distributed: True 41 | -------------------------------------------------------------------------------- /lavis/projects/blip/eval/caption_coco_eval_large.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
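Editor's note: the project YAMLs above are ordinary OmegaConf files grouping a model, datasets and run section; the repository's evaluate.py reads one of them to drive an evaluation run. A hedged sketch inspecting the captioning config shown above; the --cfg-path flag named in the comment is an assumption about the upstream script.

from omegaconf import OmegaConf

# The usual entry point is: python evaluate.py --cfg-path <one of these YAMLs>
cfg = OmegaConf.load("lavis/projects/blip/eval/caption_coco_eval.yaml")

print(cfg.model.arch)            # "blip_caption"
print(cfg.run.task)              # "captioning"
print(cfg.run.batch_size_eval)   # 64
print(cfg.run.test_splits)       # ["test"]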
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | model_type: large_coco 9 | 10 | datasets: 11 | coco_caption: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | text_processor: 16 | eval: 17 | name: "blip_caption" 18 | 19 | run: 20 | # task: retrieval 21 | task: captioning 22 | # optimizer 23 | batch_size_train: 32 24 | batch_size_eval: 64 25 | num_workers: 4 26 | 27 | max_len: 20 28 | min_len: 5 29 | num_beams: 3 30 | 31 | seed: 42 32 | output_dir: "output/BLIP/Caption_coco" 33 | 34 | evaluate: True 35 | test_splits: ["test"] 36 | 37 | device: "cuda" 38 | world_size: 1 39 | dist_url: "env://" 40 | distributed: True 41 | -------------------------------------------------------------------------------- /lavis/projects/blip/eval/nlvr_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_nlvr 8 | model_type: nlvr 9 | 10 | datasets: 11 | nlvr: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | image_size: 384 16 | text_processor: 17 | eval: 18 | name: "blip_caption" 19 | 20 | run: 21 | task: multimodal_classification 22 | 23 | batch_size_train: 16 24 | batch_size_eval: 64 25 | num_workers: 4 26 | 27 | seed: 42 28 | output_dir: "output/BLIP/NLVR" 29 | 30 | evaluate: True 31 | test_splits: ["val", "test"] 32 | 33 | # distribution-specific 34 | device: "cuda" 35 | world_size: 1 36 | dist_url: "env://" 37 | distributed: True 38 | -------------------------------------------------------------------------------- /lavis/projects/blip/eval/okvqa_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | model_type: okvqa 9 | image_size: 480 10 | 11 | datasets: 12 | ok_vqa: # name of the dataset builder 13 | vis_processor: 14 | eval: 15 | name: "blip_image_eval" 16 | image_size: 480 17 | text_processor: 18 | eval: 19 | name: "blip_question" 20 | 21 | run: 22 | task: vqa 23 | # optimization-specific 24 | batch_size_train: 16 25 | batch_size_eval: 16 26 | num_workers: 4 27 | 28 | # inference-specific 29 | max_len: 10 30 | min_len: 1 31 | num_beams: 3 32 | num_ans_candidates: 128 33 | inference_method: "rank" 34 | 35 | seed: 42 36 | output_dir: "output/BLIP/OKVQA" 37 | 38 | evaluate: True 39 | test_splits: ["test"] 40 | 41 | # distribution-specific 42 | device: "cuda" 43 | world_size: 1 44 | dist_url: "env://" 45 | distributed: True 46 | -------------------------------------------------------------------------------- /lavis/projects/blip/eval/ret_flickr_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_retrieval 8 | model_type: flickr 9 | 10 | datasets: 11 | flickr30k: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | image_size: 384 16 | text_processor: 17 | eval: 18 | name: "blip_caption" 19 | 20 | run: 21 | task: retrieval 22 | 23 | # dataloading 24 | num_workers: 4 25 | batch_size_train: 32 26 | batch_size_eval: 64 27 | 28 | test_splits: ["test"] 29 | 30 | # distribution 31 | device: "cuda" 32 | world_size: 1 33 | dist_url: "env://" 34 | distributed: True 35 | use_dist_eval_sampler: False 36 | 37 | # model specific 38 | k_test: 128 39 | 40 | # misc 41 | seed: 42 42 | output_dir: "output/Retrieval_Flickr30k" 43 | 44 | evaluate: True 45 | -------------------------------------------------------------------------------- /lavis/projects/blip/eval/vqav2_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | model_type: vqav2 9 | image_size: 480 10 | 11 | datasets: 12 | coco_vqa: # name of the dataset builder 13 | type: eval 14 | vis_processor: 15 | eval: 16 | name: "blip_image_eval" 17 | image_size: 480 18 | text_processor: 19 | eval: 20 | name: "blip_question" 21 | 22 | run: 23 | task: vqa 24 | # optimization-specific 25 | batch_size_train: 16 26 | batch_size_eval: 64 27 | num_workers: 4 28 | 29 | # inference-specific 30 | max_len: 10 31 | min_len: 1 32 | num_beams: 3 33 | num_ans_candidates: 128 34 | inference_method: "rank" 35 | 36 | seed: 42 37 | output_dir: "output/BLIP/VQA" 38 | 39 | evaluate: True 40 | test_splits: ["val"] 41 | 42 | # distribution-specific 43 | device: "cuda" 44 | world_size: 1 45 | dist_url: "env://" 46 | distributed: True 47 | -------------------------------------------------------------------------------- /lavis/projects/blip2/eval/caption_coco_flant5xl_eval.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/lavis/projects/blip2/eval/caption_coco_flant5xl_eval.yaml -------------------------------------------------------------------------------- /lavis/projects/blip2/eval/ret_flickr_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip2 8 | model_type: coco 9 | use_grad_checkpoint: False 10 | 11 | datasets: 12 | flickr30k: # name of the dataset builder 13 | vis_processor: 14 | eval: 15 | name: "blip_image_eval" 16 | image_size: 364 17 | text_processor: 18 | eval: 19 | name: "blip_caption" 20 | 21 | run: 22 | task: retrieval 23 | 24 | # dataloading 25 | num_workers: 4 26 | batch_size_train: 16 27 | batch_size_eval: 32 28 | 29 | test_splits: ["test"] 30 | 31 | # distribution 32 | device: "cuda" 33 | world_size: 1 34 | dist_url: "env://" 35 | distributed: True 36 | use_dist_eval_sampler: False 37 | 38 | # model specific 39 | k_test: 128 40 | 41 | # misc 42 | seed: 42 43 | output_dir: "output/BLIP2/Retrieval_Flickr30k" 44 | 45 | evaluate: True -------------------------------------------------------------------------------- /lavis/projects/clip/exp_coco_ret_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-L-14-336 10 | 11 | datasets: 12 | coco_retrieval: # name of the dataset builder 13 | vis_processor: 14 | train: 15 | name: "clip_image_train" 16 | image_size: 336 17 | eval: 18 | name: "clip_image_eval" 19 | image_size: 336 20 | text_processor: 21 | train: 22 | name: "blip_caption" 23 | eval: 24 | name: "blip_caption" 25 | 26 | run: 27 | task: retrieval 28 | 29 | # dataloading 30 | num_workers: 4 31 | batch_size_train: 32 32 | batch_size_eval: 128 33 | 34 | test_splits: ["test"] 35 | 36 | # distribution 37 | device: "cuda" 38 | world_size: 1 39 | dist_url: "env://" 40 | distributed: True 41 | use_dist_eval_sampler: True 42 | 43 | # misc 44 | seed: 42 45 | output_dir: "output/clip/Retrieval_COCO" 46 | 47 | evaluate: True 48 | -------------------------------------------------------------------------------- /lavis/projects/clip/exp_flickr_ret_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-L-14-336 10 | 11 | datasets: 12 | flickr30k: # name of the dataset builder 13 | vis_processor: 14 | train: 15 | name: "clip_image_train" 16 | image_size: 336 17 | eval: 18 | name: "clip_image_eval" 19 | image_size: 336 20 | text_processor: 21 | train: 22 | name: "blip_caption" 23 | eval: 24 | name: "blip_caption" 25 | 26 | run: 27 | task: retrieval 28 | 29 | # dataloading 30 | num_workers: 4 31 | batch_size_train: 32 32 | batch_size_eval: 128 33 | 34 | test_splits: ["test"] 35 | 36 | # distribution 37 | device: "cuda" 38 | world_size: 1 39 | dist_url: "env://" 40 | distributed: True 41 | use_dist_eval_sampler: True 42 | 43 | # misc 44 | seed: 42 45 | output_dir: "output/clip/Retrieval_Flickr" 46 | 47 | evaluate: True 48 | -------------------------------------------------------------------------------- /lavis/projects/clip/exp_imnet_zs_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-L-14-336 10 | 11 | datasets: 12 | imagenet: # name of the dataset builder 13 | vis_processor: 14 | eval: 15 | name: "clip_image_eval" 16 | # image_size: 224 17 | image_size: 336 18 | 19 | run: 20 | task: multimodal_classification 21 | 22 | # dataloading 23 | num_workers: 4 24 | batch_size_train: 32 25 | batch_size_eval: 128 26 | 27 | test_splits: ["val"] 28 | 29 | # distribution 30 | device: "cuda" 31 | world_size: 1 32 | dist_url: "env://" 33 | distributed: True 34 | 35 | # misc 36 | seed: 42 37 | output_dir: "output/clip/zs_imnet" 38 | 39 | evaluate: True 40 | -------------------------------------------------------------------------------- /lavis/projects/gpt/eval/dialogue_avsd_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: gpt_dialogue 8 | model_type: base 9 | 10 | datasets: 11 | avsd_dialogue: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "gpt_video_ft" 15 | visual_ft: ["i3d_flow", "i3d_rgb"] 16 | audio_ft: ["vggish"] 17 | text_processor: 18 | eval: 19 | name: "gpt_dialogue" 20 | max_turns: 3 21 | use_caption: True 22 | 23 | run: 24 | task: dialogue 25 | # optimizer 26 | batch_size_train: 16 27 | batch_size_eval: 16 28 | num_workers: 0 29 | 30 | max_len: 20 31 | min_len: 5 32 | num_beams: 5 33 | 34 | seed: 42 35 | output_dir: "output/gpt2/dialogue_avsd" 36 | 37 | evaluate: True 38 | valid_splits: ["test"] 39 | 40 | device: "cuda" 41 | world_size: 1 42 | dist_url: "env://" 43 | distributed: True 44 | -------------------------------------------------------------------------------- /lavis/runners/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.runners.runner_base import RunnerBase 9 | from lavis.runners.runner_iter import RunnerIter 10 | 11 | __all__ = ["RunnerBase", "RunnerIter"] 12 | -------------------------------------------------------------------------------- /lavis/tasks/image_text_pretrain.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.common.registry import registry 9 | from lavis.tasks.base_task import BaseTask 10 | 11 | 12 | @registry.register_task("image_text_pretrain") 13 | class ImageTextPretrainTask(BaseTask): 14 | def __init__(self): 15 | super().__init__() 16 | 17 | def evaluation(self, model, data_loader, cuda_enabled=True): 18 | pass 19 | -------------------------------------------------------------------------------- /lavis/tasks/text_to_image_generation.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.tasks import BaseTask 9 | from lavis.common.registry import registry 10 | 11 | 12 | @registry.register_task("text-to-image-generation") 13 | class TextToImageGenerationTask(BaseTask): 14 | def __init__(self, cfg): 15 | super().__init__() 16 | 17 | self.cfg = cfg 18 | 19 | @classmethod 20 | def setup_task(cls, cfg): 21 | run_cfg = cfg.run_cfg 22 | 23 | return cls(cfg=run_cfg) 24 | -------------------------------------------------------------------------------- /projects/blip-diffusion/images/black-cat.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/blip-diffusion/images/black-cat.png -------------------------------------------------------------------------------- /projects/blip-diffusion/images/cat-sofa.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/blip-diffusion/images/cat-sofa.png -------------------------------------------------------------------------------- /projects/blip-diffusion/images/dog.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/blip-diffusion/images/dog.png -------------------------------------------------------------------------------- /projects/blip-diffusion/images/dog2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/blip-diffusion/images/dog2.png -------------------------------------------------------------------------------- /projects/blip-diffusion/images/dreambooth/dog/00.jpg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/blip-diffusion/images/dreambooth/dog/00.jpg -------------------------------------------------------------------------------- /projects/blip-diffusion/images/dreambooth/dog/01.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/blip-diffusion/images/dreambooth/dog/01.jpg -------------------------------------------------------------------------------- /projects/blip-diffusion/images/dreambooth/dog/02.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/blip-diffusion/images/dreambooth/dog/02.jpg -------------------------------------------------------------------------------- /projects/blip-diffusion/images/dreambooth/dog/03.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/blip-diffusion/images/dreambooth/dog/03.jpg -------------------------------------------------------------------------------- /projects/blip-diffusion/images/dreambooth/dog/04.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/blip-diffusion/images/dreambooth/dog/04.jpg -------------------------------------------------------------------------------- /projects/blip-diffusion/images/dreambooth/dog8/00.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/blip-diffusion/images/dreambooth/dog8/00.jpg -------------------------------------------------------------------------------- /projects/blip-diffusion/images/dreambooth/dog8/01.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/blip-diffusion/images/dreambooth/dog8/01.jpg -------------------------------------------------------------------------------- /projects/blip-diffusion/images/dreambooth/dog8/02.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/blip-diffusion/images/dreambooth/dog8/02.jpg -------------------------------------------------------------------------------- /projects/blip-diffusion/images/dreambooth/dog8/03.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/blip-diffusion/images/dreambooth/dog8/03.jpg -------------------------------------------------------------------------------- /projects/blip-diffusion/images/dreambooth/dog8/04.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/blip-diffusion/images/dreambooth/dog8/04.jpg -------------------------------------------------------------------------------- /projects/blip-diffusion/images/dress-model.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/blip-diffusion/images/dress-model.png -------------------------------------------------------------------------------- /projects/blip-diffusion/images/flower.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/blip-diffusion/images/flower.jpg -------------------------------------------------------------------------------- /projects/blip-diffusion/images/green-skirt.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/blip-diffusion/images/green-skirt.png -------------------------------------------------------------------------------- /projects/blip-diffusion/images/jacket-letter-s/jacket-letter-s.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/blip-diffusion/images/jacket-letter-s/jacket-letter-s.png -------------------------------------------------------------------------------- /projects/blip-diffusion/images/kettle.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/blip-diffusion/images/kettle.jpg -------------------------------------------------------------------------------- /projects/blip-diffusion/images/pink-dress.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/blip-diffusion/images/pink-dress.png -------------------------------------------------------------------------------- /projects/blip-diffusion/images/pink-dress/pink-dress.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/blip-diffusion/images/pink-dress/pink-dress.png -------------------------------------------------------------------------------- /projects/blip-diffusion/images/shein-jacket/shein-jacket.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/blip-diffusion/images/shein-jacket/shein-jacket.jpg -------------------------------------------------------------------------------- /projects/blip-diffusion/teaser-website.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/blip-diffusion/teaser-website.png -------------------------------------------------------------------------------- /projects/blip2/blip2_illustration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/blip2/blip2_illustration.png -------------------------------------------------------------------------------- /projects/img2llm-vqa/Caption.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/img2llm-vqa/Caption.png -------------------------------------------------------------------------------- /projects/img2llm-vqa/Illustration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/img2llm-vqa/Illustration.png -------------------------------------------------------------------------------- /projects/img2llm-vqa/QuestionGeneration.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/img2llm-vqa/QuestionGeneration.png -------------------------------------------------------------------------------- /projects/img2llm-vqa/demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/img2llm-vqa/demo.png -------------------------------------------------------------------------------- /projects/img2prompt-vqa/README.md: -------------------------------------------------------------------------------- 1 | ## From Images to Textual Prompts: Zero-shot VQA with Frozen Large Language Models 2 | 3 | This is the official code for the Img2LLM-VQA paper. 4 | 5 | We have renamed **Img2Prompt-VQA** to **Img2LLM-VQA**. See the [new project page](https://github.com/salesforce/LAVIS/tree/main/projects/img2llm-vqa) for details. 6 | 7 | ### Citation 8 | If you find this code useful for your research, please consider citing:
9 | ```bibtex 10 | @misc{guo2023from, 11 | title={From Images to Textual Prompts: Zero-shot {VQA} with Frozen Large Language Models}, 12 | author={Jiaxian Guo and Junnan Li and Dongxu Li and Anthony Tiong and Boyang Li and Dacheng Tao and Steven HOI}, 13 | year={2023}, 14 | url={https://openreview.net/forum?id=Ck1UtnVukP8} 15 | } 16 | ``` 17 | -------------------------------------------------------------------------------- /projects/instructblip/comparison.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/instructblip/comparison.png -------------------------------------------------------------------------------- /projects/instructblip/showcase.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/instructblip/showcase.png -------------------------------------------------------------------------------- /projects/pnp-vqa/pnp_vqa.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/pnp-vqa/pnp_vqa.png -------------------------------------------------------------------------------- /projects/xinstructblip/assets/architecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/xinstructblip/assets/architecture.png -------------------------------------------------------------------------------- /projects/xinstructblip/assets/data.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/xinstructblip/assets/data.png -------------------------------------------------------------------------------- /projects/xinstructblip/demo/examples/audio/110714_wren.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/xinstructblip/demo/examples/audio/110714_wren.wav -------------------------------------------------------------------------------- /projects/xinstructblip/demo/examples/audio/Group_of_Dogs_Barking.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/xinstructblip/demo/examples/audio/Group_of_Dogs_Barking.wav -------------------------------------------------------------------------------- /projects/xinstructblip/demo/examples/point_cloud/banana.glb: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/salesforce/LAVIS/506965b9c4a18c1e565bd32acaccabe0198433f7/projects/xinstructblip/demo/examples/point_cloud/banana.glb -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0.0", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | -------------------------------------------------------------------------------- /requirements.txt: 
-------------------------------------------------------------------------------- 1 | contexttimer 2 | decord 3 | diffusers<=0.16.0 4 | einops>=0.4.1 5 | fairscale==0.4.4 6 | ftfy 7 | iopath 8 | ipython 9 | omegaconf 10 | opencv-python-headless==4.5.5.64 11 | opendatasets 12 | packaging 13 | pandas 14 | plotly 15 | pre-commit 16 | pycocoevalcap 17 | pycocotools 18 | python-magic 19 | scikit-image 20 | sentencepiece 21 | spacy 22 | streamlit 23 | timm==0.4.12 24 | torch>=1.10.0 25 | torchvision 26 | tqdm 27 | transformers==4.33.2 28 | webdataset 29 | wheel 30 | torchaudio 31 | soundfile 32 | moviepy 33 | nltk 34 | peft 35 | 36 | easydict==1.9 37 | pyyaml_env_tag==0.1 38 | open3d==0.13.0 39 | h5py 40 | -------------------------------------------------------------------------------- /run_scripts/albef/eval/eval_albef_nlvr.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/albef/eval/nlvr_eval.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/albef/eval/eval_albef_ve.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=16 evaluate.py --cfg-path lavis/projects/albef/eval/snli_ve_eval.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/albef/eval/eval_coco_retrieval.sh: -------------------------------------------------------------------------------- 1 | # python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/albef/eval/coco_retrieval_eval.yaml 2 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/albef/eval/ret_coco_eval.yaml 3 | -------------------------------------------------------------------------------- /run_scripts/albef/eval/eval_flickr30k_retrieval.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/albef/eval/ret_flickr30k_eval.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/albef/eval/test_albef_vqa.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/albef/eval/vqa_test.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/albef/eval/val_albef_vqa.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 --master_port 2345 evaluate.py --cfg-path lavis/projects/albef/eval/vqa_val.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/albef/train/pretrain.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=16 train.py --cfg-path lavis/projects/albef/train/pretrain.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/albef/train/train_aokvqa_albef.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 train.py --cfg-path lavis/projects/albef/train/aokvqa_ft.yaml 2 | 
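The ALBEF scripts above (and the BLIP, BLIP-2, and CLIP scripts that follow) all wrap the same launcher pattern: `python -m torch.distributed.run --nproc_per_node=N {train,evaluate}.py --cfg-path <project yaml>`. Below is a minimal single-GPU sketch, not a script from this listing: it reuses a config path that already appears above and assumes `evaluate.py` accepts the same `--options key=value` overrides that `train.py` takes in `run_scripts/blip-diffusion/train_db.sh` further down, with keys taken from the retrieval configs shown earlier.

```bash
# Hypothetical single-GPU run (not part of run_scripts/): ALBEF Flickr30k
# retrieval evaluation with a smaller eval batch and no dataloader workers.
python -m torch.distributed.run --nproc_per_node=1 evaluate.py \
    --cfg-path lavis/projects/albef/eval/ret_flickr30k_eval.yaml \
    --options run.batch_size_eval=16 run.num_workers=0
```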
-------------------------------------------------------------------------------- /run_scripts/albef/train/train_coco_retrieval_albef.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 train.py --cfg-path lavis/projects/albef/train/ret_coco_ft.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/albef/train/train_flickr30k_retrieval_albef.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 train.py --cfg-path lavis/projects/albef/train/ret_flickr30k_ft.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/albef/train/train_nlvr_albef.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 train.py --cfg-path lavis/projects/albef/train/nlvr_ft.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/albef/train/train_okvqa_albef.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 train.py --cfg-path lavis/projects/albef/train/okvqa_ft.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/albef/train/train_ve_albef.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 train.py --cfg-path lavis/projects/albef/train/snli_ve_ft.yaml 2 | # CUDA_VISIBLE_DEVICES=8,9,10,11,12,13,14,15 python -m torch.distributed.run --nproc_per_node=8 --master_port 47770 train.py --cfg-path lavis/projects/albef/train/snli_ve_ft.yaml 3 | -------------------------------------------------------------------------------- /run_scripts/albef/train/train_vqa_albef.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 train.py --cfg-path lavis/projects/albef/train/vqa_ft.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/alpro/eval/eval_didemo_ret.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/alpro/eval/didemo_ret_eval.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/alpro/eval/eval_msrvtt_qa.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/alpro/eval/msrvtt_qa_eval.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/alpro/eval/eval_msrvtt_ret.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/alpro/eval/msrvtt_ret_eval.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/alpro/eval/eval_msvd_qa.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=16 evaluate.py --cfg-path lavis/projects/alpro/eval/msvd_qa_eval.yaml 2 | 
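The commented-out line in `train_ve_albef.sh` above shows the usual pattern for sharing one node between launchers: restrict each job to its own GPUs with `CUDA_VISIBLE_DEVICES` and give each a distinct `--master_port` so the rendezvous endpoints do not collide. A sketch of running two of the listed configs side by side follows; the GPU indices and port numbers are placeholder choices, not values from the repo.

```bash
# Two concurrent 4-GPU jobs on one 8-GPU node; ports are arbitrary free ports.
CUDA_VISIBLE_DEVICES=0,1,2,3 python -m torch.distributed.run --nproc_per_node=4 \
    --master_port 29501 train.py --cfg-path lavis/projects/albef/train/ret_coco_ft.yaml &
CUDA_VISIBLE_DEVICES=4,5,6,7 python -m torch.distributed.run --nproc_per_node=4 \
    --master_port 29502 train.py --cfg-path lavis/projects/alpro/train/msrvtt_qa_ft.yaml &
wait  # block until both background jobs finish
```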
-------------------------------------------------------------------------------- /run_scripts/alpro/train/train_didemo_ret.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 train.py --cfg-path lavis/projects/alpro/train/didemo_ret_ft.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/alpro/train/train_msrvtt_qa.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=16 train.py --cfg-path lavis/projects/alpro/train/msrvtt_qa_ft.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/alpro/train/train_msrvtt_ret.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 train.py --cfg-path lavis/projects/alpro/train/msrvtt_retrieval_ft.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/alpro/train/train_msvd_qa.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=16 train.py --cfg-path lavis/projects/alpro/train/msvd_qa_ft.yaml -------------------------------------------------------------------------------- /run_scripts/blip-diffusion/train_db.sh: -------------------------------------------------------------------------------- 1 | SUBJECT_TEXT="dog" 2 | IMAGE_STORAGE="/export/home/workspace/LAVIS-Diffusion/LAVIS/projects/blip-diffusion/images/dreambooth/dog" 3 | MAX_ITERS=40 4 | ITERS_PER_INNER_EPOCH=40 # number of iterations before saving a checkpoint 5 | BATCH_SIZE=3 6 | LR=5e-6 7 | WEIGHT_DECAY=0.01 8 | OUTPUT_DIR="output/debug/BLIP-diffusion/finetune/dog" 9 | 10 | python -m torch.distributed.run \ 11 | --nproc_per_node=1 train.py \ 12 | --cfg-path lavis/projects/blip_diffusion/finetune-db-template.yaml \ 13 | --options datasets.blip_diffusion_finetune.build_info.subject_text=$SUBJECT_TEXT \ 14 | datasets.blip_diffusion_finetune.build_info.images.storage=$IMAGE_STORAGE \ 15 | run.max_iters=$MAX_ITERS \ 16 | run.iters_per_inner_epoch=$ITERS_PER_INNER_EPOCH \ 17 | run.output_dir=$OUTPUT_DIR \ 18 | run.init_lr=$LR \ 19 | run.weight_decay=$WEIGHT_DECAY \ 20 | run.batch_size_train=$BATCH_SIZE 21 | -------------------------------------------------------------------------------- /run_scripts/blip-diffusion/train_db_dog.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=1 train.py --cfg-path lavis/projects/blip_diffusion/finetune-db-dog.yaml -------------------------------------------------------------------------------- /run_scripts/blip-diffusion/train_db_jacket_s.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=1 train.py --cfg-path lavis/projects/blip_diffusion/finetune-db-jacket-s.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/blip-diffusion/train_db_pink_dress.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=1 train.py --cfg-path lavis/projects/blip_diffusion/finetune-db-pink-dress.yaml 2 | -------------------------------------------------------------------------------- 
/run_scripts/blip-diffusion/train_db_shein_jacket.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=1 train.py --cfg-path lavis/projects/blip_diffusion/finetune-db-shein-jacket.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/blip/eval/eval_aokvqa.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=1 evaluate.py --cfg-path lavis/projects/blip/eval/aokvqa_eval.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/blip/eval/eval_coco_cap.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/blip/eval/caption_coco_eval.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/blip/eval/eval_coco_cap_large.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/blip/eval/caption_coco_eval_large.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/blip/eval/eval_nlvr.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/blip/eval/nlvr_eval.yaml 2 | # python -m torch.distributed.run --nproc_per_node=1 evaluate.py --cfg-path lavis/projects/blip/eval/nlvr_eval.yaml 3 | -------------------------------------------------------------------------------- /run_scripts/blip/eval/eval_nocaps.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/blip/eval/nocaps_eval.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/blip/eval/eval_okvqa.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/blip/eval/okvqa_eval.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/blip/eval/eval_ret_coco.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/blip/eval/ret_coco_eval.yaml 2 | # python -m torch.distributed.run --nproc_per_node=16 evaluate.py --cfg-path lavis/projects/blip/eval/ret_coco_eval.yaml 3 | -------------------------------------------------------------------------------- /run_scripts/blip/eval/eval_ret_flickr.sh: -------------------------------------------------------------------------------- 1 | # python -m torch.distributed.run --nproc_per_node=16 evaluate.py --cfg-path lavis/projects/blip/eval/ret_flickr_eval.yaml 2 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/blip/eval/ret_flickr_eval.yaml 3 | -------------------------------------------------------------------------------- /run_scripts/blip/eval/validate_vqa.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py 
--cfg-path lavis/projects/blip/eval/vqav2_eval.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/blip/train/pretrain.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=16 train.py --cfg-path lavis/projects/blip/train/pretrain_14m.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/blip/train/train_aokvqa.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 train.py --cfg-path lavis/projects/blip/train/aokvqa_ft.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/blip/train/train_caption_coco.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 train.py --cfg-path lavis/projects/blip/train/caption_coco_ft.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/blip/train/train_caption_coco_large.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 train.py --cfg-path lavis/projects/blip/train/caption_coco_large_ft.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/blip/train/train_caption_coco_large_iters.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 train.py --cfg-path lavis/projects/blip/coco_cap_ft_iter.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/blip/train/train_nlvr.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=16 train.py --cfg-path lavis/projects/blip/train/nlvr_ft.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/blip/train/train_okvqa.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=16 train.py --cfg-path lavis/projects/blip/train/okvqa_ft.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/blip/train/train_retrieval_coco.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 train.py --cfg-path lavis/projects/blip/train/retrieval_coco_ft.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/blip/train/train_retrieval_flickr.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 train.py --cfg-path lavis/projects/blip/train/retrieval_flickr_ft.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/blip/train/train_vqa.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=16 train.py --cfg-path lavis/projects/blip/vqav2_ft.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/blip2/eval/eval_cap_coco_flant5xl.sh: 
-------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/blip2/eval/caption_coco_flant5xl_eval.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/blip2/eval/eval_cap_coco_opt2.7b.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/blip2/eval/caption_coco_opt2.7b_eval.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/blip2/eval/eval_cap_coco_opt6.7b.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/blip2/eval/caption_coco_opt6.7b_eval.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/blip2/eval/eval_gqa_zeroshot_flant5xl.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/blip2/eval/gqa_zeroshot_flant5xl_eval.yaml -------------------------------------------------------------------------------- /run_scripts/blip2/eval/eval_okvqa_zeroshot_flant5xl.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/blip2/eval/okvqa_zeroshot_flant5xl_eval.yaml -------------------------------------------------------------------------------- /run_scripts/blip2/eval/eval_ret_coco.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/blip2/eval/ret_coco_eval.yaml 2 | # python -m torch.distributed.run --nproc_per_node=16 evaluate.py --cfg-path lavis/projects/blip2/eval/ret_coco_eval.yaml 3 | -------------------------------------------------------------------------------- /run_scripts/blip2/eval/eval_ret_flickr.sh: -------------------------------------------------------------------------------- 1 | # python -m torch.distributed.run --nproc_per_node=16 evaluate.py --cfg-path lavis/projects/blip2/eval/ret_flickr_eval.yaml 2 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/blip2/eval/ret_flickr_eval.yaml 3 | -------------------------------------------------------------------------------- /run_scripts/blip2/eval/validate_vqa_zeroshot_flant5xl.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/blip2/eval/vqav2_zeroshot_flant5xl_eval.yaml -------------------------------------------------------------------------------- /run_scripts/blip2/eval/validate_vqa_zeroshot_opt.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/blip2/eval/vqav2_zeroshot_opt_eval.yaml -------------------------------------------------------------------------------- /run_scripts/blip2/train/pretrain_stage1.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=16 train.py --cfg-path 
lavis/projects/blip2/train/pretrain_stage1.yaml -------------------------------------------------------------------------------- /run_scripts/blip2/train/pretrain_stage2.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=16 train.py --cfg-path lavis/projects/blip2/train/pretrain_stage2.yaml -------------------------------------------------------------------------------- /run_scripts/blip2/train/train_caption_coco.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=16 train.py --cfg-path lavis/projects/blip2/train/caption_coco_ft.yaml -------------------------------------------------------------------------------- /run_scripts/blip2/train/train_retrieval_coco.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=16 train.py --cfg-path lavis/projects/blip2/train/retrieval_coco_ft.yaml -------------------------------------------------------------------------------- /run_scripts/clip/eval/eval_clip_ret_coco.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=1 evaluate.py --cfg-path lavis/projects/clip/exp_coco_ret_eval.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/clip/eval/eval_clip_ret_flickr.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=1 evaluate.py --cfg-path lavis/projects/clip/exp_flickr_ret_eval.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/clip/eval/eval_clip_zs_imnet.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/clip/exp_imnet_zs_eval.yaml # --options run.num_workers=0 2 | -------------------------------------------------------------------------------- /run_scripts/gpt/eval/eval_video_dialogue_avsd.sh: -------------------------------------------------------------------------------- 1 | python evaluate.py --cfg-path lavis/projects/gpt/train/dialogue_avsd_ft.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/gpt/train/train_video_dialogue_avsd.sh: -------------------------------------------------------------------------------- 1 | python train.py --cfg-path lavis/projects/gpt/train/dialogue_avsd_ft.yaml 2 | -------------------------------------------------------------------------------- /run_scripts/pnp-vqa/eval/eval_gqa.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/pnp-vqa/eval/gqa_eval.yaml -------------------------------------------------------------------------------- /run_scripts/pnp-vqa/eval/eval_gqa_3b.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/pnp-vqa/eval/gqa_eval_3b.yaml -------------------------------------------------------------------------------- /run_scripts/pnp-vqa/eval/eval_gqa_large.sh: -------------------------------------------------------------------------------- 1 | python -m 
torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/pnp-vqa/eval/gqa_eval_large.yaml -------------------------------------------------------------------------------- /run_scripts/pnp-vqa/eval/eval_okvqa.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/pnp-vqa/eval/okvqa_eval.yaml -------------------------------------------------------------------------------- /run_scripts/pnp-vqa/eval/eval_okvqa_3b.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/pnp-vqa/eval/okvqa_eval_3b.yaml -------------------------------------------------------------------------------- /run_scripts/pnp-vqa/eval/eval_okvqa_large.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/pnp-vqa/eval/okvqa_eval_large.yaml -------------------------------------------------------------------------------- /run_scripts/pnp-vqa/eval/eval_vqav2.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/pnp-vqa/eval/vqav2_eval.yaml -------------------------------------------------------------------------------- /run_scripts/pnp-vqa/eval/eval_vqav2_3b.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/pnp-vqa/eval/vqav2_eval_3b.yaml -------------------------------------------------------------------------------- /run_scripts/pnp-vqa/eval/eval_vqav2_large.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/pnp-vqa/eval/vqav2_eval_large.yaml -------------------------------------------------------------------------------- /run_scripts/pnp-vqa/eval/eval_vqav2_test.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/pnp-vqa/eval/vqav2_test_eval.yaml -------------------------------------------------------------------------------- /run_scripts/pnp-vqa/eval/eval_vqav2_test_3b.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/pnp-vqa/eval/vqav2_test_eval_3b.yaml -------------------------------------------------------------------------------- /run_scripts/pnp-vqa/eval/eval_vqav2_test_large.sh: -------------------------------------------------------------------------------- 1 | python -m torch.distributed.run --nproc_per_node=8 evaluate.py --cfg-path lavis/projects/pnp-vqa/eval/vqav2_test_eval_large.yaml -------------------------------------------------------------------------------- /run_scripts/run_browser.sh: -------------------------------------------------------------------------------- 1 | streamlit run app/dataset_browser.py --server.fileWatcherType none 2 | -------------------------------------------------------------------------------- /run_scripts/run_demo.sh: 
-------------------------------------------------------------------------------- 1 | streamlit run app/main.py --server.fileWatcherType none 2 | --------------------------------------------------------------------------------
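The final two scripts start the Streamlit demo and dataset browser from the `app/` package. A hedged sketch for exposing the demo on a chosen interface and port is shown below; `--server.address` and `--server.port` are standard Streamlit flags rather than anything defined by this repo, and the values are placeholders.

```bash
# Serve the demo on all interfaces at port 8501, keeping the file watcher
# disabled as in run_scripts/run_demo.sh.
streamlit run app/main.py --server.fileWatcherType none \
    --server.address 0.0.0.0 --server.port 8501
```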