├── .gitignore ├── LICENSE ├── README.md ├── assets ├── amber.png ├── amber_discriminative.png ├── eyes_forest.png ├── llava_bench.png ├── mme-fullset.png ├── mme-hallucination.png ├── motivation.png ├── observation.png ├── overview.png ├── pope.png ├── qualitative_amber_instructblip.png ├── qualitative_amber_instructblip2.png ├── qualitative_amber_llava.png ├── qualitative_amber_llava2.png ├── qualitative_mme2.png ├── qualitative_mme_instructblip.png ├── qualitative_mme_llava.png ├── qualitative_pope.png └── qualitative_pope2.png ├── avisc_utils ├── avisc_sample.py └── vcd_add_noise.py ├── eval_bench ├── SimSun.ttf ├── amber_eval_instructblip.py ├── amber_eval_llava.py ├── amber_loader.py ├── llava_bench_llava.py ├── pope_eval_instructblipb.py ├── pope_eval_llavab.py ├── pope_loader.py └── scripts │ ├── amber_eval.sh │ ├── llava_bench_eval.sh │ └── pope_eval_batch.sh ├── experiments ├── AMBER │ ├── LICENSE │ ├── README.md │ ├── README_File │ │ ├── Paper-Arxiv-orange.svg │ │ ├── comparison.jpg │ │ ├── intro.jpg │ │ ├── result.jpg │ │ └── statistics.jpg │ ├── data │ │ ├── annotations.json │ │ ├── metrics.txt │ │ ├── query │ │ │ ├── query_all.json │ │ │ ├── query_discriminative-attribute.json │ │ │ ├── query_discriminative-existence.json │ │ │ ├── query_discriminative-relation.json │ │ │ ├── query_discriminative.json │ │ │ └── query_generative.json │ │ ├── relation.json │ │ └── safe_words.txt │ └── inference.py ├── cd_scripts │ └── mme_eval.sh ├── eval │ ├── calculation.py │ ├── convert_answer_to_mme.py │ ├── eval_mme.py │ ├── eval_mme │ │ ├── .DS_Store │ │ ├── LaVIN │ │ │ ├── OCR.txt │ │ │ ├── artwork.txt │ │ │ ├── celebrity.txt │ │ │ ├── code_reasoning.txt │ │ │ ├── color.txt │ │ │ ├── commonsense_reasoning.txt │ │ │ ├── count.txt │ │ │ ├── existence.txt │ │ │ ├── landmark.txt │ │ │ ├── numerical_calculation.txt │ │ │ ├── position.txt │ │ │ ├── posters.txt │ │ │ ├── scene.txt │ │ │ └── text_translation.txt │ │ ├── Your_Results │ │ │ ├── OCR.txt │ │ │ ├── artwork.txt │ │ │ ├── celebrity.txt │ │ │ ├── code_reasoning.txt │ │ │ ├── color.txt │ │ │ ├── commonsense_reasoning.txt │ │ │ ├── count.txt │ │ │ ├── existence.txt │ │ │ ├── landmark.txt │ │ │ ├── numerical_calculation.txt │ │ │ ├── position.txt │ │ │ ├── posters.txt │ │ │ ├── scene.txt │ │ │ └── text_translation.txt │ │ └── readme.txt │ ├── eval_pope.py │ ├── mme_instructblip.py │ ├── mme_llava.py │ ├── object_hallucination_vqa_instructblip.py │ └── object_hallucination_vqa_llava.py ├── lavis │ ├── __init__.py │ ├── common │ │ ├── annotator │ │ │ ├── canny │ │ │ │ └── __init__.py │ │ │ ├── ckpts │ │ │ │ └── download.sh │ │ │ ├── hed │ │ │ │ └── __init__.py │ │ │ ├── midas │ │ │ │ ├── __init__.py │ │ │ │ ├── api.py │ │ │ │ ├── midas │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base_model.py │ │ │ │ │ ├── blocks.py │ │ │ │ │ ├── dpt_depth.py │ │ │ │ │ ├── midas_net.py │ │ │ │ │ ├── midas_net_custom.py │ │ │ │ │ ├── transforms.py │ │ │ │ │ └── vit.py │ │ │ │ └── utils.py │ │ │ ├── mlsd │ │ │ │ ├── __init__.py │ │ │ │ ├── models │ │ │ │ │ ├── mbv2_mlsd_large.py │ │ │ │ │ └── mbv2_mlsd_tiny.py │ │ │ │ └── utils.py │ │ │ ├── openpose │ │ │ │ ├── __init__.py │ │ │ │ ├── body.py │ │ │ │ ├── hand.py │ │ │ │ ├── model.py │ │ │ │ └── util.py │ │ │ ├── uniformer │ │ │ │ ├── __init__.py │ │ │ │ ├── configs │ │ │ │ │ └── _base_ │ │ │ │ │ │ ├── datasets │ │ │ │ │ │ ├── ade20k.py │ │ │ │ │ │ ├── chase_db1.py │ │ │ │ │ │ ├── cityscapes.py │ │ │ │ │ │ ├── cityscapes_769x769.py │ │ │ │ │ │ ├── drive.py │ │ │ │ │ │ ├── hrf.py │ │ │ │ │ │ ├── pascal_context.py │ 
│ │ │ │ │ ├── pascal_context_59.py │ │ │ │ │ │ ├── pascal_voc12.py │ │ │ │ │ │ ├── pascal_voc12_aug.py │ │ │ │ │ │ └── stare.py │ │ │ │ │ │ ├── default_runtime.py │ │ │ │ │ │ ├── models │ │ │ │ │ │ ├── ann_r50-d8.py │ │ │ │ │ │ ├── apcnet_r50-d8.py │ │ │ │ │ │ ├── ccnet_r50-d8.py │ │ │ │ │ │ ├── cgnet.py │ │ │ │ │ │ ├── danet_r50-d8.py │ │ │ │ │ │ ├── deeplabv3_r50-d8.py │ │ │ │ │ │ ├── deeplabv3_unet_s5-d16.py │ │ │ │ │ │ ├── deeplabv3plus_r50-d8.py │ │ │ │ │ │ ├── dmnet_r50-d8.py │ │ │ │ │ │ ├── dnl_r50-d8.py │ │ │ │ │ │ ├── emanet_r50-d8.py │ │ │ │ │ │ ├── encnet_r50-d8.py │ │ │ │ │ │ ├── fast_scnn.py │ │ │ │ │ │ ├── fcn_hr18.py │ │ │ │ │ │ ├── fcn_r50-d8.py │ │ │ │ │ │ ├── fcn_unet_s5-d16.py │ │ │ │ │ │ ├── fpn_r50.py │ │ │ │ │ │ ├── fpn_uniformer.py │ │ │ │ │ │ ├── gcnet_r50-d8.py │ │ │ │ │ │ ├── lraspp_m-v3-d8.py │ │ │ │ │ │ ├── nonlocal_r50-d8.py │ │ │ │ │ │ ├── ocrnet_hr18.py │ │ │ │ │ │ ├── ocrnet_r50-d8.py │ │ │ │ │ │ ├── pointrend_r50.py │ │ │ │ │ │ ├── psanet_r50-d8.py │ │ │ │ │ │ ├── pspnet_r50-d8.py │ │ │ │ │ │ ├── pspnet_unet_s5-d16.py │ │ │ │ │ │ ├── upernet_r50.py │ │ │ │ │ │ └── upernet_uniformer.py │ │ │ │ │ │ └── schedules │ │ │ │ │ │ ├── schedule_160k.py │ │ │ │ │ │ ├── schedule_20k.py │ │ │ │ │ │ ├── schedule_40k.py │ │ │ │ │ │ └── schedule_80k.py │ │ │ │ ├── exp │ │ │ │ │ └── upernet_global_small │ │ │ │ │ │ ├── config.py │ │ │ │ │ │ ├── run.sh │ │ │ │ │ │ ├── test.sh │ │ │ │ │ │ ├── test_config_g.py │ │ │ │ │ │ ├── test_config_h32.py │ │ │ │ │ │ └── test_config_w32.py │ │ │ │ ├── mmcv │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── arraymisc │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── quantization.py │ │ │ │ │ ├── cnn │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── alexnet.py │ │ │ │ │ │ ├── bricks │ │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ │ ├── activation.py │ │ │ │ │ │ │ ├── context_block.py │ │ │ │ │ │ │ ├── conv.py │ │ │ │ │ │ │ ├── conv2d_adaptive_padding.py │ │ │ │ │ │ │ ├── conv_module.py │ │ │ │ │ │ │ ├── conv_ws.py │ │ │ │ │ │ │ ├── depthwise_separable_conv_module.py │ │ │ │ │ │ │ ├── drop.py │ │ │ │ │ │ │ ├── generalized_attention.py │ │ │ │ │ │ │ ├── hsigmoid.py │ │ │ │ │ │ │ ├── hswish.py │ │ │ │ │ │ │ ├── non_local.py │ │ │ │ │ │ │ ├── norm.py │ │ │ │ │ │ │ ├── padding.py │ │ │ │ │ │ │ ├── plugin.py │ │ │ │ │ │ │ ├── registry.py │ │ │ │ │ │ │ ├── scale.py │ │ │ │ │ │ │ ├── swish.py │ │ │ │ │ │ │ ├── transformer.py │ │ │ │ │ │ │ ├── upsample.py │ │ │ │ │ │ │ └── wrappers.py │ │ │ │ │ │ ├── builder.py │ │ │ │ │ │ ├── resnet.py │ │ │ │ │ │ ├── utils │ │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ │ ├── flops_counter.py │ │ │ │ │ │ │ ├── fuse_conv_bn.py │ │ │ │ │ │ │ ├── sync_bn.py │ │ │ │ │ │ │ └── weight_init.py │ │ │ │ │ │ └── vgg.py │ │ │ │ │ ├── engine │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── test.py │ │ │ │ │ ├── fileio │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── file_client.py │ │ │ │ │ │ ├── handlers │ │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ │ ├── base.py │ │ │ │ │ │ │ ├── json_handler.py │ │ │ │ │ │ │ ├── pickle_handler.py │ │ │ │ │ │ │ └── yaml_handler.py │ │ │ │ │ │ ├── io.py │ │ │ │ │ │ └── parse.py │ │ │ │ │ ├── image │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── colorspace.py │ │ │ │ │ │ ├── geometric.py │ │ │ │ │ │ ├── io.py │ │ │ │ │ │ ├── misc.py │ │ │ │ │ │ └── photometric.py │ │ │ │ │ ├── model_zoo │ │ │ │ │ │ ├── deprecated.json │ │ │ │ │ │ ├── mmcls.json │ │ │ │ │ │ └── open_mmlab.json │ │ │ │ │ ├── ops │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── assign_score_withk.py │ │ │ │ │ │ ├── ball_query.py │ │ │ │ │ │ ├── bbox.py │ │ │ │ │ │ ├── border_align.py 
│ │ │ │ │ │ ├── box_iou_rotated.py │ │ │ │ │ │ ├── carafe.py │ │ │ │ │ │ ├── cc_attention.py │ │ │ │ │ │ ├── contour_expand.py │ │ │ │ │ │ ├── corner_pool.py │ │ │ │ │ │ ├── correlation.py │ │ │ │ │ │ ├── deform_conv.py │ │ │ │ │ │ ├── deform_roi_pool.py │ │ │ │ │ │ ├── deprecated_wrappers.py │ │ │ │ │ │ ├── focal_loss.py │ │ │ │ │ │ ├── furthest_point_sample.py │ │ │ │ │ │ ├── fused_bias_leakyrelu.py │ │ │ │ │ │ ├── gather_points.py │ │ │ │ │ │ ├── group_points.py │ │ │ │ │ │ ├── info.py │ │ │ │ │ │ ├── iou3d.py │ │ │ │ │ │ ├── knn.py │ │ │ │ │ │ ├── masked_conv.py │ │ │ │ │ │ ├── merge_cells.py │ │ │ │ │ │ ├── modulated_deform_conv.py │ │ │ │ │ │ ├── multi_scale_deform_attn.py │ │ │ │ │ │ ├── nms.py │ │ │ │ │ │ ├── pixel_group.py │ │ │ │ │ │ ├── point_sample.py │ │ │ │ │ │ ├── points_in_boxes.py │ │ │ │ │ │ ├── points_sampler.py │ │ │ │ │ │ ├── psa_mask.py │ │ │ │ │ │ ├── roi_align.py │ │ │ │ │ │ ├── roi_align_rotated.py │ │ │ │ │ │ ├── roi_pool.py │ │ │ │ │ │ ├── roiaware_pool3d.py │ │ │ │ │ │ ├── roipoint_pool3d.py │ │ │ │ │ │ ├── saconv.py │ │ │ │ │ │ ├── scatter_points.py │ │ │ │ │ │ ├── sync_bn.py │ │ │ │ │ │ ├── three_interpolate.py │ │ │ │ │ │ ├── three_nn.py │ │ │ │ │ │ ├── tin_shift.py │ │ │ │ │ │ ├── upfirdn2d.py │ │ │ │ │ │ └── voxelize.py │ │ │ │ │ ├── parallel │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── _functions.py │ │ │ │ │ │ ├── collate.py │ │ │ │ │ │ ├── data_container.py │ │ │ │ │ │ ├── data_parallel.py │ │ │ │ │ │ ├── distributed.py │ │ │ │ │ │ ├── distributed_deprecated.py │ │ │ │ │ │ ├── registry.py │ │ │ │ │ │ ├── scatter_gather.py │ │ │ │ │ │ └── utils.py │ │ │ │ │ ├── runner │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── base_module.py │ │ │ │ │ │ ├── base_runner.py │ │ │ │ │ │ ├── builder.py │ │ │ │ │ │ ├── checkpoint.py │ │ │ │ │ │ ├── default_constructor.py │ │ │ │ │ │ ├── dist_utils.py │ │ │ │ │ │ ├── epoch_based_runner.py │ │ │ │ │ │ ├── fp16_utils.py │ │ │ │ │ │ ├── hooks │ │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ │ ├── checkpoint.py │ │ │ │ │ │ │ ├── closure.py │ │ │ │ │ │ │ ├── ema.py │ │ │ │ │ │ │ ├── evaluation.py │ │ │ │ │ │ │ ├── hook.py │ │ │ │ │ │ │ ├── iter_timer.py │ │ │ │ │ │ │ ├── logger │ │ │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ │ │ ├── base.py │ │ │ │ │ │ │ │ ├── dvclive.py │ │ │ │ │ │ │ │ ├── mlflow.py │ │ │ │ │ │ │ │ ├── neptune.py │ │ │ │ │ │ │ │ ├── pavi.py │ │ │ │ │ │ │ │ ├── tensorboard.py │ │ │ │ │ │ │ │ ├── text.py │ │ │ │ │ │ │ │ └── wandb.py │ │ │ │ │ │ │ ├── lr_updater.py │ │ │ │ │ │ │ ├── memory.py │ │ │ │ │ │ │ ├── momentum_updater.py │ │ │ │ │ │ │ ├── optimizer.py │ │ │ │ │ │ │ ├── profiler.py │ │ │ │ │ │ │ ├── sampler_seed.py │ │ │ │ │ │ │ └── sync_buffer.py │ │ │ │ │ │ ├── iter_based_runner.py │ │ │ │ │ │ ├── log_buffer.py │ │ │ │ │ │ ├── optimizer │ │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ │ ├── builder.py │ │ │ │ │ │ │ └── default_constructor.py │ │ │ │ │ │ ├── priority.py │ │ │ │ │ │ └── utils.py │ │ │ │ │ ├── utils │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── config.py │ │ │ │ │ │ ├── env.py │ │ │ │ │ │ ├── ext_loader.py │ │ │ │ │ │ ├── logging.py │ │ │ │ │ │ ├── misc.py │ │ │ │ │ │ ├── parrots_jit.py │ │ │ │ │ │ ├── parrots_wrapper.py │ │ │ │ │ │ ├── path.py │ │ │ │ │ │ ├── progressbar.py │ │ │ │ │ │ ├── registry.py │ │ │ │ │ │ ├── testing.py │ │ │ │ │ │ ├── timer.py │ │ │ │ │ │ ├── trace.py │ │ │ │ │ │ └── version_utils.py │ │ │ │ │ ├── version.py │ │ │ │ │ ├── video │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── io.py │ │ │ │ │ │ ├── optflow.py │ │ │ │ │ │ └── processing.py │ │ │ │ │ └── visualization │ │ │ │ │ │ ├── __init__.py 
│ │ │ │ │ │ ├── color.py │ │ │ │ │ │ ├── image.py │ │ │ │ │ │ └── optflow.py │ │ │ │ ├── mmcv_custom │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── checkpoint.py │ │ │ │ └── mmseg │ │ │ │ │ ├── apis │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── inference.py │ │ │ │ │ ├── test.py │ │ │ │ │ └── train.py │ │ │ │ │ ├── core │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── evaluation │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── class_names.py │ │ │ │ │ │ ├── eval_hooks.py │ │ │ │ │ │ └── metrics.py │ │ │ │ │ ├── seg │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── builder.py │ │ │ │ │ │ └── sampler │ │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ │ ├── base_pixel_sampler.py │ │ │ │ │ │ │ └── ohem_pixel_sampler.py │ │ │ │ │ └── utils │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── misc.py │ │ │ │ │ ├── datasets │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── ade.py │ │ │ │ │ ├── builder.py │ │ │ │ │ ├── chase_db1.py │ │ │ │ │ ├── cityscapes.py │ │ │ │ │ ├── custom.py │ │ │ │ │ ├── dataset_wrappers.py │ │ │ │ │ ├── drive.py │ │ │ │ │ ├── hrf.py │ │ │ │ │ ├── pascal_context.py │ │ │ │ │ ├── pipelines │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── compose.py │ │ │ │ │ │ ├── formating.py │ │ │ │ │ │ ├── loading.py │ │ │ │ │ │ ├── test_time_aug.py │ │ │ │ │ │ └── transforms.py │ │ │ │ │ ├── stare.py │ │ │ │ │ └── voc.py │ │ │ │ │ ├── models │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── backbones │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── cgnet.py │ │ │ │ │ │ ├── fast_scnn.py │ │ │ │ │ │ ├── hrnet.py │ │ │ │ │ │ ├── mobilenet_v2.py │ │ │ │ │ │ ├── mobilenet_v3.py │ │ │ │ │ │ ├── resnest.py │ │ │ │ │ │ ├── resnet.py │ │ │ │ │ │ ├── resnext.py │ │ │ │ │ │ ├── unet.py │ │ │ │ │ │ ├── uniformer.py │ │ │ │ │ │ └── vit.py │ │ │ │ │ ├── builder.py │ │ │ │ │ ├── decode_heads │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── ann_head.py │ │ │ │ │ │ ├── apc_head.py │ │ │ │ │ │ ├── aspp_head.py │ │ │ │ │ │ ├── cascade_decode_head.py │ │ │ │ │ │ ├── cc_head.py │ │ │ │ │ │ ├── da_head.py │ │ │ │ │ │ ├── decode_head.py │ │ │ │ │ │ ├── dm_head.py │ │ │ │ │ │ ├── dnl_head.py │ │ │ │ │ │ ├── ema_head.py │ │ │ │ │ │ ├── enc_head.py │ │ │ │ │ │ ├── fcn_head.py │ │ │ │ │ │ ├── fpn_head.py │ │ │ │ │ │ ├── gc_head.py │ │ │ │ │ │ ├── lraspp_head.py │ │ │ │ │ │ ├── nl_head.py │ │ │ │ │ │ ├── ocr_head.py │ │ │ │ │ │ ├── point_head.py │ │ │ │ │ │ ├── psa_head.py │ │ │ │ │ │ ├── psp_head.py │ │ │ │ │ │ ├── sep_aspp_head.py │ │ │ │ │ │ ├── sep_fcn_head.py │ │ │ │ │ │ └── uper_head.py │ │ │ │ │ ├── losses │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── accuracy.py │ │ │ │ │ │ ├── cross_entropy_loss.py │ │ │ │ │ │ ├── dice_loss.py │ │ │ │ │ │ ├── lovasz_loss.py │ │ │ │ │ │ └── utils.py │ │ │ │ │ ├── necks │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── fpn.py │ │ │ │ │ │ └── multilevel_neck.py │ │ │ │ │ ├── segmentors │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── base.py │ │ │ │ │ │ ├── cascade_encoder_decoder.py │ │ │ │ │ │ └── encoder_decoder.py │ │ │ │ │ └── utils │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── drop.py │ │ │ │ │ │ ├── inverted_residual.py │ │ │ │ │ │ ├── make_divisible.py │ │ │ │ │ │ ├── res_layer.py │ │ │ │ │ │ ├── se_layer.py │ │ │ │ │ │ ├── self_attention_block.py │ │ │ │ │ │ ├── up_conv_block.py │ │ │ │ │ │ └── weight_init.py │ │ │ │ │ ├── ops │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── encoding.py │ │ │ │ │ └── wrappers.py │ │ │ │ │ └── utils │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── collect_env.py │ │ │ │ │ └── logger.py │ │ │ └── util.py │ │ ├── config.py │ │ ├── dist_utils.py │ │ ├── gradcam.py │ │ ├── logger.py │ │ ├── optims.py │ │ ├── registry.py │ │ ├── utils.py │ │ └── 
vqa_tools │ │ │ ├── __init__.py │ │ │ ├── vqa.py │ │ │ └── vqa_eval.py │ ├── configs │ │ ├── datasets │ │ │ ├── aokvqa │ │ │ │ └── defaults.yaml │ │ │ ├── avsd │ │ │ │ └── defaults_dial.yaml │ │ │ ├── blip_diffusion_datasets │ │ │ │ └── defaults.yaml │ │ │ ├── coco │ │ │ │ ├── defaults_cap.yaml │ │ │ │ ├── defaults_ret.yaml │ │ │ │ ├── defaults_vqa.yaml │ │ │ │ └── eval_vqa.yaml │ │ │ ├── conceptual_caption │ │ │ │ ├── defaults_12m.yaml │ │ │ │ └── defaults_3m.yaml │ │ │ ├── didemo │ │ │ │ └── defaults_ret.yaml │ │ │ ├── flickr30k │ │ │ │ └── defaults.yaml │ │ │ ├── gqa │ │ │ │ ├── balanced_testdev.yaml │ │ │ │ ├── balanced_val.yaml │ │ │ │ └── defaults.yaml │ │ │ ├── imagenet │ │ │ │ └── defaults.yaml │ │ │ ├── laion │ │ │ │ └── defaults_2B_multi.yaml │ │ │ ├── msrvtt │ │ │ │ ├── defaults_cap.yaml │ │ │ │ ├── defaults_qa.yaml │ │ │ │ └── defaults_ret.yaml │ │ │ ├── msvd │ │ │ │ ├── defaults_cap.yaml │ │ │ │ └── defaults_qa.yaml │ │ │ ├── nlvr │ │ │ │ └── defaults.yaml │ │ │ ├── nocaps │ │ │ │ └── defaults.yaml │ │ │ ├── okvqa │ │ │ │ └── defaults.yaml │ │ │ ├── sbu_caption │ │ │ │ └── defaults.yaml │ │ │ ├── snli_ve │ │ │ │ └── defaults.yaml │ │ │ ├── vatex │ │ │ │ └── defaults_cap.yaml │ │ │ └── vg │ │ │ │ ├── defaults_caption.yaml │ │ │ │ └── defaults_vqa.yaml │ │ ├── default.yaml │ │ └── models │ │ │ ├── albef_classification_ve.yaml │ │ │ ├── albef_feature_extractor.yaml │ │ │ ├── albef_nlvr.yaml │ │ │ ├── albef_pretrain_base.yaml │ │ │ ├── albef_retrieval_coco.yaml │ │ │ ├── albef_retrieval_flickr.yaml │ │ │ ├── albef_vqav2.yaml │ │ │ ├── alpro_qa_msrvtt.yaml │ │ │ ├── alpro_qa_msvd.yaml │ │ │ ├── alpro_retrieval_didemo.yaml │ │ │ ├── alpro_retrieval_msrvtt.yaml │ │ │ ├── bert_config.json │ │ │ ├── bert_config_alpro.json │ │ │ ├── blip-diffusion │ │ │ ├── blip_diffusion_base.yaml │ │ │ ├── blip_diffusion_controlnet_canny.yaml │ │ │ ├── blip_diffusion_controlnet_depth.yaml │ │ │ └── blip_diffusion_controlnet_hed.yaml │ │ │ ├── blip2 │ │ │ ├── blip2_caption_flant5xl.yaml │ │ │ ├── blip2_caption_opt2.7b.yaml │ │ │ ├── blip2_caption_opt6.7b.yaml │ │ │ ├── blip2_coco.yaml │ │ │ ├── blip2_instruct_flant5xl.yaml │ │ │ ├── blip2_instruct_flant5xxl.yaml │ │ │ ├── blip2_instruct_vicuna13b.yaml │ │ │ ├── blip2_instruct_vicuna7b.yaml │ │ │ ├── blip2_pretrain.yaml │ │ │ ├── blip2_pretrain_flant5xl.yaml │ │ │ ├── blip2_pretrain_flant5xl_vitL.yaml │ │ │ ├── blip2_pretrain_flant5xxl.yaml │ │ │ ├── blip2_pretrain_llama7b.yaml │ │ │ ├── blip2_pretrain_opt2.7b.yaml │ │ │ ├── blip2_pretrain_opt6.7b.yaml │ │ │ └── blip2_pretrain_vitL.yaml │ │ │ ├── blip_caption_base_coco.yaml │ │ │ ├── blip_caption_large_coco.yaml │ │ │ ├── blip_classification_base.yaml │ │ │ ├── blip_feature_extractor_base.yaml │ │ │ ├── blip_itm_base.yaml │ │ │ ├── blip_itm_large.yaml │ │ │ ├── blip_nlvr.yaml │ │ │ ├── blip_pretrain_base.yaml │ │ │ ├── blip_pretrain_large.yaml │ │ │ ├── blip_retrieval_coco.yaml │ │ │ ├── blip_retrieval_flickr.yaml │ │ │ ├── blip_vqa_aokvqa.yaml │ │ │ ├── blip_vqa_okvqa.yaml │ │ │ ├── blip_vqav2.yaml │ │ │ ├── clip │ │ │ ├── RN101-quickgelu.json │ │ │ ├── RN101.json │ │ │ ├── RN50-quickgelu.json │ │ │ ├── RN50.json │ │ │ ├── RN50x16.json │ │ │ ├── RN50x4.json │ │ │ ├── ViT-B-16-plus-240.json │ │ │ ├── ViT-B-16-plus.json │ │ │ ├── ViT-B-16.json │ │ │ ├── ViT-B-32-plus-256.json │ │ │ ├── ViT-B-32-quickgelu.json │ │ │ ├── ViT-B-32.json │ │ │ ├── ViT-H-14.json │ │ │ ├── ViT-H-16.json │ │ │ ├── ViT-L-14-280.json │ │ │ ├── ViT-L-14-336.json │ │ │ ├── ViT-L-14.json │ │ │ ├── ViT-L-16-320.json │ │ │ ├── 
ViT-L-16.json │ │ │ ├── ViT-g-14.json │ │ │ ├── timm-efficientnetv2_rw_s.json │ │ │ ├── timm-resnet50d.json │ │ │ ├── timm-resnetaa50d.json │ │ │ ├── timm-resnetblur50.json │ │ │ ├── timm-swin_base_patch4_window7_224.json │ │ │ ├── timm-vit_base_patch16_224.json │ │ │ ├── timm-vit_base_patch32_224.json │ │ │ └── timm-vit_small_patch16_224.json │ │ │ ├── clip_resnet50.yaml │ │ │ ├── clip_vit_base16.yaml │ │ │ ├── clip_vit_base32.yaml │ │ │ ├── clip_vit_large14.yaml │ │ │ ├── clip_vit_large14_336.yaml │ │ │ ├── gpt_dialogue_base.yaml │ │ │ ├── img2prompt-vqa │ │ │ └── img2prompt_vqa_base.yaml │ │ │ ├── med_config.json │ │ │ ├── med_config_albef.json │ │ │ ├── med_large_config.json │ │ │ └── pnp-vqa │ │ │ ├── pnp_vqa_3b.yaml │ │ │ ├── pnp_vqa_base.yaml │ │ │ ├── pnp_vqa_large.yaml │ │ │ ├── unifiedqav2_3b_config.json │ │ │ ├── unifiedqav2_base_config.json │ │ │ └── unifiedqav2_large_config.json │ ├── datasets │ │ ├── builders │ │ │ ├── __init__.py │ │ │ ├── base_dataset_builder.py │ │ │ ├── caption_builder.py │ │ │ ├── classification_builder.py │ │ │ ├── dialogue_builder.py │ │ │ ├── image_text_pair_builder.py │ │ │ ├── imagefolder_builder.py │ │ │ ├── retrieval_builder.py │ │ │ ├── text_to_image_generation_builder.py │ │ │ ├── video_qa_builder.py │ │ │ └── vqa_builder.py │ │ ├── data_utils.py │ │ ├── datasets │ │ │ ├── aok_vqa_datasets.py │ │ │ ├── avsd_dialogue_datasets.py │ │ │ ├── base_dataset.py │ │ │ ├── caption_datasets.py │ │ │ ├── coco_caption_datasets.py │ │ │ ├── coco_vqa_datasets.py │ │ │ ├── dataloader_utils.py │ │ │ ├── dialogue_datasets.py │ │ │ ├── gqa_datasets.py │ │ │ ├── image_text_pair_datasets.py │ │ │ ├── imagefolder_dataset.py │ │ │ ├── laion_dataset.py │ │ │ ├── multimodal_classification_datasets.py │ │ │ ├── nlvr_datasets.py │ │ │ ├── retrieval_datasets.py │ │ │ ├── snli_ve_datasets.py │ │ │ ├── subject_driven_t2i_dataset.py │ │ │ ├── vg_vqa_datasets.py │ │ │ ├── video_caption_datasets.py │ │ │ ├── video_vqa_datasets.py │ │ │ └── vqa_datasets.py │ │ └── download_scripts │ │ │ ├── DownloadConceptualCaptions │ │ │ ├── LICENSE │ │ │ ├── README.md │ │ │ ├── create_annotation_12m.ipynb │ │ │ ├── create_annotation_3m.ipynb │ │ │ ├── download_data_cc12m.py │ │ │ └── download_data_cc3m.py │ │ │ ├── download_coco.py │ │ │ ├── download_didemo.py │ │ │ ├── download_flickr.py │ │ │ ├── download_gqa.py │ │ │ ├── download_msrvtt.py │ │ │ ├── download_msvd.py │ │ │ ├── download_nocaps.py │ │ │ ├── download_sbu.py │ │ │ └── download_vg.py │ ├── models │ │ ├── __init__.py │ │ ├── albef_models │ │ │ ├── __init__.py │ │ │ ├── albef_classification.py │ │ │ ├── albef_feature_extractor.py │ │ │ ├── albef_nlvr.py │ │ │ ├── albef_outputs.py │ │ │ ├── albef_pretrain.py │ │ │ ├── albef_retrieval.py │ │ │ └── albef_vqa.py │ │ ├── alpro_models │ │ │ ├── __init__.py │ │ │ ├── alpro_outputs.py │ │ │ ├── alpro_qa.py │ │ │ └── alpro_retrieval.py │ │ ├── base_model.py │ │ ├── blip2_models │ │ │ ├── Qformer.py │ │ │ ├── __init__.py │ │ │ ├── blip2.py │ │ │ ├── blip2_image_text_matching.py │ │ │ ├── blip2_opt.py │ │ │ ├── blip2_qformer.py │ │ │ ├── blip2_t5.py │ │ │ ├── blip2_t5_instruct.py │ │ │ ├── blip2_vicuna_instruct.py │ │ │ ├── modeling_llama.py │ │ │ ├── modeling_opt.py │ │ │ └── modeling_t5.py │ │ ├── blip_diffusion_models │ │ │ ├── __init__.py │ │ │ ├── blip_diffusion.py │ │ │ ├── modeling_ctx_clip.py │ │ │ ├── ptp_utils.py │ │ │ └── utils.py │ │ ├── blip_models │ │ │ ├── __init__.py │ │ │ ├── blip.py │ │ │ ├── blip_caption.py │ │ │ ├── blip_classification.py │ │ │ ├── 
blip_feature_extractor.py │ │ │ ├── blip_image_text_matching.py │ │ │ ├── blip_nlvr.py │ │ │ ├── blip_outputs.py │ │ │ ├── blip_pretrain.py │ │ │ ├── blip_retrieval.py │ │ │ ├── blip_vqa.py │ │ │ └── nlvr_encoder.py │ │ ├── clip_models │ │ │ ├── __init__.py │ │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ │ ├── clip_outputs.py │ │ │ ├── loss.py │ │ │ ├── model.py │ │ │ ├── pics │ │ │ │ └── CLIP.png │ │ │ ├── pretrained.py │ │ │ ├── timm_model.py │ │ │ ├── tokenizer.py │ │ │ ├── transform.py │ │ │ └── utils.py │ │ ├── clip_vit.py │ │ ├── eva_vit.py │ │ ├── gpt_models │ │ │ └── gpt_dialogue.py │ │ ├── img2prompt_models │ │ │ ├── __init__.py │ │ │ └── img2prompt_vqa.py │ │ ├── med.py │ │ ├── pnp_vqa_models │ │ │ ├── __init__.py │ │ │ ├── pnp_unifiedqav2_fid.py │ │ │ └── pnp_vqa.py │ │ ├── timesformer │ │ │ ├── __init__.py │ │ │ ├── conv2d_same.py │ │ │ ├── features.py │ │ │ ├── helpers.py │ │ │ ├── linear.py │ │ │ ├── vit.py │ │ │ └── vit_utils.py │ │ └── vit.py │ ├── processors │ │ ├── __init__.py │ │ ├── alpro_processors.py │ │ ├── base_processor.py │ │ ├── blip_diffusion_processors.py │ │ ├── blip_processors.py │ │ ├── clip_processors.py │ │ ├── functional_video.py │ │ ├── gpt_processors.py │ │ ├── randaugment.py │ │ └── transforms_video.py │ ├── projects │ │ ├── albef │ │ │ ├── eval │ │ │ │ ├── nlvr_eval.yaml │ │ │ │ ├── ret_coco_eval.yaml │ │ │ │ ├── ret_flickr30k_eval.yaml │ │ │ │ ├── snli_ve_eval.yaml │ │ │ │ ├── vqa_test.yaml │ │ │ │ └── vqa_val.yaml │ │ │ └── train │ │ │ │ ├── aokvqa_ft.yaml │ │ │ │ ├── nlvr_ft.yaml │ │ │ │ ├── okvqa_ft.yaml │ │ │ │ ├── pretrain.yaml │ │ │ │ ├── ret_coco_ft.yaml │ │ │ │ ├── ret_flickr30k_ft.yaml │ │ │ │ ├── snli_ve_ft.yaml │ │ │ │ └── vqa_ft.yaml │ │ ├── alpro │ │ │ ├── eval │ │ │ │ ├── didemo_ret_eval.yaml │ │ │ │ ├── msrvtt_qa_eval.yaml │ │ │ │ ├── msrvtt_ret_eval.yaml │ │ │ │ └── msvd_qa_eval.yaml │ │ │ └── train │ │ │ │ ├── didemo_ret_ft.yaml │ │ │ │ ├── msrvtt_qa_ft.yaml │ │ │ │ ├── msrvtt_retrieval_ft.yaml │ │ │ │ └── msvd_qa_ft.yaml │ │ ├── blip │ │ │ ├── coco_cap_ft_iter.yaml │ │ │ ├── eval │ │ │ │ ├── aokvqa_eval.yaml │ │ │ │ ├── caption_coco_eval.yaml │ │ │ │ ├── caption_coco_eval_large.yaml │ │ │ │ ├── nlvr_eval.yaml │ │ │ │ ├── nocaps_eval.yaml │ │ │ │ ├── okvqa_eval.yaml │ │ │ │ ├── ret_coco_eval.yaml │ │ │ │ ├── ret_flickr_eval.yaml │ │ │ │ └── vqav2_eval.yaml │ │ │ └── train │ │ │ │ ├── aokvqa_ft.yaml │ │ │ │ ├── caption_coco_ft.yaml │ │ │ │ ├── caption_coco_large_ft.yaml │ │ │ │ ├── nlvr_ft.yaml │ │ │ │ ├── okvqa_ft.yaml │ │ │ │ ├── pretrain_14m.yaml │ │ │ │ ├── retrieval_coco_ft.yaml │ │ │ │ ├── retrieval_flickr_ft.yaml │ │ │ │ └── vqav2_ft.yaml │ │ ├── blip2 │ │ │ ├── eval │ │ │ │ ├── caption_coco_flant5xl_eval.yaml │ │ │ │ ├── caption_coco_opt2.7b_eval.yaml │ │ │ │ ├── caption_coco_opt6.7b_eval.yaml │ │ │ │ ├── gqa_zeroshot_flant5xl_eval.yaml │ │ │ │ ├── okvqa_zeroshot_flant5xl_eval.yaml │ │ │ │ ├── ret_coco_eval.yaml │ │ │ │ ├── ret_flickr_eval.yaml │ │ │ │ ├── vqav2_zeroshot_flant5xl_eval.yaml │ │ │ │ └── vqav2_zeroshot_opt_eval.yaml │ │ │ └── train │ │ │ │ ├── caption_coco_ft.yaml │ │ │ │ ├── pretrain_stage1.yaml │ │ │ │ ├── pretrain_stage2.yaml │ │ │ │ └── retrieval_coco_ft.yaml │ │ ├── blip_diffusion │ │ │ ├── finetune-db-dog.yaml │ │ │ ├── finetune-db-pink-dress.yaml │ │ │ ├── finetune-db-shein-jacket.yaml │ │ │ └── finetune-db-template.yaml │ │ ├── clip │ │ │ ├── exp_coco_ret_eval.yaml │ │ │ ├── exp_flickr_ret_eval.yaml │ │ │ └── exp_imnet_zs_eval.yaml │ │ ├── gpt │ │ │ ├── eval │ │ │ │ └── dialogue_avsd_eval.yaml │ │ │ └── train 
│ │ │ │ └── dialogue_avsd_ft.yaml │ │ └── pnp-vqa │ │ │ └── eval │ │ │ ├── gqa_eval.yaml │ │ │ ├── gqa_eval_3b.yaml │ │ │ ├── gqa_eval_large.yaml │ │ │ ├── okvqa_eval.yaml │ │ │ ├── okvqa_eval_3b.yaml │ │ │ ├── okvqa_eval_large.yaml │ │ │ ├── vqav2_eval.yaml │ │ │ ├── vqav2_eval_3b.yaml │ │ │ ├── vqav2_eval_large.yaml │ │ │ ├── vqav2_test_eval.yaml │ │ │ ├── vqav2_test_eval_3b.yaml │ │ │ └── vqav2_test_eval_large.yaml │ ├── runners │ │ ├── __init__.py │ │ ├── runner_base.py │ │ └── runner_iter.py │ └── tasks │ │ ├── __init__.py │ │ ├── base_task.py │ │ ├── captioning.py │ │ ├── dialogue.py │ │ ├── image_text_pretrain.py │ │ ├── multimodal_classification.py │ │ ├── retrieval.py │ │ ├── text_to_image_generation.py │ │ ├── vqa.py │ │ └── vqa_reading_comprehension.py └── llava │ ├── __init__.py │ ├── constants.py │ ├── conversation.py │ ├── mm_utils.py │ ├── model │ ├── __init__.py │ ├── builder.py │ ├── consolidate.py │ ├── language_model │ │ ├── llava_llama.py │ │ ├── llava_mpt.py │ │ └── mpt │ │ │ ├── adapt_tokenizer.py │ │ │ ├── attention.py │ │ │ ├── blocks.py │ │ │ ├── configuration_mpt.py │ │ │ ├── custom_embedding.py │ │ │ ├── flash_attn_triton.py │ │ │ ├── hf_prefixlm_converter.py │ │ │ ├── meta_init_context.py │ │ │ ├── modeling_mpt.py │ │ │ ├── norm.py │ │ │ └── param_init_fns.py │ ├── llava_arch.py │ ├── make_delta.py │ ├── multimodal_encoder │ │ ├── builder.py │ │ └── clip_encoder.py │ ├── multimodal_projector │ │ └── builder.py │ └── utils.py │ └── utils.py ├── requirements.txt └── utils ├── dist_util.py └── logger.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Sangmin Woo 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /assets/amber.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/amber.png -------------------------------------------------------------------------------- /assets/amber_discriminative.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/amber_discriminative.png -------------------------------------------------------------------------------- /assets/eyes_forest.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/eyes_forest.png -------------------------------------------------------------------------------- /assets/llava_bench.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/llava_bench.png -------------------------------------------------------------------------------- /assets/mme-fullset.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/mme-fullset.png -------------------------------------------------------------------------------- /assets/mme-hallucination.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/mme-hallucination.png -------------------------------------------------------------------------------- /assets/motivation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/motivation.png -------------------------------------------------------------------------------- /assets/observation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/observation.png -------------------------------------------------------------------------------- /assets/overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/overview.png -------------------------------------------------------------------------------- /assets/pope.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/pope.png -------------------------------------------------------------------------------- /assets/qualitative_amber_instructblip.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/qualitative_amber_instructblip.png -------------------------------------------------------------------------------- /assets/qualitative_amber_instructblip2.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/qualitative_amber_instructblip2.png -------------------------------------------------------------------------------- /assets/qualitative_amber_llava.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/qualitative_amber_llava.png -------------------------------------------------------------------------------- /assets/qualitative_amber_llava2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/qualitative_amber_llava2.png -------------------------------------------------------------------------------- /assets/qualitative_mme2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/qualitative_mme2.png -------------------------------------------------------------------------------- /assets/qualitative_mme_instructblip.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/qualitative_mme_instructblip.png -------------------------------------------------------------------------------- /assets/qualitative_mme_llava.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/qualitative_mme_llava.png -------------------------------------------------------------------------------- /assets/qualitative_pope.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/qualitative_pope.png -------------------------------------------------------------------------------- /assets/qualitative_pope2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/qualitative_pope2.png -------------------------------------------------------------------------------- /avisc_utils/vcd_add_noise.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | def add_diffusion_noise(image_tensor, noise_step): 4 | num_steps = 1000 # Number of diffusion steps 5 | 6 | # decide beta in each step 7 | betas = torch.linspace(-6,6,num_steps) 8 | betas = torch.sigmoid(betas) * (0.5e-2 - 1e-5) + 1e-5 9 | 10 | # decide alphas in each step 11 | alphas = 1 - betas 12 | alphas_prod = torch.cumprod(alphas, dim=0) 13 | alphas_prod_p = torch.cat([torch.tensor([1]).float(), alphas_prod[:-1]],0) # p for previous 14 | alphas_bar_sqrt = torch.sqrt(alphas_prod) 15 | one_minus_alphas_bar_log = torch.log(1 - alphas_prod) 16 | one_minus_alphas_bar_sqrt = torch.sqrt(1 - alphas_prod) 17 | 18 | def q_x(x_0,t): 19 | noise = torch.randn_like(x_0) 20 | alphas_t = alphas_bar_sqrt[t] 21 | alphas_1_m_t = one_minus_alphas_bar_sqrt[t] 22 | return (alphas_t*x_0 + alphas_1_m_t*noise) 23 | 24 | noise_delta = int(noise_step) # from 0-999 25 | 
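# The q_x helper defined above draws a closed-form forward-diffusion sample,
#   x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * eps,  with eps ~ N(0, I),
# where alpha_bar_t = cumprod(1 - beta_t) under the sigmoid beta schedule built above.
# A larger noise_step therefore yields a more heavily corrupted image; the returned
# image_tensor_cd is presumably the distorted visual input consumed by the
# contrastive-decoding branch (cf. the use_cd / cd_alpha / cd_beta flags in the eval scripts).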
noisy_image = image_tensor.clone() 26 | image_tensor_cd = q_x(noisy_image,noise_step) 27 | 28 | return image_tensor_cd 29 | 30 | -------------------------------------------------------------------------------- /eval_bench/SimSun.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/eval_bench/SimSun.ttf -------------------------------------------------------------------------------- /eval_bench/scripts/llava_bench_eval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | ## set below 5 | #################################################### 6 | seed=42 7 | model="llava" # llava | qwen-vl | instructblip 8 | use_avisc=false 9 | use_cd=False 10 | gpus=0 11 | max_token=64 12 | cd_alpha=2.5 13 | cd_beta=0.1 14 | model_path="/path/to/the/checkpoints/llava-v1.5-7b" 15 | pope_path="path/to/dataset/llava-bench-in-the-wild/questions.jsonl" 16 | data_path="path/to/dataset/llava-bench-in-the-wild/images" 17 | log_path="path/to//llava_bench/.json" 18 | conv="llava_v1" 19 | batch_size=1 20 | #################################################### 21 | 22 | export CUDA_VISIBLE_DEVICES=${gpus} 23 | python ./eval_bench/llava_bench_llava.py \ 24 | --seed ${seed} \ 25 | --model-path ${model_path} \ 26 | --question-file ${pope_path} \ 27 | --image-folder ${data_path} \ 28 | --answers-file ${log_path} \ 29 | --conv ${conv} \ 30 | --use_avisc ${use_avisc} \ 31 | --use_cd ${use_cd} \ 32 | --max_token ${max_token} \ 33 | --cd_alpha ${cd_alpha} \ 34 | --cd_beta ${cd_beta} \ 35 | 36 | -------------------------------------------------------------------------------- /experiments/AMBER/README_File/Paper-Arxiv-orange.svg: -------------------------------------------------------------------------------- 1 | Paper: ArxivPaperArxiv -------------------------------------------------------------------------------- /experiments/AMBER/README_File/comparison.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/experiments/AMBER/README_File/comparison.jpg -------------------------------------------------------------------------------- /experiments/AMBER/README_File/intro.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/experiments/AMBER/README_File/intro.jpg -------------------------------------------------------------------------------- /experiments/AMBER/README_File/result.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/experiments/AMBER/README_File/result.jpg -------------------------------------------------------------------------------- /experiments/AMBER/README_File/statistics.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/experiments/AMBER/README_File/statistics.jpg -------------------------------------------------------------------------------- /experiments/AMBER/data/metrics.txt: -------------------------------------------------------------------------------- 1 | chair_num=0.001 2 | chair_score=0 3 | safe_cover_num=0.001 4 | 
safe_cover_score=0
5 | hallu_cover_num=0.001
6 | hallu_cover_score=0
7 | non_hallu_score=0
8 | non_hallu_num=0.001
9 | qa_correct_score=0
10 | qa_correct_num=0.001
11 | qa_no_score=0
12 | qa_no_num=0.001
13 | qa_ans_no_score=0
14 | qa_ans_no_num=0.001
15 | as_qa_correct_score=0
16 | as_qa_correct_num=0.001
17 | as_qa_no_score=0
18 | as_qa_no_num=0.001
19 | as_qa_ans_no_score=0
20 | as_qa_ans_no_num=0.001
21 | an_qa_correct_score=0
22 | an_qa_correct_num=0.001
23 | an_qa_no_score=0
24 | an_qa_no_num=0.001
25 | an_qa_ans_no_score=0
26 | an_qa_ans_no_num=0.001
27 | aa_qa_correct_score=0
28 | aa_qa_correct_num=0.001
29 | aa_qa_no_score=0
30 | aa_qa_no_num=0.001
31 | aa_qa_ans_no_score=0
32 | aa_qa_ans_no_num=0.001
33 | asso_qa_correct_score=0
34 | asso_qa_correct_num=0.001
35 | asso_qa_no_score=0
36 | asso_qa_no_num=0.001
37 | asso_qa_ans_no_score=0
38 | asso_qa_ans_no_num=0.001
39 | ha_qa_correct_score=0
40 | ha_qa_correct_num=0.001
41 | ha_qa_no_score=0
42 | ha_qa_no_num=0.001
43 | ha_qa_ans_no_score=0
44 | ha_qa_ans_no_num=0.001
--------------------------------------------------------------------------------
/experiments/AMBER/data/safe_words.txt:
--------------------------------------------------------------------------------
1 | orange
2 | snack
3 | line
4 | camera
5 | light
6 | shoe
7 | sign
8 | range
9 | individual
--------------------------------------------------------------------------------
/experiments/eval/eval_mme/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/experiments/eval/eval_mme/.DS_Store
--------------------------------------------------------------------------------
/experiments/eval/eval_mme/readme.txt:
--------------------------------------------------------------------------------
1 | # This is an automated calculation script for the acc, acc+, and score.
2 | 
3 | # You can directly run "python3 calculation.py" to get the evaluation results of LaVIN.
4 | 
5 | # In order to get the statistical results of your model:
6 | 
7 | (1) Fill all the files in "Your_Results", adding your model's responses:
8 | Each file in "Your_Results" consists of:
9 | Image_Name + "\t" + Question + "\t" + Ground_Truth_Answer + "\n"
10 | 
11 | You need to add the responses of your model as:
12 | Image_Name + "\t" + Question + "\t" + Ground_Truth_Answer + "\t" + Your_Response + "\n"
13 | 
14 | Note: if your responses contain "\n", please delete it. For each question, your response can only be in one line, not across lines!
15 | 
16 | (2) run "python3 calculation.py --results_dir ./Your_Results"
17 | 
18 | 
19 | 
20 | 
--------------------------------------------------------------------------------
/experiments/lavis/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import os 9 | import sys 10 | 11 | from omegaconf import OmegaConf 12 | 13 | from lavis.common.registry import registry 14 | 15 | from lavis.datasets.builders import * 16 | from lavis.models import * 17 | from lavis.processors import * 18 | from lavis.tasks import * 19 | 20 | 21 | root_dir = os.path.dirname(os.path.abspath(__file__)) 22 | default_cfg = OmegaConf.load(os.path.join(root_dir, "configs/default.yaml")) 23 | 24 | registry.register_path("library_root", root_dir) 25 | repo_root = os.path.join(root_dir, "..") 26 | registry.register_path("repo_root", repo_root) 27 | cache_root = os.path.join(repo_root, default_cfg.env.cache_root) 28 | registry.register_path("cache_root", cache_root) 29 | 30 | registry.register("MAX_INT", sys.maxsize) 31 | registry.register("SPLIT_NAMES", ["train", "val", "test"]) 32 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/canny/__init__.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | 3 | 4 | class CannyDetector: 5 | def __call__(self, img, low_threshold, high_threshold): 6 | return cv2.Canny(img, low_threshold, high_threshold) 7 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/ckpts/download.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | wget https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/dpt_hybrid-midas-501f0c75.pt 4 | wget https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/network-bsds500.pth 5 | 6 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/midas/midas/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/experiments/lavis/common/annotator/midas/midas/__init__.py -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/midas/midas/base_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class BaseModel(torch.nn.Module): 5 | def load(self, path): 6 | """Load model from file. 
7 | 8 | Args: 9 | path (str): file path 10 | """ 11 | parameters = torch.load(path, map_location=torch.device('cpu')) 12 | 13 | if "optimizer" in parameters: 14 | parameters = parameters["model"] 15 | 16 | self.load_state_dict(parameters) 17 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from annotator.uniformer.mmseg.apis import init_segmentor, inference_segmentor, show_result_pyplot 4 | from annotator.uniformer.mmseg.core.evaluation import get_palette 5 | from annotator.util import annotator_ckpts_path 6 | 7 | 8 | checkpoint_file = "https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/upernet_global_small.pth" 9 | 10 | 11 | class UniformerDetector: 12 | def __init__(self): 13 | modelpath = os.path.join(annotator_ckpts_path, "upernet_global_small.pth") 14 | if not os.path.exists(modelpath): 15 | from basicsr.utils.download_util import load_file_from_url 16 | load_file_from_url(checkpoint_file, model_dir=annotator_ckpts_path) 17 | config_file = os.path.join(os.path.dirname(annotator_ckpts_path), "uniformer", "exp", "upernet_global_small", "config.py") 18 | self.model = init_segmentor(config_file, modelpath).cuda() 19 | 20 | def __call__(self, img): 21 | result = inference_segmentor(self.model, img) 22 | res_img = show_result_pyplot(self.model, img, result, get_palette('ade'), opacity=1) 23 | return res_img 24 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/configs/_base_/datasets/pascal_voc12_aug.py: -------------------------------------------------------------------------------- 1 | _base_ = './pascal_voc12.py' 2 | # dataset settings 3 | data = dict( 4 | train=dict( 5 | ann_dir=['SegmentationClass', 'SegmentationClassAug'], 6 | split=[ 7 | 'ImageSets/Segmentation/train.txt', 8 | 'ImageSets/Segmentation/aug.txt' 9 | ])) 10 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/configs/_base_/default_runtime.py: -------------------------------------------------------------------------------- 1 | # yapf:disable 2 | log_config = dict( 3 | interval=50, 4 | hooks=[ 5 | dict(type='TextLoggerHook', by_epoch=False), 6 | # dict(type='TensorboardLoggerHook') 7 | ]) 8 | # yapf:enable 9 | dist_params = dict(backend='nccl') 10 | log_level = 'INFO' 11 | load_from = None 12 | resume_from = None 13 | workflow = [('train', 1)] 14 | cudnn_benchmark = True 15 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/configs/_base_/models/cgnet.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', eps=1e-03, requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | backbone=dict( 6 | type='CGNet', 7 | norm_cfg=norm_cfg, 8 | in_channels=3, 9 | num_channels=(32, 64, 128), 10 | num_blocks=(3, 21), 11 | dilations=(2, 4), 12 | reductions=(8, 16)), 13 | decode_head=dict( 14 | type='FCNHead', 15 | in_channels=256, 16 | in_index=2, 17 | channels=256, 18 | num_convs=0, 19 | concat_input=False, 20 | dropout_ratio=0, 21 | num_classes=19, 22 | norm_cfg=norm_cfg, 23 | loss_decode=dict( 24 | type='CrossEntropyLoss', 25 | use_sigmoid=False, 26 | loss_weight=1.0, 27 | class_weight=[ 28 | 
2.5959933, 6.7415504, 3.5354059, 9.8663225, 9.690899, 9.369352, 29 | 10.289121, 9.953208, 4.3097677, 9.490387, 7.674431, 9.396905, 30 | 10.347791, 6.3927646, 10.226669, 10.241062, 10.280587, 31 | 10.396974, 10.055647 32 | ])), 33 | # model training and testing settings 34 | train_cfg=dict(sampler=None), 35 | test_cfg=dict(mode='whole')) 36 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/configs/_base_/models/fpn_r50.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | pretrained='open-mmlab://resnet50_v1c', 6 | backbone=dict( 7 | type='ResNetV1c', 8 | depth=50, 9 | num_stages=4, 10 | out_indices=(0, 1, 2, 3), 11 | dilations=(1, 1, 1, 1), 12 | strides=(1, 2, 2, 2), 13 | norm_cfg=norm_cfg, 14 | norm_eval=False, 15 | style='pytorch', 16 | contract_dilation=True), 17 | neck=dict( 18 | type='FPN', 19 | in_channels=[256, 512, 1024, 2048], 20 | out_channels=256, 21 | num_outs=4), 22 | decode_head=dict( 23 | type='FPNHead', 24 | in_channels=[256, 256, 256, 256], 25 | in_index=[0, 1, 2, 3], 26 | feature_strides=[4, 8, 16, 32], 27 | channels=128, 28 | dropout_ratio=0.1, 29 | num_classes=19, 30 | norm_cfg=norm_cfg, 31 | align_corners=False, 32 | loss_decode=dict( 33 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 34 | # model training and testing settings 35 | train_cfg=dict(), 36 | test_cfg=dict(mode='whole')) 37 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/configs/_base_/models/fpn_uniformer.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | backbone=dict( 6 | type='UniFormer', 7 | embed_dim=[64, 128, 320, 512], 8 | layers=[3, 4, 8, 3], 9 | head_dim=64, 10 | mlp_ratio=4., 11 | qkv_bias=True, 12 | drop_rate=0., 13 | attn_drop_rate=0., 14 | drop_path_rate=0.1), 15 | neck=dict( 16 | type='FPN', 17 | in_channels=[64, 128, 320, 512], 18 | out_channels=256, 19 | num_outs=4), 20 | decode_head=dict( 21 | type='FPNHead', 22 | in_channels=[256, 256, 256, 256], 23 | in_index=[0, 1, 2, 3], 24 | feature_strides=[4, 8, 16, 32], 25 | channels=128, 26 | dropout_ratio=0.1, 27 | num_classes=150, 28 | norm_cfg=norm_cfg, 29 | align_corners=False, 30 | loss_decode=dict( 31 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 32 | # model training and testing settings 33 | train_cfg=dict(), 34 | test_cfg=dict(mode='whole') 35 | ) 36 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/configs/_base_/models/lraspp_m-v3-d8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', eps=0.001, requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | backbone=dict( 6 | type='MobileNetV3', 7 | arch='large', 8 | out_indices=(1, 3, 16), 9 | norm_cfg=norm_cfg), 10 | decode_head=dict( 11 | type='LRASPPHead', 12 | in_channels=(16, 24, 960), 13 | in_index=(0, 1, 2), 14 | channels=128, 15 | input_transform='multiple_select', 16 | dropout_ratio=0.1, 17 | num_classes=19, 18 | norm_cfg=norm_cfg, 19 | act_cfg=dict(type='ReLU'), 20 | align_corners=False, 21 | 
loss_decode=dict( 22 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 23 | # model training and testing settings 24 | train_cfg=dict(), 25 | test_cfg=dict(mode='whole')) 26 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/configs/_base_/schedules/schedule_160k.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) 3 | optimizer_config = dict() 4 | # learning policy 5 | lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) 6 | # runtime settings 7 | runner = dict(type='IterBasedRunner', max_iters=160000) 8 | checkpoint_config = dict(by_epoch=False, interval=16000) 9 | evaluation = dict(interval=16000, metric='mIoU') 10 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/configs/_base_/schedules/schedule_20k.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) 3 | optimizer_config = dict() 4 | # learning policy 5 | lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) 6 | # runtime settings 7 | runner = dict(type='IterBasedRunner', max_iters=20000) 8 | checkpoint_config = dict(by_epoch=False, interval=2000) 9 | evaluation = dict(interval=2000, metric='mIoU') 10 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/configs/_base_/schedules/schedule_40k.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) 3 | optimizer_config = dict() 4 | # learning policy 5 | lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) 6 | # runtime settings 7 | runner = dict(type='IterBasedRunner', max_iters=40000) 8 | checkpoint_config = dict(by_epoch=False, interval=4000) 9 | evaluation = dict(interval=4000, metric='mIoU') 10 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/configs/_base_/schedules/schedule_80k.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) 3 | optimizer_config = dict() 4 | # learning policy 5 | lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) 6 | # runtime settings 7 | runner = dict(type='IterBasedRunner', max_iters=80000) 8 | checkpoint_config = dict(by_epoch=False, interval=8000) 9 | evaluation = dict(interval=8000, metric='mIoU') 10 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/exp/upernet_global_small/run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | work_path=$(dirname $0) 4 | PYTHONPATH="$(dirname $0)/../../":$PYTHONPATH \ 5 | python -m torch.distributed.launch --nproc_per_node=8 \ 6 | tools/train.py ${work_path}/config.py \ 7 | --launcher pytorch \ 8 | --options model.backbone.pretrained_path='your_model_path/uniformer_small_in1k.pth' \ 9 | --work-dir ${work_path}/ckpt \ 10 | 2>&1 | tee -a ${work_path}/log.txt 11 | 
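The schedule_*.py configs above differ only in max_iters and the checkpoint/evaluation interval; all of them pair SGD (lr=0.01, momentum=0.9, weight_decay=0.0005) with a 'poly' learning-rate policy (power=0.9, min_lr=1e-4, by_epoch=False). As a rough illustration of what that policy does, here is a minimal standalone sketch of the usual poly decay rule; the exact mmcv LR hook may differ in details such as warmup handling, so treat the formula below as an assumption rather than a reference implementation.

# Minimal sketch of the 'poly' LR decay referenced by the schedule_*.py configs above.
# Assumed rule: lr = (base_lr - min_lr) * (1 - iter / max_iters) ** power + min_lr.

def poly_lr(cur_iter: int, max_iters: int, base_lr: float = 0.01,
            power: float = 0.9, min_lr: float = 1e-4) -> float:
    coeff = (1.0 - cur_iter / max_iters) ** power
    return (base_lr - min_lr) * coeff + min_lr

if __name__ == "__main__":
    # e.g. schedule_160k.py uses IterBasedRunner with max_iters=160000
    for it in (0, 40_000, 80_000, 120_000, 160_000):
        print(f"iter {it:>6}: lr = {poly_lr(it, 160_000):.6f}")

With max_iters=160000 the rate falls smoothly from 0.01 and hits the 1e-4 floor exactly at the final iteration; the larger schedules simply stretch the same curve over more iterations.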
-------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/exp/upernet_global_small/test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | work_path=$(dirname $0) 4 | PYTHONPATH="$(dirname $0)/../../":$PYTHONPATH \ 5 | python -m torch.distributed.launch --nproc_per_node=8 \ 6 | tools/test.py ${work_path}/test_config_h32.py \ 7 | ${work_path}/ckpt/latest.pth \ 8 | --launcher pytorch \ 9 | --eval mIoU \ 10 | 2>&1 | tee -a ${work_path}/log.txt 11 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | # flake8: noqa 3 | from .arraymisc import * 4 | from .fileio import * 5 | from .image import * 6 | from .utils import * 7 | from .version import * 8 | from .video import * 9 | from .visualization import * 10 | 11 | # The following modules are not imported to this level, so mmcv may be used 12 | # without PyTorch. 13 | # - runner 14 | # - parallel 15 | # - op 16 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/arraymisc/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .quantization import dequantize, quantize 3 | 4 | __all__ = ['quantize', 'dequantize'] 5 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/cnn/bricks/hsigmoid.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import torch.nn as nn 3 | 4 | from .registry import ACTIVATION_LAYERS 5 | 6 | 7 | @ACTIVATION_LAYERS.register_module() 8 | class HSigmoid(nn.Module): 9 | """Hard Sigmoid Module. Apply the hard sigmoid function: 10 | Hsigmoid(x) = min(max((x + bias) / divisor, min_value), max_value) 11 | Default: Hsigmoid(x) = min(max((x + 1) / 2, 0), 1) 12 | 13 | Args: 14 | bias (float): Bias of the input feature map. Default: 1.0. 15 | divisor (float): Divisor of the input feature map. Default: 2.0. 16 | min_value (float): Lower bound value. Default: 0.0. 17 | max_value (float): Upper bound value. Default: 1.0. 18 | 19 | Returns: 20 | Tensor: The output tensor. 21 | """ 22 | 23 | def __init__(self, bias=1.0, divisor=2.0, min_value=0.0, max_value=1.0): 24 | super(HSigmoid, self).__init__() 25 | self.bias = bias 26 | self.divisor = divisor 27 | assert self.divisor != 0 28 | self.min_value = min_value 29 | self.max_value = max_value 30 | 31 | def forward(self, x): 32 | x = (x + self.bias) / self.divisor 33 | 34 | return x.clamp_(self.min_value, self.max_value) 35 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/cnn/bricks/hswish.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import torch.nn as nn 3 | 4 | from .registry import ACTIVATION_LAYERS 5 | 6 | 7 | @ACTIVATION_LAYERS.register_module() 8 | class HSwish(nn.Module): 9 | """Hard Swish Module. 10 | 11 | This module applies the hard swish function: 12 | 13 | .. 
math:: 14 | Hswish(x) = x * ReLU6(x + 3) / 6 15 | 16 | Args: 17 | inplace (bool): can optionally do the operation in-place. 18 | Default: False. 19 | 20 | Returns: 21 | Tensor: The output tensor. 22 | """ 23 | 24 | def __init__(self, inplace=False): 25 | super(HSwish, self).__init__() 26 | self.act = nn.ReLU6(inplace) 27 | 28 | def forward(self, x): 29 | return x * self.act(x + 3) / 6 30 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/cnn/bricks/padding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import torch.nn as nn 3 | 4 | from .registry import PADDING_LAYERS 5 | 6 | PADDING_LAYERS.register_module('zero', module=nn.ZeroPad2d) 7 | PADDING_LAYERS.register_module('reflect', module=nn.ReflectionPad2d) 8 | PADDING_LAYERS.register_module('replicate', module=nn.ReplicationPad2d) 9 | 10 | 11 | def build_padding_layer(cfg, *args, **kwargs): 12 | """Build padding layer. 13 | 14 | Args: 15 | cfg (None or dict): The padding layer config, which should contain: 16 | - type (str): Layer type. 17 | - layer args: Args needed to instantiate a padding layer. 18 | 19 | Returns: 20 | nn.Module: Created padding layer. 21 | """ 22 | if not isinstance(cfg, dict): 23 | raise TypeError('cfg must be a dict') 24 | if 'type' not in cfg: 25 | raise KeyError('the cfg dict must contain the key "type"') 26 | 27 | cfg_ = cfg.copy() 28 | padding_type = cfg_.pop('type') 29 | if padding_type not in PADDING_LAYERS: 30 | raise KeyError(f'Unrecognized padding type {padding_type}.') 31 | else: 32 | padding_layer = PADDING_LAYERS.get(padding_type) 33 | 34 | layer = padding_layer(*args, **kwargs, **cfg_) 35 | 36 | return layer 37 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/cnn/bricks/registry.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from annotator.uniformer.mmcv.utils import Registry 3 | 4 | CONV_LAYERS = Registry('conv layer') 5 | NORM_LAYERS = Registry('norm layer') 6 | ACTIVATION_LAYERS = Registry('activation layer') 7 | PADDING_LAYERS = Registry('padding layer') 8 | UPSAMPLE_LAYERS = Registry('upsample layer') 9 | PLUGIN_LAYERS = Registry('plugin layer') 10 | 11 | DROPOUT_LAYERS = Registry('drop out layers') 12 | POSITIONAL_ENCODING = Registry('position encoding') 13 | ATTENTION = Registry('attention') 14 | FEEDFORWARD_NETWORK = Registry('feed-forward Network') 15 | TRANSFORMER_LAYER = Registry('transformerLayer') 16 | TRANSFORMER_LAYER_SEQUENCE = Registry('transformer-layers sequence') 17 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/cnn/bricks/scale.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import torch 3 | import torch.nn as nn 4 | 5 | 6 | class Scale(nn.Module): 7 | """A learnable scale parameter. 8 | 9 | This layer scales the input by a learnable factor. It multiplies a 10 | learnable scale parameter of shape (1,) with input of any shape. 11 | 12 | Args: 13 | scale (float): Initial value of scale factor. 
Default: 1.0 14 | """ 15 | 16 | def __init__(self, scale=1.0): 17 | super(Scale, self).__init__() 18 | self.scale = nn.Parameter(torch.tensor(scale, dtype=torch.float)) 19 | 20 | def forward(self, x): 21 | return x * self.scale 22 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/cnn/bricks/swish.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import torch 3 | import torch.nn as nn 4 | 5 | from .registry import ACTIVATION_LAYERS 6 | 7 | 8 | @ACTIVATION_LAYERS.register_module() 9 | class Swish(nn.Module): 10 | """Swish Module. 11 | 12 | This module applies the swish function: 13 | 14 | .. math:: 15 | Swish(x) = x * Sigmoid(x) 16 | 17 | Returns: 18 | Tensor: The output tensor. 19 | """ 20 | 21 | def __init__(self): 22 | super(Swish, self).__init__() 23 | 24 | def forward(self, x): 25 | return x * torch.sigmoid(x) 26 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/cnn/builder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from ..runner import Sequential 3 | from ..utils import Registry, build_from_cfg 4 | 5 | 6 | def build_model_from_cfg(cfg, registry, default_args=None): 7 | """Build a PyTorch model from config dict(s). Different from 8 | ``build_from_cfg``, if cfg is a list, a ``nn.Sequential`` will be built. 9 | 10 | Args: 11 | cfg (dict, list[dict]): The config of modules, is is either a config 12 | dict or a list of config dicts. If cfg is a list, a 13 | the built modules will be wrapped with ``nn.Sequential``. 14 | registry (:obj:`Registry`): A registry the module belongs to. 15 | default_args (dict, optional): Default arguments to build the module. 16 | Defaults to None. 17 | 18 | Returns: 19 | nn.Module: A built nn module. 20 | """ 21 | if isinstance(cfg, list): 22 | modules = [ 23 | build_from_cfg(cfg_, registry, default_args) for cfg_ in cfg 24 | ] 25 | return Sequential(*modules) 26 | else: 27 | return build_from_cfg(cfg, registry, default_args) 28 | 29 | 30 | MODELS = Registry('model', build_func=build_model_from_cfg) 31 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/cnn/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | from .flops_counter import get_model_complexity_info 3 | from .fuse_conv_bn import fuse_conv_bn 4 | from .sync_bn import revert_sync_batchnorm 5 | from .weight_init import (INITIALIZERS, Caffe2XavierInit, ConstantInit, 6 | KaimingInit, NormalInit, PretrainedInit, 7 | TruncNormalInit, UniformInit, XavierInit, 8 | bias_init_with_prob, caffe2_xavier_init, 9 | constant_init, initialize, kaiming_init, normal_init, 10 | trunc_normal_init, uniform_init, xavier_init) 11 | 12 | __all__ = [ 13 | 'get_model_complexity_info', 'bias_init_with_prob', 'caffe2_xavier_init', 14 | 'constant_init', 'kaiming_init', 'normal_init', 'trunc_normal_init', 15 | 'uniform_init', 'xavier_init', 'fuse_conv_bn', 'initialize', 16 | 'INITIALIZERS', 'ConstantInit', 'XavierInit', 'NormalInit', 17 | 'TruncNormalInit', 'UniformInit', 'KaimingInit', 'PretrainedInit', 18 | 'Caffe2XavierInit', 'revert_sync_batchnorm' 19 | ] 20 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/engine/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .test import (collect_results_cpu, collect_results_gpu, multi_gpu_test, 3 | single_gpu_test) 4 | 5 | __all__ = [ 6 | 'collect_results_cpu', 'collect_results_gpu', 'multi_gpu_test', 7 | 'single_gpu_test' 8 | ] 9 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/fileio/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .file_client import BaseStorageBackend, FileClient 3 | from .handlers import BaseFileHandler, JsonHandler, PickleHandler, YamlHandler 4 | from .io import dump, load, register_handler 5 | from .parse import dict_from_file, list_from_file 6 | 7 | __all__ = [ 8 | 'BaseStorageBackend', 'FileClient', 'load', 'dump', 'register_handler', 9 | 'BaseFileHandler', 'JsonHandler', 'PickleHandler', 'YamlHandler', 10 | 'list_from_file', 'dict_from_file' 11 | ] 12 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/fileio/handlers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .base import BaseFileHandler 3 | from .json_handler import JsonHandler 4 | from .pickle_handler import PickleHandler 5 | from .yaml_handler import YamlHandler 6 | 7 | __all__ = ['BaseFileHandler', 'JsonHandler', 'PickleHandler', 'YamlHandler'] 8 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/fileio/handlers/base.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from abc import ABCMeta, abstractmethod 3 | 4 | 5 | class BaseFileHandler(metaclass=ABCMeta): 6 | # `str_like` is a flag to indicate whether the type of file object is 7 | # str-like object or bytes-like object. Pickle only processes bytes-like 8 | # objects but json only processes str-like object. If it is str-like 9 | # object, `StringIO` will be used to process the buffer. 
10 | str_like = True 11 | 12 | @abstractmethod 13 | def load_from_fileobj(self, file, **kwargs): 14 | pass 15 | 16 | @abstractmethod 17 | def dump_to_fileobj(self, obj, file, **kwargs): 18 | pass 19 | 20 | @abstractmethod 21 | def dump_to_str(self, obj, **kwargs): 22 | pass 23 | 24 | def load_from_path(self, filepath, mode='r', **kwargs): 25 | with open(filepath, mode) as f: 26 | return self.load_from_fileobj(f, **kwargs) 27 | 28 | def dump_to_path(self, obj, filepath, mode='w', **kwargs): 29 | with open(filepath, mode) as f: 30 | self.dump_to_fileobj(obj, f, **kwargs) 31 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/fileio/handlers/json_handler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import json 3 | 4 | import numpy as np 5 | 6 | from .base import BaseFileHandler 7 | 8 | 9 | def set_default(obj): 10 | """Set default json values for non-serializable values. 11 | 12 | It helps convert ``set``, ``range`` and ``np.ndarray`` data types to list. 13 | It also converts ``np.generic`` (including ``np.int32``, ``np.float32``, 14 | etc.) into plain numbers of plain python built-in types. 15 | """ 16 | if isinstance(obj, (set, range)): 17 | return list(obj) 18 | elif isinstance(obj, np.ndarray): 19 | return obj.tolist() 20 | elif isinstance(obj, np.generic): 21 | return obj.item() 22 | raise TypeError(f'{type(obj)} is unsupported for json dump') 23 | 24 | 25 | class JsonHandler(BaseFileHandler): 26 | 27 | def load_from_fileobj(self, file): 28 | return json.load(file) 29 | 30 | def dump_to_fileobj(self, obj, file, **kwargs): 31 | kwargs.setdefault('default', set_default) 32 | json.dump(obj, file, **kwargs) 33 | 34 | def dump_to_str(self, obj, **kwargs): 35 | kwargs.setdefault('default', set_default) 36 | return json.dumps(obj, **kwargs) 37 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/fileio/handlers/pickle_handler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import pickle 3 | 4 | from .base import BaseFileHandler 5 | 6 | 7 | class PickleHandler(BaseFileHandler): 8 | 9 | str_like = False 10 | 11 | def load_from_fileobj(self, file, **kwargs): 12 | return pickle.load(file, **kwargs) 13 | 14 | def load_from_path(self, filepath, **kwargs): 15 | return super(PickleHandler, self).load_from_path( 16 | filepath, mode='rb', **kwargs) 17 | 18 | def dump_to_str(self, obj, **kwargs): 19 | kwargs.setdefault('protocol', 2) 20 | return pickle.dumps(obj, **kwargs) 21 | 22 | def dump_to_fileobj(self, obj, file, **kwargs): 23 | kwargs.setdefault('protocol', 2) 24 | pickle.dump(obj, file, **kwargs) 25 | 26 | def dump_to_path(self, obj, filepath, **kwargs): 27 | super(PickleHandler, self).dump_to_path( 28 | obj, filepath, mode='wb', **kwargs) 29 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/fileio/handlers/yaml_handler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | import yaml 3 | 4 | try: 5 | from yaml import CLoader as Loader, CDumper as Dumper 6 | except ImportError: 7 | from yaml import Loader, Dumper 8 | 9 | from .base import BaseFileHandler # isort:skip 10 | 11 | 12 | class YamlHandler(BaseFileHandler): 13 | 14 | def load_from_fileobj(self, file, **kwargs): 15 | kwargs.setdefault('Loader', Loader) 16 | return yaml.load(file, **kwargs) 17 | 18 | def dump_to_fileobj(self, obj, file, **kwargs): 19 | kwargs.setdefault('Dumper', Dumper) 20 | yaml.dump(obj, file, **kwargs) 21 | 22 | def dump_to_str(self, obj, **kwargs): 23 | kwargs.setdefault('Dumper', Dumper) 24 | return yaml.dump(obj, **kwargs) 25 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/model_zoo/deprecated.json: -------------------------------------------------------------------------------- 1 | { 2 | "resnet50_caffe": "detectron/resnet50_caffe", 3 | "resnet50_caffe_bgr": "detectron2/resnet50_caffe_bgr", 4 | "resnet101_caffe": "detectron/resnet101_caffe", 5 | "resnet101_caffe_bgr": "detectron2/resnet101_caffe_bgr" 6 | } 7 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/ops/info.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import glob 3 | import os 4 | 5 | import torch 6 | 7 | if torch.__version__ == 'parrots': 8 | import parrots 9 | 10 | def get_compiler_version(): 11 | return 'GCC ' + parrots.version.compiler 12 | 13 | def get_compiling_cuda_version(): 14 | return parrots.version.cuda 15 | else: 16 | from ..utils import ext_loader 17 | ext_module = ext_loader.load_ext( 18 | '_ext', ['get_compiler_version', 'get_compiling_cuda_version']) 19 | 20 | def get_compiler_version(): 21 | return ext_module.get_compiler_version() 22 | 23 | def get_compiling_cuda_version(): 24 | return ext_module.get_compiling_cuda_version() 25 | 26 | 27 | def get_onnxruntime_op_path(): 28 | wildcard = os.path.join( 29 | os.path.abspath(os.path.dirname(os.path.dirname(__file__))), 30 | '_ext_ort.*.so') 31 | 32 | paths = glob.glob(wildcard) 33 | if len(paths) > 0: 34 | return paths[0] 35 | else: 36 | return '' 37 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/parallel/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .collate import collate 3 | from .data_container import DataContainer 4 | from .data_parallel import MMDataParallel 5 | from .distributed import MMDistributedDataParallel 6 | from .registry import MODULE_WRAPPERS 7 | from .scatter_gather import scatter, scatter_kwargs 8 | from .utils import is_module_wrapper 9 | 10 | __all__ = [ 11 | 'collate', 'DataContainer', 'MMDataParallel', 'MMDistributedDataParallel', 12 | 'scatter', 'scatter_kwargs', 'is_module_wrapper', 'MODULE_WRAPPERS' 13 | ] 14 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/parallel/registry.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | from torch.nn.parallel import DataParallel, DistributedDataParallel 3 | 4 | from annotator.uniformer.mmcv.utils import Registry 5 | 6 | MODULE_WRAPPERS = Registry('module wrapper') 7 | MODULE_WRAPPERS.register_module(module=DataParallel) 8 | MODULE_WRAPPERS.register_module(module=DistributedDataParallel) 9 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/parallel/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .registry import MODULE_WRAPPERS 3 | 4 | 5 | def is_module_wrapper(module): 6 | """Check if a module is a module wrapper. 7 | 8 | The following 3 modules in MMCV (and their subclasses) are regarded as 9 | module wrappers: DataParallel, DistributedDataParallel, 10 | MMDistributedDataParallel (the deprecated version). You may add you own 11 | module wrapper by registering it to mmcv.parallel.MODULE_WRAPPERS. 12 | 13 | Args: 14 | module (nn.Module): The module to be checked. 15 | 16 | Returns: 17 | bool: True if the input module is a module wrapper. 18 | """ 19 | module_wrappers = tuple(MODULE_WRAPPERS.module_dict.values()) 20 | return isinstance(module, module_wrappers) 21 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/runner/builder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import copy 3 | 4 | from ..utils import Registry 5 | 6 | RUNNERS = Registry('runner') 7 | RUNNER_BUILDERS = Registry('runner builder') 8 | 9 | 10 | def build_runner_constructor(cfg): 11 | return RUNNER_BUILDERS.build(cfg) 12 | 13 | 14 | def build_runner(cfg, default_args=None): 15 | runner_cfg = copy.deepcopy(cfg) 16 | constructor_type = runner_cfg.pop('constructor', 17 | 'DefaultRunnerConstructor') 18 | runner_constructor = build_runner_constructor( 19 | dict( 20 | type=constructor_type, 21 | runner_cfg=runner_cfg, 22 | default_args=default_args)) 23 | runner = runner_constructor() 24 | return runner 25 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/runner/hooks/closure.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .hook import HOOKS, Hook 3 | 4 | 5 | @HOOKS.register_module() 6 | class ClosureHook(Hook): 7 | 8 | def __init__(self, fn_name, fn): 9 | assert hasattr(self, fn_name) 10 | assert callable(fn) 11 | setattr(self, fn_name, fn) 12 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/runner/hooks/iter_timer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | import time 3 | 4 | from .hook import HOOKS, Hook 5 | 6 | 7 | @HOOKS.register_module() 8 | class IterTimerHook(Hook): 9 | 10 | def before_epoch(self, runner): 11 | self.t = time.time() 12 | 13 | def before_iter(self, runner): 14 | runner.log_buffer.update({'data_time': time.time() - self.t}) 15 | 16 | def after_iter(self, runner): 17 | runner.log_buffer.update({'time': time.time() - self.t}) 18 | self.t = time.time() 19 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/runner/hooks/logger/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .base import LoggerHook 3 | from .dvclive import DvcliveLoggerHook 4 | from .mlflow import MlflowLoggerHook 5 | from .neptune import NeptuneLoggerHook 6 | from .pavi import PaviLoggerHook 7 | from .tensorboard import TensorboardLoggerHook 8 | from .text import TextLoggerHook 9 | from .wandb import WandbLoggerHook 10 | 11 | __all__ = [ 12 | 'LoggerHook', 'MlflowLoggerHook', 'PaviLoggerHook', 13 | 'TensorboardLoggerHook', 'TextLoggerHook', 'WandbLoggerHook', 14 | 'NeptuneLoggerHook', 'DvcliveLoggerHook' 15 | ] 16 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/runner/hooks/memory.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import torch 3 | 4 | from .hook import HOOKS, Hook 5 | 6 | 7 | @HOOKS.register_module() 8 | class EmptyCacheHook(Hook): 9 | 10 | def __init__(self, before_epoch=False, after_epoch=True, after_iter=False): 11 | self._before_epoch = before_epoch 12 | self._after_epoch = after_epoch 13 | self._after_iter = after_iter 14 | 15 | def after_iter(self, runner): 16 | if self._after_iter: 17 | torch.cuda.empty_cache() 18 | 19 | def before_epoch(self, runner): 20 | if self._before_epoch: 21 | torch.cuda.empty_cache() 22 | 23 | def after_epoch(self, runner): 24 | if self._after_epoch: 25 | torch.cuda.empty_cache() 26 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/runner/hooks/sampler_seed.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .hook import HOOKS, Hook 3 | 4 | 5 | @HOOKS.register_module() 6 | class DistSamplerSeedHook(Hook): 7 | """Data-loading sampler for distributed training. 8 | 9 | When distributed training, it is only useful in conjunction with 10 | :obj:`EpochBasedRunner`, while :obj:`IterBasedRunner` achieves the same 11 | purpose with :obj:`IterLoader`. 12 | """ 13 | 14 | def before_epoch(self, runner): 15 | if hasattr(runner.data_loader.sampler, 'set_epoch'): 16 | # in case the data loader uses `SequentialSampler` in Pytorch 17 | runner.data_loader.sampler.set_epoch(runner.epoch) 18 | elif hasattr(runner.data_loader.batch_sampler.sampler, 'set_epoch'): 19 | # batch sampler in pytorch warps the sampler as its attributes. 20 | runner.data_loader.batch_sampler.sampler.set_epoch(runner.epoch) 21 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/runner/hooks/sync_buffer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | from ..dist_utils import allreduce_params 3 | from .hook import HOOKS, Hook 4 | 5 | 6 | @HOOKS.register_module() 7 | class SyncBuffersHook(Hook): 8 | """Synchronize model buffers such as running_mean and running_var in BN at 9 | the end of each epoch. 10 | 11 | Args: 12 | distributed (bool): Whether distributed training is used. It is 13 | effective only for distributed training. Defaults to True. 14 | """ 15 | 16 | def __init__(self, distributed=True): 17 | self.distributed = distributed 18 | 19 | def after_epoch(self, runner): 20 | """All-reduce model buffers at the end of each epoch.""" 21 | if self.distributed: 22 | allreduce_params(runner.model.buffers()) 23 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/runner/optimizer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .builder import (OPTIMIZER_BUILDERS, OPTIMIZERS, build_optimizer, 3 | build_optimizer_constructor) 4 | from .default_constructor import DefaultOptimizerConstructor 5 | 6 | __all__ = [ 7 | 'OPTIMIZER_BUILDERS', 'OPTIMIZERS', 'DefaultOptimizerConstructor', 8 | 'build_optimizer', 'build_optimizer_constructor' 9 | ] 10 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/utils/parrots_jit.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import os 3 | 4 | from .parrots_wrapper import TORCH_VERSION 5 | 6 | parrots_jit_option = os.getenv('PARROTS_JIT_OPTION') 7 | 8 | if TORCH_VERSION == 'parrots' and parrots_jit_option == 'ON': 9 | from parrots.jit import pat as jit 10 | else: 11 | 12 | def jit(func=None, 13 | check_input=None, 14 | full_shape=True, 15 | derivate=False, 16 | coderize=False, 17 | optimize=False): 18 | 19 | def wrapper(func): 20 | 21 | def wrapper_inner(*args, **kargs): 22 | return func(*args, **kargs) 23 | 24 | return wrapper_inner 25 | 26 | if func is None: 27 | return wrapper 28 | else: 29 | return func 30 | 31 | 32 | if TORCH_VERSION == 'parrots': 33 | from parrots.utils.tester import skip_no_elena 34 | else: 35 | 36 | def skip_no_elena(func): 37 | 38 | def wrapper(*args, **kargs): 39 | return func(*args, **kargs) 40 | 41 | return wrapper 42 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/utils/trace.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | import torch 4 | 5 | from annotator.uniformer.mmcv.utils import digit_version 6 | 7 | 8 | def is_jit_tracing() -> bool: 9 | if (torch.__version__ != 'parrots' 10 | and digit_version(torch.__version__) >= digit_version('1.6.0')): 11 | on_trace = torch.jit.is_tracing() 12 | # In PyTorch 1.6, torch.jit.is_tracing has a bug. 13 | # Refers to https://github.com/pytorch/pytorch/issues/42448 14 | if isinstance(on_trace, bool): 15 | return on_trace 16 | else: 17 | return torch._C._is_tracing() 18 | else: 19 | warnings.warn( 20 | 'torch.jit.is_tracing is only supported after v1.6.0. ' 21 | 'Therefore is_tracing returns False automatically. 
Please ' 22 | 'set on_trace manually if you are using trace.', UserWarning) 23 | return False 24 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/video/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .io import Cache, VideoReader, frames2video 3 | from .optflow import (dequantize_flow, flow_from_bytes, flow_warp, flowread, 4 | flowwrite, quantize_flow, sparse_flow_from_bytes) 5 | from .processing import concat_video, convert_video, cut_video, resize_video 6 | 7 | __all__ = [ 8 | 'Cache', 'VideoReader', 'frames2video', 'convert_video', 'resize_video', 9 | 'cut_video', 'concat_video', 'flowread', 'flowwrite', 'quantize_flow', 10 | 'dequantize_flow', 'flow_warp', 'flow_from_bytes', 'sparse_flow_from_bytes' 11 | ] 12 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/visualization/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .color import Color, color_val 3 | from .image import imshow, imshow_bboxes, imshow_det_bboxes 4 | from .optflow import flow2rgb, flowshow, make_color_wheel 5 | 6 | __all__ = [ 7 | 'Color', 'color_val', 'imshow', 'imshow_bboxes', 'imshow_det_bboxes', 8 | 'flowshow', 'flow2rgb', 'make_color_wheel' 9 | ] 10 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv_custom/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from .checkpoint import load_checkpoint 4 | 5 | __all__ = ['load_checkpoint'] -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmseg/apis/__init__.py: -------------------------------------------------------------------------------- 1 | from .inference import inference_segmentor, init_segmentor, show_result_pyplot 2 | from .test import multi_gpu_test, single_gpu_test 3 | from .train import get_root_logger, set_random_seed, train_segmentor 4 | 5 | __all__ = [ 6 | 'get_root_logger', 'set_random_seed', 'train_segmentor', 'init_segmentor', 7 | 'inference_segmentor', 'multi_gpu_test', 'single_gpu_test', 8 | 'show_result_pyplot' 9 | ] 10 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmseg/core/__init__.py: -------------------------------------------------------------------------------- 1 | from .evaluation import * # noqa: F401, F403 2 | from .seg import * # noqa: F401, F403 3 | from .utils import * # noqa: F401, F403 4 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmseg/core/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from .class_names import get_classes, get_palette 2 | from .eval_hooks import DistEvalHook, EvalHook 3 | from .metrics import eval_metrics, mean_dice, mean_fscore, mean_iou 4 | 5 | __all__ = [ 6 | 'EvalHook', 'DistEvalHook', 'mean_dice', 'mean_iou', 'mean_fscore', 7 | 'eval_metrics', 'get_classes', 'get_palette' 8 | ] 9 | -------------------------------------------------------------------------------- 
/experiments/lavis/common/annotator/uniformer/mmseg/core/seg/__init__.py: -------------------------------------------------------------------------------- 1 | from .builder import build_pixel_sampler 2 | from .sampler import BasePixelSampler, OHEMPixelSampler 3 | 4 | __all__ = ['build_pixel_sampler', 'BasePixelSampler', 'OHEMPixelSampler'] 5 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmseg/core/seg/builder.py: -------------------------------------------------------------------------------- 1 | from annotator.uniformer.mmcv.utils import Registry, build_from_cfg 2 | 3 | PIXEL_SAMPLERS = Registry('pixel sampler') 4 | 5 | 6 | def build_pixel_sampler(cfg, **default_args): 7 | """Build pixel sampler for segmentation map.""" 8 | return build_from_cfg(cfg, PIXEL_SAMPLERS, default_args) 9 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmseg/core/seg/sampler/__init__.py: -------------------------------------------------------------------------------- 1 | from .base_pixel_sampler import BasePixelSampler 2 | from .ohem_pixel_sampler import OHEMPixelSampler 3 | 4 | __all__ = ['BasePixelSampler', 'OHEMPixelSampler'] 5 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmseg/core/seg/sampler/base_pixel_sampler.py: -------------------------------------------------------------------------------- 1 | from abc import ABCMeta, abstractmethod 2 | 3 | 4 | class BasePixelSampler(metaclass=ABCMeta): 5 | """Base class of pixel sampler.""" 6 | 7 | def __init__(self, **kwargs): 8 | pass 9 | 10 | @abstractmethod 11 | def sample(self, seg_logit, seg_label): 12 | """Placeholder for sample function.""" 13 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmseg/core/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .misc import add_prefix 2 | 3 | __all__ = ['add_prefix'] 4 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmseg/core/utils/misc.py: -------------------------------------------------------------------------------- 1 | def add_prefix(inputs, prefix): 2 | """Add prefix for dict. 3 | 4 | Args: 5 | inputs (dict): The input dict with str keys. 6 | prefix (str): The prefix to add. 7 | 8 | Returns: 9 | 10 | dict: The dict with keys updated with ``prefix``. 
11 | """ 12 | 13 | outputs = dict() 14 | for name, value in inputs.items(): 15 | outputs[f'{prefix}.{name}'] = value 16 | 17 | return outputs 18 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmseg/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .ade import ADE20KDataset 2 | from .builder import DATASETS, PIPELINES, build_dataloader, build_dataset 3 | from .chase_db1 import ChaseDB1Dataset 4 | from .cityscapes import CityscapesDataset 5 | from .custom import CustomDataset 6 | from .dataset_wrappers import ConcatDataset, RepeatDataset 7 | from .drive import DRIVEDataset 8 | from .hrf import HRFDataset 9 | from .pascal_context import PascalContextDataset, PascalContextDataset59 10 | from .stare import STAREDataset 11 | from .voc import PascalVOCDataset 12 | 13 | __all__ = [ 14 | 'CustomDataset', 'build_dataloader', 'ConcatDataset', 'RepeatDataset', 15 | 'DATASETS', 'build_dataset', 'PIPELINES', 'CityscapesDataset', 16 | 'PascalVOCDataset', 'ADE20KDataset', 'PascalContextDataset', 17 | 'PascalContextDataset59', 'ChaseDB1Dataset', 'DRIVEDataset', 'HRFDataset', 18 | 'STAREDataset' 19 | ] 20 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmseg/datasets/chase_db1.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | 3 | from .builder import DATASETS 4 | from .custom import CustomDataset 5 | 6 | 7 | @DATASETS.register_module() 8 | class ChaseDB1Dataset(CustomDataset): 9 | """Chase_db1 dataset. 10 | 11 | In segmentation map annotation for Chase_db1, 0 stands for background, 12 | which is included in 2 categories. ``reduce_zero_label`` is fixed to False. 13 | The ``img_suffix`` is fixed to '.png' and ``seg_map_suffix`` is fixed to 14 | '_1stHO.png'. 15 | """ 16 | 17 | CLASSES = ('background', 'vessel') 18 | 19 | PALETTE = [[120, 120, 120], [6, 230, 230]] 20 | 21 | def __init__(self, **kwargs): 22 | super(ChaseDB1Dataset, self).__init__( 23 | img_suffix='.png', 24 | seg_map_suffix='_1stHO.png', 25 | reduce_zero_label=False, 26 | **kwargs) 27 | assert osp.exists(self.img_dir) 28 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmseg/datasets/drive.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | 3 | from .builder import DATASETS 4 | from .custom import CustomDataset 5 | 6 | 7 | @DATASETS.register_module() 8 | class DRIVEDataset(CustomDataset): 9 | """DRIVE dataset. 10 | 11 | In segmentation map annotation for DRIVE, 0 stands for background, which is 12 | included in 2 categories. ``reduce_zero_label`` is fixed to False. The 13 | ``img_suffix`` is fixed to '.png' and ``seg_map_suffix`` is fixed to 14 | '_manual1.png'. 
15 | """ 16 | 17 | CLASSES = ('background', 'vessel') 18 | 19 | PALETTE = [[120, 120, 120], [6, 230, 230]] 20 | 21 | def __init__(self, **kwargs): 22 | super(DRIVEDataset, self).__init__( 23 | img_suffix='.png', 24 | seg_map_suffix='_manual1.png', 25 | reduce_zero_label=False, 26 | **kwargs) 27 | assert osp.exists(self.img_dir) 28 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmseg/datasets/hrf.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | 3 | from .builder import DATASETS 4 | from .custom import CustomDataset 5 | 6 | 7 | @DATASETS.register_module() 8 | class HRFDataset(CustomDataset): 9 | """HRF dataset. 10 | 11 | In segmentation map annotation for HRF, 0 stands for background, which is 12 | included in 2 categories. ``reduce_zero_label`` is fixed to False. The 13 | ``img_suffix`` is fixed to '.png' and ``seg_map_suffix`` is fixed to 14 | '.png'. 15 | """ 16 | 17 | CLASSES = ('background', 'vessel') 18 | 19 | PALETTE = [[120, 120, 120], [6, 230, 230]] 20 | 21 | def __init__(self, **kwargs): 22 | super(HRFDataset, self).__init__( 23 | img_suffix='.png', 24 | seg_map_suffix='.png', 25 | reduce_zero_label=False, 26 | **kwargs) 27 | assert osp.exists(self.img_dir) 28 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmseg/datasets/pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | from .compose import Compose 2 | from .formating import (Collect, ImageToTensor, ToDataContainer, ToTensor, 3 | Transpose, to_tensor) 4 | from .loading import LoadAnnotations, LoadImageFromFile 5 | from .test_time_aug import MultiScaleFlipAug 6 | from .transforms import (CLAHE, AdjustGamma, Normalize, Pad, 7 | PhotoMetricDistortion, RandomCrop, RandomFlip, 8 | RandomRotate, Rerange, Resize, RGB2Gray, SegRescale) 9 | 10 | __all__ = [ 11 | 'Compose', 'to_tensor', 'ToTensor', 'ImageToTensor', 'ToDataContainer', 12 | 'Transpose', 'Collect', 'LoadAnnotations', 'LoadImageFromFile', 13 | 'MultiScaleFlipAug', 'Resize', 'RandomFlip', 'Pad', 'RandomCrop', 14 | 'Normalize', 'SegRescale', 'PhotoMetricDistortion', 'RandomRotate', 15 | 'AdjustGamma', 'CLAHE', 'Rerange', 'RGB2Gray' 16 | ] 17 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmseg/datasets/stare.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | 3 | from .builder import DATASETS 4 | from .custom import CustomDataset 5 | 6 | 7 | @DATASETS.register_module() 8 | class STAREDataset(CustomDataset): 9 | """STARE dataset. 10 | 11 | In segmentation map annotation for STARE, 0 stands for background, which is 12 | included in 2 categories. ``reduce_zero_label`` is fixed to False. The 13 | ``img_suffix`` is fixed to '.png' and ``seg_map_suffix`` is fixed to 14 | '.ah.png'. 
15 | """ 16 | 17 | CLASSES = ('background', 'vessel') 18 | 19 | PALETTE = [[120, 120, 120], [6, 230, 230]] 20 | 21 | def __init__(self, **kwargs): 22 | super(STAREDataset, self).__init__( 23 | img_suffix='.png', 24 | seg_map_suffix='.ah.png', 25 | reduce_zero_label=False, 26 | **kwargs) 27 | assert osp.exists(self.img_dir) 28 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmseg/datasets/voc.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | 3 | from .builder import DATASETS 4 | from .custom import CustomDataset 5 | 6 | 7 | @DATASETS.register_module() 8 | class PascalVOCDataset(CustomDataset): 9 | """Pascal VOC dataset. 10 | 11 | Args: 12 | split (str): Split txt file for Pascal VOC. 13 | """ 14 | 15 | CLASSES = ('background', 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 16 | 'bus', 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 17 | 'horse', 'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 18 | 'train', 'tvmonitor') 19 | 20 | PALETTE = [[0, 0, 0], [128, 0, 0], [0, 128, 0], [128, 128, 0], [0, 0, 128], 21 | [128, 0, 128], [0, 128, 128], [128, 128, 128], [64, 0, 0], 22 | [192, 0, 0], [64, 128, 0], [192, 128, 0], [64, 0, 128], 23 | [192, 0, 128], [64, 128, 128], [192, 128, 128], [0, 64, 0], 24 | [128, 64, 0], [0, 192, 0], [128, 192, 0], [0, 64, 128]] 25 | 26 | def __init__(self, split, **kwargs): 27 | super(PascalVOCDataset, self).__init__( 28 | img_suffix='.jpg', seg_map_suffix='.png', split=split, **kwargs) 29 | assert osp.exists(self.img_dir) and self.split is not None 30 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmseg/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .backbones import * # noqa: F401,F403 2 | from .builder import (BACKBONES, HEADS, LOSSES, SEGMENTORS, build_backbone, 3 | build_head, build_loss, build_segmentor) 4 | from .decode_heads import * # noqa: F401,F403 5 | from .losses import * # noqa: F401,F403 6 | from .necks import * # noqa: F401,F403 7 | from .segmentors import * # noqa: F401,F403 8 | 9 | __all__ = [ 10 | 'BACKBONES', 'HEADS', 'LOSSES', 'SEGMENTORS', 'build_backbone', 11 | 'build_head', 'build_loss', 'build_segmentor' 12 | ] 13 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmseg/models/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | from .cgnet import CGNet 2 | # from .fast_scnn import FastSCNN 3 | from .hrnet import HRNet 4 | from .mobilenet_v2 import MobileNetV2 5 | from .mobilenet_v3 import MobileNetV3 6 | from .resnest import ResNeSt 7 | from .resnet import ResNet, ResNetV1c, ResNetV1d 8 | from .resnext import ResNeXt 9 | from .unet import UNet 10 | from .vit import VisionTransformer 11 | from .uniformer import UniFormer 12 | 13 | __all__ = [ 14 | 'ResNet', 'ResNetV1c', 'ResNetV1d', 'ResNeXt', 'HRNet', 15 | 'ResNeSt', 'MobileNetV2', 'UNet', 'CGNet', 'MobileNetV3', 16 | 'VisionTransformer', 'UniFormer' 17 | ] 18 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmseg/models/decode_heads/__init__.py: -------------------------------------------------------------------------------- 1 | from .ann_head import ANNHead 2 | from .apc_head import 
APCHead 3 | from .aspp_head import ASPPHead 4 | from .cc_head import CCHead 5 | from .da_head import DAHead 6 | from .dm_head import DMHead 7 | from .dnl_head import DNLHead 8 | from .ema_head import EMAHead 9 | from .enc_head import EncHead 10 | from .fcn_head import FCNHead 11 | from .fpn_head import FPNHead 12 | from .gc_head import GCHead 13 | from .lraspp_head import LRASPPHead 14 | from .nl_head import NLHead 15 | from .ocr_head import OCRHead 16 | # from .point_head import PointHead 17 | from .psa_head import PSAHead 18 | from .psp_head import PSPHead 19 | from .sep_aspp_head import DepthwiseSeparableASPPHead 20 | from .sep_fcn_head import DepthwiseSeparableFCNHead 21 | from .uper_head import UPerHead 22 | 23 | __all__ = [ 24 | 'FCNHead', 'PSPHead', 'ASPPHead', 'PSAHead', 'NLHead', 'GCHead', 'CCHead', 25 | 'UPerHead', 'DepthwiseSeparableASPPHead', 'ANNHead', 'DAHead', 'OCRHead', 26 | 'EncHead', 'DepthwiseSeparableFCNHead', 'FPNHead', 'EMAHead', 'DNLHead', 27 | 'APCHead', 'DMHead', 'LRASPPHead' 28 | ] 29 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmseg/models/losses/__init__.py: -------------------------------------------------------------------------------- 1 | from .accuracy import Accuracy, accuracy 2 | from .cross_entropy_loss import (CrossEntropyLoss, binary_cross_entropy, 3 | cross_entropy, mask_cross_entropy) 4 | from .dice_loss import DiceLoss 5 | from .lovasz_loss import LovaszLoss 6 | from .utils import reduce_loss, weight_reduce_loss, weighted_loss 7 | 8 | __all__ = [ 9 | 'accuracy', 'Accuracy', 'cross_entropy', 'binary_cross_entropy', 10 | 'mask_cross_entropy', 'CrossEntropyLoss', 'reduce_loss', 11 | 'weight_reduce_loss', 'weighted_loss', 'LovaszLoss', 'DiceLoss' 12 | ] 13 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmseg/models/necks/__init__.py: -------------------------------------------------------------------------------- 1 | from .fpn import FPN 2 | from .multilevel_neck import MultiLevelNeck 3 | 4 | __all__ = ['FPN', 'MultiLevelNeck'] 5 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmseg/models/segmentors/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import BaseSegmentor 2 | from .cascade_encoder_decoder import CascadeEncoderDecoder 3 | from .encoder_decoder import EncoderDecoder 4 | 5 | __all__ = ['BaseSegmentor', 'EncoderDecoder', 'CascadeEncoderDecoder'] 6 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmseg/models/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .drop import DropPath 2 | from .inverted_residual import InvertedResidual, InvertedResidualV3 3 | from .make_divisible import make_divisible 4 | from .res_layer import ResLayer 5 | from .se_layer import SELayer 6 | from .self_attention_block import SelfAttentionBlock 7 | from .up_conv_block import UpConvBlock 8 | from .weight_init import trunc_normal_ 9 | 10 | __all__ = [ 11 | 'ResLayer', 'SelfAttentionBlock', 'make_divisible', 'InvertedResidual', 12 | 'UpConvBlock', 'InvertedResidualV3', 'SELayer', 'DropPath', 'trunc_normal_' 13 | ] 14 | -------------------------------------------------------------------------------- 
/experiments/lavis/common/annotator/uniformer/mmseg/models/utils/drop.py: -------------------------------------------------------------------------------- 1 | """Modified from https://github.com/rwightman/pytorch-image- 2 | models/blob/master/timm/models/layers/drop.py.""" 3 | 4 | import torch 5 | from torch import nn 6 | 7 | 8 | class DropPath(nn.Module): 9 | """Drop paths (Stochastic Depth) per sample (when applied in main path of 10 | residual blocks). 11 | 12 | Args: 13 | drop_prob (float): Drop rate for paths of model. Dropout rate has 14 | to be between 0 and 1. Default: 0. 15 | """ 16 | 17 | def __init__(self, drop_prob=0.): 18 | super(DropPath, self).__init__() 19 | self.drop_prob = drop_prob 20 | self.keep_prob = 1 - drop_prob 21 | 22 | def forward(self, x): 23 | if self.drop_prob == 0. or not self.training: 24 | return x 25 | shape = (x.shape[0], ) + (1, ) * ( 26 | x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets 27 | random_tensor = self.keep_prob + torch.rand( 28 | shape, dtype=x.dtype, device=x.device) 29 | random_tensor.floor_() # binarize 30 | output = x.div(self.keep_prob) * random_tensor 31 | return output 32 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmseg/ops/__init__.py: -------------------------------------------------------------------------------- 1 | from .encoding import Encoding 2 | from .wrappers import Upsample, resize 3 | 4 | __all__ = ['Upsample', 'resize', 'Encoding'] 5 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmseg/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .collect_env import collect_env 2 | from .logger import get_root_logger 3 | 4 | __all__ = ['get_root_logger', 'collect_env'] 5 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmseg/utils/collect_env.py: -------------------------------------------------------------------------------- 1 | from annotator.uniformer.mmcv.utils import collect_env as collect_base_env 2 | from annotator.uniformer.mmcv.utils import get_git_hash 3 | 4 | import annotator.uniformer.mmseg as mmseg 5 | 6 | 7 | def collect_env(): 8 | """Collect the information of the running environments.""" 9 | env_info = collect_base_env() 10 | env_info['MMSegmentation'] = f'{mmseg.__version__}+{get_git_hash()[:7]}' 11 | 12 | return env_info 13 | 14 | 15 | if __name__ == '__main__': 16 | for name, val in collect_env().items(): 17 | print('{}: {}'.format(name, val)) 18 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmseg/utils/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from annotator.uniformer.mmcv.utils import get_logger 4 | 5 | 6 | def get_root_logger(log_file=None, log_level=logging.INFO): 7 | """Get the root logger. 8 | 9 | The logger will be initialized if it has not been initialized. By default a 10 | StreamHandler will be added. If `log_file` is specified, a FileHandler will 11 | also be added. The name of the root logger is the top-level package name, 12 | e.g., "mmseg". 13 | 14 | Args: 15 | log_file (str | None): The log filename. If specified, a FileHandler 16 | will be added to the root logger. 17 | log_level (int): The root logger level. 
Note that only the process of 18 | rank 0 is affected, while other processes will set the level to 19 | "Error" and be silent most of the time. 20 | 21 | Returns: 22 | logging.Logger: The root logger. 23 | """ 24 | 25 | logger = get_logger(name='mmseg', log_file=log_file, log_level=log_level) 26 | 27 | return logger 28 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cv2 3 | import os 4 | 5 | 6 | annotator_ckpts_path = os.path.join(os.path.dirname(__file__), 'ckpts') 7 | 8 | 9 | def HWC3(x): 10 | assert x.dtype == np.uint8 11 | if x.ndim == 2: 12 | x = x[:, :, None] 13 | assert x.ndim == 3 14 | H, W, C = x.shape 15 | assert C == 1 or C == 3 or C == 4 16 | if C == 3: 17 | return x 18 | if C == 1: 19 | return np.concatenate([x, x, x], axis=2) 20 | if C == 4: 21 | color = x[:, :, 0:3].astype(np.float32) 22 | alpha = x[:, :, 3:4].astype(np.float32) / 255.0 23 | y = color * alpha + 255.0 * (1.0 - alpha) 24 | y = y.clip(0, 255).astype(np.uint8) 25 | return y 26 | 27 | 28 | def resize_image(input_image, resolution): 29 | H, W, C = input_image.shape 30 | H = float(H) 31 | W = float(W) 32 | k = float(resolution) / min(H, W) 33 | H *= k 34 | W *= k 35 | H = int(np.round(H / 64.0)) * 64 36 | W = int(np.round(W / 64.0)) * 64 37 | img = cv2.resize(input_image, (W, H), interpolation=cv2.INTER_LANCZOS4 if k > 1 else cv2.INTER_AREA) 38 | return img 39 | -------------------------------------------------------------------------------- /experiments/lavis/common/gradcam.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from matplotlib import pyplot as plt 3 | from scipy.ndimage import filters 4 | from skimage import transform as skimage_transform 5 | 6 | 7 | def getAttMap(img, attMap, blur=True, overlap=True): 8 | attMap -= attMap.min() 9 | if attMap.max() > 0: 10 | attMap /= attMap.max() 11 | attMap = skimage_transform.resize(attMap, (img.shape[:2]), order=3, mode="constant") 12 | if blur: 13 | attMap = filters.gaussian_filter(attMap, 0.02 * max(img.shape[:2])) 14 | attMap -= attMap.min() 15 | attMap /= attMap.max() 16 | cmap = plt.get_cmap("jet") 17 | attMapV = cmap(attMap) 18 | attMapV = np.delete(attMapV, 3, 2) 19 | if overlap: 20 | attMap = ( 21 | 1 * (1 - attMap**0.7).reshape(attMap.shape + (1,)) * img 22 | + (attMap**0.7).reshape(attMap.shape + (1,)) * attMapV 23 | ) 24 | return attMap 25 | -------------------------------------------------------------------------------- /experiments/lavis/common/vqa_tools/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | __author__ = "aagrawal" 9 | -------------------------------------------------------------------------------- /experiments/lavis/configs/datasets/avsd/defaults_dial.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | avsd_dialogue: # name of the dataset builder 8 | dataset_card: dataset_card/avsd_dialogue.md 9 | data_type: features #extracted features of videos (I3D, VGGish) # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/avsd_dstc7_train.json 16 | storage: avsd/annotations/train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/avsd_dstc7_val.json 19 | storage: avsd/annotations/val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/avsd_dstc7_test.json 22 | storage: avsd/annotations/test.json 23 | features: 24 | storage: avsd/features/ 25 | -------------------------------------------------------------------------------- /experiments/lavis/configs/datasets/blip_diffusion_datasets/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | blip_diffusion_finetune: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | images: 14 | storage: "" 15 | -------------------------------------------------------------------------------- /experiments/lavis/configs/datasets/coco/defaults_ret.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | coco_retrieval: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json 16 | md5: aa31ac474cf6250ebb81d18348a07ed8 17 | storage: coco/annotations/coco_karpathy_train.json 18 | val: 19 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json 20 | md5: b273847456ef5580e33713b1f7de52a0 21 | storage: coco/annotations/coco_karpathy_val.json 22 | test: 23 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json 24 | md5: 3ff34b0ef2db02d01c37399f6a2a6cd1 25 | storage: coco/annotations/coco_karpathy_test.json 26 | images: 27 | storage: coco/images/ 28 | -------------------------------------------------------------------------------- /experiments/lavis/configs/datasets/conceptual_caption/defaults_12m.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | conceptual_caption_12m: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | - /export/home/workspace/datasets/cc12m.json 17 | storage: 18 | - conceptual_caption/annotations/cc12m.json 19 | images: 20 | storage: conceptual_caption/images_12m 21 | -------------------------------------------------------------------------------- /experiments/lavis/configs/datasets/conceptual_caption/defaults_3m.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | conceptual_caption_3m: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | - /export/home/workspace/datasets/cc3m.json 17 | storage: 18 | - conceptual_caption/annotations/cc3m.json 19 | images: 20 | storage: conceptual_caption/images 21 | -------------------------------------------------------------------------------- /experiments/lavis/configs/datasets/didemo/defaults_ret.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | didemo_retrieval: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/didemo/retrieval_train.json 16 | storage: didemo/annotations/retrieval_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/didemo/retrieval_val.json 19 | storage: didemo/annotations/retrieval_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/didemo/retrieval_test.json 22 | storage: didemo/annotations/retrieval_test.json 23 | videos: 24 | storage: didemo/videos 25 | # storage: /export/share/dongxuli/data/didemo_retrieval/videos 26 | -------------------------------------------------------------------------------- /experiments/lavis/configs/datasets/flickr30k/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | flickr30k: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images 10 | 11 | build_info: 12 | annotations: 13 | train: 14 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_train.json 15 | storage: flickr30k/annotations/train.json 16 | val: 17 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_val.json 18 | storage: flickr30k/annotations/val.json 19 | test: 20 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_test.json 21 | storage: flickr30k/annotations/test.json 22 | images: 23 | storage: flickr30k/images 24 | # storage: /export/share/datasets/vision/flickr30k 25 | -------------------------------------------------------------------------------- /experiments/lavis/configs/datasets/imagenet/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | imagenet: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | splits: ["val"] 14 | images: 15 | storage: /export/share/datasets/vision/imagenet 16 | -------------------------------------------------------------------------------- /experiments/lavis/configs/datasets/laion/defaults_2B_multi.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | laion2B_multi: 8 | 9 | data_type: images 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | storage: /export/laion/laion2B-multi/part-00000/{00000..01743}.tar 14 | -------------------------------------------------------------------------------- /experiments/lavis/configs/datasets/msrvtt/defaults_cap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | msrvtt_cap: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_train.json 16 | storage: msrvtt/annotations/cap_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_val.json 19 | storage: msrvtt/annotations/cap_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_test.json 22 | storage: msrvtt/annotations/cap_test.json 23 | videos: 24 | storage: msrvtt/videos 25 | -------------------------------------------------------------------------------- /experiments/lavis/configs/datasets/msrvtt/defaults_ret.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | msrvtt_retrieval: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/retrieval_train.json 16 | storage: msrvtt/annotations/retrieval_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/retrieval_val.json 19 | storage: msrvtt/annotations/retrieval_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/retrieval_test.json 22 | storage: msrvtt/annotations/retrieval_test.json 23 | videos: 24 | storage: msrvtt/videos 25 | -------------------------------------------------------------------------------- /experiments/lavis/configs/datasets/msvd/defaults_cap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | msvd_cap: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_train.json 16 | storage: msvd/annotations/cap_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_val.json 19 | storage: msvd/annotations/cap_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_test.json 22 | storage: msvd/annotations/cap_test.json 23 | videos: 24 | storage: msvd/videos 25 | -------------------------------------------------------------------------------- /experiments/lavis/configs/datasets/nlvr/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | nlvr: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_train.json 16 | storage: nlvr/annotations/train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_dev.json 19 | storage: nlvr/annotations/dev.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_dev.json 22 | storage: nlvr/annotations/test.json 23 | images: 24 | storage: /export/share/datasets/vision/NLVR2/ 25 | -------------------------------------------------------------------------------- /experiments/lavis/configs/datasets/nocaps/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
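Several configs point at absolute `/export/...` paths (e.g. the NLVR2 `images.storage` above, and similar entries in the ImageNet, SNLI-VE, and VATEX configs); these are internal storage locations and will not exist on a local machine. Assuming the stock `load_dataset` signature with its `vis_path` override, the visual data location can be repointed without editing the YAML; the path below is a placeholder:

    from lavis.datasets.builders import load_dataset

    # Override the /export/share/datasets/vision/NLVR2/ default with a local copy.
    nlvr = load_dataset("nlvr", vis_path="/data/NLVR2/images")   # hypothetical local path
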
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | nocaps: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | val: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/nocaps_val.json 16 | storage: nocaps/annotations/nocaps_val.json 17 | test: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/nocaps_test.json 19 | storage: nocaps/annotations/nocaps_test.json 20 | images: 21 | storage: nocaps/images 22 | # storage: /export/share/datasets/vision/nocaps/ 23 | -------------------------------------------------------------------------------- /experiments/lavis/configs/datasets/sbu_caption/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | sbu_caption: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/sbu/sbu.json 17 | # - /export/share/dongxuli/data/lavis/sbu/annotation/sbu.json 18 | storage: 19 | - sbu_captions/annotations/sbu.json 20 | images: 21 | storage: sbu_captions/images 22 | # storage: /export/share/datasets/vision_language/sbu_resize 23 | -------------------------------------------------------------------------------- /experiments/lavis/configs/datasets/snli_ve/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | snli_ve: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: /export/share/dongxuli/data/lavis/snli/annotation/ve_train.json 16 | storage: snli/annotations/ve_train.json 17 | val: 18 | url: /export/share/dongxuli/data/lavis/snli/annotation/ve_dev.json 19 | storage: snli/annotations/ve_dev.json 20 | test: 21 | url: /export/share/dongxuli/data/lavis/snli/annotation/ve_test.json 22 | storage: snli/annotations/ve_test.json 23 | images: 24 | storage: flickr30k/images/flickr30k-images 25 | # storage: /export/share/datasets/vision/flickr30k/flickr30k-images 26 | -------------------------------------------------------------------------------- /experiments/lavis/configs/datasets/vatex/defaults_cap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | msvd_cap: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_train.json 16 | storage: vatex/annotations/cap_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_val.json 19 | storage: vatex/annotations/cap_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_private_test.json 22 | storage: vatex/annotations/cap_test.json 23 | videos: 24 | storage: /export/share/dongxuli/data/vatex 25 | -------------------------------------------------------------------------------- /experiments/lavis/configs/datasets/vg/defaults_caption.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | vg_caption: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/visual_genome/vg_caption.json 16 | storage: vg/annotations/vg_caption.json 17 | images: 18 | storage: vg/images/ 19 | -------------------------------------------------------------------------------- /experiments/lavis/configs/datasets/vg/defaults_vqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | vg_vqa: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/visual_genome/vg_qa.json 16 | storage: vg/annotations/vg_qa.json 17 | images: 18 | storage: vg/images/ 19 | -------------------------------------------------------------------------------- /experiments/lavis/configs/default.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | env: 7 | # For default users 8 | # cache_root: "cache" 9 | # For internal use with persistent storage 10 | cache_root: "/export/home/.cache/lavis" 11 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/albef_classification_ve.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_classification 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_snli_ve_lavis.pt" 11 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 12 | 13 | num_classes: 3 14 | 15 | use_distill: True 16 | momentum: 0.995 17 | alpha: 0.4 18 | 19 | # vit encoder 20 | vit_type: "base" 21 | vit_grad_ckpt: False 22 | vit_ckpt_layer: 0 23 | vit_layer_norm_epsilon: 1e-6 24 | 25 | image_size: 384 26 | 27 | # bert config 28 | med_config_path: "configs/models/med_config_albef.json" 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | eval: 35 | name: "blip_image_eval" 36 | text_processor: 37 | train: 38 | name: "blip_caption" 39 | eval: 40 | name: "blip_caption" 41 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/albef_feature_extractor.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_pretrain 8 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 9 | 10 | # vit encoder 11 | vit_type: "base" 12 | image_size: 224 13 | vit_ckpt_layer: 0 14 | vit_drop_path_rate: 0 15 | vit_layer_norm_epsilon: 1e-6 16 | vit_grad_ckpt: False 17 | 18 | # bert config 19 | med_config_path: "configs/models/med_config_albef.json" 20 | 21 | embed_dim: 256 22 | 23 | preprocess: 24 | vis_processor: 25 | eval: 26 | name: "blip_image_eval" 27 | image_size: 224 28 | text_processor: 29 | eval: 30 | name: "blip_caption" 31 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/albef_nlvr.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
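The `cache_root` in configs/default.yaml above anchors every relative `storage:` path in the dataset configs; the commented-out `"cache"` value is the portable default, while the `/export/home/...` value is an internal location. A small sketch of how relative paths resolve, assuming the stock `get_cache_path` helper is present in this vendored copy:

    from lavis.common.utils import get_cache_path

    # With the default.yaml above, this prints
    # /export/home/.cache/lavis/coco/annotations/coco_karpathy_train.json
    print(get_cache_path("coco/annotations/coco_karpathy_train.json"))
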
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_nlvr 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/pretrain_model_nlvr.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_nlvr_lavis.pt" 12 | 13 | num_classes: 2 14 | 15 | use_distill: True 16 | momentum: 0.995 17 | alpha: 0.4 18 | 19 | # vit encoder 20 | vit_type: "base" 21 | vit_grad_ckpt: False 22 | vit_ckpt_layer: 0 23 | vit_layer_norm_epsilon: 1e-6 24 | 25 | image_size: 384 26 | 27 | # bert config 28 | med_config_path: "configs/models/med_config_albef.json" 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 384 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 384 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/albef_pretrain_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_pretrain 8 | 9 | load_pretrained: True 10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 11 | 12 | # vit encoder 13 | vit_type: "base" 14 | image_size: 224 15 | vit_ckpt_layer: 0 16 | vit_drop_path_rate: 0 17 | vit_layer_norm_epsilon: 1e-6 18 | vit_grad_ckpt: False 19 | 20 | # bert config 21 | med_config_path: "configs/models/med_config_albef.json" 22 | mlm_mask_prob: 0.15 23 | 24 | embed_dim: 256 25 | momentum: 0.995 26 | alpha: 0.4 27 | temp: 0.07 28 | 29 | max_txt_len: 30 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip_image_train" 35 | image_size: 256 36 | text_processor: 37 | train: 38 | name: "blip_caption" 39 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/albef_vqav2.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
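Each model config also carries a `preprocess:` block (as in albef_nlvr.yaml above) naming the train/eval visual and text processors, and these can be instantiated straight from the YAML. A sketch, assuming the stock `load_preprocess` helper and the config path as it appears in this repository:

    from omegaconf import OmegaConf
    from lavis.models import load_preprocess

    cfg = OmegaConf.load("experiments/lavis/configs/models/albef_nlvr.yaml")
    vis_processors, txt_processors = load_preprocess(cfg.preprocess)
    print(vis_processors.keys())   # dict_keys(['train', 'eval'])
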
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_vqa 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_vqav2_lavis.pt" 12 | 13 | use_distill: True 14 | momentum: 0.995 15 | alpha: 0.4 16 | 17 | # vit encoder 18 | vit_type: "base" 19 | vit_grad_ckpt: False 20 | vit_ckpt_layer: 0 21 | vit_layer_norm_epsilon: 1e-6 22 | 23 | image_size: 384 24 | 25 | # bert config 26 | med_config_path: "configs/models/med_config_albef.json" 27 | 28 | preprocess: 29 | vis_processor: 30 | train: 31 | name: "blip_image_train" 32 | image_size: 384 33 | eval: 34 | name: "blip_image_eval" 35 | image_size: 384 36 | text_processor: 37 | train: 38 | name: "blip_question" 39 | eval: 40 | name: "blip_question" 41 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/alpro_qa_msrvtt.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_qa 8 | num_classes: 1500 9 | 10 | load_finetuned: True 11 | 12 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msrvtt_qa.pth" 13 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" 14 | 15 | timesformer: 16 | n_frms: 16 17 | image_size: 224 18 | 19 | patch_size: 16 20 | attn_drop_rate: 0. 21 | drop_rate: 0. 22 | drop_path_rate: 0.1 23 | 24 | use_grad_ckpt: True 25 | ckpt_layer: 12 26 | 27 | # bert config 28 | med_config_path: "configs/models/bert_config_alpro.json" 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "alpro_video_train" 34 | n_frms: 16 35 | image_size: 224 36 | eval: 37 | name: "alpro_video_eval" 38 | n_frms: 16 39 | image_size: 224 40 | text_processor: 41 | train: 42 | name: "blip_caption" 43 | eval: 44 | name: "blip_caption" 45 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/alpro_qa_msvd.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_qa 8 | num_classes: 2423 9 | 10 | load_finetuned: True 11 | 12 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msvd_qa.pth" 13 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" 14 | 15 | timesformer: 16 | n_frms: 16 17 | image_size: 224 18 | 19 | patch_size: 16 20 | attn_drop_rate: 0. 21 | drop_rate: 0. 
22 | drop_path_rate: 0.1 23 | use_grad_ckpt: True 24 | ckpt_layer: 12 25 | 26 | # bert config 27 | med_config_path: "configs/models/bert_config_alpro.json" 28 | 29 | preprocess: 30 | vis_processor: 31 | train: 32 | name: "alpro_video_train" 33 | n_frms: 16 34 | image_size: 224 35 | eval: 36 | name: "alpro_video_eval" 37 | n_frms: 16 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/alpro_retrieval_didemo.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_retrieval 8 | 9 | load_finetuned: True 10 | 11 | finetuned: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_didemo_retrieval.pt 12 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" 13 | 14 | timesformer: 15 | n_frms: 8 16 | image_size: 224 17 | 18 | patch_size: 16 19 | attn_drop_rate: 0. 20 | drop_rate: 0. 21 | drop_path_rate: 0.1 22 | use_grad_ckpt: False 23 | 24 | # bert config 25 | med_config_path: "configs/models/bert_config_alpro.json" 26 | 27 | preprocess: 28 | vis_processor: 29 | eval: 30 | name: "alpro_video_eval" 31 | n_frms: 8 32 | image_size: 224 33 | text_processor: 34 | eval: 35 | name: "blip_caption" 36 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/alpro_retrieval_msrvtt.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_retrieval 8 | 9 | load_finetuned: True 10 | 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msrvtt_retrieval.pt" 12 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" 13 | 14 | timesformer: 15 | n_frms: 8 16 | image_size: 224 17 | 18 | patch_size: 16 19 | attn_drop_rate: 0. 20 | drop_rate: 0. 
21 | drop_path_rate: 0.1 22 | use_grad_ckpt: False 23 | 24 | # bert config 25 | med_config_path: "configs/models/bert_config_alpro.json" 26 | 27 | preprocess: 28 | vis_processor: 29 | train: 30 | name: "alpro_video_train" 31 | n_frms: 8 32 | image_size: 224 33 | eval: 34 | name: "alpro_video_eval" 35 | n_frms: 8 36 | image_size: 224 37 | text_processor: 38 | train: 39 | name: "blip_caption" 40 | eval: 41 | name: "blip_caption" 42 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/bert_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30522, 19 | "encoder_width": 768, 20 | "add_cross_attention": true 21 | } -------------------------------------------------------------------------------- /experiments/lavis/configs/models/bert_config_alpro.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": true, 18 | "type_vocab_size": 2, 19 | "vocab_size": 30522, 20 | "encoder_width": 768, 21 | "add_cross_attention": false, 22 | "fusion_layer": 6 23 | } -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip-diffusion/blip_diffusion_base.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | vit_model: "clip_L" 3 | 4 | qformer_num_query_token: 16 5 | qformer_cross_attention_freq: 1 6 | 7 | sd_train_text_encoder: False 8 | sd_pretrained_model_name_or_path: "runwayml/stable-diffusion-v1-5" 9 | 10 | load_finetuned: False 11 | load_pretrained: True 12 | # pretrained: "/export/share/dongxuli/zerobooth/500000-renamed/" 13 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP-Diffusion/blip-diffusion.tar.gz" 14 | 15 | preprocess: 16 | vis_processor: 17 | train: 18 | name: "blip_diffusion_inp_image_eval" 19 | eval: 20 | name: "blip_diffusion_inp_image_eval" 21 | text_processor: 22 | train: 23 | name: "blip_caption" 24 | eval: 25 | name: "blip_caption" 26 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip-diffusion/blip_diffusion_controlnet_canny.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | vit_model: "clip_L" 3 | 4 | qformer_num_query_token: 16 5 | qformer_cross_attention_freq: 1 6 | 7 | sd_train_text_encoder: False 8 | sd_pretrained_model_name_or_path: "runwayml/stable-diffusion-v1-5" 9 | 10 | load_finetuned: False 11 | load_pretrained: True 12 | # pretrained: 
"/export/share/dongxuli/zerobooth/500000-renamed/" 13 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP-Diffusion/blip-diffusion.tar.gz" 14 | 15 | controlnet_pretrained_model_name_or_path: "lllyasviel/sd-controlnet-canny" 16 | 17 | preprocess: 18 | vis_processor: 19 | train: 20 | name: "blip_diffusion_inp_image_eval" 21 | eval: 22 | name: "blip_diffusion_inp_image_eval" 23 | text_processor: 24 | train: 25 | name: "blip_caption" 26 | eval: 27 | name: "blip_caption" 28 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip-diffusion/blip_diffusion_controlnet_depth.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | vit_model: "clip_L" 3 | 4 | qformer_num_query_token: 16 5 | qformer_cross_attention_freq: 1 6 | 7 | sd_train_text_encoder: False 8 | sd_pretrained_model_name_or_path: "runwayml/stable-diffusion-v1-5" 9 | 10 | load_finetuned: False 11 | load_pretrained: True 12 | # pretrained: "/export/share/dongxuli/zerobooth/500000-renamed/" 13 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP-Diffusion/blip-diffusion-openimage.tar.gz" 14 | 15 | controlnet_pretrained_model_name_or_path: "lllyasviel/sd-controlnet-depth" 16 | 17 | preprocess: 18 | vis_processor: 19 | train: 20 | name: "blip_diffusion_inp_image_eval" 21 | eval: 22 | name: "blip_diffusion_inp_image_eval" 23 | text_processor: 24 | train: 25 | name: "blip_caption" 26 | eval: 27 | name: "blip_caption" 28 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip-diffusion/blip_diffusion_controlnet_hed.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | vit_model: "clip_L" 3 | 4 | qformer_num_query_token: 16 5 | qformer_cross_attention_freq: 1 6 | 7 | sd_train_text_encoder: False 8 | sd_pretrained_model_name_or_path: "runwayml/stable-diffusion-v1-5" 9 | 10 | load_finetuned: False 11 | load_pretrained: True 12 | # pretrained: "/export/share/dongxuli/zerobooth/500000-renamed/" 13 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP-Diffusion/blip-diffusion-openimage.tar.gz" 14 | 15 | controlnet_pretrained_model_name_or_path: "lllyasviel/sd-controlnet-hed" 16 | 17 | preprocess: 18 | vis_processor: 19 | train: 20 | name: "blip_diffusion_inp_image_eval" 21 | eval: 22 | name: "blip_diffusion_inp_image_eval" 23 | text_processor: 24 | train: 25 | name: "blip_caption" 26 | eval: 27 | name: "blip_caption" 28 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: caption_coco_flant5xl 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xl.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_caption_flant5xl.pth" 12 | 13 | # vit encoder 14 | image_size: 364 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp32" 18 | freeze_vit: False 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # T5 24 | t5_model: "google/flan-t5-xl" 25 | 26 | # generation configs 27 | prompt: "a photo of" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 364 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 364 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: caption_coco_opt2.7b 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt2.7b.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_caption_opt2.7b.pth" 12 | 13 | # vit encoder 14 | image_size: 364 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp32" 18 | freeze_vit: False 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-2.7b" 25 | 26 | # generation configs 27 | prompt: "a photo of" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 364 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 364 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
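The BLIP-2 captioning configs above (FLAN-T5 XL and OPT-2.7B) are usually reached through `load_model_and_preprocess` rather than loaded by path, and the `arch` values shown match the usual model_type keys. A sketch, assuming the stock registry names (`blip2_opt` / `caption_coco_opt2.7b`) still apply in this vendored copy, with a placeholder image path:

    import torch
    from PIL import Image
    from lavis.models import load_model_and_preprocess

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model, vis_processors, _ = load_model_and_preprocess(
        name="blip2_opt", model_type="caption_coco_opt2.7b", is_eval=True, device=device
    )
    raw_image = Image.open("/path/to/image.jpg").convert("RGB")    # placeholder path
    image = vis_processors["eval"](raw_image).unsqueeze(0).to(device)
    print(model.generate({"image": image}))                        # e.g. ['a photo of ...']
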
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: caption_coco_opt6.7b 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt6.7b.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_caption_opt6.7b.pth" 12 | 13 | # vit encoder 14 | image_size: 364 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp32" 18 | freeze_vit: False 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-6.7b" 25 | 26 | # generation configs 27 | prompt: "a photo of" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 364 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 364 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip2/blip2_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: coco 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_finetune_coco.pth" 12 | 13 | # vit encoder 14 | image_size: 364 15 | drop_path_rate: 0 16 | use_grad_checkpoint: True 17 | vit_precision: "fp32" 18 | freeze_vit: False 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 364 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 364 32 | text_processor: 33 | train: 34 | name: "blip_caption" 35 | eval: 36 | name: "blip_caption" 37 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: flant5xl 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_flanxl_trimmed.pth" 12 | finetuned: "" 13 | 14 | # vit encoder 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # T5 25 | t5_model: "google/flan-t5-xl" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: flant5xxl 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_flanxxl_trimmed.pth" 12 | finetuned: "" 13 | 14 | # vit encoder 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # T5 25 | t5_model: "google/flan-t5-xxl" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: instruct_vicuna13b 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | # cd_comments: set right path for pretrained blip ckpt 12 | pretrained: "./checkpoints/instruct_blip_vicuna13b_trimmed.pth" 13 | finetuned: "" 14 | 15 | # vit encoder 16 | image_size: 224 17 | drop_path_rate: 0 18 | use_grad_checkpoint: False 19 | vit_precision: "fp16" 20 | freeze_vit: True 21 | 22 | # Q-Former 23 | num_query_token: 32 24 | 25 | # cd_comments: set right path for vicuna 26 | llm_model: "./checkpoints/vicuna-13b-v1.1" 27 | 28 | # generation configs 29 | prompt: "" 30 | 31 | 32 | preprocess: 33 | vis_processor: 34 | train: 35 | name: "blip2_image_train" 36 | image_size: 224 37 | eval: 38 | name: "blip_image_eval" 39 | image_size: 224 40 | text_processor: 41 | train: 42 | name: "blip_caption" 43 | eval: 44 | name: "blip_caption" 45 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: instruct_vicuna7b 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | # cd_comments: set right path for pretrained blip ckpt 12 | pretrained: "path/to/the/instruct_blip_vicuna7b_trimmed.pth" 13 | 14 | finetuned: "" 15 | 16 | # vit encoder 17 | image_size: 224 18 | drop_path_rate: 0 19 | use_grad_checkpoint: False 20 | vit_precision: "fp16" 21 | freeze_vit: True 22 | 23 | # Q-Former 24 | num_query_token: 32 25 | 26 | # cd_comments: set right path for vicuna 27 | llm_model: "path/checkpoints/vicuna-7b-v1.1" 28 | 29 | # generation configs 30 | prompt: "" 31 | 32 | 33 | preprocess: 34 | vis_processor: 35 | train: 36 | name: "blip2_image_train" 37 | image_size: 224 38 | eval: 39 | name: "blip_image_eval" 40 | image_size: 224 41 | text_processor: 42 | train: 43 | name: "blip_caption" 44 | eval: 45 | name: "blip_caption" 46 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip2/blip2_pretrain.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
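Both InstructBLIP configs above leave `pretrained` and `llm_model` to be pointed at local checkpoints, as their `cd_comments` lines ask. One way to fill them in programmatically, using placeholder paths rather than real locations:

    from omegaconf import OmegaConf

    cfg_path = "experiments/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml"
    cfg = OmegaConf.load(cfg_path)
    cfg.model.pretrained = "/path/to/instruct_blip_vicuna7b_trimmed.pth"   # placeholder
    cfg.model.llm_model = "/path/to/vicuna-7b-v1.1"                        # placeholder
    OmegaConf.save(cfg, cfg_path)
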
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 224 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 224 32 | text_processor: 33 | train: 34 | name: "blip_caption" 35 | eval: 36 | name: "blip_caption" 37 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_flant5xl 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xl.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # T5 24 | t5_model: "google/flan-t5-xl" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_flant5xl 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xl_vitL.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | vit_model: "clip_L" 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # T5 25 | t5_model: "google/flan-t5-xl" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_flant5xxl 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xxl.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # T5 24 | t5_model: "google/flan-t5-xxl" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip2/blip2_pretrain_llama7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip2_llama 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # LLM 24 | llm_model: "/export/home/project/stanford_alpaca/llama_7B" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip2_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_opt2.7b 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt2.7b.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-2.7b" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_opt6.7b 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt6.7b.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-6.7b" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_vitL.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | vit_model: "clip_L" 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | 25 | preprocess: 26 | vis_processor: 27 | train: 28 | name: "blip_image_train" 29 | image_size: 224 30 | eval: 31 | name: "blip_image_eval" 32 | image_size: 224 33 | text_processor: 34 | train: 35 | name: "blip_caption" 36 | eval: 37 | name: "blip_caption" 38 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip_caption_base_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_coco_caption_base.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | 18 | image_size: 384 19 | 20 | # bert config 21 | med_config_path: "configs/models/med_config.json" 22 | 23 | # generation configs 24 | prompt: "a picture of " 25 | 26 | 27 | preprocess: 28 | vis_processor: 29 | train: 30 | name: "blip_image_train" 31 | eval: 32 | name: "blip_image_eval" 33 | text_processor: 34 | train: 35 | name: "blip_caption" 36 | prompt: "a picture of " 37 | eval: 38 | name: "blip_caption" 39 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip_caption_large_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption.pth" 12 | 13 | vit_type: "large" 14 | vit_grad_ckpt: True 15 | vit_ckpt_layer: 5 16 | 17 | image_size: 384 18 | 19 | # bert config 20 | med_config_path: "configs/models/med_large_config.json" 21 | 22 | # generation configs 23 | prompt: "a picture of " 24 | 25 | 26 | preprocess: 27 | vis_processor: 28 | train: 29 | name: "blip_image_train" 30 | eval: 31 | name: "blip_image_eval" 32 | text_processor: 33 | train: 34 | name: "blip_caption" 35 | prompt: "a picture of " 36 | eval: 37 | name: "blip_caption" 38 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip_classification_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_classification 8 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth" 9 | 10 | use_distill: True 11 | momentum: 0.995 12 | alpha: 0.4 13 | 14 | # vit encoder 15 | vit_type: "base" 16 | vit_grad_ckpt: False 17 | vit_ckpt_layer: 0 18 | 19 | image_size: 384 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip_feature_extractor_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_pretrain 8 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 9 | 10 | # vit encoder 11 | vit_type: "base" 12 | vit_grad_ckpt: False 13 | vit_ckpt_layer: 0 14 | 15 | image_size: 224 16 | 17 | # bert config 18 | med_config_path: "configs/models/med_config.json" 19 | 20 | embed_dim: 256 21 | 22 | preprocess: 23 | vis_processor: 24 | eval: 25 | name: "blip_image_eval" 26 | image_size: 224 27 | text_processor: 28 | eval: 29 | name: "blip_caption" 30 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip_itm_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_image_text_matching 8 | 9 | load_finetuned: True 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_retrieval_coco.pth" 11 | 12 | # vit encoder 13 | vit_type: "base" 14 | vit_grad_ckpt: False 15 | vit_ckpt_layer: 0 16 | 17 | image_size: 384 18 | 19 | # bert config 20 | med_config_path: "configs/models/med_config.json" 21 | 22 | embed_dim: 256 23 | 24 | preprocess: 25 | vis_processor: 26 | eval: 27 | name: "blip_image_eval" 28 | image_size: 384 29 | text_processor: 30 | eval: 31 | name: "blip_caption" 32 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip_itm_large.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_image_text_matching 8 | 9 | load_finetuned: True 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco.pth" 11 | 12 | # vit encoder 13 | vit_type: "large" 14 | vit_grad_ckpt: False 15 | vit_ckpt_layer: 0 16 | 17 | image_size: 384 18 | 19 | # bert config 20 | med_config_path: "configs/models/med_large_config.json" 21 | 22 | embed_dim: 256 23 | 24 | preprocess: 25 | vis_processor: 26 | eval: 27 | name: "blip_image_eval" 28 | image_size: 384 29 | text_processor: 30 | eval: 31 | name: "blip_caption" 32 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip_nlvr.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
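The feature-extractor and image-text-matching configs above back LAVIS's embedding interface. A sketch of multimodal feature extraction, assuming the stock registry names (`blip_feature_extractor` / `base`) and placeholder inputs:

    import torch
    from PIL import Image
    from lavis.models import load_model_and_preprocess

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model, vis_processors, txt_processors = load_model_and_preprocess(
        name="blip_feature_extractor", model_type="base", is_eval=True, device=device
    )
    raw_image = Image.open("/path/to/image.jpg").convert("RGB")    # placeholder path
    sample = {
        "image": vis_processors["eval"](raw_image).unsqueeze(0).to(device),
        "text_input": [txt_processors["eval"]("a photo of a dog")],
    }
    features = model.extract_features(sample)    # joint image-text features
    print(features.multimodal_embeds.shape)
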
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_nlvr 8 | model_type: nlvr 9 | load_finetuned: True 10 | 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_nlvr.pth" 12 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 13 | 14 | num_classes: 2 15 | 16 | # vit encoder 17 | vit_type: "base" 18 | vit_grad_ckpt: False 19 | vit_ckpt_layer: 0 20 | vit_layer_norm_epsilon: 1e-6 21 | 22 | image_size: 384 23 | 24 | # bert config 25 | med_config_path: "configs/models/med_config.json" 26 | 27 | preprocess: 28 | vis_processor: 29 | train: 30 | name: "blip_image_train" 31 | image_size: 384 32 | eval: 33 | name: "blip_image_eval" 34 | image_size: 384 35 | text_processor: 36 | train: 37 | name: "blip_caption" 38 | eval: 39 | name: "blip_caption" 40 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip_pretrain_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_pretrain 8 | 9 | load_pretrained: True 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 11 | 12 | # vit encoder 13 | vit_type: "base" 14 | vit_grad_ckpt: False 15 | vit_ckpt_layer: 0 16 | 17 | image_size: 224 18 | alpha: 0.4 19 | 20 | # bert config 21 | med_config_path: "configs/models/bert_config.json" 22 | 23 | embed_dim: 256 24 | 25 | # generation configs 26 | prompt: "a picture of " 27 | 28 | preprocess: 29 | vis_processor: 30 | train: 31 | name: "blip_image_train" 32 | image_size: 224 33 | text_processor: 34 | train: 35 | name: "blip_caption" 36 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip_pretrain_large.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_pretrain 8 | 9 | # vit encoder 10 | vit_type: "large" 11 | vit_grad_ckpt: True 12 | vit_ckpt_layer: 5 13 | 14 | image_size: 224 15 | 16 | # bert config 17 | med_config_path: "configs/models/med_large_config.json" 18 | 19 | embed_dim: 256 20 | 21 | # generation configs 22 | prompt: "a picture of " 23 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip_retrieval_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_retrieval 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_coco_retrieval.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 12 | 13 | queue_size: 57600 14 | 15 | # vit encoder 16 | vit_type: "base" 17 | vit_grad_ckpt: True 18 | vit_ckpt_layer: 4 19 | 20 | image_size: 384 21 | 22 | # bert config 23 | med_config_path: "configs/models/med_config.json" 24 | 25 | embed_dim: 256 26 | 27 | preprocess: 28 | vis_processor: 29 | train: 30 | name: "blip_image_train" 31 | image_size: 384 32 | eval: 33 | name: "blip_image_eval" 34 | image_size: 384 35 | text_processor: 36 | train: 37 | name: "blip_caption" 38 | eval: 39 | name: "blip_caption" 40 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip_retrieval_flickr.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_retrieval 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_flickr_retrieval.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 12 | 13 | queue_size: 57600 14 | alpha: 0.4 15 | 16 | negative_all_rank: False 17 | 18 | # vit encoder 19 | vit_type: "base" 20 | vit_grad_ckpt: True 21 | vit_ckpt_layer: 4 22 | 23 | image_size: 384 24 | 25 | # bert config 26 | med_config_path: "configs/models/med_config.json" 27 | 28 | embed_dim: 256 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 384 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 384 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip_vqa_aokvqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_aokvqa.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | vit_drop_path_rate: 0.1 18 | 19 | image_size: 480 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 480 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 480 32 | text_processor: 33 | train: 34 | name: "blip_question" 35 | eval: 36 | name: "blip_question" 37 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip_vqa_okvqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_okvqa.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | vit_drop_path_rate: 0.1 18 | 19 | image_size: 480 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 480 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 480 32 | text_processor: 33 | train: 34 | name: "blip_question" 35 | eval: 36 | name: "blip_question" 37 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip_vqav2.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
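The VQA configs in this directory (vqav2 / okvqa / aokvqa) are used through predict_answers. A sketch, assuming upstream LAVIS behaviour; "generate" decodes free-form answers, while "rank" scores a fixed candidate list as the project eval configs further below do:

import torch
from PIL import Image
from lavis.models import load_model_and_preprocess

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# "vqav2" / "okvqa" / "aokvqa" select among the blip_vqa*.yaml files above
model, vis_processors, txt_processors = load_model_and_preprocess(
    name="blip_vqa", model_type="vqav2", is_eval=True, device=device
)
raw_image = Image.open("example.jpg").convert("RGB")   # placeholder path
image = vis_processors["eval"](raw_image).unsqueeze(0).to(device)
question = txt_processors["eval"]("What is the dog doing?")
answers = model.predict_answers(
    samples={"image": image, "text_input": question},
    inference_method="generate",
)
print(answers)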
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | vit_drop_path_rate: 0.1 18 | 19 | image_size: 480 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 480 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 480 32 | text_processor: 33 | train: 34 | name: "blip_question" 35 | eval: 36 | name: "blip_question" 37 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/RN101-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 23, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/RN101.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 23, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/RN50-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 6, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/RN50.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 6, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/RN50x16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 384, 5 | "layers": [ 6 | 6, 7 | 8, 8 | 18, 9 | 8 10 | ], 11 | "width": 96, 12 | "patch_size": null 13 | }, 14 | 
"text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 768, 18 | "heads": 12, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/RN50x4.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 288, 5 | "layers": [ 6 | 4, 7 | 6, 8 | 10, 9 | 6 10 | ], 11 | "width": 80, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 640, 18 | "heads": 10, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/ViT-B-16-plus-240.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 240, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/ViT-B-16-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/ViT-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/ViT-B-32-plus-256.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 256, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/ViT-B-32-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": 12, 7 | "width": 768, 8 | "patch_size": 32 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32 
8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/ViT-H-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 14 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/ViT-H-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 16 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/ViT-L-14-280.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 280, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/ViT-L-14-336.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 336, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/ViT-L-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/ViT-L-16-320.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 320, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/ViT-L-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 16 8 | }, 9 | 
"text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/ViT-g-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 40, 6 | "width": 1408, 7 | "head_width": 88, 8 | "mlp_ratio": 4.3637, 9 | "patch_size": 14 10 | }, 11 | "text_cfg": { 12 | "context_length": 77, 13 | "vocab_size": 49408, 14 | "width": 1024, 15 | "heads": 16, 16 | "layers": 24 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/timm-efficientnetv2_rw_s.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "timm_model_name": "efficientnetv2_rw_s", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 288 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 768, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/timm-resnet50d.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "resnet50d", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/timm-resnetaa50d.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "resnetaa50d", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/timm-resnetblur50.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "resnetblur50", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/timm-swin_base_patch4_window7_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "swin_base_patch4_window7_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 
| "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/timm-vit_base_patch16_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_base_patch16_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/timm-vit_base_patch32_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_base_patch32_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/timm-vit_small_patch16_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_small_patch16_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip_resnet50.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: RN50 10 | 11 | pretrained: openai 12 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip_vit_base16.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-B-16 10 | 11 | pretrained: openai 12 | 13 | preprocess: 14 | vis_processor: 15 | eval: 16 | name: "clip_image_eval" 17 | image_size: 224 18 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/gpt_dialogue_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: gpt_dialogue 8 | # pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth" 9 | # pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 10 | 11 | len_tokenizer: 50264 # 50257 tokens from gpt2 default tokenizer + additional special tokens 12 | 13 | len_video_ft: 4224 # i3d_rgb: 2048 i3d_flow: 2048 vggish: 128 14 | 15 | preprocess: 16 | vis_processor: 17 | train: 18 | name: "gpt_video_ft" 19 | eval: 20 | name: "gpt_video_ft" 21 | text_processor: 22 | train: 23 | name: "gpt_dialogue" 24 | eval: 25 | name: "gpt_dialogue" -------------------------------------------------------------------------------- /experiments/lavis/configs/models/med_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30524, 19 | "encoder_width": 768, 20 | "add_cross_attention": true 21 | } -------------------------------------------------------------------------------- /experiments/lavis/configs/models/med_config_albef.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30522, 19 | "encoder_width": 768, 20 | "add_cross_attention": true, 21 | "fusion_layer": 6 22 | } -------------------------------------------------------------------------------- /experiments/lavis/configs/models/med_large_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30524, 19 | "encoder_width": 1024, 20 | "add_cross_attention": true 21 | } -------------------------------------------------------------------------------- /experiments/lavis/datasets/builders/classification_builder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.common.registry import registry 9 | from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder 10 | from lavis.datasets.datasets.nlvr_datasets import NLVRDataset, NLVREvalDataset 11 | from lavis.datasets.datasets.snli_ve_datasets import SNLIVisualEntialmentDataset 12 | 13 | 14 | @registry.register_builder("nlvr") 15 | class NLVRBuilder(BaseDatasetBuilder): 16 | train_dataset_cls = NLVRDataset 17 | eval_dataset_cls = NLVREvalDataset 18 | 19 | DATASET_CONFIG_DICT = {"default": "configs/datasets/nlvr/defaults.yaml"} 20 | 21 | 22 | @registry.register_builder("snli_ve") 23 | class SNLIVisualEntailmentBuilder(BaseDatasetBuilder): 24 | train_dataset_cls = SNLIVisualEntialmentDataset 25 | eval_dataset_cls = SNLIVisualEntialmentDataset 26 | 27 | DATASET_CONFIG_DICT = {"default": "configs/datasets/snli_ve/defaults.yaml"} 28 | -------------------------------------------------------------------------------- /experiments/lavis/datasets/builders/dialogue_builder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.common.registry import registry 9 | from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder 10 | from lavis.datasets.datasets.avsd_dialogue_datasets import ( 11 | AVSDDialDataset, 12 | AVSDDialEvalDataset, 13 | ) 14 | 15 | 16 | @registry.register_builder("avsd_dialogue") 17 | class AVSDDialBuilder(BaseDatasetBuilder): 18 | train_dataset_cls = AVSDDialDataset 19 | eval_dataset_cls = AVSDDialEvalDataset 20 | 21 | DATASET_CONFIG_DICT = {"default": "configs/datasets/avsd/defaults_dial.yaml"} 22 | -------------------------------------------------------------------------------- /experiments/lavis/datasets/datasets/multimodal_classification_datasets.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from abc import abstractmethod 9 | from lavis.datasets.datasets.base_dataset import BaseDataset 10 | 11 | 12 | class MultimodalClassificationDataset(BaseDataset): 13 | def __init__(self, vis_processor, text_processor, vis_root, ann_paths): 14 | super().__init__(vis_processor, text_processor, vis_root, ann_paths) 15 | 16 | self.class_labels = None 17 | 18 | @abstractmethod 19 | def _build_class_labels(self): 20 | pass 21 | -------------------------------------------------------------------------------- /experiments/lavis/datasets/datasets/vg_vqa_datasets.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
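Builders registered this way ("nlvr", "snli_ve", "avsd_dialogue") are reached through the registry or the load_dataset helper. A sketch, assuming upstream LAVIS behaviour and that the NLVR2 images and annotations are already downloaded to the configured paths:

from lavis.common.registry import registry
from lavis.datasets.builders import load_dataset

# look up the class installed by @registry.register_builder("nlvr")
builder_cls = registry.get_builder_class("nlvr")
print(builder_cls.DATASET_CONFIG_DICT)   # {'default': 'configs/datasets/nlvr/defaults.yaml'}

# or build the splits directly (requires the dataset on disk)
nlvr = load_dataset("nlvr")
print(nlvr.keys())        # typically dict_keys(['train', 'val', 'test'])
print(len(nlvr["train"]))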
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import os 9 | 10 | from PIL import Image 11 | 12 | from lavis.datasets.datasets.vqa_datasets import VQADataset 13 | 14 | 15 | class VGVQADataset(VQADataset): 16 | def __init__(self, vis_processor, text_processor, vis_root, ann_paths): 17 | super().__init__(vis_processor, text_processor, vis_root, ann_paths) 18 | 19 | def __getitem__(self, index): 20 | ann = self.annotation[index] 21 | 22 | image_path = os.path.join(self.vis_root, ann["image"]) 23 | image = Image.open(image_path).convert("RGB") 24 | 25 | image = self.vis_processor(image) 26 | question = self.text_processor(ann["question"]) 27 | 28 | answers = [ann["answer"]] 29 | # TODO this should be configured better 30 | weights = [0.2] 31 | 32 | return { 33 | "image": image, 34 | "text_input": question, 35 | "answers": answers, 36 | "weights": weights, 37 | } 38 | -------------------------------------------------------------------------------- /experiments/lavis/models/blip2_models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/experiments/lavis/models/blip2_models/__init__.py -------------------------------------------------------------------------------- /experiments/lavis/models/blip_diffusion_models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/experiments/lavis/models/blip_diffusion_models/__init__.py -------------------------------------------------------------------------------- /experiments/lavis/models/clip_models/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | 7 | Based on https://github.com/mlfoundations/open_clip 8 | """ 9 | 10 | """ OpenAI pretrained model functions 11 | Adapted from https://github.com/mlfoundations/open_clip and https://github.com/openai/CLIP. 12 | 13 | Originally MIT License, Copyright (c) 2021 OpenAI. 14 | """ 15 | -------------------------------------------------------------------------------- /experiments/lavis/models/clip_models/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/experiments/lavis/models/clip_models/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /experiments/lavis/models/clip_models/pics/CLIP.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/experiments/lavis/models/clip_models/pics/CLIP.png -------------------------------------------------------------------------------- /experiments/lavis/models/img2prompt_models/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import torch 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /experiments/lavis/models/timesformer/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | 7 | Based on https://github.com/facebookresearch/TimeSformer 8 | """ 9 | -------------------------------------------------------------------------------- /experiments/lavis/models/timesformer/linear.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | """ Linear layer (alternate definition) 9 | """ 10 | import torch 11 | import torch.nn.functional as F 12 | from torch import nn as nn 13 | 14 | 15 | class Linear(nn.Linear): 16 | def forward(self, input: torch.Tensor) -> torch.Tensor: 17 | if torch.jit.is_scripting(): 18 | bias = self.bias.to(dtype=input.dtype) if self.bias is not None else None 19 | return F.linear(input, self.weight.to(dtype=input.dtype), bias=bias) 20 | else: 21 | return F.linear(input, self.weight, self.bias) 22 | -------------------------------------------------------------------------------- /experiments/lavis/processors/base_processor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from omegaconf import OmegaConf 9 | 10 | 11 | class BaseProcessor: 12 | def __init__(self): 13 | self.transform = lambda x: x 14 | return 15 | 16 | def __call__(self, item): 17 | return self.transform(item) 18 | 19 | @classmethod 20 | def from_config(cls, cfg=None): 21 | return cls() 22 | 23 | def build(self, **kwargs): 24 | cfg = OmegaConf.create(kwargs) 25 | 26 | return self.from_config(cfg) 27 | -------------------------------------------------------------------------------- /experiments/lavis/projects/albef/eval/nlvr_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_nlvr 8 | model_type: nlvr 9 | 10 | datasets: 11 | nlvr: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | image_size: 384 16 | text_processor: 17 | eval: 18 | name: "blip_caption" 19 | 20 | run: 21 | task: multimodal_classification 22 | 23 | batch_size_train: 16 24 | batch_size_eval: 64 25 | num_workers: 4 26 | 27 | seed: 42 28 | output_dir: "output/ALBEF/NLVR" 29 | 30 | evaluate: True 31 | test_splits: ["val", "test"] 32 | 33 | # distribution-specific 34 | device: "cuda" 35 | world_size: 1 36 | dist_url: "env://" 37 | distributed: True 38 | -------------------------------------------------------------------------------- /experiments/lavis/projects/albef/eval/ret_coco_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_retrieval 8 | model_type: coco 9 | 10 | datasets: 11 | coco_retrieval: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | image_size: 384 16 | text_processor: 17 | eval: 18 | name: "blip_caption" 19 | 20 | run: 21 | task: retrieval 22 | 23 | # dataloading 24 | num_workers: 4 25 | batch_size_train: 32 26 | batch_size_eval: 64 27 | 28 | test_splits: ["test"] 29 | 30 | # distribution 31 | device: "cuda" 32 | world_size: 1 33 | dist_url: "env://" 34 | distributed: True 35 | use_dist_eval_sampler: False 36 | 37 | # model specific 38 | k_test: 128 39 | 40 | # misc 41 | seed: 42 42 | output_dir: "output/ALBEF/Retrieval_COCO" 43 | 44 | evaluate: True 45 | -------------------------------------------------------------------------------- /experiments/lavis/projects/albef/eval/ret_flickr30k_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_retrieval 8 | model_type: flickr 9 | 10 | datasets: 11 | flickr30k: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | image_size: 384 16 | text_processor: 17 | eval: 18 | name: "blip_caption" 19 | 20 | run: 21 | task: retrieval 22 | 23 | # dataloading 24 | num_workers: 4 25 | batch_size_train: 32 26 | batch_size_eval: 64 27 | 28 | test_splits: ["test"] 29 | 30 | # distribution 31 | device: "cuda" 32 | world_size: 1 33 | dist_url: "env://" 34 | distributed: True 35 | use_dist_eval_sampler: False 36 | 37 | # model specific 38 | k_test: 128 39 | 40 | # misc 41 | seed: 42 42 | output_dir: "output/ALBEF/Retrieval_Flickr30k" 43 | 44 | evaluate: True 45 | -------------------------------------------------------------------------------- /experiments/lavis/projects/albef/eval/snli_ve_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
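These project files bundle model, dataset, and run settings into one experiment description. They are plain OmegaConf YAML, so they can be inspected or overridden before being handed to a runner script (upstream LAVIS consumes them via evaluate.py --cfg-path). A minimal sketch; the path below is an assumption about where the file sits in a checkout of this repo:

from omegaconf import OmegaConf

cfg = OmegaConf.load("experiments/lavis/projects/albef/eval/nlvr_eval.yaml")
print(cfg.model.arch)            # albef_nlvr
print(cfg.run.task)              # multimodal_classification
print(cfg.run.batch_size_eval)   # 64

# dot-list overrides merge the same way runner scripts usually apply --options
cfg = OmegaConf.merge(cfg, OmegaConf.from_dotlist(["run.batch_size_eval=32"]))
print(OmegaConf.to_yaml(cfg.run))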
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_classification 8 | model_type: ve 9 | 10 | datasets: 11 | snli_ve: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | text_processor: 16 | eval: 17 | name: "blip_caption" 18 | 19 | run: 20 | task: multimodal_classification 21 | # optimization-specific 22 | batch_size_train: 32 23 | batch_size_eval: 64 24 | num_workers: 4 25 | 26 | seed: 42 27 | output_dir: "output/ALBEF/SNLI_VE" 28 | 29 | evaluate: True 30 | test_splits: ["val", "test"] 31 | 32 | # distribution-specific 33 | device: "cuda" 34 | world_size: 1 35 | dist_url: "env://" 36 | distributed: True 37 | -------------------------------------------------------------------------------- /experiments/lavis/projects/albef/eval/vqa_test.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_vqa 8 | model_type: vqav2 9 | 10 | image_size: 384 11 | 12 | 13 | datasets: 14 | coco_vqa: # name of the dataset builder 15 | vis_processor: 16 | eval: 17 | name: "blip_image_eval" 18 | image_size: 384 19 | text_processor: 20 | eval: 21 | name: "blip_question" 22 | 23 | run: 24 | task: vqa 25 | 26 | # optimization-specific 27 | batch_size_train: 16 28 | batch_size_eval: 64 29 | num_workers: 4 30 | 31 | # inference-specific 32 | max_len: 10 33 | min_len: 1 34 | num_beams: 3 35 | num_ans_candidates: 128 36 | inference_method: "rank" 37 | 38 | seed: 42 39 | output_dir: "output/ALBEF/VQA" 40 | 41 | evaluate: True 42 | train_splits: ["train"] 43 | test_splits: ["test"] 44 | 45 | # distribution-specific 46 | device: "cuda" 47 | world_size: 1 48 | dist_url: "env://" 49 | distributed: True 50 | -------------------------------------------------------------------------------- /experiments/lavis/projects/albef/eval/vqa_val.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_vqa 8 | model_type: vqav2 9 | 10 | image_size: 384 11 | 12 | datasets: 13 | coco_vqa: # name of the dataset builder 14 | type: eval 15 | vis_processor: 16 | eval: 17 | name: "blip_image_eval" 18 | image_size: 384 19 | text_processor: 20 | eval: 21 | name: "blip_question" 22 | 23 | run: 24 | task: vqa 25 | 26 | # optimization-specific 27 | batch_size_train: 16 28 | batch_size_eval: 64 29 | num_workers: 4 30 | 31 | # inference-specific 32 | max_len: 10 33 | min_len: 1 34 | num_beams: 3 35 | num_ans_candidates: 128 36 | inference_method: "rank" 37 | 38 | seed: 42 39 | output_dir: "output/ALBEF/VQA" 40 | 41 | evaluate: True 42 | test_splits: ["val"] 43 | 44 | # distribution-specific 45 | device: "cuda" 46 | world_size: 1 47 | dist_url: "env://" 48 | distributed: True 49 | -------------------------------------------------------------------------------- /experiments/lavis/projects/alpro/eval/didemo_ret_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_retrieval 8 | model_type: didemo 9 | 10 | max_txt_len: 50 11 | 12 | timesformer: 13 | n_frms: 8 14 | image_size: 224 15 | 16 | 17 | datasets: 18 | didemo_retrieval: # name of the dataset builder 19 | vis_processor: 20 | eval: 21 | name: "alpro_video_eval" 22 | n_frms: 8 23 | image_size: 224 24 | text_processor: 25 | eval: 26 | name: "blip_caption" 27 | 28 | run: 29 | task: retrieval 30 | # optimization-specific 31 | batch_size_train: 8 32 | batch_size_eval: 64 33 | num_workers: 4 34 | 35 | # k_test: 256 36 | k_test: 1000 37 | 38 | seed: 42 39 | output_dir: "output/ALPRO/didemo_retrieval" 40 | 41 | evaluate: True 42 | train_splits: ["train"] 43 | valid_splits: ["val", "test"] 44 | test_splits: ["test"] 45 | 46 | # distribution-specific 47 | device: "cuda" 48 | world_size: 1 49 | dist_url: "env://" 50 | distributed: True 51 | use_dist_eval_sampler: False 52 | -------------------------------------------------------------------------------- /experiments/lavis/projects/alpro/eval/msrvtt_qa_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_qa 8 | model_type: msrvtt 9 | 10 | datasets: 11 | msrvtt_qa: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "alpro_video_eval" 15 | n_frms: 16 16 | image_size: 224 17 | text_processor: 18 | eval: 19 | name: "blip_caption" 20 | 21 | run: 22 | task: multimodal_classification 23 | # optimization-specific 24 | batch_size_train: 32 25 | batch_size_eval: 64 26 | num_workers: 4 27 | 28 | seed: 42 29 | output_dir: "output/ALPRO/msrvtt_qa" 30 | 31 | evaluate: True 32 | valid_splits: ["val"] 33 | test_splits: ["test"] 34 | 35 | # distribution-specific 36 | device: "cuda" 37 | world_size: 1 38 | dist_url: "env://" 39 | distributed: True 40 | -------------------------------------------------------------------------------- /experiments/lavis/projects/alpro/eval/msrvtt_ret_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_retrieval 8 | model_type: msrvtt 9 | 10 | datasets: 11 | msrvtt_retrieval: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "alpro_video_eval" 15 | n_frms: 8 16 | image_size: 224 17 | text_processor: 18 | eval: 19 | name: "blip_caption" 20 | 21 | run: 22 | task: retrieval 23 | # optimization-specific 24 | batch_size_train: 24 25 | batch_size_eval: 64 26 | num_workers: 4 27 | 28 | # k_test: 256 29 | k_test: 1000 30 | 31 | seed: 42 32 | output_dir: "output/ALPRO/msrvtt_retrieval" 33 | 34 | evaluate: True 35 | test_splits: ["test"] 36 | 37 | # distribution-specific 38 | device: "cuda" 39 | world_size: 1 40 | dist_url: "env://" 41 | distributed: True 42 | use_dist_eval_sampler: False 43 | -------------------------------------------------------------------------------- /experiments/lavis/projects/alpro/eval/msvd_qa_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_qa 8 | model_type: msvd 9 | 10 | datasets: 11 | msvd_qa: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "alpro_video_eval" 15 | n_frms: 16 16 | image_size: 224 17 | text_processor: 18 | train: 19 | name: "blip_caption" 20 | eval: 21 | name: "blip_caption" 22 | 23 | run: 24 | task: multimodal_classification 25 | # optimization-specific 26 | batch_size_train: 24 27 | batch_size_eval: 64 28 | num_workers: 4 29 | 30 | seed: 42 31 | output_dir: "output/ALPRO/msvd_qa" 32 | 33 | evaluate: True 34 | test_splits: ["test"] 35 | 36 | # distribution-specific 37 | device: "cuda" 38 | world_size: 1 39 | dist_url: "env://" 40 | distributed: True 41 | -------------------------------------------------------------------------------- /experiments/lavis/projects/blip/eval/aokvqa_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | model_type: aokvqa 9 | image_size: 480 10 | 11 | datasets: 12 | aok_vqa: # name of the dataset builder 13 | vis_processor: 14 | eval: 15 | name: "blip_image_eval" 16 | image_size: 480 17 | text_processor: 18 | eval: 19 | name: "blip_question" 20 | 21 | run: 22 | task: aok_vqa 23 | # optimization-specific 24 | batch_size_train: 64 25 | batch_size_eval: 64 26 | num_workers: 4 27 | 28 | # inference-specific 29 | max_len: 10 30 | min_len: 1 31 | num_beams: 3 32 | num_ans_candidates: 128 33 | inference_method: "rank" 34 | 35 | seed: 42 36 | output_dir: "output/BLIP/AOKVQA" 37 | 38 | evaluate: True 39 | test_splits: ["val", "test"] 40 | 41 | # distribution-specific 42 | device: "cuda" 43 | world_size: 1 44 | dist_url: "env://" 45 | distributed: True 46 | -------------------------------------------------------------------------------- /experiments/lavis/projects/blip/eval/caption_coco_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | model_type: base_coco 9 | 10 | datasets: 11 | coco_caption: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | text_processor: 16 | eval: 17 | name: "blip_caption" 18 | 19 | run: 20 | # task: retrieval 21 | task: captioning 22 | # optimizer 23 | batch_size_train: 32 24 | batch_size_eval: 64 25 | num_workers: 4 26 | 27 | max_len: 20 28 | min_len: 5 29 | num_beams: 3 30 | 31 | seed: 42 32 | output_dir: "output/BLIP/Caption_coco" 33 | 34 | evaluate: True 35 | test_splits: ["test"] 36 | 37 | device: "cuda" 38 | world_size: 1 39 | dist_url: "env://" 40 | distributed: True 41 | -------------------------------------------------------------------------------- /experiments/lavis/projects/blip/eval/caption_coco_eval_large.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | model_type: large_coco 9 | 10 | datasets: 11 | coco_caption: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | text_processor: 16 | eval: 17 | name: "blip_caption" 18 | 19 | run: 20 | # task: retrieval 21 | task: captioning 22 | # optimizer 23 | batch_size_train: 32 24 | batch_size_eval: 64 25 | num_workers: 4 26 | 27 | max_len: 20 28 | min_len: 5 29 | num_beams: 3 30 | 31 | seed: 42 32 | output_dir: "output/BLIP/Caption_coco" 33 | 34 | evaluate: True 35 | test_splits: ["test"] 36 | 37 | device: "cuda" 38 | world_size: 1 39 | dist_url: "env://" 40 | distributed: True 41 | -------------------------------------------------------------------------------- /experiments/lavis/projects/blip/eval/nlvr_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_nlvr 8 | model_type: nlvr 9 | 10 | datasets: 11 | nlvr: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | image_size: 384 16 | text_processor: 17 | eval: 18 | name: "blip_caption" 19 | 20 | run: 21 | task: multimodal_classification 22 | 23 | batch_size_train: 16 24 | batch_size_eval: 64 25 | num_workers: 4 26 | 27 | seed: 42 28 | output_dir: "output/BLIP/NLVR" 29 | 30 | evaluate: True 31 | test_splits: ["val", "test"] 32 | 33 | # distribution-specific 34 | device: "cuda" 35 | world_size: 1 36 | dist_url: "env://" 37 | distributed: True 38 | -------------------------------------------------------------------------------- /experiments/lavis/projects/blip/eval/nocaps_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | model_type: base_coco 9 | # pretrained: 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth' 10 | 11 | datasets: 12 | nocaps: # name of the dataset builder 13 | vis_processor: 14 | eval: 15 | name: "blip_image_eval" 16 | image_size: 384 17 | text_processor: 18 | eval: 19 | name: "blip_caption" 20 | prompt: "a picture of " 21 | 22 | run: 23 | # task: retrieval 24 | task: captioning 25 | # optimizer 26 | batch_size_train: 32 27 | batch_size_eval: 64 28 | num_workers: 4 29 | 30 | max_len: 20 31 | min_len: 5 32 | num_beams: 3 33 | 34 | seed: 42 35 | output_dir: "output/BLIP/NoCaps" 36 | 37 | evaluate: True 38 | test_splits: ["val", "test"] 39 | 40 | device: "cuda" 41 | world_size: 1 42 | dist_url: "env://" 43 | distributed: True 44 | 45 | report_metric: False 46 | -------------------------------------------------------------------------------- /experiments/lavis/projects/blip/eval/okvqa_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | model_type: okvqa 9 | image_size: 480 10 | 11 | datasets: 12 | ok_vqa: # name of the dataset builder 13 | vis_processor: 14 | eval: 15 | name: "blip_image_eval" 16 | image_size: 480 17 | text_processor: 18 | eval: 19 | name: "blip_question" 20 | 21 | run: 22 | task: vqa 23 | # optimization-specific 24 | batch_size_train: 16 25 | batch_size_eval: 16 26 | num_workers: 4 27 | 28 | # inference-specific 29 | max_len: 10 30 | min_len: 1 31 | num_beams: 3 32 | num_ans_candidates: 128 33 | inference_method: "rank" 34 | 35 | seed: 42 36 | output_dir: "output/BLIP/OKVQA" 37 | 38 | evaluate: True 39 | test_splits: ["test"] 40 | 41 | # distribution-specific 42 | device: "cuda" 43 | world_size: 1 44 | dist_url: "env://" 45 | distributed: True 46 | -------------------------------------------------------------------------------- /experiments/lavis/projects/blip/eval/ret_coco_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_retrieval 8 | model_type: coco 9 | 10 | datasets: 11 | coco_retrieval: # name of the dataset builder 12 | vis_processor: 13 | train: 14 | name: "blip_image_train" 15 | image_size: 384 16 | eval: 17 | name: "blip_image_eval" 18 | image_size: 384 19 | text_processor: 20 | train: 21 | name: "blip_caption" 22 | eval: 23 | name: "blip_caption" 24 | 25 | run: 26 | task: retrieval 27 | 28 | # dataloading 29 | num_workers: 4 30 | batch_size_train: 32 31 | batch_size_eval: 128 32 | 33 | train_splits: ["train"] 34 | valid_splits: ["val"] 35 | test_splits: ["test"] 36 | 37 | # distribution 38 | device: "cuda" 39 | world_size: 1 40 | dist_url: "env://" 41 | distributed: True 42 | use_dist_eval_sampler: False 43 | 44 | # model specific 45 | k_test: 256 46 | 47 | # misc 48 | seed: 42 49 | output_dir: "output/BLIP/Retrieval_COCO" 50 | 51 | evaluate: True 52 | -------------------------------------------------------------------------------- /experiments/lavis/projects/blip/eval/ret_flickr_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_retrieval 8 | model_type: flickr 9 | 10 | datasets: 11 | flickr30k: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | image_size: 384 16 | text_processor: 17 | eval: 18 | name: "blip_caption" 19 | 20 | run: 21 | task: retrieval 22 | 23 | # dataloading 24 | num_workers: 4 25 | batch_size_train: 32 26 | batch_size_eval: 64 27 | 28 | test_splits: ["test"] 29 | 30 | # distribution 31 | device: "cuda" 32 | world_size: 1 33 | dist_url: "env://" 34 | distributed: True 35 | use_dist_eval_sampler: False 36 | 37 | # model specific 38 | k_test: 128 39 | 40 | # misc 41 | seed: 42 42 | output_dir: "output/Retrieval_Flickr30k" 43 | 44 | evaluate: True 45 | -------------------------------------------------------------------------------- /experiments/lavis/projects/blip/eval/vqav2_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | model_type: vqav2 9 | image_size: 480 10 | 11 | datasets: 12 | coco_vqa: # name of the dataset builder 13 | type: eval 14 | vis_processor: 15 | eval: 16 | name: "blip_image_eval" 17 | image_size: 480 18 | text_processor: 19 | eval: 20 | name: "blip_question" 21 | 22 | run: 23 | task: vqa 24 | # optimization-specific 25 | batch_size_train: 16 26 | batch_size_eval: 64 27 | num_workers: 4 28 | 29 | # inference-specific 30 | max_len: 10 31 | min_len: 1 32 | num_beams: 3 33 | num_ans_candidates: 128 34 | inference_method: "rank" 35 | 36 | seed: 42 37 | output_dir: "output/BLIP/VQA" 38 | 39 | evaluate: True 40 | test_splits: ["val"] 41 | 42 | # distribution-specific 43 | device: "cuda" 44 | world_size: 1 45 | dist_url: "env://" 46 | distributed: True 47 | -------------------------------------------------------------------------------- /experiments/lavis/projects/blip2/eval/caption_coco_flant5xl_eval.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/experiments/lavis/projects/blip2/eval/caption_coco_flant5xl_eval.yaml -------------------------------------------------------------------------------- /experiments/lavis/projects/blip2/eval/ret_flickr_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip2 8 | model_type: coco 9 | use_grad_checkpoint: False 10 | 11 | datasets: 12 | flickr30k: # name of the dataset builder 13 | vis_processor: 14 | eval: 15 | name: "blip_image_eval" 16 | image_size: 364 17 | text_processor: 18 | eval: 19 | name: "blip_caption" 20 | 21 | run: 22 | task: retrieval 23 | 24 | # dataloading 25 | num_workers: 4 26 | batch_size_train: 16 27 | batch_size_eval: 32 28 | 29 | test_splits: ["test"] 30 | 31 | # distribution 32 | device: "cuda" 33 | world_size: 1 34 | dist_url: "env://" 35 | distributed: True 36 | use_dist_eval_sampler: False 37 | 38 | # model specific 39 | k_test: 128 40 | 41 | # misc 42 | seed: 42 43 | output_dir: "output/BLIP2/Retrieval_Flickr30k" 44 | 45 | evaluate: True -------------------------------------------------------------------------------- /experiments/lavis/projects/clip/exp_coco_ret_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-L-14-336 10 | 11 | datasets: 12 | coco_retrieval: # name of the dataset builder 13 | vis_processor: 14 | train: 15 | name: "clip_image_train" 16 | image_size: 336 17 | eval: 18 | name: "clip_image_eval" 19 | image_size: 336 20 | text_processor: 21 | train: 22 | name: "blip_caption" 23 | eval: 24 | name: "blip_caption" 25 | 26 | run: 27 | task: retrieval 28 | 29 | # dataloading 30 | num_workers: 4 31 | batch_size_train: 32 32 | batch_size_eval: 128 33 | 34 | test_splits: ["test"] 35 | 36 | # distribution 37 | device: "cuda" 38 | world_size: 1 39 | dist_url: "env://" 40 | distributed: True 41 | use_dist_eval_sampler: True 42 | 43 | # misc 44 | seed: 42 45 | output_dir: "output/clip/Retrieval_COCO" 46 | 47 | evaluate: True 48 | -------------------------------------------------------------------------------- /experiments/lavis/projects/clip/exp_flickr_ret_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-L-14-336 10 | 11 | datasets: 12 | flickr30k: # name of the dataset builder 13 | vis_processor: 14 | train: 15 | name: "clip_image_train" 16 | image_size: 336 17 | eval: 18 | name: "clip_image_eval" 19 | image_size: 336 20 | text_processor: 21 | train: 22 | name: "blip_caption" 23 | eval: 24 | name: "blip_caption" 25 | 26 | run: 27 | task: retrieval 28 | 29 | # dataloading 30 | num_workers: 4 31 | batch_size_train: 32 32 | batch_size_eval: 128 33 | 34 | test_splits: ["test"] 35 | 36 | # distribution 37 | device: "cuda" 38 | world_size: 1 39 | dist_url: "env://" 40 | distributed: True 41 | use_dist_eval_sampler: True 42 | 43 | # misc 44 | seed: 42 45 | output_dir: "output/clip/Retrieval_Flickr" 46 | 47 | evaluate: True 48 | -------------------------------------------------------------------------------- /experiments/lavis/projects/clip/exp_imnet_zs_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-L-14-336 10 | 11 | datasets: 12 | imagenet: # name of the dataset builder 13 | vis_processor: 14 | eval: 15 | name: "clip_image_eval" 16 | # image_size: 224 17 | image_size: 336 18 | 19 | run: 20 | task: multimodal_classification 21 | 22 | # dataloading 23 | num_workers: 4 24 | batch_size_train: 32 25 | batch_size_eval: 128 26 | 27 | test_splits: ["val"] 28 | 29 | # distribution 30 | device: "cuda" 31 | world_size: 1 32 | dist_url: "env://" 33 | distributed: True 34 | 35 | # misc 36 | seed: 42 37 | output_dir: "output/clip/zs_imnet" 38 | 39 | evaluate: True 40 | -------------------------------------------------------------------------------- /experiments/lavis/projects/gpt/eval/dialogue_avsd_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: gpt_dialogue 8 | model_type: base 9 | 10 | datasets: 11 | avsd_dialogue: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "gpt_video_ft" 15 | visual_ft: ["i3d_flow", "i3d_rgb"] 16 | audio_ft: ["vggish"] 17 | text_processor: 18 | eval: 19 | name: "gpt_dialogue" 20 | max_turns: 3 21 | use_caption: True 22 | 23 | run: 24 | task: dialogue 25 | # optimizer 26 | batch_size_train: 16 27 | batch_size_eval: 16 28 | num_workers: 0 29 | 30 | max_len: 20 31 | min_len: 5 32 | num_beams: 5 33 | 34 | seed: 42 35 | output_dir: "output/gpt2/dialogue_avsd" 36 | 37 | evaluate: True 38 | valid_splits: ["test"] 39 | 40 | device: "cuda" 41 | world_size: 1 42 | dist_url: "env://" 43 | distributed: True 44 | -------------------------------------------------------------------------------- /experiments/lavis/runners/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.runners.runner_base import RunnerBase 9 | from lavis.runners.runner_iter import RunnerIter 10 | 11 | __all__ = ["RunnerBase", "RunnerIter"] 12 | -------------------------------------------------------------------------------- /experiments/lavis/tasks/image_text_pretrain.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.common.registry import registry 9 | from lavis.tasks.base_task import BaseTask 10 | 11 | 12 | @registry.register_task("image_text_pretrain") 13 | class ImageTextPretrainTask(BaseTask): 14 | def __init__(self): 15 | super().__init__() 16 | 17 | def evaluation(self, model, data_loader, cuda_enabled=True): 18 | pass 19 | -------------------------------------------------------------------------------- /experiments/lavis/tasks/text_to_image_generation.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.tasks import BaseTask 9 | from lavis.common.registry import registry 10 | 11 | 12 | @registry.register_task("text-to-image-generation") 13 | class TextToImageGenerationTask(BaseTask): 14 | def __init__(self, cfg): 15 | super().__init__() 16 | 17 | self.cfg = cfg 18 | 19 | @classmethod 20 | def setup_task(cls, cfg): 21 | run_cfg = cfg.run_cfg 22 | 23 | return cls(cfg=run_cfg) 24 | -------------------------------------------------------------------------------- /experiments/llava/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import LlavaLlamaForCausalLM 2 | -------------------------------------------------------------------------------- /experiments/llava/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "." 
5 | 6 | # Model Constants 7 | IGNORE_INDEX = -100 8 | IMAGE_TOKEN_INDEX = -200 9 | DEFAULT_IMAGE_TOKEN = "<image>" 10 | DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>" 11 | DEFAULT_IM_START_TOKEN = "<im_start>" 12 | DEFAULT_IM_END_TOKEN = "<im_end>" 13 | -------------------------------------------------------------------------------- /experiments/llava/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .language_model.llava_llama import LlavaLlamaForCausalLM, LlavaConfig 2 | from .language_model.llava_mpt import LlavaMPTForCausalLM, LlavaMPTConfig 3 | -------------------------------------------------------------------------------- /experiments/llava/model/consolidate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m llava.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from transformers import AutoTokenizer, AutoModelForCausalLM 9 | from llava.model import * 10 | from llava.model.utils import auto_upgrade 11 | 12 | 13 | def consolidate_ckpt(src_path, dst_path): 14 | print("Loading model") 15 | auto_upgrade(src_path) 16 | src_model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False) 18 | src_model.save_pretrained(dst_path) 19 | src_tokenizer.save_pretrained(dst_path) 20 | 21 | 22 | if __name__ == "__main__": 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--src", type=str, required=True) 25 | parser.add_argument("--dst", type=str, required=True) 26 | 27 | args = parser.parse_args() 28 | 29 | consolidate_ckpt(args.src, args.dst) 30 | -------------------------------------------------------------------------------- /experiments/llava/model/language_model/mpt/custom_embedding.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch import Tensor 5 | 6 | class SharedEmbedding(nn.Embedding): 7 | 8 | def forward(self, input: Tensor, unembed: bool=False) -> Tensor: 9 | if unembed: 10 | return F.linear(input, self.weight) 11 | return super().forward(input) -------------------------------------------------------------------------------- /experiments/llava/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .clip_encoder import CLIPVisionTower 3 | 4 | 5 | def build_vision_tower(vision_tower_cfg, **kwargs): 6 | vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None)) 7 | is_absolute_path_exists = os.path.exists(vision_tower) 8 | if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion"): 9 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 10 | 11 | raise ValueError(f'Unknown vision tower: {vision_tower}') 12 | -------------------------------------------------------------------------------- /experiments/llava/model/utils.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoConfig 2 | 3 | 4 | def auto_upgrade(config): 5 | cfg = AutoConfig.from_pretrained(config) 6 | if 'llava' in config and 'llava' not in cfg.model_type: 7 | assert cfg.model_type == 'llama' 8 | print("You are using newer LLaVA code 
base, while the checkpoint of v0 is from older code base.") 9 | print("You must upgrade the checkpoint to the new code base (this can be done automatically).") 10 | confirm = input("Please confirm that you want to upgrade the checkpoint. [Y/N]") 11 | if confirm.lower() in ["y", "yes"]: 12 | print("Upgrading checkpoint...") 13 | assert len(cfg.architectures) == 1 14 | setattr(cfg.__class__, "model_type", "llava") 15 | cfg.architectures[0] = 'LlavaLlamaForCausalLM' 16 | cfg.save_pretrained(config) 17 | print("Checkpoint upgraded.") 18 | else: 19 | print("Checkpoint upgrade aborted.") 20 | exit(1) 21 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torchvision==0.15.2 2 | transformers==4.31.0 3 | torch==2.0.1 4 | tokenizers>=0.12.1,<0.14 5 | shortuuid 6 | accelerate==0.21.0 7 | peft==0.4.0 8 | bitsandbytes==0.41.0 9 | scikit-learn==1.2.2 10 | gradio==3.35.2 11 | gradio_client==0.2.9 12 | httpx==0.24.0 13 | numpy 14 | requests 15 | uvicorn 16 | fastapi 17 | einops 18 | einops-exts 19 | timm 20 | contexttimer 21 | decord 22 | diffusers 23 | fairscale 24 | ftfy 25 | iopath 26 | ipython 27 | omegaconf 28 | opencv-python 29 | opendatasets 30 | packaging 31 | pandas 32 | plotly 33 | pre-commit 34 | pycocoevalcap 35 | pycocotools 36 | python-magic 37 | scikit-image 38 | sentencepiece 39 | spacy 40 | streamlit 41 | tqdm 42 | webdataset 43 | wheel 44 | torchaudio 45 | soundfile 46 | moviepy 47 | nltk -------------------------------------------------------------------------------- /utils/logger.py: -------------------------------------------------------------------------------- 1 | import torch.distributed as dist 2 | import logging 3 | 4 | 5 | def create_logger(logging_dir): 6 | """ 7 | Create a logger that writes to a log file and stdout. 8 | """ 9 | if dist.get_rank() == 0: # real logger 10 | logging.basicConfig( 11 | level=logging.INFO, 12 | format="[\033[34m%(asctime)s\033[0m] %(message)s", 13 | datefmt="%Y-%m-%d %H:%M:%S", 14 | handlers=[logging.StreamHandler(), logging.FileHandler(f"{logging_dir}/log.txt")], 15 | ) 16 | logger = logging.getLogger(__name__) 17 | else: # dummy logger (does nothing) 18 | logger = logging.getLogger(__name__) 19 | logger.addHandler(logging.NullHandler()) 20 | return logger --------------------------------------------------------------------------------
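A minimal usage sketch for utils/logger.py above (illustrative only, not a file in this repository): create_logger() calls dist.get_rank(), so torch.distributed must be initialized before it runs, and the target directory must already exist for the FileHandler. The single-process gloo process group and the ./logs path below are assumptions for illustration.

import os
import torch.distributed as dist

from utils.logger import create_logger  # assumes the repo root is on PYTHONPATH

# create_logger() queries dist.get_rank(), so a process group must exist first;
# a trivial single-process group is enough for a local run.
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
dist.init_process_group(backend="gloo", rank=0, world_size=1)

os.makedirs("logs", exist_ok=True)   # the FileHandler writes logs/log.txt and needs the directory
logger = create_logger("logs")       # rank 0 gets the real file+stdout logger, other ranks a no-op logger
logger.info("logger ready on rank %d", dist.get_rank())

dist.destroy_process_group()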