├── .gitignore
├── LICENSE
├── README.md
├── assets
├── amber.png
├── amber_discriminative.png
├── eyes_forest.png
├── llava_bench.png
├── mme-fullset.png
├── mme-hallucination.png
├── motivation.png
├── observation.png
├── overview.png
├── pope.png
├── qualitative_amber_instructblip.png
├── qualitative_amber_instructblip2.png
├── qualitative_amber_llava.png
├── qualitative_amber_llava2.png
├── qualitative_mme2.png
├── qualitative_mme_instructblip.png
├── qualitative_mme_llava.png
├── qualitative_pope.png
└── qualitative_pope2.png
├── avisc_utils
├── avisc_sample.py
└── vcd_add_noise.py
├── eval_bench
├── SimSun.ttf
├── amber_eval_instructblip.py
├── amber_eval_llava.py
├── amber_loader.py
├── llava_bench_llava.py
├── pope_eval_instructblipb.py
├── pope_eval_llavab.py
├── pope_loader.py
└── scripts
│ ├── amber_eval.sh
│ ├── llava_bench_eval.sh
│ └── pope_eval_batch.sh
├── experiments
├── AMBER
│ ├── LICENSE
│ ├── README.md
│ ├── README_File
│ │ ├── Paper-Arxiv-orange.svg
│ │ ├── comparison.jpg
│ │ ├── intro.jpg
│ │ ├── result.jpg
│ │ └── statistics.jpg
│ ├── data
│ │ ├── annotations.json
│ │ ├── metrics.txt
│ │ ├── query
│ │ │ ├── query_all.json
│ │ │ ├── query_discriminative-attribute.json
│ │ │ ├── query_discriminative-existence.json
│ │ │ ├── query_discriminative-relation.json
│ │ │ ├── query_discriminative.json
│ │ │ └── query_generative.json
│ │ ├── relation.json
│ │ └── safe_words.txt
│ └── inference.py
├── cd_scripts
│ └── mme_eval.sh
├── eval
│ ├── calculation.py
│ ├── convert_answer_to_mme.py
│ ├── eval_mme.py
│ ├── eval_mme
│ │ ├── .DS_Store
│ │ ├── LaVIN
│ │ │ ├── OCR.txt
│ │ │ ├── artwork.txt
│ │ │ ├── celebrity.txt
│ │ │ ├── code_reasoning.txt
│ │ │ ├── color.txt
│ │ │ ├── commonsense_reasoning.txt
│ │ │ ├── count.txt
│ │ │ ├── existence.txt
│ │ │ ├── landmark.txt
│ │ │ ├── numerical_calculation.txt
│ │ │ ├── position.txt
│ │ │ ├── posters.txt
│ │ │ ├── scene.txt
│ │ │ └── text_translation.txt
│ │ ├── Your_Results
│ │ │ ├── OCR.txt
│ │ │ ├── artwork.txt
│ │ │ ├── celebrity.txt
│ │ │ ├── code_reasoning.txt
│ │ │ ├── color.txt
│ │ │ ├── commonsense_reasoning.txt
│ │ │ ├── count.txt
│ │ │ ├── existence.txt
│ │ │ ├── landmark.txt
│ │ │ ├── numerical_calculation.txt
│ │ │ ├── position.txt
│ │ │ ├── posters.txt
│ │ │ ├── scene.txt
│ │ │ └── text_translation.txt
│ │ └── readme.txt
│ ├── eval_pope.py
│ ├── mme_instructblip.py
│ ├── mme_llava.py
│ ├── object_hallucination_vqa_instructblip.py
│ └── object_hallucination_vqa_llava.py
├── lavis
│ ├── __init__.py
│ ├── common
│ │ ├── annotator
│ │ │ ├── canny
│ │ │ │ └── __init__.py
│ │ │ ├── ckpts
│ │ │ │ └── download.sh
│ │ │ ├── hed
│ │ │ │ └── __init__.py
│ │ │ ├── midas
│ │ │ │ ├── __init__.py
│ │ │ │ ├── api.py
│ │ │ │ ├── midas
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── base_model.py
│ │ │ │ │ ├── blocks.py
│ │ │ │ │ ├── dpt_depth.py
│ │ │ │ │ ├── midas_net.py
│ │ │ │ │ ├── midas_net_custom.py
│ │ │ │ │ ├── transforms.py
│ │ │ │ │ └── vit.py
│ │ │ │ └── utils.py
│ │ │ ├── mlsd
│ │ │ │ ├── __init__.py
│ │ │ │ ├── models
│ │ │ │ │ ├── mbv2_mlsd_large.py
│ │ │ │ │ └── mbv2_mlsd_tiny.py
│ │ │ │ └── utils.py
│ │ │ ├── openpose
│ │ │ │ ├── __init__.py
│ │ │ │ ├── body.py
│ │ │ │ ├── hand.py
│ │ │ │ ├── model.py
│ │ │ │ └── util.py
│ │ │ ├── uniformer
│ │ │ │ ├── __init__.py
│ │ │ │ ├── configs
│ │ │ │ │ └── _base_
│ │ │ │ │ │ ├── datasets
│ │ │ │ │ │ ├── ade20k.py
│ │ │ │ │ │ ├── chase_db1.py
│ │ │ │ │ │ ├── cityscapes.py
│ │ │ │ │ │ ├── cityscapes_769x769.py
│ │ │ │ │ │ ├── drive.py
│ │ │ │ │ │ ├── hrf.py
│ │ │ │ │ │ ├── pascal_context.py
│ │ │ │ │ │ ├── pascal_context_59.py
│ │ │ │ │ │ ├── pascal_voc12.py
│ │ │ │ │ │ ├── pascal_voc12_aug.py
│ │ │ │ │ │ └── stare.py
│ │ │ │ │ │ ├── default_runtime.py
│ │ │ │ │ │ ├── models
│ │ │ │ │ │ ├── ann_r50-d8.py
│ │ │ │ │ │ ├── apcnet_r50-d8.py
│ │ │ │ │ │ ├── ccnet_r50-d8.py
│ │ │ │ │ │ ├── cgnet.py
│ │ │ │ │ │ ├── danet_r50-d8.py
│ │ │ │ │ │ ├── deeplabv3_r50-d8.py
│ │ │ │ │ │ ├── deeplabv3_unet_s5-d16.py
│ │ │ │ │ │ ├── deeplabv3plus_r50-d8.py
│ │ │ │ │ │ ├── dmnet_r50-d8.py
│ │ │ │ │ │ ├── dnl_r50-d8.py
│ │ │ │ │ │ ├── emanet_r50-d8.py
│ │ │ │ │ │ ├── encnet_r50-d8.py
│ │ │ │ │ │ ├── fast_scnn.py
│ │ │ │ │ │ ├── fcn_hr18.py
│ │ │ │ │ │ ├── fcn_r50-d8.py
│ │ │ │ │ │ ├── fcn_unet_s5-d16.py
│ │ │ │ │ │ ├── fpn_r50.py
│ │ │ │ │ │ ├── fpn_uniformer.py
│ │ │ │ │ │ ├── gcnet_r50-d8.py
│ │ │ │ │ │ ├── lraspp_m-v3-d8.py
│ │ │ │ │ │ ├── nonlocal_r50-d8.py
│ │ │ │ │ │ ├── ocrnet_hr18.py
│ │ │ │ │ │ ├── ocrnet_r50-d8.py
│ │ │ │ │ │ ├── pointrend_r50.py
│ │ │ │ │ │ ├── psanet_r50-d8.py
│ │ │ │ │ │ ├── pspnet_r50-d8.py
│ │ │ │ │ │ ├── pspnet_unet_s5-d16.py
│ │ │ │ │ │ ├── upernet_r50.py
│ │ │ │ │ │ └── upernet_uniformer.py
│ │ │ │ │ │ └── schedules
│ │ │ │ │ │ ├── schedule_160k.py
│ │ │ │ │ │ ├── schedule_20k.py
│ │ │ │ │ │ ├── schedule_40k.py
│ │ │ │ │ │ └── schedule_80k.py
│ │ │ │ ├── exp
│ │ │ │ │ └── upernet_global_small
│ │ │ │ │ │ ├── config.py
│ │ │ │ │ │ ├── run.sh
│ │ │ │ │ │ ├── test.sh
│ │ │ │ │ │ ├── test_config_g.py
│ │ │ │ │ │ ├── test_config_h32.py
│ │ │ │ │ │ └── test_config_w32.py
│ │ │ │ ├── mmcv
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── arraymisc
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── quantization.py
│ │ │ │ │ ├── cnn
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── alexnet.py
│ │ │ │ │ │ ├── bricks
│ │ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ │ ├── activation.py
│ │ │ │ │ │ │ ├── context_block.py
│ │ │ │ │ │ │ ├── conv.py
│ │ │ │ │ │ │ ├── conv2d_adaptive_padding.py
│ │ │ │ │ │ │ ├── conv_module.py
│ │ │ │ │ │ │ ├── conv_ws.py
│ │ │ │ │ │ │ ├── depthwise_separable_conv_module.py
│ │ │ │ │ │ │ ├── drop.py
│ │ │ │ │ │ │ ├── generalized_attention.py
│ │ │ │ │ │ │ ├── hsigmoid.py
│ │ │ │ │ │ │ ├── hswish.py
│ │ │ │ │ │ │ ├── non_local.py
│ │ │ │ │ │ │ ├── norm.py
│ │ │ │ │ │ │ ├── padding.py
│ │ │ │ │ │ │ ├── plugin.py
│ │ │ │ │ │ │ ├── registry.py
│ │ │ │ │ │ │ ├── scale.py
│ │ │ │ │ │ │ ├── swish.py
│ │ │ │ │ │ │ ├── transformer.py
│ │ │ │ │ │ │ ├── upsample.py
│ │ │ │ │ │ │ └── wrappers.py
│ │ │ │ │ │ ├── builder.py
│ │ │ │ │ │ ├── resnet.py
│ │ │ │ │ │ ├── utils
│ │ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ │ ├── flops_counter.py
│ │ │ │ │ │ │ ├── fuse_conv_bn.py
│ │ │ │ │ │ │ ├── sync_bn.py
│ │ │ │ │ │ │ └── weight_init.py
│ │ │ │ │ │ └── vgg.py
│ │ │ │ │ ├── engine
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── test.py
│ │ │ │ │ ├── fileio
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── file_client.py
│ │ │ │ │ │ ├── handlers
│ │ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ │ ├── base.py
│ │ │ │ │ │ │ ├── json_handler.py
│ │ │ │ │ │ │ ├── pickle_handler.py
│ │ │ │ │ │ │ └── yaml_handler.py
│ │ │ │ │ │ ├── io.py
│ │ │ │ │ │ └── parse.py
│ │ │ │ │ ├── image
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── colorspace.py
│ │ │ │ │ │ ├── geometric.py
│ │ │ │ │ │ ├── io.py
│ │ │ │ │ │ ├── misc.py
│ │ │ │ │ │ └── photometric.py
│ │ │ │ │ ├── model_zoo
│ │ │ │ │ │ ├── deprecated.json
│ │ │ │ │ │ ├── mmcls.json
│ │ │ │ │ │ └── open_mmlab.json
│ │ │ │ │ ├── ops
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── assign_score_withk.py
│ │ │ │ │ │ ├── ball_query.py
│ │ │ │ │ │ ├── bbox.py
│ │ │ │ │ │ ├── border_align.py
│ │ │ │ │ │ ├── box_iou_rotated.py
│ │ │ │ │ │ ├── carafe.py
│ │ │ │ │ │ ├── cc_attention.py
│ │ │ │ │ │ ├── contour_expand.py
│ │ │ │ │ │ ├── corner_pool.py
│ │ │ │ │ │ ├── correlation.py
│ │ │ │ │ │ ├── deform_conv.py
│ │ │ │ │ │ ├── deform_roi_pool.py
│ │ │ │ │ │ ├── deprecated_wrappers.py
│ │ │ │ │ │ ├── focal_loss.py
│ │ │ │ │ │ ├── furthest_point_sample.py
│ │ │ │ │ │ ├── fused_bias_leakyrelu.py
│ │ │ │ │ │ ├── gather_points.py
│ │ │ │ │ │ ├── group_points.py
│ │ │ │ │ │ ├── info.py
│ │ │ │ │ │ ├── iou3d.py
│ │ │ │ │ │ ├── knn.py
│ │ │ │ │ │ ├── masked_conv.py
│ │ │ │ │ │ ├── merge_cells.py
│ │ │ │ │ │ ├── modulated_deform_conv.py
│ │ │ │ │ │ ├── multi_scale_deform_attn.py
│ │ │ │ │ │ ├── nms.py
│ │ │ │ │ │ ├── pixel_group.py
│ │ │ │ │ │ ├── point_sample.py
│ │ │ │ │ │ ├── points_in_boxes.py
│ │ │ │ │ │ ├── points_sampler.py
│ │ │ │ │ │ ├── psa_mask.py
│ │ │ │ │ │ ├── roi_align.py
│ │ │ │ │ │ ├── roi_align_rotated.py
│ │ │ │ │ │ ├── roi_pool.py
│ │ │ │ │ │ ├── roiaware_pool3d.py
│ │ │ │ │ │ ├── roipoint_pool3d.py
│ │ │ │ │ │ ├── saconv.py
│ │ │ │ │ │ ├── scatter_points.py
│ │ │ │ │ │ ├── sync_bn.py
│ │ │ │ │ │ ├── three_interpolate.py
│ │ │ │ │ │ ├── three_nn.py
│ │ │ │ │ │ ├── tin_shift.py
│ │ │ │ │ │ ├── upfirdn2d.py
│ │ │ │ │ │ └── voxelize.py
│ │ │ │ │ ├── parallel
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── _functions.py
│ │ │ │ │ │ ├── collate.py
│ │ │ │ │ │ ├── data_container.py
│ │ │ │ │ │ ├── data_parallel.py
│ │ │ │ │ │ ├── distributed.py
│ │ │ │ │ │ ├── distributed_deprecated.py
│ │ │ │ │ │ ├── registry.py
│ │ │ │ │ │ ├── scatter_gather.py
│ │ │ │ │ │ └── utils.py
│ │ │ │ │ ├── runner
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── base_module.py
│ │ │ │ │ │ ├── base_runner.py
│ │ │ │ │ │ ├── builder.py
│ │ │ │ │ │ ├── checkpoint.py
│ │ │ │ │ │ ├── default_constructor.py
│ │ │ │ │ │ ├── dist_utils.py
│ │ │ │ │ │ ├── epoch_based_runner.py
│ │ │ │ │ │ ├── fp16_utils.py
│ │ │ │ │ │ ├── hooks
│ │ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ │ ├── checkpoint.py
│ │ │ │ │ │ │ ├── closure.py
│ │ │ │ │ │ │ ├── ema.py
│ │ │ │ │ │ │ ├── evaluation.py
│ │ │ │ │ │ │ ├── hook.py
│ │ │ │ │ │ │ ├── iter_timer.py
│ │ │ │ │ │ │ ├── logger
│ │ │ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ │ │ ├── base.py
│ │ │ │ │ │ │ │ ├── dvclive.py
│ │ │ │ │ │ │ │ ├── mlflow.py
│ │ │ │ │ │ │ │ ├── neptune.py
│ │ │ │ │ │ │ │ ├── pavi.py
│ │ │ │ │ │ │ │ ├── tensorboard.py
│ │ │ │ │ │ │ │ ├── text.py
│ │ │ │ │ │ │ │ └── wandb.py
│ │ │ │ │ │ │ ├── lr_updater.py
│ │ │ │ │ │ │ ├── memory.py
│ │ │ │ │ │ │ ├── momentum_updater.py
│ │ │ │ │ │ │ ├── optimizer.py
│ │ │ │ │ │ │ ├── profiler.py
│ │ │ │ │ │ │ ├── sampler_seed.py
│ │ │ │ │ │ │ └── sync_buffer.py
│ │ │ │ │ │ ├── iter_based_runner.py
│ │ │ │ │ │ ├── log_buffer.py
│ │ │ │ │ │ ├── optimizer
│ │ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ │ ├── builder.py
│ │ │ │ │ │ │ └── default_constructor.py
│ │ │ │ │ │ ├── priority.py
│ │ │ │ │ │ └── utils.py
│ │ │ │ │ ├── utils
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── config.py
│ │ │ │ │ │ ├── env.py
│ │ │ │ │ │ ├── ext_loader.py
│ │ │ │ │ │ ├── logging.py
│ │ │ │ │ │ ├── misc.py
│ │ │ │ │ │ ├── parrots_jit.py
│ │ │ │ │ │ ├── parrots_wrapper.py
│ │ │ │ │ │ ├── path.py
│ │ │ │ │ │ ├── progressbar.py
│ │ │ │ │ │ ├── registry.py
│ │ │ │ │ │ ├── testing.py
│ │ │ │ │ │ ├── timer.py
│ │ │ │ │ │ ├── trace.py
│ │ │ │ │ │ └── version_utils.py
│ │ │ │ │ ├── version.py
│ │ │ │ │ ├── video
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── io.py
│ │ │ │ │ │ ├── optflow.py
│ │ │ │ │ │ └── processing.py
│ │ │ │ │ └── visualization
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── color.py
│ │ │ │ │ │ ├── image.py
│ │ │ │ │ │ └── optflow.py
│ │ │ │ ├── mmcv_custom
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ └── checkpoint.py
│ │ │ │ └── mmseg
│ │ │ │ │ ├── apis
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── inference.py
│ │ │ │ │ ├── test.py
│ │ │ │ │ └── train.py
│ │ │ │ │ ├── core
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── evaluation
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── class_names.py
│ │ │ │ │ │ ├── eval_hooks.py
│ │ │ │ │ │ └── metrics.py
│ │ │ │ │ ├── seg
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── builder.py
│ │ │ │ │ │ └── sampler
│ │ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ │ ├── base_pixel_sampler.py
│ │ │ │ │ │ │ └── ohem_pixel_sampler.py
│ │ │ │ │ └── utils
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ └── misc.py
│ │ │ │ │ ├── datasets
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── ade.py
│ │ │ │ │ ├── builder.py
│ │ │ │ │ ├── chase_db1.py
│ │ │ │ │ ├── cityscapes.py
│ │ │ │ │ ├── custom.py
│ │ │ │ │ ├── dataset_wrappers.py
│ │ │ │ │ ├── drive.py
│ │ │ │ │ ├── hrf.py
│ │ │ │ │ ├── pascal_context.py
│ │ │ │ │ ├── pipelines
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── compose.py
│ │ │ │ │ │ ├── formating.py
│ │ │ │ │ │ ├── loading.py
│ │ │ │ │ │ ├── test_time_aug.py
│ │ │ │ │ │ └── transforms.py
│ │ │ │ │ ├── stare.py
│ │ │ │ │ └── voc.py
│ │ │ │ │ ├── models
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── backbones
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── cgnet.py
│ │ │ │ │ │ ├── fast_scnn.py
│ │ │ │ │ │ ├── hrnet.py
│ │ │ │ │ │ ├── mobilenet_v2.py
│ │ │ │ │ │ ├── mobilenet_v3.py
│ │ │ │ │ │ ├── resnest.py
│ │ │ │ │ │ ├── resnet.py
│ │ │ │ │ │ ├── resnext.py
│ │ │ │ │ │ ├── unet.py
│ │ │ │ │ │ ├── uniformer.py
│ │ │ │ │ │ └── vit.py
│ │ │ │ │ ├── builder.py
│ │ │ │ │ ├── decode_heads
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── ann_head.py
│ │ │ │ │ │ ├── apc_head.py
│ │ │ │ │ │ ├── aspp_head.py
│ │ │ │ │ │ ├── cascade_decode_head.py
│ │ │ │ │ │ ├── cc_head.py
│ │ │ │ │ │ ├── da_head.py
│ │ │ │ │ │ ├── decode_head.py
│ │ │ │ │ │ ├── dm_head.py
│ │ │ │ │ │ ├── dnl_head.py
│ │ │ │ │ │ ├── ema_head.py
│ │ │ │ │ │ ├── enc_head.py
│ │ │ │ │ │ ├── fcn_head.py
│ │ │ │ │ │ ├── fpn_head.py
│ │ │ │ │ │ ├── gc_head.py
│ │ │ │ │ │ ├── lraspp_head.py
│ │ │ │ │ │ ├── nl_head.py
│ │ │ │ │ │ ├── ocr_head.py
│ │ │ │ │ │ ├── point_head.py
│ │ │ │ │ │ ├── psa_head.py
│ │ │ │ │ │ ├── psp_head.py
│ │ │ │ │ │ ├── sep_aspp_head.py
│ │ │ │ │ │ ├── sep_fcn_head.py
│ │ │ │ │ │ └── uper_head.py
│ │ │ │ │ ├── losses
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── accuracy.py
│ │ │ │ │ │ ├── cross_entropy_loss.py
│ │ │ │ │ │ ├── dice_loss.py
│ │ │ │ │ │ ├── lovasz_loss.py
│ │ │ │ │ │ └── utils.py
│ │ │ │ │ ├── necks
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── fpn.py
│ │ │ │ │ │ └── multilevel_neck.py
│ │ │ │ │ ├── segmentors
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── base.py
│ │ │ │ │ │ ├── cascade_encoder_decoder.py
│ │ │ │ │ │ └── encoder_decoder.py
│ │ │ │ │ └── utils
│ │ │ │ │ │ ├── __init__.py
│ │ │ │ │ │ ├── drop.py
│ │ │ │ │ │ ├── inverted_residual.py
│ │ │ │ │ │ ├── make_divisible.py
│ │ │ │ │ │ ├── res_layer.py
│ │ │ │ │ │ ├── se_layer.py
│ │ │ │ │ │ ├── self_attention_block.py
│ │ │ │ │ │ ├── up_conv_block.py
│ │ │ │ │ │ └── weight_init.py
│ │ │ │ │ ├── ops
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── encoding.py
│ │ │ │ │ └── wrappers.py
│ │ │ │ │ └── utils
│ │ │ │ │ ├── __init__.py
│ │ │ │ │ ├── collect_env.py
│ │ │ │ │ └── logger.py
│ │ │ └── util.py
│ │ ├── config.py
│ │ ├── dist_utils.py
│ │ ├── gradcam.py
│ │ ├── logger.py
│ │ ├── optims.py
│ │ ├── registry.py
│ │ ├── utils.py
│ │ └── vqa_tools
│ │ │ ├── __init__.py
│ │ │ ├── vqa.py
│ │ │ └── vqa_eval.py
│ ├── configs
│ │ ├── datasets
│ │ │ ├── aokvqa
│ │ │ │ └── defaults.yaml
│ │ │ ├── avsd
│ │ │ │ └── defaults_dial.yaml
│ │ │ ├── blip_diffusion_datasets
│ │ │ │ └── defaults.yaml
│ │ │ ├── coco
│ │ │ │ ├── defaults_cap.yaml
│ │ │ │ ├── defaults_ret.yaml
│ │ │ │ ├── defaults_vqa.yaml
│ │ │ │ └── eval_vqa.yaml
│ │ │ ├── conceptual_caption
│ │ │ │ ├── defaults_12m.yaml
│ │ │ │ └── defaults_3m.yaml
│ │ │ ├── didemo
│ │ │ │ └── defaults_ret.yaml
│ │ │ ├── flickr30k
│ │ │ │ └── defaults.yaml
│ │ │ ├── gqa
│ │ │ │ ├── balanced_testdev.yaml
│ │ │ │ ├── balanced_val.yaml
│ │ │ │ └── defaults.yaml
│ │ │ ├── imagenet
│ │ │ │ └── defaults.yaml
│ │ │ ├── laion
│ │ │ │ └── defaults_2B_multi.yaml
│ │ │ ├── msrvtt
│ │ │ │ ├── defaults_cap.yaml
│ │ │ │ ├── defaults_qa.yaml
│ │ │ │ └── defaults_ret.yaml
│ │ │ ├── msvd
│ │ │ │ ├── defaults_cap.yaml
│ │ │ │ └── defaults_qa.yaml
│ │ │ ├── nlvr
│ │ │ │ └── defaults.yaml
│ │ │ ├── nocaps
│ │ │ │ └── defaults.yaml
│ │ │ ├── okvqa
│ │ │ │ └── defaults.yaml
│ │ │ ├── sbu_caption
│ │ │ │ └── defaults.yaml
│ │ │ ├── snli_ve
│ │ │ │ └── defaults.yaml
│ │ │ ├── vatex
│ │ │ │ └── defaults_cap.yaml
│ │ │ └── vg
│ │ │ │ ├── defaults_caption.yaml
│ │ │ │ └── defaults_vqa.yaml
│ │ ├── default.yaml
│ │ └── models
│ │ │ ├── albef_classification_ve.yaml
│ │ │ ├── albef_feature_extractor.yaml
│ │ │ ├── albef_nlvr.yaml
│ │ │ ├── albef_pretrain_base.yaml
│ │ │ ├── albef_retrieval_coco.yaml
│ │ │ ├── albef_retrieval_flickr.yaml
│ │ │ ├── albef_vqav2.yaml
│ │ │ ├── alpro_qa_msrvtt.yaml
│ │ │ ├── alpro_qa_msvd.yaml
│ │ │ ├── alpro_retrieval_didemo.yaml
│ │ │ ├── alpro_retrieval_msrvtt.yaml
│ │ │ ├── bert_config.json
│ │ │ ├── bert_config_alpro.json
│ │ │ ├── blip-diffusion
│ │ │ ├── blip_diffusion_base.yaml
│ │ │ ├── blip_diffusion_controlnet_canny.yaml
│ │ │ ├── blip_diffusion_controlnet_depth.yaml
│ │ │ └── blip_diffusion_controlnet_hed.yaml
│ │ │ ├── blip2
│ │ │ ├── blip2_caption_flant5xl.yaml
│ │ │ ├── blip2_caption_opt2.7b.yaml
│ │ │ ├── blip2_caption_opt6.7b.yaml
│ │ │ ├── blip2_coco.yaml
│ │ │ ├── blip2_instruct_flant5xl.yaml
│ │ │ ├── blip2_instruct_flant5xxl.yaml
│ │ │ ├── blip2_instruct_vicuna13b.yaml
│ │ │ ├── blip2_instruct_vicuna7b.yaml
│ │ │ ├── blip2_pretrain.yaml
│ │ │ ├── blip2_pretrain_flant5xl.yaml
│ │ │ ├── blip2_pretrain_flant5xl_vitL.yaml
│ │ │ ├── blip2_pretrain_flant5xxl.yaml
│ │ │ ├── blip2_pretrain_llama7b.yaml
│ │ │ ├── blip2_pretrain_opt2.7b.yaml
│ │ │ ├── blip2_pretrain_opt6.7b.yaml
│ │ │ └── blip2_pretrain_vitL.yaml
│ │ │ ├── blip_caption_base_coco.yaml
│ │ │ ├── blip_caption_large_coco.yaml
│ │ │ ├── blip_classification_base.yaml
│ │ │ ├── blip_feature_extractor_base.yaml
│ │ │ ├── blip_itm_base.yaml
│ │ │ ├── blip_itm_large.yaml
│ │ │ ├── blip_nlvr.yaml
│ │ │ ├── blip_pretrain_base.yaml
│ │ │ ├── blip_pretrain_large.yaml
│ │ │ ├── blip_retrieval_coco.yaml
│ │ │ ├── blip_retrieval_flickr.yaml
│ │ │ ├── blip_vqa_aokvqa.yaml
│ │ │ ├── blip_vqa_okvqa.yaml
│ │ │ ├── blip_vqav2.yaml
│ │ │ ├── clip
│ │ │ ├── RN101-quickgelu.json
│ │ │ ├── RN101.json
│ │ │ ├── RN50-quickgelu.json
│ │ │ ├── RN50.json
│ │ │ ├── RN50x16.json
│ │ │ ├── RN50x4.json
│ │ │ ├── ViT-B-16-plus-240.json
│ │ │ ├── ViT-B-16-plus.json
│ │ │ ├── ViT-B-16.json
│ │ │ ├── ViT-B-32-plus-256.json
│ │ │ ├── ViT-B-32-quickgelu.json
│ │ │ ├── ViT-B-32.json
│ │ │ ├── ViT-H-14.json
│ │ │ ├── ViT-H-16.json
│ │ │ ├── ViT-L-14-280.json
│ │ │ ├── ViT-L-14-336.json
│ │ │ ├── ViT-L-14.json
│ │ │ ├── ViT-L-16-320.json
│ │ │ ├── ViT-L-16.json
│ │ │ ├── ViT-g-14.json
│ │ │ ├── timm-efficientnetv2_rw_s.json
│ │ │ ├── timm-resnet50d.json
│ │ │ ├── timm-resnetaa50d.json
│ │ │ ├── timm-resnetblur50.json
│ │ │ ├── timm-swin_base_patch4_window7_224.json
│ │ │ ├── timm-vit_base_patch16_224.json
│ │ │ ├── timm-vit_base_patch32_224.json
│ │ │ └── timm-vit_small_patch16_224.json
│ │ │ ├── clip_resnet50.yaml
│ │ │ ├── clip_vit_base16.yaml
│ │ │ ├── clip_vit_base32.yaml
│ │ │ ├── clip_vit_large14.yaml
│ │ │ ├── clip_vit_large14_336.yaml
│ │ │ ├── gpt_dialogue_base.yaml
│ │ │ ├── img2prompt-vqa
│ │ │ └── img2prompt_vqa_base.yaml
│ │ │ ├── med_config.json
│ │ │ ├── med_config_albef.json
│ │ │ ├── med_large_config.json
│ │ │ └── pnp-vqa
│ │ │ ├── pnp_vqa_3b.yaml
│ │ │ ├── pnp_vqa_base.yaml
│ │ │ ├── pnp_vqa_large.yaml
│ │ │ ├── unifiedqav2_3b_config.json
│ │ │ ├── unifiedqav2_base_config.json
│ │ │ └── unifiedqav2_large_config.json
│ ├── datasets
│ │ ├── builders
│ │ │ ├── __init__.py
│ │ │ ├── base_dataset_builder.py
│ │ │ ├── caption_builder.py
│ │ │ ├── classification_builder.py
│ │ │ ├── dialogue_builder.py
│ │ │ ├── image_text_pair_builder.py
│ │ │ ├── imagefolder_builder.py
│ │ │ ├── retrieval_builder.py
│ │ │ ├── text_to_image_generation_builder.py
│ │ │ ├── video_qa_builder.py
│ │ │ └── vqa_builder.py
│ │ ├── data_utils.py
│ │ ├── datasets
│ │ │ ├── aok_vqa_datasets.py
│ │ │ ├── avsd_dialogue_datasets.py
│ │ │ ├── base_dataset.py
│ │ │ ├── caption_datasets.py
│ │ │ ├── coco_caption_datasets.py
│ │ │ ├── coco_vqa_datasets.py
│ │ │ ├── dataloader_utils.py
│ │ │ ├── dialogue_datasets.py
│ │ │ ├── gqa_datasets.py
│ │ │ ├── image_text_pair_datasets.py
│ │ │ ├── imagefolder_dataset.py
│ │ │ ├── laion_dataset.py
│ │ │ ├── multimodal_classification_datasets.py
│ │ │ ├── nlvr_datasets.py
│ │ │ ├── retrieval_datasets.py
│ │ │ ├── snli_ve_datasets.py
│ │ │ ├── subject_driven_t2i_dataset.py
│ │ │ ├── vg_vqa_datasets.py
│ │ │ ├── video_caption_datasets.py
│ │ │ ├── video_vqa_datasets.py
│ │ │ └── vqa_datasets.py
│ │ └── download_scripts
│ │ │ ├── DownloadConceptualCaptions
│ │ │ ├── LICENSE
│ │ │ ├── README.md
│ │ │ ├── create_annotation_12m.ipynb
│ │ │ ├── create_annotation_3m.ipynb
│ │ │ ├── download_data_cc12m.py
│ │ │ └── download_data_cc3m.py
│ │ │ ├── download_coco.py
│ │ │ ├── download_didemo.py
│ │ │ ├── download_flickr.py
│ │ │ ├── download_gqa.py
│ │ │ ├── download_msrvtt.py
│ │ │ ├── download_msvd.py
│ │ │ ├── download_nocaps.py
│ │ │ ├── download_sbu.py
│ │ │ └── download_vg.py
│ ├── models
│ │ ├── __init__.py
│ │ ├── albef_models
│ │ │ ├── __init__.py
│ │ │ ├── albef_classification.py
│ │ │ ├── albef_feature_extractor.py
│ │ │ ├── albef_nlvr.py
│ │ │ ├── albef_outputs.py
│ │ │ ├── albef_pretrain.py
│ │ │ ├── albef_retrieval.py
│ │ │ └── albef_vqa.py
│ │ ├── alpro_models
│ │ │ ├── __init__.py
│ │ │ ├── alpro_outputs.py
│ │ │ ├── alpro_qa.py
│ │ │ └── alpro_retrieval.py
│ │ ├── base_model.py
│ │ ├── blip2_models
│ │ │ ├── Qformer.py
│ │ │ ├── __init__.py
│ │ │ ├── blip2.py
│ │ │ ├── blip2_image_text_matching.py
│ │ │ ├── blip2_opt.py
│ │ │ ├── blip2_qformer.py
│ │ │ ├── blip2_t5.py
│ │ │ ├── blip2_t5_instruct.py
│ │ │ ├── blip2_vicuna_instruct.py
│ │ │ ├── modeling_llama.py
│ │ │ ├── modeling_opt.py
│ │ │ └── modeling_t5.py
│ │ ├── blip_diffusion_models
│ │ │ ├── __init__.py
│ │ │ ├── blip_diffusion.py
│ │ │ ├── modeling_ctx_clip.py
│ │ │ ├── ptp_utils.py
│ │ │ └── utils.py
│ │ ├── blip_models
│ │ │ ├── __init__.py
│ │ │ ├── blip.py
│ │ │ ├── blip_caption.py
│ │ │ ├── blip_classification.py
│ │ │ ├── blip_feature_extractor.py
│ │ │ ├── blip_image_text_matching.py
│ │ │ ├── blip_nlvr.py
│ │ │ ├── blip_outputs.py
│ │ │ ├── blip_pretrain.py
│ │ │ ├── blip_retrieval.py
│ │ │ ├── blip_vqa.py
│ │ │ └── nlvr_encoder.py
│ │ ├── clip_models
│ │ │ ├── __init__.py
│ │ │ ├── bpe_simple_vocab_16e6.txt.gz
│ │ │ ├── clip_outputs.py
│ │ │ ├── loss.py
│ │ │ ├── model.py
│ │ │ ├── pics
│ │ │ │ └── CLIP.png
│ │ │ ├── pretrained.py
│ │ │ ├── timm_model.py
│ │ │ ├── tokenizer.py
│ │ │ ├── transform.py
│ │ │ └── utils.py
│ │ ├── clip_vit.py
│ │ ├── eva_vit.py
│ │ ├── gpt_models
│ │ │ └── gpt_dialogue.py
│ │ ├── img2prompt_models
│ │ │ ├── __init__.py
│ │ │ └── img2prompt_vqa.py
│ │ ├── med.py
│ │ ├── pnp_vqa_models
│ │ │ ├── __init__.py
│ │ │ ├── pnp_unifiedqav2_fid.py
│ │ │ └── pnp_vqa.py
│ │ ├── timesformer
│ │ │ ├── __init__.py
│ │ │ ├── conv2d_same.py
│ │ │ ├── features.py
│ │ │ ├── helpers.py
│ │ │ ├── linear.py
│ │ │ ├── vit.py
│ │ │ └── vit_utils.py
│ │ └── vit.py
│ ├── processors
│ │ ├── __init__.py
│ │ ├── alpro_processors.py
│ │ ├── base_processor.py
│ │ ├── blip_diffusion_processors.py
│ │ ├── blip_processors.py
│ │ ├── clip_processors.py
│ │ ├── functional_video.py
│ │ ├── gpt_processors.py
│ │ ├── randaugment.py
│ │ └── transforms_video.py
│ ├── projects
│ │ ├── albef
│ │ │ ├── eval
│ │ │ │ ├── nlvr_eval.yaml
│ │ │ │ ├── ret_coco_eval.yaml
│ │ │ │ ├── ret_flickr30k_eval.yaml
│ │ │ │ ├── snli_ve_eval.yaml
│ │ │ │ ├── vqa_test.yaml
│ │ │ │ └── vqa_val.yaml
│ │ │ └── train
│ │ │ │ ├── aokvqa_ft.yaml
│ │ │ │ ├── nlvr_ft.yaml
│ │ │ │ ├── okvqa_ft.yaml
│ │ │ │ ├── pretrain.yaml
│ │ │ │ ├── ret_coco_ft.yaml
│ │ │ │ ├── ret_flickr30k_ft.yaml
│ │ │ │ ├── snli_ve_ft.yaml
│ │ │ │ └── vqa_ft.yaml
│ │ ├── alpro
│ │ │ ├── eval
│ │ │ │ ├── didemo_ret_eval.yaml
│ │ │ │ ├── msrvtt_qa_eval.yaml
│ │ │ │ ├── msrvtt_ret_eval.yaml
│ │ │ │ └── msvd_qa_eval.yaml
│ │ │ └── train
│ │ │ │ ├── didemo_ret_ft.yaml
│ │ │ │ ├── msrvtt_qa_ft.yaml
│ │ │ │ ├── msrvtt_retrieval_ft.yaml
│ │ │ │ └── msvd_qa_ft.yaml
│ │ ├── blip
│ │ │ ├── coco_cap_ft_iter.yaml
│ │ │ ├── eval
│ │ │ │ ├── aokvqa_eval.yaml
│ │ │ │ ├── caption_coco_eval.yaml
│ │ │ │ ├── caption_coco_eval_large.yaml
│ │ │ │ ├── nlvr_eval.yaml
│ │ │ │ ├── nocaps_eval.yaml
│ │ │ │ ├── okvqa_eval.yaml
│ │ │ │ ├── ret_coco_eval.yaml
│ │ │ │ ├── ret_flickr_eval.yaml
│ │ │ │ └── vqav2_eval.yaml
│ │ │ └── train
│ │ │ │ ├── aokvqa_ft.yaml
│ │ │ │ ├── caption_coco_ft.yaml
│ │ │ │ ├── caption_coco_large_ft.yaml
│ │ │ │ ├── nlvr_ft.yaml
│ │ │ │ ├── okvqa_ft.yaml
│ │ │ │ ├── pretrain_14m.yaml
│ │ │ │ ├── retrieval_coco_ft.yaml
│ │ │ │ ├── retrieval_flickr_ft.yaml
│ │ │ │ └── vqav2_ft.yaml
│ │ ├── blip2
│ │ │ ├── eval
│ │ │ │ ├── caption_coco_flant5xl_eval.yaml
│ │ │ │ ├── caption_coco_opt2.7b_eval.yaml
│ │ │ │ ├── caption_coco_opt6.7b_eval.yaml
│ │ │ │ ├── gqa_zeroshot_flant5xl_eval.yaml
│ │ │ │ ├── okvqa_zeroshot_flant5xl_eval.yaml
│ │ │ │ ├── ret_coco_eval.yaml
│ │ │ │ ├── ret_flickr_eval.yaml
│ │ │ │ ├── vqav2_zeroshot_flant5xl_eval.yaml
│ │ │ │ └── vqav2_zeroshot_opt_eval.yaml
│ │ │ └── train
│ │ │ │ ├── caption_coco_ft.yaml
│ │ │ │ ├── pretrain_stage1.yaml
│ │ │ │ ├── pretrain_stage2.yaml
│ │ │ │ └── retrieval_coco_ft.yaml
│ │ ├── blip_diffusion
│ │ │ ├── finetune-db-dog.yaml
│ │ │ ├── finetune-db-pink-dress.yaml
│ │ │ ├── finetune-db-shein-jacket.yaml
│ │ │ └── finetune-db-template.yaml
│ │ ├── clip
│ │ │ ├── exp_coco_ret_eval.yaml
│ │ │ ├── exp_flickr_ret_eval.yaml
│ │ │ └── exp_imnet_zs_eval.yaml
│ │ ├── gpt
│ │ │ ├── eval
│ │ │ │ └── dialogue_avsd_eval.yaml
│ │ │ └── train
│ │ │ │ └── dialogue_avsd_ft.yaml
│ │ └── pnp-vqa
│ │ │ └── eval
│ │ │ ├── gqa_eval.yaml
│ │ │ ├── gqa_eval_3b.yaml
│ │ │ ├── gqa_eval_large.yaml
│ │ │ ├── okvqa_eval.yaml
│ │ │ ├── okvqa_eval_3b.yaml
│ │ │ ├── okvqa_eval_large.yaml
│ │ │ ├── vqav2_eval.yaml
│ │ │ ├── vqav2_eval_3b.yaml
│ │ │ ├── vqav2_eval_large.yaml
│ │ │ ├── vqav2_test_eval.yaml
│ │ │ ├── vqav2_test_eval_3b.yaml
│ │ │ └── vqav2_test_eval_large.yaml
│ ├── runners
│ │ ├── __init__.py
│ │ ├── runner_base.py
│ │ └── runner_iter.py
│ └── tasks
│ │ ├── __init__.py
│ │ ├── base_task.py
│ │ ├── captioning.py
│ │ ├── dialogue.py
│ │ ├── image_text_pretrain.py
│ │ ├── multimodal_classification.py
│ │ ├── retrieval.py
│ │ ├── text_to_image_generation.py
│ │ ├── vqa.py
│ │ └── vqa_reading_comprehension.py
└── llava
│ ├── __init__.py
│ ├── constants.py
│ ├── conversation.py
│ ├── mm_utils.py
│ ├── model
│ ├── __init__.py
│ ├── builder.py
│ ├── consolidate.py
│ ├── language_model
│ │ ├── llava_llama.py
│ │ ├── llava_mpt.py
│ │ └── mpt
│ │ │ ├── adapt_tokenizer.py
│ │ │ ├── attention.py
│ │ │ ├── blocks.py
│ │ │ ├── configuration_mpt.py
│ │ │ ├── custom_embedding.py
│ │ │ ├── flash_attn_triton.py
│ │ │ ├── hf_prefixlm_converter.py
│ │ │ ├── meta_init_context.py
│ │ │ ├── modeling_mpt.py
│ │ │ ├── norm.py
│ │ │ └── param_init_fns.py
│ ├── llava_arch.py
│ ├── make_delta.py
│ ├── multimodal_encoder
│ │ ├── builder.py
│ │ └── clip_encoder.py
│ ├── multimodal_projector
│ │ └── builder.py
│ └── utils.py
│ └── utils.py
├── requirements.txt
└── utils
├── dist_util.py
└── logger.py
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2024 Sangmin Woo
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/assets/amber.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/amber.png
--------------------------------------------------------------------------------
/assets/amber_discriminative.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/amber_discriminative.png
--------------------------------------------------------------------------------
/assets/eyes_forest.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/eyes_forest.png
--------------------------------------------------------------------------------
/assets/llava_bench.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/llava_bench.png
--------------------------------------------------------------------------------
/assets/mme-fullset.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/mme-fullset.png
--------------------------------------------------------------------------------
/assets/mme-hallucination.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/mme-hallucination.png
--------------------------------------------------------------------------------
/assets/motivation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/motivation.png
--------------------------------------------------------------------------------
/assets/observation.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/observation.png
--------------------------------------------------------------------------------
/assets/overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/overview.png
--------------------------------------------------------------------------------
/assets/pope.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/pope.png
--------------------------------------------------------------------------------
/assets/qualitative_amber_instructblip.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/qualitative_amber_instructblip.png
--------------------------------------------------------------------------------
/assets/qualitative_amber_instructblip2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/qualitative_amber_instructblip2.png
--------------------------------------------------------------------------------
/assets/qualitative_amber_llava.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/qualitative_amber_llava.png
--------------------------------------------------------------------------------
/assets/qualitative_amber_llava2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/qualitative_amber_llava2.png
--------------------------------------------------------------------------------
/assets/qualitative_mme2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/qualitative_mme2.png
--------------------------------------------------------------------------------
/assets/qualitative_mme_instructblip.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/qualitative_mme_instructblip.png
--------------------------------------------------------------------------------
/assets/qualitative_mme_llava.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/qualitative_mme_llava.png
--------------------------------------------------------------------------------
/assets/qualitative_pope.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/qualitative_pope.png
--------------------------------------------------------------------------------
/assets/qualitative_pope2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/qualitative_pope2.png
--------------------------------------------------------------------------------
/avisc_utils/vcd_add_noise.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | def add_diffusion_noise(image_tensor, noise_step):
4 | num_steps = 1000 # Number of diffusion steps
5 |
6 | # decide beta in each step
7 | betas = torch.linspace(-6,6,num_steps)
8 | betas = torch.sigmoid(betas) * (0.5e-2 - 1e-5) + 1e-5
9 |
10 | # decide alphas in each step
11 | alphas = 1 - betas
12 | alphas_prod = torch.cumprod(alphas, dim=0)
13 | alphas_prod_p = torch.cat([torch.tensor([1]).float(), alphas_prod[:-1]],0) # p for previous
14 | alphas_bar_sqrt = torch.sqrt(alphas_prod)
15 | one_minus_alphas_bar_log = torch.log(1 - alphas_prod)
16 | one_minus_alphas_bar_sqrt = torch.sqrt(1 - alphas_prod)
17 |
18 | def q_x(x_0,t):
19 | noise = torch.randn_like(x_0)
20 | alphas_t = alphas_bar_sqrt[t]
21 | alphas_1_m_t = one_minus_alphas_bar_sqrt[t]
22 | return (alphas_t*x_0 + alphas_1_m_t*noise)
23 |
24 | noise_delta = int(noise_step) # from 0-999
25 | noisy_image = image_tensor.clone()
26 | image_tensor_cd = q_x(noisy_image,noise_step)
27 |
28 | return image_tensor_cd
29 |
30 |
--------------------------------------------------------------------------------
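Usage sketch (not part of the repository): the helper above perturbs a preprocessed image tensor with forward-diffusion noise for the contrastive branch. A minimal call, assuming the repo root is on PYTHONPATH and using a stand-in tensor shaped like a CLIP-preprocessed image; `noise_step` ranges from 0 to 999, with larger values giving heavier corruption.

    import torch
    from avisc_utils.vcd_add_noise import add_diffusion_noise

    # Stand-in for a CLIP-preprocessed image batch (shape is illustrative only).
    image_tensor = torch.randn(1, 3, 336, 336)

    # Larger noise_step (0-999) => heavier corruption of the visual input.
    noisy = add_diffusion_noise(image_tensor, noise_step=500)
    print(noisy.shape)  # same shape as the input; only pixel values are diffused
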
/eval_bench/SimSun.ttf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/eval_bench/SimSun.ttf
--------------------------------------------------------------------------------
/eval_bench/scripts/llava_bench_eval.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 |
4 | ## set below
5 | ####################################################
6 | seed=42
7 | model="llava" # llava | qwen-vl | instructblip
8 | use_avisc=false
9 | use_cd=False
10 | gpus=0
11 | max_token=64
12 | cd_alpha=2.5
13 | cd_beta=0.1
14 | model_path="/path/to/the/checkpoints/llava-v1.5-7b"
15 | pope_path="path/to/dataset/llava-bench-in-the-wild/questions.jsonl"
16 | data_path="path/to/dataset/llava-bench-in-the-wild/images"
17 | log_path="path/to//llava_bench/.json"
18 | conv="llava_v1"
19 | batch_size=1
20 | ####################################################
21 |
22 | export CUDA_VISIBLE_DEVICES=${gpus}
23 | python ./eval_bench/llava_bench_llava.py \
24 | --seed ${seed} \
25 | --model-path ${model_path} \
26 | --question-file ${pope_path} \
27 | --image-folder ${data_path} \
28 | --answers-file ${log_path} \
29 | --conv ${conv} \
30 | --use_avisc ${use_avisc} \
31 | --use_cd ${use_cd} \
32 | --max_token ${max_token} \
33 | --cd_alpha ${cd_alpha} \
34 | --cd_beta ${cd_beta} \
35 |
36 |
--------------------------------------------------------------------------------
/experiments/AMBER/README_File/Paper-Arxiv-orange.svg:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/experiments/AMBER/README_File/comparison.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/experiments/AMBER/README_File/comparison.jpg
--------------------------------------------------------------------------------
/experiments/AMBER/README_File/intro.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/experiments/AMBER/README_File/intro.jpg
--------------------------------------------------------------------------------
/experiments/AMBER/README_File/result.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/experiments/AMBER/README_File/result.jpg
--------------------------------------------------------------------------------
/experiments/AMBER/README_File/statistics.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/experiments/AMBER/README_File/statistics.jpg
--------------------------------------------------------------------------------
/experiments/AMBER/data/metrics.txt:
--------------------------------------------------------------------------------
1 | chair_num=0.001
2 | chair_score=0
3 | safe_cover_num=0.001
4 | safe_cover_score=0
5 | hallu_cover_num=0.001
6 | hallu_cover_score=0
7 | non_hallu_score=0
8 | non_hallu_num=0.001
9 | qa_correct_score=0
10 | qa_correct_num=0.001
11 | qa_no_score=0
12 | qa_no_num=0.001
13 | qa_ans_no_score=0
14 | qa_ans_no_num=0.001
15 | as_qa_correct_score=0
16 | as_qa_correct_num=0.001
17 | as_qa_no_score=0
18 | as_qa_no_num=0.001
19 | as_qa_ans_no_score=0
20 | as_qa_ans_no_num=0.001
21 | an_qa_correct_score=0
22 | an_qa_correct_num=0.001
23 | an_qa_no_score=0
24 | an_qa_no_num=0.001
25 | an_qa_ans_no_score=0
26 | an_qa_ans_no_num=0.001
27 | aa_qa_correct_score=0
28 | aa_qa_correct_num=0.001
29 | aa_qa_no_score=0
30 | aa_qa_no_num=0.001
31 | aa_qa_ans_no_score=0
32 | aa_qa_ans_no_num=0.001
33 | asso_qa_correct_score=0
34 | asso_qa_correct_num=0.001
35 | asso_qa_no_score=0
36 | asso_qa_no_num=0.001
37 | asso_qa_ans_no_score=0
38 | asso_qa_ans_no_num=0.001
39 | ha_qa_correct_score=0
40 | ha_qa_correct_num=0.001
41 | ha_qa_no_score=0
42 | ha_qa_no_num=0.001
43 | ha_qa_ans_no_score=0
44 | ha_qa_ans_no_num=0.001
--------------------------------------------------------------------------------
/experiments/AMBER/data/safe_words.txt:
--------------------------------------------------------------------------------
1 | orange
2 | snack
3 | line
4 | camera
5 | light
6 | shoe
7 | sign
8 | range
9 | individual
--------------------------------------------------------------------------------
/experiments/eval/eval_mme/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/experiments/eval/eval_mme/.DS_Store
--------------------------------------------------------------------------------
/experiments/eval/eval_mme/readme.txt:
--------------------------------------------------------------------------------
1 | # This is an automated calculation script for the acc, acc+, and score.
2 |
3 | # You can directly run "python3 calculation.py" to get the evaluation results of LaVIN.
4 |
5 | # In order to get the statistical results of your model:
6 |
7 | (1) Fill all the files in "Your_Results", adding your model's responses:
8 | Each file in "Your_Results" consists of:
9 | Image_Name + "\t" + Question + "\t" + Ground_Truth_Answer + "\n"
10 |
11 | You need to add the responses of your model as:
12 | Image_Name + "\t" + Question + "\t" + Ground_Truth_Answer + "\t" + Your_Response + "\n"
13 |
14 | Note: if your responses contain "\n", please delete it. Each response must stay on a single line; it cannot span multiple lines!
15 |
16 | (2) run "python3 calculation.py --results_dir ./Your_Results"
17 |
18 |
19 |
20 |
--------------------------------------------------------------------------------
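A minimal sketch of producing one "Your_Results" file in the format the readme describes, assuming a hypothetical get_model_response() inference call and an in-place overwrite of Your_Results/existence.txt; neither the helper nor the overwrite strategy is prescribed by the benchmark.

    # Hypothetical sketch: append your model's response as the fourth
    # tab-separated field, keeping each answer on a single line.
    def get_model_response(image_name: str, question: str) -> str:
        return "Yes"  # placeholder for your model's inference call

    path = "Your_Results/existence.txt"
    with open(path, encoding="utf-8") as f:
        rows = [line.rstrip("\n").split("\t") for line in f if line.strip()]

    with open(path, "w", encoding="utf-8") as f:
        for image_name, question, ground_truth in rows:
            response = get_model_response(image_name, question).replace("\n", " ")
            f.write(f"{image_name}\t{question}\t{ground_truth}\t{response}\n")
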
/experiments/lavis/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | import os
9 | import sys
10 |
11 | from omegaconf import OmegaConf
12 |
13 | from lavis.common.registry import registry
14 |
15 | from lavis.datasets.builders import *
16 | from lavis.models import *
17 | from lavis.processors import *
18 | from lavis.tasks import *
19 |
20 |
21 | root_dir = os.path.dirname(os.path.abspath(__file__))
22 | default_cfg = OmegaConf.load(os.path.join(root_dir, "configs/default.yaml"))
23 |
24 | registry.register_path("library_root", root_dir)
25 | repo_root = os.path.join(root_dir, "..")
26 | registry.register_path("repo_root", repo_root)
27 | cache_root = os.path.join(repo_root, default_cfg.env.cache_root)
28 | registry.register_path("cache_root", cache_root)
29 |
30 | registry.register("MAX_INT", sys.maxsize)
31 | registry.register("SPLIT_NAMES", ["train", "val", "test"])
32 |
--------------------------------------------------------------------------------
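For reference, the paths and constants registered above can be read back through the registry, assuming this vendored copy keeps the usual LAVIS get_path/get accessors:

    # Importing the package runs the registrations in lavis/__init__.py.
    from lavis.common.registry import registry

    print(registry.get_path("library_root"))  # .../experiments/lavis
    print(registry.get_path("cache_root"))    # resolved from configs/default.yaml
    print(registry.get("MAX_INT"))            # sys.maxsize
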
/experiments/lavis/common/annotator/canny/__init__.py:
--------------------------------------------------------------------------------
1 | import cv2
2 |
3 |
4 | class CannyDetector:
5 | def __call__(self, img, low_threshold, high_threshold):
6 | return cv2.Canny(img, low_threshold, high_threshold)
7 |
--------------------------------------------------------------------------------
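Usage sketch for the detector above; the import path is an assumption based on how the uniformer module imports the annotator package, and the two thresholds are the standard cv2.Canny hysteresis thresholds.

    import numpy as np
    from annotator.canny import CannyDetector  # import path is an assumption

    detector = CannyDetector()
    img = (np.random.rand(256, 256) * 255).astype(np.uint8)  # stand-in grayscale image
    edges = detector(img, low_threshold=100, high_threshold=200)
    print(edges.shape, edges.dtype)  # (256, 256) uint8 edge map
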
/experiments/lavis/common/annotator/ckpts/download.sh:
--------------------------------------------------------------------------------
1 | #! /bin/bash
2 |
3 | wget https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/dpt_hybrid-midas-501f0c75.pt
4 | wget https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/network-bsds500.pth
5 |
6 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/midas/midas/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/experiments/lavis/common/annotator/midas/midas/__init__.py
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/midas/midas/base_model.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 |
4 | class BaseModel(torch.nn.Module):
5 | def load(self, path):
6 | """Load model from file.
7 |
8 | Args:
9 | path (str): file path
10 | """
11 | parameters = torch.load(path, map_location=torch.device('cpu'))
12 |
13 | if "optimizer" in parameters:
14 | parameters = parameters["model"]
15 |
16 | self.load_state_dict(parameters)
17 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | from annotator.uniformer.mmseg.apis import init_segmentor, inference_segmentor, show_result_pyplot
4 | from annotator.uniformer.mmseg.core.evaluation import get_palette
5 | from annotator.util import annotator_ckpts_path
6 |
7 |
8 | checkpoint_file = "https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/upernet_global_small.pth"
9 |
10 |
11 | class UniformerDetector:
12 | def __init__(self):
13 | modelpath = os.path.join(annotator_ckpts_path, "upernet_global_small.pth")
14 | if not os.path.exists(modelpath):
15 | from basicsr.utils.download_util import load_file_from_url
16 | load_file_from_url(checkpoint_file, model_dir=annotator_ckpts_path)
17 | config_file = os.path.join(os.path.dirname(annotator_ckpts_path), "uniformer", "exp", "upernet_global_small", "config.py")
18 | self.model = init_segmentor(config_file, modelpath).cuda()
19 |
20 | def __call__(self, img):
21 | result = inference_segmentor(self.model, img)
22 | res_img = show_result_pyplot(self.model, img, result, get_palette('ade'), opacity=1)
23 | return res_img
24 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/configs/_base_/datasets/pascal_voc12_aug.py:
--------------------------------------------------------------------------------
1 | _base_ = './pascal_voc12.py'
2 | # dataset settings
3 | data = dict(
4 | train=dict(
5 | ann_dir=['SegmentationClass', 'SegmentationClassAug'],
6 | split=[
7 | 'ImageSets/Segmentation/train.txt',
8 | 'ImageSets/Segmentation/aug.txt'
9 | ]))
10 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/configs/_base_/default_runtime.py:
--------------------------------------------------------------------------------
1 | # yapf:disable
2 | log_config = dict(
3 | interval=50,
4 | hooks=[
5 | dict(type='TextLoggerHook', by_epoch=False),
6 | # dict(type='TensorboardLoggerHook')
7 | ])
8 | # yapf:enable
9 | dist_params = dict(backend='nccl')
10 | log_level = 'INFO'
11 | load_from = None
12 | resume_from = None
13 | workflow = [('train', 1)]
14 | cudnn_benchmark = True
15 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/configs/_base_/models/cgnet.py:
--------------------------------------------------------------------------------
1 | # model settings
2 | norm_cfg = dict(type='SyncBN', eps=1e-03, requires_grad=True)
3 | model = dict(
4 | type='EncoderDecoder',
5 | backbone=dict(
6 | type='CGNet',
7 | norm_cfg=norm_cfg,
8 | in_channels=3,
9 | num_channels=(32, 64, 128),
10 | num_blocks=(3, 21),
11 | dilations=(2, 4),
12 | reductions=(8, 16)),
13 | decode_head=dict(
14 | type='FCNHead',
15 | in_channels=256,
16 | in_index=2,
17 | channels=256,
18 | num_convs=0,
19 | concat_input=False,
20 | dropout_ratio=0,
21 | num_classes=19,
22 | norm_cfg=norm_cfg,
23 | loss_decode=dict(
24 | type='CrossEntropyLoss',
25 | use_sigmoid=False,
26 | loss_weight=1.0,
27 | class_weight=[
28 | 2.5959933, 6.7415504, 3.5354059, 9.8663225, 9.690899, 9.369352,
29 | 10.289121, 9.953208, 4.3097677, 9.490387, 7.674431, 9.396905,
30 | 10.347791, 6.3927646, 10.226669, 10.241062, 10.280587,
31 | 10.396974, 10.055647
32 | ])),
33 | # model training and testing settings
34 | train_cfg=dict(sampler=None),
35 | test_cfg=dict(mode='whole'))
36 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/configs/_base_/models/fpn_r50.py:
--------------------------------------------------------------------------------
1 | # model settings
2 | norm_cfg = dict(type='SyncBN', requires_grad=True)
3 | model = dict(
4 | type='EncoderDecoder',
5 | pretrained='open-mmlab://resnet50_v1c',
6 | backbone=dict(
7 | type='ResNetV1c',
8 | depth=50,
9 | num_stages=4,
10 | out_indices=(0, 1, 2, 3),
11 | dilations=(1, 1, 1, 1),
12 | strides=(1, 2, 2, 2),
13 | norm_cfg=norm_cfg,
14 | norm_eval=False,
15 | style='pytorch',
16 | contract_dilation=True),
17 | neck=dict(
18 | type='FPN',
19 | in_channels=[256, 512, 1024, 2048],
20 | out_channels=256,
21 | num_outs=4),
22 | decode_head=dict(
23 | type='FPNHead',
24 | in_channels=[256, 256, 256, 256],
25 | in_index=[0, 1, 2, 3],
26 | feature_strides=[4, 8, 16, 32],
27 | channels=128,
28 | dropout_ratio=0.1,
29 | num_classes=19,
30 | norm_cfg=norm_cfg,
31 | align_corners=False,
32 | loss_decode=dict(
33 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
34 | # model training and testing settings
35 | train_cfg=dict(),
36 | test_cfg=dict(mode='whole'))
37 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/configs/_base_/models/fpn_uniformer.py:
--------------------------------------------------------------------------------
1 | # model settings
2 | norm_cfg = dict(type='SyncBN', requires_grad=True)
3 | model = dict(
4 | type='EncoderDecoder',
5 | backbone=dict(
6 | type='UniFormer',
7 | embed_dim=[64, 128, 320, 512],
8 | layers=[3, 4, 8, 3],
9 | head_dim=64,
10 | mlp_ratio=4.,
11 | qkv_bias=True,
12 | drop_rate=0.,
13 | attn_drop_rate=0.,
14 | drop_path_rate=0.1),
15 | neck=dict(
16 | type='FPN',
17 | in_channels=[64, 128, 320, 512],
18 | out_channels=256,
19 | num_outs=4),
20 | decode_head=dict(
21 | type='FPNHead',
22 | in_channels=[256, 256, 256, 256],
23 | in_index=[0, 1, 2, 3],
24 | feature_strides=[4, 8, 16, 32],
25 | channels=128,
26 | dropout_ratio=0.1,
27 | num_classes=150,
28 | norm_cfg=norm_cfg,
29 | align_corners=False,
30 | loss_decode=dict(
31 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
32 | # model training and testing settings
33 | train_cfg=dict(),
34 | test_cfg=dict(mode='whole')
35 | )
36 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/configs/_base_/models/lraspp_m-v3-d8.py:
--------------------------------------------------------------------------------
1 | # model settings
2 | norm_cfg = dict(type='SyncBN', eps=0.001, requires_grad=True)
3 | model = dict(
4 | type='EncoderDecoder',
5 | backbone=dict(
6 | type='MobileNetV3',
7 | arch='large',
8 | out_indices=(1, 3, 16),
9 | norm_cfg=norm_cfg),
10 | decode_head=dict(
11 | type='LRASPPHead',
12 | in_channels=(16, 24, 960),
13 | in_index=(0, 1, 2),
14 | channels=128,
15 | input_transform='multiple_select',
16 | dropout_ratio=0.1,
17 | num_classes=19,
18 | norm_cfg=norm_cfg,
19 | act_cfg=dict(type='ReLU'),
20 | align_corners=False,
21 | loss_decode=dict(
22 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
23 | # model training and testing settings
24 | train_cfg=dict(),
25 | test_cfg=dict(mode='whole'))
26 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/configs/_base_/schedules/schedule_160k.py:
--------------------------------------------------------------------------------
1 | # optimizer
2 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
3 | optimizer_config = dict()
4 | # learning policy
5 | lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
6 | # runtime settings
7 | runner = dict(type='IterBasedRunner', max_iters=160000)
8 | checkpoint_config = dict(by_epoch=False, interval=16000)
9 | evaluation = dict(interval=16000, metric='mIoU')
10 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/configs/_base_/schedules/schedule_20k.py:
--------------------------------------------------------------------------------
1 | # optimizer
2 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
3 | optimizer_config = dict()
4 | # learning policy
5 | lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
6 | # runtime settings
7 | runner = dict(type='IterBasedRunner', max_iters=20000)
8 | checkpoint_config = dict(by_epoch=False, interval=2000)
9 | evaluation = dict(interval=2000, metric='mIoU')
10 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/configs/_base_/schedules/schedule_40k.py:
--------------------------------------------------------------------------------
1 | # optimizer
2 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
3 | optimizer_config = dict()
4 | # learning policy
5 | lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
6 | # runtime settings
7 | runner = dict(type='IterBasedRunner', max_iters=40000)
8 | checkpoint_config = dict(by_epoch=False, interval=4000)
9 | evaluation = dict(interval=4000, metric='mIoU')
10 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/configs/_base_/schedules/schedule_80k.py:
--------------------------------------------------------------------------------
1 | # optimizer
2 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
3 | optimizer_config = dict()
4 | # learning policy
5 | lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False)
6 | # runtime settings
7 | runner = dict(type='IterBasedRunner', max_iters=80000)
8 | checkpoint_config = dict(by_epoch=False, interval=8000)
9 | evaluation = dict(interval=8000, metric='mIoU')
10 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/exp/upernet_global_small/run.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | work_path=$(dirname $0)
4 | PYTHONPATH="$(dirname $0)/../../":$PYTHONPATH \
5 | python -m torch.distributed.launch --nproc_per_node=8 \
6 | tools/train.py ${work_path}/config.py \
7 | --launcher pytorch \
8 | --options model.backbone.pretrained_path='your_model_path/uniformer_small_in1k.pth' \
9 | --work-dir ${work_path}/ckpt \
10 | 2>&1 | tee -a ${work_path}/log.txt
11 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/exp/upernet_global_small/test.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | work_path=$(dirname $0)
4 | PYTHONPATH="$(dirname $0)/../../":$PYTHONPATH \
5 | python -m torch.distributed.launch --nproc_per_node=8 \
6 | tools/test.py ${work_path}/test_config_h32.py \
7 | ${work_path}/ckpt/latest.pth \
8 | --launcher pytorch \
9 | --eval mIoU \
10 | 2>&1 | tee -a ${work_path}/log.txt
11 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmcv/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | # flake8: noqa
3 | from .arraymisc import *
4 | from .fileio import *
5 | from .image import *
6 | from .utils import *
7 | from .version import *
8 | from .video import *
9 | from .visualization import *
10 |
11 | # The following modules are not imported to this level, so mmcv may be used
12 | # without PyTorch.
13 | # - runner
14 | # - parallel
15 | # - op
16 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmcv/arraymisc/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .quantization import dequantize, quantize
3 |
4 | __all__ = ['quantize', 'dequantize']
5 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmcv/cnn/bricks/hsigmoid.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import torch.nn as nn
3 |
4 | from .registry import ACTIVATION_LAYERS
5 |
6 |
7 | @ACTIVATION_LAYERS.register_module()
8 | class HSigmoid(nn.Module):
9 | """Hard Sigmoid Module. Apply the hard sigmoid function:
10 | Hsigmoid(x) = min(max((x + bias) / divisor, min_value), max_value)
11 | Default: Hsigmoid(x) = min(max((x + 1) / 2, 0), 1)
12 |
13 | Args:
14 | bias (float): Bias of the input feature map. Default: 1.0.
15 | divisor (float): Divisor of the input feature map. Default: 2.0.
16 | min_value (float): Lower bound value. Default: 0.0.
17 | max_value (float): Upper bound value. Default: 1.0.
18 |
19 | Returns:
20 | Tensor: The output tensor.
21 | """
22 |
23 | def __init__(self, bias=1.0, divisor=2.0, min_value=0.0, max_value=1.0):
24 | super(HSigmoid, self).__init__()
25 | self.bias = bias
26 | self.divisor = divisor
27 | assert self.divisor != 0
28 | self.min_value = min_value
29 | self.max_value = max_value
30 |
31 | def forward(self, x):
32 | x = (x + self.bias) / self.divisor
33 |
34 | return x.clamp_(self.min_value, self.max_value)
35 |
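A minimal usage sketch for the activation above; the import path assumes the vendored layout (PYTHONPATH pointing at lavis/common), and the tensor values are illustrative only:

    import torch

    from annotator.uniformer.mmcv.cnn.bricks.hsigmoid import HSigmoid

    act = HSigmoid()  # defaults: min(max((x + 1) / 2, 0), 1)
    x = torch.tensor([-3.0, -1.0, 0.0, 1.0, 3.0])
    print(act(x))     # tensor([0.0000, 0.0000, 0.5000, 1.0000, 1.0000])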
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmcv/cnn/bricks/hswish.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import torch.nn as nn
3 |
4 | from .registry import ACTIVATION_LAYERS
5 |
6 |
7 | @ACTIVATION_LAYERS.register_module()
8 | class HSwish(nn.Module):
9 | """Hard Swish Module.
10 |
11 | This module applies the hard swish function:
12 |
13 | .. math::
14 | Hswish(x) = x * ReLU6(x + 3) / 6
15 |
16 | Args:
17 | inplace (bool): can optionally do the operation in-place.
18 | Default: False.
19 |
20 | Returns:
21 | Tensor: The output tensor.
22 | """
23 |
24 | def __init__(self, inplace=False):
25 | super(HSwish, self).__init__()
26 | self.act = nn.ReLU6(inplace)
27 |
28 | def forward(self, x):
29 | return x * self.act(x + 3) / 6
30 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmcv/cnn/bricks/padding.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import torch.nn as nn
3 |
4 | from .registry import PADDING_LAYERS
5 |
6 | PADDING_LAYERS.register_module('zero', module=nn.ZeroPad2d)
7 | PADDING_LAYERS.register_module('reflect', module=nn.ReflectionPad2d)
8 | PADDING_LAYERS.register_module('replicate', module=nn.ReplicationPad2d)
9 |
10 |
11 | def build_padding_layer(cfg, *args, **kwargs):
12 | """Build padding layer.
13 |
14 | Args:
15 | cfg (None or dict): The padding layer config, which should contain:
16 | - type (str): Layer type.
17 | - layer args: Args needed to instantiate a padding layer.
18 |
19 | Returns:
20 | nn.Module: Created padding layer.
21 | """
22 | if not isinstance(cfg, dict):
23 | raise TypeError('cfg must be a dict')
24 | if 'type' not in cfg:
25 | raise KeyError('the cfg dict must contain the key "type"')
26 |
27 | cfg_ = cfg.copy()
28 | padding_type = cfg_.pop('type')
29 | if padding_type not in PADDING_LAYERS:
30 | raise KeyError(f'Unrecognized padding type {padding_type}.')
31 | else:
32 | padding_layer = PADDING_LAYERS.get(padding_type)
33 |
34 | layer = padding_layer(*args, **kwargs, **cfg_)
35 |
36 | return layer
37 |
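A short sketch of how build_padding_layer consumes a config dict (import path assumed from the vendored layout); the extra positional argument is forwarded to the underlying nn.ReflectionPad2d:

    import torch

    from annotator.uniformer.mmcv.cnn.bricks.padding import build_padding_layer

    pad = build_padding_layer(dict(type='reflect'), 1)  # -> nn.ReflectionPad2d(1)
    x = torch.zeros(1, 3, 8, 8)
    print(pad(x).shape)  # torch.Size([1, 3, 10, 10])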
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmcv/cnn/bricks/registry.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from annotator.uniformer.mmcv.utils import Registry
3 |
4 | CONV_LAYERS = Registry('conv layer')
5 | NORM_LAYERS = Registry('norm layer')
6 | ACTIVATION_LAYERS = Registry('activation layer')
7 | PADDING_LAYERS = Registry('padding layer')
8 | UPSAMPLE_LAYERS = Registry('upsample layer')
9 | PLUGIN_LAYERS = Registry('plugin layer')
10 |
11 | DROPOUT_LAYERS = Registry('drop out layers')
12 | POSITIONAL_ENCODING = Registry('position encoding')
13 | ATTENTION = Registry('attention')
14 | FEEDFORWARD_NETWORK = Registry('feed-forward Network')
15 | TRANSFORMER_LAYER = Registry('transformerLayer')
16 | TRANSFORMER_LAYER_SEQUENCE = Registry('transformer-layers sequence')
17 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmcv/cnn/bricks/scale.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import torch
3 | import torch.nn as nn
4 |
5 |
6 | class Scale(nn.Module):
7 | """A learnable scale parameter.
8 |
9 | This layer scales the input by a learnable factor. It multiplies a
10 | learnable scale parameter of shape (1,) with input of any shape.
11 |
12 | Args:
13 | scale (float): Initial value of scale factor. Default: 1.0
14 | """
15 |
16 | def __init__(self, scale=1.0):
17 | super(Scale, self).__init__()
18 | self.scale = nn.Parameter(torch.tensor(scale, dtype=torch.float))
19 |
20 | def forward(self, x):
21 | return x * self.scale
22 |
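A sketch of the typical use of Scale as a learnable gate on a residual branch; ScaledResidual is an illustrative module, not part of the codebase:

    import torch
    import torch.nn as nn

    from annotator.uniformer.mmcv.cnn.bricks.scale import Scale


    class ScaledResidual(nn.Module):
        """Adds a branch output scaled by a learnable factor (initialized to 0)."""

        def __init__(self, channels):
            super().__init__()
            self.branch = nn.Conv2d(channels, channels, 3, padding=1)
            self.scale = Scale(0.0)  # branch contributes nothing at initialization

        def forward(self, x):
            return x + self.scale(self.branch(x))


    m = ScaledResidual(8)
    print(m(torch.randn(2, 8, 16, 16)).shape)  # torch.Size([2, 8, 16, 16])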
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmcv/cnn/bricks/swish.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import torch
3 | import torch.nn as nn
4 |
5 | from .registry import ACTIVATION_LAYERS
6 |
7 |
8 | @ACTIVATION_LAYERS.register_module()
9 | class Swish(nn.Module):
10 | """Swish Module.
11 |
12 | This module applies the swish function:
13 |
14 | .. math::
15 | Swish(x) = x * Sigmoid(x)
16 |
17 | Returns:
18 | Tensor: The output tensor.
19 | """
20 |
21 | def __init__(self):
22 | super(Swish, self).__init__()
23 |
24 | def forward(self, x):
25 | return x * torch.sigmoid(x)
26 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmcv/cnn/builder.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from ..runner import Sequential
3 | from ..utils import Registry, build_from_cfg
4 |
5 |
6 | def build_model_from_cfg(cfg, registry, default_args=None):
7 | """Build a PyTorch model from config dict(s). Different from
8 | ``build_from_cfg``, if cfg is a list, a ``nn.Sequential`` will be built.
9 |
10 | Args:
11 |         cfg (dict, list[dict]): The config of modules; it is either a config
12 |             dict or a list of config dicts. If cfg is a list,
13 |             the built modules will be wrapped with ``nn.Sequential``.
14 | registry (:obj:`Registry`): A registry the module belongs to.
15 | default_args (dict, optional): Default arguments to build the module.
16 | Defaults to None.
17 |
18 | Returns:
19 | nn.Module: A built nn module.
20 | """
21 | if isinstance(cfg, list):
22 | modules = [
23 | build_from_cfg(cfg_, registry, default_args) for cfg_ in cfg
24 | ]
25 | return Sequential(*modules)
26 | else:
27 | return build_from_cfg(cfg, registry, default_args)
28 |
29 |
30 | MODELS = Registry('model', build_func=build_model_from_cfg)
31 |
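A sketch of the list-to-``nn.Sequential`` behaviour described in the docstring above. ``TOY_MODELS`` and ``ToyBlock`` are illustrative stand-ins registered locally, not part of the codebase; ``build_model_from_cfg`` is called directly so the example relies only on the code shown here:

    import torch
    import torch.nn as nn

    from annotator.uniformer.mmcv.cnn.builder import build_model_from_cfg
    from annotator.uniformer.mmcv.utils import Registry

    TOY_MODELS = Registry('toy model')


    @TOY_MODELS.register_module()
    class ToyBlock(nn.Module):
        def __init__(self, dim):
            super().__init__()
            self.fc = nn.Linear(dim, dim)

        def forward(self, x):
            return torch.relu(self.fc(x))


    # A list of config dicts yields a Sequential of two ToyBlock(4) instances.
    cfgs = [dict(type='ToyBlock', dim=4), dict(type='ToyBlock', dim=4)]
    seq = build_model_from_cfg(cfgs, TOY_MODELS)
    print(seq(torch.randn(2, 4)).shape)  # torch.Size([2, 4])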
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmcv/cnn/utils/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .flops_counter import get_model_complexity_info
3 | from .fuse_conv_bn import fuse_conv_bn
4 | from .sync_bn import revert_sync_batchnorm
5 | from .weight_init import (INITIALIZERS, Caffe2XavierInit, ConstantInit,
6 | KaimingInit, NormalInit, PretrainedInit,
7 | TruncNormalInit, UniformInit, XavierInit,
8 | bias_init_with_prob, caffe2_xavier_init,
9 | constant_init, initialize, kaiming_init, normal_init,
10 | trunc_normal_init, uniform_init, xavier_init)
11 |
12 | __all__ = [
13 | 'get_model_complexity_info', 'bias_init_with_prob', 'caffe2_xavier_init',
14 | 'constant_init', 'kaiming_init', 'normal_init', 'trunc_normal_init',
15 | 'uniform_init', 'xavier_init', 'fuse_conv_bn', 'initialize',
16 | 'INITIALIZERS', 'ConstantInit', 'XavierInit', 'NormalInit',
17 | 'TruncNormalInit', 'UniformInit', 'KaimingInit', 'PretrainedInit',
18 | 'Caffe2XavierInit', 'revert_sync_batchnorm'
19 | ]
20 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmcv/engine/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .test import (collect_results_cpu, collect_results_gpu, multi_gpu_test,
3 | single_gpu_test)
4 |
5 | __all__ = [
6 | 'collect_results_cpu', 'collect_results_gpu', 'multi_gpu_test',
7 | 'single_gpu_test'
8 | ]
9 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmcv/fileio/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .file_client import BaseStorageBackend, FileClient
3 | from .handlers import BaseFileHandler, JsonHandler, PickleHandler, YamlHandler
4 | from .io import dump, load, register_handler
5 | from .parse import dict_from_file, list_from_file
6 |
7 | __all__ = [
8 | 'BaseStorageBackend', 'FileClient', 'load', 'dump', 'register_handler',
9 | 'BaseFileHandler', 'JsonHandler', 'PickleHandler', 'YamlHandler',
10 | 'list_from_file', 'dict_from_file'
11 | ]
12 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmcv/fileio/handlers/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .base import BaseFileHandler
3 | from .json_handler import JsonHandler
4 | from .pickle_handler import PickleHandler
5 | from .yaml_handler import YamlHandler
6 |
7 | __all__ = ['BaseFileHandler', 'JsonHandler', 'PickleHandler', 'YamlHandler']
8 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmcv/fileio/handlers/base.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from abc import ABCMeta, abstractmethod
3 |
4 |
5 | class BaseFileHandler(metaclass=ABCMeta):
6 |     # `str_like` is a flag indicating whether the file object is a
7 |     # str-like or a bytes-like object. Pickle only processes bytes-like
8 |     # objects while json only processes str-like objects. If it is str-like,
9 |     # `StringIO` will be used to process the buffer.
10 | str_like = True
11 |
12 | @abstractmethod
13 | def load_from_fileobj(self, file, **kwargs):
14 | pass
15 |
16 | @abstractmethod
17 | def dump_to_fileobj(self, obj, file, **kwargs):
18 | pass
19 |
20 | @abstractmethod
21 | def dump_to_str(self, obj, **kwargs):
22 | pass
23 |
24 | def load_from_path(self, filepath, mode='r', **kwargs):
25 | with open(filepath, mode) as f:
26 | return self.load_from_fileobj(f, **kwargs)
27 |
28 | def dump_to_path(self, obj, filepath, mode='w', **kwargs):
29 | with open(filepath, mode) as f:
30 | self.dump_to_fileobj(obj, f, **kwargs)
31 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmcv/fileio/handlers/json_handler.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import json
3 |
4 | import numpy as np
5 |
6 | from .base import BaseFileHandler
7 |
8 |
9 | def set_default(obj):
10 | """Set default json values for non-serializable values.
11 |
12 | It helps convert ``set``, ``range`` and ``np.ndarray`` data types to list.
13 | It also converts ``np.generic`` (including ``np.int32``, ``np.float32``,
14 |     etc.) into plain numbers of Python built-in types.
15 | """
16 | if isinstance(obj, (set, range)):
17 | return list(obj)
18 | elif isinstance(obj, np.ndarray):
19 | return obj.tolist()
20 | elif isinstance(obj, np.generic):
21 | return obj.item()
22 | raise TypeError(f'{type(obj)} is unsupported for json dump')
23 |
24 |
25 | class JsonHandler(BaseFileHandler):
26 |
27 | def load_from_fileobj(self, file):
28 | return json.load(file)
29 |
30 | def dump_to_fileobj(self, obj, file, **kwargs):
31 | kwargs.setdefault('default', set_default)
32 | json.dump(obj, file, **kwargs)
33 |
34 | def dump_to_str(self, obj, **kwargs):
35 | kwargs.setdefault('default', set_default)
36 | return json.dumps(obj, **kwargs)
37 |
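A small sketch of what ``set_default`` buys: numpy arrays, numpy scalars, and sets become JSON-serializable. Values are illustrative and the import path assumes the vendored layout:

    import numpy as np

    from annotator.uniformer.mmcv.fileio.handlers.json_handler import JsonHandler

    handler = JsonHandler()
    obj = {'ids': np.arange(3), 'score': np.float32(0.5), 'tags': {'a', 'b'}}
    print(handler.dump_to_str(obj, sort_keys=True))
    # {"ids": [0, 1, 2], "score": 0.5, "tags": ["a", "b"]}  (set order may vary)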
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmcv/fileio/handlers/pickle_handler.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import pickle
3 |
4 | from .base import BaseFileHandler
5 |
6 |
7 | class PickleHandler(BaseFileHandler):
8 |
9 | str_like = False
10 |
11 | def load_from_fileobj(self, file, **kwargs):
12 | return pickle.load(file, **kwargs)
13 |
14 | def load_from_path(self, filepath, **kwargs):
15 | return super(PickleHandler, self).load_from_path(
16 | filepath, mode='rb', **kwargs)
17 |
18 | def dump_to_str(self, obj, **kwargs):
19 | kwargs.setdefault('protocol', 2)
20 | return pickle.dumps(obj, **kwargs)
21 |
22 | def dump_to_fileobj(self, obj, file, **kwargs):
23 | kwargs.setdefault('protocol', 2)
24 | pickle.dump(obj, file, **kwargs)
25 |
26 | def dump_to_path(self, obj, filepath, **kwargs):
27 | super(PickleHandler, self).dump_to_path(
28 | obj, filepath, mode='wb', **kwargs)
29 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmcv/fileio/handlers/yaml_handler.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import yaml
3 |
4 | try:
5 | from yaml import CLoader as Loader, CDumper as Dumper
6 | except ImportError:
7 | from yaml import Loader, Dumper
8 |
9 | from .base import BaseFileHandler # isort:skip
10 |
11 |
12 | class YamlHandler(BaseFileHandler):
13 |
14 | def load_from_fileobj(self, file, **kwargs):
15 | kwargs.setdefault('Loader', Loader)
16 | return yaml.load(file, **kwargs)
17 |
18 | def dump_to_fileobj(self, obj, file, **kwargs):
19 | kwargs.setdefault('Dumper', Dumper)
20 | yaml.dump(obj, file, **kwargs)
21 |
22 | def dump_to_str(self, obj, **kwargs):
23 | kwargs.setdefault('Dumper', Dumper)
24 | return yaml.dump(obj, **kwargs)
25 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmcv/model_zoo/deprecated.json:
--------------------------------------------------------------------------------
1 | {
2 | "resnet50_caffe": "detectron/resnet50_caffe",
3 | "resnet50_caffe_bgr": "detectron2/resnet50_caffe_bgr",
4 | "resnet101_caffe": "detectron/resnet101_caffe",
5 | "resnet101_caffe_bgr": "detectron2/resnet101_caffe_bgr"
6 | }
7 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmcv/ops/info.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import glob
3 | import os
4 |
5 | import torch
6 |
7 | if torch.__version__ == 'parrots':
8 | import parrots
9 |
10 | def get_compiler_version():
11 | return 'GCC ' + parrots.version.compiler
12 |
13 | def get_compiling_cuda_version():
14 | return parrots.version.cuda
15 | else:
16 | from ..utils import ext_loader
17 | ext_module = ext_loader.load_ext(
18 | '_ext', ['get_compiler_version', 'get_compiling_cuda_version'])
19 |
20 | def get_compiler_version():
21 | return ext_module.get_compiler_version()
22 |
23 | def get_compiling_cuda_version():
24 | return ext_module.get_compiling_cuda_version()
25 |
26 |
27 | def get_onnxruntime_op_path():
28 | wildcard = os.path.join(
29 | os.path.abspath(os.path.dirname(os.path.dirname(__file__))),
30 | '_ext_ort.*.so')
31 |
32 | paths = glob.glob(wildcard)
33 | if len(paths) > 0:
34 | return paths[0]
35 | else:
36 | return ''
37 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmcv/parallel/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .collate import collate
3 | from .data_container import DataContainer
4 | from .data_parallel import MMDataParallel
5 | from .distributed import MMDistributedDataParallel
6 | from .registry import MODULE_WRAPPERS
7 | from .scatter_gather import scatter, scatter_kwargs
8 | from .utils import is_module_wrapper
9 |
10 | __all__ = [
11 | 'collate', 'DataContainer', 'MMDataParallel', 'MMDistributedDataParallel',
12 | 'scatter', 'scatter_kwargs', 'is_module_wrapper', 'MODULE_WRAPPERS'
13 | ]
14 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmcv/parallel/registry.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from torch.nn.parallel import DataParallel, DistributedDataParallel
3 |
4 | from annotator.uniformer.mmcv.utils import Registry
5 |
6 | MODULE_WRAPPERS = Registry('module wrapper')
7 | MODULE_WRAPPERS.register_module(module=DataParallel)
8 | MODULE_WRAPPERS.register_module(module=DistributedDataParallel)
9 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmcv/parallel/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .registry import MODULE_WRAPPERS
3 |
4 |
5 | def is_module_wrapper(module):
6 | """Check if a module is a module wrapper.
7 |
8 | The following 3 modules in MMCV (and their subclasses) are regarded as
9 | module wrappers: DataParallel, DistributedDataParallel,
10 |     MMDistributedDataParallel (the deprecated version). You may add your own
11 | module wrapper by registering it to mmcv.parallel.MODULE_WRAPPERS.
12 |
13 | Args:
14 | module (nn.Module): The module to be checked.
15 |
16 | Returns:
17 | bool: True if the input module is a module wrapper.
18 | """
19 | module_wrappers = tuple(MODULE_WRAPPERS.module_dict.values())
20 | return isinstance(module, module_wrappers)
21 |
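A sketch of ``is_module_wrapper`` with the wrappers registered in ``registry.py`` above (assuming the vendored package imports cleanly):

    import torch.nn as nn

    from annotator.uniformer.mmcv.parallel import is_module_wrapper

    model = nn.Linear(4, 2)
    wrapped = nn.DataParallel(model)

    print(is_module_wrapper(model))    # False
    print(is_module_wrapper(wrapped))  # True  (DataParallel is registered above)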
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmcv/runner/builder.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import copy
3 |
4 | from ..utils import Registry
5 |
6 | RUNNERS = Registry('runner')
7 | RUNNER_BUILDERS = Registry('runner builder')
8 |
9 |
10 | def build_runner_constructor(cfg):
11 | return RUNNER_BUILDERS.build(cfg)
12 |
13 |
14 | def build_runner(cfg, default_args=None):
15 | runner_cfg = copy.deepcopy(cfg)
16 | constructor_type = runner_cfg.pop('constructor',
17 | 'DefaultRunnerConstructor')
18 | runner_constructor = build_runner_constructor(
19 | dict(
20 | type=constructor_type,
21 | runner_cfg=runner_cfg,
22 | default_args=default_args))
23 | runner = runner_constructor()
24 | return runner
25 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmcv/runner/hooks/closure.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .hook import HOOKS, Hook
3 |
4 |
5 | @HOOKS.register_module()
6 | class ClosureHook(Hook):
7 |
8 | def __init__(self, fn_name, fn):
9 | assert hasattr(self, fn_name)
10 | assert callable(fn)
11 | setattr(self, fn_name, fn)
12 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmcv/runner/hooks/iter_timer.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import time
3 |
4 | from .hook import HOOKS, Hook
5 |
6 |
7 | @HOOKS.register_module()
8 | class IterTimerHook(Hook):
9 |
10 | def before_epoch(self, runner):
11 | self.t = time.time()
12 |
13 | def before_iter(self, runner):
14 | runner.log_buffer.update({'data_time': time.time() - self.t})
15 |
16 | def after_iter(self, runner):
17 | runner.log_buffer.update({'time': time.time() - self.t})
18 | self.t = time.time()
19 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmcv/runner/hooks/logger/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .base import LoggerHook
3 | from .dvclive import DvcliveLoggerHook
4 | from .mlflow import MlflowLoggerHook
5 | from .neptune import NeptuneLoggerHook
6 | from .pavi import PaviLoggerHook
7 | from .tensorboard import TensorboardLoggerHook
8 | from .text import TextLoggerHook
9 | from .wandb import WandbLoggerHook
10 |
11 | __all__ = [
12 | 'LoggerHook', 'MlflowLoggerHook', 'PaviLoggerHook',
13 | 'TensorboardLoggerHook', 'TextLoggerHook', 'WandbLoggerHook',
14 | 'NeptuneLoggerHook', 'DvcliveLoggerHook'
15 | ]
16 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmcv/runner/hooks/memory.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import torch
3 |
4 | from .hook import HOOKS, Hook
5 |
6 |
7 | @HOOKS.register_module()
8 | class EmptyCacheHook(Hook):
9 |
10 | def __init__(self, before_epoch=False, after_epoch=True, after_iter=False):
11 | self._before_epoch = before_epoch
12 | self._after_epoch = after_epoch
13 | self._after_iter = after_iter
14 |
15 | def after_iter(self, runner):
16 | if self._after_iter:
17 | torch.cuda.empty_cache()
18 |
19 | def before_epoch(self, runner):
20 | if self._before_epoch:
21 | torch.cuda.empty_cache()
22 |
23 | def after_epoch(self, runner):
24 | if self._after_epoch:
25 | torch.cuda.empty_cache()
26 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmcv/runner/hooks/sampler_seed.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .hook import HOOKS, Hook
3 |
4 |
5 | @HOOKS.register_module()
6 | class DistSamplerSeedHook(Hook):
7 | """Data-loading sampler for distributed training.
8 |
9 |     In distributed training, it is only useful in conjunction with
10 | :obj:`EpochBasedRunner`, while :obj:`IterBasedRunner` achieves the same
11 | purpose with :obj:`IterLoader`.
12 | """
13 |
14 | def before_epoch(self, runner):
15 | if hasattr(runner.data_loader.sampler, 'set_epoch'):
16 |             # in case the data loader uses `SequentialSampler` in PyTorch
17 | runner.data_loader.sampler.set_epoch(runner.epoch)
18 | elif hasattr(runner.data_loader.batch_sampler.sampler, 'set_epoch'):
19 |             # the batch sampler in PyTorch wraps the sampler as its attribute.
20 | runner.data_loader.batch_sampler.sampler.set_epoch(runner.epoch)
21 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmcv/runner/hooks/sync_buffer.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from ..dist_utils import allreduce_params
3 | from .hook import HOOKS, Hook
4 |
5 |
6 | @HOOKS.register_module()
7 | class SyncBuffersHook(Hook):
8 | """Synchronize model buffers such as running_mean and running_var in BN at
9 | the end of each epoch.
10 |
11 | Args:
12 | distributed (bool): Whether distributed training is used. It is
13 | effective only for distributed training. Defaults to True.
14 | """
15 |
16 | def __init__(self, distributed=True):
17 | self.distributed = distributed
18 |
19 | def after_epoch(self, runner):
20 | """All-reduce model buffers at the end of each epoch."""
21 | if self.distributed:
22 | allreduce_params(runner.model.buffers())
23 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmcv/runner/optimizer/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .builder import (OPTIMIZER_BUILDERS, OPTIMIZERS, build_optimizer,
3 | build_optimizer_constructor)
4 | from .default_constructor import DefaultOptimizerConstructor
5 |
6 | __all__ = [
7 | 'OPTIMIZER_BUILDERS', 'OPTIMIZERS', 'DefaultOptimizerConstructor',
8 | 'build_optimizer', 'build_optimizer_constructor'
9 | ]
10 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmcv/utils/parrots_jit.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | import os
3 |
4 | from .parrots_wrapper import TORCH_VERSION
5 |
6 | parrots_jit_option = os.getenv('PARROTS_JIT_OPTION')
7 |
8 | if TORCH_VERSION == 'parrots' and parrots_jit_option == 'ON':
9 | from parrots.jit import pat as jit
10 | else:
11 |
12 | def jit(func=None,
13 | check_input=None,
14 | full_shape=True,
15 | derivate=False,
16 | coderize=False,
17 | optimize=False):
18 |
19 | def wrapper(func):
20 |
21 | def wrapper_inner(*args, **kargs):
22 | return func(*args, **kargs)
23 |
24 | return wrapper_inner
25 |
26 | if func is None:
27 | return wrapper
28 | else:
29 | return func
30 |
31 |
32 | if TORCH_VERSION == 'parrots':
33 | from parrots.utils.tester import skip_no_elena
34 | else:
35 |
36 | def skip_no_elena(func):
37 |
38 | def wrapper(*args, **kargs):
39 | return func(*args, **kargs)
40 |
41 | return wrapper
42 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmcv/utils/trace.py:
--------------------------------------------------------------------------------
1 | import warnings
2 |
3 | import torch
4 |
5 | from annotator.uniformer.mmcv.utils import digit_version
6 |
7 |
8 | def is_jit_tracing() -> bool:
9 | if (torch.__version__ != 'parrots'
10 | and digit_version(torch.__version__) >= digit_version('1.6.0')):
11 | on_trace = torch.jit.is_tracing()
12 | # In PyTorch 1.6, torch.jit.is_tracing has a bug.
13 | # Refers to https://github.com/pytorch/pytorch/issues/42448
14 | if isinstance(on_trace, bool):
15 | return on_trace
16 | else:
17 | return torch._C._is_tracing()
18 | else:
19 | warnings.warn(
20 | 'torch.jit.is_tracing is only supported after v1.6.0. '
21 | 'Therefore is_tracing returns False automatically. Please '
22 | 'set on_trace manually if you are using trace.', UserWarning)
23 | return False
24 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmcv/video/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .io import Cache, VideoReader, frames2video
3 | from .optflow import (dequantize_flow, flow_from_bytes, flow_warp, flowread,
4 | flowwrite, quantize_flow, sparse_flow_from_bytes)
5 | from .processing import concat_video, convert_video, cut_video, resize_video
6 |
7 | __all__ = [
8 | 'Cache', 'VideoReader', 'frames2video', 'convert_video', 'resize_video',
9 | 'cut_video', 'concat_video', 'flowread', 'flowwrite', 'quantize_flow',
10 | 'dequantize_flow', 'flow_warp', 'flow_from_bytes', 'sparse_flow_from_bytes'
11 | ]
12 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmcv/visualization/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) OpenMMLab. All rights reserved.
2 | from .color import Color, color_val
3 | from .image import imshow, imshow_bboxes, imshow_det_bboxes
4 | from .optflow import flow2rgb, flowshow, make_color_wheel
5 |
6 | __all__ = [
7 | 'Color', 'color_val', 'imshow', 'imshow_bboxes', 'imshow_det_bboxes',
8 | 'flowshow', 'flow2rgb', 'make_color_wheel'
9 | ]
10 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmcv_custom/__init__.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from .checkpoint import load_checkpoint
4 |
5 | __all__ = ['load_checkpoint']
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmseg/apis/__init__.py:
--------------------------------------------------------------------------------
1 | from .inference import inference_segmentor, init_segmentor, show_result_pyplot
2 | from .test import multi_gpu_test, single_gpu_test
3 | from .train import get_root_logger, set_random_seed, train_segmentor
4 |
5 | __all__ = [
6 | 'get_root_logger', 'set_random_seed', 'train_segmentor', 'init_segmentor',
7 | 'inference_segmentor', 'multi_gpu_test', 'single_gpu_test',
8 | 'show_result_pyplot'
9 | ]
10 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmseg/core/__init__.py:
--------------------------------------------------------------------------------
1 | from .evaluation import * # noqa: F401, F403
2 | from .seg import * # noqa: F401, F403
3 | from .utils import * # noqa: F401, F403
4 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmseg/core/evaluation/__init__.py:
--------------------------------------------------------------------------------
1 | from .class_names import get_classes, get_palette
2 | from .eval_hooks import DistEvalHook, EvalHook
3 | from .metrics import eval_metrics, mean_dice, mean_fscore, mean_iou
4 |
5 | __all__ = [
6 | 'EvalHook', 'DistEvalHook', 'mean_dice', 'mean_iou', 'mean_fscore',
7 | 'eval_metrics', 'get_classes', 'get_palette'
8 | ]
9 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmseg/core/seg/__init__.py:
--------------------------------------------------------------------------------
1 | from .builder import build_pixel_sampler
2 | from .sampler import BasePixelSampler, OHEMPixelSampler
3 |
4 | __all__ = ['build_pixel_sampler', 'BasePixelSampler', 'OHEMPixelSampler']
5 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmseg/core/seg/builder.py:
--------------------------------------------------------------------------------
1 | from annotator.uniformer.mmcv.utils import Registry, build_from_cfg
2 |
3 | PIXEL_SAMPLERS = Registry('pixel sampler')
4 |
5 |
6 | def build_pixel_sampler(cfg, **default_args):
7 | """Build pixel sampler for segmentation map."""
8 | return build_from_cfg(cfg, PIXEL_SAMPLERS, default_args)
9 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmseg/core/seg/sampler/__init__.py:
--------------------------------------------------------------------------------
1 | from .base_pixel_sampler import BasePixelSampler
2 | from .ohem_pixel_sampler import OHEMPixelSampler
3 |
4 | __all__ = ['BasePixelSampler', 'OHEMPixelSampler']
5 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmseg/core/seg/sampler/base_pixel_sampler.py:
--------------------------------------------------------------------------------
1 | from abc import ABCMeta, abstractmethod
2 |
3 |
4 | class BasePixelSampler(metaclass=ABCMeta):
5 | """Base class of pixel sampler."""
6 |
7 | def __init__(self, **kwargs):
8 | pass
9 |
10 | @abstractmethod
11 | def sample(self, seg_logit, seg_label):
12 | """Placeholder for sample function."""
13 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmseg/core/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .misc import add_prefix
2 |
3 | __all__ = ['add_prefix']
4 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmseg/core/utils/misc.py:
--------------------------------------------------------------------------------
1 | def add_prefix(inputs, prefix):
2 | """Add prefix for dict.
3 |
4 | Args:
5 | inputs (dict): The input dict with str keys.
6 | prefix (str): The prefix to add.
7 |
8 | Returns:
9 |
10 | dict: The dict with keys updated with ``prefix``.
11 | """
12 |
13 | outputs = dict()
14 | for name, value in inputs.items():
15 | outputs[f'{prefix}.{name}'] = value
16 |
17 | return outputs
18 |
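A sketch of the usual role of ``add_prefix``: namespacing loss dicts from different heads before merging them into a single log dict (values are illustrative):

    from annotator.uniformer.mmseg.core.utils import add_prefix

    decode_losses = {'loss_ce': 0.42, 'acc_seg': 71.3}
    aux_losses = {'loss_ce': 0.18}

    merged = {**add_prefix(decode_losses, 'decode'), **add_prefix(aux_losses, 'aux')}
    print(merged)
    # {'decode.loss_ce': 0.42, 'decode.acc_seg': 71.3, 'aux.loss_ce': 0.18}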
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmseg/datasets/__init__.py:
--------------------------------------------------------------------------------
1 | from .ade import ADE20KDataset
2 | from .builder import DATASETS, PIPELINES, build_dataloader, build_dataset
3 | from .chase_db1 import ChaseDB1Dataset
4 | from .cityscapes import CityscapesDataset
5 | from .custom import CustomDataset
6 | from .dataset_wrappers import ConcatDataset, RepeatDataset
7 | from .drive import DRIVEDataset
8 | from .hrf import HRFDataset
9 | from .pascal_context import PascalContextDataset, PascalContextDataset59
10 | from .stare import STAREDataset
11 | from .voc import PascalVOCDataset
12 |
13 | __all__ = [
14 | 'CustomDataset', 'build_dataloader', 'ConcatDataset', 'RepeatDataset',
15 | 'DATASETS', 'build_dataset', 'PIPELINES', 'CityscapesDataset',
16 | 'PascalVOCDataset', 'ADE20KDataset', 'PascalContextDataset',
17 | 'PascalContextDataset59', 'ChaseDB1Dataset', 'DRIVEDataset', 'HRFDataset',
18 | 'STAREDataset'
19 | ]
20 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmseg/datasets/chase_db1.py:
--------------------------------------------------------------------------------
1 | import os.path as osp
2 |
3 | from .builder import DATASETS
4 | from .custom import CustomDataset
5 |
6 |
7 | @DATASETS.register_module()
8 | class ChaseDB1Dataset(CustomDataset):
9 | """Chase_db1 dataset.
10 |
11 | In segmentation map annotation for Chase_db1, 0 stands for background,
12 | which is included in 2 categories. ``reduce_zero_label`` is fixed to False.
13 | The ``img_suffix`` is fixed to '.png' and ``seg_map_suffix`` is fixed to
14 | '_1stHO.png'.
15 | """
16 |
17 | CLASSES = ('background', 'vessel')
18 |
19 | PALETTE = [[120, 120, 120], [6, 230, 230]]
20 |
21 | def __init__(self, **kwargs):
22 | super(ChaseDB1Dataset, self).__init__(
23 | img_suffix='.png',
24 | seg_map_suffix='_1stHO.png',
25 | reduce_zero_label=False,
26 | **kwargs)
27 | assert osp.exists(self.img_dir)
28 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmseg/datasets/drive.py:
--------------------------------------------------------------------------------
1 | import os.path as osp
2 |
3 | from .builder import DATASETS
4 | from .custom import CustomDataset
5 |
6 |
7 | @DATASETS.register_module()
8 | class DRIVEDataset(CustomDataset):
9 | """DRIVE dataset.
10 |
11 | In segmentation map annotation for DRIVE, 0 stands for background, which is
12 | included in 2 categories. ``reduce_zero_label`` is fixed to False. The
13 | ``img_suffix`` is fixed to '.png' and ``seg_map_suffix`` is fixed to
14 | '_manual1.png'.
15 | """
16 |
17 | CLASSES = ('background', 'vessel')
18 |
19 | PALETTE = [[120, 120, 120], [6, 230, 230]]
20 |
21 | def __init__(self, **kwargs):
22 | super(DRIVEDataset, self).__init__(
23 | img_suffix='.png',
24 | seg_map_suffix='_manual1.png',
25 | reduce_zero_label=False,
26 | **kwargs)
27 | assert osp.exists(self.img_dir)
28 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmseg/datasets/hrf.py:
--------------------------------------------------------------------------------
1 | import os.path as osp
2 |
3 | from .builder import DATASETS
4 | from .custom import CustomDataset
5 |
6 |
7 | @DATASETS.register_module()
8 | class HRFDataset(CustomDataset):
9 | """HRF dataset.
10 |
11 | In segmentation map annotation for HRF, 0 stands for background, which is
12 | included in 2 categories. ``reduce_zero_label`` is fixed to False. The
13 | ``img_suffix`` is fixed to '.png' and ``seg_map_suffix`` is fixed to
14 | '.png'.
15 | """
16 |
17 | CLASSES = ('background', 'vessel')
18 |
19 | PALETTE = [[120, 120, 120], [6, 230, 230]]
20 |
21 | def __init__(self, **kwargs):
22 | super(HRFDataset, self).__init__(
23 | img_suffix='.png',
24 | seg_map_suffix='.png',
25 | reduce_zero_label=False,
26 | **kwargs)
27 | assert osp.exists(self.img_dir)
28 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmseg/datasets/pipelines/__init__.py:
--------------------------------------------------------------------------------
1 | from .compose import Compose
2 | from .formating import (Collect, ImageToTensor, ToDataContainer, ToTensor,
3 | Transpose, to_tensor)
4 | from .loading import LoadAnnotations, LoadImageFromFile
5 | from .test_time_aug import MultiScaleFlipAug
6 | from .transforms import (CLAHE, AdjustGamma, Normalize, Pad,
7 | PhotoMetricDistortion, RandomCrop, RandomFlip,
8 | RandomRotate, Rerange, Resize, RGB2Gray, SegRescale)
9 |
10 | __all__ = [
11 | 'Compose', 'to_tensor', 'ToTensor', 'ImageToTensor', 'ToDataContainer',
12 | 'Transpose', 'Collect', 'LoadAnnotations', 'LoadImageFromFile',
13 | 'MultiScaleFlipAug', 'Resize', 'RandomFlip', 'Pad', 'RandomCrop',
14 | 'Normalize', 'SegRescale', 'PhotoMetricDistortion', 'RandomRotate',
15 | 'AdjustGamma', 'CLAHE', 'Rerange', 'RGB2Gray'
16 | ]
17 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmseg/datasets/stare.py:
--------------------------------------------------------------------------------
1 | import os.path as osp
2 |
3 | from .builder import DATASETS
4 | from .custom import CustomDataset
5 |
6 |
7 | @DATASETS.register_module()
8 | class STAREDataset(CustomDataset):
9 | """STARE dataset.
10 |
11 | In segmentation map annotation for STARE, 0 stands for background, which is
12 | included in 2 categories. ``reduce_zero_label`` is fixed to False. The
13 | ``img_suffix`` is fixed to '.png' and ``seg_map_suffix`` is fixed to
14 | '.ah.png'.
15 | """
16 |
17 | CLASSES = ('background', 'vessel')
18 |
19 | PALETTE = [[120, 120, 120], [6, 230, 230]]
20 |
21 | def __init__(self, **kwargs):
22 | super(STAREDataset, self).__init__(
23 | img_suffix='.png',
24 | seg_map_suffix='.ah.png',
25 | reduce_zero_label=False,
26 | **kwargs)
27 | assert osp.exists(self.img_dir)
28 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmseg/datasets/voc.py:
--------------------------------------------------------------------------------
1 | import os.path as osp
2 |
3 | from .builder import DATASETS
4 | from .custom import CustomDataset
5 |
6 |
7 | @DATASETS.register_module()
8 | class PascalVOCDataset(CustomDataset):
9 | """Pascal VOC dataset.
10 |
11 | Args:
12 | split (str): Split txt file for Pascal VOC.
13 | """
14 |
15 | CLASSES = ('background', 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle',
16 | 'bus', 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog',
17 | 'horse', 'motorbike', 'person', 'pottedplant', 'sheep', 'sofa',
18 | 'train', 'tvmonitor')
19 |
20 | PALETTE = [[0, 0, 0], [128, 0, 0], [0, 128, 0], [128, 128, 0], [0, 0, 128],
21 | [128, 0, 128], [0, 128, 128], [128, 128, 128], [64, 0, 0],
22 | [192, 0, 0], [64, 128, 0], [192, 128, 0], [64, 0, 128],
23 | [192, 0, 128], [64, 128, 128], [192, 128, 128], [0, 64, 0],
24 | [128, 64, 0], [0, 192, 0], [128, 192, 0], [0, 64, 128]]
25 |
26 | def __init__(self, split, **kwargs):
27 | super(PascalVOCDataset, self).__init__(
28 | img_suffix='.jpg', seg_map_suffix='.png', split=split, **kwargs)
29 | assert osp.exists(self.img_dir) and self.split is not None
30 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmseg/models/__init__.py:
--------------------------------------------------------------------------------
1 | from .backbones import * # noqa: F401,F403
2 | from .builder import (BACKBONES, HEADS, LOSSES, SEGMENTORS, build_backbone,
3 | build_head, build_loss, build_segmentor)
4 | from .decode_heads import * # noqa: F401,F403
5 | from .losses import * # noqa: F401,F403
6 | from .necks import * # noqa: F401,F403
7 | from .segmentors import * # noqa: F401,F403
8 |
9 | __all__ = [
10 | 'BACKBONES', 'HEADS', 'LOSSES', 'SEGMENTORS', 'build_backbone',
11 | 'build_head', 'build_loss', 'build_segmentor'
12 | ]
13 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmseg/models/backbones/__init__.py:
--------------------------------------------------------------------------------
1 | from .cgnet import CGNet
2 | # from .fast_scnn import FastSCNN
3 | from .hrnet import HRNet
4 | from .mobilenet_v2 import MobileNetV2
5 | from .mobilenet_v3 import MobileNetV3
6 | from .resnest import ResNeSt
7 | from .resnet import ResNet, ResNetV1c, ResNetV1d
8 | from .resnext import ResNeXt
9 | from .unet import UNet
10 | from .vit import VisionTransformer
11 | from .uniformer import UniFormer
12 |
13 | __all__ = [
14 | 'ResNet', 'ResNetV1c', 'ResNetV1d', 'ResNeXt', 'HRNet',
15 | 'ResNeSt', 'MobileNetV2', 'UNet', 'CGNet', 'MobileNetV3',
16 | 'VisionTransformer', 'UniFormer'
17 | ]
18 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmseg/models/decode_heads/__init__.py:
--------------------------------------------------------------------------------
1 | from .ann_head import ANNHead
2 | from .apc_head import APCHead
3 | from .aspp_head import ASPPHead
4 | from .cc_head import CCHead
5 | from .da_head import DAHead
6 | from .dm_head import DMHead
7 | from .dnl_head import DNLHead
8 | from .ema_head import EMAHead
9 | from .enc_head import EncHead
10 | from .fcn_head import FCNHead
11 | from .fpn_head import FPNHead
12 | from .gc_head import GCHead
13 | from .lraspp_head import LRASPPHead
14 | from .nl_head import NLHead
15 | from .ocr_head import OCRHead
16 | # from .point_head import PointHead
17 | from .psa_head import PSAHead
18 | from .psp_head import PSPHead
19 | from .sep_aspp_head import DepthwiseSeparableASPPHead
20 | from .sep_fcn_head import DepthwiseSeparableFCNHead
21 | from .uper_head import UPerHead
22 |
23 | __all__ = [
24 | 'FCNHead', 'PSPHead', 'ASPPHead', 'PSAHead', 'NLHead', 'GCHead', 'CCHead',
25 | 'UPerHead', 'DepthwiseSeparableASPPHead', 'ANNHead', 'DAHead', 'OCRHead',
26 | 'EncHead', 'DepthwiseSeparableFCNHead', 'FPNHead', 'EMAHead', 'DNLHead',
27 | 'APCHead', 'DMHead', 'LRASPPHead'
28 | ]
29 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmseg/models/losses/__init__.py:
--------------------------------------------------------------------------------
1 | from .accuracy import Accuracy, accuracy
2 | from .cross_entropy_loss import (CrossEntropyLoss, binary_cross_entropy,
3 | cross_entropy, mask_cross_entropy)
4 | from .dice_loss import DiceLoss
5 | from .lovasz_loss import LovaszLoss
6 | from .utils import reduce_loss, weight_reduce_loss, weighted_loss
7 |
8 | __all__ = [
9 | 'accuracy', 'Accuracy', 'cross_entropy', 'binary_cross_entropy',
10 | 'mask_cross_entropy', 'CrossEntropyLoss', 'reduce_loss',
11 | 'weight_reduce_loss', 'weighted_loss', 'LovaszLoss', 'DiceLoss'
12 | ]
13 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmseg/models/necks/__init__.py:
--------------------------------------------------------------------------------
1 | from .fpn import FPN
2 | from .multilevel_neck import MultiLevelNeck
3 |
4 | __all__ = ['FPN', 'MultiLevelNeck']
5 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmseg/models/segmentors/__init__.py:
--------------------------------------------------------------------------------
1 | from .base import BaseSegmentor
2 | from .cascade_encoder_decoder import CascadeEncoderDecoder
3 | from .encoder_decoder import EncoderDecoder
4 |
5 | __all__ = ['BaseSegmentor', 'EncoderDecoder', 'CascadeEncoderDecoder']
6 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmseg/models/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .drop import DropPath
2 | from .inverted_residual import InvertedResidual, InvertedResidualV3
3 | from .make_divisible import make_divisible
4 | from .res_layer import ResLayer
5 | from .se_layer import SELayer
6 | from .self_attention_block import SelfAttentionBlock
7 | from .up_conv_block import UpConvBlock
8 | from .weight_init import trunc_normal_
9 |
10 | __all__ = [
11 | 'ResLayer', 'SelfAttentionBlock', 'make_divisible', 'InvertedResidual',
12 | 'UpConvBlock', 'InvertedResidualV3', 'SELayer', 'DropPath', 'trunc_normal_'
13 | ]
14 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmseg/models/utils/drop.py:
--------------------------------------------------------------------------------
1 | """Modified from https://github.com/rwightman/pytorch-image-
2 | models/blob/master/timm/models/layers/drop.py."""
3 |
4 | import torch
5 | from torch import nn
6 |
7 |
8 | class DropPath(nn.Module):
9 | """Drop paths (Stochastic Depth) per sample (when applied in main path of
10 | residual blocks).
11 |
12 | Args:
13 | drop_prob (float): Drop rate for paths of model. Dropout rate has
14 | to be between 0 and 1. Default: 0.
15 | """
16 |
17 | def __init__(self, drop_prob=0.):
18 | super(DropPath, self).__init__()
19 | self.drop_prob = drop_prob
20 | self.keep_prob = 1 - drop_prob
21 |
22 | def forward(self, x):
23 | if self.drop_prob == 0. or not self.training:
24 | return x
25 | shape = (x.shape[0], ) + (1, ) * (
26 | x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
27 | random_tensor = self.keep_prob + torch.rand(
28 | shape, dtype=x.dtype, device=x.device)
29 | random_tensor.floor_() # binarize
30 | output = x.div(self.keep_prob) * random_tensor
31 | return output
32 |
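A sketch of how DropPath is typically wired into a residual block; the block below is illustrative, not part of the codebase, and the import assumes the vendored packages resolve:

    import torch
    import torch.nn as nn

    from annotator.uniformer.mmseg.models.utils.drop import DropPath


    class ToyBlock(nn.Module):
        def __init__(self, dim, drop_prob=0.1):
            super().__init__()
            self.norm = nn.LayerNorm(dim)
            self.mlp = nn.Linear(dim, dim)
            self.drop_path = DropPath(drop_prob)

        def forward(self, x):
            # During training, whole residual branches are dropped per sample;
            # in eval mode DropPath is an identity.
            return x + self.drop_path(self.mlp(self.norm(x)))


    block = ToyBlock(16).eval()
    print(block(torch.randn(2, 8, 16)).shape)  # torch.Size([2, 8, 16])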
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmseg/ops/__init__.py:
--------------------------------------------------------------------------------
1 | from .encoding import Encoding
2 | from .wrappers import Upsample, resize
3 |
4 | __all__ = ['Upsample', 'resize', 'Encoding']
5 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmseg/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .collect_env import collect_env
2 | from .logger import get_root_logger
3 |
4 | __all__ = ['get_root_logger', 'collect_env']
5 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmseg/utils/collect_env.py:
--------------------------------------------------------------------------------
1 | from annotator.uniformer.mmcv.utils import collect_env as collect_base_env
2 | from annotator.uniformer.mmcv.utils import get_git_hash
3 |
4 | import annotator.uniformer.mmseg as mmseg
5 |
6 |
7 | def collect_env():
8 | """Collect the information of the running environments."""
9 | env_info = collect_base_env()
10 | env_info['MMSegmentation'] = f'{mmseg.__version__}+{get_git_hash()[:7]}'
11 |
12 | return env_info
13 |
14 |
15 | if __name__ == '__main__':
16 | for name, val in collect_env().items():
17 | print('{}: {}'.format(name, val))
18 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/uniformer/mmseg/utils/logger.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | from annotator.uniformer.mmcv.utils import get_logger
4 |
5 |
6 | def get_root_logger(log_file=None, log_level=logging.INFO):
7 | """Get the root logger.
8 |
9 | The logger will be initialized if it has not been initialized. By default a
10 | StreamHandler will be added. If `log_file` is specified, a FileHandler will
11 | also be added. The name of the root logger is the top-level package name,
12 | e.g., "mmseg".
13 |
14 | Args:
15 | log_file (str | None): The log filename. If specified, a FileHandler
16 | will be added to the root logger.
17 | log_level (int): The root logger level. Note that only the process of
18 | rank 0 is affected, while other processes will set the level to
19 | "Error" and be silent most of the time.
20 |
21 | Returns:
22 | logging.Logger: The root logger.
23 | """
24 |
25 | logger = get_logger(name='mmseg', log_file=log_file, log_level=log_level)
26 |
27 | return logger
28 |
--------------------------------------------------------------------------------
/experiments/lavis/common/annotator/util.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import cv2
3 | import os
4 |
5 |
6 | annotator_ckpts_path = os.path.join(os.path.dirname(__file__), 'ckpts')
7 |
8 |
9 | def HWC3(x):
10 | assert x.dtype == np.uint8
11 | if x.ndim == 2:
12 | x = x[:, :, None]
13 | assert x.ndim == 3
14 | H, W, C = x.shape
15 | assert C == 1 or C == 3 or C == 4
16 | if C == 3:
17 | return x
18 | if C == 1:
19 | return np.concatenate([x, x, x], axis=2)
20 | if C == 4:
21 | color = x[:, :, 0:3].astype(np.float32)
22 | alpha = x[:, :, 3:4].astype(np.float32) / 255.0
23 | y = color * alpha + 255.0 * (1.0 - alpha)
24 | y = y.clip(0, 255).astype(np.uint8)
25 | return y
26 |
27 |
28 | def resize_image(input_image, resolution):
29 | H, W, C = input_image.shape
30 | H = float(H)
31 | W = float(W)
32 | k = float(resolution) / min(H, W)
33 | H *= k
34 | W *= k
35 | H = int(np.round(H / 64.0)) * 64
36 | W = int(np.round(W / 64.0)) * 64
37 | img = cv2.resize(input_image, (W, H), interpolation=cv2.INTER_LANCZOS4 if k > 1 else cv2.INTER_AREA)
38 | return img
39 |
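A sketch of the two helpers above on a random grayscale image (import path assumes PYTHONPATH points at lavis/common; the sizes are illustrative):

    import numpy as np

    from annotator.util import HWC3, resize_image

    gray = np.random.randint(0, 256, size=(480, 640), dtype=np.uint8)  # H x W
    rgb = HWC3(gray)                  # replicated to 3 channels: (480, 640, 3)
    resized = resize_image(rgb, 512)  # short side ~512, both sides snapped to 64
    print(rgb.shape, resized.shape)   # (480, 640, 3) (512, 704, 3)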
--------------------------------------------------------------------------------
/experiments/lavis/common/gradcam.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from matplotlib import pyplot as plt
3 | from scipy.ndimage import filters
4 | from skimage import transform as skimage_transform
5 |
6 |
7 | def getAttMap(img, attMap, blur=True, overlap=True):
8 | attMap -= attMap.min()
9 | if attMap.max() > 0:
10 | attMap /= attMap.max()
11 | attMap = skimage_transform.resize(attMap, (img.shape[:2]), order=3, mode="constant")
12 | if blur:
13 | attMap = filters.gaussian_filter(attMap, 0.02 * max(img.shape[:2]))
14 | attMap -= attMap.min()
15 | attMap /= attMap.max()
16 | cmap = plt.get_cmap("jet")
17 | attMapV = cmap(attMap)
18 | attMapV = np.delete(attMapV, 3, 2)
19 | if overlap:
20 | attMap = (
21 | 1 * (1 - attMap**0.7).reshape(attMap.shape + (1,)) * img
22 | + (attMap**0.7).reshape(attMap.shape + (1,)) * attMapV
23 | )
24 | return attMap
25 |
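A sketch of ``getAttMap`` on random inputs, only to show the expected shapes and value ranges (import path assumes PYTHONPATH points at experiments/):

    import numpy as np

    from lavis.common.gradcam import getAttMap

    img = np.random.rand(224, 224, 3)  # float image in [0, 1]
    att = np.random.rand(16, 16)       # low-resolution attention map
    overlay = getAttMap(img, att, blur=True, overlap=True)
    print(overlay.shape)               # (224, 224, 3)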
--------------------------------------------------------------------------------
/experiments/lavis/common/vqa_tools/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | __author__ = "aagrawal"
9 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/datasets/avsd/defaults_dial.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | datasets:
7 | avsd_dialogue: # name of the dataset builder
8 | dataset_card: dataset_card/avsd_dialogue.md
9 | data_type: features #extracted features of videos (I3D, VGGish) # [images|videos|features]
10 |
11 | build_info:
12 | # Be careful not to append minus sign (-) before split to avoid itemizing
13 | annotations:
14 | train:
15 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/avsd_dstc7_train.json
16 | storage: avsd/annotations/train.json
17 | val:
18 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/avsd_dstc7_val.json
19 | storage: avsd/annotations/val.json
20 | test:
21 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/avsd_dstc7_test.json
22 | storage: avsd/annotations/test.json
23 | features:
24 | storage: avsd/features/
25 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/datasets/blip_diffusion_datasets/defaults.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | datasets:
7 | blip_diffusion_finetune: # name of the dataset builder
8 | # data_dir: ${env.data_dir}/datasets
9 | data_type: images # [images|videos|features]
10 |
11 | build_info:
12 | # Be careful not to append minus sign (-) before split to avoid itemizing
13 | images:
14 | storage: ""
15 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/datasets/coco/defaults_ret.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | datasets:
7 | coco_retrieval:
8 | # data_dir: ${env.data_dir}/datasets
9 | data_type: images # [images|videos|features]
10 |
11 | build_info:
12 | # Be careful not to append minus sign (-) before split to avoid itemizing
13 | annotations:
14 | train:
15 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json
16 | md5: aa31ac474cf6250ebb81d18348a07ed8
17 | storage: coco/annotations/coco_karpathy_train.json
18 | val:
19 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json
20 | md5: b273847456ef5580e33713b1f7de52a0
21 | storage: coco/annotations/coco_karpathy_val.json
22 | test:
23 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json
24 | md5: 3ff34b0ef2db02d01c37399f6a2a6cd1
25 | storage: coco/annotations/coco_karpathy_test.json
26 | images:
27 | storage: coco/images/
28 |
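The ``build_info`` block above tells the ``coco_retrieval`` builder where to download the Karpathy-split annotation JSONs (``url``) and where to cache them relative to the LAVIS cache root (``storage``); images are expected under ``coco/images/``. A hedged sketch, assuming this fork keeps the standard LAVIS ``load_dataset`` helper:

    from lavis.datasets.builders import load_dataset

    # Downloads/caches annotations per the yaml above; COCO images must already
    # be present under <cache_root>/coco/images/.
    coco_ret = load_dataset("coco_retrieval")
    print(coco_ret.keys())        # expected: dict_keys(['train', 'val', 'test'])
    print(len(coco_ret["train"]))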
--------------------------------------------------------------------------------
/experiments/lavis/configs/datasets/conceptual_caption/defaults_12m.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | datasets:
7 | conceptual_caption_12m:
8 | # data_dir: ${env.data_dir}/datasets
9 | data_type: images # [images|videos|features]
10 |
11 | build_info:
12 | # Be careful not to append minus sign (-) before split to avoid itemizing
13 | annotations:
14 | train:
15 | url:
16 | - /export/home/workspace/datasets/cc12m.json
17 | storage:
18 | - conceptual_caption/annotations/cc12m.json
19 | images:
20 | storage: conceptual_caption/images_12m
21 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/datasets/conceptual_caption/defaults_3m.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | datasets:
7 | conceptual_caption_3m:
8 | # data_dir: ${env.data_dir}/datasets
9 | data_type: images # [images|videos|features]
10 |
11 | build_info:
12 | # Be careful not to append minus sign (-) before split to avoid itemizing
13 | annotations:
14 | train:
15 | url:
16 | - /export/home/workspace/datasets/cc3m.json
17 | storage:
18 | - conceptual_caption/annotations/cc3m.json
19 | images:
20 | storage: conceptual_caption/images
21 |
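The recurring "Be careful not to append minus sign (-) before split" comment is about YAML semantics: a leading "-" turns the split entries into a list of one-key mappings instead of a mapping keyed by split name, which breaks lookups such as build_info.annotations["train"]. A small PyYAML demonstration of the difference:

    import yaml

    # As written in these configs: splits form a mapping keyed by name.
    as_mapping = yaml.safe_load("""
    annotations:
      train:
        storage: conceptual_caption/annotations/cc3m.json
    """)
    print(type(as_mapping["annotations"]))               # <class 'dict'>
    print(as_mapping["annotations"]["train"]["storage"])

    # With a stray "-" the same lines become a list of single-key dicts,
    # and ["train"] lookups fail with a TypeError.
    as_list = yaml.safe_load("""
    annotations:
      - train:
          storage: conceptual_caption/annotations/cc3m.json
    """)
    print(type(as_list["annotations"]))                  # <class 'list'>
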
--------------------------------------------------------------------------------
/experiments/lavis/configs/datasets/didemo/defaults_ret.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | datasets:
7 | didemo_retrieval: # name of the dataset builder
8 | # data_dir: ${env.data_dir}/datasets
9 | data_type: videos # [images|videos|features]
10 |
11 | build_info:
12 | # Be careful not to append minus sign (-) before split to avoid itemizing
13 | annotations:
14 | train:
15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/didemo/retrieval_train.json
16 | storage: didemo/annotations/retrieval_train.json
17 | val:
18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/didemo/retrieval_val.json
19 | storage: didemo/annotations/retrieval_val.json
20 | test:
21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/didemo/retrieval_test.json
22 | storage: didemo/annotations/retrieval_test.json
23 | videos:
24 | storage: didemo/videos
25 | # storage: /export/share/dongxuli/data/didemo_retrieval/videos
26 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/datasets/flickr30k/defaults.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | datasets:
7 | flickr30k:
8 | # data_dir: ${env.data_dir}/datasets
9 | data_type: images
10 |
11 | build_info:
12 | annotations:
13 | train:
14 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_train.json
15 | storage: flickr30k/annotations/train.json
16 | val:
17 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_val.json
18 | storage: flickr30k/annotations/val.json
19 | test:
20 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_test.json
21 | storage: flickr30k/annotations/test.json
22 | images:
23 | storage: flickr30k/images
24 | # storage: /export/share/datasets/vision/flickr30k
25 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/datasets/imagenet/defaults.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | datasets:
7 | imagenet:
8 | # data_dir: ${env.data_dir}/datasets
9 | data_type: images # [images|videos|features]
10 |
11 | build_info:
12 | # Be careful not to append minus sign (-) before split to avoid itemizing
13 | splits: ["val"]
14 | images:
15 | storage: /export/share/datasets/vision/imagenet
16 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/datasets/laion/defaults_2B_multi.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | datasets:
7 | laion2B_multi:
8 |
9 | data_type: images
10 |
11 | build_info:
12 | # Be careful not to append minus sign (-) before split to avoid itemizing
13 | storage: /export/laion/laion2B-multi/part-00000/{00000..01743}.tar
14 |
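Unlike the other dataset configs, laion2B_multi points build_info.storage directly at a brace-expanded range of webdataset shards rather than at annotation files. Webdataset-style loaders generally accept the brace notation as-is; the stdlib sketch below only expands the pattern so you can see what it covers.

    import re

    def expand_shards(pattern: str) -> list[str]:
        # Expand a single {A..B} numeric range, keeping the zero padding.
        m = re.search(r"\{(\d+)\.\.(\d+)\}", pattern)
        if not m:
            return [pattern]
        lo, hi = m.group(1), m.group(2)
        width = len(lo)
        return [pattern[:m.start()] + str(i).zfill(width) + pattern[m.end():]
                for i in range(int(lo), int(hi) + 1)]

    shards = expand_shards("/export/laion/laion2B-multi/part-00000/{00000..01743}.tar")
    print(len(shards), shards[0], shards[-1])   # 1744 shards in total
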
--------------------------------------------------------------------------------
/experiments/lavis/configs/datasets/msrvtt/defaults_cap.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | datasets:
7 | msrvtt_cap: # name of the dataset builder
8 | # data_dir: ${env.data_dir}/datasets
9 | data_type: videos # [images|videos|features]
10 |
11 | build_info:
12 | # Be careful not to append minus sign (-) before split to avoid itemizing
13 | annotations:
14 | train:
15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_train.json
16 | storage: msrvtt/annotations/cap_train.json
17 | val:
18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_val.json
19 | storage: msrvtt/annotations/cap_val.json
20 | test:
21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_test.json
22 | storage: msrvtt/annotations/cap_test.json
23 | videos:
24 | storage: msrvtt/videos
25 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/datasets/msrvtt/defaults_ret.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | datasets:
7 | msrvtt_retrieval: # name of the dataset builder
8 | # data_dir: ${env.data_dir}/datasets
9 | data_type: videos # [images|videos|features]
10 |
11 | build_info:
12 | # Be careful not to append minus sign (-) before split to avoid itemizing
13 | annotations:
14 | train:
15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/retrieval_train.json
16 | storage: msrvtt/annotations/retrieval_train.json
17 | val:
18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/retrieval_val.json
19 | storage: msrvtt/annotations/retrieval_val.json
20 | test:
21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/retrieval_test.json
22 | storage: msrvtt/annotations/retrieval_test.json
23 | videos:
24 | storage: msrvtt/videos
25 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/datasets/msvd/defaults_cap.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | datasets:
7 | msvd_cap: # name of the dataset builder
8 | # data_dir: ${env.data_dir}/datasets
9 | data_type: videos # [images|videos|features]
10 |
11 | build_info:
12 | # Be careful not to append minus sign (-) before split to avoid itemizing
13 | annotations:
14 | train:
15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_train.json
16 | storage: msvd/annotations/cap_train.json
17 | val:
18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_val.json
19 | storage: msvd/annotations/cap_val.json
20 | test:
21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_test.json
22 | storage: msvd/annotations/cap_test.json
23 | videos:
24 | storage: msvd/videos
25 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/datasets/nlvr/defaults.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | datasets:
7 | nlvr:
8 | # data_dir: ${env.data_dir}/datasets
9 | data_type: images # [images|videos|features]
10 |
11 | build_info:
12 | # Be careful not to append minus sign (-) before split to avoid itemizing
13 | annotations:
14 | train:
15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_train.json
16 | storage: nlvr/annotations/train.json
17 | val:
18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_dev.json
19 | storage: nlvr/annotations/dev.json
20 | test:
21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_dev.json
22 | storage: nlvr/annotations/test.json
23 | images:
24 | storage: /export/share/datasets/vision/NLVR2/
25 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/datasets/nocaps/defaults.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | datasets:
7 | nocaps: # name of the dataset builder
8 | # data_dir: ${env.data_dir}/datasets
9 | data_type: images # [images|videos|features]
10 |
11 | build_info:
12 | # Be careful not to append minus sign (-) before split to avoid itemizing
13 | annotations:
14 | val:
15 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/nocaps_val.json
16 | storage: nocaps/annotations/nocaps_val.json
17 | test:
18 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/nocaps_test.json
19 | storage: nocaps/annotations/nocaps_test.json
20 | images:
21 | storage: nocaps/images
22 | # storage: /export/share/datasets/vision/nocaps/
23 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/datasets/sbu_caption/defaults.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | datasets:
7 | sbu_caption:
8 | # data_dir: ${env.data_dir}/datasets
9 | data_type: images # [images|videos|features]
10 |
11 | build_info:
12 | # Be careful not to append minus sign (-) before split to avoid itemizing
13 | annotations:
14 | train:
15 | url:
16 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/sbu/sbu.json
17 | # - /export/share/dongxuli/data/lavis/sbu/annotation/sbu.json
18 | storage:
19 | - sbu_captions/annotations/sbu.json
20 | images:
21 | storage: sbu_captions/images
22 | # storage: /export/share/datasets/vision_language/sbu_resize
23 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/datasets/snli_ve/defaults.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | datasets:
7 | snli_ve:
8 | # data_dir: ${env.data_dir}/datasets
9 | data_type: images # [images|videos|features]
10 |
11 | build_info:
12 | # Be careful not to append minus sign (-) before split to avoid itemizing
13 | annotations:
14 | train:
15 | url: /export/share/dongxuli/data/lavis/snli/annotation/ve_train.json
16 | storage: snli/annotations/ve_train.json
17 | val:
18 | url: /export/share/dongxuli/data/lavis/snli/annotation/ve_dev.json
19 | storage: snli/annotations/ve_dev.json
20 | test:
21 | url: /export/share/dongxuli/data/lavis/snli/annotation/ve_test.json
22 | storage: snli/annotations/ve_test.json
23 | images:
24 | storage: flickr30k/images/flickr30k-images
25 | # storage: /export/share/datasets/vision/flickr30k/flickr30k-images
26 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/datasets/vatex/defaults_cap.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | datasets:
7 | msvd_cap: # name of the dataset builder
8 | # data_dir: ${env.data_dir}/datasets
9 | data_type: videos # [images|videos|features]
10 |
11 | build_info:
12 | # Be careful not to append minus sign (-) before split to avoid itemizing
13 | annotations:
14 | train:
15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_train.json
16 | storage: vatex/annotations/cap_train.json
17 | val:
18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_val.json
19 | storage: vatex/annotations/cap_val.json
20 | test:
21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_private_test.json
22 | storage: vatex/annotations/cap_test.json
23 | videos:
24 | storage: /export/share/dongxuli/data/vatex
25 |
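Each defaults file registers exactly one top-level key under datasets:, and that key is the builder name the run configs refer to. These keys are easy to get wrong when cloning a config (note that the VATEX captioning file above still declares msvd_cap), so a quick stdlib pass can flag keys that do not match the directory they live in. Treat mismatches only as hints to double-check against the registered builders, not as definite bugs. Run from the repository root:

    # Compare each dataset config's top-level builder key against its directory name.
    from pathlib import Path
    import yaml

    for cfg_path in sorted(Path("experiments/lavis/configs/datasets").rglob("defaults*.yaml")):
        cfg = yaml.safe_load(cfg_path.read_text())
        folder = cfg_path.parent.name                    # e.g. "vatex", "coco", "msvd"
        for builder_name in cfg.get("datasets", {}):
            if not builder_name.startswith(folder.split("_")[0]):
                print(f"{cfg_path}: builder '{builder_name}' vs folder '{folder}'")
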
--------------------------------------------------------------------------------
/experiments/lavis/configs/datasets/vg/defaults_caption.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | datasets:
7 | vg_caption:
8 | # data_dir: ${env.data_dir}/datasets
9 | data_type: images # [images|videos|features]
10 |
11 | build_info:
12 | # Be careful not to append minus sign (-) before split to avoid itemizing
13 | annotations:
14 | train:
15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/visual_genome/vg_caption.json
16 | storage: vg/annotations/vg_caption.json
17 | images:
18 | storage: vg/images/
19 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/datasets/vg/defaults_vqa.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | datasets:
7 | vg_vqa:
8 | # data_dir: ${env.data_dir}/datasets
9 | data_type: images # [images|videos|features]
10 |
11 | build_info:
12 | # Be careful not to append minus sign (-) before split to avoid itemizing
13 | annotations:
14 | train:
15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/visual_genome/vg_qa.json
16 | storage: vg/annotations/vg_qa.json
17 | images:
18 | storage: vg/images/
19 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/default.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | env:
7 | # For default users
8 | # cache_root: "cache"
9 | # For internal use with persistent storage
10 | cache_root: "/export/home/.cache/lavis"
11 |
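cache_root is the prefix under which the relative storage entries from the dataset configs above are materialized; absolute entries such as /export/share/datasets/vision/imagenet are used as-is. The snippet below only illustrates that resolution rule with pathlib; it does not call into LAVIS itself.

    from pathlib import Path

    cache_root = Path("/export/home/.cache/lavis")   # value from configs/default.yaml

    def resolve(storage: str) -> Path:
        # Relative storage paths live under cache_root; absolute ones are kept as-is.
        p = Path(storage)
        return p if p.is_absolute() else cache_root / p

    print(resolve("coco/annotations/coco_karpathy_train.json"))
    # /export/home/.cache/lavis/coco/annotations/coco_karpathy_train.json
    print(resolve("/export/share/datasets/vision/imagenet"))
    # /export/share/datasets/vision/imagenet
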
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/albef_classification_ve.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: albef_classification
8 | load_finetuned: True
9 |
10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_snli_ve_lavis.pt"
11 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth"
12 |
13 | num_classes: 3
14 |
15 | use_distill: True
16 | momentum: 0.995
17 | alpha: 0.4
18 |
19 | # vit encoder
20 | vit_type: "base"
21 | vit_grad_ckpt: False
22 | vit_ckpt_layer: 0
23 | vit_layer_norm_epsilon: 1e-6
24 |
25 | image_size: 384
26 |
27 | # bert config
28 | med_config_path: "configs/models/med_config_albef.json"
29 |
30 | preprocess:
31 | vis_processor:
32 | train:
33 | name: "blip_image_train"
34 | eval:
35 | name: "blip_image_eval"
36 | text_processor:
37 | train:
38 | name: "blip_caption"
39 | eval:
40 | name: "blip_caption"
41 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/albef_feature_extractor.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: albef_pretrain
8 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth"
9 |
10 | # vit encoder
11 | vit_type: "base"
12 | image_size: 224
13 | vit_ckpt_layer: 0
14 | vit_drop_path_rate: 0
15 | vit_layer_norm_epsilon: 1e-6
16 | vit_grad_ckpt: False
17 |
18 | # bert config
19 | med_config_path: "configs/models/med_config_albef.json"
20 |
21 | embed_dim: 256
22 |
23 | preprocess:
24 | vis_processor:
25 | eval:
26 | name: "blip_image_eval"
27 | image_size: 224
28 | text_processor:
29 | eval:
30 | name: "blip_caption"
31 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/albef_nlvr.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: albef_nlvr
8 | load_finetuned: True
9 |
10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/pretrain_model_nlvr.pth"
11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_nlvr_lavis.pt"
12 |
13 | num_classes: 2
14 |
15 | use_distill: True
16 | momentum: 0.995
17 | alpha: 0.4
18 |
19 | # vit encoder
20 | vit_type: "base"
21 | vit_grad_ckpt: False
22 | vit_ckpt_layer: 0
23 | vit_layer_norm_epsilon: 1e-6
24 |
25 | image_size: 384
26 |
27 | # bert config
28 | med_config_path: "configs/models/med_config_albef.json"
29 |
30 | preprocess:
31 | vis_processor:
32 | train:
33 | name: "blip_image_train"
34 | image_size: 384
35 | eval:
36 | name: "blip_image_eval"
37 | image_size: 384
38 | text_processor:
39 | train:
40 | name: "blip_caption"
41 | eval:
42 | name: "blip_caption"
43 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/albef_pretrain_base.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: albef_pretrain
8 |
9 | load_pretrained: True
10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth"
11 |
12 | # vit encoder
13 | vit_type: "base"
14 | image_size: 224
15 | vit_ckpt_layer: 0
16 | vit_drop_path_rate: 0
17 | vit_layer_norm_epsilon: 1e-6
18 | vit_grad_ckpt: False
19 |
20 | # bert config
21 | med_config_path: "configs/models/med_config_albef.json"
22 | mlm_mask_prob: 0.15
23 |
24 | embed_dim: 256
25 | momentum: 0.995
26 | alpha: 0.4
27 | temp: 0.07
28 |
29 | max_txt_len: 30
30 |
31 | preprocess:
32 | vis_processor:
33 | train:
34 | name: "blip_image_train"
35 | image_size: 256
36 | text_processor:
37 | train:
38 | name: "blip_caption"
39 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/albef_vqav2.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: albef_vqa
8 | load_finetuned: True
9 |
10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth"
11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_vqav2_lavis.pt"
12 |
13 | use_distill: True
14 | momentum: 0.995
15 | alpha: 0.4
16 |
17 | # vit encoder
18 | vit_type: "base"
19 | vit_grad_ckpt: False
20 | vit_ckpt_layer: 0
21 | vit_layer_norm_epsilon: 1e-6
22 |
23 | image_size: 384
24 |
25 | # bert config
26 | med_config_path: "configs/models/med_config_albef.json"
27 |
28 | preprocess:
29 | vis_processor:
30 | train:
31 | name: "blip_image_train"
32 | image_size: 384
33 | eval:
34 | name: "blip_image_eval"
35 | image_size: 384
36 | text_processor:
37 | train:
38 | name: "blip_question"
39 | eval:
40 | name: "blip_question"
41 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/alpro_qa_msrvtt.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: alpro_qa
8 | num_classes: 1500
9 |
10 | load_finetuned: True
11 |
12 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msrvtt_qa.pth"
13 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt"
14 |
15 | timesformer:
16 | n_frms: 16
17 | image_size: 224
18 |
19 | patch_size: 16
20 | attn_drop_rate: 0.
21 | drop_rate: 0.
22 | drop_path_rate: 0.1
23 |
24 | use_grad_ckpt: True
25 | ckpt_layer: 12
26 |
27 | # bert config
28 | med_config_path: "configs/models/bert_config_alpro.json"
29 |
30 | preprocess:
31 | vis_processor:
32 | train:
33 | name: "alpro_video_train"
34 | n_frms: 16
35 | image_size: 224
36 | eval:
37 | name: "alpro_video_eval"
38 | n_frms: 16
39 | image_size: 224
40 | text_processor:
41 | train:
42 | name: "blip_caption"
43 | eval:
44 | name: "blip_caption"
45 |
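The ALPRO configs state the frame count and resolution twice: once under model.timesformer and once per vis_processor. When editing these files the two must stay in sync, so a small PyYAML check is handy (path taken from this dump; run from the repository root):

    import yaml
    from pathlib import Path

    cfg_path = Path("experiments/lavis/configs/models/alpro_qa_msrvtt.yaml")
    cfg = yaml.safe_load(cfg_path.read_text())

    tf = cfg["model"]["timesformer"]
    for split, proc in cfg["preprocess"]["vis_processor"].items():
        assert proc["n_frms"] == tf["n_frms"], f"{split}: n_frms mismatch"
        assert proc["image_size"] == tf["image_size"], f"{split}: image_size mismatch"
    print("timesformer and video processors agree:",
          {"n_frms": tf["n_frms"], "image_size": tf["image_size"]})
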
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/alpro_qa_msvd.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: alpro_qa
8 | num_classes: 2423
9 |
10 | load_finetuned: True
11 |
12 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msvd_qa.pth"
13 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt"
14 |
15 | timesformer:
16 | n_frms: 16
17 | image_size: 224
18 |
19 | patch_size: 16
20 | attn_drop_rate: 0.
21 | drop_rate: 0.
22 | drop_path_rate: 0.1
23 | use_grad_ckpt: True
24 | ckpt_layer: 12
25 |
26 | # bert config
27 | med_config_path: "configs/models/bert_config_alpro.json"
28 |
29 | preprocess:
30 | vis_processor:
31 | train:
32 | name: "alpro_video_train"
33 | n_frms: 16
34 | image_size: 224
35 | eval:
36 | name: "alpro_video_eval"
37 | n_frms: 16
38 | image_size: 224
39 | text_processor:
40 | train:
41 | name: "blip_caption"
42 | eval:
43 | name: "blip_caption"
44 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/alpro_retrieval_didemo.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: alpro_retrieval
8 |
9 | load_finetuned: True
10 |
11 | finetuned: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_didemo_retrieval.pt
12 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt"
13 |
14 | timesformer:
15 | n_frms: 8
16 | image_size: 224
17 |
18 | patch_size: 16
19 | attn_drop_rate: 0.
20 | drop_rate: 0.
21 | drop_path_rate: 0.1
22 | use_grad_ckpt: False
23 |
24 | # bert config
25 | med_config_path: "configs/models/bert_config_alpro.json"
26 |
27 | preprocess:
28 | vis_processor:
29 | eval:
30 | name: "alpro_video_eval"
31 | n_frms: 8
32 | image_size: 224
33 | text_processor:
34 | eval:
35 | name: "blip_caption"
36 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/alpro_retrieval_msrvtt.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: alpro_retrieval
8 |
9 | load_finetuned: True
10 |
11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msrvtt_retrieval.pt"
12 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt"
13 |
14 | timesformer:
15 | n_frms: 8
16 | image_size: 224
17 |
18 | patch_size: 16
19 | attn_drop_rate: 0.
20 | drop_rate: 0.
21 | drop_path_rate: 0.1
22 | use_grad_ckpt: False
23 |
24 | # bert config
25 | med_config_path: "configs/models/bert_config_alpro.json"
26 |
27 | preprocess:
28 | vis_processor:
29 | train:
30 | name: "alpro_video_train"
31 | n_frms: 8
32 | image_size: 224
33 | eval:
34 | name: "alpro_video_eval"
35 | n_frms: 8
36 | image_size: 224
37 | text_processor:
38 | train:
39 | name: "blip_caption"
40 | eval:
41 | name: "blip_caption"
42 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/bert_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "architectures": [
3 | "BertModel"
4 | ],
5 | "attention_probs_dropout_prob": 0.1,
6 | "hidden_act": "gelu",
7 | "hidden_dropout_prob": 0.1,
8 | "hidden_size": 768,
9 | "initializer_range": 0.02,
10 | "intermediate_size": 3072,
11 | "layer_norm_eps": 1e-12,
12 | "max_position_embeddings": 512,
13 | "model_type": "bert",
14 | "num_attention_heads": 12,
15 | "num_hidden_layers": 12,
16 | "pad_token_id": 0,
17 | "add_type_embeddings": false,
18 | "vocab_size": 30522,
19 | "encoder_width": 768,
20 | "add_cross_attention": true
21 | }
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/bert_config_alpro.json:
--------------------------------------------------------------------------------
1 | {
2 | "architectures": [
3 | "BertModel"
4 | ],
5 | "attention_probs_dropout_prob": 0.1,
6 | "hidden_act": "gelu",
7 | "hidden_dropout_prob": 0.1,
8 | "hidden_size": 768,
9 | "initializer_range": 0.02,
10 | "intermediate_size": 3072,
11 | "layer_norm_eps": 1e-12,
12 | "max_position_embeddings": 512,
13 | "model_type": "bert",
14 | "num_attention_heads": 12,
15 | "num_hidden_layers": 12,
16 | "pad_token_id": 0,
17 | "add_type_embeddings": true,
18 | "type_vocab_size": 2,
19 | "vocab_size": 30522,
20 | "encoder_width": 768,
21 | "add_cross_attention": false,
22 | "fusion_layer": 6
23 | }
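These two bert_config JSON files are consumed by LAVIS's customized BERT/MED modules; fields such as encoder_width, add_type_embeddings, add_cross_attention, and fusion_layer are read by those custom classes rather than by stock BERT. For quick inspection, Hugging Face's BertConfig will load the files and carry the extra keys along as plain attributes; this is a convenience for looking at the values, not how LAVIS builds its models.

    from transformers import BertConfig

    cfg = BertConfig.from_json_file(
        "experiments/lavis/configs/models/bert_config_alpro.json"
    )
    print(cfg.hidden_size, cfg.num_hidden_layers)      # 768 12
    print(cfg.fusion_layer, cfg.add_type_embeddings)   # 6 True  (ALPRO-specific fields)
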
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/blip-diffusion/blip_diffusion_base.yaml:
--------------------------------------------------------------------------------
1 | model:
2 | vit_model: "clip_L"
3 |
4 | qformer_num_query_token: 16
5 | qformer_cross_attention_freq: 1
6 |
7 | sd_train_text_encoder: False
8 | sd_pretrained_model_name_or_path: "runwayml/stable-diffusion-v1-5"
9 |
10 | load_finetuned: False
11 | load_pretrained: True
12 | # pretrained: "/export/share/dongxuli/zerobooth/500000-renamed/"
13 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP-Diffusion/blip-diffusion.tar.gz"
14 |
15 | preprocess:
16 | vis_processor:
17 | train:
18 | name: "blip_diffusion_inp_image_eval"
19 | eval:
20 | name: "blip_diffusion_inp_image_eval"
21 | text_processor:
22 | train:
23 | name: "blip_caption"
24 | eval:
25 | name: "blip_caption"
26 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/blip-diffusion/blip_diffusion_controlnet_canny.yaml:
--------------------------------------------------------------------------------
1 | model:
2 | vit_model: "clip_L"
3 |
4 | qformer_num_query_token: 16
5 | qformer_cross_attention_freq: 1
6 |
7 | sd_train_text_encoder: False
8 | sd_pretrained_model_name_or_path: "runwayml/stable-diffusion-v1-5"
9 |
10 | load_finetuned: False
11 | load_pretrained: True
12 | # pretrained: "/export/share/dongxuli/zerobooth/500000-renamed/"
13 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP-Diffusion/blip-diffusion.tar.gz"
14 |
15 | controlnet_pretrained_model_name_or_path: "lllyasviel/sd-controlnet-canny"
16 |
17 | preprocess:
18 | vis_processor:
19 | train:
20 | name: "blip_diffusion_inp_image_eval"
21 | eval:
22 | name: "blip_diffusion_inp_image_eval"
23 | text_processor:
24 | train:
25 | name: "blip_caption"
26 | eval:
27 | name: "blip_caption"
28 |
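Besides the BLIP-Diffusion tarball, the ControlNet variants reference standard Hub checkpoints (runwayml/stable-diffusion-v1-5 and the lllyasviel/sd-controlnet-* family). The sketch below, assuming diffusers is installed, only shows what those identifiers resolve to; the LAVIS blip_diffusion model wires the ControlNet in internally through its own loading path.

    import torch
    from diffusers import ControlNetModel, StableDiffusionControlNetPipeline

    # Same identifiers as in blip_diffusion_controlnet_canny.yaml.
    controlnet = ControlNetModel.from_pretrained(
        "lllyasviel/sd-controlnet-canny", torch_dtype=torch.float16
    )
    pipe = StableDiffusionControlNetPipeline.from_pretrained(
        "runwayml/stable-diffusion-v1-5", controlnet=controlnet, torch_dtype=torch.float16
    )
    print(type(controlnet).__name__, type(pipe).__name__)
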
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/blip-diffusion/blip_diffusion_controlnet_depth.yaml:
--------------------------------------------------------------------------------
1 | model:
2 | vit_model: "clip_L"
3 |
4 | qformer_num_query_token: 16
5 | qformer_cross_attention_freq: 1
6 |
7 | sd_train_text_encoder: False
8 | sd_pretrained_model_name_or_path: "runwayml/stable-diffusion-v1-5"
9 |
10 | load_finetuned: False
11 | load_pretrained: True
12 | # pretrained: "/export/share/dongxuli/zerobooth/500000-renamed/"
13 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP-Diffusion/blip-diffusion-openimage.tar.gz"
14 |
15 | controlnet_pretrained_model_name_or_path: "lllyasviel/sd-controlnet-depth"
16 |
17 | preprocess:
18 | vis_processor:
19 | train:
20 | name: "blip_diffusion_inp_image_eval"
21 | eval:
22 | name: "blip_diffusion_inp_image_eval"
23 | text_processor:
24 | train:
25 | name: "blip_caption"
26 | eval:
27 | name: "blip_caption"
28 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/blip-diffusion/blip_diffusion_controlnet_hed.yaml:
--------------------------------------------------------------------------------
1 | model:
2 | vit_model: "clip_L"
3 |
4 | qformer_num_query_token: 16
5 | qformer_cross_attention_freq: 1
6 |
7 | sd_train_text_encoder: False
8 | sd_pretrained_model_name_or_path: "runwayml/stable-diffusion-v1-5"
9 |
10 | load_finetuned: False
11 | load_pretrained: True
12 | # pretrained: "/export/share/dongxuli/zerobooth/500000-renamed/"
13 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP-Diffusion/blip-diffusion-openimage.tar.gz"
14 |
15 | controlnet_pretrained_model_name_or_path: "lllyasviel/sd-controlnet-hed"
16 |
17 | preprocess:
18 | vis_processor:
19 | train:
20 | name: "blip_diffusion_inp_image_eval"
21 | eval:
22 | name: "blip_diffusion_inp_image_eval"
23 | text_processor:
24 | train:
25 | name: "blip_caption"
26 | eval:
27 | name: "blip_caption"
28 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: caption_coco_flant5xl
8 | load_finetuned: True
9 |
10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xl.pth"
11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_caption_flant5xl.pth"
12 |
13 | # vit encoder
14 | image_size: 364
15 | drop_path_rate: 0
16 | use_grad_checkpoint: False
17 | vit_precision: "fp32"
18 | freeze_vit: False
19 |
20 | # Q-Former
21 | num_query_token: 32
22 |
23 | # T5
24 | t5_model: "google/flan-t5-xl"
25 |
26 | # generation configs
27 | prompt: "a photo of"
28 |
29 |
30 | preprocess:
31 | vis_processor:
32 | train:
33 | name: "blip_image_train"
34 | image_size: 364
35 | eval:
36 | name: "blip_image_eval"
37 | image_size: 364
38 | text_processor:
39 | train:
40 | name: "blip_caption"
41 | eval:
42 | name: "blip_caption"
43 |
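These captioning configs are what the model zoo's caption entries resolve to. A usage sketch following the upstream LAVIS API is given below; the name/model_type strings come from the upstream model zoo and may be registered differently in this fork, and example.jpg is a placeholder image.

    import torch
    from PIL import Image
    from lavis.models import load_model_and_preprocess

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Upstream LAVIS registers the FlanT5-XL captioner as blip2_t5 / caption_coco_flant5xl.
    model, vis_processors, _ = load_model_and_preprocess(
        name="blip2_t5", model_type="caption_coco_flant5xl", is_eval=True, device=device
    )

    raw_image = Image.open("example.jpg").convert("RGB")     # placeholder image
    image = vis_processors["eval"](raw_image).unsqueeze(0).to(device)
    print(model.generate({"image": image}))                  # e.g. ["a photo of ..."]
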
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: caption_coco_opt2.7b
8 | load_finetuned: True
9 |
10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt2.7b.pth"
11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_caption_opt2.7b.pth"
12 |
13 | # vit encoder
14 | image_size: 364
15 | drop_path_rate: 0
16 | use_grad_checkpoint: False
17 | vit_precision: "fp32"
18 | freeze_vit: False
19 |
20 | # Q-Former
21 | num_query_token: 32
22 |
23 | # OPT
24 | opt_model: "facebook/opt-2.7b"
25 |
26 | # generation configs
27 | prompt: "a photo of"
28 |
29 |
30 | preprocess:
31 | vis_processor:
32 | train:
33 | name: "blip_image_train"
34 | image_size: 364
35 | eval:
36 | name: "blip_image_eval"
37 | image_size: 364
38 | text_processor:
39 | train:
40 | name: "blip_caption"
41 | eval:
42 | name: "blip_caption"
43 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: caption_coco_opt6.7b
8 | load_finetuned: True
9 |
10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt6.7b.pth"
11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_caption_opt6.7b.pth"
12 |
13 | # vit encoder
14 | image_size: 364
15 | drop_path_rate: 0
16 | use_grad_checkpoint: False
17 | vit_precision: "fp32"
18 | freeze_vit: False
19 |
20 | # Q-Former
21 | num_query_token: 32
22 |
23 | # OPT
24 | opt_model: "facebook/opt-6.7b"
25 |
26 | # generation configs
27 | prompt: "a photo of"
28 |
29 |
30 | preprocess:
31 | vis_processor:
32 | train:
33 | name: "blip_image_train"
34 | image_size: 364
35 | eval:
36 | name: "blip_image_eval"
37 | image_size: 364
38 | text_processor:
39 | train:
40 | name: "blip_caption"
41 | eval:
42 | name: "blip_caption"
43 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/blip2/blip2_coco.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: coco
8 | load_finetuned: True
9 |
10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth"
11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_finetune_coco.pth"
12 |
13 | # vit encoder
14 | image_size: 364
15 | drop_path_rate: 0
16 | use_grad_checkpoint: True
17 | vit_precision: "fp32"
18 | freeze_vit: False
19 |
20 | # Q-Former
21 | num_query_token: 32
22 |
23 |
24 | preprocess:
25 | vis_processor:
26 | train:
27 | name: "blip_image_train"
28 | image_size: 364
29 | eval:
30 | name: "blip_image_eval"
31 | image_size: 364
32 | text_processor:
33 | train:
34 | name: "blip_caption"
35 | eval:
36 | name: "blip_caption"
37 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: flant5xl
8 | load_finetuned: False
9 | load_pretrained: True
10 |
11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_flanxl_trimmed.pth"
12 | finetuned: ""
13 |
14 | # vit encoder
15 | image_size: 224
16 | drop_path_rate: 0
17 | use_grad_checkpoint: False
18 | vit_precision: "fp16"
19 | freeze_vit: True
20 |
21 | # Q-Former
22 | num_query_token: 32
23 |
24 | # T5
25 | t5_model: "google/flan-t5-xl"
26 |
27 | # generation configs
28 | prompt: ""
29 |
30 |
31 | preprocess:
32 | vis_processor:
33 | train:
34 | name: "blip_image_train"
35 | image_size: 224
36 | eval:
37 | name: "blip_image_eval"
38 | image_size: 224
39 | text_processor:
40 | train:
41 | name: "blip_caption"
42 | eval:
43 | name: "blip_caption"
44 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: flant5xxl
8 | load_finetuned: False
9 | load_pretrained: True
10 |
11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_flanxxl_trimmed.pth"
12 | finetuned: ""
13 |
14 | # vit encoder
15 | image_size: 224
16 | drop_path_rate: 0
17 | use_grad_checkpoint: False
18 | vit_precision: "fp16"
19 | freeze_vit: True
20 |
21 | # Q-Former
22 | num_query_token: 32
23 |
24 | # T5
25 | t5_model: "google/flan-t5-xxl"
26 |
27 | # generation configs
28 | prompt: ""
29 |
30 |
31 | preprocess:
32 | vis_processor:
33 | train:
34 | name: "blip_image_train"
35 | image_size: 224
36 | eval:
37 | name: "blip_image_eval"
38 | image_size: 224
39 | text_processor:
40 | train:
41 | name: "blip_caption"
42 | eval:
43 | name: "blip_caption"
44 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: instruct_vicuna13b
8 | load_finetuned: False
9 | load_pretrained: True
10 |
11 | # cd_comments: set right path for pretrained blip ckpt
12 | pretrained: "./checkpoints/instruct_blip_vicuna13b_trimmed.pth"
13 | finetuned: ""
14 |
15 | # vit encoder
16 | image_size: 224
17 | drop_path_rate: 0
18 | use_grad_checkpoint: False
19 | vit_precision: "fp16"
20 | freeze_vit: True
21 |
22 | # Q-Former
23 | num_query_token: 32
24 |
25 | # cd_comments: set right path for vicuna
26 | llm_model: "./checkpoints/vicuna-13b-v1.1"
27 |
28 | # generation configs
29 | prompt: ""
30 |
31 |
32 | preprocess:
33 | vis_processor:
34 | train:
35 | name: "blip2_image_train"
36 | image_size: 224
37 | eval:
38 | name: "blip_image_eval"
39 | image_size: 224
40 | text_processor:
41 | train:
42 | name: "blip_caption"
43 | eval:
44 | name: "blip_caption"
45 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: instruct_vicuna7b
8 | load_finetuned: False
9 | load_pretrained: True
10 |
11 | # cd_comments: set right path for pretrained blip ckpt
12 | pretrained: "path/to/the/instruct_blip_vicuna7b_trimmed.pth"
13 |
14 | finetuned: ""
15 |
16 | # vit encoder
17 | image_size: 224
18 | drop_path_rate: 0
19 | use_grad_checkpoint: False
20 | vit_precision: "fp16"
21 | freeze_vit: True
22 |
23 | # Q-Former
24 | num_query_token: 32
25 |
26 | # cd_comments: set right path for vicuna
27 | llm_model: "path/checkpoints/vicuna-7b-v1.1"
28 |
29 | # generation configs
30 | prompt: ""
31 |
32 |
33 | preprocess:
34 | vis_processor:
35 | train:
36 | name: "blip2_image_train"
37 | image_size: 224
38 | eval:
39 | name: "blip_image_eval"
40 | image_size: 224
41 | text_processor:
42 | train:
43 | name: "blip_caption"
44 | eval:
45 | name: "blip_caption"
46 |
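Unlike the FlanT5 variants, the Vicuna-based InstructBLIP configs point pretrained and llm_model at local placeholder paths (see the cd_comments above), so they must be edited before loading. One way to do that programmatically, assuming OmegaConf is available and using purely hypothetical local paths; note that re-saving drops the YAML comments, so editing by hand is equally reasonable.

    from omegaconf import OmegaConf

    cfg_path = "experiments/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml"
    cfg = OmegaConf.load(cfg_path)

    # Hypothetical locations of the trimmed InstructBLIP checkpoint and Vicuna weights.
    cfg.model.pretrained = "/data/checkpoints/instruct_blip_vicuna7b_trimmed.pth"
    cfg.model.llm_model = "/data/checkpoints/vicuna-7b-v1.1"

    OmegaConf.save(cfg, cfg_path)
    print(OmegaConf.to_yaml(cfg.model)[:200])
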
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/blip2/blip2_pretrain.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: pretrain
8 | load_finetuned: False
9 |
10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth"
11 | finetuned: ""
12 |
13 | # vit encoder
14 | image_size: 224
15 | drop_path_rate: 0
16 | use_grad_checkpoint: False
17 | vit_precision: "fp16"
18 | freeze_vit: True
19 |
20 | # Q-Former
21 | num_query_token: 32
22 |
23 |
24 | preprocess:
25 | vis_processor:
26 | train:
27 | name: "blip_image_train"
28 | image_size: 224
29 | eval:
30 | name: "blip_image_eval"
31 | image_size: 224
32 | text_processor:
33 | train:
34 | name: "blip_caption"
35 | eval:
36 | name: "blip_caption"
37 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: pretrain_flant5xl
8 | load_finetuned: False
9 |
10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xl.pth"
11 | finetuned: ""
12 |
13 | # vit encoder
14 | image_size: 224
15 | drop_path_rate: 0
16 | use_grad_checkpoint: False
17 | vit_precision: "fp16"
18 | freeze_vit: True
19 |
20 | # Q-Former
21 | num_query_token: 32
22 |
23 | # T5
24 | t5_model: "google/flan-t5-xl"
25 |
26 | # generation configs
27 | prompt: ""
28 |
29 |
30 | preprocess:
31 | vis_processor:
32 | train:
33 | name: "blip_image_train"
34 | image_size: 224
35 | eval:
36 | name: "blip_image_eval"
37 | image_size: 224
38 | text_processor:
39 | train:
40 | name: "blip_caption"
41 | eval:
42 | name: "blip_caption"
43 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: pretrain_flant5xl
8 | load_finetuned: False
9 |
10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xl_vitL.pth"
11 | finetuned: ""
12 |
13 | # vit encoder
14 | vit_model: "clip_L"
15 | image_size: 224
16 | drop_path_rate: 0
17 | use_grad_checkpoint: False
18 | vit_precision: "fp16"
19 | freeze_vit: True
20 |
21 | # Q-Former
22 | num_query_token: 32
23 |
24 | # T5
25 | t5_model: "google/flan-t5-xl"
26 |
27 | # generation configs
28 | prompt: ""
29 |
30 |
31 | preprocess:
32 | vis_processor:
33 | train:
34 | name: "blip_image_train"
35 | image_size: 224
36 | eval:
37 | name: "blip_image_eval"
38 | image_size: 224
39 | text_processor:
40 | train:
41 | name: "blip_caption"
42 | eval:
43 | name: "blip_caption"
44 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: pretrain_flant5xxl
8 | load_finetuned: False
9 |
10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xxl.pth"
11 | finetuned: ""
12 |
13 | # vit encoder
14 | image_size: 224
15 | drop_path_rate: 0
16 | use_grad_checkpoint: False
17 | vit_precision: "fp16"
18 | freeze_vit: True
19 |
20 | # Q-Former
21 | num_query_token: 32
22 |
23 | # T5
24 | t5_model: "google/flan-t5-xxl"
25 |
26 | # generation configs
27 | prompt: ""
28 |
29 |
30 | preprocess:
31 | vis_processor:
32 | train:
33 | name: "blip_image_train"
34 | image_size: 224
35 | eval:
36 | name: "blip_image_eval"
37 | image_size: 224
38 | text_processor:
39 | train:
40 | name: "blip_caption"
41 | eval:
42 | name: "blip_caption"
43 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/blip2/blip2_pretrain_llama7b.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: blip2_llama
8 | load_finetuned: False
9 |
10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth"
11 | finetuned: ""
12 |
13 | # vit encoder
14 | image_size: 224
15 | drop_path_rate: 0
16 | use_grad_checkpoint: False
17 | vit_precision: "fp16"
18 | freeze_vit: True
19 |
20 | # Q-Former
21 | num_query_token: 32
22 |
23 | # LLM
24 | llm_model: "/export/home/project/stanford_alpaca/llama_7B"
25 |
26 | # generation configs
27 | prompt: ""
28 |
29 |
30 | preprocess:
31 | vis_processor:
32 | train:
33 | name: "blip2_image_train"
34 | image_size: 224
35 | eval:
36 | name: "blip_image_eval"
37 | image_size: 224
38 | text_processor:
39 | train:
40 | name: "blip_caption"
41 | eval:
42 | name: "blip_caption"
43 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: pretrain_opt2.7b
8 | load_finetuned: False
9 |
10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt2.7b.pth"
11 | finetuned: ""
12 |
13 | # vit encoder
14 | image_size: 224
15 | drop_path_rate: 0
16 | use_grad_checkpoint: False
17 | vit_precision: "fp16"
18 | freeze_vit: True
19 |
20 | # Q-Former
21 | num_query_token: 32
22 |
23 | # OPT
24 | opt_model: "facebook/opt-2.7b"
25 |
26 | # generation configs
27 | prompt: ""
28 |
29 |
30 | preprocess:
31 | vis_processor:
32 | train:
33 | name: "blip_image_train"
34 | image_size: 224
35 | eval:
36 | name: "blip_image_eval"
37 | image_size: 224
38 | text_processor:
39 | train:
40 | name: "blip_caption"
41 | eval:
42 | name: "blip_caption"
43 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: pretrain_opt6.7b
8 | load_finetuned: False
9 |
10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt6.7b.pth"
11 | finetuned: ""
12 |
13 | # vit encoder
14 | image_size: 224
15 | drop_path_rate: 0
16 | use_grad_checkpoint: False
17 | vit_precision: "fp16"
18 | freeze_vit: True
19 |
20 | # Q-Former
21 | num_query_token: 32
22 |
23 | # OPT
24 | opt_model: "facebook/opt-6.7b"
25 |
26 | # generation configs
27 | prompt: ""
28 |
29 |
30 | preprocess:
31 | vis_processor:
32 | train:
33 | name: "blip_image_train"
34 | image_size: 224
35 | eval:
36 | name: "blip_image_eval"
37 | image_size: 224
38 | text_processor:
39 | train:
40 | name: "blip_caption"
41 | eval:
42 | name: "blip_caption"
43 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: pretrain
8 | load_finetuned: False
9 |
10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_vitL.pth"
11 | finetuned: ""
12 |
13 | # vit encoder
14 | vit_model: "clip_L"
15 | image_size: 224
16 | drop_path_rate: 0
17 | use_grad_checkpoint: False
18 | vit_precision: "fp16"
19 | freeze_vit: True
20 |
21 | # Q-Former
22 | num_query_token: 32
23 |
24 |
25 | preprocess:
26 | vis_processor:
27 | train:
28 | name: "blip_image_train"
29 | image_size: 224
30 | eval:
31 | name: "blip_image_eval"
32 | image_size: 224
33 | text_processor:
34 | train:
35 | name: "blip_caption"
36 | eval:
37 | name: "blip_caption"
38 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/blip_caption_base_coco.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: blip_caption
8 | load_finetuned: True
9 |
10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth"
11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_coco_caption_base.pth"
12 |
13 | # vit encoder
14 | vit_type: "base"
15 | vit_grad_ckpt: False
16 | vit_ckpt_layer: 0
17 |
18 | image_size: 384
19 |
20 | # bert config
21 | med_config_path: "configs/models/med_config.json"
22 |
23 | # generation configs
24 | prompt: "a picture of "
25 |
26 |
27 | preprocess:
28 | vis_processor:
29 | train:
30 | name: "blip_image_train"
31 | eval:
32 | name: "blip_image_eval"
33 | text_processor:
34 | train:
35 | name: "blip_caption"
36 | prompt: "a picture of "
37 | eval:
38 | name: "blip_caption"
39 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/blip_caption_large_coco.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: blip_caption
8 | load_finetuned: True
9 |
10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large.pth"
11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption.pth"
12 |
13 | vit_type: "large"
14 | vit_grad_ckpt: True
15 | vit_ckpt_layer: 5
16 |
17 | image_size: 384
18 |
19 | # bert config
20 | med_config_path: "configs/models/med_large_config.json"
21 |
22 | # generation configs
23 | prompt: "a picture of "
24 |
25 |
26 | preprocess:
27 | vis_processor:
28 | train:
29 | name: "blip_image_train"
30 | eval:
31 | name: "blip_image_eval"
32 | text_processor:
33 | train:
34 | name: "blip_caption"
35 | prompt: "a picture of "
36 | eval:
37 | name: "blip_caption"
38 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/blip_classification_base.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: blip_classification
8 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth"
9 |
10 | use_distill: True
11 | momentum: 0.995
12 | alpha: 0.4
13 |
14 | # vit encoder
15 | vit_type: "base"
16 | vit_grad_ckpt: False
17 | vit_ckpt_layer: 0
18 |
19 | image_size: 384
20 |
21 | # bert config
22 | med_config_path: "configs/models/med_config.json"
23 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/blip_feature_extractor_base.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: blip_pretrain
8 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth"
9 |
10 | # vit encoder
11 | vit_type: "base"
12 | vit_grad_ckpt: False
13 | vit_ckpt_layer: 0
14 |
15 | image_size: 224
16 |
17 | # bert config
18 | med_config_path: "configs/models/med_config.json"
19 |
20 | embed_dim: 256
21 |
22 | preprocess:
23 | vis_processor:
24 | eval:
25 | name: "blip_image_eval"
26 | image_size: 224
27 | text_processor:
28 | eval:
29 | name: "blip_caption"
30 |
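A minimal sketch of how this feature-extractor config is typically used, assuming the upstream LAVIS `extract_features` API is preserved in the vendored copy.

# Sketch only: "example.jpg" and the caption are placeholders.
import torch
from PIL import Image
from lavis.models import load_model_and_preprocess

device = "cuda" if torch.cuda.is_available() else "cpu"
model, vis_processors, txt_processors = load_model_and_preprocess(
    name="blip_feature_extractor", model_type="base", is_eval=True, device=device
)
image = vis_processors["eval"](Image.open("example.jpg").convert("RGB")).unsqueeze(0).to(device)
sample = {"image": image, "text_input": [txt_processors["eval"]("a dog on the grass")]}
feats = model.extract_features(sample, mode="image")  # mode="text" or default multimodal also work
print(feats.image_embeds.shape)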
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/blip_itm_base.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: blip_image_text_matching
8 |
9 | load_finetuned: True
10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_retrieval_coco.pth"
11 |
12 | # vit encoder
13 | vit_type: "base"
14 | vit_grad_ckpt: False
15 | vit_ckpt_layer: 0
16 |
17 | image_size: 384
18 |
19 | # bert config
20 | med_config_path: "configs/models/med_config.json"
21 |
22 | embed_dim: 256
23 |
24 | preprocess:
25 | vis_processor:
26 | eval:
27 | name: "blip_image_eval"
28 | image_size: 384
29 | text_processor:
30 | eval:
31 | name: "blip_caption"
32 |
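A usage sketch for the image-text matching config above, assuming the upstream LAVIS ITM interface (forward with `match_head="itm"` returning 2-way logits).

# Sketch only: inputs are placeholders; index 1 of the softmax is the "match" probability.
import torch
from PIL import Image
from lavis.models import load_model_and_preprocess

device = "cuda" if torch.cuda.is_available() else "cpu"
model, vis_processors, txt_processors = load_model_and_preprocess(
    name="blip_image_text_matching", model_type="base", is_eval=True, device=device
)
image = vis_processors["eval"](Image.open("example.jpg").convert("RGB")).unsqueeze(0).to(device)
text = txt_processors["eval"]("a dog playing in the park")
itm_logits = model({"image": image, "text_input": text}, match_head="itm")
print(torch.softmax(itm_logits, dim=1)[:, 1])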
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/blip_itm_large.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: blip_image_text_matching
8 |
9 | load_finetuned: True
10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco.pth"
11 |
12 | # vit encoder
13 | vit_type: "large"
14 | vit_grad_ckpt: False
15 | vit_ckpt_layer: 0
16 |
17 | image_size: 384
18 |
19 | # bert config
20 | med_config_path: "configs/models/med_large_config.json"
21 |
22 | embed_dim: 256
23 |
24 | preprocess:
25 | vis_processor:
26 | eval:
27 | name: "blip_image_eval"
28 | image_size: 384
29 | text_processor:
30 | eval:
31 | name: "blip_caption"
32 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/blip_nlvr.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: blip_nlvr
8 | model_type: nlvr
9 | load_finetuned: True
10 |
11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_nlvr.pth"
12 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth"
13 |
14 | num_classes: 2
15 |
16 | # vit encoder
17 | vit_type: "base"
18 | vit_grad_ckpt: False
19 | vit_ckpt_layer: 0
20 | vit_layer_norm_epsilon: 1e-6
21 |
22 | image_size: 384
23 |
24 | # bert config
25 | med_config_path: "configs/models/med_config.json"
26 |
27 | preprocess:
28 | vis_processor:
29 | train:
30 | name: "blip_image_train"
31 | image_size: 384
32 | eval:
33 | name: "blip_image_eval"
34 | image_size: 384
35 | text_processor:
36 | train:
37 | name: "blip_caption"
38 | eval:
39 | name: "blip_caption"
40 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/blip_pretrain_base.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: blip_pretrain
8 |
9 | load_pretrained: True
10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth"
11 |
12 | # vit encoder
13 | vit_type: "base"
14 | vit_grad_ckpt: False
15 | vit_ckpt_layer: 0
16 |
17 | image_size: 224
18 | alpha: 0.4
19 |
20 | # bert config
21 | med_config_path: "configs/models/bert_config.json"
22 |
23 | embed_dim: 256
24 |
25 | # generation configs
26 | prompt: "a picture of "
27 |
28 | preprocess:
29 | vis_processor:
30 | train:
31 | name: "blip_image_train"
32 | image_size: 224
33 | text_processor:
34 | train:
35 | name: "blip_caption"
36 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/blip_pretrain_large.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: blip_pretrain
8 |
9 | # vit encoder
10 | vit_type: "large"
11 | vit_grad_ckpt: True
12 | vit_ckpt_layer: 5
13 |
14 | image_size: 224
15 |
16 | # bert config
17 | med_config_path: "configs/models/med_large_config.json"
18 |
19 | embed_dim: 256
20 |
21 | # generation configs
22 | prompt: "a picture of "
23 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/blip_retrieval_coco.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: blip_retrieval
8 | load_finetuned: True
9 |
10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_coco_retrieval.pth"
11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth"
12 |
13 | queue_size: 57600
14 |
15 | # vit encoder
16 | vit_type: "base"
17 | vit_grad_ckpt: True
18 | vit_ckpt_layer: 4
19 |
20 | image_size: 384
21 |
22 | # bert config
23 | med_config_path: "configs/models/med_config.json"
24 |
25 | embed_dim: 256
26 |
27 | preprocess:
28 | vis_processor:
29 | train:
30 | name: "blip_image_train"
31 | image_size: 384
32 | eval:
33 | name: "blip_image_eval"
34 | image_size: 384
35 | text_processor:
36 | train:
37 | name: "blip_caption"
38 | eval:
39 | name: "blip_caption"
40 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/blip_retrieval_flickr.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: blip_retrieval
8 | load_finetuned: True
9 |
10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_flickr_retrieval.pth"
11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth"
12 |
13 | queue_size: 57600
14 | alpha: 0.4
15 |
16 | negative_all_rank: False
17 |
18 | # vit encoder
19 | vit_type: "base"
20 | vit_grad_ckpt: True
21 | vit_ckpt_layer: 4
22 |
23 | image_size: 384
24 |
25 | # bert config
26 | med_config_path: "configs/models/med_config.json"
27 |
28 | embed_dim: 256
29 |
30 | preprocess:
31 | vis_processor:
32 | train:
33 | name: "blip_image_train"
34 | image_size: 384
35 | eval:
36 | name: "blip_image_eval"
37 | image_size: 384
38 | text_processor:
39 | train:
40 | name: "blip_caption"
41 | eval:
42 | name: "blip_caption"
43 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/blip_vqa_aokvqa.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: blip_vqa
8 | load_finetuned: True
9 |
10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_aokvqa.pth"
11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth"
12 |
13 | # vit encoder
14 | vit_type: "base"
15 | vit_grad_ckpt: False
16 | vit_ckpt_layer: 0
17 | vit_drop_path_rate: 0.1
18 |
19 | image_size: 480
20 |
21 | # bert config
22 | med_config_path: "configs/models/med_config.json"
23 |
24 | preprocess:
25 | vis_processor:
26 | train:
27 | name: "blip_image_train"
28 | image_size: 480
29 | eval:
30 | name: "blip_image_eval"
31 | image_size: 480
32 | text_processor:
33 | train:
34 | name: "blip_question"
35 | eval:
36 | name: "blip_question"
37 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/blip_vqa_okvqa.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: blip_vqa
8 | load_finetuned: True
9 |
10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_okvqa.pth"
11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth"
12 |
13 | # vit encoder
14 | vit_type: "base"
15 | vit_grad_ckpt: False
16 | vit_ckpt_layer: 0
17 | vit_drop_path_rate: 0.1
18 |
19 | image_size: 480
20 |
21 | # bert config
22 | med_config_path: "configs/models/med_config.json"
23 |
24 | preprocess:
25 | vis_processor:
26 | train:
27 | name: "blip_image_train"
28 | image_size: 480
29 | eval:
30 | name: "blip_image_eval"
31 | image_size: 480
32 | text_processor:
33 | train:
34 | name: "blip_question"
35 | eval:
36 | name: "blip_question"
37 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/blip_vqav2.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: blip_vqa
8 | load_finetuned: True
9 |
10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth"
11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth"
12 |
13 | # vit encoder
14 | vit_type: "base"
15 | vit_grad_ckpt: False
16 | vit_ckpt_layer: 0
17 | vit_drop_path_rate: 0.1
18 |
19 | image_size: 480
20 |
21 | # bert config
22 | med_config_path: "configs/models/med_config.json"
23 |
24 | preprocess:
25 | vis_processor:
26 | train:
27 | name: "blip_image_train"
28 | image_size: 480
29 | eval:
30 | name: "blip_image_eval"
31 | image_size: 480
32 | text_processor:
33 | train:
34 | name: "blip_question"
35 | eval:
36 | name: "blip_question"
37 |
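A brief sketch of running inference with this VQA config, assuming the upstream LAVIS `predict_answers` API; the image and question are placeholders.

# Sketch only: "generate" performs open-ended decoding; "rank" scores a candidate answer list.
import torch
from PIL import Image
from lavis.models import load_model_and_preprocess

device = "cuda" if torch.cuda.is_available() else "cpu"
model, vis_processors, txt_processors = load_model_and_preprocess(
    name="blip_vqa", model_type="vqav2", is_eval=True, device=device
)
image = vis_processors["eval"](Image.open("example.jpg").convert("RGB")).unsqueeze(0).to(device)
question = txt_processors["eval"]("what is the dog doing?")
answers = model.predict_answers(
    samples={"image": image, "text_input": question}, inference_method="generate"
)
print(answers)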
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/clip/RN101-quickgelu.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "quick_gelu": true,
4 | "vision_cfg": {
5 | "image_size": 224,
6 | "layers": [
7 | 3,
8 | 4,
9 | 23,
10 | 3
11 | ],
12 | "width": 64,
13 | "patch_size": null
14 | },
15 | "text_cfg": {
16 | "context_length": 77,
17 | "vocab_size": 49408,
18 | "width": 512,
19 | "heads": 8,
20 | "layers": 12
21 | }
22 | }
23 |
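The JSON files in this clip/ directory follow the open_clip config schema: a list-valued `vision_cfg.layers` denotes ResNet stage depths, while an integer denotes ViT depth. A small self-contained sketch for sanity-checking one of them:

# Sketch only: no repo dependency beyond the config file itself.
import json

with open("experiments/lavis/configs/models/clip/RN101-quickgelu.json") as f:
    cfg = json.load(f)

tower = "resnet" if isinstance(cfg["vision_cfg"]["layers"], list) else "vit"
print(tower, cfg["embed_dim"], cfg["text_cfg"]["context_length"])  # resnet 512 77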
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/clip/RN101.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": [
6 | 3,
7 | 4,
8 | 23,
9 | 3
10 | ],
11 | "width": 64,
12 | "patch_size": null
13 | },
14 | "text_cfg": {
15 | "context_length": 77,
16 | "vocab_size": 49408,
17 | "width": 512,
18 | "heads": 8,
19 | "layers": 12
20 | }
21 | }
22 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/clip/RN50-quickgelu.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "quick_gelu": true,
4 | "vision_cfg": {
5 | "image_size": 224,
6 | "layers": [
7 | 3,
8 | 4,
9 | 6,
10 | 3
11 | ],
12 | "width": 64,
13 | "patch_size": null
14 | },
15 | "text_cfg": {
16 | "context_length": 77,
17 | "vocab_size": 49408,
18 | "width": 512,
19 | "heads": 8,
20 | "layers": 12
21 | }
22 | }
23 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/clip/RN50.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": [
6 | 3,
7 | 4,
8 | 6,
9 | 3
10 | ],
11 | "width": 64,
12 | "patch_size": null
13 | },
14 | "text_cfg": {
15 | "context_length": 77,
16 | "vocab_size": 49408,
17 | "width": 512,
18 | "heads": 8,
19 | "layers": 12
20 | }
21 | }
22 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/clip/RN50x16.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "image_size": 384,
5 | "layers": [
6 | 6,
7 | 8,
8 | 18,
9 | 8
10 | ],
11 | "width": 96,
12 | "patch_size": null
13 | },
14 | "text_cfg": {
15 | "context_length": 77,
16 | "vocab_size": 49408,
17 | "width": 768,
18 | "heads": 12,
19 | "layers": 12
20 | }
21 | }
22 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/clip/RN50x4.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 640,
3 | "vision_cfg": {
4 | "image_size": 288,
5 | "layers": [
6 | 4,
7 | 6,
8 | 10,
9 | 6
10 | ],
11 | "width": 80,
12 | "patch_size": null
13 | },
14 | "text_cfg": {
15 | "context_length": 77,
16 | "vocab_size": 49408,
17 | "width": 640,
18 | "heads": 10,
19 | "layers": 12
20 | }
21 | }
22 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/clip/ViT-B-16-plus-240.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 640,
3 | "vision_cfg": {
4 | "image_size": 240,
5 | "layers": 12,
6 | "width": 896,
7 | "patch_size": 16
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 640,
13 | "heads": 10,
14 | "layers": 12
15 | }
16 | }
17 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/clip/ViT-B-16-plus.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 640,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 896,
7 | "patch_size": 16
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 640,
13 | "heads": 10,
14 | "layers": 12
15 | }
16 | }
17 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/clip/ViT-B-16.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 768,
7 | "patch_size": 16
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 512,
13 | "heads": 8,
14 | "layers": 12
15 | }
16 | }
17 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/clip/ViT-B-32-plus-256.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 640,
3 | "vision_cfg": {
4 | "image_size": 256,
5 | "layers": 12,
6 | "width": 896,
7 | "patch_size": 32
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 640,
13 | "heads": 10,
14 | "layers": 12
15 | }
16 | }
17 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/clip/ViT-B-32-quickgelu.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "quick_gelu": true,
4 | "vision_cfg": {
5 | "image_size": 224,
6 | "layers": 12,
7 | "width": 768,
8 | "patch_size": 32
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 512,
14 | "heads": 8,
15 | "layers": 12
16 | }
17 | }
18 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/clip/ViT-B-32.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 768,
7 | "patch_size": 32
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 512,
13 | "heads": 8,
14 | "layers": 12
15 | }
16 | }
17 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/clip/ViT-H-14.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 32,
6 | "width": 1280,
7 | "head_width": 80,
8 | "patch_size": 14
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 1024,
14 | "heads": 16,
15 | "layers": 24
16 | }
17 | }
18 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/clip/ViT-H-16.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 32,
6 | "width": 1280,
7 | "head_width": 80,
8 | "patch_size": 16
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 1024,
14 | "heads": 16,
15 | "layers": 24
16 | }
17 | }
18 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/clip/ViT-L-14-280.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "image_size": 280,
5 | "layers": 24,
6 | "width": 1024,
7 | "patch_size": 14
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 768,
13 | "heads": 12,
14 | "layers": 12
15 | }
16 | }
17 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/clip/ViT-L-14-336.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "image_size": 336,
5 | "layers": 24,
6 | "width": 1024,
7 | "patch_size": 14
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 768,
13 | "heads": 12,
14 | "layers": 12
15 | }
16 | }
17 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/clip/ViT-L-14.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 24,
6 | "width": 1024,
7 | "patch_size": 14
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 768,
13 | "heads": 12,
14 | "layers": 12
15 | }
16 | }
17 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/clip/ViT-L-16-320.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "image_size": 320,
5 | "layers": 24,
6 | "width": 1024,
7 | "patch_size": 16
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 768,
13 | "heads": 12,
14 | "layers": 12
15 | }
16 | }
17 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/clip/ViT-L-16.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 24,
6 | "width": 1024,
7 | "patch_size": 16
8 | },
9 | "text_cfg": {
10 | "context_length": 77,
11 | "vocab_size": 49408,
12 | "width": 768,
13 | "heads": 12,
14 | "layers": 12
15 | }
16 | }
17 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/clip/ViT-g-14.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 40,
6 | "width": 1408,
7 | "head_width": 88,
8 | "mlp_ratio": 4.3637,
9 | "patch_size": 14
10 | },
11 | "text_cfg": {
12 | "context_length": 77,
13 | "vocab_size": 49408,
14 | "width": 1024,
15 | "heads": 16,
16 | "layers": 24
17 | }
18 | }
19 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/clip/timm-efficientnetv2_rw_s.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "timm_model_name": "efficientnetv2_rw_s",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "abs_attn",
7 | "timm_proj": "",
8 | "image_size": 288
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 768,
14 | "heads": 8,
15 | "layers": 12
16 | }
17 | }
18 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/clip/timm-resnet50d.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "timm_model_name": "resnet50d",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "abs_attn",
7 | "timm_proj": "",
8 | "image_size": 224
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 512,
14 | "heads": 8,
15 | "layers": 12
16 | }
17 | }
18 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/clip/timm-resnetaa50d.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "timm_model_name": "resnetaa50d",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "abs_attn",
7 | "timm_proj": "",
8 | "image_size": 224
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 512,
14 | "heads": 8,
15 | "layers": 12
16 | }
17 | }
18 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/clip/timm-resnetblur50.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "timm_model_name": "resnetblur50",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "abs_attn",
7 | "timm_proj": "",
8 | "image_size": 224
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 512,
14 | "heads": 8,
15 | "layers": 12
16 | }
17 | }
18 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/clip/timm-swin_base_patch4_window7_224.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "timm_model_name": "swin_base_patch4_window7_224",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "",
7 | "timm_proj": "linear",
8 | "image_size": 224
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 512,
14 | "heads": 8,
15 | "layers": 12
16 | }
17 | }
18 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/clip/timm-vit_base_patch16_224.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "timm_model_name": "vit_base_patch16_224",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "",
7 | "timm_proj": "linear",
8 | "image_size": 224
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 512,
14 | "heads": 8,
15 | "layers": 12
16 | }
17 | }
18 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/clip/timm-vit_base_patch32_224.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "timm_model_name": "vit_base_patch32_224",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "",
7 | "timm_proj": "linear",
8 | "image_size": 224
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 512,
14 | "heads": 8,
15 | "layers": 12
16 | }
17 | }
18 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/clip/timm-vit_small_patch16_224.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "timm_model_name": "vit_small_patch16_224",
5 | "timm_model_pretrained": false,
6 | "timm_pool": "",
7 | "timm_proj": "linear",
8 | "image_size": 224
9 | },
10 | "text_cfg": {
11 | "context_length": 77,
12 | "vocab_size": 49408,
13 | "width": 512,
14 | "heads": 8,
15 | "layers": 12
16 | }
17 | }
18 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/clip_resnet50.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: clip
8 |
9 | model_type: RN50
10 |
11 | pretrained: openai
12 |
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/clip_vit_base16.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: clip
8 |
9 | model_type: ViT-B-16
10 |
11 | pretrained: openai
12 |
13 | preprocess:
14 | vis_processor:
15 | eval:
16 | name: "clip_image_eval"
17 | image_size: 224
18 |
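A sketch of using this config for zero-shot feature extraction, assuming the upstream LAVIS registry exposes the CLIP wrapper as `clip_feature_extractor` with `model_type="ViT-B-16"` and returns projected embeddings as in the upstream tutorials.

# Sketch only: the image path and prompts are placeholders.
import torch
from PIL import Image
from lavis.models import load_model_and_preprocess

device = "cuda" if torch.cuda.is_available() else "cpu"
model, vis_processors, _ = load_model_and_preprocess(
    name="clip_feature_extractor", model_type="ViT-B-16", is_eval=True, device=device
)
image = vis_processors["eval"](Image.open("example.jpg").convert("RGB")).unsqueeze(0).to(device)
sample = {"image": image, "text_input": ["a dog", "a cat"]}
feats = model.extract_features(sample)
print(feats.image_embeds_proj.shape, feats.text_embeds_proj.shape)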
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/gpt_dialogue_base.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: gpt_dialogue
8 | # pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth"
9 | # pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth"
10 |
11 | len_tokenizer: 50264 # 50257 tokens from gpt2 default tokenizer + additional special tokens
12 |
13 | len_video_ft: 4224 # i3d_rgb: 2048 i3d_flow: 2048 vggish: 128
14 |
15 | preprocess:
16 | vis_processor:
17 | train:
18 | name: "gpt_video_ft"
19 | eval:
20 | name: "gpt_video_ft"
21 | text_processor:
22 | train:
23 | name: "gpt_dialogue"
24 | eval:
25 | name: "gpt_dialogue"
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/med_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "architectures": [
3 | "BertModel"
4 | ],
5 | "attention_probs_dropout_prob": 0.1,
6 | "hidden_act": "gelu",
7 | "hidden_dropout_prob": 0.1,
8 | "hidden_size": 768,
9 | "initializer_range": 0.02,
10 | "intermediate_size": 3072,
11 | "layer_norm_eps": 1e-12,
12 | "max_position_embeddings": 512,
13 | "model_type": "bert",
14 | "num_attention_heads": 12,
15 | "num_hidden_layers": 12,
16 | "pad_token_id": 0,
17 | "add_type_embeddings": false,
18 | "vocab_size": 30524,
19 | "encoder_width": 768,
20 | "add_cross_attention": true
21 | }
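A small sketch of how a MED config like this is typically consumed: BLIP-style code reads the JSON into a transformers `BertConfig`, and the BLIP-specific keys (`encoder_width`, `add_type_embeddings`, ...) simply become extra attributes. The exact loading path inside this repo may differ.

# Sketch only: assumes Hugging Face transformers is installed.
from transformers import BertConfig

med_cfg = BertConfig.from_json_file("experiments/lavis/configs/models/med_config.json")
print(med_cfg.hidden_size, med_cfg.encoder_width, med_cfg.add_cross_attention)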
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/med_config_albef.json:
--------------------------------------------------------------------------------
1 | {
2 | "architectures": [
3 | "BertModel"
4 | ],
5 | "attention_probs_dropout_prob": 0.1,
6 | "hidden_act": "gelu",
7 | "hidden_dropout_prob": 0.1,
8 | "hidden_size": 768,
9 | "initializer_range": 0.02,
10 | "intermediate_size": 3072,
11 | "layer_norm_eps": 1e-12,
12 | "max_position_embeddings": 512,
13 | "model_type": "bert",
14 | "num_attention_heads": 12,
15 | "num_hidden_layers": 12,
16 | "pad_token_id": 0,
17 | "add_type_embeddings": false,
18 | "vocab_size": 30522,
19 | "encoder_width": 768,
20 | "add_cross_attention": true,
21 | "fusion_layer": 6
22 | }
--------------------------------------------------------------------------------
/experiments/lavis/configs/models/med_large_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "architectures": [
3 | "BertModel"
4 | ],
5 | "attention_probs_dropout_prob": 0.1,
6 | "hidden_act": "gelu",
7 | "hidden_dropout_prob": 0.1,
8 | "hidden_size": 768,
9 | "initializer_range": 0.02,
10 | "intermediate_size": 3072,
11 | "layer_norm_eps": 1e-12,
12 | "max_position_embeddings": 512,
13 | "model_type": "bert",
14 | "num_attention_heads": 12,
15 | "num_hidden_layers": 12,
16 | "pad_token_id": 0,
17 | "add_type_embeddings": false,
18 | "vocab_size": 30524,
19 | "encoder_width": 1024,
20 | "add_cross_attention": true
21 | }
--------------------------------------------------------------------------------
/experiments/lavis/datasets/builders/classification_builder.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | from lavis.common.registry import registry
9 | from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder
10 | from lavis.datasets.datasets.nlvr_datasets import NLVRDataset, NLVREvalDataset
11 | from lavis.datasets.datasets.snli_ve_datasets import SNLIVisualEntialmentDataset
12 |
13 |
14 | @registry.register_builder("nlvr")
15 | class NLVRBuilder(BaseDatasetBuilder):
16 | train_dataset_cls = NLVRDataset
17 | eval_dataset_cls = NLVREvalDataset
18 |
19 | DATASET_CONFIG_DICT = {"default": "configs/datasets/nlvr/defaults.yaml"}
20 |
21 |
22 | @registry.register_builder("snli_ve")
23 | class SNLIVisualEntailmentBuilder(BaseDatasetBuilder):
24 | train_dataset_cls = SNLIVisualEntialmentDataset
25 | eval_dataset_cls = SNLIVisualEntialmentDataset
26 |
27 | DATASET_CONFIG_DICT = {"default": "configs/datasets/snli_ve/defaults.yaml"}
28 |
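A minimal sketch of how a registered builder such as NLVRBuilder is normally exercised, assuming the vendored package keeps the upstream `lavis.datasets.builders.load_dataset` helper and that the dataset files referenced by `configs/datasets/nlvr/defaults.yaml` have been downloaded.

# Sketch only: returns a dict of split datasets built by the registered builder.
from lavis.datasets.builders import load_dataset

nlvr = load_dataset("nlvr")
print(nlvr.keys())            # e.g. dict_keys(['train', 'val', 'test'])
print(nlvr["train"][0].keys())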
--------------------------------------------------------------------------------
/experiments/lavis/datasets/builders/dialogue_builder.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | from lavis.common.registry import registry
9 | from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder
10 | from lavis.datasets.datasets.avsd_dialogue_datasets import (
11 | AVSDDialDataset,
12 | AVSDDialEvalDataset,
13 | )
14 |
15 |
16 | @registry.register_builder("avsd_dialogue")
17 | class AVSDDialBuilder(BaseDatasetBuilder):
18 | train_dataset_cls = AVSDDialDataset
19 | eval_dataset_cls = AVSDDialEvalDataset
20 |
21 | DATASET_CONFIG_DICT = {"default": "configs/datasets/avsd/defaults_dial.yaml"}
22 |
--------------------------------------------------------------------------------
/experiments/lavis/datasets/datasets/multimodal_classification_datasets.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | from abc import abstractmethod
9 | from lavis.datasets.datasets.base_dataset import BaseDataset
10 |
11 |
12 | class MultimodalClassificationDataset(BaseDataset):
13 | def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
14 | super().__init__(vis_processor, text_processor, vis_root, ann_paths)
15 |
16 | self.class_labels = None
17 |
18 | @abstractmethod
19 | def _build_class_labels(self):
20 | pass
21 |
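A hypothetical subclass sketch (names are illustrative, not from the repo): the only contract added by this abstract base is that `_build_class_labels` populates `self.class_labels` before classification-style evaluation uses it.

# Sketch only: a binary yes/no dataset illustrating the abstract hook.
from lavis.datasets.datasets.multimodal_classification_datasets import (
    MultimodalClassificationDataset,
)

class YesNoClassificationDataset(MultimodalClassificationDataset):
    def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
        super().__init__(vis_processor, text_processor, vis_root, ann_paths)
        self._build_class_labels()

    def _build_class_labels(self):
        # map annotation label strings to integer class ids
        self.class_labels = {"no": 0, "yes": 1}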
--------------------------------------------------------------------------------
/experiments/lavis/datasets/datasets/vg_vqa_datasets.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | import os
9 |
10 | from PIL import Image
11 |
12 | from lavis.datasets.datasets.vqa_datasets import VQADataset
13 |
14 |
15 | class VGVQADataset(VQADataset):
16 | def __init__(self, vis_processor, text_processor, vis_root, ann_paths):
17 | super().__init__(vis_processor, text_processor, vis_root, ann_paths)
18 |
19 | def __getitem__(self, index):
20 | ann = self.annotation[index]
21 |
22 | image_path = os.path.join(self.vis_root, ann["image"])
23 | image = Image.open(image_path).convert("RGB")
24 |
25 | image = self.vis_processor(image)
26 | question = self.text_processor(ann["question"])
27 |
28 | answers = [ann["answer"]]
29 | # TODO this should be configured better
30 | weights = [0.2]
31 |
32 | return {
33 | "image": image,
34 | "text_input": question,
35 | "answers": answers,
36 | "weights": weights,
37 | }
38 |
--------------------------------------------------------------------------------
/experiments/lavis/models/blip2_models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/experiments/lavis/models/blip2_models/__init__.py
--------------------------------------------------------------------------------
/experiments/lavis/models/blip_diffusion_models/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/experiments/lavis/models/blip_diffusion_models/__init__.py
--------------------------------------------------------------------------------
/experiments/lavis/models/clip_models/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 |
7 | Based on https://github.com/mlfoundations/open_clip
8 | """
9 |
10 | """ OpenAI pretrained model functions
11 | Adapted from https://github.com/mlfoundations/open_clip and https://github.com/openai/CLIP.
12 |
13 | Originally MIT License, Copyright (c) 2021 OpenAI.
14 | """
15 |
--------------------------------------------------------------------------------
/experiments/lavis/models/clip_models/bpe_simple_vocab_16e6.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/experiments/lavis/models/clip_models/bpe_simple_vocab_16e6.txt.gz
--------------------------------------------------------------------------------
/experiments/lavis/models/clip_models/pics/CLIP.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/experiments/lavis/models/clip_models/pics/CLIP.png
--------------------------------------------------------------------------------
/experiments/lavis/models/img2prompt_models/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | import torch
9 |
10 |
11 |
12 |
--------------------------------------------------------------------------------
/experiments/lavis/models/timesformer/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 |
7 | Based on https://github.com/facebookresearch/TimeSformer
8 | """
9 |
--------------------------------------------------------------------------------
/experiments/lavis/models/timesformer/linear.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | """ Linear layer (alternate definition)
9 | """
10 | import torch
11 | import torch.nn.functional as F
12 | from torch import nn as nn
13 |
14 |
15 | class Linear(nn.Linear):
16 | def forward(self, input: torch.Tensor) -> torch.Tensor:
17 | if torch.jit.is_scripting():
18 | bias = self.bias.to(dtype=input.dtype) if self.bias is not None else None
19 | return F.linear(input, self.weight.to(dtype=input.dtype), bias=bias)
20 | else:
21 | return F.linear(input, self.weight, self.bias)
22 |
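The override above is the usual TorchScript-plus-AMP workaround (under scripting, autocast is unavailable, so weights may stay fp32 while inputs arrive as fp16); in eager mode it behaves exactly like `nn.Linear`. A quick sketch, assuming the vendored package is importable as `lavis`:

# Sketch only: eager-mode behaviour is identical to torch.nn.Linear.
import torch
from lavis.models.timesformer.linear import Linear

layer = Linear(16, 4)
x = torch.randn(2, 16)
print(layer(x).shape)  # torch.Size([2, 4])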
--------------------------------------------------------------------------------
/experiments/lavis/processors/base_processor.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | from omegaconf import OmegaConf
9 |
10 |
11 | class BaseProcessor:
12 | def __init__(self):
13 | self.transform = lambda x: x
14 | return
15 |
16 | def __call__(self, item):
17 | return self.transform(item)
18 |
19 | @classmethod
20 | def from_config(cls, cfg=None):
21 | return cls()
22 |
23 | def build(self, **kwargs):
24 | cfg = OmegaConf.create(kwargs)
25 |
26 | return self.from_config(cfg)
27 |
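A minimal sketch (not from the repo) of extending BaseProcessor: a toy processor that lowercases and truncates captions, built through the same `from_config` hook. Concrete processors in lavis.processors additionally register themselves via `registry.register_processor` so the `name:` fields in the yaml configs above can resolve them.

# Sketch only: LowercaseCaptionProcessor is a hypothetical example class.
from omegaconf import OmegaConf
from lavis.processors.base_processor import BaseProcessor

class LowercaseCaptionProcessor(BaseProcessor):
    def __init__(self, max_words=50):
        self.max_words = max_words
        # __call__ is inherited and simply applies self.transform
        self.transform = lambda s: " ".join(str(s).lower().split()[: self.max_words])

    @classmethod
    def from_config(cls, cfg=None):
        cfg = cfg if cfg is not None else OmegaConf.create()
        return cls(max_words=cfg.get("max_words", 50))

proc = LowercaseCaptionProcessor.from_config(OmegaConf.create({"max_words": 8}))
print(proc("A Picture Of A Very Long Caption About A Dog In The Park"))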
--------------------------------------------------------------------------------
/experiments/lavis/projects/albef/eval/nlvr_eval.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: albef_nlvr
8 | model_type: nlvr
9 |
10 | datasets:
11 | nlvr: # name of the dataset builder
12 | vis_processor:
13 | eval:
14 | name: "blip_image_eval"
15 | image_size: 384
16 | text_processor:
17 | eval:
18 | name: "blip_caption"
19 |
20 | run:
21 | task: multimodal_classification
22 |
23 | batch_size_train: 16
24 | batch_size_eval: 64
25 | num_workers: 4
26 |
27 | seed: 42
28 | output_dir: "output/ALBEF/NLVR"
29 |
30 | evaluate: True
31 | test_splits: ["val", "test"]
32 |
33 | # distribution-specific
34 | device: "cuda"
35 | world_size: 1
36 | dist_url: "env://"
37 | distributed: True
38 |
--------------------------------------------------------------------------------
/experiments/lavis/projects/albef/eval/ret_coco_eval.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: albef_retrieval
8 | model_type: coco
9 |
10 | datasets:
11 | coco_retrieval: # name of the dataset builder
12 | vis_processor:
13 | eval:
14 | name: "blip_image_eval"
15 | image_size: 384
16 | text_processor:
17 | eval:
18 | name: "blip_caption"
19 |
20 | run:
21 | task: retrieval
22 |
23 | # dataloading
24 | num_workers: 4
25 | batch_size_train: 32
26 | batch_size_eval: 64
27 |
28 | test_splits: ["test"]
29 |
30 | # distribution
31 | device: "cuda"
32 | world_size: 1
33 | dist_url: "env://"
34 | distributed: True
35 | use_dist_eval_sampler: False
36 |
37 | # model specific
38 | k_test: 128
39 |
40 | # misc
41 | seed: 42
42 | output_dir: "output/ALBEF/Retrieval_COCO"
43 |
44 | evaluate: True
45 |
--------------------------------------------------------------------------------
/experiments/lavis/projects/albef/eval/ret_flickr30k_eval.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: albef_retrieval
8 | model_type: flickr
9 |
10 | datasets:
11 | flickr30k: # name of the dataset builder
12 | vis_processor:
13 | eval:
14 | name: "blip_image_eval"
15 | image_size: 384
16 | text_processor:
17 | eval:
18 | name: "blip_caption"
19 |
20 | run:
21 | task: retrieval
22 |
23 | # dataloading
24 | num_workers: 4
25 | batch_size_train: 32
26 | batch_size_eval: 64
27 |
28 | test_splits: ["test"]
29 |
30 | # distribution
31 | device: "cuda"
32 | world_size: 1
33 | dist_url: "env://"
34 | distributed: True
35 | use_dist_eval_sampler: False
36 |
37 | # model specific
38 | k_test: 128
39 |
40 | # misc
41 | seed: 42
42 | output_dir: "output/ALBEF/Retrieval_Flickr30k"
43 |
44 | evaluate: True
45 |
--------------------------------------------------------------------------------
/experiments/lavis/projects/albef/eval/snli_ve_eval.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: albef_classification
8 | model_type: ve
9 |
10 | datasets:
11 | snli_ve: # name of the dataset builder
12 | vis_processor:
13 | eval:
14 | name: "blip_image_eval"
15 | text_processor:
16 | eval:
17 | name: "blip_caption"
18 |
19 | run:
20 | task: multimodal_classification
21 | # optimization-specific
22 | batch_size_train: 32
23 | batch_size_eval: 64
24 | num_workers: 4
25 |
26 | seed: 42
27 | output_dir: "output/ALBEF/SNLI_VE"
28 |
29 | evaluate: True
30 | test_splits: ["val", "test"]
31 |
32 | # distribution-specific
33 | device: "cuda"
34 | world_size: 1
35 | dist_url: "env://"
36 | distributed: True
37 |
--------------------------------------------------------------------------------
/experiments/lavis/projects/albef/eval/vqa_test.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: albef_vqa
8 | model_type: vqav2
9 |
10 | image_size: 384
11 |
12 |
13 | datasets:
14 | coco_vqa: # name of the dataset builder
15 | vis_processor:
16 | eval:
17 | name: "blip_image_eval"
18 | image_size: 384
19 | text_processor:
20 | eval:
21 | name: "blip_question"
22 |
23 | run:
24 | task: vqa
25 |
26 | # optimization-specific
27 | batch_size_train: 16
28 | batch_size_eval: 64
29 | num_workers: 4
30 |
31 | # inference-specific
32 | max_len: 10
33 | min_len: 1
34 | num_beams: 3
35 | num_ans_candidates: 128
36 | inference_method: "rank"
37 |
38 | seed: 42
39 | output_dir: "output/ALBEF/VQA"
40 |
41 | evaluate: True
42 | train_splits: ["train"]
43 | test_splits: ["test"]
44 |
45 | # distribution-specific
46 | device: "cuda"
47 | world_size: 1
48 | dist_url: "env://"
49 | distributed: True
50 |
--------------------------------------------------------------------------------
/experiments/lavis/projects/albef/eval/vqa_val.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: albef_vqa
8 | model_type: vqav2
9 |
10 | image_size: 384
11 |
12 | datasets:
13 | coco_vqa: # name of the dataset builder
14 | type: eval
15 | vis_processor:
16 | eval:
17 | name: "blip_image_eval"
18 | image_size: 384
19 | text_processor:
20 | eval:
21 | name: "blip_question"
22 |
23 | run:
24 | task: vqa
25 |
26 | # optimization-specific
27 | batch_size_train: 16
28 | batch_size_eval: 64
29 | num_workers: 4
30 |
31 | # inference-specific
32 | max_len: 10
33 | min_len: 1
34 | num_beams: 3
35 | num_ans_candidates: 128
36 | inference_method: "rank"
37 |
38 | seed: 42
39 | output_dir: "output/ALBEF/VQA"
40 |
41 | evaluate: True
42 | test_splits: ["val"]
43 |
44 | # distribution-specific
45 | device: "cuda"
46 | world_size: 1
47 | dist_url: "env://"
48 | distributed: True
49 |
--------------------------------------------------------------------------------
/experiments/lavis/projects/alpro/eval/didemo_ret_eval.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: alpro_retrieval
8 | model_type: didemo
9 |
10 | max_txt_len: 50
11 |
12 | timesformer:
13 | n_frms: 8
14 | image_size: 224
15 |
16 |
17 | datasets:
18 | didemo_retrieval: # name of the dataset builder
19 | vis_processor:
20 | eval:
21 | name: "alpro_video_eval"
22 | n_frms: 8
23 | image_size: 224
24 | text_processor:
25 | eval:
26 | name: "blip_caption"
27 |
28 | run:
29 | task: retrieval
30 | # optimization-specific
31 | batch_size_train: 8
32 | batch_size_eval: 64
33 | num_workers: 4
34 |
35 | # k_test: 256
36 | k_test: 1000
37 |
38 | seed: 42
39 | output_dir: "output/ALPRO/didemo_retrieval"
40 |
41 | evaluate: True
42 | train_splits: ["train"]
43 | valid_splits: ["val", "test"]
44 | test_splits: ["test"]
45 |
46 | # distribution-specific
47 | device: "cuda"
48 | world_size: 1
49 | dist_url: "env://"
50 | distributed: True
51 | use_dist_eval_sampler: False
52 |
--------------------------------------------------------------------------------
/experiments/lavis/projects/alpro/eval/msrvtt_qa_eval.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: alpro_qa
8 | model_type: msrvtt
9 |
10 | datasets:
11 | msrvtt_qa: # name of the dataset builder
12 | vis_processor:
13 | eval:
14 | name: "alpro_video_eval"
15 | n_frms: 16
16 | image_size: 224
17 | text_processor:
18 | eval:
19 | name: "blip_caption"
20 |
21 | run:
22 | task: multimodal_classification
23 | # optimization-specific
24 | batch_size_train: 32
25 | batch_size_eval: 64
26 | num_workers: 4
27 |
28 | seed: 42
29 | output_dir: "output/ALPRO/msrvtt_qa"
30 |
31 | evaluate: True
32 | valid_splits: ["val"]
33 | test_splits: ["test"]
34 |
35 | # distribution-specific
36 | device: "cuda"
37 | world_size: 1
38 | dist_url: "env://"
39 | distributed: True
40 |
--------------------------------------------------------------------------------
/experiments/lavis/projects/alpro/eval/msrvtt_ret_eval.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: alpro_retrieval
8 | model_type: msrvtt
9 |
10 | datasets:
11 | msrvtt_retrieval: # name of the dataset builder
12 | vis_processor:
13 | eval:
14 | name: "alpro_video_eval"
15 | n_frms: 8
16 | image_size: 224
17 | text_processor:
18 | eval:
19 | name: "blip_caption"
20 |
21 | run:
22 | task: retrieval
23 | # optimization-specific
24 | batch_size_train: 24
25 | batch_size_eval: 64
26 | num_workers: 4
27 |
28 | # k_test: 256
29 | k_test: 1000
30 |
31 | seed: 42
32 | output_dir: "output/ALPRO/msrvtt_retrieval"
33 |
34 | evaluate: True
35 | test_splits: ["test"]
36 |
37 | # distribution-specific
38 | device: "cuda"
39 | world_size: 1
40 | dist_url: "env://"
41 | distributed: True
42 | use_dist_eval_sampler: False
43 |
--------------------------------------------------------------------------------
/experiments/lavis/projects/alpro/eval/msvd_qa_eval.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: alpro_qa
8 | model_type: msvd
9 |
10 | datasets:
11 | msvd_qa: # name of the dataset builder
12 | vis_processor:
13 | eval:
14 | name: "alpro_video_eval"
15 | n_frms: 16
16 | image_size: 224
17 | text_processor:
18 | train:
19 | name: "blip_caption"
20 | eval:
21 | name: "blip_caption"
22 |
23 | run:
24 | task: multimodal_classification
25 | # optimization-specific
26 | batch_size_train: 24
27 | batch_size_eval: 64
28 | num_workers: 4
29 |
30 | seed: 42
31 | output_dir: "output/ALPRO/msvd_qa"
32 |
33 | evaluate: True
34 | test_splits: ["test"]
35 |
36 | # distribution-specific
37 | device: "cuda"
38 | world_size: 1
39 | dist_url: "env://"
40 | distributed: True
41 |
--------------------------------------------------------------------------------
/experiments/lavis/projects/blip/eval/aokvqa_eval.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: blip_vqa
8 | model_type: aokvqa
9 | image_size: 480
10 |
11 | datasets:
12 | aok_vqa: # name of the dataset builder
13 | vis_processor:
14 | eval:
15 | name: "blip_image_eval"
16 | image_size: 480
17 | text_processor:
18 | eval:
19 | name: "blip_question"
20 |
21 | run:
22 | task: aok_vqa
23 | # optimization-specific
24 | batch_size_train: 64
25 | batch_size_eval: 64
26 | num_workers: 4
27 |
28 | # inference-specific
29 | max_len: 10
30 | min_len: 1
31 | num_beams: 3
32 | num_ans_candidates: 128
33 | inference_method: "rank"
34 |
35 | seed: 42
36 | output_dir: "output/BLIP/AOKVQA"
37 |
38 | evaluate: True
39 | test_splits: ["val", "test"]
40 |
41 | # distribution-specific
42 | device: "cuda"
43 | world_size: 1
44 | dist_url: "env://"
45 | distributed: True
46 |
--------------------------------------------------------------------------------
/experiments/lavis/projects/blip/eval/caption_coco_eval.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: blip_caption
8 | model_type: base_coco
9 |
10 | datasets:
11 | coco_caption: # name of the dataset builder
12 | vis_processor:
13 | eval:
14 | name: "blip_image_eval"
15 | text_processor:
16 | eval:
17 | name: "blip_caption"
18 |
19 | run:
20 | # task: retrieval
21 | task: captioning
22 | # optimizer
23 | batch_size_train: 32
24 | batch_size_eval: 64
25 | num_workers: 4
26 |
27 | max_len: 20
28 | min_len: 5
29 | num_beams: 3
30 |
31 | seed: 42
32 | output_dir: "output/BLIP/Caption_coco"
33 |
34 | evaluate: True
35 | test_splits: ["test"]
36 |
37 | device: "cuda"
38 | world_size: 1
39 | dist_url: "env://"
40 | distributed: True
41 |
--------------------------------------------------------------------------------
/experiments/lavis/projects/blip/eval/caption_coco_eval_large.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: blip_caption
8 | model_type: large_coco
9 |
10 | datasets:
11 | coco_caption: # name of the dataset builder
12 | vis_processor:
13 | eval:
14 | name: "blip_image_eval"
15 | text_processor:
16 | eval:
17 | name: "blip_caption"
18 |
19 | run:
20 | # task: retrieval
21 | task: captioning
22 | # optimizer
23 | batch_size_train: 32
24 | batch_size_eval: 64
25 | num_workers: 4
26 |
27 | max_len: 20
28 | min_len: 5
29 | num_beams: 3
30 |
31 | seed: 42
32 | output_dir: "output/BLIP/Caption_coco"
33 |
34 | evaluate: True
35 | test_splits: ["test"]
36 |
37 | device: "cuda"
38 | world_size: 1
39 | dist_url: "env://"
40 | distributed: True
41 |
--------------------------------------------------------------------------------
/experiments/lavis/projects/blip/eval/nlvr_eval.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: blip_nlvr
8 | model_type: nlvr
9 |
10 | datasets:
11 | nlvr: # name of the dataset builder
12 | vis_processor:
13 | eval:
14 | name: "blip_image_eval"
15 | image_size: 384
16 | text_processor:
17 | eval:
18 | name: "blip_caption"
19 |
20 | run:
21 | task: multimodal_classification
22 |
23 | batch_size_train: 16
24 | batch_size_eval: 64
25 | num_workers: 4
26 |
27 | seed: 42
28 | output_dir: "output/BLIP/NLVR"
29 |
30 | evaluate: True
31 | test_splits: ["val", "test"]
32 |
33 | # distribution-specific
34 | device: "cuda"
35 | world_size: 1
36 | dist_url: "env://"
37 | distributed: True
38 |
--------------------------------------------------------------------------------
/experiments/lavis/projects/blip/eval/nocaps_eval.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: blip_caption
8 | model_type: base_coco
9 | # pretrained: 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth'
10 |
11 | datasets:
12 | nocaps: # name of the dataset builder
13 | vis_processor:
14 | eval:
15 | name: "blip_image_eval"
16 | image_size: 384
17 | text_processor:
18 | eval:
19 | name: "blip_caption"
20 | prompt: "a picture of "
21 |
22 | run:
23 | # task: retrieval
24 | task: captioning
25 | # optimizer
26 | batch_size_train: 32
27 | batch_size_eval: 64
28 | num_workers: 4
29 |
30 | max_len: 20
31 | min_len: 5
32 | num_beams: 3
33 |
34 | seed: 42
35 | output_dir: "output/BLIP/NoCaps"
36 |
37 | evaluate: True
38 | test_splits: ["val", "test"]
39 |
40 | device: "cuda"
41 | world_size: 1
42 | dist_url: "env://"
43 | distributed: True
44 |
45 | report_metric: False
46 |
--------------------------------------------------------------------------------
/experiments/lavis/projects/blip/eval/okvqa_eval.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: blip_vqa
8 | model_type: okvqa
9 | image_size: 480
10 |
11 | datasets:
12 | ok_vqa: # name of the dataset builder
13 | vis_processor:
14 | eval:
15 | name: "blip_image_eval"
16 | image_size: 480
17 | text_processor:
18 | eval:
19 | name: "blip_question"
20 |
21 | run:
22 | task: vqa
23 | # optimization-specific
24 | batch_size_train: 16
25 | batch_size_eval: 16
26 | num_workers: 4
27 |
28 | # inference-specific
29 | max_len: 10
30 | min_len: 1
31 | num_beams: 3
32 | num_ans_candidates: 128
33 | inference_method: "rank"
34 |
35 | seed: 42
36 | output_dir: "output/BLIP/OKVQA"
37 |
38 | evaluate: True
39 | test_splits: ["test"]
40 |
41 | # distribution-specific
42 | device: "cuda"
43 | world_size: 1
44 | dist_url: "env://"
45 | distributed: True
46 |
--------------------------------------------------------------------------------
/experiments/lavis/projects/blip/eval/ret_coco_eval.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: blip_retrieval
8 | model_type: coco
9 |
10 | datasets:
11 | coco_retrieval: # name of the dataset builder
12 | vis_processor:
13 | train:
14 | name: "blip_image_train"
15 | image_size: 384
16 | eval:
17 | name: "blip_image_eval"
18 | image_size: 384
19 | text_processor:
20 | train:
21 | name: "blip_caption"
22 | eval:
23 | name: "blip_caption"
24 |
25 | run:
26 | task: retrieval
27 |
28 | # dataloading
29 | num_workers: 4
30 | batch_size_train: 32
31 | batch_size_eval: 128
32 |
33 | train_splits: ["train"]
34 | valid_splits: ["val"]
35 | test_splits: ["test"]
36 |
37 | # distribution
38 | device: "cuda"
39 | world_size: 1
40 | dist_url: "env://"
41 | distributed: True
42 | use_dist_eval_sampler: False
43 |
44 | # model specific
45 | k_test: 256
46 |
47 | # misc
48 | seed: 42
49 | output_dir: "output/BLIP/Retrieval_COCO"
50 |
51 | evaluate: True
52 |
--------------------------------------------------------------------------------
/experiments/lavis/projects/blip/eval/ret_flickr_eval.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: blip_retrieval
8 | model_type: flickr
9 |
10 | datasets:
11 | flickr30k: # name of the dataset builder
12 | vis_processor:
13 | eval:
14 | name: "blip_image_eval"
15 | image_size: 384
16 | text_processor:
17 | eval:
18 | name: "blip_caption"
19 |
20 | run:
21 | task: retrieval
22 |
23 | # dataloading
24 | num_workers: 4
25 | batch_size_train: 32
26 | batch_size_eval: 64
27 |
28 | test_splits: ["test"]
29 |
30 | # distribution
31 | device: "cuda"
32 | world_size: 1
33 | dist_url: "env://"
34 | distributed: True
35 | use_dist_eval_sampler: False
36 |
37 | # model specific
38 | k_test: 128
39 |
40 | # misc
41 | seed: 42
42 | output_dir: "output/Retrieval_Flickr30k"
43 |
44 | evaluate: True
45 |
--------------------------------------------------------------------------------
/experiments/lavis/projects/blip/eval/vqav2_eval.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: blip_vqa
8 | model_type: vqav2
9 | image_size: 480
10 |
11 | datasets:
12 | coco_vqa: # name of the dataset builder
13 | type: eval
14 | vis_processor:
15 | eval:
16 | name: "blip_image_eval"
17 | image_size: 480
18 | text_processor:
19 | eval:
20 | name: "blip_question"
21 |
22 | run:
23 | task: vqa
24 | # optimization-specific
25 | batch_size_train: 16
26 | batch_size_eval: 64
27 | num_workers: 4
28 |
29 | # inference-specific
30 | max_len: 10
31 | min_len: 1
32 | num_beams: 3
33 | num_ans_candidates: 128
34 | inference_method: "rank"
35 |
36 | seed: 42
37 | output_dir: "output/BLIP/VQA"
38 |
39 | evaluate: True
40 | test_splits: ["val"]
41 |
42 | # distribution-specific
43 | device: "cuda"
44 | world_size: 1
45 | dist_url: "env://"
46 | distributed: True
47 |
--------------------------------------------------------------------------------
/experiments/lavis/projects/blip2/eval/caption_coco_flant5xl_eval.yaml:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/experiments/lavis/projects/blip2/eval/caption_coco_flant5xl_eval.yaml
--------------------------------------------------------------------------------
/experiments/lavis/projects/blip2/eval/ret_flickr_eval.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: blip2
8 | model_type: coco
9 | use_grad_checkpoint: False
10 |
11 | datasets:
12 | flickr30k: # name of the dataset builder
13 | vis_processor:
14 | eval:
15 | name: "blip_image_eval"
16 | image_size: 364
17 | text_processor:
18 | eval:
19 | name: "blip_caption"
20 |
21 | run:
22 | task: retrieval
23 |
24 | # dataloading
25 | num_workers: 4
26 | batch_size_train: 16
27 | batch_size_eval: 32
28 |
29 | test_splits: ["test"]
30 |
31 | # distribution
32 | device: "cuda"
33 | world_size: 1
34 | dist_url: "env://"
35 | distributed: True
36 | use_dist_eval_sampler: False
37 |
38 | # model specific
39 | k_test: 128
40 |
41 | # misc
42 | seed: 42
43 | output_dir: "output/BLIP2/Retrieval_Flickr30k"
44 |
45 | evaluate: True
--------------------------------------------------------------------------------
/experiments/lavis/projects/clip/exp_coco_ret_eval.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: clip
8 |
9 | model_type: ViT-L-14-336
10 |
11 | datasets:
12 | coco_retrieval: # name of the dataset builder
13 | vis_processor:
14 | train:
15 | name: "clip_image_train"
16 | image_size: 336
17 | eval:
18 | name: "clip_image_eval"
19 | image_size: 336
20 | text_processor:
21 | train:
22 | name: "blip_caption"
23 | eval:
24 | name: "blip_caption"
25 |
26 | run:
27 | task: retrieval
28 |
29 | # dataloading
30 | num_workers: 4
31 | batch_size_train: 32
32 | batch_size_eval: 128
33 |
34 | test_splits: ["test"]
35 |
36 | # distribution
37 | device: "cuda"
38 | world_size: 1
39 | dist_url: "env://"
40 | distributed: True
41 | use_dist_eval_sampler: True
42 |
43 | # misc
44 | seed: 42
45 | output_dir: "output/clip/Retrieval_COCO"
46 |
47 | evaluate: True
48 |
--------------------------------------------------------------------------------
/experiments/lavis/projects/clip/exp_flickr_ret_eval.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: clip
8 |
9 | model_type: ViT-L-14-336
10 |
11 | datasets:
12 | flickr30k: # name of the dataset builder
13 | vis_processor:
14 | train:
15 | name: "clip_image_train"
16 | image_size: 336
17 | eval:
18 | name: "clip_image_eval"
19 | image_size: 336
20 | text_processor:
21 | train:
22 | name: "blip_caption"
23 | eval:
24 | name: "blip_caption"
25 |
26 | run:
27 | task: retrieval
28 |
29 | # dataloading
30 | num_workers: 4
31 | batch_size_train: 32
32 | batch_size_eval: 128
33 |
34 | test_splits: ["test"]
35 |
36 | # distribution
37 | device: "cuda"
38 | world_size: 1
39 | dist_url: "env://"
40 | distributed: True
41 | use_dist_eval_sampler: True
42 |
43 | # misc
44 | seed: 42
45 | output_dir: "output/clip/Retrieval_Flickr"
46 |
47 | evaluate: True
48 |
--------------------------------------------------------------------------------
/experiments/lavis/projects/clip/exp_imnet_zs_eval.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: clip
8 |
9 | model_type: ViT-L-14-336
10 |
11 | datasets:
12 | imagenet: # name of the dataset builder
13 | vis_processor:
14 | eval:
15 | name: "clip_image_eval"
16 | # image_size: 224
17 | image_size: 336
18 |
19 | run:
20 | task: multimodal_classification
21 |
22 | # dataloading
23 | num_workers: 4
24 | batch_size_train: 32
25 | batch_size_eval: 128
26 |
27 | test_splits: ["val"]
28 |
29 | # distribution
30 | device: "cuda"
31 | world_size: 1
32 | dist_url: "env://"
33 | distributed: True
34 |
35 | # misc
36 | seed: 42
37 | output_dir: "output/clip/zs_imnet"
38 |
39 | evaluate: True
40 |
--------------------------------------------------------------------------------
/experiments/lavis/projects/gpt/eval/dialogue_avsd_eval.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: gpt_dialogue
8 | model_type: base
9 |
10 | datasets:
11 | avsd_dialogue: # name of the dataset builder
12 | vis_processor:
13 | eval:
14 | name: "gpt_video_ft"
15 | visual_ft: ["i3d_flow", "i3d_rgb"]
16 | audio_ft: ["vggish"]
17 | text_processor:
18 | eval:
19 | name: "gpt_dialogue"
20 | max_turns: 3
21 | use_caption: True
22 |
23 | run:
24 | task: dialogue
25 | # optimizer
26 | batch_size_train: 16
27 | batch_size_eval: 16
28 | num_workers: 0
29 |
30 | max_len: 20
31 | min_len: 5
32 | num_beams: 5
33 |
34 | seed: 42
35 | output_dir: "output/gpt2/dialogue_avsd"
36 |
37 | evaluate: True
38 | valid_splits: ["test"]
39 |
40 | device: "cuda"
41 | world_size: 1
42 | dist_url: "env://"
43 | distributed: True
44 |
--------------------------------------------------------------------------------
/experiments/lavis/runners/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | from lavis.runners.runner_base import RunnerBase
9 | from lavis.runners.runner_iter import RunnerIter
10 |
11 | __all__ = ["RunnerBase", "RunnerIter"]
12 |
--------------------------------------------------------------------------------
/experiments/lavis/tasks/image_text_pretrain.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | from lavis.common.registry import registry
9 | from lavis.tasks.base_task import BaseTask
10 |
11 |
12 | @registry.register_task("image_text_pretrain")
13 | class ImageTextPretrainTask(BaseTask):
14 | def __init__(self):
15 | super().__init__()
16 |
17 | def evaluation(self, model, data_loader, cuda_enabled=True):
18 | pass
19 |
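Tasks registered with `registry.register_task` are looked up by name when a run is built. A minimal sketch, assuming the upstream LAVIS registry API (`get_task_class`) behaves as in LAVIS proper.

    # Minimal sketch (assumes upstream LAVIS registry behaviour): fetch a task
    # class by its registered name and instantiate it.
    from lavis.common.registry import registry
    import lavis.tasks  # importing the package runs the @register_task decorators

    task_cls = registry.get_task_class("image_text_pretrain")
    task = task_cls()            # ImageTextPretrainTask takes no arguments
    print(task_cls.__name__)     # "ImageTextPretrainTask"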
--------------------------------------------------------------------------------
/experiments/lavis/tasks/text_to_image_generation.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause
5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
6 | """
7 |
8 | from lavis.tasks import BaseTask
9 | from lavis.common.registry import registry
10 |
11 |
12 | @registry.register_task("text-to-image-generation")
13 | class TextToImageGenerationTask(BaseTask):
14 | def __init__(self, cfg):
15 | super().__init__()
16 |
17 | self.cfg = cfg
18 |
19 | @classmethod
20 | def setup_task(cls, cfg):
21 | run_cfg = cfg.run_cfg
22 |
23 | return cls(cfg=run_cfg)
24 |
--------------------------------------------------------------------------------
/experiments/llava/__init__.py:
--------------------------------------------------------------------------------
1 | from .model import LlavaLlamaForCausalLM
2 |
--------------------------------------------------------------------------------
/experiments/llava/constants.py:
--------------------------------------------------------------------------------
1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30
2 | WORKER_HEART_BEAT_INTERVAL = 15
3 |
4 | LOGDIR = "."
5 |
6 | # Model Constants
7 | IGNORE_INDEX = -100
8 | IMAGE_TOKEN_INDEX = -200
9 | DEFAULT_IMAGE_TOKEN = "<image>"
10 | DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
11 | DEFAULT_IM_START_TOKEN = "<im_start>"
12 | DEFAULT_IM_END_TOKEN = "<im_end>"
13 |
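The placeholder token and its negative index are what LLaVA-style preprocessing splices into the text ids before the image features are inserted. An illustrative sketch of that splice, assuming a HuggingFace tokenizer; this is not the repository's own helper.

    # Illustrative only: replace every DEFAULT_IMAGE_TOKEN occurrence in the
    # prompt with IMAGE_TOKEN_INDEX in the token-id sequence.
    from llava.constants import DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX

    def insert_image_token(prompt, tokenizer,
                           image_token=DEFAULT_IMAGE_TOKEN,
                           image_token_index=IMAGE_TOKEN_INDEX):
        pieces = prompt.split(image_token)
        input_ids = tokenizer(pieces[0]).input_ids
        for piece in pieces[1:]:
            input_ids += [image_token_index]
            input_ids += tokenizer(piece, add_special_tokens=False).input_ids
        return input_ids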
--------------------------------------------------------------------------------
/experiments/llava/model/__init__.py:
--------------------------------------------------------------------------------
1 | from .language_model.llava_llama import LlavaLlamaForCausalLM, LlavaConfig
2 | from .language_model.llava_mpt import LlavaMPTForCausalLM, LlavaMPTConfig
3 |
--------------------------------------------------------------------------------
/experiments/llava/model/consolidate.py:
--------------------------------------------------------------------------------
1 | """
2 | Usage:
3 | python3 -m llava.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate
4 | """
5 | import argparse
6 |
7 | import torch
8 | from transformers import AutoTokenizer, AutoModelForCausalLM
9 | from llava.model import *
10 | from llava.model.utils import auto_upgrade
11 |
12 |
13 | def consolidate_ckpt(src_path, dst_path):
14 | print("Loading model")
15 | auto_upgrade(src_path)
16 | src_model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)
17 | src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False)
18 | src_model.save_pretrained(dst_path)
19 | src_tokenizer.save_pretrained(dst_path)
20 |
21 |
22 | if __name__ == "__main__":
23 | parser = argparse.ArgumentParser()
24 | parser.add_argument("--src", type=str, required=True)
25 | parser.add_argument("--dst", type=str, required=True)
26 |
27 | args = parser.parse_args()
28 |
29 | consolidate_ckpt(args.src, args.dst)
30 |
--------------------------------------------------------------------------------
/experiments/llava/model/language_model/mpt/custom_embedding.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import torch.nn.functional as F
4 | from torch import Tensor
5 |
6 | class SharedEmbedding(nn.Embedding):
7 |
8 | def forward(self, input: Tensor, unembed: bool=False) -> Tensor:
9 | if unembed:
10 | return F.linear(input, self.weight)
11 | return super().forward(input)
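`SharedEmbedding` lets one weight matrix serve as both the input embedding and the output (unembedding) projection, i.e. weight tying. A minimal usage sketch.

    # Minimal usage sketch: the same weights embed token ids and, with
    # unembed=True, project hidden states back to vocabulary logits.
    import torch

    emb = SharedEmbedding(num_embeddings=32, embedding_dim=8)
    ids = torch.randint(0, 32, (2, 5))
    hidden = emb(ids)                      # (2, 5, 8) embeddings
    logits = emb(hidden, unembed=True)     # (2, 5, 32) logits via F.linear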
--------------------------------------------------------------------------------
/experiments/llava/model/multimodal_encoder/builder.py:
--------------------------------------------------------------------------------
1 | import os
2 | from .clip_encoder import CLIPVisionTower
3 |
4 |
5 | def build_vision_tower(vision_tower_cfg, **kwargs):
6 | vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None))
7 | is_absolute_path_exists = os.path.exists(vision_tower)
8 | if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion"):
9 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
10 |
11 | raise ValueError(f'Unknown vision tower: {vision_tower}')
12 |
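`build_vision_tower` only needs a config object exposing `mm_vision_tower` (or `vision_tower`); identifiers starting with "openai" or "laion" route to `CLIPVisionTower`. A minimal sketch, assuming the upstream LLaVA `CLIPVisionTower` signature (`delay_load`, `mm_vision_select_layer`, `mm_vision_select_feature`).

    # Minimal sketch: any object carrying the expected attributes works here.
    from argparse import Namespace

    cfg = Namespace(
        mm_vision_tower="openai/clip-vit-large-patch14-336",
        mm_vision_select_layer=-2,        # consumed downstream by CLIPVisionTower
        mm_vision_select_feature="patch",
    )
    vision_tower = build_vision_tower(cfg, delay_load=True)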
--------------------------------------------------------------------------------
/experiments/llava/model/utils.py:
--------------------------------------------------------------------------------
1 | from transformers import AutoConfig
2 |
3 |
4 | def auto_upgrade(config):
5 | cfg = AutoConfig.from_pretrained(config)
6 | if 'llava' in config and 'llava' not in cfg.model_type:
7 | assert cfg.model_type == 'llama'
8 | print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.")
9 | print("You must upgrade the checkpoint to the new code base (this can be done automatically).")
10 | confirm = input("Please confirm that you want to upgrade the checkpoint. [Y/N]")
11 | if confirm.lower() in ["y", "yes"]:
12 | print("Upgrading checkpoint...")
13 | assert len(cfg.architectures) == 1
14 | setattr(cfg.__class__, "model_type", "llava")
15 | cfg.architectures[0] = 'LlavaLlamaForCausalLM'
16 | cfg.save_pretrained(config)
17 | print("Checkpoint upgraded.")
18 | else:
19 | print("Checkpoint upgrade aborted.")
20 | exit(1)
21 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | torchvision==0.15.2
2 | transformers==4.31.0
3 | torch==2.0.1
4 | tokenizers>=0.12.1,<0.14
5 | shortuuid
6 | accelerate==0.21.0
7 | peft==0.4.0
8 | bitsandbytes==0.41.0
9 | scikit-learn==1.2.2
10 | gradio==3.35.2
11 | gradio_client==0.2.9
12 | httpx==0.24.0
13 | numpy
14 | requests
15 | uvicorn
16 | fastapi
17 | einops
18 | einops-exts
19 | timm
20 | contexttimer
21 | decord
22 | diffusers
23 | fairscale
24 | ftfy
25 | iopath
26 | ipython
27 | omegaconf
28 | opencv-python
29 | opendatasets
30 | packaging
31 | pandas
32 | plotly
33 | pre-commit
34 | pycocoevalcap
35 | pycocotools
36 | python-magic
37 | scikit-image
38 | sentencepiece
39 | spacy
40 | streamlit
41 | tqdm
42 | webdataset
43 | wheel
44 | torchaudio
45 | soundfile
46 | moviepy
47 | nltk
--------------------------------------------------------------------------------
/utils/logger.py:
--------------------------------------------------------------------------------
1 | import torch.distributed as dist
2 | import logging
3 |
4 |
5 | def create_logger(logging_dir):
6 | """
7 | Create a logger that writes to a log file and stdout.
8 | """
9 | if dist.get_rank() == 0: # real logger
10 | logging.basicConfig(
11 | level=logging.INFO,
12 | format="[\033[34m%(asctime)s\033[0m] %(message)s",
13 | datefmt="%Y-%m-%d %H:%M:%S",
14 | handlers=[logging.StreamHandler(), logging.FileHandler(f"{logging_dir}/log.txt")],
15 | )
16 | logger = logging.getLogger(__name__)
17 | else: # dummy logger (does nothing)
18 | logger = logging.getLogger(__name__)
19 | logger.addHandler(logging.NullHandler())
20 | return logger
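`create_logger` calls `dist.get_rank()`, so it assumes the default process group is already initialized. A minimal single-process sketch; the "./logs" directory is hypothetical.

    # Minimal single-rank sketch: initialize a one-process "gloo" group so that
    # dist.get_rank() is valid, then create the logger.
    import os
    import torch.distributed as dist

    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")
    dist.init_process_group("gloo", rank=0, world_size=1)

    os.makedirs("./logs", exist_ok=True)
    logger = create_logger("./logs")
    logger.info("logger ready on rank %d", dist.get_rank())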
--------------------------------------------------------------------------------