├── .gitignore ├── LICENSE ├── README.md ├── assets ├── amber.png ├── amber_discriminative.png ├── eyes_forest.png ├── llava_bench.png ├── mme-fullset.png ├── mme-hallucination.png ├── motivation.png ├── observation.png ├── overview.png ├── pope.png ├── qualitative_amber_instructblip.png ├── qualitative_amber_instructblip2.png ├── qualitative_amber_llava.png ├── qualitative_amber_llava2.png ├── qualitative_mme2.png ├── qualitative_mme_instructblip.png ├── qualitative_mme_llava.png ├── qualitative_pope.png └── qualitative_pope2.png ├── avisc_utils ├── avisc_sample.py └── vcd_add_noise.py ├── eval_bench ├── SimSun.ttf ├── amber_eval_instructblip.py ├── amber_eval_llava.py ├── amber_loader.py ├── llava_bench_llava.py ├── pope_eval_instructblipb.py ├── pope_eval_llavab.py ├── pope_loader.py └── scripts │ ├── amber_eval.sh │ ├── llava_bench_eval.sh │ └── pope_eval_batch.sh ├── experiments ├── AMBER │ ├── LICENSE │ ├── README.md │ ├── README_File │ │ ├── Paper-Arxiv-orange.svg │ │ ├── comparison.jpg │ │ ├── intro.jpg │ │ ├── result.jpg │ │ └── statistics.jpg │ ├── data │ │ ├── annotations.json │ │ ├── metrics.txt │ │ ├── query │ │ │ ├── query_all.json │ │ │ ├── query_discriminative-attribute.json │ │ │ ├── query_discriminative-existence.json │ │ │ ├── query_discriminative-relation.json │ │ │ ├── query_discriminative.json │ │ │ └── query_generative.json │ │ ├── relation.json │ │ └── safe_words.txt │ └── inference.py ├── cd_scripts │ └── mme_eval.sh ├── eval │ ├── calculation.py │ ├── convert_answer_to_mme.py │ ├── eval_mme.py │ ├── eval_mme │ │ ├── .DS_Store │ │ ├── LaVIN │ │ │ ├── OCR.txt │ │ │ ├── artwork.txt │ │ │ ├── celebrity.txt │ │ │ ├── code_reasoning.txt │ │ │ ├── color.txt │ │ │ ├── commonsense_reasoning.txt │ │ │ ├── count.txt │ │ │ ├── existence.txt │ │ │ ├── landmark.txt │ │ │ ├── numerical_calculation.txt │ │ │ ├── position.txt │ │ │ ├── posters.txt │ │ │ ├── scene.txt │ │ │ └── text_translation.txt │ │ ├── Your_Results │ │ │ ├── OCR.txt │ │ │ ├── artwork.txt │ │ │ ├── celebrity.txt │ │ │ ├── code_reasoning.txt │ │ │ ├── color.txt │ │ │ ├── commonsense_reasoning.txt │ │ │ ├── count.txt │ │ │ ├── existence.txt │ │ │ ├── landmark.txt │ │ │ ├── numerical_calculation.txt │ │ │ ├── position.txt │ │ │ ├── posters.txt │ │ │ ├── scene.txt │ │ │ └── text_translation.txt │ │ └── readme.txt │ ├── eval_pope.py │ ├── mme_instructblip.py │ ├── mme_llava.py │ ├── object_hallucination_vqa_instructblip.py │ └── object_hallucination_vqa_llava.py ├── lavis │ ├── __init__.py │ ├── common │ │ ├── annotator │ │ │ ├── canny │ │ │ │ └── __init__.py │ │ │ ├── ckpts │ │ │ │ └── download.sh │ │ │ ├── hed │ │ │ │ └── __init__.py │ │ │ ├── midas │ │ │ │ ├── __init__.py │ │ │ │ ├── api.py │ │ │ │ ├── midas │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base_model.py │ │ │ │ │ ├── blocks.py │ │ │ │ │ ├── dpt_depth.py │ │ │ │ │ ├── midas_net.py │ │ │ │ │ ├── midas_net_custom.py │ │ │ │ │ ├── transforms.py │ │ │ │ │ └── vit.py │ │ │ │ └── utils.py │ │ │ ├── mlsd │ │ │ │ ├── __init__.py │ │ │ │ ├── models │ │ │ │ │ ├── mbv2_mlsd_large.py │ │ │ │ │ └── mbv2_mlsd_tiny.py │ │ │ │ └── utils.py │ │ │ ├── openpose │ │ │ │ ├── __init__.py │ │ │ │ ├── body.py │ │ │ │ ├── hand.py │ │ │ │ ├── model.py │ │ │ │ └── util.py │ │ │ ├── uniformer │ │ │ │ ├── __init__.py │ │ │ │ ├── configs │ │ │ │ │ └── _base_ │ │ │ │ │ │ ├── datasets │ │ │ │ │ │ ├── ade20k.py │ │ │ │ │ │ ├── chase_db1.py │ │ │ │ │ │ ├── cityscapes.py │ │ │ │ │ │ ├── cityscapes_769x769.py │ │ │ │ │ │ ├── drive.py │ │ │ │ │ │ ├── hrf.py │ │ │ │ │ │ ├── pascal_context.py │ 
│ │ │ │ │ ├── pascal_context_59.py │ │ │ │ │ │ ├── pascal_voc12.py │ │ │ │ │ │ ├── pascal_voc12_aug.py │ │ │ │ │ │ └── stare.py │ │ │ │ │ │ ├── default_runtime.py │ │ │ │ │ │ ├── models │ │ │ │ │ │ ├── ann_r50-d8.py │ │ │ │ │ │ ├── apcnet_r50-d8.py │ │ │ │ │ │ ├── ccnet_r50-d8.py │ │ │ │ │ │ ├── cgnet.py │ │ │ │ │ │ ├── danet_r50-d8.py │ │ │ │ │ │ ├── deeplabv3_r50-d8.py │ │ │ │ │ │ ├── deeplabv3_unet_s5-d16.py │ │ │ │ │ │ ├── deeplabv3plus_r50-d8.py │ │ │ │ │ │ ├── dmnet_r50-d8.py │ │ │ │ │ │ ├── dnl_r50-d8.py │ │ │ │ │ │ ├── emanet_r50-d8.py │ │ │ │ │ │ ├── encnet_r50-d8.py │ │ │ │ │ │ ├── fast_scnn.py │ │ │ │ │ │ ├── fcn_hr18.py │ │ │ │ │ │ ├── fcn_r50-d8.py │ │ │ │ │ │ ├── fcn_unet_s5-d16.py │ │ │ │ │ │ ├── fpn_r50.py │ │ │ │ │ │ ├── fpn_uniformer.py │ │ │ │ │ │ ├── gcnet_r50-d8.py │ │ │ │ │ │ ├── lraspp_m-v3-d8.py │ │ │ │ │ │ ├── nonlocal_r50-d8.py │ │ │ │ │ │ ├── ocrnet_hr18.py │ │ │ │ │ │ ├── ocrnet_r50-d8.py │ │ │ │ │ │ ├── pointrend_r50.py │ │ │ │ │ │ ├── psanet_r50-d8.py │ │ │ │ │ │ ├── pspnet_r50-d8.py │ │ │ │ │ │ ├── pspnet_unet_s5-d16.py │ │ │ │ │ │ ├── upernet_r50.py │ │ │ │ │ │ └── upernet_uniformer.py │ │ │ │ │ │ └── schedules │ │ │ │ │ │ ├── schedule_160k.py │ │ │ │ │ │ ├── schedule_20k.py │ │ │ │ │ │ ├── schedule_40k.py │ │ │ │ │ │ └── schedule_80k.py │ │ │ │ ├── exp │ │ │ │ │ └── upernet_global_small │ │ │ │ │ │ ├── config.py │ │ │ │ │ │ ├── run.sh │ │ │ │ │ │ ├── test.sh │ │ │ │ │ │ ├── test_config_g.py │ │ │ │ │ │ ├── test_config_h32.py │ │ │ │ │ │ └── test_config_w32.py │ │ │ │ ├── mmcv │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── arraymisc │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── quantization.py │ │ │ │ │ ├── cnn │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── alexnet.py │ │ │ │ │ │ ├── bricks │ │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ │ ├── activation.py │ │ │ │ │ │ │ ├── context_block.py │ │ │ │ │ │ │ ├── conv.py │ │ │ │ │ │ │ ├── conv2d_adaptive_padding.py │ │ │ │ │ │ │ ├── conv_module.py │ │ │ │ │ │ │ ├── conv_ws.py │ │ │ │ │ │ │ ├── depthwise_separable_conv_module.py │ │ │ │ │ │ │ ├── drop.py │ │ │ │ │ │ │ ├── generalized_attention.py │ │ │ │ │ │ │ ├── hsigmoid.py │ │ │ │ │ │ │ ├── hswish.py │ │ │ │ │ │ │ ├── non_local.py │ │ │ │ │ │ │ ├── norm.py │ │ │ │ │ │ │ ├── padding.py │ │ │ │ │ │ │ ├── plugin.py │ │ │ │ │ │ │ ├── registry.py │ │ │ │ │ │ │ ├── scale.py │ │ │ │ │ │ │ ├── swish.py │ │ │ │ │ │ │ ├── transformer.py │ │ │ │ │ │ │ ├── upsample.py │ │ │ │ │ │ │ └── wrappers.py │ │ │ │ │ │ ├── builder.py │ │ │ │ │ │ ├── resnet.py │ │ │ │ │ │ ├── utils │ │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ │ ├── flops_counter.py │ │ │ │ │ │ │ ├── fuse_conv_bn.py │ │ │ │ │ │ │ ├── sync_bn.py │ │ │ │ │ │ │ └── weight_init.py │ │ │ │ │ │ └── vgg.py │ │ │ │ │ ├── engine │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── test.py │ │ │ │ │ ├── fileio │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── file_client.py │ │ │ │ │ │ ├── handlers │ │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ │ ├── base.py │ │ │ │ │ │ │ ├── json_handler.py │ │ │ │ │ │ │ ├── pickle_handler.py │ │ │ │ │ │ │ └── yaml_handler.py │ │ │ │ │ │ ├── io.py │ │ │ │ │ │ └── parse.py │ │ │ │ │ ├── image │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── colorspace.py │ │ │ │ │ │ ├── geometric.py │ │ │ │ │ │ ├── io.py │ │ │ │ │ │ ├── misc.py │ │ │ │ │ │ └── photometric.py │ │ │ │ │ ├── model_zoo │ │ │ │ │ │ ├── deprecated.json │ │ │ │ │ │ ├── mmcls.json │ │ │ │ │ │ └── open_mmlab.json │ │ │ │ │ ├── ops │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── assign_score_withk.py │ │ │ │ │ │ ├── ball_query.py │ │ │ │ │ │ ├── bbox.py │ │ │ │ │ │ ├── border_align.py 
│ │ │ │ │ │ ├── box_iou_rotated.py │ │ │ │ │ │ ├── carafe.py │ │ │ │ │ │ ├── cc_attention.py │ │ │ │ │ │ ├── contour_expand.py │ │ │ │ │ │ ├── corner_pool.py │ │ │ │ │ │ ├── correlation.py │ │ │ │ │ │ ├── deform_conv.py │ │ │ │ │ │ ├── deform_roi_pool.py │ │ │ │ │ │ ├── deprecated_wrappers.py │ │ │ │ │ │ ├── focal_loss.py │ │ │ │ │ │ ├── furthest_point_sample.py │ │ │ │ │ │ ├── fused_bias_leakyrelu.py │ │ │ │ │ │ ├── gather_points.py │ │ │ │ │ │ ├── group_points.py │ │ │ │ │ │ ├── info.py │ │ │ │ │ │ ├── iou3d.py │ │ │ │ │ │ ├── knn.py │ │ │ │ │ │ ├── masked_conv.py │ │ │ │ │ │ ├── merge_cells.py │ │ │ │ │ │ ├── modulated_deform_conv.py │ │ │ │ │ │ ├── multi_scale_deform_attn.py │ │ │ │ │ │ ├── nms.py │ │ │ │ │ │ ├── pixel_group.py │ │ │ │ │ │ ├── point_sample.py │ │ │ │ │ │ ├── points_in_boxes.py │ │ │ │ │ │ ├── points_sampler.py │ │ │ │ │ │ ├── psa_mask.py │ │ │ │ │ │ ├── roi_align.py │ │ │ │ │ │ ├── roi_align_rotated.py │ │ │ │ │ │ ├── roi_pool.py │ │ │ │ │ │ ├── roiaware_pool3d.py │ │ │ │ │ │ ├── roipoint_pool3d.py │ │ │ │ │ │ ├── saconv.py │ │ │ │ │ │ ├── scatter_points.py │ │ │ │ │ │ ├── sync_bn.py │ │ │ │ │ │ ├── three_interpolate.py │ │ │ │ │ │ ├── three_nn.py │ │ │ │ │ │ ├── tin_shift.py │ │ │ │ │ │ ├── upfirdn2d.py │ │ │ │ │ │ └── voxelize.py │ │ │ │ │ ├── parallel │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── _functions.py │ │ │ │ │ │ ├── collate.py │ │ │ │ │ │ ├── data_container.py │ │ │ │ │ │ ├── data_parallel.py │ │ │ │ │ │ ├── distributed.py │ │ │ │ │ │ ├── distributed_deprecated.py │ │ │ │ │ │ ├── registry.py │ │ │ │ │ │ ├── scatter_gather.py │ │ │ │ │ │ └── utils.py │ │ │ │ │ ├── runner │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── base_module.py │ │ │ │ │ │ ├── base_runner.py │ │ │ │ │ │ ├── builder.py │ │ │ │ │ │ ├── checkpoint.py │ │ │ │ │ │ ├── default_constructor.py │ │ │ │ │ │ ├── dist_utils.py │ │ │ │ │ │ ├── epoch_based_runner.py │ │ │ │ │ │ ├── fp16_utils.py │ │ │ │ │ │ ├── hooks │ │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ │ ├── checkpoint.py │ │ │ │ │ │ │ ├── closure.py │ │ │ │ │ │ │ ├── ema.py │ │ │ │ │ │ │ ├── evaluation.py │ │ │ │ │ │ │ ├── hook.py │ │ │ │ │ │ │ ├── iter_timer.py │ │ │ │ │ │ │ ├── logger │ │ │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ │ │ ├── base.py │ │ │ │ │ │ │ │ ├── dvclive.py │ │ │ │ │ │ │ │ ├── mlflow.py │ │ │ │ │ │ │ │ ├── neptune.py │ │ │ │ │ │ │ │ ├── pavi.py │ │ │ │ │ │ │ │ ├── tensorboard.py │ │ │ │ │ │ │ │ ├── text.py │ │ │ │ │ │ │ │ └── wandb.py │ │ │ │ │ │ │ ├── lr_updater.py │ │ │ │ │ │ │ ├── memory.py │ │ │ │ │ │ │ ├── momentum_updater.py │ │ │ │ │ │ │ ├── optimizer.py │ │ │ │ │ │ │ ├── profiler.py │ │ │ │ │ │ │ ├── sampler_seed.py │ │ │ │ │ │ │ └── sync_buffer.py │ │ │ │ │ │ ├── iter_based_runner.py │ │ │ │ │ │ ├── log_buffer.py │ │ │ │ │ │ ├── optimizer │ │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ │ ├── builder.py │ │ │ │ │ │ │ └── default_constructor.py │ │ │ │ │ │ ├── priority.py │ │ │ │ │ │ └── utils.py │ │ │ │ │ ├── utils │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── config.py │ │ │ │ │ │ ├── env.py │ │ │ │ │ │ ├── ext_loader.py │ │ │ │ │ │ ├── logging.py │ │ │ │ │ │ ├── misc.py │ │ │ │ │ │ ├── parrots_jit.py │ │ │ │ │ │ ├── parrots_wrapper.py │ │ │ │ │ │ ├── path.py │ │ │ │ │ │ ├── progressbar.py │ │ │ │ │ │ ├── registry.py │ │ │ │ │ │ ├── testing.py │ │ │ │ │ │ ├── timer.py │ │ │ │ │ │ ├── trace.py │ │ │ │ │ │ └── version_utils.py │ │ │ │ │ ├── version.py │ │ │ │ │ ├── video │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── io.py │ │ │ │ │ │ ├── optflow.py │ │ │ │ │ │ └── processing.py │ │ │ │ │ └── visualization │ │ │ │ │ │ ├── __init__.py 
│ │ │ │ │ │ ├── color.py │ │ │ │ │ │ ├── image.py │ │ │ │ │ │ └── optflow.py │ │ │ │ ├── mmcv_custom │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── checkpoint.py │ │ │ │ └── mmseg │ │ │ │ │ ├── apis │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── inference.py │ │ │ │ │ ├── test.py │ │ │ │ │ └── train.py │ │ │ │ │ ├── core │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── evaluation │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── class_names.py │ │ │ │ │ │ ├── eval_hooks.py │ │ │ │ │ │ └── metrics.py │ │ │ │ │ ├── seg │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── builder.py │ │ │ │ │ │ └── sampler │ │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ │ ├── base_pixel_sampler.py │ │ │ │ │ │ │ └── ohem_pixel_sampler.py │ │ │ │ │ └── utils │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ └── misc.py │ │ │ │ │ ├── datasets │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── ade.py │ │ │ │ │ ├── builder.py │ │ │ │ │ ├── chase_db1.py │ │ │ │ │ ├── cityscapes.py │ │ │ │ │ ├── custom.py │ │ │ │ │ ├── dataset_wrappers.py │ │ │ │ │ ├── drive.py │ │ │ │ │ ├── hrf.py │ │ │ │ │ ├── pascal_context.py │ │ │ │ │ ├── pipelines │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── compose.py │ │ │ │ │ │ ├── formating.py │ │ │ │ │ │ ├── loading.py │ │ │ │ │ │ ├── test_time_aug.py │ │ │ │ │ │ └── transforms.py │ │ │ │ │ ├── stare.py │ │ │ │ │ └── voc.py │ │ │ │ │ ├── models │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── backbones │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── cgnet.py │ │ │ │ │ │ ├── fast_scnn.py │ │ │ │ │ │ ├── hrnet.py │ │ │ │ │ │ ├── mobilenet_v2.py │ │ │ │ │ │ ├── mobilenet_v3.py │ │ │ │ │ │ ├── resnest.py │ │ │ │ │ │ ├── resnet.py │ │ │ │ │ │ ├── resnext.py │ │ │ │ │ │ ├── unet.py │ │ │ │ │ │ ├── uniformer.py │ │ │ │ │ │ └── vit.py │ │ │ │ │ ├── builder.py │ │ │ │ │ ├── decode_heads │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── ann_head.py │ │ │ │ │ │ ├── apc_head.py │ │ │ │ │ │ ├── aspp_head.py │ │ │ │ │ │ ├── cascade_decode_head.py │ │ │ │ │ │ ├── cc_head.py │ │ │ │ │ │ ├── da_head.py │ │ │ │ │ │ ├── decode_head.py │ │ │ │ │ │ ├── dm_head.py │ │ │ │ │ │ ├── dnl_head.py │ │ │ │ │ │ ├── ema_head.py │ │ │ │ │ │ ├── enc_head.py │ │ │ │ │ │ ├── fcn_head.py │ │ │ │ │ │ ├── fpn_head.py │ │ │ │ │ │ ├── gc_head.py │ │ │ │ │ │ ├── lraspp_head.py │ │ │ │ │ │ ├── nl_head.py │ │ │ │ │ │ ├── ocr_head.py │ │ │ │ │ │ ├── point_head.py │ │ │ │ │ │ ├── psa_head.py │ │ │ │ │ │ ├── psp_head.py │ │ │ │ │ │ ├── sep_aspp_head.py │ │ │ │ │ │ ├── sep_fcn_head.py │ │ │ │ │ │ └── uper_head.py │ │ │ │ │ ├── losses │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── accuracy.py │ │ │ │ │ │ ├── cross_entropy_loss.py │ │ │ │ │ │ ├── dice_loss.py │ │ │ │ │ │ ├── lovasz_loss.py │ │ │ │ │ │ └── utils.py │ │ │ │ │ ├── necks │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── fpn.py │ │ │ │ │ │ └── multilevel_neck.py │ │ │ │ │ ├── segmentors │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── base.py │ │ │ │ │ │ ├── cascade_encoder_decoder.py │ │ │ │ │ │ └── encoder_decoder.py │ │ │ │ │ └── utils │ │ │ │ │ │ ├── __init__.py │ │ │ │ │ │ ├── drop.py │ │ │ │ │ │ ├── inverted_residual.py │ │ │ │ │ │ ├── make_divisible.py │ │ │ │ │ │ ├── res_layer.py │ │ │ │ │ │ ├── se_layer.py │ │ │ │ │ │ ├── self_attention_block.py │ │ │ │ │ │ ├── up_conv_block.py │ │ │ │ │ │ └── weight_init.py │ │ │ │ │ ├── ops │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── encoding.py │ │ │ │ │ └── wrappers.py │ │ │ │ │ └── utils │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── collect_env.py │ │ │ │ │ └── logger.py │ │ │ └── util.py │ │ ├── config.py │ │ ├── dist_utils.py │ │ ├── gradcam.py │ │ ├── logger.py │ │ ├── optims.py │ │ ├── registry.py │ │ ├── utils.py │ │ └── 
vqa_tools │ │ │ ├── __init__.py │ │ │ ├── vqa.py │ │ │ └── vqa_eval.py │ ├── configs │ │ ├── datasets │ │ │ ├── aokvqa │ │ │ │ └── defaults.yaml │ │ │ ├── avsd │ │ │ │ └── defaults_dial.yaml │ │ │ ├── blip_diffusion_datasets │ │ │ │ └── defaults.yaml │ │ │ ├── coco │ │ │ │ ├── defaults_cap.yaml │ │ │ │ ├── defaults_ret.yaml │ │ │ │ ├── defaults_vqa.yaml │ │ │ │ └── eval_vqa.yaml │ │ │ ├── conceptual_caption │ │ │ │ ├── defaults_12m.yaml │ │ │ │ └── defaults_3m.yaml │ │ │ ├── didemo │ │ │ │ └── defaults_ret.yaml │ │ │ ├── flickr30k │ │ │ │ └── defaults.yaml │ │ │ ├── gqa │ │ │ │ ├── balanced_testdev.yaml │ │ │ │ ├── balanced_val.yaml │ │ │ │ └── defaults.yaml │ │ │ ├── imagenet │ │ │ │ └── defaults.yaml │ │ │ ├── laion │ │ │ │ └── defaults_2B_multi.yaml │ │ │ ├── msrvtt │ │ │ │ ├── defaults_cap.yaml │ │ │ │ ├── defaults_qa.yaml │ │ │ │ └── defaults_ret.yaml │ │ │ ├── msvd │ │ │ │ ├── defaults_cap.yaml │ │ │ │ └── defaults_qa.yaml │ │ │ ├── nlvr │ │ │ │ └── defaults.yaml │ │ │ ├── nocaps │ │ │ │ └── defaults.yaml │ │ │ ├── okvqa │ │ │ │ └── defaults.yaml │ │ │ ├── sbu_caption │ │ │ │ └── defaults.yaml │ │ │ ├── snli_ve │ │ │ │ └── defaults.yaml │ │ │ ├── vatex │ │ │ │ └── defaults_cap.yaml │ │ │ └── vg │ │ │ │ ├── defaults_caption.yaml │ │ │ │ └── defaults_vqa.yaml │ │ ├── default.yaml │ │ └── models │ │ │ ├── albef_classification_ve.yaml │ │ │ ├── albef_feature_extractor.yaml │ │ │ ├── albef_nlvr.yaml │ │ │ ├── albef_pretrain_base.yaml │ │ │ ├── albef_retrieval_coco.yaml │ │ │ ├── albef_retrieval_flickr.yaml │ │ │ ├── albef_vqav2.yaml │ │ │ ├── alpro_qa_msrvtt.yaml │ │ │ ├── alpro_qa_msvd.yaml │ │ │ ├── alpro_retrieval_didemo.yaml │ │ │ ├── alpro_retrieval_msrvtt.yaml │ │ │ ├── bert_config.json │ │ │ ├── bert_config_alpro.json │ │ │ ├── blip-diffusion │ │ │ ├── blip_diffusion_base.yaml │ │ │ ├── blip_diffusion_controlnet_canny.yaml │ │ │ ├── blip_diffusion_controlnet_depth.yaml │ │ │ └── blip_diffusion_controlnet_hed.yaml │ │ │ ├── blip2 │ │ │ ├── blip2_caption_flant5xl.yaml │ │ │ ├── blip2_caption_opt2.7b.yaml │ │ │ ├── blip2_caption_opt6.7b.yaml │ │ │ ├── blip2_coco.yaml │ │ │ ├── blip2_instruct_flant5xl.yaml │ │ │ ├── blip2_instruct_flant5xxl.yaml │ │ │ ├── blip2_instruct_vicuna13b.yaml │ │ │ ├── blip2_instruct_vicuna7b.yaml │ │ │ ├── blip2_pretrain.yaml │ │ │ ├── blip2_pretrain_flant5xl.yaml │ │ │ ├── blip2_pretrain_flant5xl_vitL.yaml │ │ │ ├── blip2_pretrain_flant5xxl.yaml │ │ │ ├── blip2_pretrain_llama7b.yaml │ │ │ ├── blip2_pretrain_opt2.7b.yaml │ │ │ ├── blip2_pretrain_opt6.7b.yaml │ │ │ └── blip2_pretrain_vitL.yaml │ │ │ ├── blip_caption_base_coco.yaml │ │ │ ├── blip_caption_large_coco.yaml │ │ │ ├── blip_classification_base.yaml │ │ │ ├── blip_feature_extractor_base.yaml │ │ │ ├── blip_itm_base.yaml │ │ │ ├── blip_itm_large.yaml │ │ │ ├── blip_nlvr.yaml │ │ │ ├── blip_pretrain_base.yaml │ │ │ ├── blip_pretrain_large.yaml │ │ │ ├── blip_retrieval_coco.yaml │ │ │ ├── blip_retrieval_flickr.yaml │ │ │ ├── blip_vqa_aokvqa.yaml │ │ │ ├── blip_vqa_okvqa.yaml │ │ │ ├── blip_vqav2.yaml │ │ │ ├── clip │ │ │ ├── RN101-quickgelu.json │ │ │ ├── RN101.json │ │ │ ├── RN50-quickgelu.json │ │ │ ├── RN50.json │ │ │ ├── RN50x16.json │ │ │ ├── RN50x4.json │ │ │ ├── ViT-B-16-plus-240.json │ │ │ ├── ViT-B-16-plus.json │ │ │ ├── ViT-B-16.json │ │ │ ├── ViT-B-32-plus-256.json │ │ │ ├── ViT-B-32-quickgelu.json │ │ │ ├── ViT-B-32.json │ │ │ ├── ViT-H-14.json │ │ │ ├── ViT-H-16.json │ │ │ ├── ViT-L-14-280.json │ │ │ ├── ViT-L-14-336.json │ │ │ ├── ViT-L-14.json │ │ │ ├── ViT-L-16-320.json │ │ │ ├── 
ViT-L-16.json │ │ │ ├── ViT-g-14.json │ │ │ ├── timm-efficientnetv2_rw_s.json │ │ │ ├── timm-resnet50d.json │ │ │ ├── timm-resnetaa50d.json │ │ │ ├── timm-resnetblur50.json │ │ │ ├── timm-swin_base_patch4_window7_224.json │ │ │ ├── timm-vit_base_patch16_224.json │ │ │ ├── timm-vit_base_patch32_224.json │ │ │ └── timm-vit_small_patch16_224.json │ │ │ ├── clip_resnet50.yaml │ │ │ ├── clip_vit_base16.yaml │ │ │ ├── clip_vit_base32.yaml │ │ │ ├── clip_vit_large14.yaml │ │ │ ├── clip_vit_large14_336.yaml │ │ │ ├── gpt_dialogue_base.yaml │ │ │ ├── img2prompt-vqa │ │ │ └── img2prompt_vqa_base.yaml │ │ │ ├── med_config.json │ │ │ ├── med_config_albef.json │ │ │ ├── med_large_config.json │ │ │ └── pnp-vqa │ │ │ ├── pnp_vqa_3b.yaml │ │ │ ├── pnp_vqa_base.yaml │ │ │ ├── pnp_vqa_large.yaml │ │ │ ├── unifiedqav2_3b_config.json │ │ │ ├── unifiedqav2_base_config.json │ │ │ └── unifiedqav2_large_config.json │ ├── datasets │ │ ├── builders │ │ │ ├── __init__.py │ │ │ ├── base_dataset_builder.py │ │ │ ├── caption_builder.py │ │ │ ├── classification_builder.py │ │ │ ├── dialogue_builder.py │ │ │ ├── image_text_pair_builder.py │ │ │ ├── imagefolder_builder.py │ │ │ ├── retrieval_builder.py │ │ │ ├── text_to_image_generation_builder.py │ │ │ ├── video_qa_builder.py │ │ │ └── vqa_builder.py │ │ ├── data_utils.py │ │ ├── datasets │ │ │ ├── aok_vqa_datasets.py │ │ │ ├── avsd_dialogue_datasets.py │ │ │ ├── base_dataset.py │ │ │ ├── caption_datasets.py │ │ │ ├── coco_caption_datasets.py │ │ │ ├── coco_vqa_datasets.py │ │ │ ├── dataloader_utils.py │ │ │ ├── dialogue_datasets.py │ │ │ ├── gqa_datasets.py │ │ │ ├── image_text_pair_datasets.py │ │ │ ├── imagefolder_dataset.py │ │ │ ├── laion_dataset.py │ │ │ ├── multimodal_classification_datasets.py │ │ │ ├── nlvr_datasets.py │ │ │ ├── retrieval_datasets.py │ │ │ ├── snli_ve_datasets.py │ │ │ ├── subject_driven_t2i_dataset.py │ │ │ ├── vg_vqa_datasets.py │ │ │ ├── video_caption_datasets.py │ │ │ ├── video_vqa_datasets.py │ │ │ └── vqa_datasets.py │ │ └── download_scripts │ │ │ ├── DownloadConceptualCaptions │ │ │ ├── LICENSE │ │ │ ├── README.md │ │ │ ├── create_annotation_12m.ipynb │ │ │ ├── create_annotation_3m.ipynb │ │ │ ├── download_data_cc12m.py │ │ │ └── download_data_cc3m.py │ │ │ ├── download_coco.py │ │ │ ├── download_didemo.py │ │ │ ├── download_flickr.py │ │ │ ├── download_gqa.py │ │ │ ├── download_msrvtt.py │ │ │ ├── download_msvd.py │ │ │ ├── download_nocaps.py │ │ │ ├── download_sbu.py │ │ │ └── download_vg.py │ ├── models │ │ ├── __init__.py │ │ ├── albef_models │ │ │ ├── __init__.py │ │ │ ├── albef_classification.py │ │ │ ├── albef_feature_extractor.py │ │ │ ├── albef_nlvr.py │ │ │ ├── albef_outputs.py │ │ │ ├── albef_pretrain.py │ │ │ ├── albef_retrieval.py │ │ │ └── albef_vqa.py │ │ ├── alpro_models │ │ │ ├── __init__.py │ │ │ ├── alpro_outputs.py │ │ │ ├── alpro_qa.py │ │ │ └── alpro_retrieval.py │ │ ├── base_model.py │ │ ├── blip2_models │ │ │ ├── Qformer.py │ │ │ ├── __init__.py │ │ │ ├── blip2.py │ │ │ ├── blip2_image_text_matching.py │ │ │ ├── blip2_opt.py │ │ │ ├── blip2_qformer.py │ │ │ ├── blip2_t5.py │ │ │ ├── blip2_t5_instruct.py │ │ │ ├── blip2_vicuna_instruct.py │ │ │ ├── modeling_llama.py │ │ │ ├── modeling_opt.py │ │ │ └── modeling_t5.py │ │ ├── blip_diffusion_models │ │ │ ├── __init__.py │ │ │ ├── blip_diffusion.py │ │ │ ├── modeling_ctx_clip.py │ │ │ ├── ptp_utils.py │ │ │ └── utils.py │ │ ├── blip_models │ │ │ ├── __init__.py │ │ │ ├── blip.py │ │ │ ├── blip_caption.py │ │ │ ├── blip_classification.py │ │ │ ├── 
blip_feature_extractor.py │ │ │ ├── blip_image_text_matching.py │ │ │ ├── blip_nlvr.py │ │ │ ├── blip_outputs.py │ │ │ ├── blip_pretrain.py │ │ │ ├── blip_retrieval.py │ │ │ ├── blip_vqa.py │ │ │ └── nlvr_encoder.py │ │ ├── clip_models │ │ │ ├── __init__.py │ │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ │ ├── clip_outputs.py │ │ │ ├── loss.py │ │ │ ├── model.py │ │ │ ├── pics │ │ │ │ └── CLIP.png │ │ │ ├── pretrained.py │ │ │ ├── timm_model.py │ │ │ ├── tokenizer.py │ │ │ ├── transform.py │ │ │ └── utils.py │ │ ├── clip_vit.py │ │ ├── eva_vit.py │ │ ├── gpt_models │ │ │ └── gpt_dialogue.py │ │ ├── img2prompt_models │ │ │ ├── __init__.py │ │ │ └── img2prompt_vqa.py │ │ ├── med.py │ │ ├── pnp_vqa_models │ │ │ ├── __init__.py │ │ │ ├── pnp_unifiedqav2_fid.py │ │ │ └── pnp_vqa.py │ │ ├── timesformer │ │ │ ├── __init__.py │ │ │ ├── conv2d_same.py │ │ │ ├── features.py │ │ │ ├── helpers.py │ │ │ ├── linear.py │ │ │ ├── vit.py │ │ │ └── vit_utils.py │ │ └── vit.py │ ├── processors │ │ ├── __init__.py │ │ ├── alpro_processors.py │ │ ├── base_processor.py │ │ ├── blip_diffusion_processors.py │ │ ├── blip_processors.py │ │ ├── clip_processors.py │ │ ├── functional_video.py │ │ ├── gpt_processors.py │ │ ├── randaugment.py │ │ └── transforms_video.py │ ├── projects │ │ ├── albef │ │ │ ├── eval │ │ │ │ ├── nlvr_eval.yaml │ │ │ │ ├── ret_coco_eval.yaml │ │ │ │ ├── ret_flickr30k_eval.yaml │ │ │ │ ├── snli_ve_eval.yaml │ │ │ │ ├── vqa_test.yaml │ │ │ │ └── vqa_val.yaml │ │ │ └── train │ │ │ │ ├── aokvqa_ft.yaml │ │ │ │ ├── nlvr_ft.yaml │ │ │ │ ├── okvqa_ft.yaml │ │ │ │ ├── pretrain.yaml │ │ │ │ ├── ret_coco_ft.yaml │ │ │ │ ├── ret_flickr30k_ft.yaml │ │ │ │ ├── snli_ve_ft.yaml │ │ │ │ └── vqa_ft.yaml │ │ ├── alpro │ │ │ ├── eval │ │ │ │ ├── didemo_ret_eval.yaml │ │ │ │ ├── msrvtt_qa_eval.yaml │ │ │ │ ├── msrvtt_ret_eval.yaml │ │ │ │ └── msvd_qa_eval.yaml │ │ │ └── train │ │ │ │ ├── didemo_ret_ft.yaml │ │ │ │ ├── msrvtt_qa_ft.yaml │ │ │ │ ├── msrvtt_retrieval_ft.yaml │ │ │ │ └── msvd_qa_ft.yaml │ │ ├── blip │ │ │ ├── coco_cap_ft_iter.yaml │ │ │ ├── eval │ │ │ │ ├── aokvqa_eval.yaml │ │ │ │ ├── caption_coco_eval.yaml │ │ │ │ ├── caption_coco_eval_large.yaml │ │ │ │ ├── nlvr_eval.yaml │ │ │ │ ├── nocaps_eval.yaml │ │ │ │ ├── okvqa_eval.yaml │ │ │ │ ├── ret_coco_eval.yaml │ │ │ │ ├── ret_flickr_eval.yaml │ │ │ │ └── vqav2_eval.yaml │ │ │ └── train │ │ │ │ ├── aokvqa_ft.yaml │ │ │ │ ├── caption_coco_ft.yaml │ │ │ │ ├── caption_coco_large_ft.yaml │ │ │ │ ├── nlvr_ft.yaml │ │ │ │ ├── okvqa_ft.yaml │ │ │ │ ├── pretrain_14m.yaml │ │ │ │ ├── retrieval_coco_ft.yaml │ │ │ │ ├── retrieval_flickr_ft.yaml │ │ │ │ └── vqav2_ft.yaml │ │ ├── blip2 │ │ │ ├── eval │ │ │ │ ├── caption_coco_flant5xl_eval.yaml │ │ │ │ ├── caption_coco_opt2.7b_eval.yaml │ │ │ │ ├── caption_coco_opt6.7b_eval.yaml │ │ │ │ ├── gqa_zeroshot_flant5xl_eval.yaml │ │ │ │ ├── okvqa_zeroshot_flant5xl_eval.yaml │ │ │ │ ├── ret_coco_eval.yaml │ │ │ │ ├── ret_flickr_eval.yaml │ │ │ │ ├── vqav2_zeroshot_flant5xl_eval.yaml │ │ │ │ └── vqav2_zeroshot_opt_eval.yaml │ │ │ └── train │ │ │ │ ├── caption_coco_ft.yaml │ │ │ │ ├── pretrain_stage1.yaml │ │ │ │ ├── pretrain_stage2.yaml │ │ │ │ └── retrieval_coco_ft.yaml │ │ ├── blip_diffusion │ │ │ ├── finetune-db-dog.yaml │ │ │ ├── finetune-db-pink-dress.yaml │ │ │ ├── finetune-db-shein-jacket.yaml │ │ │ └── finetune-db-template.yaml │ │ ├── clip │ │ │ ├── exp_coco_ret_eval.yaml │ │ │ ├── exp_flickr_ret_eval.yaml │ │ │ └── exp_imnet_zs_eval.yaml │ │ ├── gpt │ │ │ ├── eval │ │ │ │ └── dialogue_avsd_eval.yaml │ │ │ └── train 
│ │ │ │ └── dialogue_avsd_ft.yaml │ │ └── pnp-vqa │ │ │ └── eval │ │ │ ├── gqa_eval.yaml │ │ │ ├── gqa_eval_3b.yaml │ │ │ ├── gqa_eval_large.yaml │ │ │ ├── okvqa_eval.yaml │ │ │ ├── okvqa_eval_3b.yaml │ │ │ ├── okvqa_eval_large.yaml │ │ │ ├── vqav2_eval.yaml │ │ │ ├── vqav2_eval_3b.yaml │ │ │ ├── vqav2_eval_large.yaml │ │ │ ├── vqav2_test_eval.yaml │ │ │ ├── vqav2_test_eval_3b.yaml │ │ │ └── vqav2_test_eval_large.yaml │ ├── runners │ │ ├── __init__.py │ │ ├── runner_base.py │ │ └── runner_iter.py │ └── tasks │ │ ├── __init__.py │ │ ├── base_task.py │ │ ├── captioning.py │ │ ├── dialogue.py │ │ ├── image_text_pretrain.py │ │ ├── multimodal_classification.py │ │ ├── retrieval.py │ │ ├── text_to_image_generation.py │ │ ├── vqa.py │ │ └── vqa_reading_comprehension.py └── llava │ ├── __init__.py │ ├── constants.py │ ├── conversation.py │ ├── mm_utils.py │ ├── model │ ├── __init__.py │ ├── builder.py │ ├── consolidate.py │ ├── language_model │ │ ├── llava_llama.py │ │ ├── llava_mpt.py │ │ └── mpt │ │ │ ├── adapt_tokenizer.py │ │ │ ├── attention.py │ │ │ ├── blocks.py │ │ │ ├── configuration_mpt.py │ │ │ ├── custom_embedding.py │ │ │ ├── flash_attn_triton.py │ │ │ ├── hf_prefixlm_converter.py │ │ │ ├── meta_init_context.py │ │ │ ├── modeling_mpt.py │ │ │ ├── norm.py │ │ │ └── param_init_fns.py │ ├── llava_arch.py │ ├── make_delta.py │ ├── multimodal_encoder │ │ ├── builder.py │ │ └── clip_encoder.py │ ├── multimodal_projector │ │ └── builder.py │ └── utils.py │ └── utils.py ├── requirements.txt └── utils ├── dist_util.py └── logger.py /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Sangmin Woo 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /assets/amber.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/amber.png -------------------------------------------------------------------------------- /assets/amber_discriminative.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/amber_discriminative.png -------------------------------------------------------------------------------- /assets/eyes_forest.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/eyes_forest.png -------------------------------------------------------------------------------- /assets/llava_bench.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/llava_bench.png -------------------------------------------------------------------------------- /assets/mme-fullset.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/mme-fullset.png -------------------------------------------------------------------------------- /assets/mme-hallucination.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/mme-hallucination.png -------------------------------------------------------------------------------- /assets/motivation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/motivation.png -------------------------------------------------------------------------------- /assets/observation.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/observation.png -------------------------------------------------------------------------------- /assets/overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/overview.png -------------------------------------------------------------------------------- /assets/pope.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/pope.png -------------------------------------------------------------------------------- /assets/qualitative_amber_instructblip.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/qualitative_amber_instructblip.png -------------------------------------------------------------------------------- /assets/qualitative_amber_instructblip2.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/qualitative_amber_instructblip2.png -------------------------------------------------------------------------------- /assets/qualitative_amber_llava.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/qualitative_amber_llava.png -------------------------------------------------------------------------------- /assets/qualitative_amber_llava2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/qualitative_amber_llava2.png -------------------------------------------------------------------------------- /assets/qualitative_mme2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/qualitative_mme2.png -------------------------------------------------------------------------------- /assets/qualitative_mme_instructblip.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/qualitative_mme_instructblip.png -------------------------------------------------------------------------------- /assets/qualitative_mme_llava.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/qualitative_mme_llava.png -------------------------------------------------------------------------------- /assets/qualitative_pope.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/qualitative_pope.png -------------------------------------------------------------------------------- /assets/qualitative_pope2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/assets/qualitative_pope2.png -------------------------------------------------------------------------------- /avisc_utils/vcd_add_noise.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | def add_diffusion_noise(image_tensor, noise_step): 4 | num_steps = 1000 # Number of diffusion steps 5 | 6 | # decide beta in each step 7 | betas = torch.linspace(-6,6,num_steps) 8 | betas = torch.sigmoid(betas) * (0.5e-2 - 1e-5) + 1e-5 9 | 10 | # decide alphas in each step 11 | alphas = 1 - betas 12 | alphas_prod = torch.cumprod(alphas, dim=0) 13 | alphas_prod_p = torch.cat([torch.tensor([1]).float(), alphas_prod[:-1]],0) # p for previous 14 | alphas_bar_sqrt = torch.sqrt(alphas_prod) 15 | one_minus_alphas_bar_log = torch.log(1 - alphas_prod) 16 | one_minus_alphas_bar_sqrt = torch.sqrt(1 - alphas_prod) 17 | 18 | def q_x(x_0,t): 19 | noise = torch.randn_like(x_0) 20 | alphas_t = alphas_bar_sqrt[t] 21 | alphas_1_m_t = one_minus_alphas_bar_sqrt[t] 22 | return (alphas_t*x_0 + alphas_1_m_t*noise) 23 | 24 | noise_delta = int(noise_step) # from 0-999 25 | 
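# The q_x helper defined above draws a closed-form forward-diffusion sample,
#   x_t = sqrt(alpha_bar_t) * x_0 + sqrt(1 - alpha_bar_t) * eps,  with eps ~ N(0, I),
# where alpha_bar_t = cumprod(1 - beta_t) under the sigmoid beta schedule built above.
# A larger noise_step therefore yields a more heavily corrupted image; the returned
# image_tensor_cd is presumably the distorted visual input consumed by the
# contrastive-decoding branch (cf. the use_cd / cd_alpha / cd_beta flags in the eval scripts).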
noisy_image = image_tensor.clone() 26 | image_tensor_cd = q_x(noisy_image,noise_step) 27 | 28 | return image_tensor_cd 29 | 30 | -------------------------------------------------------------------------------- /eval_bench/SimSun.ttf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/eval_bench/SimSun.ttf -------------------------------------------------------------------------------- /eval_bench/scripts/llava_bench_eval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | ## set below 5 | #################################################### 6 | seed=42 7 | model="llava" # llava | qwen-vl | instructblip 8 | use_avisc=false 9 | use_cd=False 10 | gpus=0 11 | max_token=64 12 | cd_alpha=2.5 13 | cd_beta=0.1 14 | model_path="/path/to/the/checkpoints/llava-v1.5-7b" 15 | pope_path="path/to/dataset/llava-bench-in-the-wild/questions.jsonl" 16 | data_path="path/to/dataset/llava-bench-in-the-wild/images" 17 | log_path="path/to//llava_bench/.json" 18 | conv="llava_v1" 19 | batch_size=1 20 | #################################################### 21 | 22 | export CUDA_VISIBLE_DEVICES=${gpus} 23 | python ./eval_bench/llava_bench_llava.py \ 24 | --seed ${seed} \ 25 | --model-path ${model_path} \ 26 | --question-file ${pope_path} \ 27 | --image-folder ${data_path} \ 28 | --answers-file ${log_path} \ 29 | --conv ${conv} \ 30 | --use_avisc ${use_avisc} \ 31 | --use_cd ${use_cd} \ 32 | --max_token ${max_token} \ 33 | --cd_alpha ${cd_alpha} \ 34 | --cd_beta ${cd_beta} \ 35 | 36 | -------------------------------------------------------------------------------- /experiments/AMBER/README_File/Paper-Arxiv-orange.svg: -------------------------------------------------------------------------------- 1 | Paper: ArxivPaperArxiv -------------------------------------------------------------------------------- /experiments/AMBER/README_File/comparison.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/experiments/AMBER/README_File/comparison.jpg -------------------------------------------------------------------------------- /experiments/AMBER/README_File/intro.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/experiments/AMBER/README_File/intro.jpg -------------------------------------------------------------------------------- /experiments/AMBER/README_File/result.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/experiments/AMBER/README_File/result.jpg -------------------------------------------------------------------------------- /experiments/AMBER/README_File/statistics.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/experiments/AMBER/README_File/statistics.jpg -------------------------------------------------------------------------------- /experiments/AMBER/data/metrics.txt: -------------------------------------------------------------------------------- 1 | chair_num=0.001 2 | chair_score=0 3 | safe_cover_num=0.001 4 | 
safe_cover_score=0
5 | hallu_cover_num=0.001
6 | hallu_cover_score=0
7 | non_hallu_score=0
8 | non_hallu_num=0.001
9 | qa_correct_score=0
10 | qa_correct_num=0.001
11 | qa_no_score=0
12 | qa_no_num=0.001
13 | qa_ans_no_score=0
14 | qa_ans_no_num=0.001
15 | as_qa_correct_score=0
16 | as_qa_correct_num=0.001
17 | as_qa_no_score=0
18 | as_qa_no_num=0.001
19 | as_qa_ans_no_score=0
20 | as_qa_ans_no_num=0.001
21 | an_qa_correct_score=0
22 | an_qa_correct_num=0.001
23 | an_qa_no_score=0
24 | an_qa_no_num=0.001
25 | an_qa_ans_no_score=0
26 | an_qa_ans_no_num=0.001
27 | aa_qa_correct_score=0
28 | aa_qa_correct_num=0.001
29 | aa_qa_no_score=0
30 | aa_qa_no_num=0.001
31 | aa_qa_ans_no_score=0
32 | aa_qa_ans_no_num=0.001
33 | asso_qa_correct_score=0
34 | asso_qa_correct_num=0.001
35 | asso_qa_no_score=0
36 | asso_qa_no_num=0.001
37 | asso_qa_ans_no_score=0
38 | asso_qa_ans_no_num=0.001
39 | ha_qa_correct_score=0
40 | ha_qa_correct_num=0.001
41 | ha_qa_no_score=0
42 | ha_qa_no_num=0.001
43 | ha_qa_ans_no_score=0
44 | ha_qa_ans_no_num=0.001
--------------------------------------------------------------------------------
/experiments/AMBER/data/safe_words.txt:
--------------------------------------------------------------------------------
1 | orange
2 | snack
3 | line
4 | camera
5 | light
6 | shoe
7 | sign
8 | range
9 | individual
--------------------------------------------------------------------------------
/experiments/eval/eval_mme/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/experiments/eval/eval_mme/.DS_Store
--------------------------------------------------------------------------------
/experiments/eval/eval_mme/readme.txt:
--------------------------------------------------------------------------------
1 | # This is an automated calculation script for the acc, acc+, and score.
2 | 
3 | # You can directly run "python3 calculation.py" to get the evaluation results of LaVIN.
4 | 
5 | # In order to get the statistical results of your model:
6 | 
7 | (1) Fill all the files in "Your_Results", adding your model's responses:
8 | Each file in "Your_Results" consists of:
9 | Image_Name + "\t" + Question + "\t" + Ground_Truth_Answer + "\n"
10 | 
11 | You need to add the responses of your model as:
12 | Image_Name + "\t" + Question + "\t" + Ground_Truth_Answer + "\t" + Your_Response + "\n"
13 | 
14 | Note: if your responses contain "\n", please delete it. For each question, your response can only be in one line, not across lines!
15 | 
16 | (2) run "python3 calculation.py --results_dir ./Your_Results"
17 | 
18 | 
19 | 
20 | 
--------------------------------------------------------------------------------
/experiments/lavis/__init__.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright (c) 2022, salesforce.com, inc.
3 | All rights reserved.
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import os 9 | import sys 10 | 11 | from omegaconf import OmegaConf 12 | 13 | from lavis.common.registry import registry 14 | 15 | from lavis.datasets.builders import * 16 | from lavis.models import * 17 | from lavis.processors import * 18 | from lavis.tasks import * 19 | 20 | 21 | root_dir = os.path.dirname(os.path.abspath(__file__)) 22 | default_cfg = OmegaConf.load(os.path.join(root_dir, "configs/default.yaml")) 23 | 24 | registry.register_path("library_root", root_dir) 25 | repo_root = os.path.join(root_dir, "..") 26 | registry.register_path("repo_root", repo_root) 27 | cache_root = os.path.join(repo_root, default_cfg.env.cache_root) 28 | registry.register_path("cache_root", cache_root) 29 | 30 | registry.register("MAX_INT", sys.maxsize) 31 | registry.register("SPLIT_NAMES", ["train", "val", "test"]) 32 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/canny/__init__.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | 3 | 4 | class CannyDetector: 5 | def __call__(self, img, low_threshold, high_threshold): 6 | return cv2.Canny(img, low_threshold, high_threshold) 7 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/ckpts/download.sh: -------------------------------------------------------------------------------- 1 | #! /bin/bash 2 | 3 | wget https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/dpt_hybrid-midas-501f0c75.pt 4 | wget https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/network-bsds500.pth 5 | 6 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/midas/midas/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/experiments/lavis/common/annotator/midas/midas/__init__.py -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/midas/midas/base_model.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | 4 | class BaseModel(torch.nn.Module): 5 | def load(self, path): 6 | """Load model from file. 
7 | 8 | Args: 9 | path (str): file path 10 | """ 11 | parameters = torch.load(path, map_location=torch.device('cpu')) 12 | 13 | if "optimizer" in parameters: 14 | parameters = parameters["model"] 15 | 16 | self.load_state_dict(parameters) 17 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | from annotator.uniformer.mmseg.apis import init_segmentor, inference_segmentor, show_result_pyplot 4 | from annotator.uniformer.mmseg.core.evaluation import get_palette 5 | from annotator.util import annotator_ckpts_path 6 | 7 | 8 | checkpoint_file = "https://huggingface.co/lllyasviel/ControlNet/resolve/main/annotator/ckpts/upernet_global_small.pth" 9 | 10 | 11 | class UniformerDetector: 12 | def __init__(self): 13 | modelpath = os.path.join(annotator_ckpts_path, "upernet_global_small.pth") 14 | if not os.path.exists(modelpath): 15 | from basicsr.utils.download_util import load_file_from_url 16 | load_file_from_url(checkpoint_file, model_dir=annotator_ckpts_path) 17 | config_file = os.path.join(os.path.dirname(annotator_ckpts_path), "uniformer", "exp", "upernet_global_small", "config.py") 18 | self.model = init_segmentor(config_file, modelpath).cuda() 19 | 20 | def __call__(self, img): 21 | result = inference_segmentor(self.model, img) 22 | res_img = show_result_pyplot(self.model, img, result, get_palette('ade'), opacity=1) 23 | return res_img 24 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/configs/_base_/datasets/pascal_voc12_aug.py: -------------------------------------------------------------------------------- 1 | _base_ = './pascal_voc12.py' 2 | # dataset settings 3 | data = dict( 4 | train=dict( 5 | ann_dir=['SegmentationClass', 'SegmentationClassAug'], 6 | split=[ 7 | 'ImageSets/Segmentation/train.txt', 8 | 'ImageSets/Segmentation/aug.txt' 9 | ])) 10 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/configs/_base_/default_runtime.py: -------------------------------------------------------------------------------- 1 | # yapf:disable 2 | log_config = dict( 3 | interval=50, 4 | hooks=[ 5 | dict(type='TextLoggerHook', by_epoch=False), 6 | # dict(type='TensorboardLoggerHook') 7 | ]) 8 | # yapf:enable 9 | dist_params = dict(backend='nccl') 10 | log_level = 'INFO' 11 | load_from = None 12 | resume_from = None 13 | workflow = [('train', 1)] 14 | cudnn_benchmark = True 15 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/configs/_base_/models/cgnet.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', eps=1e-03, requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | backbone=dict( 6 | type='CGNet', 7 | norm_cfg=norm_cfg, 8 | in_channels=3, 9 | num_channels=(32, 64, 128), 10 | num_blocks=(3, 21), 11 | dilations=(2, 4), 12 | reductions=(8, 16)), 13 | decode_head=dict( 14 | type='FCNHead', 15 | in_channels=256, 16 | in_index=2, 17 | channels=256, 18 | num_convs=0, 19 | concat_input=False, 20 | dropout_ratio=0, 21 | num_classes=19, 22 | norm_cfg=norm_cfg, 23 | loss_decode=dict( 24 | type='CrossEntropyLoss', 25 | use_sigmoid=False, 26 | loss_weight=1.0, 27 | class_weight=[ 28 | 
2.5959933, 6.7415504, 3.5354059, 9.8663225, 9.690899, 9.369352, 29 | 10.289121, 9.953208, 4.3097677, 9.490387, 7.674431, 9.396905, 30 | 10.347791, 6.3927646, 10.226669, 10.241062, 10.280587, 31 | 10.396974, 10.055647 32 | ])), 33 | # model training and testing settings 34 | train_cfg=dict(sampler=None), 35 | test_cfg=dict(mode='whole')) 36 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/configs/_base_/models/fpn_r50.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | pretrained='open-mmlab://resnet50_v1c', 6 | backbone=dict( 7 | type='ResNetV1c', 8 | depth=50, 9 | num_stages=4, 10 | out_indices=(0, 1, 2, 3), 11 | dilations=(1, 1, 1, 1), 12 | strides=(1, 2, 2, 2), 13 | norm_cfg=norm_cfg, 14 | norm_eval=False, 15 | style='pytorch', 16 | contract_dilation=True), 17 | neck=dict( 18 | type='FPN', 19 | in_channels=[256, 512, 1024, 2048], 20 | out_channels=256, 21 | num_outs=4), 22 | decode_head=dict( 23 | type='FPNHead', 24 | in_channels=[256, 256, 256, 256], 25 | in_index=[0, 1, 2, 3], 26 | feature_strides=[4, 8, 16, 32], 27 | channels=128, 28 | dropout_ratio=0.1, 29 | num_classes=19, 30 | norm_cfg=norm_cfg, 31 | align_corners=False, 32 | loss_decode=dict( 33 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 34 | # model training and testing settings 35 | train_cfg=dict(), 36 | test_cfg=dict(mode='whole')) 37 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/configs/_base_/models/fpn_uniformer.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | backbone=dict( 6 | type='UniFormer', 7 | embed_dim=[64, 128, 320, 512], 8 | layers=[3, 4, 8, 3], 9 | head_dim=64, 10 | mlp_ratio=4., 11 | qkv_bias=True, 12 | drop_rate=0., 13 | attn_drop_rate=0., 14 | drop_path_rate=0.1), 15 | neck=dict( 16 | type='FPN', 17 | in_channels=[64, 128, 320, 512], 18 | out_channels=256, 19 | num_outs=4), 20 | decode_head=dict( 21 | type='FPNHead', 22 | in_channels=[256, 256, 256, 256], 23 | in_index=[0, 1, 2, 3], 24 | feature_strides=[4, 8, 16, 32], 25 | channels=128, 26 | dropout_ratio=0.1, 27 | num_classes=150, 28 | norm_cfg=norm_cfg, 29 | align_corners=False, 30 | loss_decode=dict( 31 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 32 | # model training and testing settings 33 | train_cfg=dict(), 34 | test_cfg=dict(mode='whole') 35 | ) 36 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/configs/_base_/models/lraspp_m-v3-d8.py: -------------------------------------------------------------------------------- 1 | # model settings 2 | norm_cfg = dict(type='SyncBN', eps=0.001, requires_grad=True) 3 | model = dict( 4 | type='EncoderDecoder', 5 | backbone=dict( 6 | type='MobileNetV3', 7 | arch='large', 8 | out_indices=(1, 3, 16), 9 | norm_cfg=norm_cfg), 10 | decode_head=dict( 11 | type='LRASPPHead', 12 | in_channels=(16, 24, 960), 13 | in_index=(0, 1, 2), 14 | channels=128, 15 | input_transform='multiple_select', 16 | dropout_ratio=0.1, 17 | num_classes=19, 18 | norm_cfg=norm_cfg, 19 | act_cfg=dict(type='ReLU'), 20 | align_corners=False, 21 | 
loss_decode=dict( 22 | type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), 23 | # model training and testing settings 24 | train_cfg=dict(), 25 | test_cfg=dict(mode='whole')) 26 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/configs/_base_/schedules/schedule_160k.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) 3 | optimizer_config = dict() 4 | # learning policy 5 | lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) 6 | # runtime settings 7 | runner = dict(type='IterBasedRunner', max_iters=160000) 8 | checkpoint_config = dict(by_epoch=False, interval=16000) 9 | evaluation = dict(interval=16000, metric='mIoU') 10 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/configs/_base_/schedules/schedule_20k.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) 3 | optimizer_config = dict() 4 | # learning policy 5 | lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) 6 | # runtime settings 7 | runner = dict(type='IterBasedRunner', max_iters=20000) 8 | checkpoint_config = dict(by_epoch=False, interval=2000) 9 | evaluation = dict(interval=2000, metric='mIoU') 10 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/configs/_base_/schedules/schedule_40k.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) 3 | optimizer_config = dict() 4 | # learning policy 5 | lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) 6 | # runtime settings 7 | runner = dict(type='IterBasedRunner', max_iters=40000) 8 | checkpoint_config = dict(by_epoch=False, interval=4000) 9 | evaluation = dict(interval=4000, metric='mIoU') 10 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/configs/_base_/schedules/schedule_80k.py: -------------------------------------------------------------------------------- 1 | # optimizer 2 | optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) 3 | optimizer_config = dict() 4 | # learning policy 5 | lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=False) 6 | # runtime settings 7 | runner = dict(type='IterBasedRunner', max_iters=80000) 8 | checkpoint_config = dict(by_epoch=False, interval=8000) 9 | evaluation = dict(interval=8000, metric='mIoU') 10 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/exp/upernet_global_small/run.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | work_path=$(dirname $0) 4 | PYTHONPATH="$(dirname $0)/../../":$PYTHONPATH \ 5 | python -m torch.distributed.launch --nproc_per_node=8 \ 6 | tools/train.py ${work_path}/config.py \ 7 | --launcher pytorch \ 8 | --options model.backbone.pretrained_path='your_model_path/uniformer_small_in1k.pth' \ 9 | --work-dir ${work_path}/ckpt \ 10 | 2>&1 | tee -a ${work_path}/log.txt 11 | 
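The schedule_*.py configs above differ only in max_iters and the checkpoint/evaluation interval; all of them pair SGD (lr=0.01, momentum=0.9, weight_decay=0.0005) with a 'poly' learning-rate policy (power=0.9, min_lr=1e-4, by_epoch=False). As a rough illustration of what that policy does, here is a minimal standalone sketch of the usual poly decay rule; the exact mmcv LR hook may differ in details such as warmup handling, so treat the formula below as an assumption rather than a reference implementation.

# Minimal sketch of the 'poly' LR decay referenced by the schedule_*.py configs above.
# Assumed rule: lr = (base_lr - min_lr) * (1 - iter / max_iters) ** power + min_lr.

def poly_lr(cur_iter: int, max_iters: int, base_lr: float = 0.01,
            power: float = 0.9, min_lr: float = 1e-4) -> float:
    coeff = (1.0 - cur_iter / max_iters) ** power
    return (base_lr - min_lr) * coeff + min_lr

if __name__ == "__main__":
    # e.g. schedule_160k.py uses IterBasedRunner with max_iters=160000
    for it in (0, 40_000, 80_000, 120_000, 160_000):
        print(f"iter {it:>6}: lr = {poly_lr(it, 160_000):.6f}")

With max_iters=160000 the rate falls smoothly from 0.01 and hits the 1e-4 floor exactly at the final iteration; the larger schedules simply stretch the same curve over more iterations.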
-------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/exp/upernet_global_small/test.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | work_path=$(dirname $0) 4 | PYTHONPATH="$(dirname $0)/../../":$PYTHONPATH \ 5 | python -m torch.distributed.launch --nproc_per_node=8 \ 6 | tools/test.py ${work_path}/test_config_h32.py \ 7 | ${work_path}/ckpt/latest.pth \ 8 | --launcher pytorch \ 9 | --eval mIoU \ 10 | 2>&1 | tee -a ${work_path}/log.txt 11 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | # flake8: noqa 3 | from .arraymisc import * 4 | from .fileio import * 5 | from .image import * 6 | from .utils import * 7 | from .version import * 8 | from .video import * 9 | from .visualization import * 10 | 11 | # The following modules are not imported to this level, so mmcv may be used 12 | # without PyTorch. 13 | # - runner 14 | # - parallel 15 | # - op 16 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/arraymisc/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .quantization import dequantize, quantize 3 | 4 | __all__ = ['quantize', 'dequantize'] 5 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/cnn/bricks/hsigmoid.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import torch.nn as nn 3 | 4 | from .registry import ACTIVATION_LAYERS 5 | 6 | 7 | @ACTIVATION_LAYERS.register_module() 8 | class HSigmoid(nn.Module): 9 | """Hard Sigmoid Module. Apply the hard sigmoid function: 10 | Hsigmoid(x) = min(max((x + bias) / divisor, min_value), max_value) 11 | Default: Hsigmoid(x) = min(max((x + 1) / 2, 0), 1) 12 | 13 | Args: 14 | bias (float): Bias of the input feature map. Default: 1.0. 15 | divisor (float): Divisor of the input feature map. Default: 2.0. 16 | min_value (float): Lower bound value. Default: 0.0. 17 | max_value (float): Upper bound value. Default: 1.0. 18 | 19 | Returns: 20 | Tensor: The output tensor. 21 | """ 22 | 23 | def __init__(self, bias=1.0, divisor=2.0, min_value=0.0, max_value=1.0): 24 | super(HSigmoid, self).__init__() 25 | self.bias = bias 26 | self.divisor = divisor 27 | assert self.divisor != 0 28 | self.min_value = min_value 29 | self.max_value = max_value 30 | 31 | def forward(self, x): 32 | x = (x + self.bias) / self.divisor 33 | 34 | return x.clamp_(self.min_value, self.max_value) 35 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/cnn/bricks/hswish.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import torch.nn as nn 3 | 4 | from .registry import ACTIVATION_LAYERS 5 | 6 | 7 | @ACTIVATION_LAYERS.register_module() 8 | class HSwish(nn.Module): 9 | """Hard Swish Module. 10 | 11 | This module applies the hard swish function: 12 | 13 | .. 
math:: 14 | Hswish(x) = x * ReLU6(x + 3) / 6 15 | 16 | Args: 17 | inplace (bool): can optionally do the operation in-place. 18 | Default: False. 19 | 20 | Returns: 21 | Tensor: The output tensor. 22 | """ 23 | 24 | def __init__(self, inplace=False): 25 | super(HSwish, self).__init__() 26 | self.act = nn.ReLU6(inplace) 27 | 28 | def forward(self, x): 29 | return x * self.act(x + 3) / 6 30 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/cnn/bricks/padding.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import torch.nn as nn 3 | 4 | from .registry import PADDING_LAYERS 5 | 6 | PADDING_LAYERS.register_module('zero', module=nn.ZeroPad2d) 7 | PADDING_LAYERS.register_module('reflect', module=nn.ReflectionPad2d) 8 | PADDING_LAYERS.register_module('replicate', module=nn.ReplicationPad2d) 9 | 10 | 11 | def build_padding_layer(cfg, *args, **kwargs): 12 | """Build padding layer. 13 | 14 | Args: 15 | cfg (None or dict): The padding layer config, which should contain: 16 | - type (str): Layer type. 17 | - layer args: Args needed to instantiate a padding layer. 18 | 19 | Returns: 20 | nn.Module: Created padding layer. 21 | """ 22 | if not isinstance(cfg, dict): 23 | raise TypeError('cfg must be a dict') 24 | if 'type' not in cfg: 25 | raise KeyError('the cfg dict must contain the key "type"') 26 | 27 | cfg_ = cfg.copy() 28 | padding_type = cfg_.pop('type') 29 | if padding_type not in PADDING_LAYERS: 30 | raise KeyError(f'Unrecognized padding type {padding_type}.') 31 | else: 32 | padding_layer = PADDING_LAYERS.get(padding_type) 33 | 34 | layer = padding_layer(*args, **kwargs, **cfg_) 35 | 36 | return layer 37 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/cnn/bricks/registry.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from annotator.uniformer.mmcv.utils import Registry 3 | 4 | CONV_LAYERS = Registry('conv layer') 5 | NORM_LAYERS = Registry('norm layer') 6 | ACTIVATION_LAYERS = Registry('activation layer') 7 | PADDING_LAYERS = Registry('padding layer') 8 | UPSAMPLE_LAYERS = Registry('upsample layer') 9 | PLUGIN_LAYERS = Registry('plugin layer') 10 | 11 | DROPOUT_LAYERS = Registry('drop out layers') 12 | POSITIONAL_ENCODING = Registry('position encoding') 13 | ATTENTION = Registry('attention') 14 | FEEDFORWARD_NETWORK = Registry('feed-forward Network') 15 | TRANSFORMER_LAYER = Registry('transformerLayer') 16 | TRANSFORMER_LAYER_SEQUENCE = Registry('transformer-layers sequence') 17 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/cnn/bricks/scale.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import torch 3 | import torch.nn as nn 4 | 5 | 6 | class Scale(nn.Module): 7 | """A learnable scale parameter. 8 | 9 | This layer scales the input by a learnable factor. It multiplies a 10 | learnable scale parameter of shape (1,) with input of any shape. 11 | 12 | Args: 13 | scale (float): Initial value of scale factor. 
Default: 1.0 14 | """ 15 | 16 | def __init__(self, scale=1.0): 17 | super(Scale, self).__init__() 18 | self.scale = nn.Parameter(torch.tensor(scale, dtype=torch.float)) 19 | 20 | def forward(self, x): 21 | return x * self.scale 22 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/cnn/bricks/swish.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import torch 3 | import torch.nn as nn 4 | 5 | from .registry import ACTIVATION_LAYERS 6 | 7 | 8 | @ACTIVATION_LAYERS.register_module() 9 | class Swish(nn.Module): 10 | """Swish Module. 11 | 12 | This module applies the swish function: 13 | 14 | .. math:: 15 | Swish(x) = x * Sigmoid(x) 16 | 17 | Returns: 18 | Tensor: The output tensor. 19 | """ 20 | 21 | def __init__(self): 22 | super(Swish, self).__init__() 23 | 24 | def forward(self, x): 25 | return x * torch.sigmoid(x) 26 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/cnn/builder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from ..runner import Sequential 3 | from ..utils import Registry, build_from_cfg 4 | 5 | 6 | def build_model_from_cfg(cfg, registry, default_args=None): 7 | """Build a PyTorch model from config dict(s). Different from 8 | ``build_from_cfg``, if cfg is a list, a ``nn.Sequential`` will be built. 9 | 10 | Args: 11 | cfg (dict, list[dict]): The config of modules, is is either a config 12 | dict or a list of config dicts. If cfg is a list, a 13 | the built modules will be wrapped with ``nn.Sequential``. 14 | registry (:obj:`Registry`): A registry the module belongs to. 15 | default_args (dict, optional): Default arguments to build the module. 16 | Defaults to None. 17 | 18 | Returns: 19 | nn.Module: A built nn module. 20 | """ 21 | if isinstance(cfg, list): 22 | modules = [ 23 | build_from_cfg(cfg_, registry, default_args) for cfg_ in cfg 24 | ] 25 | return Sequential(*modules) 26 | else: 27 | return build_from_cfg(cfg, registry, default_args) 28 | 29 | 30 | MODELS = Registry('model', build_func=build_model_from_cfg) 31 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/cnn/utils/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | from .flops_counter import get_model_complexity_info 3 | from .fuse_conv_bn import fuse_conv_bn 4 | from .sync_bn import revert_sync_batchnorm 5 | from .weight_init import (INITIALIZERS, Caffe2XavierInit, ConstantInit, 6 | KaimingInit, NormalInit, PretrainedInit, 7 | TruncNormalInit, UniformInit, XavierInit, 8 | bias_init_with_prob, caffe2_xavier_init, 9 | constant_init, initialize, kaiming_init, normal_init, 10 | trunc_normal_init, uniform_init, xavier_init) 11 | 12 | __all__ = [ 13 | 'get_model_complexity_info', 'bias_init_with_prob', 'caffe2_xavier_init', 14 | 'constant_init', 'kaiming_init', 'normal_init', 'trunc_normal_init', 15 | 'uniform_init', 'xavier_init', 'fuse_conv_bn', 'initialize', 16 | 'INITIALIZERS', 'ConstantInit', 'XavierInit', 'NormalInit', 17 | 'TruncNormalInit', 'UniformInit', 'KaimingInit', 'PretrainedInit', 18 | 'Caffe2XavierInit', 'revert_sync_batchnorm' 19 | ] 20 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/engine/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .test import (collect_results_cpu, collect_results_gpu, multi_gpu_test, 3 | single_gpu_test) 4 | 5 | __all__ = [ 6 | 'collect_results_cpu', 'collect_results_gpu', 'multi_gpu_test', 7 | 'single_gpu_test' 8 | ] 9 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/fileio/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .file_client import BaseStorageBackend, FileClient 3 | from .handlers import BaseFileHandler, JsonHandler, PickleHandler, YamlHandler 4 | from .io import dump, load, register_handler 5 | from .parse import dict_from_file, list_from_file 6 | 7 | __all__ = [ 8 | 'BaseStorageBackend', 'FileClient', 'load', 'dump', 'register_handler', 9 | 'BaseFileHandler', 'JsonHandler', 'PickleHandler', 'YamlHandler', 10 | 'list_from_file', 'dict_from_file' 11 | ] 12 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/fileio/handlers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .base import BaseFileHandler 3 | from .json_handler import JsonHandler 4 | from .pickle_handler import PickleHandler 5 | from .yaml_handler import YamlHandler 6 | 7 | __all__ = ['BaseFileHandler', 'JsonHandler', 'PickleHandler', 'YamlHandler'] 8 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/fileio/handlers/base.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from abc import ABCMeta, abstractmethod 3 | 4 | 5 | class BaseFileHandler(metaclass=ABCMeta): 6 | # `str_like` is a flag to indicate whether the type of file object is 7 | # str-like object or bytes-like object. Pickle only processes bytes-like 8 | # objects but json only processes str-like object. If it is str-like 9 | # object, `StringIO` will be used to process the buffer. 
10 | str_like = True 11 | 12 | @abstractmethod 13 | def load_from_fileobj(self, file, **kwargs): 14 | pass 15 | 16 | @abstractmethod 17 | def dump_to_fileobj(self, obj, file, **kwargs): 18 | pass 19 | 20 | @abstractmethod 21 | def dump_to_str(self, obj, **kwargs): 22 | pass 23 | 24 | def load_from_path(self, filepath, mode='r', **kwargs): 25 | with open(filepath, mode) as f: 26 | return self.load_from_fileobj(f, **kwargs) 27 | 28 | def dump_to_path(self, obj, filepath, mode='w', **kwargs): 29 | with open(filepath, mode) as f: 30 | self.dump_to_fileobj(obj, f, **kwargs) 31 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/fileio/handlers/json_handler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import json 3 | 4 | import numpy as np 5 | 6 | from .base import BaseFileHandler 7 | 8 | 9 | def set_default(obj): 10 | """Set default json values for non-serializable values. 11 | 12 | It helps convert ``set``, ``range`` and ``np.ndarray`` data types to list. 13 | It also converts ``np.generic`` (including ``np.int32``, ``np.float32``, 14 | etc.) into plain numbers of plain python built-in types. 15 | """ 16 | if isinstance(obj, (set, range)): 17 | return list(obj) 18 | elif isinstance(obj, np.ndarray): 19 | return obj.tolist() 20 | elif isinstance(obj, np.generic): 21 | return obj.item() 22 | raise TypeError(f'{type(obj)} is unsupported for json dump') 23 | 24 | 25 | class JsonHandler(BaseFileHandler): 26 | 27 | def load_from_fileobj(self, file): 28 | return json.load(file) 29 | 30 | def dump_to_fileobj(self, obj, file, **kwargs): 31 | kwargs.setdefault('default', set_default) 32 | json.dump(obj, file, **kwargs) 33 | 34 | def dump_to_str(self, obj, **kwargs): 35 | kwargs.setdefault('default', set_default) 36 | return json.dumps(obj, **kwargs) 37 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/fileio/handlers/pickle_handler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import pickle 3 | 4 | from .base import BaseFileHandler 5 | 6 | 7 | class PickleHandler(BaseFileHandler): 8 | 9 | str_like = False 10 | 11 | def load_from_fileobj(self, file, **kwargs): 12 | return pickle.load(file, **kwargs) 13 | 14 | def load_from_path(self, filepath, **kwargs): 15 | return super(PickleHandler, self).load_from_path( 16 | filepath, mode='rb', **kwargs) 17 | 18 | def dump_to_str(self, obj, **kwargs): 19 | kwargs.setdefault('protocol', 2) 20 | return pickle.dumps(obj, **kwargs) 21 | 22 | def dump_to_fileobj(self, obj, file, **kwargs): 23 | kwargs.setdefault('protocol', 2) 24 | pickle.dump(obj, file, **kwargs) 25 | 26 | def dump_to_path(self, obj, filepath, **kwargs): 27 | super(PickleHandler, self).dump_to_path( 28 | obj, filepath, mode='wb', **kwargs) 29 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/fileio/handlers/yaml_handler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | import yaml 3 | 4 | try: 5 | from yaml import CLoader as Loader, CDumper as Dumper 6 | except ImportError: 7 | from yaml import Loader, Dumper 8 | 9 | from .base import BaseFileHandler # isort:skip 10 | 11 | 12 | class YamlHandler(BaseFileHandler): 13 | 14 | def load_from_fileobj(self, file, **kwargs): 15 | kwargs.setdefault('Loader', Loader) 16 | return yaml.load(file, **kwargs) 17 | 18 | def dump_to_fileobj(self, obj, file, **kwargs): 19 | kwargs.setdefault('Dumper', Dumper) 20 | yaml.dump(obj, file, **kwargs) 21 | 22 | def dump_to_str(self, obj, **kwargs): 23 | kwargs.setdefault('Dumper', Dumper) 24 | return yaml.dump(obj, **kwargs) 25 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/model_zoo/deprecated.json: -------------------------------------------------------------------------------- 1 | { 2 | "resnet50_caffe": "detectron/resnet50_caffe", 3 | "resnet50_caffe_bgr": "detectron2/resnet50_caffe_bgr", 4 | "resnet101_caffe": "detectron/resnet101_caffe", 5 | "resnet101_caffe_bgr": "detectron2/resnet101_caffe_bgr" 6 | } 7 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/ops/info.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import glob 3 | import os 4 | 5 | import torch 6 | 7 | if torch.__version__ == 'parrots': 8 | import parrots 9 | 10 | def get_compiler_version(): 11 | return 'GCC ' + parrots.version.compiler 12 | 13 | def get_compiling_cuda_version(): 14 | return parrots.version.cuda 15 | else: 16 | from ..utils import ext_loader 17 | ext_module = ext_loader.load_ext( 18 | '_ext', ['get_compiler_version', 'get_compiling_cuda_version']) 19 | 20 | def get_compiler_version(): 21 | return ext_module.get_compiler_version() 22 | 23 | def get_compiling_cuda_version(): 24 | return ext_module.get_compiling_cuda_version() 25 | 26 | 27 | def get_onnxruntime_op_path(): 28 | wildcard = os.path.join( 29 | os.path.abspath(os.path.dirname(os.path.dirname(__file__))), 30 | '_ext_ort.*.so') 31 | 32 | paths = glob.glob(wildcard) 33 | if len(paths) > 0: 34 | return paths[0] 35 | else: 36 | return '' 37 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/parallel/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .collate import collate 3 | from .data_container import DataContainer 4 | from .data_parallel import MMDataParallel 5 | from .distributed import MMDistributedDataParallel 6 | from .registry import MODULE_WRAPPERS 7 | from .scatter_gather import scatter, scatter_kwargs 8 | from .utils import is_module_wrapper 9 | 10 | __all__ = [ 11 | 'collate', 'DataContainer', 'MMDataParallel', 'MMDistributedDataParallel', 12 | 'scatter', 'scatter_kwargs', 'is_module_wrapper', 'MODULE_WRAPPERS' 13 | ] 14 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/parallel/registry.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | from torch.nn.parallel import DataParallel, DistributedDataParallel 3 | 4 | from annotator.uniformer.mmcv.utils import Registry 5 | 6 | MODULE_WRAPPERS = Registry('module wrapper') 7 | MODULE_WRAPPERS.register_module(module=DataParallel) 8 | MODULE_WRAPPERS.register_module(module=DistributedDataParallel) 9 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/parallel/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .registry import MODULE_WRAPPERS 3 | 4 | 5 | def is_module_wrapper(module): 6 | """Check if a module is a module wrapper. 7 | 8 | The following 3 modules in MMCV (and their subclasses) are regarded as 9 | module wrappers: DataParallel, DistributedDataParallel, 10 | MMDistributedDataParallel (the deprecated version). You may add you own 11 | module wrapper by registering it to mmcv.parallel.MODULE_WRAPPERS. 12 | 13 | Args: 14 | module (nn.Module): The module to be checked. 15 | 16 | Returns: 17 | bool: True if the input module is a module wrapper. 18 | """ 19 | module_wrappers = tuple(MODULE_WRAPPERS.module_dict.values()) 20 | return isinstance(module, module_wrappers) 21 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/runner/builder.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import copy 3 | 4 | from ..utils import Registry 5 | 6 | RUNNERS = Registry('runner') 7 | RUNNER_BUILDERS = Registry('runner builder') 8 | 9 | 10 | def build_runner_constructor(cfg): 11 | return RUNNER_BUILDERS.build(cfg) 12 | 13 | 14 | def build_runner(cfg, default_args=None): 15 | runner_cfg = copy.deepcopy(cfg) 16 | constructor_type = runner_cfg.pop('constructor', 17 | 'DefaultRunnerConstructor') 18 | runner_constructor = build_runner_constructor( 19 | dict( 20 | type=constructor_type, 21 | runner_cfg=runner_cfg, 22 | default_args=default_args)) 23 | runner = runner_constructor() 24 | return runner 25 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/runner/hooks/closure.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .hook import HOOKS, Hook 3 | 4 | 5 | @HOOKS.register_module() 6 | class ClosureHook(Hook): 7 | 8 | def __init__(self, fn_name, fn): 9 | assert hasattr(self, fn_name) 10 | assert callable(fn) 11 | setattr(self, fn_name, fn) 12 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/runner/hooks/iter_timer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | import time 3 | 4 | from .hook import HOOKS, Hook 5 | 6 | 7 | @HOOKS.register_module() 8 | class IterTimerHook(Hook): 9 | 10 | def before_epoch(self, runner): 11 | self.t = time.time() 12 | 13 | def before_iter(self, runner): 14 | runner.log_buffer.update({'data_time': time.time() - self.t}) 15 | 16 | def after_iter(self, runner): 17 | runner.log_buffer.update({'time': time.time() - self.t}) 18 | self.t = time.time() 19 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/runner/hooks/logger/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .base import LoggerHook 3 | from .dvclive import DvcliveLoggerHook 4 | from .mlflow import MlflowLoggerHook 5 | from .neptune import NeptuneLoggerHook 6 | from .pavi import PaviLoggerHook 7 | from .tensorboard import TensorboardLoggerHook 8 | from .text import TextLoggerHook 9 | from .wandb import WandbLoggerHook 10 | 11 | __all__ = [ 12 | 'LoggerHook', 'MlflowLoggerHook', 'PaviLoggerHook', 13 | 'TensorboardLoggerHook', 'TextLoggerHook', 'WandbLoggerHook', 14 | 'NeptuneLoggerHook', 'DvcliveLoggerHook' 15 | ] 16 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/runner/hooks/memory.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import torch 3 | 4 | from .hook import HOOKS, Hook 5 | 6 | 7 | @HOOKS.register_module() 8 | class EmptyCacheHook(Hook): 9 | 10 | def __init__(self, before_epoch=False, after_epoch=True, after_iter=False): 11 | self._before_epoch = before_epoch 12 | self._after_epoch = after_epoch 13 | self._after_iter = after_iter 14 | 15 | def after_iter(self, runner): 16 | if self._after_iter: 17 | torch.cuda.empty_cache() 18 | 19 | def before_epoch(self, runner): 20 | if self._before_epoch: 21 | torch.cuda.empty_cache() 22 | 23 | def after_epoch(self, runner): 24 | if self._after_epoch: 25 | torch.cuda.empty_cache() 26 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/runner/hooks/sampler_seed.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .hook import HOOKS, Hook 3 | 4 | 5 | @HOOKS.register_module() 6 | class DistSamplerSeedHook(Hook): 7 | """Data-loading sampler for distributed training. 8 | 9 | When distributed training, it is only useful in conjunction with 10 | :obj:`EpochBasedRunner`, while :obj:`IterBasedRunner` achieves the same 11 | purpose with :obj:`IterLoader`. 12 | """ 13 | 14 | def before_epoch(self, runner): 15 | if hasattr(runner.data_loader.sampler, 'set_epoch'): 16 | # in case the data loader uses `SequentialSampler` in Pytorch 17 | runner.data_loader.sampler.set_epoch(runner.epoch) 18 | elif hasattr(runner.data_loader.batch_sampler.sampler, 'set_epoch'): 19 | # batch sampler in pytorch warps the sampler as its attributes. 20 | runner.data_loader.batch_sampler.sampler.set_epoch(runner.epoch) 21 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/runner/hooks/sync_buffer.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 
2 | from ..dist_utils import allreduce_params 3 | from .hook import HOOKS, Hook 4 | 5 | 6 | @HOOKS.register_module() 7 | class SyncBuffersHook(Hook): 8 | """Synchronize model buffers such as running_mean and running_var in BN at 9 | the end of each epoch. 10 | 11 | Args: 12 | distributed (bool): Whether distributed training is used. It is 13 | effective only for distributed training. Defaults to True. 14 | """ 15 | 16 | def __init__(self, distributed=True): 17 | self.distributed = distributed 18 | 19 | def after_epoch(self, runner): 20 | """All-reduce model buffers at the end of each epoch.""" 21 | if self.distributed: 22 | allreduce_params(runner.model.buffers()) 23 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/runner/optimizer/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .builder import (OPTIMIZER_BUILDERS, OPTIMIZERS, build_optimizer, 3 | build_optimizer_constructor) 4 | from .default_constructor import DefaultOptimizerConstructor 5 | 6 | __all__ = [ 7 | 'OPTIMIZER_BUILDERS', 'OPTIMIZERS', 'DefaultOptimizerConstructor', 8 | 'build_optimizer', 'build_optimizer_constructor' 9 | ] 10 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/utils/parrots_jit.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | import os 3 | 4 | from .parrots_wrapper import TORCH_VERSION 5 | 6 | parrots_jit_option = os.getenv('PARROTS_JIT_OPTION') 7 | 8 | if TORCH_VERSION == 'parrots' and parrots_jit_option == 'ON': 9 | from parrots.jit import pat as jit 10 | else: 11 | 12 | def jit(func=None, 13 | check_input=None, 14 | full_shape=True, 15 | derivate=False, 16 | coderize=False, 17 | optimize=False): 18 | 19 | def wrapper(func): 20 | 21 | def wrapper_inner(*args, **kargs): 22 | return func(*args, **kargs) 23 | 24 | return wrapper_inner 25 | 26 | if func is None: 27 | return wrapper 28 | else: 29 | return func 30 | 31 | 32 | if TORCH_VERSION == 'parrots': 33 | from parrots.utils.tester import skip_no_elena 34 | else: 35 | 36 | def skip_no_elena(func): 37 | 38 | def wrapper(*args, **kargs): 39 | return func(*args, **kargs) 40 | 41 | return wrapper 42 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/utils/trace.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | 3 | import torch 4 | 5 | from annotator.uniformer.mmcv.utils import digit_version 6 | 7 | 8 | def is_jit_tracing() -> bool: 9 | if (torch.__version__ != 'parrots' 10 | and digit_version(torch.__version__) >= digit_version('1.6.0')): 11 | on_trace = torch.jit.is_tracing() 12 | # In PyTorch 1.6, torch.jit.is_tracing has a bug. 13 | # Refers to https://github.com/pytorch/pytorch/issues/42448 14 | if isinstance(on_trace, bool): 15 | return on_trace 16 | else: 17 | return torch._C._is_tracing() 18 | else: 19 | warnings.warn( 20 | 'torch.jit.is_tracing is only supported after v1.6.0. ' 21 | 'Therefore is_tracing returns False automatically. 
Please ' 22 | 'set on_trace manually if you are using trace.', UserWarning) 23 | return False 24 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/video/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .io import Cache, VideoReader, frames2video 3 | from .optflow import (dequantize_flow, flow_from_bytes, flow_warp, flowread, 4 | flowwrite, quantize_flow, sparse_flow_from_bytes) 5 | from .processing import concat_video, convert_video, cut_video, resize_video 6 | 7 | __all__ = [ 8 | 'Cache', 'VideoReader', 'frames2video', 'convert_video', 'resize_video', 9 | 'cut_video', 'concat_video', 'flowread', 'flowwrite', 'quantize_flow', 10 | 'dequantize_flow', 'flow_warp', 'flow_from_bytes', 'sparse_flow_from_bytes' 11 | ] 12 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv/visualization/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) OpenMMLab. All rights reserved. 2 | from .color import Color, color_val 3 | from .image import imshow, imshow_bboxes, imshow_det_bboxes 4 | from .optflow import flow2rgb, flowshow, make_color_wheel 5 | 6 | __all__ = [ 7 | 'Color', 'color_val', 'imshow', 'imshow_bboxes', 'imshow_det_bboxes', 8 | 'flowshow', 'flow2rgb', 'make_color_wheel' 9 | ] 10 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmcv_custom/__init__.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from .checkpoint import load_checkpoint 4 | 5 | __all__ = ['load_checkpoint'] -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmseg/apis/__init__.py: -------------------------------------------------------------------------------- 1 | from .inference import inference_segmentor, init_segmentor, show_result_pyplot 2 | from .test import multi_gpu_test, single_gpu_test 3 | from .train import get_root_logger, set_random_seed, train_segmentor 4 | 5 | __all__ = [ 6 | 'get_root_logger', 'set_random_seed', 'train_segmentor', 'init_segmentor', 7 | 'inference_segmentor', 'multi_gpu_test', 'single_gpu_test', 8 | 'show_result_pyplot' 9 | ] 10 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmseg/core/__init__.py: -------------------------------------------------------------------------------- 1 | from .evaluation import * # noqa: F401, F403 2 | from .seg import * # noqa: F401, F403 3 | from .utils import * # noqa: F401, F403 4 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmseg/core/evaluation/__init__.py: -------------------------------------------------------------------------------- 1 | from .class_names import get_classes, get_palette 2 | from .eval_hooks import DistEvalHook, EvalHook 3 | from .metrics import eval_metrics, mean_dice, mean_fscore, mean_iou 4 | 5 | __all__ = [ 6 | 'EvalHook', 'DistEvalHook', 'mean_dice', 'mean_iou', 'mean_fscore', 7 | 'eval_metrics', 'get_classes', 'get_palette' 8 | ] 9 | -------------------------------------------------------------------------------- 
/experiments/lavis/common/annotator/uniformer/mmseg/core/seg/__init__.py: -------------------------------------------------------------------------------- 1 | from .builder import build_pixel_sampler 2 | from .sampler import BasePixelSampler, OHEMPixelSampler 3 | 4 | __all__ = ['build_pixel_sampler', 'BasePixelSampler', 'OHEMPixelSampler'] 5 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmseg/core/seg/builder.py: -------------------------------------------------------------------------------- 1 | from annotator.uniformer.mmcv.utils import Registry, build_from_cfg 2 | 3 | PIXEL_SAMPLERS = Registry('pixel sampler') 4 | 5 | 6 | def build_pixel_sampler(cfg, **default_args): 7 | """Build pixel sampler for segmentation map.""" 8 | return build_from_cfg(cfg, PIXEL_SAMPLERS, default_args) 9 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmseg/core/seg/sampler/__init__.py: -------------------------------------------------------------------------------- 1 | from .base_pixel_sampler import BasePixelSampler 2 | from .ohem_pixel_sampler import OHEMPixelSampler 3 | 4 | __all__ = ['BasePixelSampler', 'OHEMPixelSampler'] 5 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmseg/core/seg/sampler/base_pixel_sampler.py: -------------------------------------------------------------------------------- 1 | from abc import ABCMeta, abstractmethod 2 | 3 | 4 | class BasePixelSampler(metaclass=ABCMeta): 5 | """Base class of pixel sampler.""" 6 | 7 | def __init__(self, **kwargs): 8 | pass 9 | 10 | @abstractmethod 11 | def sample(self, seg_logit, seg_label): 12 | """Placeholder for sample function.""" 13 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmseg/core/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .misc import add_prefix 2 | 3 | __all__ = ['add_prefix'] 4 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmseg/core/utils/misc.py: -------------------------------------------------------------------------------- 1 | def add_prefix(inputs, prefix): 2 | """Add prefix for dict. 3 | 4 | Args: 5 | inputs (dict): The input dict with str keys. 6 | prefix (str): The prefix to add. 7 | 8 | Returns: 9 | 10 | dict: The dict with keys updated with ``prefix``. 
11 | """ 12 | 13 | outputs = dict() 14 | for name, value in inputs.items(): 15 | outputs[f'{prefix}.{name}'] = value 16 | 17 | return outputs 18 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmseg/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | from .ade import ADE20KDataset 2 | from .builder import DATASETS, PIPELINES, build_dataloader, build_dataset 3 | from .chase_db1 import ChaseDB1Dataset 4 | from .cityscapes import CityscapesDataset 5 | from .custom import CustomDataset 6 | from .dataset_wrappers import ConcatDataset, RepeatDataset 7 | from .drive import DRIVEDataset 8 | from .hrf import HRFDataset 9 | from .pascal_context import PascalContextDataset, PascalContextDataset59 10 | from .stare import STAREDataset 11 | from .voc import PascalVOCDataset 12 | 13 | __all__ = [ 14 | 'CustomDataset', 'build_dataloader', 'ConcatDataset', 'RepeatDataset', 15 | 'DATASETS', 'build_dataset', 'PIPELINES', 'CityscapesDataset', 16 | 'PascalVOCDataset', 'ADE20KDataset', 'PascalContextDataset', 17 | 'PascalContextDataset59', 'ChaseDB1Dataset', 'DRIVEDataset', 'HRFDataset', 18 | 'STAREDataset' 19 | ] 20 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmseg/datasets/chase_db1.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | 3 | from .builder import DATASETS 4 | from .custom import CustomDataset 5 | 6 | 7 | @DATASETS.register_module() 8 | class ChaseDB1Dataset(CustomDataset): 9 | """Chase_db1 dataset. 10 | 11 | In segmentation map annotation for Chase_db1, 0 stands for background, 12 | which is included in 2 categories. ``reduce_zero_label`` is fixed to False. 13 | The ``img_suffix`` is fixed to '.png' and ``seg_map_suffix`` is fixed to 14 | '_1stHO.png'. 15 | """ 16 | 17 | CLASSES = ('background', 'vessel') 18 | 19 | PALETTE = [[120, 120, 120], [6, 230, 230]] 20 | 21 | def __init__(self, **kwargs): 22 | super(ChaseDB1Dataset, self).__init__( 23 | img_suffix='.png', 24 | seg_map_suffix='_1stHO.png', 25 | reduce_zero_label=False, 26 | **kwargs) 27 | assert osp.exists(self.img_dir) 28 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmseg/datasets/drive.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | 3 | from .builder import DATASETS 4 | from .custom import CustomDataset 5 | 6 | 7 | @DATASETS.register_module() 8 | class DRIVEDataset(CustomDataset): 9 | """DRIVE dataset. 10 | 11 | In segmentation map annotation for DRIVE, 0 stands for background, which is 12 | included in 2 categories. ``reduce_zero_label`` is fixed to False. The 13 | ``img_suffix`` is fixed to '.png' and ``seg_map_suffix`` is fixed to 14 | '_manual1.png'. 
15 | """ 16 | 17 | CLASSES = ('background', 'vessel') 18 | 19 | PALETTE = [[120, 120, 120], [6, 230, 230]] 20 | 21 | def __init__(self, **kwargs): 22 | super(DRIVEDataset, self).__init__( 23 | img_suffix='.png', 24 | seg_map_suffix='_manual1.png', 25 | reduce_zero_label=False, 26 | **kwargs) 27 | assert osp.exists(self.img_dir) 28 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmseg/datasets/hrf.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | 3 | from .builder import DATASETS 4 | from .custom import CustomDataset 5 | 6 | 7 | @DATASETS.register_module() 8 | class HRFDataset(CustomDataset): 9 | """HRF dataset. 10 | 11 | In segmentation map annotation for HRF, 0 stands for background, which is 12 | included in 2 categories. ``reduce_zero_label`` is fixed to False. The 13 | ``img_suffix`` is fixed to '.png' and ``seg_map_suffix`` is fixed to 14 | '.png'. 15 | """ 16 | 17 | CLASSES = ('background', 'vessel') 18 | 19 | PALETTE = [[120, 120, 120], [6, 230, 230]] 20 | 21 | def __init__(self, **kwargs): 22 | super(HRFDataset, self).__init__( 23 | img_suffix='.png', 24 | seg_map_suffix='.png', 25 | reduce_zero_label=False, 26 | **kwargs) 27 | assert osp.exists(self.img_dir) 28 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmseg/datasets/pipelines/__init__.py: -------------------------------------------------------------------------------- 1 | from .compose import Compose 2 | from .formating import (Collect, ImageToTensor, ToDataContainer, ToTensor, 3 | Transpose, to_tensor) 4 | from .loading import LoadAnnotations, LoadImageFromFile 5 | from .test_time_aug import MultiScaleFlipAug 6 | from .transforms import (CLAHE, AdjustGamma, Normalize, Pad, 7 | PhotoMetricDistortion, RandomCrop, RandomFlip, 8 | RandomRotate, Rerange, Resize, RGB2Gray, SegRescale) 9 | 10 | __all__ = [ 11 | 'Compose', 'to_tensor', 'ToTensor', 'ImageToTensor', 'ToDataContainer', 12 | 'Transpose', 'Collect', 'LoadAnnotations', 'LoadImageFromFile', 13 | 'MultiScaleFlipAug', 'Resize', 'RandomFlip', 'Pad', 'RandomCrop', 14 | 'Normalize', 'SegRescale', 'PhotoMetricDistortion', 'RandomRotate', 15 | 'AdjustGamma', 'CLAHE', 'Rerange', 'RGB2Gray' 16 | ] 17 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmseg/datasets/stare.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | 3 | from .builder import DATASETS 4 | from .custom import CustomDataset 5 | 6 | 7 | @DATASETS.register_module() 8 | class STAREDataset(CustomDataset): 9 | """STARE dataset. 10 | 11 | In segmentation map annotation for STARE, 0 stands for background, which is 12 | included in 2 categories. ``reduce_zero_label`` is fixed to False. The 13 | ``img_suffix`` is fixed to '.png' and ``seg_map_suffix`` is fixed to 14 | '.ah.png'. 
15 | """ 16 | 17 | CLASSES = ('background', 'vessel') 18 | 19 | PALETTE = [[120, 120, 120], [6, 230, 230]] 20 | 21 | def __init__(self, **kwargs): 22 | super(STAREDataset, self).__init__( 23 | img_suffix='.png', 24 | seg_map_suffix='.ah.png', 25 | reduce_zero_label=False, 26 | **kwargs) 27 | assert osp.exists(self.img_dir) 28 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmseg/datasets/voc.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | 3 | from .builder import DATASETS 4 | from .custom import CustomDataset 5 | 6 | 7 | @DATASETS.register_module() 8 | class PascalVOCDataset(CustomDataset): 9 | """Pascal VOC dataset. 10 | 11 | Args: 12 | split (str): Split txt file for Pascal VOC. 13 | """ 14 | 15 | CLASSES = ('background', 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 16 | 'bus', 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 17 | 'horse', 'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 18 | 'train', 'tvmonitor') 19 | 20 | PALETTE = [[0, 0, 0], [128, 0, 0], [0, 128, 0], [128, 128, 0], [0, 0, 128], 21 | [128, 0, 128], [0, 128, 128], [128, 128, 128], [64, 0, 0], 22 | [192, 0, 0], [64, 128, 0], [192, 128, 0], [64, 0, 128], 23 | [192, 0, 128], [64, 128, 128], [192, 128, 128], [0, 64, 0], 24 | [128, 64, 0], [0, 192, 0], [128, 192, 0], [0, 64, 128]] 25 | 26 | def __init__(self, split, **kwargs): 27 | super(PascalVOCDataset, self).__init__( 28 | img_suffix='.jpg', seg_map_suffix='.png', split=split, **kwargs) 29 | assert osp.exists(self.img_dir) and self.split is not None 30 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmseg/models/__init__.py: -------------------------------------------------------------------------------- 1 | from .backbones import * # noqa: F401,F403 2 | from .builder import (BACKBONES, HEADS, LOSSES, SEGMENTORS, build_backbone, 3 | build_head, build_loss, build_segmentor) 4 | from .decode_heads import * # noqa: F401,F403 5 | from .losses import * # noqa: F401,F403 6 | from .necks import * # noqa: F401,F403 7 | from .segmentors import * # noqa: F401,F403 8 | 9 | __all__ = [ 10 | 'BACKBONES', 'HEADS', 'LOSSES', 'SEGMENTORS', 'build_backbone', 11 | 'build_head', 'build_loss', 'build_segmentor' 12 | ] 13 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmseg/models/backbones/__init__.py: -------------------------------------------------------------------------------- 1 | from .cgnet import CGNet 2 | # from .fast_scnn import FastSCNN 3 | from .hrnet import HRNet 4 | from .mobilenet_v2 import MobileNetV2 5 | from .mobilenet_v3 import MobileNetV3 6 | from .resnest import ResNeSt 7 | from .resnet import ResNet, ResNetV1c, ResNetV1d 8 | from .resnext import ResNeXt 9 | from .unet import UNet 10 | from .vit import VisionTransformer 11 | from .uniformer import UniFormer 12 | 13 | __all__ = [ 14 | 'ResNet', 'ResNetV1c', 'ResNetV1d', 'ResNeXt', 'HRNet', 15 | 'ResNeSt', 'MobileNetV2', 'UNet', 'CGNet', 'MobileNetV3', 16 | 'VisionTransformer', 'UniFormer' 17 | ] 18 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmseg/models/decode_heads/__init__.py: -------------------------------------------------------------------------------- 1 | from .ann_head import ANNHead 2 | from .apc_head import 
APCHead 3 | from .aspp_head import ASPPHead 4 | from .cc_head import CCHead 5 | from .da_head import DAHead 6 | from .dm_head import DMHead 7 | from .dnl_head import DNLHead 8 | from .ema_head import EMAHead 9 | from .enc_head import EncHead 10 | from .fcn_head import FCNHead 11 | from .fpn_head import FPNHead 12 | from .gc_head import GCHead 13 | from .lraspp_head import LRASPPHead 14 | from .nl_head import NLHead 15 | from .ocr_head import OCRHead 16 | # from .point_head import PointHead 17 | from .psa_head import PSAHead 18 | from .psp_head import PSPHead 19 | from .sep_aspp_head import DepthwiseSeparableASPPHead 20 | from .sep_fcn_head import DepthwiseSeparableFCNHead 21 | from .uper_head import UPerHead 22 | 23 | __all__ = [ 24 | 'FCNHead', 'PSPHead', 'ASPPHead', 'PSAHead', 'NLHead', 'GCHead', 'CCHead', 25 | 'UPerHead', 'DepthwiseSeparableASPPHead', 'ANNHead', 'DAHead', 'OCRHead', 26 | 'EncHead', 'DepthwiseSeparableFCNHead', 'FPNHead', 'EMAHead', 'DNLHead', 27 | 'APCHead', 'DMHead', 'LRASPPHead' 28 | ] 29 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmseg/models/losses/__init__.py: -------------------------------------------------------------------------------- 1 | from .accuracy import Accuracy, accuracy 2 | from .cross_entropy_loss import (CrossEntropyLoss, binary_cross_entropy, 3 | cross_entropy, mask_cross_entropy) 4 | from .dice_loss import DiceLoss 5 | from .lovasz_loss import LovaszLoss 6 | from .utils import reduce_loss, weight_reduce_loss, weighted_loss 7 | 8 | __all__ = [ 9 | 'accuracy', 'Accuracy', 'cross_entropy', 'binary_cross_entropy', 10 | 'mask_cross_entropy', 'CrossEntropyLoss', 'reduce_loss', 11 | 'weight_reduce_loss', 'weighted_loss', 'LovaszLoss', 'DiceLoss' 12 | ] 13 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmseg/models/necks/__init__.py: -------------------------------------------------------------------------------- 1 | from .fpn import FPN 2 | from .multilevel_neck import MultiLevelNeck 3 | 4 | __all__ = ['FPN', 'MultiLevelNeck'] 5 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmseg/models/segmentors/__init__.py: -------------------------------------------------------------------------------- 1 | from .base import BaseSegmentor 2 | from .cascade_encoder_decoder import CascadeEncoderDecoder 3 | from .encoder_decoder import EncoderDecoder 4 | 5 | __all__ = ['BaseSegmentor', 'EncoderDecoder', 'CascadeEncoderDecoder'] 6 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmseg/models/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .drop import DropPath 2 | from .inverted_residual import InvertedResidual, InvertedResidualV3 3 | from .make_divisible import make_divisible 4 | from .res_layer import ResLayer 5 | from .se_layer import SELayer 6 | from .self_attention_block import SelfAttentionBlock 7 | from .up_conv_block import UpConvBlock 8 | from .weight_init import trunc_normal_ 9 | 10 | __all__ = [ 11 | 'ResLayer', 'SelfAttentionBlock', 'make_divisible', 'InvertedResidual', 12 | 'UpConvBlock', 'InvertedResidualV3', 'SELayer', 'DropPath', 'trunc_normal_' 13 | ] 14 | -------------------------------------------------------------------------------- 
/experiments/lavis/common/annotator/uniformer/mmseg/models/utils/drop.py: -------------------------------------------------------------------------------- 1 | """Modified from https://github.com/rwightman/pytorch-image- 2 | models/blob/master/timm/models/layers/drop.py.""" 3 | 4 | import torch 5 | from torch import nn 6 | 7 | 8 | class DropPath(nn.Module): 9 | """Drop paths (Stochastic Depth) per sample (when applied in main path of 10 | residual blocks). 11 | 12 | Args: 13 | drop_prob (float): Drop rate for paths of model. Dropout rate has 14 | to be between 0 and 1. Default: 0. 15 | """ 16 | 17 | def __init__(self, drop_prob=0.): 18 | super(DropPath, self).__init__() 19 | self.drop_prob = drop_prob 20 | self.keep_prob = 1 - drop_prob 21 | 22 | def forward(self, x): 23 | if self.drop_prob == 0. or not self.training: 24 | return x 25 | shape = (x.shape[0], ) + (1, ) * ( 26 | x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets 27 | random_tensor = self.keep_prob + torch.rand( 28 | shape, dtype=x.dtype, device=x.device) 29 | random_tensor.floor_() # binarize 30 | output = x.div(self.keep_prob) * random_tensor 31 | return output 32 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmseg/ops/__init__.py: -------------------------------------------------------------------------------- 1 | from .encoding import Encoding 2 | from .wrappers import Upsample, resize 3 | 4 | __all__ = ['Upsample', 'resize', 'Encoding'] 5 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmseg/utils/__init__.py: -------------------------------------------------------------------------------- 1 | from .collect_env import collect_env 2 | from .logger import get_root_logger 3 | 4 | __all__ = ['get_root_logger', 'collect_env'] 5 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmseg/utils/collect_env.py: -------------------------------------------------------------------------------- 1 | from annotator.uniformer.mmcv.utils import collect_env as collect_base_env 2 | from annotator.uniformer.mmcv.utils import get_git_hash 3 | 4 | import annotator.uniformer.mmseg as mmseg 5 | 6 | 7 | def collect_env(): 8 | """Collect the information of the running environments.""" 9 | env_info = collect_base_env() 10 | env_info['MMSegmentation'] = f'{mmseg.__version__}+{get_git_hash()[:7]}' 11 | 12 | return env_info 13 | 14 | 15 | if __name__ == '__main__': 16 | for name, val in collect_env().items(): 17 | print('{}: {}'.format(name, val)) 18 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/uniformer/mmseg/utils/logger.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from annotator.uniformer.mmcv.utils import get_logger 4 | 5 | 6 | def get_root_logger(log_file=None, log_level=logging.INFO): 7 | """Get the root logger. 8 | 9 | The logger will be initialized if it has not been initialized. By default a 10 | StreamHandler will be added. If `log_file` is specified, a FileHandler will 11 | also be added. The name of the root logger is the top-level package name, 12 | e.g., "mmseg". 13 | 14 | Args: 15 | log_file (str | None): The log filename. If specified, a FileHandler 16 | will be added to the root logger. 17 | log_level (int): The root logger level. 
Note that only the process of 18 | rank 0 is affected, while other processes will set the level to 19 | "Error" and be silent most of the time. 20 | 21 | Returns: 22 | logging.Logger: The root logger. 23 | """ 24 | 25 | logger = get_logger(name='mmseg', log_file=log_file, log_level=log_level) 26 | 27 | return logger 28 | -------------------------------------------------------------------------------- /experiments/lavis/common/annotator/util.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import cv2 3 | import os 4 | 5 | 6 | annotator_ckpts_path = os.path.join(os.path.dirname(__file__), 'ckpts') 7 | 8 | 9 | def HWC3(x): 10 | assert x.dtype == np.uint8 11 | if x.ndim == 2: 12 | x = x[:, :, None] 13 | assert x.ndim == 3 14 | H, W, C = x.shape 15 | assert C == 1 or C == 3 or C == 4 16 | if C == 3: 17 | return x 18 | if C == 1: 19 | return np.concatenate([x, x, x], axis=2) 20 | if C == 4: 21 | color = x[:, :, 0:3].astype(np.float32) 22 | alpha = x[:, :, 3:4].astype(np.float32) / 255.0 23 | y = color * alpha + 255.0 * (1.0 - alpha) 24 | y = y.clip(0, 255).astype(np.uint8) 25 | return y 26 | 27 | 28 | def resize_image(input_image, resolution): 29 | H, W, C = input_image.shape 30 | H = float(H) 31 | W = float(W) 32 | k = float(resolution) / min(H, W) 33 | H *= k 34 | W *= k 35 | H = int(np.round(H / 64.0)) * 64 36 | W = int(np.round(W / 64.0)) * 64 37 | img = cv2.resize(input_image, (W, H), interpolation=cv2.INTER_LANCZOS4 if k > 1 else cv2.INTER_AREA) 38 | return img 39 | -------------------------------------------------------------------------------- /experiments/lavis/common/gradcam.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from matplotlib import pyplot as plt 3 | from scipy.ndimage import filters 4 | from skimage import transform as skimage_transform 5 | 6 | 7 | def getAttMap(img, attMap, blur=True, overlap=True): 8 | attMap -= attMap.min() 9 | if attMap.max() > 0: 10 | attMap /= attMap.max() 11 | attMap = skimage_transform.resize(attMap, (img.shape[:2]), order=3, mode="constant") 12 | if blur: 13 | attMap = filters.gaussian_filter(attMap, 0.02 * max(img.shape[:2])) 14 | attMap -= attMap.min() 15 | attMap /= attMap.max() 16 | cmap = plt.get_cmap("jet") 17 | attMapV = cmap(attMap) 18 | attMapV = np.delete(attMapV, 3, 2) 19 | if overlap: 20 | attMap = ( 21 | 1 * (1 - attMap**0.7).reshape(attMap.shape + (1,)) * img 22 | + (attMap**0.7).reshape(attMap.shape + (1,)) * attMapV 23 | ) 24 | return attMap 25 | -------------------------------------------------------------------------------- /experiments/lavis/common/vqa_tools/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | __author__ = "aagrawal" 9 | -------------------------------------------------------------------------------- /experiments/lavis/configs/datasets/avsd/defaults_dial.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | avsd_dialogue: # name of the dataset builder 8 | dataset_card: dataset_card/avsd_dialogue.md 9 | data_type: features #extracted features of videos (I3D, VGGish) # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/avsd_dstc7_train.json 16 | storage: avsd/annotations/train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/avsd_dstc7_val.json 19 | storage: avsd/annotations/val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/avsd_dstc7_test.json 22 | storage: avsd/annotations/test.json 23 | features: 24 | storage: avsd/features/ 25 | -------------------------------------------------------------------------------- /experiments/lavis/configs/datasets/blip_diffusion_datasets/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | blip_diffusion_finetune: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | images: 14 | storage: "" 15 | -------------------------------------------------------------------------------- /experiments/lavis/configs/datasets/coco/defaults_ret.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | coco_retrieval: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_train.json 16 | md5: aa31ac474cf6250ebb81d18348a07ed8 17 | storage: coco/annotations/coco_karpathy_train.json 18 | val: 19 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_val.json 20 | md5: b273847456ef5580e33713b1f7de52a0 21 | storage: coco/annotations/coco_karpathy_val.json 22 | test: 23 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/coco_karpathy_test.json 24 | md5: 3ff34b0ef2db02d01c37399f6a2a6cd1 25 | storage: coco/annotations/coco_karpathy_test.json 26 | images: 27 | storage: coco/images/ 28 | -------------------------------------------------------------------------------- /experiments/lavis/configs/datasets/conceptual_caption/defaults_12m.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | conceptual_caption_12m: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | - /export/home/workspace/datasets/cc12m.json 17 | storage: 18 | - conceptual_caption/annotations/cc12m.json 19 | images: 20 | storage: conceptual_caption/images_12m 21 | -------------------------------------------------------------------------------- /experiments/lavis/configs/datasets/conceptual_caption/defaults_3m.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | conceptual_caption_3m: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | - /export/home/workspace/datasets/cc3m.json 17 | storage: 18 | - conceptual_caption/annotations/cc3m.json 19 | images: 20 | storage: conceptual_caption/images 21 | -------------------------------------------------------------------------------- /experiments/lavis/configs/datasets/didemo/defaults_ret.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | didemo_retrieval: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/didemo/retrieval_train.json 16 | storage: didemo/annotations/retrieval_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/didemo/retrieval_val.json 19 | storage: didemo/annotations/retrieval_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/didemo/retrieval_test.json 22 | storage: didemo/annotations/retrieval_test.json 23 | videos: 24 | storage: didemo/videos 25 | # storage: /export/share/dongxuli/data/didemo_retrieval/videos 26 | -------------------------------------------------------------------------------- /experiments/lavis/configs/datasets/flickr30k/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | flickr30k: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images 10 | 11 | build_info: 12 | annotations: 13 | train: 14 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_train.json 15 | storage: flickr30k/annotations/train.json 16 | val: 17 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_val.json 18 | storage: flickr30k/annotations/val.json 19 | test: 20 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/flickr30k_test.json 21 | storage: flickr30k/annotations/test.json 22 | images: 23 | storage: flickr30k/images 24 | # storage: /export/share/datasets/vision/flickr30k 25 | -------------------------------------------------------------------------------- /experiments/lavis/configs/datasets/imagenet/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | imagenet: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | splits: ["val"] 14 | images: 15 | storage: /export/share/datasets/vision/imagenet 16 | -------------------------------------------------------------------------------- /experiments/lavis/configs/datasets/laion/defaults_2B_multi.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | laion2B_multi: 8 | 9 | data_type: images 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | storage: /export/laion/laion2B-multi/part-00000/{00000..01743}.tar 14 | -------------------------------------------------------------------------------- /experiments/lavis/configs/datasets/msrvtt/defaults_cap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | msrvtt_cap: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_train.json 16 | storage: msrvtt/annotations/cap_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_val.json 19 | storage: msrvtt/annotations/cap_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/cap_test.json 22 | storage: msrvtt/annotations/cap_test.json 23 | videos: 24 | storage: msrvtt/videos 25 | -------------------------------------------------------------------------------- /experiments/lavis/configs/datasets/msrvtt/defaults_ret.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | msrvtt_retrieval: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/retrieval_train.json 16 | storage: msrvtt/annotations/retrieval_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/retrieval_val.json 19 | storage: msrvtt/annotations/retrieval_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msrvtt/retrieval_test.json 22 | storage: msrvtt/annotations/retrieval_test.json 23 | videos: 24 | storage: msrvtt/videos 25 | -------------------------------------------------------------------------------- /experiments/lavis/configs/datasets/msvd/defaults_cap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | msvd_cap: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_train.json 16 | storage: msvd/annotations/cap_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_val.json 19 | storage: msvd/annotations/cap_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/msvd/cap_test.json 22 | storage: msvd/annotations/cap_test.json 23 | videos: 24 | storage: msvd/videos 25 | -------------------------------------------------------------------------------- /experiments/lavis/configs/datasets/nlvr/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | nlvr: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_train.json 16 | storage: nlvr/annotations/train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_dev.json 19 | storage: nlvr/annotations/dev.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/nlvr/nlvr_dev.json 22 | storage: nlvr/annotations/test.json 23 | images: 24 | storage: /export/share/datasets/vision/NLVR2/ 25 | -------------------------------------------------------------------------------- /experiments/lavis/configs/datasets/nocaps/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
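Several configs point at absolute `/export/...` paths (e.g. the NLVR2 `images.storage` above, and similar entries in the ImageNet, SNLI-VE, and VATEX configs); these are internal storage locations and will not exist on a local machine. Assuming the stock `load_dataset` signature with its `vis_path` override, the visual data location can be repointed without editing the YAML; the path below is a placeholder:

    from lavis.datasets.builders import load_dataset

    # Override the /export/share/datasets/vision/NLVR2/ default with a local copy.
    nlvr = load_dataset("nlvr", vis_path="/data/NLVR2/images")   # hypothetical local path
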
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | nocaps: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | val: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/nocaps_val.json 16 | storage: nocaps/annotations/nocaps_val.json 17 | test: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/datasets/nocaps_test.json 19 | storage: nocaps/annotations/nocaps_test.json 20 | images: 21 | storage: nocaps/images 22 | # storage: /export/share/datasets/vision/nocaps/ 23 | -------------------------------------------------------------------------------- /experiments/lavis/configs/datasets/sbu_caption/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | sbu_caption: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: 16 | - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/sbu/sbu.json 17 | # - /export/share/dongxuli/data/lavis/sbu/annotation/sbu.json 18 | storage: 19 | - sbu_captions/annotations/sbu.json 20 | images: 21 | storage: sbu_captions/images 22 | # storage: /export/share/datasets/vision_language/sbu_resize 23 | -------------------------------------------------------------------------------- /experiments/lavis/configs/datasets/snli_ve/defaults.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | snli_ve: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: /export/share/dongxuli/data/lavis/snli/annotation/ve_train.json 16 | storage: snli/annotations/ve_train.json 17 | val: 18 | url: /export/share/dongxuli/data/lavis/snli/annotation/ve_dev.json 19 | storage: snli/annotations/ve_dev.json 20 | test: 21 | url: /export/share/dongxuli/data/lavis/snli/annotation/ve_test.json 22 | storage: snli/annotations/ve_test.json 23 | images: 24 | storage: flickr30k/images/flickr30k-images 25 | # storage: /export/share/datasets/vision/flickr30k/flickr30k-images 26 | -------------------------------------------------------------------------------- /experiments/lavis/configs/datasets/vatex/defaults_cap.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | msvd_cap: # name of the dataset builder 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: videos # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_train.json 16 | storage: vatex/annotations/cap_train.json 17 | val: 18 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_val.json 19 | storage: vatex/annotations/cap_val.json 20 | test: 21 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/vatex/cap_private_test.json 22 | storage: vatex/annotations/cap_test.json 23 | videos: 24 | storage: /export/share/dongxuli/data/vatex 25 | -------------------------------------------------------------------------------- /experiments/lavis/configs/datasets/vg/defaults_caption.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | vg_caption: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/visual_genome/vg_caption.json 16 | storage: vg/annotations/vg_caption.json 17 | images: 18 | storage: vg/images/ 19 | -------------------------------------------------------------------------------- /experiments/lavis/configs/datasets/vg/defaults_vqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | vg_vqa: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | annotations: 14 | train: 15 | url: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/visual_genome/vg_qa.json 16 | storage: vg/annotations/vg_qa.json 17 | images: 18 | storage: vg/images/ 19 | -------------------------------------------------------------------------------- /experiments/lavis/configs/default.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | env: 7 | # For default users 8 | # cache_root: "cache" 9 | # For internal use with persistent storage 10 | cache_root: "/export/home/.cache/lavis" 11 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/albef_classification_ve.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_classification 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_snli_ve_lavis.pt" 11 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 12 | 13 | num_classes: 3 14 | 15 | use_distill: True 16 | momentum: 0.995 17 | alpha: 0.4 18 | 19 | # vit encoder 20 | vit_type: "base" 21 | vit_grad_ckpt: False 22 | vit_ckpt_layer: 0 23 | vit_layer_norm_epsilon: 1e-6 24 | 25 | image_size: 384 26 | 27 | # bert config 28 | med_config_path: "configs/models/med_config_albef.json" 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | eval: 35 | name: "blip_image_eval" 36 | text_processor: 37 | train: 38 | name: "blip_caption" 39 | eval: 40 | name: "blip_caption" 41 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/albef_feature_extractor.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_pretrain 8 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 9 | 10 | # vit encoder 11 | vit_type: "base" 12 | image_size: 224 13 | vit_ckpt_layer: 0 14 | vit_drop_path_rate: 0 15 | vit_layer_norm_epsilon: 1e-6 16 | vit_grad_ckpt: False 17 | 18 | # bert config 19 | med_config_path: "configs/models/med_config_albef.json" 20 | 21 | embed_dim: 256 22 | 23 | preprocess: 24 | vis_processor: 25 | eval: 26 | name: "blip_image_eval" 27 | image_size: 224 28 | text_processor: 29 | eval: 30 | name: "blip_caption" 31 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/albef_nlvr.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
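The `cache_root` in configs/default.yaml above anchors every relative `storage:` path in the dataset configs; the commented-out `"cache"` value is the portable default, while the `/export/home/...` value is an internal location. A small sketch of how relative paths resolve, assuming the stock `get_cache_path` helper is present in this vendored copy:

    from lavis.common.utils import get_cache_path

    # With the default.yaml above, this prints
    # /export/home/.cache/lavis/coco/annotations/coco_karpathy_train.json
    print(get_cache_path("coco/annotations/coco_karpathy_train.json"))
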
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_nlvr 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/pretrain_model_nlvr.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_nlvr_lavis.pt" 12 | 13 | num_classes: 2 14 | 15 | use_distill: True 16 | momentum: 0.995 17 | alpha: 0.4 18 | 19 | # vit encoder 20 | vit_type: "base" 21 | vit_grad_ckpt: False 22 | vit_ckpt_layer: 0 23 | vit_layer_norm_epsilon: 1e-6 24 | 25 | image_size: 384 26 | 27 | # bert config 28 | med_config_path: "configs/models/med_config_albef.json" 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 384 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 384 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/albef_pretrain_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_pretrain 8 | 9 | load_pretrained: True 10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 11 | 12 | # vit encoder 13 | vit_type: "base" 14 | image_size: 224 15 | vit_ckpt_layer: 0 16 | vit_drop_path_rate: 0 17 | vit_layer_norm_epsilon: 1e-6 18 | vit_grad_ckpt: False 19 | 20 | # bert config 21 | med_config_path: "configs/models/med_config_albef.json" 22 | mlm_mask_prob: 0.15 23 | 24 | embed_dim: 256 25 | momentum: 0.995 26 | alpha: 0.4 27 | temp: 0.07 28 | 29 | max_txt_len: 30 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip_image_train" 35 | image_size: 256 36 | text_processor: 37 | train: 38 | name: "blip_caption" 39 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/albef_vqav2.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
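Each model config also carries a `preprocess:` block (as in albef_nlvr.yaml above) naming the train/eval visual and text processors, and these can be instantiated straight from the YAML. A sketch, assuming the stock `load_preprocess` helper and the config path as it appears in this repository:

    from omegaconf import OmegaConf
    from lavis.models import load_preprocess

    cfg = OmegaConf.load("experiments/lavis/configs/models/albef_nlvr.yaml")
    vis_processors, txt_processors = load_preprocess(cfg.preprocess)
    print(vis_processors.keys())   # dict_keys(['train', 'eval'])
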
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_vqa 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-pcl-data-research/ALBEF/ALBEF.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALBEF/albef_vqav2_lavis.pt" 12 | 13 | use_distill: True 14 | momentum: 0.995 15 | alpha: 0.4 16 | 17 | # vit encoder 18 | vit_type: "base" 19 | vit_grad_ckpt: False 20 | vit_ckpt_layer: 0 21 | vit_layer_norm_epsilon: 1e-6 22 | 23 | image_size: 384 24 | 25 | # bert config 26 | med_config_path: "configs/models/med_config_albef.json" 27 | 28 | preprocess: 29 | vis_processor: 30 | train: 31 | name: "blip_image_train" 32 | image_size: 384 33 | eval: 34 | name: "blip_image_eval" 35 | image_size: 384 36 | text_processor: 37 | train: 38 | name: "blip_question" 39 | eval: 40 | name: "blip_question" 41 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/alpro_qa_msrvtt.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_qa 8 | num_classes: 1500 9 | 10 | load_finetuned: True 11 | 12 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msrvtt_qa.pth" 13 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" 14 | 15 | timesformer: 16 | n_frms: 16 17 | image_size: 224 18 | 19 | patch_size: 16 20 | attn_drop_rate: 0. 21 | drop_rate: 0. 22 | drop_path_rate: 0.1 23 | 24 | use_grad_ckpt: True 25 | ckpt_layer: 12 26 | 27 | # bert config 28 | med_config_path: "configs/models/bert_config_alpro.json" 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "alpro_video_train" 34 | n_frms: 16 35 | image_size: 224 36 | eval: 37 | name: "alpro_video_eval" 38 | n_frms: 16 39 | image_size: 224 40 | text_processor: 41 | train: 42 | name: "blip_caption" 43 | eval: 44 | name: "blip_caption" 45 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/alpro_qa_msvd.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_qa 8 | num_classes: 2423 9 | 10 | load_finetuned: True 11 | 12 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msvd_qa.pth" 13 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" 14 | 15 | timesformer: 16 | n_frms: 16 17 | image_size: 224 18 | 19 | patch_size: 16 20 | attn_drop_rate: 0. 21 | drop_rate: 0. 
22 | drop_path_rate: 0.1 23 | use_grad_ckpt: True 24 | ckpt_layer: 12 25 | 26 | # bert config 27 | med_config_path: "configs/models/bert_config_alpro.json" 28 | 29 | preprocess: 30 | vis_processor: 31 | train: 32 | name: "alpro_video_train" 33 | n_frms: 16 34 | image_size: 224 35 | eval: 36 | name: "alpro_video_eval" 37 | n_frms: 16 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/alpro_retrieval_didemo.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_retrieval 8 | 9 | load_finetuned: True 10 | 11 | finetuned: https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_didemo_retrieval.pt 12 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" 13 | 14 | timesformer: 15 | n_frms: 8 16 | image_size: 224 17 | 18 | patch_size: 16 19 | attn_drop_rate: 0. 20 | drop_rate: 0. 21 | drop_path_rate: 0.1 22 | use_grad_ckpt: False 23 | 24 | # bert config 25 | med_config_path: "configs/models/bert_config_alpro.json" 26 | 27 | preprocess: 28 | vis_processor: 29 | eval: 30 | name: "alpro_video_eval" 31 | n_frms: 8 32 | image_size: 224 33 | text_processor: 34 | eval: 35 | name: "blip_caption" 36 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/alpro_retrieval_msrvtt.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_retrieval 8 | 9 | load_finetuned: True 10 | 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_msrvtt_retrieval.pt" 12 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/ALPRO/alpro_pretrain.pt" 13 | 14 | timesformer: 15 | n_frms: 8 16 | image_size: 224 17 | 18 | patch_size: 16 19 | attn_drop_rate: 0. 20 | drop_rate: 0. 
21 | drop_path_rate: 0.1 22 | use_grad_ckpt: False 23 | 24 | # bert config 25 | med_config_path: "configs/models/bert_config_alpro.json" 26 | 27 | preprocess: 28 | vis_processor: 29 | train: 30 | name: "alpro_video_train" 31 | n_frms: 8 32 | image_size: 224 33 | eval: 34 | name: "alpro_video_eval" 35 | n_frms: 8 36 | image_size: 224 37 | text_processor: 38 | train: 39 | name: "blip_caption" 40 | eval: 41 | name: "blip_caption" 42 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/bert_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30522, 19 | "encoder_width": 768, 20 | "add_cross_attention": true 21 | } -------------------------------------------------------------------------------- /experiments/lavis/configs/models/bert_config_alpro.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": true, 18 | "type_vocab_size": 2, 19 | "vocab_size": 30522, 20 | "encoder_width": 768, 21 | "add_cross_attention": false, 22 | "fusion_layer": 6 23 | } -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip-diffusion/blip_diffusion_base.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | vit_model: "clip_L" 3 | 4 | qformer_num_query_token: 16 5 | qformer_cross_attention_freq: 1 6 | 7 | sd_train_text_encoder: False 8 | sd_pretrained_model_name_or_path: "runwayml/stable-diffusion-v1-5" 9 | 10 | load_finetuned: False 11 | load_pretrained: True 12 | # pretrained: "/export/share/dongxuli/zerobooth/500000-renamed/" 13 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP-Diffusion/blip-diffusion.tar.gz" 14 | 15 | preprocess: 16 | vis_processor: 17 | train: 18 | name: "blip_diffusion_inp_image_eval" 19 | eval: 20 | name: "blip_diffusion_inp_image_eval" 21 | text_processor: 22 | train: 23 | name: "blip_caption" 24 | eval: 25 | name: "blip_caption" 26 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip-diffusion/blip_diffusion_controlnet_canny.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | vit_model: "clip_L" 3 | 4 | qformer_num_query_token: 16 5 | qformer_cross_attention_freq: 1 6 | 7 | sd_train_text_encoder: False 8 | sd_pretrained_model_name_or_path: "runwayml/stable-diffusion-v1-5" 9 | 10 | load_finetuned: False 11 | load_pretrained: True 12 | # pretrained: 
"/export/share/dongxuli/zerobooth/500000-renamed/" 13 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP-Diffusion/blip-diffusion.tar.gz" 14 | 15 | controlnet_pretrained_model_name_or_path: "lllyasviel/sd-controlnet-canny" 16 | 17 | preprocess: 18 | vis_processor: 19 | train: 20 | name: "blip_diffusion_inp_image_eval" 21 | eval: 22 | name: "blip_diffusion_inp_image_eval" 23 | text_processor: 24 | train: 25 | name: "blip_caption" 26 | eval: 27 | name: "blip_caption" 28 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip-diffusion/blip_diffusion_controlnet_depth.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | vit_model: "clip_L" 3 | 4 | qformer_num_query_token: 16 5 | qformer_cross_attention_freq: 1 6 | 7 | sd_train_text_encoder: False 8 | sd_pretrained_model_name_or_path: "runwayml/stable-diffusion-v1-5" 9 | 10 | load_finetuned: False 11 | load_pretrained: True 12 | # pretrained: "/export/share/dongxuli/zerobooth/500000-renamed/" 13 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP-Diffusion/blip-diffusion-openimage.tar.gz" 14 | 15 | controlnet_pretrained_model_name_or_path: "lllyasviel/sd-controlnet-depth" 16 | 17 | preprocess: 18 | vis_processor: 19 | train: 20 | name: "blip_diffusion_inp_image_eval" 21 | eval: 22 | name: "blip_diffusion_inp_image_eval" 23 | text_processor: 24 | train: 25 | name: "blip_caption" 26 | eval: 27 | name: "blip_caption" 28 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip-diffusion/blip_diffusion_controlnet_hed.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | vit_model: "clip_L" 3 | 4 | qformer_num_query_token: 16 5 | qformer_cross_attention_freq: 1 6 | 7 | sd_train_text_encoder: False 8 | sd_pretrained_model_name_or_path: "runwayml/stable-diffusion-v1-5" 9 | 10 | load_finetuned: False 11 | load_pretrained: True 12 | # pretrained: "/export/share/dongxuli/zerobooth/500000-renamed/" 13 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP-Diffusion/blip-diffusion-openimage.tar.gz" 14 | 15 | controlnet_pretrained_model_name_or_path: "lllyasviel/sd-controlnet-hed" 16 | 17 | preprocess: 18 | vis_processor: 19 | train: 20 | name: "blip_diffusion_inp_image_eval" 21 | eval: 22 | name: "blip_diffusion_inp_image_eval" 23 | text_processor: 24 | train: 25 | name: "blip_caption" 26 | eval: 27 | name: "blip_caption" 28 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip2/blip2_caption_flant5xl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: caption_coco_flant5xl 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xl.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_caption_flant5xl.pth" 12 | 13 | # vit encoder 14 | image_size: 364 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp32" 18 | freeze_vit: False 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # T5 24 | t5_model: "google/flan-t5-xl" 25 | 26 | # generation configs 27 | prompt: "a photo of" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 364 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 364 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip2/blip2_caption_opt2.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: caption_coco_opt2.7b 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt2.7b.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_caption_opt2.7b.pth" 12 | 13 | # vit encoder 14 | image_size: 364 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp32" 18 | freeze_vit: False 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-2.7b" 25 | 26 | # generation configs 27 | prompt: "a photo of" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 364 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 364 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip2/blip2_caption_opt6.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
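The BLIP-2 captioning configs above (FLAN-T5 XL and OPT-2.7B) are usually reached through `load_model_and_preprocess` rather than loaded by path, and the `arch` values shown match the usual model_type keys. A sketch, assuming the stock registry names (`blip2_opt` / `caption_coco_opt2.7b`) still apply in this vendored copy, with a placeholder image path:

    import torch
    from PIL import Image
    from lavis.models import load_model_and_preprocess

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model, vis_processors, _ = load_model_and_preprocess(
        name="blip2_opt", model_type="caption_coco_opt2.7b", is_eval=True, device=device
    )
    raw_image = Image.open("/path/to/image.jpg").convert("RGB")    # placeholder path
    image = vis_processors["eval"](raw_image).unsqueeze(0).to(device)
    print(model.generate({"image": image}))                        # e.g. ['a photo of ...']
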
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: caption_coco_opt6.7b 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt6.7b.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_caption_opt6.7b.pth" 12 | 13 | # vit encoder 14 | image_size: 364 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp32" 18 | freeze_vit: False 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-6.7b" 25 | 26 | # generation configs 27 | prompt: "a photo of" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 364 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 364 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip2/blip2_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: coco 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_finetune_coco.pth" 12 | 13 | # vit encoder 14 | image_size: 364 15 | drop_path_rate: 0 16 | use_grad_checkpoint: True 17 | vit_precision: "fp32" 18 | freeze_vit: False 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 364 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 364 32 | text_processor: 33 | train: 34 | name: "blip_caption" 35 | eval: 36 | name: "blip_caption" 37 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip2/blip2_instruct_flant5xl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: flant5xl 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_flanxl_trimmed.pth" 12 | finetuned: "" 13 | 14 | # vit encoder 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # T5 25 | t5_model: "google/flan-t5-xl" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip2/blip2_instruct_flant5xxl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: flant5xxl 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_flanxxl_trimmed.pth" 12 | finetuned: "" 13 | 14 | # vit encoder 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # T5 25 | t5_model: "google/flan-t5-xxl" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip2/blip2_instruct_vicuna13b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: instruct_vicuna13b 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | # cd_comments: set right path for pretrained blip ckpt 12 | pretrained: "./checkpoints/instruct_blip_vicuna13b_trimmed.pth" 13 | finetuned: "" 14 | 15 | # vit encoder 16 | image_size: 224 17 | drop_path_rate: 0 18 | use_grad_checkpoint: False 19 | vit_precision: "fp16" 20 | freeze_vit: True 21 | 22 | # Q-Former 23 | num_query_token: 32 24 | 25 | # cd_comments: set right path for vicuna 26 | llm_model: "./checkpoints/vicuna-13b-v1.1" 27 | 28 | # generation configs 29 | prompt: "" 30 | 31 | 32 | preprocess: 33 | vis_processor: 34 | train: 35 | name: "blip2_image_train" 36 | image_size: 224 37 | eval: 38 | name: "blip_image_eval" 39 | image_size: 224 40 | text_processor: 41 | train: 42 | name: "blip_caption" 43 | eval: 44 | name: "blip_caption" 45 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: instruct_vicuna7b 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | # cd_comments: set right path for pretrained blip ckpt 12 | pretrained: "path/to/the/instruct_blip_vicuna7b_trimmed.pth" 13 | 14 | finetuned: "" 15 | 16 | # vit encoder 17 | image_size: 224 18 | drop_path_rate: 0 19 | use_grad_checkpoint: False 20 | vit_precision: "fp16" 21 | freeze_vit: True 22 | 23 | # Q-Former 24 | num_query_token: 32 25 | 26 | # cd_comments: set right path for vicuna 27 | llm_model: "path/checkpoints/vicuna-7b-v1.1" 28 | 29 | # generation configs 30 | prompt: "" 31 | 32 | 33 | preprocess: 34 | vis_processor: 35 | train: 36 | name: "blip2_image_train" 37 | image_size: 224 38 | eval: 39 | name: "blip_image_eval" 40 | image_size: 224 41 | text_processor: 42 | train: 43 | name: "blip_caption" 44 | eval: 45 | name: "blip_caption" 46 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip2/blip2_pretrain.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
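Both InstructBLIP configs above leave `pretrained` and `llm_model` to be pointed at local checkpoints, as their `cd_comments` lines ask. One way to fill them in programmatically, using placeholder paths rather than real locations:

    from omegaconf import OmegaConf

    cfg_path = "experiments/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml"
    cfg = OmegaConf.load(cfg_path)
    cfg.model.pretrained = "/path/to/instruct_blip_vicuna7b_trimmed.pth"   # placeholder
    cfg.model.llm_model = "/path/to/vicuna-7b-v1.1"                        # placeholder
    OmegaConf.save(cfg, cfg_path)
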
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 224 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 224 32 | text_processor: 33 | train: 34 | name: "blip_caption" 35 | eval: 36 | name: "blip_caption" 37 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip2/blip2_pretrain_flant5xl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_flant5xl 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xl.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # T5 24 | t5_model: "google/flan-t5-xl" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip2/blip2_pretrain_flant5xl_vitL.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_flant5xl 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xl_vitL.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | vit_model: "clip_L" 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # T5 25 | t5_model: "google/flan-t5-xl" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip2/blip2_pretrain_flant5xxl.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_flant5xxl 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_flant5xxl.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # T5 24 | t5_model: "google/flan-t5-xxl" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip2/blip2_pretrain_llama7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip2_llama 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # LLM 24 | llm_model: "/export/home/project/stanford_alpaca/llama_7B" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip2_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip2/blip2_pretrain_opt2.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_opt2.7b 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt2.7b.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-2.7b" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip2/blip2_pretrain_opt6.7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain_opt6.7b 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_opt6.7b.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | image_size: 224 15 | drop_path_rate: 0 16 | use_grad_checkpoint: False 17 | vit_precision: "fp16" 18 | freeze_vit: True 19 | 20 | # Q-Former 21 | num_query_token: 32 22 | 23 | # OPT 24 | opt_model: "facebook/opt-6.7b" 25 | 26 | # generation configs 27 | prompt: "" 28 | 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 224 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 224 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip2/blip2_pretrain_vitL.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: pretrain 8 | load_finetuned: False 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained_vitL.pth" 11 | finetuned: "" 12 | 13 | # vit encoder 14 | vit_model: "clip_L" 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | 25 | preprocess: 26 | vis_processor: 27 | train: 28 | name: "blip_image_train" 29 | image_size: 224 30 | eval: 31 | name: "blip_image_eval" 32 | image_size: 224 33 | text_processor: 34 | train: 35 | name: "blip_caption" 36 | eval: 37 | name: "blip_caption" 38 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip_caption_base_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_coco_caption_base.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | 18 | image_size: 384 19 | 20 | # bert config 21 | med_config_path: "configs/models/med_config.json" 22 | 23 | # generation configs 24 | prompt: "a picture of " 25 | 26 | 27 | preprocess: 28 | vis_processor: 29 | train: 30 | name: "blip_image_train" 31 | eval: 32 | name: "blip_image_eval" 33 | text_processor: 34 | train: 35 | name: "blip_caption" 36 | prompt: "a picture of " 37 | eval: 38 | name: "blip_caption" 39 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip_caption_large_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | load_finetuned: True 9 | 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large.pth" 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_caption.pth" 12 | 13 | vit_type: "large" 14 | vit_grad_ckpt: True 15 | vit_ckpt_layer: 5 16 | 17 | image_size: 384 18 | 19 | # bert config 20 | med_config_path: "configs/models/med_large_config.json" 21 | 22 | # generation configs 23 | prompt: "a picture of " 24 | 25 | 26 | preprocess: 27 | vis_processor: 28 | train: 29 | name: "blip_image_train" 30 | eval: 31 | name: "blip_image_eval" 32 | text_processor: 33 | train: 34 | name: "blip_caption" 35 | prompt: "a picture of " 36 | eval: 37 | name: "blip_caption" 38 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip_classification_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_classification 8 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth" 9 | 10 | use_distill: True 11 | momentum: 0.995 12 | alpha: 0.4 13 | 14 | # vit encoder 15 | vit_type: "base" 16 | vit_grad_ckpt: False 17 | vit_ckpt_layer: 0 18 | 19 | image_size: 384 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip_feature_extractor_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_pretrain 8 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 9 | 10 | # vit encoder 11 | vit_type: "base" 12 | vit_grad_ckpt: False 13 | vit_ckpt_layer: 0 14 | 15 | image_size: 224 16 | 17 | # bert config 18 | med_config_path: "configs/models/med_config.json" 19 | 20 | embed_dim: 256 21 | 22 | preprocess: 23 | vis_processor: 24 | eval: 25 | name: "blip_image_eval" 26 | image_size: 224 27 | text_processor: 28 | eval: 29 | name: "blip_caption" 30 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip_itm_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_image_text_matching 8 | 9 | load_finetuned: True 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_retrieval_coco.pth" 11 | 12 | # vit encoder 13 | vit_type: "base" 14 | vit_grad_ckpt: False 15 | vit_ckpt_layer: 0 16 | 17 | image_size: 384 18 | 19 | # bert config 20 | med_config_path: "configs/models/med_config.json" 21 | 22 | embed_dim: 256 23 | 24 | preprocess: 25 | vis_processor: 26 | eval: 27 | name: "blip_image_eval" 28 | image_size: 384 29 | text_processor: 30 | eval: 31 | name: "blip_caption" 32 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip_itm_large.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_image_text_matching 8 | 9 | load_finetuned: True 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_large_retrieval_coco.pth" 11 | 12 | # vit encoder 13 | vit_type: "large" 14 | vit_grad_ckpt: False 15 | vit_ckpt_layer: 0 16 | 17 | image_size: 384 18 | 19 | # bert config 20 | med_config_path: "configs/models/med_large_config.json" 21 | 22 | embed_dim: 256 23 | 24 | preprocess: 25 | vis_processor: 26 | eval: 27 | name: "blip_image_eval" 28 | image_size: 384 29 | text_processor: 30 | eval: 31 | name: "blip_caption" 32 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip_nlvr.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
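The feature-extractor and image-text-matching configs above back LAVIS's embedding interface. A sketch of multimodal feature extraction, assuming the stock registry names (`blip_feature_extractor` / `base`) and placeholder inputs:

    import torch
    from PIL import Image
    from lavis.models import load_model_and_preprocess

    device = "cuda" if torch.cuda.is_available() else "cpu"
    model, vis_processors, txt_processors = load_model_and_preprocess(
        name="blip_feature_extractor", model_type="base", is_eval=True, device=device
    )
    raw_image = Image.open("/path/to/image.jpg").convert("RGB")    # placeholder path
    sample = {
        "image": vis_processors["eval"](raw_image).unsqueeze(0).to(device),
        "text_input": [txt_processors["eval"]("a photo of a dog")],
    }
    features = model.extract_features(sample)    # joint image-text features
    print(features.multimodal_embeds.shape)
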
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_nlvr 8 | model_type: nlvr 9 | load_finetuned: True 10 | 11 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_nlvr.pth" 12 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 13 | 14 | num_classes: 2 15 | 16 | # vit encoder 17 | vit_type: "base" 18 | vit_grad_ckpt: False 19 | vit_ckpt_layer: 0 20 | vit_layer_norm_epsilon: 1e-6 21 | 22 | image_size: 384 23 | 24 | # bert config 25 | med_config_path: "configs/models/med_config.json" 26 | 27 | preprocess: 28 | vis_processor: 29 | train: 30 | name: "blip_image_train" 31 | image_size: 384 32 | eval: 33 | name: "blip_image_eval" 34 | image_size: 384 35 | text_processor: 36 | train: 37 | name: "blip_caption" 38 | eval: 39 | name: "blip_caption" 40 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip_pretrain_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_pretrain 8 | 9 | load_pretrained: True 10 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 11 | 12 | # vit encoder 13 | vit_type: "base" 14 | vit_grad_ckpt: False 15 | vit_ckpt_layer: 0 16 | 17 | image_size: 224 18 | alpha: 0.4 19 | 20 | # bert config 21 | med_config_path: "configs/models/bert_config.json" 22 | 23 | embed_dim: 256 24 | 25 | # generation configs 26 | prompt: "a picture of " 27 | 28 | preprocess: 29 | vis_processor: 30 | train: 31 | name: "blip_image_train" 32 | image_size: 224 33 | text_processor: 34 | train: 35 | name: "blip_caption" 36 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip_pretrain_large.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_pretrain 8 | 9 | # vit encoder 10 | vit_type: "large" 11 | vit_grad_ckpt: True 12 | vit_ckpt_layer: 5 13 | 14 | image_size: 224 15 | 16 | # bert config 17 | med_config_path: "configs/models/med_large_config.json" 18 | 19 | embed_dim: 256 20 | 21 | # generation configs 22 | prompt: "a picture of " 23 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip_retrieval_coco.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_retrieval 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_coco_retrieval.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 12 | 13 | queue_size: 57600 14 | 15 | # vit encoder 16 | vit_type: "base" 17 | vit_grad_ckpt: True 18 | vit_ckpt_layer: 4 19 | 20 | image_size: 384 21 | 22 | # bert config 23 | med_config_path: "configs/models/med_config.json" 24 | 25 | embed_dim: 256 26 | 27 | preprocess: 28 | vis_processor: 29 | train: 30 | name: "blip_image_train" 31 | image_size: 384 32 | eval: 33 | name: "blip_image_eval" 34 | image_size: 384 35 | text_processor: 36 | train: 37 | name: "blip_caption" 38 | eval: 39 | name: "blip_caption" 40 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip_retrieval_flickr.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_retrieval 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_flickr_retrieval.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 12 | 13 | queue_size: 57600 14 | alpha: 0.4 15 | 16 | negative_all_rank: False 17 | 18 | # vit encoder 19 | vit_type: "base" 20 | vit_grad_ckpt: True 21 | vit_ckpt_layer: 4 22 | 23 | image_size: 384 24 | 25 | # bert config 26 | med_config_path: "configs/models/med_config.json" 27 | 28 | embed_dim: 256 29 | 30 | preprocess: 31 | vis_processor: 32 | train: 33 | name: "blip_image_train" 34 | image_size: 384 35 | eval: 36 | name: "blip_image_eval" 37 | image_size: 384 38 | text_processor: 39 | train: 40 | name: "blip_caption" 41 | eval: 42 | name: "blip_caption" 43 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip_vqa_aokvqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_aokvqa.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | vit_drop_path_rate: 0.1 18 | 19 | image_size: 480 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 480 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 480 32 | text_processor: 33 | train: 34 | name: "blip_question" 35 | eval: 36 | name: "blip_question" 37 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip_vqa_okvqa.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP/blip_okvqa.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | vit_drop_path_rate: 0.1 18 | 19 | image_size: 480 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 480 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 480 32 | text_processor: 33 | train: 34 | name: "blip_question" 35 | eval: 36 | name: "blip_question" 37 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/blip_vqav2.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
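The VQA configs in this directory (vqav2 / okvqa / aokvqa) are used through predict_answers. A sketch, assuming upstream LAVIS behaviour; "generate" decodes free-form answers, while "rank" scores a fixed candidate list as the project eval configs further below do:

import torch
from PIL import Image
from lavis.models import load_model_and_preprocess

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# "vqav2" / "okvqa" / "aokvqa" select among the blip_vqa*.yaml files above
model, vis_processors, txt_processors = load_model_and_preprocess(
    name="blip_vqa", model_type="vqav2", is_eval=True, device=device
)
raw_image = Image.open("example.jpg").convert("RGB")   # placeholder path
image = vis_processors["eval"](raw_image).unsqueeze(0).to(device)
question = txt_processors["eval"]("What is the dog doing?")
answers = model.predict_answers(
    samples={"image": image, "text_input": question},
    inference_method="generate",
)
print(answers)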
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | load_finetuned: True 9 | 10 | finetuned: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_vqa_capfilt_large.pth" 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 12 | 13 | # vit encoder 14 | vit_type: "base" 15 | vit_grad_ckpt: False 16 | vit_ckpt_layer: 0 17 | vit_drop_path_rate: 0.1 18 | 19 | image_size: 480 20 | 21 | # bert config 22 | med_config_path: "configs/models/med_config.json" 23 | 24 | preprocess: 25 | vis_processor: 26 | train: 27 | name: "blip_image_train" 28 | image_size: 480 29 | eval: 30 | name: "blip_image_eval" 31 | image_size: 480 32 | text_processor: 33 | train: 34 | name: "blip_question" 35 | eval: 36 | name: "blip_question" 37 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/RN101-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 23, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/RN101.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 23, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/RN50-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": [ 7 | 3, 8 | 4, 9 | 6, 10 | 3 11 | ], 12 | "width": 64, 13 | "patch_size": null 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 512, 19 | "heads": 8, 20 | "layers": 12 21 | } 22 | } 23 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/RN50.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": [ 6 | 3, 7 | 4, 8 | 6, 9 | 3 10 | ], 11 | "width": 64, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 512, 18 | "heads": 8, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/RN50x16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 384, 5 | "layers": [ 6 | 6, 7 | 8, 8 | 18, 9 | 8 10 | ], 11 | "width": 96, 12 | "patch_size": null 13 | }, 14 | 
"text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 768, 18 | "heads": 12, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/RN50x4.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 288, 5 | "layers": [ 6 | 4, 7 | 6, 8 | 10, 9 | 6 10 | ], 11 | "width": 80, 12 | "patch_size": null 13 | }, 14 | "text_cfg": { 15 | "context_length": 77, 16 | "vocab_size": 49408, 17 | "width": 640, 18 | "heads": 10, 19 | "layers": 12 20 | } 21 | } 22 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/ViT-B-16-plus-240.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 240, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/ViT-B-16-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/ViT-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/ViT-B-32-plus-256.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 640, 3 | "vision_cfg": { 4 | "image_size": 256, 5 | "layers": 12, 6 | "width": 896, 7 | "patch_size": 32 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 640, 13 | "heads": 10, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/ViT-B-32-quickgelu.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "quick_gelu": true, 4 | "vision_cfg": { 5 | "image_size": 224, 6 | "layers": 12, 7 | "width": 768, 8 | "patch_size": 32 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/ViT-B-32.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 32 
8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 512, 13 | "heads": 8, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/ViT-H-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 14 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/ViT-H-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 1280, 7 | "head_width": 80, 8 | "patch_size": 16 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 1024, 14 | "heads": 16, 15 | "layers": 24 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/ViT-L-14-280.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 280, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/ViT-L-14-336.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 336, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/ViT-L-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 14 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/ViT-L-16-320.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 320, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 16 8 | }, 9 | "text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/ViT-L-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "patch_size": 16 8 | }, 9 | 
"text_cfg": { 10 | "context_length": 77, 11 | "vocab_size": 49408, 12 | "width": 768, 13 | "heads": 12, 14 | "layers": 12 15 | } 16 | } 17 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/ViT-g-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 40, 6 | "width": 1408, 7 | "head_width": 88, 8 | "mlp_ratio": 4.3637, 9 | "patch_size": 14 10 | }, 11 | "text_cfg": { 12 | "context_length": 77, 13 | "vocab_size": 49408, 14 | "width": 1024, 15 | "heads": 16, 16 | "layers": 24 17 | } 18 | } 19 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/timm-efficientnetv2_rw_s.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "timm_model_name": "efficientnetv2_rw_s", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 288 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 768, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/timm-resnet50d.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "resnet50d", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/timm-resnetaa50d.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "resnetaa50d", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/timm-resnetblur50.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "timm_model_name": "resnetblur50", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "abs_attn", 7 | "timm_proj": "", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/timm-swin_base_patch4_window7_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "swin_base_patch4_window7_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 
| "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/timm-vit_base_patch16_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_base_patch16_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/timm-vit_base_patch32_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_base_patch32_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip/timm-vit_small_patch16_224.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "timm_model_name": "vit_small_patch16_224", 5 | "timm_model_pretrained": false, 6 | "timm_pool": "", 7 | "timm_proj": "linear", 8 | "image_size": 224 9 | }, 10 | "text_cfg": { 11 | "context_length": 77, 12 | "vocab_size": 49408, 13 | "width": 512, 14 | "heads": 8, 15 | "layers": 12 16 | } 17 | } 18 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip_resnet50.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: RN50 10 | 11 | pretrained: openai 12 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/clip_vit_base16.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-B-16 10 | 11 | pretrained: openai 12 | 13 | preprocess: 14 | vis_processor: 15 | eval: 16 | name: "clip_image_eval" 17 | image_size: 224 18 | -------------------------------------------------------------------------------- /experiments/lavis/configs/models/gpt_dialogue_base.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: gpt_dialogue 8 | # pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth" 9 | # pretrained: "https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_capfilt_large.pth" 10 | 11 | len_tokenizer: 50264 # 50257 tokens from gpt2 default tokenizer + additional special tokens 12 | 13 | len_video_ft: 4224 # i3d_rgb: 2048 i3d_flow: 2048 vggish: 128 14 | 15 | preprocess: 16 | vis_processor: 17 | train: 18 | name: "gpt_video_ft" 19 | eval: 20 | name: "gpt_video_ft" 21 | text_processor: 22 | train: 23 | name: "gpt_dialogue" 24 | eval: 25 | name: "gpt_dialogue" -------------------------------------------------------------------------------- /experiments/lavis/configs/models/med_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30524, 19 | "encoder_width": 768, 20 | "add_cross_attention": true 21 | } -------------------------------------------------------------------------------- /experiments/lavis/configs/models/med_config_albef.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30522, 19 | "encoder_width": 768, 20 | "add_cross_attention": true, 21 | "fusion_layer": 6 22 | } -------------------------------------------------------------------------------- /experiments/lavis/configs/models/med_large_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "architectures": [ 3 | "BertModel" 4 | ], 5 | "attention_probs_dropout_prob": 0.1, 6 | "hidden_act": "gelu", 7 | "hidden_dropout_prob": 0.1, 8 | "hidden_size": 768, 9 | "initializer_range": 0.02, 10 | "intermediate_size": 3072, 11 | "layer_norm_eps": 1e-12, 12 | "max_position_embeddings": 512, 13 | "model_type": "bert", 14 | "num_attention_heads": 12, 15 | "num_hidden_layers": 12, 16 | "pad_token_id": 0, 17 | "add_type_embeddings": false, 18 | "vocab_size": 30524, 19 | "encoder_width": 1024, 20 | "add_cross_attention": true 21 | } -------------------------------------------------------------------------------- /experiments/lavis/datasets/builders/classification_builder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.common.registry import registry 9 | from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder 10 | from lavis.datasets.datasets.nlvr_datasets import NLVRDataset, NLVREvalDataset 11 | from lavis.datasets.datasets.snli_ve_datasets import SNLIVisualEntialmentDataset 12 | 13 | 14 | @registry.register_builder("nlvr") 15 | class NLVRBuilder(BaseDatasetBuilder): 16 | train_dataset_cls = NLVRDataset 17 | eval_dataset_cls = NLVREvalDataset 18 | 19 | DATASET_CONFIG_DICT = {"default": "configs/datasets/nlvr/defaults.yaml"} 20 | 21 | 22 | @registry.register_builder("snli_ve") 23 | class SNLIVisualEntailmentBuilder(BaseDatasetBuilder): 24 | train_dataset_cls = SNLIVisualEntialmentDataset 25 | eval_dataset_cls = SNLIVisualEntialmentDataset 26 | 27 | DATASET_CONFIG_DICT = {"default": "configs/datasets/snli_ve/defaults.yaml"} 28 | -------------------------------------------------------------------------------- /experiments/lavis/datasets/builders/dialogue_builder.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.common.registry import registry 9 | from lavis.datasets.builders.base_dataset_builder import BaseDatasetBuilder 10 | from lavis.datasets.datasets.avsd_dialogue_datasets import ( 11 | AVSDDialDataset, 12 | AVSDDialEvalDataset, 13 | ) 14 | 15 | 16 | @registry.register_builder("avsd_dialogue") 17 | class AVSDDialBuilder(BaseDatasetBuilder): 18 | train_dataset_cls = AVSDDialDataset 19 | eval_dataset_cls = AVSDDialEvalDataset 20 | 21 | DATASET_CONFIG_DICT = {"default": "configs/datasets/avsd/defaults_dial.yaml"} 22 | -------------------------------------------------------------------------------- /experiments/lavis/datasets/datasets/multimodal_classification_datasets.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from abc import abstractmethod 9 | from lavis.datasets.datasets.base_dataset import BaseDataset 10 | 11 | 12 | class MultimodalClassificationDataset(BaseDataset): 13 | def __init__(self, vis_processor, text_processor, vis_root, ann_paths): 14 | super().__init__(vis_processor, text_processor, vis_root, ann_paths) 15 | 16 | self.class_labels = None 17 | 18 | @abstractmethod 19 | def _build_class_labels(self): 20 | pass 21 | -------------------------------------------------------------------------------- /experiments/lavis/datasets/datasets/vg_vqa_datasets.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
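Builders registered this way ("nlvr", "snli_ve", "avsd_dialogue") are reached through the registry or the load_dataset helper. A sketch, assuming upstream LAVIS behaviour and that the NLVR2 images and annotations are already downloaded to the configured paths:

from lavis.common.registry import registry
from lavis.datasets.builders import load_dataset

# look up the class installed by @registry.register_builder("nlvr")
builder_cls = registry.get_builder_class("nlvr")
print(builder_cls.DATASET_CONFIG_DICT)   # {'default': 'configs/datasets/nlvr/defaults.yaml'}

# or build the splits directly (requires the dataset on disk)
nlvr = load_dataset("nlvr")
print(nlvr.keys())        # typically dict_keys(['train', 'val', 'test'])
print(len(nlvr["train"]))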
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import os 9 | 10 | from PIL import Image 11 | 12 | from lavis.datasets.datasets.vqa_datasets import VQADataset 13 | 14 | 15 | class VGVQADataset(VQADataset): 16 | def __init__(self, vis_processor, text_processor, vis_root, ann_paths): 17 | super().__init__(vis_processor, text_processor, vis_root, ann_paths) 18 | 19 | def __getitem__(self, index): 20 | ann = self.annotation[index] 21 | 22 | image_path = os.path.join(self.vis_root, ann["image"]) 23 | image = Image.open(image_path).convert("RGB") 24 | 25 | image = self.vis_processor(image) 26 | question = self.text_processor(ann["question"]) 27 | 28 | answers = [ann["answer"]] 29 | # TODO this should be configured better 30 | weights = [0.2] 31 | 32 | return { 33 | "image": image, 34 | "text_input": question, 35 | "answers": answers, 36 | "weights": weights, 37 | } 38 | -------------------------------------------------------------------------------- /experiments/lavis/models/blip2_models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/experiments/lavis/models/blip2_models/__init__.py -------------------------------------------------------------------------------- /experiments/lavis/models/blip_diffusion_models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/experiments/lavis/models/blip_diffusion_models/__init__.py -------------------------------------------------------------------------------- /experiments/lavis/models/clip_models/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | 7 | Based on https://github.com/mlfoundations/open_clip 8 | """ 9 | 10 | """ OpenAI pretrained model functions 11 | Adapted from https://github.com/mlfoundations/open_clip and https://github.com/openai/CLIP. 12 | 13 | Originally MIT License, Copyright (c) 2021 OpenAI. 14 | """ 15 | -------------------------------------------------------------------------------- /experiments/lavis/models/clip_models/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/experiments/lavis/models/clip_models/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /experiments/lavis/models/clip_models/pics/CLIP.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/experiments/lavis/models/clip_models/pics/CLIP.png -------------------------------------------------------------------------------- /experiments/lavis/models/img2prompt_models/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import torch 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /experiments/lavis/models/timesformer/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | 7 | Based on https://github.com/facebookresearch/TimeSformer 8 | """ 9 | -------------------------------------------------------------------------------- /experiments/lavis/models/timesformer/linear.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | """ Linear layer (alternate definition) 9 | """ 10 | import torch 11 | import torch.nn.functional as F 12 | from torch import nn as nn 13 | 14 | 15 | class Linear(nn.Linear): 16 | def forward(self, input: torch.Tensor) -> torch.Tensor: 17 | if torch.jit.is_scripting(): 18 | bias = self.bias.to(dtype=input.dtype) if self.bias is not None else None 19 | return F.linear(input, self.weight.to(dtype=input.dtype), bias=bias) 20 | else: 21 | return F.linear(input, self.weight, self.bias) 22 | -------------------------------------------------------------------------------- /experiments/lavis/processors/base_processor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from omegaconf import OmegaConf 9 | 10 | 11 | class BaseProcessor: 12 | def __init__(self): 13 | self.transform = lambda x: x 14 | return 15 | 16 | def __call__(self, item): 17 | return self.transform(item) 18 | 19 | @classmethod 20 | def from_config(cls, cfg=None): 21 | return cls() 22 | 23 | def build(self, **kwargs): 24 | cfg = OmegaConf.create(kwargs) 25 | 26 | return self.from_config(cfg) 27 | -------------------------------------------------------------------------------- /experiments/lavis/projects/albef/eval/nlvr_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_nlvr 8 | model_type: nlvr 9 | 10 | datasets: 11 | nlvr: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | image_size: 384 16 | text_processor: 17 | eval: 18 | name: "blip_caption" 19 | 20 | run: 21 | task: multimodal_classification 22 | 23 | batch_size_train: 16 24 | batch_size_eval: 64 25 | num_workers: 4 26 | 27 | seed: 42 28 | output_dir: "output/ALBEF/NLVR" 29 | 30 | evaluate: True 31 | test_splits: ["val", "test"] 32 | 33 | # distribution-specific 34 | device: "cuda" 35 | world_size: 1 36 | dist_url: "env://" 37 | distributed: True 38 | -------------------------------------------------------------------------------- /experiments/lavis/projects/albef/eval/ret_coco_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_retrieval 8 | model_type: coco 9 | 10 | datasets: 11 | coco_retrieval: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | image_size: 384 16 | text_processor: 17 | eval: 18 | name: "blip_caption" 19 | 20 | run: 21 | task: retrieval 22 | 23 | # dataloading 24 | num_workers: 4 25 | batch_size_train: 32 26 | batch_size_eval: 64 27 | 28 | test_splits: ["test"] 29 | 30 | # distribution 31 | device: "cuda" 32 | world_size: 1 33 | dist_url: "env://" 34 | distributed: True 35 | use_dist_eval_sampler: False 36 | 37 | # model specific 38 | k_test: 128 39 | 40 | # misc 41 | seed: 42 42 | output_dir: "output/ALBEF/Retrieval_COCO" 43 | 44 | evaluate: True 45 | -------------------------------------------------------------------------------- /experiments/lavis/projects/albef/eval/ret_flickr30k_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_retrieval 8 | model_type: flickr 9 | 10 | datasets: 11 | flickr30k: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | image_size: 384 16 | text_processor: 17 | eval: 18 | name: "blip_caption" 19 | 20 | run: 21 | task: retrieval 22 | 23 | # dataloading 24 | num_workers: 4 25 | batch_size_train: 32 26 | batch_size_eval: 64 27 | 28 | test_splits: ["test"] 29 | 30 | # distribution 31 | device: "cuda" 32 | world_size: 1 33 | dist_url: "env://" 34 | distributed: True 35 | use_dist_eval_sampler: False 36 | 37 | # model specific 38 | k_test: 128 39 | 40 | # misc 41 | seed: 42 42 | output_dir: "output/ALBEF/Retrieval_Flickr30k" 43 | 44 | evaluate: True 45 | -------------------------------------------------------------------------------- /experiments/lavis/projects/albef/eval/snli_ve_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
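These project files bundle model, dataset, and run settings into one experiment description. They are plain OmegaConf YAML, so they can be inspected or overridden before being handed to a runner script (upstream LAVIS consumes them via evaluate.py --cfg-path). A minimal sketch; the path below is an assumption about where the file sits in a checkout of this repo:

from omegaconf import OmegaConf

cfg = OmegaConf.load("experiments/lavis/projects/albef/eval/nlvr_eval.yaml")
print(cfg.model.arch)            # albef_nlvr
print(cfg.run.task)              # multimodal_classification
print(cfg.run.batch_size_eval)   # 64

# dot-list overrides merge the same way runner scripts usually apply --options
cfg = OmegaConf.merge(cfg, OmegaConf.from_dotlist(["run.batch_size_eval=32"]))
print(OmegaConf.to_yaml(cfg.run))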
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_classification 8 | model_type: ve 9 | 10 | datasets: 11 | snli_ve: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | text_processor: 16 | eval: 17 | name: "blip_caption" 18 | 19 | run: 20 | task: multimodal_classification 21 | # optimization-specific 22 | batch_size_train: 32 23 | batch_size_eval: 64 24 | num_workers: 4 25 | 26 | seed: 42 27 | output_dir: "output/ALBEF/SNLI_VE" 28 | 29 | evaluate: True 30 | test_splits: ["val", "test"] 31 | 32 | # distribution-specific 33 | device: "cuda" 34 | world_size: 1 35 | dist_url: "env://" 36 | distributed: True 37 | -------------------------------------------------------------------------------- /experiments/lavis/projects/albef/eval/vqa_test.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_vqa 8 | model_type: vqav2 9 | 10 | image_size: 384 11 | 12 | 13 | datasets: 14 | coco_vqa: # name of the dataset builder 15 | vis_processor: 16 | eval: 17 | name: "blip_image_eval" 18 | image_size: 384 19 | text_processor: 20 | eval: 21 | name: "blip_question" 22 | 23 | run: 24 | task: vqa 25 | 26 | # optimization-specific 27 | batch_size_train: 16 28 | batch_size_eval: 64 29 | num_workers: 4 30 | 31 | # inference-specific 32 | max_len: 10 33 | min_len: 1 34 | num_beams: 3 35 | num_ans_candidates: 128 36 | inference_method: "rank" 37 | 38 | seed: 42 39 | output_dir: "output/ALBEF/VQA" 40 | 41 | evaluate: True 42 | train_splits: ["train"] 43 | test_splits: ["test"] 44 | 45 | # distribution-specific 46 | device: "cuda" 47 | world_size: 1 48 | dist_url: "env://" 49 | distributed: True 50 | -------------------------------------------------------------------------------- /experiments/lavis/projects/albef/eval/vqa_val.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: albef_vqa 8 | model_type: vqav2 9 | 10 | image_size: 384 11 | 12 | datasets: 13 | coco_vqa: # name of the dataset builder 14 | type: eval 15 | vis_processor: 16 | eval: 17 | name: "blip_image_eval" 18 | image_size: 384 19 | text_processor: 20 | eval: 21 | name: "blip_question" 22 | 23 | run: 24 | task: vqa 25 | 26 | # optimization-specific 27 | batch_size_train: 16 28 | batch_size_eval: 64 29 | num_workers: 4 30 | 31 | # inference-specific 32 | max_len: 10 33 | min_len: 1 34 | num_beams: 3 35 | num_ans_candidates: 128 36 | inference_method: "rank" 37 | 38 | seed: 42 39 | output_dir: "output/ALBEF/VQA" 40 | 41 | evaluate: True 42 | test_splits: ["val"] 43 | 44 | # distribution-specific 45 | device: "cuda" 46 | world_size: 1 47 | dist_url: "env://" 48 | distributed: True 49 | -------------------------------------------------------------------------------- /experiments/lavis/projects/alpro/eval/didemo_ret_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_retrieval 8 | model_type: didemo 9 | 10 | max_txt_len: 50 11 | 12 | timesformer: 13 | n_frms: 8 14 | image_size: 224 15 | 16 | 17 | datasets: 18 | didemo_retrieval: # name of the dataset builder 19 | vis_processor: 20 | eval: 21 | name: "alpro_video_eval" 22 | n_frms: 8 23 | image_size: 224 24 | text_processor: 25 | eval: 26 | name: "blip_caption" 27 | 28 | run: 29 | task: retrieval 30 | # optimization-specific 31 | batch_size_train: 8 32 | batch_size_eval: 64 33 | num_workers: 4 34 | 35 | # k_test: 256 36 | k_test: 1000 37 | 38 | seed: 42 39 | output_dir: "output/ALPRO/didemo_retrieval" 40 | 41 | evaluate: True 42 | train_splits: ["train"] 43 | valid_splits: ["val", "test"] 44 | test_splits: ["test"] 45 | 46 | # distribution-specific 47 | device: "cuda" 48 | world_size: 1 49 | dist_url: "env://" 50 | distributed: True 51 | use_dist_eval_sampler: False 52 | -------------------------------------------------------------------------------- /experiments/lavis/projects/alpro/eval/msrvtt_qa_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_qa 8 | model_type: msrvtt 9 | 10 | datasets: 11 | msrvtt_qa: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "alpro_video_eval" 15 | n_frms: 16 16 | image_size: 224 17 | text_processor: 18 | eval: 19 | name: "blip_caption" 20 | 21 | run: 22 | task: multimodal_classification 23 | # optimization-specific 24 | batch_size_train: 32 25 | batch_size_eval: 64 26 | num_workers: 4 27 | 28 | seed: 42 29 | output_dir: "output/ALPRO/msrvtt_qa" 30 | 31 | evaluate: True 32 | valid_splits: ["val"] 33 | test_splits: ["test"] 34 | 35 | # distribution-specific 36 | device: "cuda" 37 | world_size: 1 38 | dist_url: "env://" 39 | distributed: True 40 | -------------------------------------------------------------------------------- /experiments/lavis/projects/alpro/eval/msrvtt_ret_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_retrieval 8 | model_type: msrvtt 9 | 10 | datasets: 11 | msrvtt_retrieval: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "alpro_video_eval" 15 | n_frms: 8 16 | image_size: 224 17 | text_processor: 18 | eval: 19 | name: "blip_caption" 20 | 21 | run: 22 | task: retrieval 23 | # optimization-specific 24 | batch_size_train: 24 25 | batch_size_eval: 64 26 | num_workers: 4 27 | 28 | # k_test: 256 29 | k_test: 1000 30 | 31 | seed: 42 32 | output_dir: "output/ALPRO/msrvtt_retrieval" 33 | 34 | evaluate: True 35 | test_splits: ["test"] 36 | 37 | # distribution-specific 38 | device: "cuda" 39 | world_size: 1 40 | dist_url: "env://" 41 | distributed: True 42 | use_dist_eval_sampler: False 43 | -------------------------------------------------------------------------------- /experiments/lavis/projects/alpro/eval/msvd_qa_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: alpro_qa 8 | model_type: msvd 9 | 10 | datasets: 11 | msvd_qa: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "alpro_video_eval" 15 | n_frms: 16 16 | image_size: 224 17 | text_processor: 18 | train: 19 | name: "blip_caption" 20 | eval: 21 | name: "blip_caption" 22 | 23 | run: 24 | task: multimodal_classification 25 | # optimization-specific 26 | batch_size_train: 24 27 | batch_size_eval: 64 28 | num_workers: 4 29 | 30 | seed: 42 31 | output_dir: "output/ALPRO/msvd_qa" 32 | 33 | evaluate: True 34 | test_splits: ["test"] 35 | 36 | # distribution-specific 37 | device: "cuda" 38 | world_size: 1 39 | dist_url: "env://" 40 | distributed: True 41 | -------------------------------------------------------------------------------- /experiments/lavis/projects/blip/eval/aokvqa_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | model_type: aokvqa 9 | image_size: 480 10 | 11 | datasets: 12 | aok_vqa: # name of the dataset builder 13 | vis_processor: 14 | eval: 15 | name: "blip_image_eval" 16 | image_size: 480 17 | text_processor: 18 | eval: 19 | name: "blip_question" 20 | 21 | run: 22 | task: aok_vqa 23 | # optimization-specific 24 | batch_size_train: 64 25 | batch_size_eval: 64 26 | num_workers: 4 27 | 28 | # inference-specific 29 | max_len: 10 30 | min_len: 1 31 | num_beams: 3 32 | num_ans_candidates: 128 33 | inference_method: "rank" 34 | 35 | seed: 42 36 | output_dir: "output/BLIP/AOKVQA" 37 | 38 | evaluate: True 39 | test_splits: ["val", "test"] 40 | 41 | # distribution-specific 42 | device: "cuda" 43 | world_size: 1 44 | dist_url: "env://" 45 | distributed: True 46 | -------------------------------------------------------------------------------- /experiments/lavis/projects/blip/eval/caption_coco_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | model_type: base_coco 9 | 10 | datasets: 11 | coco_caption: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | text_processor: 16 | eval: 17 | name: "blip_caption" 18 | 19 | run: 20 | # task: retrieval 21 | task: captioning 22 | # optimizer 23 | batch_size_train: 32 24 | batch_size_eval: 64 25 | num_workers: 4 26 | 27 | max_len: 20 28 | min_len: 5 29 | num_beams: 3 30 | 31 | seed: 42 32 | output_dir: "output/BLIP/Caption_coco" 33 | 34 | evaluate: True 35 | test_splits: ["test"] 36 | 37 | device: "cuda" 38 | world_size: 1 39 | dist_url: "env://" 40 | distributed: True 41 | -------------------------------------------------------------------------------- /experiments/lavis/projects/blip/eval/caption_coco_eval_large.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | model_type: large_coco 9 | 10 | datasets: 11 | coco_caption: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | text_processor: 16 | eval: 17 | name: "blip_caption" 18 | 19 | run: 20 | # task: retrieval 21 | task: captioning 22 | # optimizer 23 | batch_size_train: 32 24 | batch_size_eval: 64 25 | num_workers: 4 26 | 27 | max_len: 20 28 | min_len: 5 29 | num_beams: 3 30 | 31 | seed: 42 32 | output_dir: "output/BLIP/Caption_coco" 33 | 34 | evaluate: True 35 | test_splits: ["test"] 36 | 37 | device: "cuda" 38 | world_size: 1 39 | dist_url: "env://" 40 | distributed: True 41 | -------------------------------------------------------------------------------- /experiments/lavis/projects/blip/eval/nlvr_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_nlvr 8 | model_type: nlvr 9 | 10 | datasets: 11 | nlvr: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | image_size: 384 16 | text_processor: 17 | eval: 18 | name: "blip_caption" 19 | 20 | run: 21 | task: multimodal_classification 22 | 23 | batch_size_train: 16 24 | batch_size_eval: 64 25 | num_workers: 4 26 | 27 | seed: 42 28 | output_dir: "output/BLIP/NLVR" 29 | 30 | evaluate: True 31 | test_splits: ["val", "test"] 32 | 33 | # distribution-specific 34 | device: "cuda" 35 | world_size: 1 36 | dist_url: "env://" 37 | distributed: True 38 | -------------------------------------------------------------------------------- /experiments/lavis/projects/blip/eval/nocaps_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_caption 8 | model_type: base_coco 9 | # pretrained: 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/models/model_base_caption_capfilt_large.pth' 10 | 11 | datasets: 12 | nocaps: # name of the dataset builder 13 | vis_processor: 14 | eval: 15 | name: "blip_image_eval" 16 | image_size: 384 17 | text_processor: 18 | eval: 19 | name: "blip_caption" 20 | prompt: "a picture of " 21 | 22 | run: 23 | # task: retrieval 24 | task: captioning 25 | # optimizer 26 | batch_size_train: 32 27 | batch_size_eval: 64 28 | num_workers: 4 29 | 30 | max_len: 20 31 | min_len: 5 32 | num_beams: 3 33 | 34 | seed: 42 35 | output_dir: "output/BLIP/NoCaps" 36 | 37 | evaluate: True 38 | test_splits: ["val", "test"] 39 | 40 | device: "cuda" 41 | world_size: 1 42 | dist_url: "env://" 43 | distributed: True 44 | 45 | report_metric: False 46 | -------------------------------------------------------------------------------- /experiments/lavis/projects/blip/eval/okvqa_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | model_type: okvqa 9 | image_size: 480 10 | 11 | datasets: 12 | ok_vqa: # name of the dataset builder 13 | vis_processor: 14 | eval: 15 | name: "blip_image_eval" 16 | image_size: 480 17 | text_processor: 18 | eval: 19 | name: "blip_question" 20 | 21 | run: 22 | task: vqa 23 | # optimization-specific 24 | batch_size_train: 16 25 | batch_size_eval: 16 26 | num_workers: 4 27 | 28 | # inference-specific 29 | max_len: 10 30 | min_len: 1 31 | num_beams: 3 32 | num_ans_candidates: 128 33 | inference_method: "rank" 34 | 35 | seed: 42 36 | output_dir: "output/BLIP/OKVQA" 37 | 38 | evaluate: True 39 | test_splits: ["test"] 40 | 41 | # distribution-specific 42 | device: "cuda" 43 | world_size: 1 44 | dist_url: "env://" 45 | distributed: True 46 | -------------------------------------------------------------------------------- /experiments/lavis/projects/blip/eval/ret_coco_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_retrieval 8 | model_type: coco 9 | 10 | datasets: 11 | coco_retrieval: # name of the dataset builder 12 | vis_processor: 13 | train: 14 | name: "blip_image_train" 15 | image_size: 384 16 | eval: 17 | name: "blip_image_eval" 18 | image_size: 384 19 | text_processor: 20 | train: 21 | name: "blip_caption" 22 | eval: 23 | name: "blip_caption" 24 | 25 | run: 26 | task: retrieval 27 | 28 | # dataloading 29 | num_workers: 4 30 | batch_size_train: 32 31 | batch_size_eval: 128 32 | 33 | train_splits: ["train"] 34 | valid_splits: ["val"] 35 | test_splits: ["test"] 36 | 37 | # distribution 38 | device: "cuda" 39 | world_size: 1 40 | dist_url: "env://" 41 | distributed: True 42 | use_dist_eval_sampler: False 43 | 44 | # model specific 45 | k_test: 256 46 | 47 | # misc 48 | seed: 42 49 | output_dir: "output/BLIP/Retrieval_COCO" 50 | 51 | evaluate: True 52 | -------------------------------------------------------------------------------- /experiments/lavis/projects/blip/eval/ret_flickr_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_retrieval 8 | model_type: flickr 9 | 10 | datasets: 11 | flickr30k: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "blip_image_eval" 15 | image_size: 384 16 | text_processor: 17 | eval: 18 | name: "blip_caption" 19 | 20 | run: 21 | task: retrieval 22 | 23 | # dataloading 24 | num_workers: 4 25 | batch_size_train: 32 26 | batch_size_eval: 64 27 | 28 | test_splits: ["test"] 29 | 30 | # distribution 31 | device: "cuda" 32 | world_size: 1 33 | dist_url: "env://" 34 | distributed: True 35 | use_dist_eval_sampler: False 36 | 37 | # model specific 38 | k_test: 128 39 | 40 | # misc 41 | seed: 42 42 | output_dir: "output/Retrieval_Flickr30k" 43 | 44 | evaluate: True 45 | -------------------------------------------------------------------------------- /experiments/lavis/projects/blip/eval/vqav2_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip_vqa 8 | model_type: vqav2 9 | image_size: 480 10 | 11 | datasets: 12 | coco_vqa: # name of the dataset builder 13 | type: eval 14 | vis_processor: 15 | eval: 16 | name: "blip_image_eval" 17 | image_size: 480 18 | text_processor: 19 | eval: 20 | name: "blip_question" 21 | 22 | run: 23 | task: vqa 24 | # optimization-specific 25 | batch_size_train: 16 26 | batch_size_eval: 64 27 | num_workers: 4 28 | 29 | # inference-specific 30 | max_len: 10 31 | min_len: 1 32 | num_beams: 3 33 | num_ans_candidates: 128 34 | inference_method: "rank" 35 | 36 | seed: 42 37 | output_dir: "output/BLIP/VQA" 38 | 39 | evaluate: True 40 | test_splits: ["val"] 41 | 42 | # distribution-specific 43 | device: "cuda" 44 | world_size: 1 45 | dist_url: "env://" 46 | distributed: True 47 | -------------------------------------------------------------------------------- /experiments/lavis/projects/blip2/eval/caption_coco_flant5xl_eval.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/sangminwoo/AvisC/772eba499dcbda0eaaf844fda2fdf5057dcd0175/experiments/lavis/projects/blip2/eval/caption_coco_flant5xl_eval.yaml -------------------------------------------------------------------------------- /experiments/lavis/projects/blip2/eval/ret_flickr_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: blip2 8 | model_type: coco 9 | use_grad_checkpoint: False 10 | 11 | datasets: 12 | flickr30k: # name of the dataset builder 13 | vis_processor: 14 | eval: 15 | name: "blip_image_eval" 16 | image_size: 364 17 | text_processor: 18 | eval: 19 | name: "blip_caption" 20 | 21 | run: 22 | task: retrieval 23 | 24 | # dataloading 25 | num_workers: 4 26 | batch_size_train: 16 27 | batch_size_eval: 32 28 | 29 | test_splits: ["test"] 30 | 31 | # distribution 32 | device: "cuda" 33 | world_size: 1 34 | dist_url: "env://" 35 | distributed: True 36 | use_dist_eval_sampler: False 37 | 38 | # model specific 39 | k_test: 128 40 | 41 | # misc 42 | seed: 42 43 | output_dir: "output/BLIP2/Retrieval_Flickr30k" 44 | 45 | evaluate: True -------------------------------------------------------------------------------- /experiments/lavis/projects/clip/exp_coco_ret_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-L-14-336 10 | 11 | datasets: 12 | coco_retrieval: # name of the dataset builder 13 | vis_processor: 14 | train: 15 | name: "clip_image_train" 16 | image_size: 336 17 | eval: 18 | name: "clip_image_eval" 19 | image_size: 336 20 | text_processor: 21 | train: 22 | name: "blip_caption" 23 | eval: 24 | name: "blip_caption" 25 | 26 | run: 27 | task: retrieval 28 | 29 | # dataloading 30 | num_workers: 4 31 | batch_size_train: 32 32 | batch_size_eval: 128 33 | 34 | test_splits: ["test"] 35 | 36 | # distribution 37 | device: "cuda" 38 | world_size: 1 39 | dist_url: "env://" 40 | distributed: True 41 | use_dist_eval_sampler: True 42 | 43 | # misc 44 | seed: 42 45 | output_dir: "output/clip/Retrieval_COCO" 46 | 47 | evaluate: True 48 | -------------------------------------------------------------------------------- /experiments/lavis/projects/clip/exp_flickr_ret_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-L-14-336 10 | 11 | datasets: 12 | flickr30k: # name of the dataset builder 13 | vis_processor: 14 | train: 15 | name: "clip_image_train" 16 | image_size: 336 17 | eval: 18 | name: "clip_image_eval" 19 | image_size: 336 20 | text_processor: 21 | train: 22 | name: "blip_caption" 23 | eval: 24 | name: "blip_caption" 25 | 26 | run: 27 | task: retrieval 28 | 29 | # dataloading 30 | num_workers: 4 31 | batch_size_train: 32 32 | batch_size_eval: 128 33 | 34 | test_splits: ["test"] 35 | 36 | # distribution 37 | device: "cuda" 38 | world_size: 1 39 | dist_url: "env://" 40 | distributed: True 41 | use_dist_eval_sampler: True 42 | 43 | # misc 44 | seed: 42 45 | output_dir: "output/clip/Retrieval_Flickr" 46 | 47 | evaluate: True 48 | -------------------------------------------------------------------------------- /experiments/lavis/projects/clip/exp_imnet_zs_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: clip 8 | 9 | model_type: ViT-L-14-336 10 | 11 | datasets: 12 | imagenet: # name of the dataset builder 13 | vis_processor: 14 | eval: 15 | name: "clip_image_eval" 16 | # image_size: 224 17 | image_size: 336 18 | 19 | run: 20 | task: multimodal_classification 21 | 22 | # dataloading 23 | num_workers: 4 24 | batch_size_train: 32 25 | batch_size_eval: 128 26 | 27 | test_splits: ["val"] 28 | 29 | # distribution 30 | device: "cuda" 31 | world_size: 1 32 | dist_url: "env://" 33 | distributed: True 34 | 35 | # misc 36 | seed: 42 37 | output_dir: "output/clip/zs_imnet" 38 | 39 | evaluate: True 40 | -------------------------------------------------------------------------------- /experiments/lavis/projects/gpt/eval/dialogue_avsd_eval.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: gpt_dialogue 8 | model_type: base 9 | 10 | datasets: 11 | avsd_dialogue: # name of the dataset builder 12 | vis_processor: 13 | eval: 14 | name: "gpt_video_ft" 15 | visual_ft: ["i3d_flow", "i3d_rgb"] 16 | audio_ft: ["vggish"] 17 | text_processor: 18 | eval: 19 | name: "gpt_dialogue" 20 | max_turns: 3 21 | use_caption: True 22 | 23 | run: 24 | task: dialogue 25 | # optimizer 26 | batch_size_train: 16 27 | batch_size_eval: 16 28 | num_workers: 0 29 | 30 | max_len: 20 31 | min_len: 5 32 | num_beams: 5 33 | 34 | seed: 42 35 | output_dir: "output/gpt2/dialogue_avsd" 36 | 37 | evaluate: True 38 | valid_splits: ["test"] 39 | 40 | device: "cuda" 41 | world_size: 1 42 | dist_url: "env://" 43 | distributed: True 44 | -------------------------------------------------------------------------------- /experiments/lavis/runners/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.runners.runner_base import RunnerBase 9 | from lavis.runners.runner_iter import RunnerIter 10 | 11 | __all__ = ["RunnerBase", "RunnerIter"] 12 | -------------------------------------------------------------------------------- /experiments/lavis/tasks/image_text_pretrain.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.common.registry import registry 9 | from lavis.tasks.base_task import BaseTask 10 | 11 | 12 | @registry.register_task("image_text_pretrain") 13 | class ImageTextPretrainTask(BaseTask): 14 | def __init__(self): 15 | super().__init__() 16 | 17 | def evaluation(self, model, data_loader, cuda_enabled=True): 18 | pass 19 | -------------------------------------------------------------------------------- /experiments/lavis/tasks/text_to_image_generation.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from lavis.tasks import BaseTask 9 | from lavis.common.registry import registry 10 | 11 | 12 | @registry.register_task("text-to-image-generation") 13 | class TextToImageGenerationTask(BaseTask): 14 | def __init__(self, cfg): 15 | super().__init__() 16 | 17 | self.cfg = cfg 18 | 19 | @classmethod 20 | def setup_task(cls, cfg): 21 | run_cfg = cfg.run_cfg 22 | 23 | return cls(cfg=run_cfg) 24 | -------------------------------------------------------------------------------- /experiments/llava/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import LlavaLlamaForCausalLM 2 | -------------------------------------------------------------------------------- /experiments/llava/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "." 
5 | 6 | # Model Constants 7 | IGNORE_INDEX = -100 8 | IMAGE_TOKEN_INDEX = -200 9 | DEFAULT_IMAGE_TOKEN = "<image>" 10 | DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>" 11 | DEFAULT_IM_START_TOKEN = "<im_start>" 12 | DEFAULT_IM_END_TOKEN = "<im_end>" 13 | -------------------------------------------------------------------------------- /experiments/llava/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .language_model.llava_llama import LlavaLlamaForCausalLM, LlavaConfig 2 | from .language_model.llava_mpt import LlavaMPTForCausalLM, LlavaMPTConfig 3 | -------------------------------------------------------------------------------- /experiments/llava/model/consolidate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m llava.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from transformers import AutoTokenizer, AutoModelForCausalLM 9 | from llava.model import * 10 | from llava.model.utils import auto_upgrade 11 | 12 | 13 | def consolidate_ckpt(src_path, dst_path): 14 | print("Loading model") 15 | auto_upgrade(src_path) 16 | src_model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False) 18 | src_model.save_pretrained(dst_path) 19 | src_tokenizer.save_pretrained(dst_path) 20 | 21 | 22 | if __name__ == "__main__": 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--src", type=str, required=True) 25 | parser.add_argument("--dst", type=str, required=True) 26 | 27 | args = parser.parse_args() 28 | 29 | consolidate_ckpt(args.src, args.dst) 30 | -------------------------------------------------------------------------------- /experiments/llava/model/language_model/mpt/custom_embedding.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from torch import Tensor 5 | 6 | class SharedEmbedding(nn.Embedding): 7 | 8 | def forward(self, input: Tensor, unembed: bool=False) -> Tensor: 9 | if unembed: 10 | return F.linear(input, self.weight) 11 | return super().forward(input) -------------------------------------------------------------------------------- /experiments/llava/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .clip_encoder import CLIPVisionTower 3 | 4 | 5 | def build_vision_tower(vision_tower_cfg, **kwargs): 6 | vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None)) 7 | is_absolute_path_exists = os.path.exists(vision_tower) 8 | if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion"): 9 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 10 | 11 | raise ValueError(f'Unknown vision tower: {vision_tower}') 12 | -------------------------------------------------------------------------------- /experiments/llava/model/utils.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoConfig 2 | 3 | 4 | def auto_upgrade(config): 5 | cfg = AutoConfig.from_pretrained(config) 6 | if 'llava' in config and 'llava' not in cfg.model_type: 7 | assert cfg.model_type == 'llama' 8 | print("You are using newer LLaVA code 
base, while the checkpoint of v0 is from older code base.") 9 | print("You must upgrade the checkpoint to the new code base (this can be done automatically).") 10 | confirm = input("Please confirm that you want to upgrade the checkpoint. [Y/N]") 11 | if confirm.lower() in ["y", "yes"]: 12 | print("Upgrading checkpoint...") 13 | assert len(cfg.architectures) == 1 14 | setattr(cfg.__class__, "model_type", "llava") 15 | cfg.architectures[0] = 'LlavaLlamaForCausalLM' 16 | cfg.save_pretrained(config) 17 | print("Checkpoint upgraded.") 18 | else: 19 | print("Checkpoint upgrade aborted.") 20 | exit(1) 21 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torchvision==0.15.2 2 | transformers==4.31.0 3 | torch==2.0.1 4 | tokenizers>=0.12.1,<0.14 5 | shortuuid 6 | accelerate==0.21.0 7 | peft==0.4.0 8 | bitsandbytes==0.41.0 9 | scikit-learn==1.2.2 10 | gradio==3.35.2 11 | gradio_client==0.2.9 12 | httpx==0.24.0 13 | numpy 14 | requests 15 | uvicorn 16 | fastapi 17 | einops 18 | einops-exts 19 | timm 20 | contexttimer 21 | decord 22 | diffusers 23 | fairscale 24 | ftfy 25 | iopath 26 | ipython 27 | omegaconf 28 | opencv-python 29 | opendatasets 30 | packaging 31 | pandas 32 | plotly 33 | pre-commit 34 | pycocoevalcap 35 | pycocotools 36 | python-magic 37 | scikit-image 38 | sentencepiece 39 | spacy 40 | streamlit 41 | tqdm 42 | webdataset 43 | wheel 44 | torchaudio 45 | soundfile 46 | moviepy 47 | nltk -------------------------------------------------------------------------------- /utils/logger.py: -------------------------------------------------------------------------------- 1 | import torch.distributed as dist 2 | import logging 3 | 4 | 5 | def create_logger(logging_dir): 6 | """ 7 | Create a logger that writes to a log file and stdout. 8 | """ 9 | if dist.get_rank() == 0: # real logger 10 | logging.basicConfig( 11 | level=logging.INFO, 12 | format="[\033[34m%(asctime)s\033[0m] %(message)s", 13 | datefmt="%Y-%m-%d %H:%M:%S", 14 | handlers=[logging.StreamHandler(), logging.FileHandler(f"{logging_dir}/log.txt")], 15 | ) 16 | logger = logging.getLogger(__name__) 17 | else: # dummy logger (does nothing) 18 | logger = logging.getLogger(__name__) 19 | logger.addHandler(logging.NullHandler()) 20 | return logger --------------------------------------------------------------------------------
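A minimal usage sketch for utils/logger.py above (illustrative only, not a file in this repository): create_logger() calls dist.get_rank(), so torch.distributed must be initialized before it runs, and the target directory must already exist for the FileHandler. The single-process gloo process group and the ./logs path below are assumptions for illustration.

import os
import torch.distributed as dist

from utils.logger import create_logger  # assumes the repo root is on PYTHONPATH

# create_logger() queries dist.get_rank(), so a process group must exist first;
# a trivial single-process group is enough for a local run.
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
dist.init_process_group(backend="gloo", rank=0, world_size=1)

os.makedirs("logs", exist_ok=True)   # the FileHandler writes logs/log.txt and needs the directory
logger = create_logger("logs")       # rank 0 gets the real file+stdout logger, other ranks a no-op logger
logger.info("logger ready on rank %d", dist.get_rank())

dist.destroy_process_group()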