├── .gitignore
├── LICENSE
├── README.md
├── VLMEvalKit
├── LICENSE
├── README.md
├── assets
│ ├── LOGO.svg
│ └── apple.jpg
├── docs
│ ├── en
│ │ ├── .readthedocs.yaml
│ │ ├── Makefile
│ │ ├── _static
│ │ │ ├── css
│ │ │ │ └── readthedocs.css
│ │ │ ├── image
│ │ │ │ ├── logo.svg
│ │ │ │ └── logo_icon.svg
│ │ │ └── js
│ │ │ │ └── custom.js
│ │ ├── _templates
│ │ │ ├── 404.html
│ │ │ ├── autosummary
│ │ │ │ └── class.rst
│ │ │ └── callable.rst
│ │ ├── advanced_guides
│ │ │ ├── Contributors.md
│ │ │ └── Development.md
│ │ ├── conf.py
│ │ ├── docutils.conf
│ │ ├── get_started
│ │ │ └── Quickstart.md
│ │ └── index.rst
│ ├── ja
│ │ └── README_ja.md
│ └── zh-CN
│ │ ├── .readthedocs.yaml
│ │ ├── Makefile
│ │ ├── README_zh-CN.md
│ │ ├── _static
│ │ │ ├── css
│ │ │ │ └── readthedocs.css
│ │ │ ├── image
│ │ │ │ ├── logo.svg
│ │ │ │ └── logo_icon.svg
│ │ │ └── js
│ │ │ │ └── custom.js
│ │ ├── _templates
│ │ │ ├── 404.html
│ │ │ ├── autosummary
│ │ │ │ └── class.rst
│ │ │ └── callable.rst
│ │ ├── advanced_guides
│ │ │ └── Development.md
│ │ ├── conf.py
│ │ ├── cp_origin_docs.sh
│ │ ├── docutils.conf
│ │ ├── get_started
│ │ │ └── Quickstart.md
│ │ └── index.rst
├── eval.sh
├── requirements.txt
├── requirements
│ └── docs.txt
├── run.py
├── scripts
│ ├── AI2D_preproc.ipynb
│ ├── apires_scan.py
│ ├── auto_run.py
│ ├── cover.sh
│ ├── mmb_eval_gradio.py
│ ├── run.sh
│ ├── srun.sh
│ ├── summarize.py
│ └── visualize.ipynb
├── setup.py
└── vlmeval
│ ├── __init__.py
│ ├── api
│ ├── __init__.py
│ ├── base.py
│ ├── bluelm_v_api.py
│ ├── claude.py
│ ├── cloudwalk.py
│ ├── gemini.py
│ ├── glm_vision.py
│ ├── gpt.py
│ ├── hf_chat_model.py
│ ├── hunyuan.py
│ ├── qwen_api.py
│ ├── qwen_vl_api.py
│ ├── reka.py
│ ├── sensechat_vision.py
│ └── stepai.py
│ ├── config.py
│ ├── dataset
│ ├── __init__.py
│ ├── dude.py
│ ├── image_base.py
│ ├── image_caption.py
│ ├── image_mcq.py
│ ├── image_mt.py
│ ├── image_vqa.py
│ ├── image_yorn.py
│ ├── mmbench_video.py
│ ├── mmlongbench.py
│ ├── mvbench.py
│ ├── slidevqa.py
│ ├── text_base.py
│ ├── text_mcq.py
│ ├── utils
│ │ ├── __init__.py
│ │ ├── crpe.py
│ │ ├── hrbench.py
│ │ ├── judge_util.py
│ │ ├── llavabench.py
│ │ ├── mathv.py
│ │ ├── mathverse.py
│ │ ├── mathvista.py
│ │ ├── mmbench_video.py
│ │ ├── mmdu.py
│ │ ├── mmvet.py
│ │ ├── multiple_choice.py
│ │ ├── mvbench.py
│ │ ├── ocrbench.py
│ │ ├── tablevqabench.py
│ │ ├── videomme.py
│ │ ├── vqa_eval.py
│ │ └── yorn.py
│ ├── vcr.py
│ ├── video_base.py
│ └── videomme.py
│ ├── inference.py
│ ├── inference_mt.py
│ ├── inference_video.py
│ ├── smp
│ ├── __init__.py
│ ├── file.py
│ ├── log.py
│ ├── misc.py
│ └── vlm.py
│ ├── tools.py
│ ├── utils
│ ├── __init__.py
│ ├── matching_util.py
│ ├── mp_util.py
│ └── result_transfer.py
│ └── vlm
│ ├── __init__.py
│ ├── base.py
│ ├── bunnyllama3.py
│ ├── cambrian.py
│ ├── chameleon.py
│ ├── cogvlm.py
│ ├── deepseek_vl.py
│ ├── eagle_x.py
│ ├── emu.py
│ ├── idefics.py
│ ├── instructblip.py
│ ├── internvl_chat.py
│ ├── llama_vision.py
│ ├── llava
│ ├── __init__.py
│ ├── llava.py
│ └── llava_xtuner.py
│ ├── llava_uhd.py
│ ├── llava_uhd2.py
│ ├── mantis.py
│ ├── mgm.py
│ ├── minicpm_v.py
│ ├── minigpt4.py
│ ├── minimonkey.py
│ ├── misc
│ ├── blip2_instruct_vicuna13b.yaml
│ ├── blip2_instruct_vicuna7b.yaml
│ ├── minigpt4_13b_eval.yaml
│ ├── minigpt4_7b_eval.yaml
│ └── minigptv2_eval.yaml
│ ├── mixsense.py
│ ├── mmalaya.py
│ ├── monkey.py
│ ├── moondream.py
│ ├── mplug_owl2.py
│ ├── mplug_owl3.py
│ ├── omchat.py
│ ├── omnilmm.py
│ ├── open_flamingo.py
│ ├── ovis.py
│ ├── paligemma.py
│ ├── pandagpt.py
│ ├── parrot.py
│ ├── phi3_vision.py
│ ├── pixtral.py
│ ├── qh_360vl.py
│ ├── qwen2_vl
│ ├── __init__.py
│ ├── model.py
│ └── prompt.py
│ ├── qwen_vl.py
│ ├── rbdash.py
│ ├── slime.py
│ ├── transcore_m.py
│ ├── video_llm
│ ├── __init__.py
│ ├── chat_uni_vi.py
│ ├── configs
│ │ ├── llama_vid
│ │ │ └── processor
│ │ │ │ └── clip-patch14-224
│ │ │ │ │ ├── config.json
│ │ │ │ │ └── preprocessor_config.json
│ │ └── videochat2_hd.json
│ ├── llama_vid.py
│ ├── pllava.py
│ ├── video_chatgpt.py
│ ├── video_llava.py
│ └── videochat2.py
│ ├── vila.py
│ ├── visualglm.py
│ ├── vxverse.py
│ ├── wemm.py
│ ├── xcomposer
│ ├── __init__.py
│ ├── sharecaptioner.py
│ ├── xcomposer.py
│ ├── xcomposer2.py
│ ├── xcomposer2_4KHD.py
│ └── xcomposer2d5.py
│ ├── xgen_mm.py
│ └── yi_vl.py
├── cog.yaml
├── doc
├── HiWin.png
├── arch.png
└── pyramid.png
├── eval.sh
├── featup
├── __init__.py
├── adaptive_conv_cuda
│ ├── __init__.py
│ ├── adaptive_conv.cpp
│ ├── adaptive_conv.py
│ ├── adaptive_conv_cuda.cpp
│ └── adaptive_conv_kernel.cu
├── configs
│ └── vdim_upsampler.yaml
├── datasets
│ ├── COCO.py
│ ├── DAVIS.py
│ ├── DOC.py
│ ├── DocSceneText.py
│ ├── EmbeddingFile.py
│ ├── HTML.py
│ ├── HighResEmbs.py
│ ├── ImageNetSubset.py
│ ├── JitteredImage.py
│ ├── SCENE.py
│ ├── SampleImage.py
│ ├── __init__.py
│ └── util.py
├── downsamplers.py
├── featurizers
│ ├── CLIP.py
│ ├── CLIPLarge.py
│ ├── ClipEncoder.py
│ ├── DINO.py
│ ├── DINOv2.py
│ ├── DeepLabV3.py
│ ├── MAE.py
│ ├── MIDAS.py
│ ├── MaskCLIP.py
│ ├── ResNet.py
│ ├── __init__.py
│ ├── dinov2
│ │ ├── __init__.py
│ │ └── layers
│ │ │ ├── __init__.py
│ │ │ ├── attention.py
│ │ │ ├── block.py
│ │ │ ├── dino_head.py
│ │ │ ├── drop_path.py
│ │ │ ├── layer_scale.py
│ │ │ ├── mlp.py
│ │ │ ├── patch_embed.py
│ │ │ └── swiglu_ffn.py
│ ├── maskclip
│ │ ├── README.md
│ │ ├── __init__.py
│ │ ├── bpe_simple_vocab_16e6.txt.gz
│ │ ├── clip.py
│ │ ├── interpolate.py
│ │ ├── model.py
│ │ └── simple_tokenizer.py
│ ├── modules
│ │ ├── __init__.py
│ │ ├── layers.py
│ │ ├── resnet.py
│ │ └── vgg.py
│ └── util.py
├── layers.py
├── losses.py
├── plotting.py
├── train_vdim_upsampler.py
├── upsamplers.py
└── util.py
├── install.sh
├── llava
├── __init__.py
├── constants.py
├── conversation.py
├── eval
│ ├── eval_ai2d.py
│ ├── eval_chartqa.py
│ ├── eval_docvqa.py
│ ├── eval_gpt_review.py
│ ├── eval_gpt_review_bench.py
│ ├── eval_gpt_review_visual.py
│ ├── eval_pope.py
│ ├── eval_rec.py
│ ├── eval_science_qa.py
│ ├── eval_science_qa_gpt4.py
│ ├── eval_science_qa_gpt4_requery.py
│ ├── eval_textvqa.py
│ ├── evaluate_interleave.py
│ ├── generate_webpage_data_from_table.py
│ ├── m4c_evaluator.py
│ ├── model_qa.py
│ ├── model_vqa.py
│ ├── model_vqa_loader.py
│ ├── model_vqa_mmbench.py
│ ├── model_vqa_science.py
│ ├── qa_baseline_gpt35.py
│ ├── run_llava.py
│ └── summarize_gpt_review.py
├── mm_utils.py
├── model
│ ├── __init__.py
│ ├── apply_delta.py
│ ├── builder.py
│ ├── builder_new.bk
│ ├── consolidate.py
│ ├── language_model
│ │ ├── llava_gemma.py
│ │ ├── llava_llama.py
│ │ ├── llava_mistral.py
│ │ ├── llava_mixtral.py
│ │ ├── llava_mpt.py
│ │ ├── llava_qwen.py
│ │ ├── llava_qwen_moe.py
│ │ └── modeling_llama.py
│ ├── llava_arch.py
│ ├── make_delta.py
│ ├── multimodal_encoder
│ │ ├── adapt_clip_vision_model.py
│ │ ├── builder.py
│ │ ├── clip_encoder.py
│ │ ├── dev_eva_clip
│ │ │ ├── eva_clip
│ │ │ │ ├── __init__.py
│ │ │ │ ├── bpe_simple_vocab_16e6.txt.gz
│ │ │ │ ├── constants.py
│ │ │ │ ├── eva_vit_model.py
│ │ │ │ ├── factory.py
│ │ │ │ ├── hf_configs.py
│ │ │ │ ├── hf_model.py
│ │ │ │ ├── loss.py
│ │ │ │ ├── model.py
│ │ │ │ ├── model_configs
│ │ │ │ │ ├── EVA-CLIP-18B.json
│ │ │ │ │ ├── EVA-CLIP-8B-plus.json
│ │ │ │ │ ├── EVA-CLIP-8B.json
│ │ │ │ │ ├── EVA01-CLIP-B-16.json
│ │ │ │ │ ├── EVA01-CLIP-g-14-plus.json
│ │ │ │ │ ├── EVA01-CLIP-g-14.json
│ │ │ │ │ ├── EVA02-CLIP-B-16.json
│ │ │ │ │ ├── EVA02-CLIP-L-14-336.json
│ │ │ │ │ ├── EVA02-CLIP-L-14.json
│ │ │ │ │ ├── EVA02-CLIP-bigE-14-plus.json
│ │ │ │ │ ├── EVA02-CLIP-bigE-14.json
│ │ │ │ │ ├── Internal-EVA02-CLIP-10B-14-448.json
│ │ │ │ │ └── Internal-EVA02-CLIP-10B-14.json
│ │ │ │ ├── modified_resnet.py
│ │ │ │ ├── openai.py
│ │ │ │ ├── pretrained.py
│ │ │ │ ├── rope.py
│ │ │ │ ├── timm_model.py
│ │ │ │ ├── tokenizer.py
│ │ │ │ ├── transform.py
│ │ │ │ ├── transformer.py
│ │ │ │ └── utils.py
│ │ │ └── eva_vit.py
│ │ ├── eva_clip
│ │ │ ├── eva_clip_encoder.py
│ │ │ ├── eva_clip_processors.py
│ │ │ ├── eva_vit.py
│ │ │ ├── factory.py
│ │ │ └── model_configs
│ │ │ │ ├── EVA-CLIP-18B.json
│ │ │ │ ├── EVA-CLIP-8B-plus.json
│ │ │ │ ├── EVA-CLIP-8B.json
│ │ │ │ ├── EVA01-CLIP-B-16.json
│ │ │ │ ├── EVA01-CLIP-g-14-plus.json
│ │ │ │ ├── EVA01-CLIP-g-14.json
│ │ │ │ ├── EVA02-CLIP-B-16.json
│ │ │ │ ├── EVA02-CLIP-L-14-336.json
│ │ │ │ ├── EVA02-CLIP-L-14.json
│ │ │ │ ├── EVA02-CLIP-bigE-14-plus.json
│ │ │ │ ├── EVA02-CLIP-bigE-14.json
│ │ │ │ ├── Internal-EVA02-CLIP-10B-14-448.json
│ │ │ │ └── Internal-EVA02-CLIP-10B-14.json
│ │ ├── hf_vision.py
│ │ ├── hubconf.py
│ │ ├── imagebind.py
│ │ ├── open_clip_encoder.py
│ │ └── siglip_encoder.py
│ ├── multimodal_projector
│ │ ├── adapt_spatial_resampler.py
│ │ ├── builder.py
│ │ ├── mlp.py
│ │ ├── mlp_v2.py
│ │ ├── percive_sampler.py
│ │ ├── pooler_projector.py
│ │ └── uhd_v1_resampler.py
│ ├── multimodal_resampler
│ │ ├── builder.py
│ │ ├── masked_drop.py
│ │ ├── perceiver.py
│ │ ├── qformer.py
│ │ └── spatial_pool.py
│ └── utils.py
├── serve
│ ├── __init__.py
│ ├── cli.py
│ ├── controller.py
│ ├── examples
│ │ ├── extreme_ironing.jpg
│ │ └── waterview.jpg
│ ├── gradio_multi_image.py
│ ├── gradio_web_server.py
│ ├── model_worker.py
│ ├── register_worker.py
│ ├── sglang_worker.py
│ └── test_message.py
├── slice_process.py
├── train
│ ├── llama_flash_attn_monkey_patch.py
│ ├── llava_trainer.py
│ ├── llava_trainer_eval.py
│ ├── train.py
│ ├── train_dpo.py
│ └── train_mem.py
└── utils.py
├── model-train.sh
├── playground
├── 2d_hist.py
├── data_checker.py
├── demo
│ ├── video_demo.py
│ └── xU25MMA2N4aVtYay.mp4
├── equal_splitter.py
├── remove_mid_ckpt.py
├── sgl_llava_inference_multinode.py
└── upload_data.py
├── scripts
├── archived
│ ├── convert_gqa_for_eval.py
│ ├── convert_mmvet_for_eval.py
│ ├── convert_sqa_to_llava.py
│ ├── convert_sqa_to_llava_base_prompt.py
│ ├── convert_vizwiz_for_submission.py
│ ├── convert_vqav2_for_submission.py
│ ├── data_info.py
│ ├── dpo_data_info.py
│ ├── entry_cmd.sh
│ ├── finetune.sh
│ ├── finetune_1.5.sh
│ ├── finetune_full_schedule.sh
│ ├── finetune_lora.sh
│ ├── finetune_mixtral.sh
│ ├── finetune_mixtral_1.5.sh
│ ├── finetune_mixtral_1.6_336px_anyres.sh
│ ├── finetune_mixtral_1.6_336px_anyres_freeze_vision.sh
│ ├── finetune_mixtral_1.6_336px_anyres_lmms_eval.sh
│ ├── finetune_mixtral_copy.sh
│ ├── finetune_qlora.sh
│ ├── finetune_sqa.sh
│ ├── merge_lora_weights.py
│ ├── pretrain.sh
│ ├── quick_check.py
│ ├── sqa_eval_batch.sh
│ └── sqa_eval_gather.sh
├── convert_gqa_for_eval.py
├── interleave
│ ├── eval_all.sh
│ ├── eval_interleave_3d.sh
│ └── eval_multiprocess.sh
├── qwen.py
├── summarize_data.py
├── train
│ ├── README.md
│ ├── direct_finetune_clip.sh
│ ├── direct_finetune_siglip_a4.sh
│ ├── dpo.sh
│ ├── dpo_ov7b.sh
│ ├── finetune_ov.sh
│ ├── finetune_si.sh
│ ├── mid_stage.yaml
│ ├── onevision.yaml
│ ├── pretrain_clip.sh
│ ├── pretrain_siglip.sh
│ └── single_image.yaml
├── v1_5
│ └── eval
│ │ ├── ai2d.sh
│ │ ├── chartqa.sh
│ │ ├── deepform.sh
│ │ ├── docvqa_test.sh
│ │ ├── docvqa_val.sh
│ │ ├── estvqa.sh
│ │ ├── gqa.sh
│ │ ├── infographics.sh
│ │ ├── llavabench.sh
│ │ ├── mmbench.sh
│ │ ├── mmbench_cn.sh
│ │ ├── mme.sh
│ │ ├── mmvet.sh
│ │ ├── pope.sh
│ │ ├── qbench.sh
│ │ ├── qbench_zh.sh
│ │ ├── rec.sh
│ │ ├── sqa.sh
│ │ ├── textvqa.sh
│ │ ├── vizwiz.sh
│ │ └── vqav2.sh
├── video
│ ├── demo
│ │ └── video_demo.sh
│ ├── eval
│ │ ├── activitynet_eval.sh
│ │ ├── video_chatgpt_benchmark_eval_shard.sh
│ │ ├── video_description_from_t2v.sh
│ │ ├── video_detail_description_eval_only.sh
│ │ └── video_detail_description_eval_shard.sh
│ └── train
│ │ ├── SO400M_Qwen2_72B_ov_to_video_am9.sh
│ │ ├── SO400M_Qwen2_7B_ov_to_video_am9.sh
│ │ └── exp.yaml
├── zero2.json
├── zero2_fused_adamw.json
├── zero2_offload.json
├── zero2_old.json
├── zero3.json
├── zero3_offload.json
└── zero3pp.json
├── setup.py
├── trl
├── __init__.py
├── core.py
├── environment
│ ├── __init__.py
│ └── base_environment.py
├── extras
│ ├── __init__.py
│ ├── best_of_n_sampler.py
│ └── dataset_formatting.py
├── import_utils.py
├── models
│ ├── __init__.py
│ ├── modeling_base.py
│ ├── modeling_sd_base.py
│ ├── modeling_value_head.py
│ └── utils.py
└── trainer
│ ├── __init__.py
│ ├── base.py
│ ├── ddpo_config.py
│ ├── ddpo_trainer.py
│ ├── dpo_trainer.py
│ ├── iterative_sft_trainer.py
│ ├── model_config.py
│ ├── ppo_config.py
│ ├── ppo_trainer.py
│ ├── reward_config.py
│ ├── reward_trainer.py
│ ├── sft_trainer.py
│ └── utils.py
├── uhdv2-qwen2.sh
├── uhdv2-vicuna13b.sh
└── vdim-pretrain.sh
/VLMEvalKit/assets/apple.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/LLaVA-UHD/08c85e208cd12c825303619723e4fe20e64f7f58/VLMEvalKit/assets/apple.jpg
--------------------------------------------------------------------------------
/VLMEvalKit/docs/en/.readthedocs.yaml:
--------------------------------------------------------------------------------
1 | version: 2
2 |
3 | # Set the version of Python and other tools you might need
4 | build:
5 | os: ubuntu-22.04
6 | tools:
7 | python: "3.8"
8 |
9 | formats:
10 | - epub
11 |
12 | sphinx:
13 | configuration: docs/en/conf.py
14 |
15 | python:
16 | install:
17 | - requirements: requirements/docs.txt
18 |
--------------------------------------------------------------------------------
/VLMEvalKit/docs/en/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = .
9 | BUILDDIR = _build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/VLMEvalKit/docs/en/_static/css/readthedocs.css:
--------------------------------------------------------------------------------
1 | .header-logo {
2 | background-image: url("../image/logo.svg");
3 | background-size: 275px 80px;
4 | height: 80px;
5 | width: 275px;
6 | }
7 |
8 |
9 | @media screen and (min-width: 1100px) {
10 | .header-logo {
11 | top: -25px;
12 | }
13 | }
14 |
15 | pre {
16 | white-space: pre;
17 | }
18 |
19 | @media screen and (min-width: 2000px) {
20 | .pytorch-content-left {
21 | width: 1200px;
22 | margin-left: 30px;
23 | }
24 | article.pytorch-article {
25 | max-width: 1200px;
26 | }
27 | .pytorch-breadcrumbs-wrapper {
28 | width: 1200px;
29 | }
30 | .pytorch-right-menu.scrolling-fixed {
31 | position: fixed;
32 | top: 45px;
33 | left: 1580px;
34 | }
35 | }
36 |
37 |
38 | article.pytorch-article section code {
39 | padding: .2em .4em;
40 | background-color: #f3f4f7;
41 | border-radius: 5px;
42 | }
43 |
44 | /* Disable the change in tables */
45 | article.pytorch-article section table code {
46 | padding: unset;
47 | background-color: unset;
48 | border-radius: unset;
49 | }
50 |
51 | table.autosummary td {
52 | width: 50%
53 | }
54 |
55 | img.align-center {
56 | display: block;
57 | margin-left: auto;
58 | margin-right: auto;
59 | }
60 |
61 | article.pytorch-article p.rubric {
62 | font-weight: bold;
63 | }
64 |
--------------------------------------------------------------------------------
/VLMEvalKit/docs/en/_static/image/logo_icon.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/VLMEvalKit/docs/en/_static/js/custom.js:
--------------------------------------------------------------------------------
1 | var collapsedSections = [];
2 |
3 | $(document).ready(function () {
4 | $('.model-summary').DataTable({
5 | "stateSave": false,
6 | "lengthChange": false,
7 | "pageLength": 20,
8 | "order": []
9 | });
10 | });
11 |
--------------------------------------------------------------------------------
/VLMEvalKit/docs/en/_templates/404.html:
--------------------------------------------------------------------------------
1 | {% extends "layout.html" %}
2 |
3 | {% block body %}
4 |
5 | Page Not Found
6 |
7 | The page you are looking for cannot be found.
8 |
9 |
10 | If you just switched documentation versions, it is likely that the page you were on has been moved. You can look for it in
11 | the table of contents on the left, or go to the homepage.
12 |
13 |
17 |
18 | {% endblock %}
19 |
--------------------------------------------------------------------------------
/VLMEvalKit/docs/en/_templates/autosummary/class.rst:
--------------------------------------------------------------------------------
1 | .. role:: hidden
2 | :class: hidden-section
3 | .. currentmodule:: {{ module }}
4 |
5 |
6 | {{ name | underline}}
7 |
8 | .. autoclass:: {{ name }}
9 | :members:
10 |
11 | ..
12 | autogenerated from _templates/autosummary/class.rst
13 | note it does not have :inherited-members:
14 |
--------------------------------------------------------------------------------
/VLMEvalKit/docs/en/_templates/callable.rst:
--------------------------------------------------------------------------------
1 | .. role:: hidden
2 | :class: hidden-section
3 | .. currentmodule:: {{ module }}
4 |
5 |
6 | {{ name | underline}}
7 |
8 | .. autoclass:: {{ name }}
9 | :members:
10 | :special-members: __call__
11 |
12 | ..
13 | autogenerated from _templates/callable.rst
14 | note it does not have :inherited-members:
15 |
--------------------------------------------------------------------------------
/VLMEvalKit/docs/en/advanced_guides/Contributors.md:
--------------------------------------------------------------------------------
1 | # Contributors
2 |
3 | ## Contributors w. 3+ Major Contributions
4 |
5 | > In this section, we list all the contributors who have made significant contributions (3+) to the development of VLMEvalKit.
6 |
7 | New Qualified Contributors (2024.09):
8 |
9 | 1. [amitbcp](https://github.com/amitbcp): The contributor helped support MUIRBench, Phi-3.5, Idefics3, VILA, and xGen-MM
10 | 2. [czczup](https://github.com/czczup): The contributor helped support the InternVL Series (V1.5, Mini-InternVL, V2, etc.)
11 | 3. [DseidLi](https://github.com/DseidLi): The contributor helped support LLaVA-OneVision, GQA, and developed the readthedocs site for VLMEvalKit
12 | 4. [mayubo2333](https://github.com/mayubo2333): The contributor helped support MMLongBench, SlideVQA, and DUDE
13 | 5. [sun-hailong](https://github.com/sun-hailong): The contributor helped support A-OKVQA, Parrot, MMMB, and MTL-MMBench
14 | 6. [PhoenixZ810](https://github.com/PhoenixZ810): The contributor helped support Video-ChatGPT, Chat-UniVI, and Llama-VID
15 | 7. [Cuiunbo](https://github.com/Cuiunbo): The contributor helped support OmniLMM-12B, MiniCPM-V Series (V1, V2, V2.5)
16 |
17 | ## Full Contributor List
18 |
19 | > In this section, we list all the contributors as well as their corresponding contributions to the development of VLMEvalKit.
20 |
21 | TBD.
22 |
--------------------------------------------------------------------------------
/VLMEvalKit/docs/en/docutils.conf:
--------------------------------------------------------------------------------
1 | [html writers]
2 | table_style: colwidths-auto
3 |
--------------------------------------------------------------------------------
/VLMEvalKit/docs/en/index.rst:
--------------------------------------------------------------------------------
1 | Welcome to the VLMEvalKit Tutorial!
2 | ==========================================
3 |
4 | VLMEvalKit Getting Started Guide
5 | -------------------------------
6 |
7 | To help users get started quickly, we recommend the following process:
8 |
9 | - For users who want to use VLMEvalKit, we recommend reading the "Start Your First Step" section to set up the environment and start a mini-experiment to familiarize yourself with the process.
10 |
11 | - If you want to customize more modules, such as adding datasets and models, we provide an "Advanced Tutorial."
12 |
13 | We always welcome users' PRs (Pull Requests) and Issues to improve VLMEvalKit!
14 |
15 | .. _Start Your First Step:
16 | .. toctree::
17 | :maxdepth: 1
18 | :caption: Start Your First Step
19 |
20 | get_started/Quickstart.md
21 |
22 |
23 | .. .. _Tutorials:
24 | .. .. toctree::
25 | .. :maxdepth: 1
26 | .. :caption: Tutorials
27 |
28 | .. user_guides/framework_overview.md
29 |
30 | .. _Advanced Tutorial:
31 | .. toctree::
32 | :maxdepth: 1
33 | :caption: Advanced Tutorial
34 |
35 | advanced_guides/Development.md
36 |
37 | .. .. _Other Notes:
38 | .. .. toctree::
39 | .. :maxdepth: 1
40 | .. :caption: Other Notes
41 |
42 | .. notes/contribution_guide.md
43 |
44 | Index and Tables
45 | ==================
46 |
47 | * :ref:`genindex`
48 | * :ref:`search`
49 |
--------------------------------------------------------------------------------
/VLMEvalKit/docs/zh-CN/.readthedocs.yaml:
--------------------------------------------------------------------------------
1 | version: 2
2 |
3 | # Set the version of Python and other tools you might need
4 | build:
5 | os: ubuntu-22.04
6 | tools:
7 | python: "3.8"
8 |
9 | formats:
10 | - epub
11 |
12 | sphinx:
13 | configuration: docs/zh-CN/conf.py
14 |
15 | python:
16 | install:
17 | - requirements: requirements/docs.txt
18 |
--------------------------------------------------------------------------------
/VLMEvalKit/docs/zh-CN/Makefile:
--------------------------------------------------------------------------------
1 | # Minimal makefile for Sphinx documentation
2 | #
3 |
4 | # You can set these variables from the command line, and also
5 | # from the environment for the first two.
6 | SPHINXOPTS ?=
7 | SPHINXBUILD ?= sphinx-build
8 | SOURCEDIR = .
9 | BUILDDIR = _build
10 |
11 | # Put it first so that "make" without argument is like "make help".
12 | help:
13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
14 |
15 | .PHONY: help Makefile
16 |
17 | # Catch-all target: route all unknown targets to Sphinx using the new
18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
19 | %: Makefile
20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
21 |
--------------------------------------------------------------------------------
/VLMEvalKit/docs/zh-CN/_static/css/readthedocs.css:
--------------------------------------------------------------------------------
1 | .header-logo {
2 | background-image: url("../image/logo.svg");
3 | background-size: 275px 80px;
4 | height: 80px;
5 | width: 275px;
6 | }
7 |
8 |
9 | @media screen and (min-width: 1100px) {
10 | .header-logo {
11 | top: -25px;
12 | }
13 | }
14 |
15 | pre {
16 | white-space: pre;
17 | }
18 |
19 | @media screen and (min-width: 2000px) {
20 | .pytorch-content-left {
21 | width: 1200px;
22 | margin-left: 30px;
23 | }
24 | article.pytorch-article {
25 | max-width: 1200px;
26 | }
27 | .pytorch-breadcrumbs-wrapper {
28 | width: 1200px;
29 | }
30 | .pytorch-right-menu.scrolling-fixed {
31 | position: fixed;
32 | top: 45px;
33 | left: 1580px;
34 | }
35 | }
36 |
37 |
38 | article.pytorch-article section code {
39 | padding: .2em .4em;
40 | background-color: #f3f4f7;
41 | border-radius: 5px;
42 | }
43 |
44 | /* Disable the change in tables */
45 | article.pytorch-article section table code {
46 | padding: unset;
47 | background-color: unset;
48 | border-radius: unset;
49 | }
50 |
51 | table.autosummary td {
52 | width: 50%
53 | }
54 |
55 | img.align-center {
56 | display: block;
57 | margin-left: auto;
58 | margin-right: auto;
59 | }
60 |
61 | article.pytorch-article p.rubric {
62 | font-weight: bold;
63 | }
64 |
--------------------------------------------------------------------------------
/VLMEvalKit/docs/zh-CN/_static/image/logo_icon.svg:
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
/VLMEvalKit/docs/zh-CN/_static/js/custom.js:
--------------------------------------------------------------------------------
1 | var collapsedSections = [];
2 |
3 | $(document).ready(function () {
4 | $('.model-summary').DataTable({
5 | "stateSave": false,
6 | "lengthChange": false,
7 | "pageLength": 20,
8 | "order": []
9 | });
10 | });
11 |
--------------------------------------------------------------------------------
/VLMEvalKit/docs/zh-CN/_templates/404.html:
--------------------------------------------------------------------------------
1 | {% extends "layout.html" %}
2 |
3 | {% block body %}
4 |
5 | Page Not Found
6 |
7 | The page you are looking for cannot be found.
8 |
9 |
10 | If you just switched documentation versions, it is likely that the page you were on has been moved. You can look for it in
11 | the table of contents on the left, or go to the homepage.
12 |
13 |
17 |
18 | {% endblock %}
19 |
--------------------------------------------------------------------------------
/VLMEvalKit/docs/zh-CN/_templates/autosummary/class.rst:
--------------------------------------------------------------------------------
1 | .. role:: hidden
2 | :class: hidden-section
3 | .. currentmodule:: {{ module }}
4 |
5 |
6 | {{ name | underline}}
7 |
8 | .. autoclass:: {{ name }}
9 | :members:
10 |
11 | ..
12 | autogenerated from _templates/autosummary/class.rst
13 | note it does not have :inherited-members:
14 |
--------------------------------------------------------------------------------
/VLMEvalKit/docs/zh-CN/_templates/callable.rst:
--------------------------------------------------------------------------------
1 | .. role:: hidden
2 | :class: hidden-section
3 | .. currentmodule:: {{ module }}
4 |
5 |
6 | {{ name | underline}}
7 |
8 | .. autoclass:: {{ name }}
9 | :members:
10 | :special-members: __call__
11 |
12 | ..
13 | autogenerated from _templates/callable.rst
14 | note it does not have :inherited-members:
15 |
--------------------------------------------------------------------------------
/VLMEvalKit/docs/zh-CN/cp_origin_docs.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | # Copy *.md files from docs/ if they don't have a Chinese translation
4 |
5 | for filename in $(find ../en/ -name '*.md' -printf "%P\n");
6 | do
7 | mkdir -p $(dirname $filename)
8 | cp -n ../en/$filename ./$filename
9 | done
10 |
--------------------------------------------------------------------------------
/VLMEvalKit/docs/zh-CN/docutils.conf:
--------------------------------------------------------------------------------
1 | [html writers]
2 | table_style: colwidths-auto
3 |
--------------------------------------------------------------------------------
/VLMEvalKit/docs/zh-CN/index.rst:
--------------------------------------------------------------------------------
1 | 欢迎来到 VLMEvalKit 中文教程!
2 | ==========================================
3 |
4 | VLMEvalKit 上手路线
5 | -------------------------------
6 |
7 | 为了用户能够快速上手,我们推荐以下流程:
8 |
9 | - 对于想要使用 VLMEvalKit 的用户,我们推荐先阅读 开始你的第一步_ 部分来设置环境,并启动一个迷你实验熟悉流程。
10 |
11 | - 若您想进行更多模块的自定义,例如增加数据集和模型,我们提供了 进阶教程_ 。
12 |
13 | 我们始终非常欢迎用户的 PRs 和 Issues 来完善 VLMEvalKit!
14 |
15 | .. _开始你的第一步:
16 | .. toctree::
17 | :maxdepth: 1
18 | :caption: 开始你的第一步
19 |
20 | get_started/Quickstart.md
21 |
22 |
23 | .. .. _教程:
24 | .. .. toctree::
25 | .. :maxdepth: 1
26 | .. :caption: 教程
27 |
28 | .. user_guides/framework_overview.md
29 |
30 | .. _进阶教程:
31 | .. toctree::
32 | :maxdepth: 1
33 | :caption: 进阶教程
34 |
35 | advanced_guides/Development.md
36 |
37 | .. .. _其他说明:
38 | .. .. toctree::
39 | .. :maxdepth: 1
40 | .. :caption: 其他说明
41 |
42 | .. notes/contribution_guide.md
43 |
44 | 索引与表格
45 | ==================
46 |
47 | * :ref:`genindex`
48 | * :ref:`search`
49 |
--------------------------------------------------------------------------------
/VLMEvalKit/eval.sh:
--------------------------------------------------------------------------------
1 | export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
2 | torchrun --nproc-per-node=8 run.py --data OCRBench MMMU_DEV_VAL SEEDBench_IMG MMBench_TEST_EN RealWorldQA HRBench4K --model llava_uhd2 --verbose
--------------------------------------------------------------------------------
/VLMEvalKit/requirements.txt:
--------------------------------------------------------------------------------
1 | decord
2 | gradio
3 | huggingface_hub
4 | imageio
5 | matplotlib
6 | moviepy
7 | numpy>=1.23.4
8 | omegaconf
9 | openai==1.3.5
10 | opencv-python>=4.4.0.46
11 | openpyxl
12 | pandas
13 | peft
14 | pillow
15 | portalocker
16 | protobuf
17 | python-dotenv
18 | requests
19 | rich
20 | sentencepiece
21 | setuptools
22 | sty
23 | tabulate
24 | tiktoken
25 | timeout-decorator
26 | torch>=2.0.1
27 | tqdm
28 | transformers
29 | typing_extensions==4.7.1
30 | validators
31 | xlsxwriter
32 |
--------------------------------------------------------------------------------
/VLMEvalKit/requirements/docs.txt:
--------------------------------------------------------------------------------
1 | docutils==0.18.1
2 | modelindex
3 | myst-parser
4 | -e git+https://github.com/open-compass/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme
5 | sphinx==6.1.3
6 | sphinx-copybutton
7 | sphinx-design
8 | sphinx-notfound-page
9 | sphinx-tabs
10 | sphinxcontrib-jquery
11 | tabulate
12 |
--------------------------------------------------------------------------------
/VLMEvalKit/scripts/auto_run.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from vlmeval.smp import *
3 | from vlmeval.config import supported_VLM
4 |
5 | def is_api(x):
6 | return getattr(supported_VLM[x].func, 'is_api', False)
7 |
8 | models = list(supported_VLM)
9 | models = [x for x in models if 'fs' not in x]
10 | models = [x for x in models if not is_api(x)]
11 | exclude_list = ['cogvlm-grounding-generalist', 'emu2']
12 | models = [x for x in models if x not in exclude_list]
13 |
14 | def is_large(x):
15 | return '80b' in x or 'emu2' in x or '34B' in x
16 |
17 | small_models = [x for x in models if not is_large(x)]
18 | large_models = [x for x in models if is_large(x)]
19 | models = small_models + large_models
20 |
21 | parser = argparse.ArgumentParser()
22 | parser.add_argument('--data', type=str, nargs='+', required=True)
23 | args = parser.parse_args()
24 |
25 | # Skip some models
26 | models = [x for x in models if not listinstr(['MiniGPT', 'grounding-generalist'], x)]
27 |
28 | for m in models:
29 | unknown_datasets = [x for x in args.data if not osp.exists(f'{m}/{m}_{x}.xlsx')]
30 | if len(unknown_datasets) == 0:
31 | continue
32 | dataset_str = ' '.join(unknown_datasets)
33 | if '80b' in m:
34 | cmd = f'python run.py --data {dataset_str} --model {m}'
35 | else:
36 | cmd = f'bash run.sh --data {dataset_str} --model {m}'
37 | print(cmd)
38 | os.system(cmd)
--------------------------------------------------------------------------------
/VLMEvalKit/scripts/cover.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
3 | cp $DIR/../config.py $DIR/../vlmeval/
4 | cp $DIR/../misc/* $DIR/../vlmeval/vlm/misc/
--------------------------------------------------------------------------------
/VLMEvalKit/scripts/run.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -x
3 | export GPU=$(nvidia-smi --list-gpus | wc -l)
4 | torchrun --nproc-per-node=$GPU run.py ${@:1}
--------------------------------------------------------------------------------
/VLMEvalKit/scripts/srun.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | set -x
3 | srun -n1 --ntasks-per-node=1 --partition $1 --gres=gpu:8 --quotatype=reserved --job-name vlmeval --cpus-per-task=64 torchrun --nproc-per-node=8 run.py ${@:2}
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/__init__.py:
--------------------------------------------------------------------------------
1 | try:
2 | import torch
3 | except ImportError:
4 | pass
5 |
6 | from .smp import *
7 | from .api import *
8 | from .dataset import *
9 | from .utils import *
10 | from .vlm import *
11 | from .config import *
12 | from .tools import cli
13 |
14 | load_env()
15 |
16 | __version__ = '0.2rc1'
17 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/api/__init__.py:
--------------------------------------------------------------------------------
1 | from .gpt import OpenAIWrapper, GPT4V
2 | from .hf_chat_model import HFChatModel
3 | from .gemini import GeminiWrapper, GeminiProVision
4 | from .qwen_vl_api import QwenVLWrapper, QwenVLAPI, Qwen2VLAPI
5 | from .qwen_api import QwenAPI
6 | from .claude import Claude_Wrapper, Claude3V
7 | from .reka import Reka
8 | from .glm_vision import GLMVisionAPI
9 | from .cloudwalk import CWWrapper
10 | from .sensechat_vision import SenseChatVisionAPI
11 | from .hunyuan import HunyuanVision
12 | from .bluelm_v_api import BlueLMWrapper, BlueLM_V_API
13 |
14 |
15 | __all__ = [
16 | 'OpenAIWrapper', 'HFChatModel', 'GeminiWrapper', 'GPT4V',
17 | 'GeminiProVision', 'QwenVLWrapper', 'QwenVLAPI', 'QwenAPI',
18 | 'Claude3V', 'Claude_Wrapper', 'Reka', 'GLMVisionAPI',
19 | 'CWWrapper', 'SenseChatVisionAPI', 'HunyuanVision', 'Qwen2VLAPI',
20 | 'BlueLMWrapper', 'BlueLM_V_API',
21 | ]
22 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/api/reka.py:
--------------------------------------------------------------------------------
1 | from vlmeval.smp import *
2 | from vlmeval.api.base import BaseAPI
3 | from time import sleep
4 | import mimetypes
5 |
6 |
7 | class Reka_Wrapper(BaseAPI):
8 |
9 | is_api: bool = True
10 | INTERLEAVE: bool = False
11 |
12 | def __init__(self,
13 | model: str = 'reka-flash-20240226',
14 | key: str = None,
15 | retry: int = 10,
16 | wait: int = 3,
17 | system_prompt: str = None,
18 | verbose: bool = True,
19 | temperature: float = 0,
20 | max_tokens: int = 1024,
21 | **kwargs):
22 |
23 | try:
24 | import reka
25 | except ImportError:
26 | raise ImportError('Please install reka by running "pip install reka-api"')
27 |
28 | self.model = model
29 | default_kwargs = dict(temperature=temperature, request_output_len=max_tokens)
30 | default_kwargs.update(kwargs)
31 | self.kwargs = default_kwargs
32 | if key is not None:
33 | self.key = key
34 | else:
35 | self.key = os.environ.get('REKA_API_KEY', '')
36 | super().__init__(retry=retry, wait=wait, verbose=verbose, system_prompt=system_prompt, **kwargs)
37 |
38 | def generate_inner(self, inputs, **kwargs) -> str:
39 | import reka
40 | reka.API_KEY = self.key
41 | dataset = kwargs.pop('dataset', None)
42 | prompt, image_path = self.message_to_promptimg(inputs, dataset=dataset)
43 | image_b64 = encode_image_file_to_base64(image_path)
44 |
45 | response = reka.chat(
46 | model_name=self.model,
47 | human=prompt,
48 | media_url=f'data:image/jpeg;base64,{image_b64}',
49 | **self.kwargs)
50 |
51 | try:
52 | return 0, response['text'], response
53 | except:
54 | return -1, self.fail_msg, response
55 |
56 |
57 | class Reka(Reka_Wrapper):
58 |
59 | def generate(self, message, dataset=None):
60 | return super(Reka_Wrapper, self).generate(message)
61 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/dataset/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .judge_util import build_judge, DEBUG_MESSAGE
2 | from .multiple_choice import extract_answer_from_item, prefetch_answer
3 | from .vqa_eval import levenshtein_distance
4 |
5 |
6 | __all__ = [
7 | 'build_judge', 'extract_answer_from_item', 'prefetch_answer',
8 | 'levenshtein_distance', 'DEBUG_MESSAGE'
9 | ]
10 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/dataset/utils/crpe.py:
--------------------------------------------------------------------------------
1 | import json
2 | import argparse
3 | from collections import defaultdict
4 |
5 |
6 | def is_correct(predict, answer):
7 | # predict is the ground-truth answer; answer is the model prediction
8 | if len(answer) == 1:
9 | return answer[0] == predict[0]
10 | elif len(answer) != 1 and answer[0] in ['A', 'B', 'C', 'D']:
11 | return answer[0] == predict[0]
12 | elif len(answer) != 1 and answer[0] not in ['A', 'B', 'C', 'D']:
13 | return predict[4:].lower() in answer.lower()
14 |
--------------------------------------------------------------------------------
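For reference, a small usage sketch of `is_correct` above. The option strings are invented, and the import path simply mirrors the file location (vlmeval/dataset/utils/crpe.py); it is not prescribed by the repository.

```python
# Hypothetical sanity check for is_correct; the strings below are made-up examples.
from vlmeval.dataset.utils.crpe import is_correct  # path mirrors vlmeval/dataset/utils/crpe.py

# First argument is the ground-truth string, second is the model prediction.
print(is_correct('A. red', 'A'))             # True: single-letter prediction, first characters match
print(is_correct('B. two', 'B. two cats'))   # True: both strings start with the same option letter
print(is_correct('(C) dog', 'a small dog'))  # True: free-form prediction, matched against the text after "(C) "
```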
/VLMEvalKit/vlmeval/dataset/utils/hrbench.py:
--------------------------------------------------------------------------------
1 | from ...smp import *
2 | import os
3 |
4 |
5 | def report_acc_hrbench(df):
6 | cycle_group = df.groupby('cycle_category')
7 | result_dic = defaultdict(list)
8 | avg_dic = defaultdict(int)
9 |
10 | count = 0
11 | for key, data_value in cycle_group:
12 | count += 1
13 | _, resp_dic = hrbench_score(data_value)
14 |
15 | for task_type, accuracy in resp_dic.items():
16 | result_dic['cycle'].append(key)
17 | result_dic['type'].append(task_type)
18 | result_dic['accuracy'].append(accuracy)
19 |
20 | avg_dic[task_type] += accuracy
21 | for task_type, accuracy in avg_dic.items():
22 | result_dic['cycle'].append('Average')
23 | result_dic['type'].append(task_type)
24 | result_dic['accuracy'].append(accuracy / count)
25 | result_pd = pd.DataFrame(result_dic)
26 |
27 | return result_pd
28 |
29 |
30 | def hrbench_score(data):
31 | ret = defaultdict(list)
32 | resp_dic = {}
33 | category_list = set(data['category'])
34 | score_dict = defaultdict(list)
35 |
36 | for i in range(len(data)):
37 | d = data.iloc[i]
38 | category = d['category']
39 | gpt_score = d['hit']
40 | score_dict[category].append(gpt_score)
41 | score_dict['all'].append(gpt_score)
42 |
43 | all_acc = np.mean(score_dict['all'])
44 | ret['type'].append('all')
45 | ret['acc'].append(all_acc)
46 | resp_dic['all'] = all_acc
47 | for cate in category_list:
48 | acc = np.mean(score_dict[cate])
49 | ret['type'].append(cate)
50 | ret['acc'].append(acc)
51 |
52 | resp_dic[cate] = acc
53 |
54 | return pd.DataFrame(ret), resp_dic
55 |
--------------------------------------------------------------------------------
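To illustrate the input `report_acc_hrbench` expects, here is a toy sketch with invented rows; only the three columns the scorer reads ('cycle_category', 'category', 'hit') are included, and the task-type names are placeholders.

```python
import pandas as pd

from vlmeval.dataset.utils.hrbench import report_acc_hrbench  # path mirrors the file location

# Invented evaluation records: 'hit' is 1 for a correct answer, 0 otherwise.
df = pd.DataFrame({
    'cycle_category': ['cycle_0', 'cycle_0', 'cycle_1', 'cycle_1'],
    'category': ['single', 'cross', 'single', 'cross'],  # hypothetical task types
    'hit': [1, 0, 1, 1],
})

# Prints per-cycle accuracy per task type (plus an 'all' row per cycle and an 'Average' block).
print(report_acc_hrbench(df))
```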
/VLMEvalKit/vlmeval/dataset/utils/judge_util.py:
--------------------------------------------------------------------------------
1 | import os
2 | from ...api import OpenAIWrapper
3 | from ...smp import load_env
4 |
5 | INTERNAL = os.environ.get('INTERNAL', 0)
6 |
7 |
8 | def build_judge(**kwargs):
9 | model = kwargs.pop('model', None)
10 | kwargs.pop('nproc', None)
11 | load_env()
12 | LOCAL_LLM = os.environ.get('LOCAL_LLM', None)
13 | if LOCAL_LLM is None:
14 | model_map = {
15 | 'gpt-4-turbo': 'gpt-4-1106-preview',
16 | 'gpt-4-0613': 'gpt-4-0613',
17 | 'gpt-4-0125': 'gpt-4-0125-preview',
18 | 'gpt-4-0409': 'gpt-4-turbo-2024-04-09',
19 | 'chatgpt-1106': 'gpt-3.5-turbo-1106',
20 | 'chatgpt-0125': 'gpt-3.5-turbo-0125',
21 | 'gpt-4o': 'gpt-4o-2024-05-13',
22 | 'gpt-4o-mini': 'gpt-4o-mini-2024-07-18',
23 | }
24 | model_version = model_map[model]
25 | else:
26 | model_version = LOCAL_LLM
27 | model = OpenAIWrapper(model_version, **kwargs)
28 | return model
29 |
30 |
31 | DEBUG_MESSAGE = """
32 | To debug the OpenAI API, you can try the following scripts in python:
33 | ```python
34 | from vlmeval.api import OpenAIWrapper
35 | model = OpenAIWrapper('gpt-4-1106-preview', verbose=True)
36 | msgs = [dict(type='text', value='Hello!')]
37 | code, answer, resp = model.generate_inner(msgs)
38 | print(code, answer, resp)
39 | ```
40 | You can see the specific error if the API call fails.
41 | """
42 |
--------------------------------------------------------------------------------
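A minimal sketch of calling `build_judge`, assuming a valid OpenAI key is available in the environment (or `LOCAL_LLM` points at a locally served model); the model alias and the prompt are placeholders, and the call pattern follows DEBUG_MESSAGE above.

```python
from vlmeval.dataset.utils.judge_util import build_judge

# 'gpt-4o-mini' is one of the aliases in model_map above; any listed alias works the same way.
judge = build_judge(model='gpt-4o-mini', verbose=True)

msgs = [dict(type='text', value='Reply with the single word OK.')]
code, answer, resp = judge.generate_inner(msgs)  # same call pattern as in DEBUG_MESSAGE
print(code, answer)
```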
/VLMEvalKit/vlmeval/smp/__init__.py:
--------------------------------------------------------------------------------
1 | from .file import *
2 | from .vlm import *
3 | from .misc import *
4 | from .log import *
5 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/smp/log.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | logger_initialized = {}
4 |
5 |
6 | def get_logger(name, log_file=None, log_level=logging.INFO, file_mode='w'):
7 | logger = logging.getLogger(name)
8 | if name in logger_initialized:
9 | return logger
10 |
11 | for logger_name in logger_initialized:
12 | if name.startswith(logger_name):
13 | return logger
14 |
15 | stream_handler = logging.StreamHandler()
16 | handlers = [stream_handler]
17 |
18 | try:
19 | import torch.distributed as dist
20 | if dist.is_available() and dist.is_initialized():
21 | rank = dist.get_rank()
22 | else:
23 | rank = 0
24 | except ImportError:
25 | rank = 0
26 |
27 | if rank == 0 and log_file is not None:
28 | file_handler = logging.FileHandler(log_file, file_mode)
29 | handlers.append(file_handler)
30 |
31 | formatter = logging.Formatter(
32 | '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
33 | for handler in handlers:
34 | handler.setFormatter(formatter)
35 | handler.setLevel(log_level)
36 | logger.addHandler(handler)
37 |
38 | if rank == 0:
39 | logger.setLevel(log_level)
40 | else:
41 | logger.setLevel(logging.ERROR)
42 |
43 | logger_initialized[name] = True
44 | return logger
45 |
--------------------------------------------------------------------------------
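For orientation, a short usage sketch of `get_logger`; the logger name and log file name are arbitrary choices for this example.

```python
from vlmeval.smp.log import get_logger

# A file handler is attached only on rank 0 and only when log_file is given.
logger = get_logger('Evaluation', log_file='eval.log')
logger.info('starting inference')

# Calling again with the same name returns the already-configured logger
# without adding duplicate handlers.
same_logger = get_logger('Evaluation')
assert same_logger is logger
```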
/VLMEvalKit/vlmeval/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .matching_util import can_infer, can_infer_option, can_infer_text
2 | from .mp_util import track_progress_rich
3 |
4 |
5 | __all__ = [
6 | 'can_infer', 'can_infer_option', 'can_infer_text', 'track_progress_rich',
7 | ]
8 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/utils/matching_util.py:
--------------------------------------------------------------------------------
1 | import string
2 | import copy as cp
3 | import os
4 | from ..smp import *
5 |
6 |
7 | def can_infer_option(answer, choices):
8 | verbose = os.environ.get('VERBOSE', 0)
9 | # Choices is a dictionary
10 | if 'Failed to obtain answer via API' in answer:
11 | return False
12 |
13 | reject_to_answer = [
14 | "Sorry, I can't help with images of people yet.",
15 | "I can't process this file.",
16 | "I'm sorry, but without the image provided",
17 | 'Cannot determine the answer'
18 | ]
19 | for err in reject_to_answer:
20 | if err in answer:
21 | return 'Z'
22 |
23 | def count_choice(splits, choices, prefix='', suffix=''):
24 | cnt = 0
25 | for c in choices:
26 | if prefix + c + suffix in splits:
27 | cnt += 1
28 | return cnt
29 |
30 | answer_mod = cp.copy(answer)
31 | chars = '.()[],:;!*#{}'
32 | for c in chars:
33 | answer_mod = answer_mod.replace(c, ' ')
34 |
35 | splits = [x.strip() for x in answer_mod.split()]
36 | count = count_choice(splits, choices)
37 |
38 | if count == 1:
39 | for ch in choices:
40 | if 'A' in splits and len(splits) > 3 and verbose:
41 | logger = get_logger('Evaluation')
42 | logger.info(f'A might be a quantifier in the string: {answer}.')
43 | return False
44 | if ch in splits:
45 | return ch
46 | elif count == 0 and count_choice(splits, {'Z', ''}) == 1:
47 | return 'Z'
48 | return False
49 |
50 |
51 | def can_infer_text(answer, choices):
52 | answer = answer.lower()
53 | assert isinstance(choices, dict)
54 | for k in choices:
55 | assert k in string.ascii_uppercase
56 | choices[k] = str(choices[k]).lower()
57 | cands = []
58 | for k in choices:
59 | if choices[k] in answer:
60 | cands.append(k)
61 | if len(cands) == 1:
62 | return cands[0]
63 | return False
64 |
65 |
66 | def can_infer(answer, choices):
67 | answer = str(answer)
68 | copt = can_infer_option(answer, choices)
69 | return copt if copt else can_infer_text(answer, choices)
70 |
--------------------------------------------------------------------------------
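A quick illustration of how these matching helpers behave; the answer strings and the choice dictionary are invented for the example.

```python
from vlmeval.utils import can_infer  # exported in vlmeval/utils/__init__.py

choices = {'A': 'cat', 'B': 'dog', 'C': 'horse'}  # toy multiple-choice options

print(can_infer('The answer is B.', choices))             # 'B'  (option letter found among the tokens)
print(can_infer('It looks like a horse to me', choices))  # 'C'  (falls back to matching the option text)
print(can_infer('Cannot determine the answer', choices))  # 'Z'  (recognized refusal / no-answer sentinel)
```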
/VLMEvalKit/vlmeval/vlm/__init__.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | torch.set_grad_enabled(False)
4 | torch.manual_seed(1234)
5 | from .base import BaseModel
6 | from .cogvlm import CogVlm, GLM4v
7 | from .emu import Emu
8 | from .eagle_x import Eagle
9 | from .idefics import IDEFICS, IDEFICS2
10 | from .instructblip import InstructBLIP
11 | from .llava import LLaVA, LLaVA_Next, LLaVA_XTuner, LLaVA_Next2, LLaVA_OneVision
12 | from .minicpm_v import MiniCPM_V, MiniCPM_Llama3_V, MiniCPM_V_2_6
13 | from .minigpt4 import MiniGPT4
14 | from .mmalaya import MMAlaya, MMAlaya2
15 | from .monkey import Monkey, MonkeyChat
16 | from .moondream import Moondream1, Moondream2
17 | from .minimonkey import MiniMonkey
18 | from .mplug_owl2 import mPLUG_Owl2
19 | from .omnilmm import OmniLMM12B
20 | from .open_flamingo import OpenFlamingo
21 | from .pandagpt import PandaGPT
22 | from .qwen_vl import QwenVL, QwenVLChat
23 | from .qwen2_vl import Qwen2VLChat
24 | from .transcore_m import TransCoreM
25 | from .visualglm import VisualGLM
26 | from .xcomposer import ShareCaptioner, XComposer, XComposer2, XComposer2_4KHD, XComposer2d5
27 | from .yi_vl import Yi_VL
28 | from .internvl_chat import InternVLChat
29 | from .deepseek_vl import DeepSeekVL
30 | from .mgm import Mini_Gemini
31 | from .bunnyllama3 import BunnyLLama3
32 | from .vxverse import VXVERSE
33 | from .paligemma import PaliGemma
34 | from .qh_360vl import QH_360VL
35 | from .phi3_vision import Phi3Vision, Phi3_5Vision
36 | from .wemm import WeMM
37 | from .cambrian import Cambrian
38 | from .chameleon import Chameleon
39 | from .video_llm import VideoLLaVA, VideoLLaVA_HF, Chatunivi, VideoChatGPT, LLaMAVID, VideoChat2_HD, PLLaVA
40 | from .vila import VILA
41 | from .ovis import Ovis, Ovis1_6
42 | from .mantis import Mantis
43 | from .mixsense import LLama3Mixsense
44 | from .parrot import Parrot
45 | from .omchat import OmChat
46 | from .rbdash import RBDash
47 | from .xgen_mm import XGenMM
48 | from .slime import SliME
49 | from .mplug_owl3 import mPLUG_Owl3
50 | from .pixtral import Pixtral
51 | from .llama_vision import llama_vision
52 | from .llava_uhd import LLaVA_UHD
53 | from .llava_uhd2 import LLaVA_UHD2
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/chameleon.py:
--------------------------------------------------------------------------------
1 | import os.path as osp
2 | import warnings
3 | from .base import BaseModel
4 | from ..smp import *
5 | from PIL import Image
6 | import torch
7 |
8 |
9 | class Chameleon(BaseModel):
10 |
11 | INSTALL_REQ = False
12 | INTERLEAVE = True
13 |
14 | def __init__(self, model_path='facebook/chameleon-7b', **kwargs):
15 | try:
16 | from transformers import ChameleonProcessor, ChameleonForConditionalGeneration
17 | except:
18 | warnings.warn('Please install the latest transformers.')
19 |
20 | processor = ChameleonProcessor.from_pretrained(model_path)
21 | model = ChameleonForConditionalGeneration.from_pretrained(model_path, torch_dtype=torch.bfloat16)
22 |
23 | self.model = model.cuda().eval()
24 | self.processor = processor
25 |
26 | def generate_inner(self, message, dataset=None):
27 | content, images = '', []
28 | for x in message:
29 | if x['type'] == 'text':
30 | content += x['value']
31 | elif x['type'] == 'image':
32 | content += '\n'
33 | images.append(Image.open(x['value']))
34 |
35 | inputs = self.processor(
36 | text=[content],
37 | images=images,
38 | padding=True,
39 | return_tensors='pt'
40 | ).to(device='cuda', dtype=torch.bfloat16)
41 | generate_ids = self.model.generate(**inputs, max_new_tokens=512)
42 | input_token_len = inputs.input_ids.shape[1]
43 | text = self.processor.batch_decode(
44 | generate_ids[:, input_token_len:],
45 | skip_special_tokens=True,
46 | clean_up_tokenization_spaces=False
47 | )[0]
48 | return text
49 |
--------------------------------------------------------------------------------
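The wrapper above consumes the interleaved message format used by the model wrappers in this kit; a hypothetical driver might look like the following. The image path and question are placeholders, and running it needs a CUDA GPU plus a transformers version that ships the Chameleon classes.

```python
from vlmeval.vlm.chameleon import Chameleon

model = Chameleon()  # downloads facebook/chameleon-7b on first use

# Interleaved message: a list of dicts with 'type' in {'image', 'text'} and a 'value'.
message = [
    dict(type='image', value='demo.jpg'),                     # placeholder local image path
    dict(type='text', value='What is shown in this image?'),  # placeholder question
]
print(model.generate_inner(message))
```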
/VLMEvalKit/vlmeval/vlm/instructblip.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from PIL import Image
3 | import os.path as osp
4 | import sys
5 | from .base import BaseModel
6 | from ..smp import *
7 |
8 |
9 | class InstructBLIP(BaseModel):
10 |
11 | INSTALL_REQ = True
12 | INTERLEAVE = False
13 |
14 | def __init__(self, name):
15 | self.config_map = {
16 | 'instructblip_7b': 'misc/blip2_instruct_vicuna7b.yaml',
17 | 'instructblip_13b': 'misc/blip2_instruct_vicuna13b.yaml',
18 | }
19 |
20 | self.file_path = __file__
21 | config_root = osp.dirname(self.file_path)
22 |
23 | try:
24 | from lavis.models import load_preprocess
25 | from omegaconf import OmegaConf
26 | from lavis.common.registry import registry
27 | except:
28 | warnings.warn('Please install lavis before using InstructBLIP. ')
29 | sys.exit(-1)
30 |
31 | assert name in self.config_map
32 | cfg_path = osp.join(config_root, self.config_map[name])
33 | cfg = OmegaConf.load(cfg_path)
34 |
35 | model_cfg = cfg.model
36 | assert osp.exists(model_cfg.llm_model) or splitlen(model_cfg.llm_model) == 2
37 | model_cls = registry.get_model_class(name='blip2_vicuna_instruct')
38 | model = model_cls.from_config(model_cfg)
39 | model.eval()
40 |
41 | self.device = torch.device('cuda') if torch.cuda.is_available() else 'cpu'
42 | device = self.device
43 | model.to(device)
44 | self.model = model
45 | self.kwargs = {'max_length': 512}
46 |
47 | preprocess_cfg = cfg.preprocess
48 | vis_processors, _ = load_preprocess(preprocess_cfg)
49 | self.vis_processors = vis_processors
50 |
51 | def generate_inner(self, message, dataset=None):
52 | prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
53 | vis_processors = self.vis_processors
54 | raw_image = Image.open(image_path).convert('RGB')
55 | image_tensor = vis_processors['eval'](raw_image).unsqueeze(0).to(self.device)
56 | outputs = self.model.generate(dict(image=image_tensor, prompt=prompt))
57 | return outputs[0]
58 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/llava/__init__.py:
--------------------------------------------------------------------------------
1 | from .llava import LLaVA, LLaVA_Next, LLaVA_Next2, LLaVA_OneVision
2 | from .llava_xtuner import LLaVA_XTuner
3 |
4 | __all__ = ['LLaVA', 'LLaVA_Next', 'LLaVA_XTuner', 'LLaVA_Next2', 'LLaVA_OneVision']
5 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/misc/blip2_instruct_vicuna13b.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: instruct_vicuna13b
8 | load_finetuned: False
9 | load_pretrained: True
10 |
11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna13b_trimmed.pth"
12 | finetuned: ""
13 |
14 | # vit encoder
15 | image_size: 224
16 | drop_path_rate: 0
17 | use_grad_checkpoint: False
18 | vit_precision: "fp16"
19 | freeze_vit: True
20 |
21 | # Q-Former
22 | num_query_token: 32
23 |
24 | # path to Vicuna checkpoint
25 | llm_model: "Please set the path to your vicuna-13b-v1.1"
26 |
27 | # generation configs
28 | prompt: ""
29 |
30 |
31 | preprocess:
32 | vis_processor:
33 | train:
34 | name: "blip2_image_train"
35 | image_size: 224
36 | eval:
37 | name: "blip_image_eval"
38 | image_size: 224
39 | text_processor:
40 | train:
41 | name: "blip_caption"
42 | eval:
43 | name: "blip_caption"
44 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/misc/blip2_instruct_vicuna7b.yaml:
--------------------------------------------------------------------------------
1 | # Copyright (c) 2022, salesforce.com, inc.
2 | # All rights reserved.
3 | # SPDX-License-Identifier: BSD-3-Clause
4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5 |
6 | model:
7 | arch: instruct_vicuna7b
8 | load_finetuned: False
9 | load_pretrained: True
10 |
11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna7b_trimmed.pth"
12 | finetuned: ""
13 |
14 | # vit encoder
15 | image_size: 224
16 | drop_path_rate: 0
17 | use_grad_checkpoint: False
18 | vit_precision: "fp16"
19 | freeze_vit: True
20 |
21 | # Q-Former
22 | num_query_token: 32
23 |
24 | # path to Vicuna checkpoint
25 | llm_model: "Please set the path to your vicuna-7b-v1.1"
26 |
27 | # generation configs
28 | prompt: ""
29 |
30 |
31 | preprocess:
32 | vis_processor:
33 | train:
34 | name: "blip2_image_train"
35 | image_size: 224
36 | eval:
37 | name: "blip_image_eval"
38 | image_size: 224
39 | text_processor:
40 | train:
41 | name: "blip_caption"
42 | eval:
43 | name: "blip_caption"
44 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/misc/minigpt4_13b_eval.yaml:
--------------------------------------------------------------------------------
1 | model:
2 | arch: minigpt4
3 | model_type: pretrain_vicuna_7b
4 | max_txt_len: 160
5 | end_sym: "###"
6 | low_resource: True
7 | prompt_template: '###Human: {} ###Assistant: '
8 | ckpt: "please set this value to the path of pretrained checkpoint"
9 |
10 | # vit encoder
11 | image_size: 224
12 | drop_path_rate: 0
13 | use_grad_checkpoint: False
14 | vit_precision: "fp16"
15 | freeze_vit: True
16 | freeze_qformer: True
17 |
18 | # Q-Former
19 | num_query_token: 32
20 |
21 | # generation configs
22 | prompt: ""
23 |
24 | llama_model: "please set this value to the path of vicuna-13b-v0"
25 |
26 | datasets:
27 | cc_sbu_align:
28 | vis_processor:
29 | train:
30 | name: "blip2_image_eval"
31 | image_size: 224
32 | text_processor:
33 | train:
34 | name: "blip_caption"
35 |
36 | run:
37 | task: image_text_pretrain
38 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/misc/minigpt4_7b_eval.yaml:
--------------------------------------------------------------------------------
1 | model:
2 | arch: minigpt4
3 | model_type: pretrain_vicuna_7b
4 | max_txt_len: 160
5 | end_sym: "###"
6 | low_resource: True
7 | prompt_template: '###Human: {} ###Assistant: '
8 | ckpt: "please set this value to the path of pretrained checkpoint"
9 |
10 | # vit encoder
11 | image_size: 224
12 | drop_path_rate: 0
13 | use_grad_checkpoint: False
14 | vit_precision: "fp16"
15 | freeze_vit: True
16 | freeze_qformer: True
17 |
18 | # Q-Former
19 | num_query_token: 32
20 |
21 | # generation configs
22 | prompt: ""
23 |
24 | llama_model: "please set this value to the path of vicuna-7b-v0"
25 |
26 |
27 | datasets:
28 | cc_sbu_align:
29 | vis_processor:
30 | train:
31 | name: "blip2_image_eval"
32 | image_size: 224
33 | text_processor:
34 | train:
35 | name: "blip_caption"
36 |
37 | run:
38 | task: image_text_pretrain
39 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/misc/minigptv2_eval.yaml:
--------------------------------------------------------------------------------
1 | model:
2 | arch: minigpt_v2
3 | model_type: pretrain
4 | max_txt_len: 160
5 | end_sym: ""
6 | low_resource: True
7 | prompt_template: '[INST] {} [/INST]'
8 | ckpt: "please set this value to the path of pretrained checkpoint"
9 | lora_r: 64
10 | lora_alpha: 16
11 |
12 | # vit encoder
13 | image_size: 448
14 | drop_path_rate: 0
15 | use_grad_checkpoint: False
16 | vit_precision: "fp16"
17 | freeze_vit: True
18 |
19 | # generation configs
20 | prompt: ""
21 |
22 | # LLM
23 | llama_model: "please set this value to the path of llama2-chat-7b"
24 |
25 | datasets:
26 | cc_sbu_align:
27 | vis_processor:
28 | train:
29 | name: "blip2_image_eval"
30 | image_size: 448
31 | text_processor:
32 | train:
33 | name: "blip_caption"
34 |
35 | run:
36 | task: image_text_pretrain
37 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/mixsense.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import transformers
3 | from transformers import AutoModelForCausalLM, AutoTokenizer
4 | from PIL import Image
5 | import warnings
6 |
7 | from .base import BaseModel
8 | from ..smp import *
9 |
10 |
11 | class LLama3Mixsense(BaseModel):
12 |
13 | INSTALL_REQ = False
14 | INTERLEAVE = False
15 |
16 | def __init__(self, model_path='Zero-Vision/Llama-3-MixSenseV1_1', **kwargs):
17 | assert model_path is not None
18 | transformers.logging.set_verbosity_error()
19 | transformers.logging.disable_progress_bar()
20 | warnings.filterwarnings('ignore')
21 | self.tokenizer = AutoTokenizer.from_pretrained(
22 | model_path, trust_remote_code=True
23 | )
24 | self.model = AutoModelForCausalLM.from_pretrained(
25 | model_path, trust_remote_code=True
26 | ).to('cuda').eval()
27 | self.kwargs = kwargs
28 |
29 | def generate_inner(self, message, dataset=None):
30 | prompt, image_path = self.message_to_promptimg(message)
31 | input_ids = self.model.text_process(prompt, self.tokenizer).to(device='cuda')
32 | image = Image.open(image_path).convert('RGB')
33 | image_tensor = self.model.image_process([image]).to(dtype=self.model.dtype, device='cuda')
34 | # generate
35 | with torch.inference_mode():
36 | output_ids = self.model.generate(
37 | input_ids,
38 | images=image_tensor,
39 | max_new_tokens=2048,
40 | use_cache=True,
41 | eos_token_id=[
42 | self.tokenizer.eos_token_id,
43 | self.tokenizer.convert_tokens_to_ids(['<|eot_id|>'])[0],
44 | ],
45 | )
46 | return self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
47 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/paligemma.py:
--------------------------------------------------------------------------------
1 | from PIL import Image
2 | import torch
3 |
4 | from .base import BaseModel
5 | from ..smp import *
6 |
7 |
8 | class PaliGemma(BaseModel):
9 | INSTALL_REQ = False
10 | INTERLEAVE = False
11 |
12 | def __init__(self, model_path='google/paligemma-3b-mix-448', **kwargs):
13 | try:
14 | from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
15 | except:
16 |             warnings.warn('Please install the latest version of transformers.')
17 | sys.exit(-1)
18 | model = PaliGemmaForConditionalGeneration.from_pretrained(
19 | model_path,
20 | torch_dtype=torch.bfloat16,
21 | device_map='cpu',
22 | revision='bfloat16',
23 | ).eval()
24 | self.model = model.cuda()
25 | self.processor = AutoProcessor.from_pretrained(model_path)
26 | self.kwargs = kwargs
27 |
28 | def generate_inner(self, message, dataset=None):
29 | prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
30 | image = Image.open(image_path).convert('RGB')
31 |
32 | model_inputs = self.processor(
33 | text=prompt, images=image, return_tensors='pt'
34 | ).to('cuda')
35 | input_len = model_inputs['input_ids'].shape[-1]
36 |
37 | with torch.inference_mode():
38 | generation = self.model.generate(
39 | **model_inputs, max_new_tokens=512, do_sample=False
40 | )
41 | generation = generation[0][input_len:]
42 | res = self.processor.decode(generation, skip_special_tokens=True)
43 | return res
44 |
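
A minimal driving sketch (not part of the repository), assuming the message format consumed by BaseModel.message_to_promptimg — a list of dicts with 'type' ('text' or 'image') and 'value' keys; the image path below is a placeholder and the model weights are downloaded from the Hugging Face Hub:

import torch  # noqa: F401 (model runs on CUDA as set up above)
from vlmeval.vlm.paligemma import PaliGemma

model = PaliGemma(model_path='google/paligemma-3b-mix-448')
message = [
    dict(type='image', value='assets/apple.jpg'),                    # placeholder image path
    dict(type='text', value='What fruit is shown in the picture?'),
]
print(model.generate_inner(message))
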
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/qwen2_vl/__init__.py:
--------------------------------------------------------------------------------
1 | from .model import Qwen2VLChat
2 | from .prompt import Qwen2VLPromptMixin
3 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/video_llm/__init__.py:
--------------------------------------------------------------------------------
1 | from .video_llava import VideoLLaVA, VideoLLaVA_HF
2 | from .videochat2 import VideoChat2_HD
3 | from .chat_uni_vi import Chatunivi
4 | from .video_chatgpt import VideoChatGPT
5 | from .llama_vid import LLaMAVID
6 | from .pllava import PLLaVA
7 |
8 | __all__ = ['VideoLLaVA', 'VideoLLaVA_HF', 'Chatunivi', 'VideoChatGPT', 'LLaMAVID', 'VideoChat2_HD', 'PLLaVA']
9 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/video_llm/configs/llama_vid/processor/clip-patch14-224/preprocessor_config.json:
--------------------------------------------------------------------------------
1 | {
2 | "crop_size": 224,
3 | "do_center_crop": true,
4 | "do_normalize": true,
5 | "do_resize": true,
6 | "feature_extractor_type": "CLIPFeatureExtractor",
7 | "image_mean": [
8 | 0.48145466,
9 | 0.4578275,
10 | 0.40821073
11 | ],
12 | "image_std": [
13 | 0.26862954,
14 | 0.26130258,
15 | 0.27577711
16 | ],
17 | "resample": 3,
18 | "size": 224
19 | }
20 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/video_llm/configs/videochat2_hd.json:
--------------------------------------------------------------------------------
1 | {
2 | "model": {
3 | "model_cls": "VideoChat2_it_hd_mistral",
4 | "vit_blip_model_path": "OpenGVLab/videochat2",
5 | "mistral_model_path": "mistralai/Mistral-7B-Instruct-v0.2",
6 | "videochat2_model_path": "OpenGVLab/VideoChat2_stage2_Mistral_7B",
7 | "freeze_vit": false,
8 | "freeze_qformer": false,
9 | "max_txt_len": 512,
10 | "low_resource": false,
11 | "vision_encoder": {
12 | "name": "vit_l14",
13 | "img_size": 224,
14 | "patch_size": 16,
15 | "d_model": 1024,
16 | "encoder_embed_dim": 1024,
17 | "encoder_depth": 24,
18 | "encoder_num_heads": 16,
19 | "drop_path_rate": 0.0,
20 | "num_frames": 8,
21 | "tubelet_size": 1,
22 | "use_checkpoint": true,
23 | "checkpoint_num": 18,
24 | "pretrained": "",
25 | "return_index": -2,
26 | "vit_add_ln": true,
27 | "ckpt_num_frame": 4
28 | },
29 | "num_query_token": 32,
30 | "qformer_hidden_dropout_prob": 0.1,
31 | "qformer_attention_probs_dropout_prob": 0.1,
32 | "qformer_drop_path_rate": 0.2,
33 | "extra_num_query_token": 64,
34 | "qformer_text_input": true,
35 | "system": "",
36 | "start_token": "",
37 | "end_token": " ",
38 | "add_second_msg": true,
39 | "img_start_token": "",
40 | "img_end_token": " ",
41 | "random_shuffle": true,
42 | "return_question_instruction": false,
43 | "use_flash_attention": true,
44 | "use_lora": false,
45 | "lora_r": 16,
46 | "lora_alpha": 32,
47 | "lora_dropout": 0.1,
48 | "dynamic_config": {
49 | "local_size": 224,
50 | "hd_num": 6,
51 | "padding": false,
52 | "add_global": true
53 | }
54 | },
55 | "device": "cuda"
56 | }
57 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/visualglm.py:
--------------------------------------------------------------------------------
1 | import warnings
2 | from .base import BaseModel
3 | from ..smp import *
4 |
5 |
6 | class VisualGLM(BaseModel):
7 |
8 | INSTALL_REQ = False
9 | INTERLEAVE = False
10 |
11 | def __init__(self, model_path='THUDM/visualglm-6b', **kwargs):
12 | try:
13 | import sat
14 | except:
15 | warnings.warn('Please install SwissArmyTransformer to use VisualGLM')
16 | assert model_path is not None
17 | self.model_path = model_path
18 |
19 | from transformers import AutoModel
20 | from transformers import AutoTokenizer
21 | self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
22 | model = AutoModel.from_pretrained(model_path, trust_remote_code=True).half().cuda()
23 | self.model = model
24 | self.kwargs = kwargs
25 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
26 |
27 | def generate_inner(self, message, dataset=None):
28 | prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
29 | output, _ = self.model.chat(
30 | image_path=image_path,
31 | tokenizer=self.tokenizer,
32 | query=prompt,
33 | history=[],
34 | **self.kwargs
35 | )
36 | return output
37 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/vlm/xcomposer/__init__.py:
--------------------------------------------------------------------------------
1 | from .sharecaptioner import ShareCaptioner
2 | from .xcomposer import XComposer
3 | from .xcomposer2 import XComposer2
4 | from .xcomposer2_4KHD import XComposer2_4KHD
5 | from .xcomposer2d5 import XComposer2d5
6 |
7 | __all__ = ['ShareCaptioner', 'XComposer', 'XComposer2', 'XComposer2_4KHD', 'XComposer2d5']
8 |
--------------------------------------------------------------------------------
/cog.yaml:
--------------------------------------------------------------------------------
1 | # Configuration for Cog ⚙️
2 | # Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md
3 |
4 | build:
5 | gpu: true
6 |
7 | python_version: "3.11"
8 |
9 | python_packages:
10 | - "torch==2.0.1"
11 | - "accelerate==0.21.0"
12 | - "bitsandbytes==0.41.0"
13 | - "deepspeed==0.9.5"
14 | - "einops-exts==0.0.4"
15 | - "einops==0.6.1"
16 | - "gradio==3.35.2"
17 | - "gradio_client==0.2.9"
18 | - "httpx==0.24.0"
19 | - "markdown2==2.4.10"
20 | - "numpy==1.26.0"
21 | - "peft==0.4.0"
22 | - "scikit-learn==1.2.2"
23 | - "sentencepiece==0.1.99"
24 | - "shortuuid==1.0.11"
25 | - "timm==0.6.13"
26 | - "tokenizers==0.13.3"
27 | - "torch==2.0.1"
28 | - "torchvision==0.15.2"
29 | - "transformers==4.31.0"
30 | - "wandb==0.15.12"
31 | - "wavedrom==2.0.3.post3"
32 | - "Pygments==2.16.1"
33 | run:
34 | - curl -o /usr/local/bin/pget -L "https://github.com/replicate/pget/releases/download/v0.0.3/pget" && chmod +x /usr/local/bin/pget
35 |
36 | # predict.py defines how predictions are run on your model
37 | predict: "predict.py:Predictor"
38 |
--------------------------------------------------------------------------------
/doc/HiWin.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/LLaVA-UHD/08c85e208cd12c825303619723e4fe20e64f7f58/doc/HiWin.png
--------------------------------------------------------------------------------
/doc/arch.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/LLaVA-UHD/08c85e208cd12c825303619723e4fe20e64f7f58/doc/arch.png
--------------------------------------------------------------------------------
/doc/pyramid.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/LLaVA-UHD/08c85e208cd12c825303619723e4fe20e64f7f58/doc/pyramid.png
--------------------------------------------------------------------------------
/eval.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # mkdir -p "./exp_results/$1"
4 | # echo 'made a dir ./exp_results/'$1
5 |
6 | NVIDIA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
7 |
8 | CUDA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES bash scripts/v1_5/eval/mme.sh $1 #> ./exp_results/$1/mme_result.log
9 | echo 'mme done'
10 |
11 | CUDA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES bash scripts/v1_5/eval/ai2d.sh $1 #> ./exp_results/$1/ai2d_result.log
12 | echo 'ai2d done'
13 |
14 | CUDA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES bash scripts/v1_5/eval/docvqa_val.sh $1 #> ./exp_results/$1/docvqa_eval_result.log
15 | echo 'doc done'
16 |
17 | CUDA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES bash scripts/v1_5/eval/chartqa.sh $1 #> ./exp_results/$1/chartqa_result.log
18 | echo 'chart done'
19 |
20 | # traditional
21 | CUDA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES bash scripts/v1_5/eval/textvqa.sh $1 #> ./exp_results/$1/textvqa_result.log
22 | echo 'textvqa done'
23 |
24 | CUDA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES bash scripts/v1_5/eval/gqa.sh $1 #> ./exp_results/$1/gqa_result.log
25 | echo 'gqa done'
26 |
27 | CUDA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES bash scripts/v1_5/eval/sqa.sh $1 #> ./exp_results/$1/scienceqa_result.log
28 | echo 'sqa done'
29 |
30 | echo 'All eval done, exiting successfully.'
--------------------------------------------------------------------------------
/featup/__init__.py:
--------------------------------------------------------------------------------
1 | from featup.upsamplers import JBULearnedRange
--------------------------------------------------------------------------------
/featup/adaptive_conv_cuda/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/LLaVA-UHD/08c85e208cd12c825303619723e4fe20e64f7f58/featup/adaptive_conv_cuda/__init__.py
--------------------------------------------------------------------------------
/featup/adaptive_conv_cuda/adaptive_conv.py:
--------------------------------------------------------------------------------
1 | from torch.autograd import Function
2 | import torch
3 |
4 | import adaptive_conv_cuda_impl as cuda_impl
5 | import adaptive_conv_cpp_impl as cpp_impl
6 |
7 | torch.manual_seed(42)
8 |
9 |
10 | class AdaptiveConv(Function):
11 |
12 | @staticmethod
13 | def forward(ctx, input, filters):
14 | ctx.save_for_backward(filters, input)
15 | b, h2, w2, f1, f2 = filters.shape
16 | assert f1 == f2
17 |
18 | if input.is_cuda:
19 | assert filters.is_cuda
20 | result = cuda_impl.forward(input, filters)
21 | else:
22 | result = cpp_impl.forward(input, filters)
23 |
24 | return result
25 |
26 | @staticmethod
27 | def backward(ctx, grad_output):
28 | filters, input = ctx.saved_tensors
29 | grad_input = grad_filters = None
30 | b, h2, w2, f1, f2 = filters.shape
31 | assert f1 == f2
32 |
33 | grad_output = grad_output.contiguous()
34 | if grad_output.is_cuda:
35 | assert input.is_cuda
36 | assert filters.is_cuda
37 | if ctx.needs_input_grad[0]:
38 | grad_input = cuda_impl.grad_input(grad_output, filters)
39 | if ctx.needs_input_grad[1]:
40 | grad_filters = cuda_impl.grad_filters(grad_output, input)
41 | else:
42 | if ctx.needs_input_grad[0]:
43 | grad_input = cpp_impl.grad_input(grad_output, filters)
44 | if ctx.needs_input_grad[1]:
45 | grad_filters = cpp_impl.grad_filters(grad_output, input)
46 |
47 | return grad_input, grad_filters
48 |
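
A hedged sketch of driving the Function above through AdaptiveConv.apply, following the shape constraint implied by the asserts (filters of shape (B, H_out, W_out, K, K) with a square kernel). It requires the compiled adaptive_conv_cuda_impl / adaptive_conv_cpp_impl extensions, and the exact relation between the input and filter grids depends on those kernels, which are not shown here:

import torch
from featup.adaptive_conv_cuda.adaptive_conv import AdaptiveConv

x = torch.randn(1, 3, 36, 36, device='cuda', requires_grad=True)
filters = torch.rand(1, 32, 32, 5, 5, device='cuda', requires_grad=True)  # (B, H_out, W_out, K, K)
out = AdaptiveConv.apply(x, filters)  # dispatches to the CUDA or C++ implementation
out.mean().backward()                 # exercises the custom backward pass
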
--------------------------------------------------------------------------------
/featup/adaptive_conv_cuda/adaptive_conv_cuda.cpp:
--------------------------------------------------------------------------------
1 | #include <torch/extension.h>
2 | using torch::Tensor;
3 |
4 | // CUDA forward declarations
5 |
6 | Tensor adaptive_conv_cuda_forward(Tensor input, Tensor filters);
7 | Tensor adaptive_conv_cuda_grad_input(Tensor grad_output, Tensor filters);
8 | Tensor adaptive_conv_cuda_grad_filters(Tensor grad_output, Tensor input);
9 |
10 | // C++ interface
11 |
12 | // NOTE: AT_ASSERT has become AT_CHECK on master after 0.4.
13 | #define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor")
14 | #define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous")
15 | #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x)
16 |
17 | Tensor adaptive_conv_forward(Tensor input, Tensor filters) {
18 | //CHECK_INPUT(input);
19 | //CHECK_INPUT(filters);
20 | return adaptive_conv_cuda_forward(input, filters);
21 | }
22 |
23 | Tensor adaptive_conv_grad_input(Tensor grad_output, Tensor filters) {
24 | //CHECK_INPUT(grad_output);
25 | //CHECK_INPUT(filters);
26 | return adaptive_conv_cuda_grad_input(grad_output, filters);
27 | }
28 |
29 | Tensor adaptive_conv_grad_filters(Tensor grad_output, Tensor input) {
30 | //CHECK_INPUT(grad_output);
31 | //CHECK_INPUT(input);
32 | return adaptive_conv_cuda_grad_filters(grad_output, input);
33 | }
34 |
35 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
36 | m.def("forward", &adaptive_conv_forward, "adaptive_conv forward");
37 | m.def("grad_input", &adaptive_conv_grad_input, "adaptive_conv grad_input");
38 | m.def("grad_filters", &adaptive_conv_grad_filters, "adaptive_conv grad_filters");
39 | }
40 |
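
One possible way to compile these bindings is PyTorch's JIT extension loader, sketched below; the .cu filename is an assumption (the kernel source is not shown here), and the repository may instead build adaptive_conv_cuda_impl through setup.py:

from torch.utils.cpp_extension import load

adaptive_conv_cuda_impl = load(
    name='adaptive_conv_cuda_impl',
    sources=[
        'featup/adaptive_conv_cuda/adaptive_conv_cuda.cpp',
        'featup/adaptive_conv_cuda/adaptive_conv_kernel.cu',  # assumed kernel source name
    ],
)
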
--------------------------------------------------------------------------------
/featup/configs/vdim_upsampler.yaml:
--------------------------------------------------------------------------------
1 | # Environment Args
2 | output_root: '.'
3 | pytorch_data_dir: './datasets'
4 | submitting_to_aml: false
5 |
6 | # Dataset args
7 | train_dataset: "cocostuff"
8 | val_dataset: "coco_validation50"
9 | res: 336 #224 or 336
10 |
11 | # Model Args
12 | model_type: "clip-large" #vit or clip-large
13 | activation_type: "token"
14 | is_norm: False
15 | is_high_res: False
16 | dim: 1024 #384 or 1024
17 |
18 | # Upsampling args
19 | outlier_detection: True
20 | upsampler_type: "jbu_4x_stack"
21 | downsampler_type: "attention"
22 | max_pad: 30
23 | max_zoom: 2
24 | n_jitters: 2
25 | random_projection: 30
26 | crf_weight: 0.001
27 | filter_ent_weight: 0.0
28 | tv_weight: 0.0
29 |
30 | implicit_sup_weight: 1.0
31 |
32 | # Training args
33 | batch_size: 2
34 | epochs: 1
35 | num_gpus: 8
36 | num_workers: 24
37 | lr: 1e-3
38 |
39 | # No need to change
40 | hydra:
41 | run:
42 | dir: "."
43 | output_subdir: ~
44 |
45 |
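
The hydra block at the bottom suggests this config is consumed through Hydra; a quick way to inspect the values outside a Hydra run is OmegaConf (assuming omegaconf is installed):

from omegaconf import OmegaConf

cfg = OmegaConf.load('featup/configs/vdim_upsampler.yaml')
print(cfg.model_type, cfg.dim, cfg.upsampler_type)  # clip-large 1024 jbu_4x_stack
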
--------------------------------------------------------------------------------
/featup/datasets/DAVIS.py:
--------------------------------------------------------------------------------
1 | from torchvision import transforms
2 | import os
3 | from PIL import Image
4 | from torch.utils.data import Dataset
5 |
6 |
7 | class DAVIS(Dataset):
8 | def __init__(self, root, video_name, transform=None):
9 | """
10 | Args:
11 | root (string): Directory with all the videos.
12 | video_name (string): Name of the specific video.
13 | transform (callable, optional): Optional transform to be applied on a sample.
14 | """
15 | self.root_dir = os.path.join(root, "DAVIS/JPEGImages/480p/", video_name)
16 | self.frames = os.listdir(self.root_dir)
17 | self.transform = transform
18 |
19 | def __len__(self):
20 | return len(self.frames)
21 |
22 | def __getitem__(self, idx):
23 | img_path = os.path.join(self.root_dir, self.frames[idx])
24 | image = Image.open(img_path).convert("RGB")
25 |
26 | if self.transform:
27 | image = self.transform(image)
28 |
29 | return {"img": image, "img_path": img_path}
30 |
31 |
32 | if __name__ == "__main__":
33 | transform = transforms.Compose([
34 | transforms.Resize((256, 256)),
35 | transforms.ToTensor()
36 | ])
37 |
38 | davis_dataset = DAVIS(root='/pytorch-data', video_name="motocross-jump", transform=transform)
39 |
40 | frames = davis_dataset[0]
41 |
42 | print("here")
43 |
--------------------------------------------------------------------------------
/featup/datasets/DOC.py:
--------------------------------------------------------------------------------
1 | from os.path import join
2 |
3 | import numpy as np
4 | import torch
5 | import torch.multiprocessing
6 | from PIL import Image
7 | from torch.utils.data import Dataset
8 |
9 | class Doc(Dataset):
10 | def __init__(self,
11 | root,
12 | split,
13 | transform,
14 | target_transform,
15 | subset=None):
16 | super(Doc, self).__init__()
17 | self.split = split
18 | self.root = join(root, "Doc")
19 | self.transform = transform
20 | self.label_transform = target_transform
21 | self.subset = subset
22 |
23 | if self.subset is None:
24 | self.image_list = "Doc20000.txt"
25 | elif self.subset == 'Doc_validation50':
26 | self.image_list = "Doc_validation50.txt"
27 |
28 | assert self.split in ["train", "val", "train+val"]
29 | split_dirs = {
30 | "train": ["train"],
31 | "val": ["val"],
32 | "train+val": ["train", "val"]
33 | }
34 |
35 | self.image_files = []
36 | for split_dir in split_dirs[self.split]:
37 | with open(join(self.root, "curated", self.image_list), "r") as f:
38 | img_names = [fn.rstrip() for fn in f.readlines()]
39 | for img_name in img_names:
40 | self.image_files.append(join(self.root, "images", img_name))
41 |
42 | def __len__(self):
43 | return len(self.image_files)
44 |
45 | def __getitem__(self, index):
46 | image_path = self.image_files[index]
47 | batch = {}
48 | img = self.transform(Image.open(image_path).convert("RGB"))
49 | batch["img"] = img
50 | batch["img_path"] = image_path
51 | return batch
52 |
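
A hedged sketch of the on-disk layout Doc expects, inferred from __init__ and __getitem__: <root>/Doc/curated/Doc20000.txt lists one image filename per line, and the images live under <root>/Doc/images/. The root path below is a placeholder and the files must already exist:

import torchvision.transforms as T
from featup.datasets.DOC import Doc

ds = Doc(root='./datasets', split='train', transform=T.ToTensor(), target_transform=None)
sample = ds[0]
print(sample['img'].shape, sample['img_path'])
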
--------------------------------------------------------------------------------
/featup/datasets/DocSceneText.py:
--------------------------------------------------------------------------------
1 | from os.path import join
2 |
3 | import numpy as np
4 | import torch
5 | import torch.multiprocessing
6 | from PIL import Image
7 | from torch.utils.data import Dataset
8 |
9 | class DocSceneText(Dataset):
10 | def __init__(self,
11 | root,
12 | split,
13 | transform,
14 | target_transform,
15 | subset=None):
16 | super(DocSceneText, self).__init__()
17 | self.split = split
18 | self.root = join(root, "224DocSceneText")
19 | self.transform = transform
20 | self.label_transform = target_transform
21 | self.subset = subset
22 |
23 | if self.subset is None:
24 | self.image_list = "224docSceneText.txt"
25 |
26 | self.image_files = []
27 | with open(join(self.root, "curated", self.image_list), "r") as f:
28 | img_names = [fn.rstrip() for fn in f.readlines()]
29 | for img_name in img_names:
30 | self.image_files.append(join(self.root, img_name))
31 |
32 | def __len__(self):
33 | return len(self.image_files)
34 |
35 | def __getitem__(self, index):
36 | image_path = self.image_files[index]
37 | batch = {}
38 | img = self.transform(Image.open(image_path).convert("RGB"))
39 | batch["img"] = img
40 | batch["img_path"] = image_path
41 | return batch
42 |
--------------------------------------------------------------------------------
/featup/datasets/EmbeddingFile.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from torch.utils.data import Dataset
3 |
4 |
5 | class EmbeddingFile(Dataset):
6 | """
7 | modified from: https://pytorch.org/docs/stable/_modules/torchvision/datasets/folder.html#ImageFolder
8 | uses cached directory listing if available rather than walking directory
9 | Attributes:
10 | classes (list): List of the class names.
11 | class_to_idx (dict): Dict with items (class_name, class_index).
12 | samples (list): List of (sample path, class_index) tuples
13 | targets (list): The class_index value for each image in the dataset
14 | """
15 |
16 | def __init__(self, file, with_images=False):
17 | super(Dataset, self).__init__()
18 | self.file = file
19 | loaded = np.load(file)
20 | self.feats = loaded["feats"]
21 | self.labels = loaded["labels"]
22 | self.imgs = None
23 | self.with_images = with_images
24 | if with_images:
25 | self.imgs = loaded["images"]
26 | else:
27 | self.imgs = None
28 |
29 | def dim(self):
30 | return self.feats.shape[1]
31 |
32 | def num_classes(self):
33 | return self.labels.max() + 1
34 |
35 | def __getitem__(self, index):
36 | if self.imgs is not None:
37 | return self.feats[index], self.labels[index], self.imgs[index]
38 | return self.feats[index], self.labels[index]
39 |
40 | def __len__(self):
41 | return len(self.labels)
42 |
43 |
44 | class EmbeddingAndImage(Dataset):
45 | def __init__(self, file, dataset):
46 | super(Dataset, self).__init__()
47 | self.file = file
48 | loaded = np.load(file)
49 | self.feats = loaded["feats"]
50 | self.labels = loaded["labels"]
51 | self.imgs = dataset
52 |
53 | def dim(self):
54 | return self.feats.shape[1]
55 |
56 | def num_classes(self):
57 | return self.labels.max() + 1
58 |
59 | def __getitem__(self, index):
60 | return self.feats[index], self.labels[index], self.imgs[index]
61 |
62 | def __len__(self):
63 | return len(self.labels)
64 |
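
A minimal sketch of the .npz layout EmbeddingFile reads — arrays named "feats" and "labels", plus "images" when with_images=True; the filename and sizes below are made up:

import numpy as np
from featup.datasets.EmbeddingFile import EmbeddingFile

feats = np.random.randn(100, 512).astype(np.float32)
labels = np.random.randint(0, 10, size=100)
np.savez('toy_embeddings.npz', feats=feats, labels=labels)

ds = EmbeddingFile('toy_embeddings.npz')
print(ds.dim(), ds.num_classes(), len(ds))  # 512, labels.max() + 1, 100
feat, label = ds[0]
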
--------------------------------------------------------------------------------
/featup/datasets/HTML.py:
--------------------------------------------------------------------------------
1 | from os.path import join
2 |
3 | import numpy as np
4 | import torch
5 | import torch.multiprocessing
6 | from PIL import Image
7 | from torch.utils.data import Dataset
8 |
9 | class HTML(Dataset):
10 | def __init__(self,
11 | root,
12 | split,
13 | transform,
14 | target_transform,
15 | subset=None):
16 | super(HTML, self).__init__()
17 | self.split = split
18 | self.root = join(root, "HTML")
19 | self.transform = transform
20 | self.label_transform = target_transform
21 | self.subset = subset
22 |
23 | if self.subset is None:
24 | self.image_list = "HTML20000.txt"
25 | elif self.subset == 'HTML_validation50':
26 | self.image_list = "HTML_validation50.txt"
27 |
28 | assert self.split in ["train", "val", "train+val"]
29 | split_dirs = {
30 | "train": ["train"],
31 | "val": ["val"],
32 | "train+val": ["train", "val"]
33 | }
34 |
35 | self.image_files = []
36 | for split_dir in split_dirs[self.split]:
37 | with open(join(self.root, "curated", self.image_list), "r") as f:
38 | img_names = [fn.rstrip() for fn in f.readlines()]
39 | for img_name in img_names:
40 | self.image_files.append(join(self.root, "images", img_name))
41 |
42 | def __len__(self):
43 | return len(self.image_files)
44 |
45 | def __getitem__(self, index):
46 | image_path = self.image_files[index]
47 | batch = {}
48 | img = self.transform(Image.open(image_path).convert("RGB"))
49 | batch["img"] = img
50 | batch["img_path"] = image_path
51 | return batch
52 |
--------------------------------------------------------------------------------
/featup/datasets/JitteredImage.py:
--------------------------------------------------------------------------------
1 | import random
2 |
3 | import torch
4 | import torch.nn.functional as F
5 | from torch.utils.data import Dataset
6 |
7 |
8 | def apply_jitter(img, max_pad, transform_params):
9 | h, w = img.shape[2:]
10 |
11 | padded = F.pad(img, [max_pad] * 4, mode="reflect")
12 |
13 | zoom = transform_params["zoom"].item()
14 | x = transform_params["x"].item()
15 | y = transform_params["y"].item()
16 | flip = transform_params["flip"].item()
17 |
18 | if zoom > 1.0:
19 | zoomed = F.interpolate(padded, scale_factor=zoom, mode="bilinear")
20 | else:
21 | zoomed = padded
22 |
23 | cropped = zoomed[:, :, x:h + x, y:w + y]
24 |
25 | if flip:
26 | return torch.flip(cropped, [3])
27 | else:
28 | return cropped
29 |
30 |
31 | def sample_transform(use_flips, max_pad, max_zoom, h, w):
32 | if use_flips:
33 | flip = random.random() > .5
34 | else:
35 | flip = False
36 |
37 | apply_zoom = random.random() > .5
38 | if apply_zoom:
39 | zoom = random.random() * (max_zoom - 1) + 1
40 | else:
41 | zoom = 1.0
42 |
43 | valid_area_h = (int((h + max_pad * 2) * zoom) - h) + 1
44 | valid_area_w = (int((w + max_pad * 2) * zoom) - w) + 1
45 |
46 | return {
47 | "x": torch.tensor(torch.randint(0, valid_area_h, ()).item()),
48 | "y": torch.tensor(torch.randint(0, valid_area_w, ()).item()),
49 | "zoom": torch.tensor(zoom),
50 | "flip": torch.tensor(flip)
51 | }
52 |
53 |
54 | class JitteredImage(Dataset):
55 |
56 | def __init__(self, img, length, use_flips, max_zoom, max_pad):
57 | self.img = img
58 | self.length = length
59 | self.use_flips = use_flips
60 | self.max_zoom = max_zoom
61 | self.max_pad = max_pad
62 |
63 | def __len__(self):
64 | return self.length
65 |
66 | def __getitem__(self, item):
67 | h, w = self.img.shape[2:]
68 | transform_params = sample_transform(self.use_flips, self.max_pad, self.max_zoom, h, w)
69 | return apply_jitter(self.img, self.max_pad, transform_params).squeeze(0), transform_params
70 |
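
A hedged sketch of wrapping a single (1, C, H, W) image tensor in JitteredImage and drawing a batch of jittered views together with the transform parameters that produced them:

import torch
from torch.utils.data import DataLoader
from featup.datasets.JitteredImage import JitteredImage

img = torch.rand(1, 3, 224, 224)  # apply_jitter expects a batch dimension
ds = JitteredImage(img, length=8, use_flips=True, max_zoom=2, max_pad=30)
loader = DataLoader(ds, batch_size=4)
views, params = next(iter(loader))
print(views.shape, params['zoom'].shape)  # torch.Size([4, 3, 224, 224]) torch.Size([4])
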
--------------------------------------------------------------------------------
/featup/datasets/SCENE.py:
--------------------------------------------------------------------------------
1 | from os.path import join
2 |
3 | import numpy as np
4 | import torch
5 | import torch.multiprocessing
6 | from PIL import Image
7 | from torch.utils.data import Dataset
8 |
9 | class Scene(Dataset):
10 | def __init__(self,
11 | root,
12 | split,
13 | transform,
14 | target_transform,
15 | subset=None):
16 | super(Scene, self).__init__()
17 | self.split = split
18 | self.root = join(root, "Scene")
19 | self.transform = transform
20 | self.label_transform = target_transform
21 | self.subset = subset
22 |
23 | if self.subset is None:
24 | self.image_list = "Scenepuretext.txt"
25 | elif self.subset == 'Scene_validation50':
26 | self.image_list = "Scene_validation50.txt"
27 |
28 | assert self.split in ["train", "val", "train+val"]
29 | split_dirs = {
30 | "train": ["train"],
31 | "val": ["val"],
32 | "train+val": ["train", "val"]
33 | }
34 |
35 | self.image_files = []
36 | for split_dir in split_dirs[self.split]:
37 | with open(join(self.root, "curated", self.image_list), "r") as f:
38 | img_names = [fn.rstrip() for fn in f.readlines()]
39 | for img_name in img_names:
40 | self.image_files.append(join(self.root, "puretext", img_name))
41 |
42 | def __len__(self):
43 | return len(self.image_files)
44 |
45 | def __getitem__(self, index):
46 | image_path = self.image_files[index]
47 | batch = {}
48 | img = self.transform(Image.open(image_path).convert("RGB"))
49 | batch["img"] = img
50 | batch["img_path"] = image_path
51 | return batch
52 |
--------------------------------------------------------------------------------
/featup/datasets/SampleImage.py:
--------------------------------------------------------------------------------
1 | from PIL import Image
2 | from torch.utils.data import Dataset
3 |
4 |
5 | class SampleImage(Dataset):
6 | def __init__(self, paths, transform, **kwargs):
7 | self.paths = paths
8 | self.transform = transform
9 |
10 | def __getitem__(self, idx):
11 | image_path = self.paths[idx]
12 | image = Image.open(image_path).convert('RGB')
13 | if self.transform is not None:
14 | image = self.transform(image)
15 | batch = {
16 | "img": image,
17 | "img_path": image_path
18 | }
19 | return batch
20 |
21 | def __len__(self):
22 | return len(self.paths)
23 |
--------------------------------------------------------------------------------
/featup/datasets/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/LLaVA-UHD/08c85e208cd12c825303619723e4fe20e64f7f58/featup/datasets/__init__.py
--------------------------------------------------------------------------------
/featup/featurizers/CLIP.py:
--------------------------------------------------------------------------------
1 | import clip
2 | import torch
3 | from torch import nn
4 | import os
5 |
6 | class CLIPFeaturizer(nn.Module):
7 |
8 | def __init__(self):
9 | super().__init__()
10 | self.model, self.preprocess = clip.load(
11 | "ViT-B/16",
12 | download_root=os.getenv('TORCH_HOME', os.path.join(os.path.expanduser('~'), '.cache', 'torch'))
13 | )
14 | self.model.eval()
15 |
16 | def get_cls_token(self, img):
17 | return self.model.encode_image(img).to(torch.float32)
18 |
19 | def forward(self, img):
20 | features = self.model.get_visual_features(img, include_cls=False).to(torch.float32)
21 | return features
22 |
23 |
24 | if __name__ == "__main__":
25 | import torchvision.transforms as T
26 | from PIL import Image
27 | #from shared import norm, crop_to_divisor
28 |
29 | device = "cuda" if torch.cuda.is_available() else "cpu"
30 |
31 | image = Image.open("/home/god/playground/FeatUp/sample-images/bird_full.jpg")
32 | load_size = 224 # * 3
33 | transform = T.Compose([
34 | T.Resize(load_size, Image.BILINEAR),
35 | T.CenterCrop(load_size),
36 | T.ToTensor()
37 | #lambda x: crop_to_divisor(x, 16),
38 | #norm
39 | ])
40 |
41 | model = CLIPFeaturizer().cuda()
42 |
43 | token = model.get_cls_token(transform(image).cuda().unsqueeze(0)) #torch.Size([1, 768])
44 | results = model(transform(image).cuda().unsqueeze(0)) #torch.Size([1, 768, 24, 24])
45 |
46 | print(clip.available_models())
47 |
--------------------------------------------------------------------------------
/featup/featurizers/CLIPLarge.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 | from PIL import Image
4 | import torchvision.transforms as T
5 | from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig
6 | from featup.util import norm
7 | from torchvision.transforms import InterpolationMode
8 |
9 |
10 | #CLIP-ViT-L/14 336 pixel
11 | class CLIPLargeFeaturizer(nn.Module):
12 |
13 | def __init__(self):
14 | super().__init__()
15 | vision_tower_name = 'openai/clip-vit-large-patch14-336'
16 | self.preprocess = CLIPImageProcessor.from_pretrained(vision_tower_name)
17 | self.model = CLIPVisionModel.from_pretrained(vision_tower_name)
18 | self.model.requires_grad_(False)
19 |
20 | def get_cls_token(self, img):
21 |         return self.model(img).last_hidden_state.to(torch.float32)
22 |
23 | def forward(self, img):
24 | outputs = self.model(img)
25 | last_hidden_states = outputs.last_hidden_state
26 | without_class = last_hidden_states[:, 1:]
27 | #torch.Size([1, 576, 1024])
28 | features = without_class.permute(0,2,1)
29 | #[1, 1024, 24, 24]
30 | features = features.reshape(len(features), features.shape[1], 24, 24)
31 | return features.to(torch.float32)
32 |
33 | if __name__ == '__main__':
34 | vision_tower_name = 'openai/clip-vit-large-patch14-336'
35 | image = Image.open("/home/god/playground/FeatUp/sample-images/bird_full.jpg")
36 |
37 | transformTest = T.Resize(336, InterpolationMode.BILINEAR)
38 |
39 | test_image = transformTest(image.convert("RGB"))
40 |
41 |
42 | transform = T.Compose([
43 | T.Resize(336, InterpolationMode.BILINEAR),
44 | T.CenterCrop(336),
45 | T.ToTensor(),
46 | norm])
47 |
48 | #torch.Size([3, 336, 336])
49 | transformed_image = transform(image.convert("RGB")).unsqueeze(0).to("cuda")
50 |
51 |
52 | model = CLIPLargeFeaturizer().cuda()
53 |
54 | features = model(transformed_image)
55 |
56 | print(features.shape)
57 | #torch.Size([1, 1024, 24, 24])
58 | #torch.Size([1, 768, 24, 24])
59 |
--------------------------------------------------------------------------------
/featup/featurizers/DeepLabV3.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 |
3 |
4 | class DeepLabV3Featurizer(nn.Module):
5 | def __init__(self, model):
6 | super().__init__()
7 | self.model = model
8 |
9 | def get_cls_token(self, img):
10 | return self.model.forward(img)
11 |
12 | def forward(self, img, layer_num=-1):
13 | return self.model.backbone(img)['out']
14 |
--------------------------------------------------------------------------------
/featup/featurizers/MaskCLIP.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from torch import nn
3 | import os
4 |
5 | from featup.featurizers.maskclip import clip
6 |
7 |
8 | class MaskCLIPFeaturizer(nn.Module):
9 |
10 | def __init__(self):
11 | super().__init__()
12 | self.model, self.preprocess = clip.load(
13 | "ViT-B/16",
14 | download_root=os.getenv('TORCH_HOME', os.path.join(os.path.expanduser('~'), '.cache', 'torch'))
15 | )
16 | self.model.eval()
17 | self.patch_size = self.model.visual.patch_size
18 |
19 | def forward(self, img):
20 | b, _, input_size_h, input_size_w = img.shape
21 | patch_h = input_size_h // self.patch_size
22 | patch_w = input_size_w // self.patch_size
23 | features = self.model.get_patch_encodings(img).to(torch.float32)
24 | return features.reshape(b, patch_h, patch_w, -1).permute(0, 3, 1, 2)
25 |
26 |
27 | if __name__ == "__main__":
28 | import torchvision.transforms as T
29 | from PIL import Image
30 | from featup.util import norm, unnorm, crop_to_divisor
31 |
32 | device = "cuda" if torch.cuda.is_available() else "cpu"
33 |
34 | image = Image.open("../samples/lex1.jpg")
35 | load_size = 224 # * 3
36 | transform = T.Compose([
37 | T.Resize(load_size, Image.BILINEAR),
38 | # T.CenterCrop(load_size),
39 | T.ToTensor(),
40 | lambda x: crop_to_divisor(x, 16),
41 | norm])
42 |
43 | model = MaskCLIPFeaturizer().cuda()
44 |
45 | results = model(transform(image).cuda().unsqueeze(0))
46 |
47 | print(clip.available_models())
48 |
--------------------------------------------------------------------------------
/featup/featurizers/ResNet.py:
--------------------------------------------------------------------------------
1 | from torch import nn
2 |
3 |
4 | class ResNetFeaturizer(nn.Module):
5 | def __init__(self, model):
6 | super().__init__()
7 | self.model = model
8 |
9 | def get_cls_token(self, img):
10 | return self.model.forward(img)
11 |
12 | def get_layer(self, img, layer_num):
13 | return self.model.get_layer(img, layer_num)
14 |
15 | def forward(self, img, layer_num=-1):
16 | return self.model.get_layer(img, layer_num)
17 |
--------------------------------------------------------------------------------
/featup/featurizers/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/LLaVA-UHD/08c85e208cd12c825303619723e4fe20e64f7f58/featup/featurizers/__init__.py
--------------------------------------------------------------------------------
/featup/featurizers/dinov2/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/LLaVA-UHD/08c85e208cd12c825303619723e4fe20e64f7f58/featup/featurizers/dinov2/__init__.py
--------------------------------------------------------------------------------
/featup/featurizers/dinov2/layers/__init__.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | #
3 | # This source code is licensed under the Apache License, Version 2.0
4 | # found in the LICENSE file in the root directory of this source tree.
5 |
6 | from .dino_head import DINOHead
7 | from .mlp import Mlp
8 | from .patch_embed import PatchEmbed
9 | from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused
10 | from .block import NestedTensorBlock
11 | from .attention import MemEffAttention
12 |
--------------------------------------------------------------------------------
/featup/featurizers/dinov2/layers/dino_head.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | #
3 | # This source code is licensed under the Apache License, Version 2.0
4 | # found in the LICENSE file in the root directory of this source tree.
5 |
6 | import torch
7 | import torch.nn as nn
8 | from torch.nn.init import trunc_normal_
9 | from torch.nn.utils import weight_norm
10 |
11 |
12 | class DINOHead(nn.Module):
13 | def __init__(
14 | self,
15 | in_dim,
16 | out_dim,
17 | use_bn=False,
18 | nlayers=3,
19 | hidden_dim=2048,
20 | bottleneck_dim=256,
21 | mlp_bias=True,
22 | ):
23 | super().__init__()
24 | nlayers = max(nlayers, 1)
25 | self.mlp = _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=hidden_dim, use_bn=use_bn, bias=mlp_bias)
26 | self.apply(self._init_weights)
27 | self.last_layer = weight_norm(nn.Linear(bottleneck_dim, out_dim, bias=False))
28 | self.last_layer.weight_g.data.fill_(1)
29 |
30 | def _init_weights(self, m):
31 | if isinstance(m, nn.Linear):
32 | trunc_normal_(m.weight, std=0.02)
33 | if isinstance(m, nn.Linear) and m.bias is not None:
34 | nn.init.constant_(m.bias, 0)
35 |
36 | def forward(self, x):
37 | x = self.mlp(x)
38 | eps = 1e-6 if x.dtype == torch.float16 else 1e-12
39 | x = nn.functional.normalize(x, dim=-1, p=2, eps=eps)
40 | x = self.last_layer(x)
41 | return x
42 |
43 |
44 | def _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=None, use_bn=False, bias=True):
45 | if nlayers == 1:
46 | return nn.Linear(in_dim, bottleneck_dim, bias=bias)
47 | else:
48 | layers = [nn.Linear(in_dim, hidden_dim, bias=bias)]
49 | if use_bn:
50 | layers.append(nn.BatchNorm1d(hidden_dim))
51 | layers.append(nn.GELU())
52 | for _ in range(nlayers - 2):
53 | layers.append(nn.Linear(hidden_dim, hidden_dim, bias=bias))
54 | if use_bn:
55 | layers.append(nn.BatchNorm1d(hidden_dim))
56 | layers.append(nn.GELU())
57 | layers.append(nn.Linear(hidden_dim, bottleneck_dim, bias=bias))
58 | return nn.Sequential(*layers)
59 |
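
A small shape-check sketch for DINOHead, importing the module directly; the dimensions are arbitrary examples:

import torch
from featup.featurizers.dinov2.layers.dino_head import DINOHead

head = DINOHead(in_dim=768, out_dim=4096)
x = torch.randn(4, 768)
print(head(x).shape)  # torch.Size([4, 4096])
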
--------------------------------------------------------------------------------
/featup/featurizers/dinov2/layers/drop_path.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | #
3 | # This source code is licensed under the Apache License, Version 2.0
4 | # found in the LICENSE file in the root directory of this source tree.
5 |
6 | # References:
7 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
8 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py
9 |
10 |
11 | from torch import nn
12 |
13 |
14 | def drop_path(x, drop_prob: float = 0.0, training: bool = False):
15 | if drop_prob == 0.0 or not training:
16 | return x
17 | keep_prob = 1 - drop_prob
18 | shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
19 | random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
20 | if keep_prob > 0.0:
21 | random_tensor.div_(keep_prob)
22 | output = x * random_tensor
23 | return output
24 |
25 |
26 | class DropPath(nn.Module):
27 | """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
28 |
29 | def __init__(self, drop_prob=None):
30 | super(DropPath, self).__init__()
31 | self.drop_prob = drop_prob
32 |
33 | def forward(self, x):
34 | return drop_path(x, self.drop_prob, self.training)
35 |
--------------------------------------------------------------------------------
/featup/featurizers/dinov2/layers/layer_scale.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | #
3 | # This source code is licensed under the Apache License, Version 2.0
4 | # found in the LICENSE file in the root directory of this source tree.
5 |
6 | # Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110
7 |
8 | from typing import Union
9 |
10 | import torch
11 | from torch import Tensor
12 | from torch import nn
13 |
14 |
15 | class LayerScale(nn.Module):
16 | def __init__(
17 | self,
18 | dim: int,
19 | init_values: Union[float, Tensor] = 1e-5,
20 | inplace: bool = False,
21 | ) -> None:
22 | super().__init__()
23 | self.inplace = inplace
24 | self.gamma = nn.Parameter(init_values * torch.ones(dim))
25 |
26 | def forward(self, x: Tensor) -> Tensor:
27 | return x.mul_(self.gamma) if self.inplace else x * self.gamma
28 |
--------------------------------------------------------------------------------
/featup/featurizers/dinov2/layers/mlp.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | #
3 | # This source code is licensed under the Apache License, Version 2.0
4 | # found in the LICENSE file in the root directory of this source tree.
5 |
6 | # References:
7 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
8 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py
9 |
10 |
11 | from typing import Callable, Optional
12 |
13 | from torch import Tensor, nn
14 |
15 |
16 | class Mlp(nn.Module):
17 | def __init__(
18 | self,
19 | in_features: int,
20 | hidden_features: Optional[int] = None,
21 | out_features: Optional[int] = None,
22 | act_layer: Callable[..., nn.Module] = nn.GELU,
23 | drop: float = 0.0,
24 | bias: bool = True,
25 | ) -> None:
26 | super().__init__()
27 | out_features = out_features or in_features
28 | hidden_features = hidden_features or in_features
29 | self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
30 | self.act = act_layer()
31 | self.fc2 = nn.Linear(hidden_features, out_features, bias=bias)
32 | self.drop = nn.Dropout(drop)
33 |
34 | def forward(self, x: Tensor) -> Tensor:
35 | x = self.fc1(x)
36 | x = self.act(x)
37 | x = self.drop(x)
38 | x = self.fc2(x)
39 | x = self.drop(x)
40 | return x
41 |
--------------------------------------------------------------------------------
/featup/featurizers/maskclip/README.md:
--------------------------------------------------------------------------------
1 | # CLIP
2 | Modified version of [CLIP](https://github.com/openai/CLIP) with support for dense patch-level feature extraction
3 | (based on [MaskCLIP](https://arxiv.org/abs/2112.01071) parametrization) and interpolation of the positional encoding.
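
A hedged example of pulling dense patch features through this modified CLIP via the MaskCLIPFeaturizer wrapper defined in featup/featurizers/MaskCLIP.py; the image path is a placeholder and the ViT-B/16 weights are downloaded on first use:

import torch
import torchvision.transforms as T
from PIL import Image
from featup.featurizers.MaskCLIP import MaskCLIPFeaturizer
from featup.util import norm

model = MaskCLIPFeaturizer().cuda().eval()
transform = T.Compose([T.Resize(224), T.CenterCrop(224), T.ToTensor(), norm])
img = transform(Image.open('sample.jpg').convert('RGB')).unsqueeze(0).cuda()
with torch.no_grad():
    feats = model(img)  # (1, C, 14, 14) for ViT-B/16 at 224x224
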
4 |
--------------------------------------------------------------------------------
/featup/featurizers/maskclip/__init__.py:
--------------------------------------------------------------------------------
1 | from .clip import *
2 |
3 | """
4 | Modified from https://github.com/openai/CLIP
5 | """
6 |
--------------------------------------------------------------------------------
/featup/featurizers/maskclip/bpe_simple_vocab_16e6.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/LLaVA-UHD/08c85e208cd12c825303619723e4fe20e64f7f58/featup/featurizers/maskclip/bpe_simple_vocab_16e6.txt.gz
--------------------------------------------------------------------------------
/featup/featurizers/modules/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/LLaVA-UHD/08c85e208cd12c825303619723e4fe20e64f7f58/featup/featurizers/modules/__init__.py
--------------------------------------------------------------------------------
/featup/plotting.py:
--------------------------------------------------------------------------------
1 | import matplotlib.pyplot as plt
2 | from featup.util import pca, remove_axes
3 | from featup.featurizers.maskclip.clip import tokenize
4 | from pytorch_lightning import seed_everything
5 | import torch
6 | import torch.nn.functional as F
7 |
8 |
9 | @torch.no_grad()
10 | def plot_feats(image, lr, hr):
11 | assert len(image.shape) == len(lr.shape) == len(hr.shape) == 3
12 | seed_everything(0)
13 | [lr_feats_pca, hr_feats_pca], _ = pca([lr.unsqueeze(0), hr.unsqueeze(0)])
14 | fig, ax = plt.subplots(1, 3, figsize=(15, 5))
15 | ax[0].imshow(image.permute(1, 2, 0).detach().cpu())
16 | ax[0].set_title("Image")
17 | ax[1].imshow(lr_feats_pca[0].permute(1, 2, 0).detach().cpu())
18 | ax[1].set_title("Original Features")
19 | ax[2].imshow(hr_feats_pca[0].permute(1, 2, 0).detach().cpu())
20 | ax[2].set_title("Upsampled Features")
21 | remove_axes(ax)
22 | plt.show()
23 |
24 |
25 | @torch.no_grad()
26 | def plot_lang_heatmaps(model, image, lr_feats, hr_feats, text_query):
27 | assert len(image.shape) == len(lr_feats.shape) == len(hr_feats.shape) == 3
28 | fig, ax = plt.subplots(1, 3, figsize=(15, 5))
29 | cmap = plt.get_cmap("turbo")
30 |
31 | # encode query
32 | text = tokenize(text_query).to(lr_feats.device)
33 | text_feats = model.model.encode_text(text).squeeze().to(torch.float32)
34 | assert len(text_feats.shape) == 1
35 |
36 | lr_sims = torch.einsum(
37 | "chw,c->hw", F.normalize(lr_feats.to(torch.float32), dim=0), F.normalize(text_feats, dim=0))
38 | hr_sims = torch.einsum(
39 | "chw,c->hw", F.normalize(hr_feats.to(torch.float32), dim=0), F.normalize(text_feats, dim=0))
40 |
41 | lr_sims_norm = (lr_sims - lr_sims.min()) / (lr_sims.max() - lr_sims.min())
42 | hr_sims_norm = (hr_sims - hr_sims.min()) / (hr_sims.max() - hr_sims.min())
43 | lr_heatmap = cmap(lr_sims_norm.cpu().numpy())
44 | hr_heatmap = cmap(hr_sims_norm.cpu().numpy())
45 |
46 | ax[0].imshow(image.permute(1, 2, 0).detach().cpu())
47 | ax[0].set_title("Image")
48 | ax[1].imshow(lr_heatmap)
49 | ax[1].set_title(f"Original Similarity to \"{text_query}\"")
50 | ax[2].imshow(hr_heatmap)
51 | ax[2].set_title(f"Upsampled Similarity to \"{text_query}\"")
52 | remove_axes(ax)
53 |
54 | return plt.show()
55 |
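
A minimal sketch of what plot_feats expects — a 3-D image tensor plus low- and high-resolution feature maps for the same image; the random tensors below only illustrate the shapes:

import torch
from featup.plotting import plot_feats

image = torch.rand(3, 224, 224)       # RGB in [0, 1]
lr_feats = torch.rand(384, 14, 14)    # e.g. backbone patch features
hr_feats = torch.rand(384, 224, 224)  # e.g. upsampled features
plot_feats(image, lr_feats, hr_feats)
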
--------------------------------------------------------------------------------
/install.sh:
--------------------------------------------------------------------------------
1 | #install
2 | pip install imgaug
3 | pip install openpyxl
4 |
5 | pip install --upgrade pip # enable PEP 660 support
6 | pip install torch==2.1.2
7 | pip install -e .
8 |
9 | pip install -e ".[train]"
10 | pip install flash-attn --no-build-isolation
11 |
--------------------------------------------------------------------------------
/llava/__init__.py:
--------------------------------------------------------------------------------
1 | from .model import LlavaLlamaForCausalLM
2 |
--------------------------------------------------------------------------------
/llava/constants.py:
--------------------------------------------------------------------------------
1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30
2 | WORKER_HEART_BEAT_INTERVAL = 15
3 |
4 | LOGDIR = "."
5 |
6 | # Model Constants
7 | IGNORE_INDEX = -100
8 | IMAGE_TOKEN_INDEX = -200
9 | DEFAULT_IMAGE_TOKEN = "<image>"
10 | DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
11 | DEFAULT_IM_START_TOKEN = "<im_start>"
12 | DEFAULT_IM_END_TOKEN = "<im_end>"
13 |
--------------------------------------------------------------------------------
/llava/model/__init__.py:
--------------------------------------------------------------------------------
1 | import os
2 |
3 | AVAILABLE_MODELS = {
4 | "llava_llama": "LlavaLlamaForCausalLM, LlavaConfig",
5 | "llava_qwen": "LlavaQwenForCausalLM, LlavaQwenConfig",
6 | "llava_mistral": "LlavaMistralForCausalLM, LlavaMistralConfig",
7 | "llava_mixtral": "LlavaMixtralForCausalLM, LlavaMixtralConfig",
8 | # "llava_qwen_moe": "LlavaQwenMoeForCausalLM, LlavaQwenMoeConfig",
9 | # Add other models as needed
10 | }
11 |
12 | for model_name, model_classes in AVAILABLE_MODELS.items():
13 | try:
14 | exec(f"from .language_model.{model_name} import {model_classes}")
15 | except Exception as e:
16 | print(f"Failed to import {model_name} from llava.language_model.{model_name}. Error: {e}")
17 |
--------------------------------------------------------------------------------
/llava/model/apply_delta.py:
--------------------------------------------------------------------------------
1 | """
2 | Usage:
3 | python3 -m fastchat.model.apply_delta --base ~/model_weights/llama-7b --target ~/model_weights/vicuna-7b --delta lmsys/vicuna-7b-delta
4 | """
5 |
6 | import argparse
7 |
8 | import torch
9 | from tqdm import tqdm
10 | from transformers import AutoTokenizer, AutoModelForCausalLM
11 | from llava import LlavaLlamaForCausalLM
12 |
13 |
14 | def apply_delta(base_model_path, target_model_path, delta_path):
15 | print("Loading base model")
16 | base = AutoModelForCausalLM.from_pretrained(base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)
17 |
18 | print("Loading delta")
19 | delta = LlavaLlamaForCausalLM.from_pretrained(delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)
20 | delta_tokenizer = AutoTokenizer.from_pretrained(delta_path)
21 |
22 | print("Applying delta")
23 | for name, param in tqdm(delta.state_dict().items(), desc="Applying delta"):
24 | if name not in base.state_dict():
25 | assert name in ["model.mm_projector.weight", "model.mm_projector.bias"], f"{name} not in base model"
26 | continue
27 | if param.data.shape == base.state_dict()[name].shape:
28 | param.data += base.state_dict()[name]
29 | else:
30 | assert name in ["model.embed_tokens.weight", "lm_head.weight"], f"{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}"
31 | bparam = base.state_dict()[name]
32 | param.data[: bparam.shape[0], : bparam.shape[1]] += bparam
33 |
34 | print("Saving target model")
35 | delta.save_pretrained(target_model_path)
36 | delta_tokenizer.save_pretrained(target_model_path)
37 |
38 |
39 | if __name__ == "__main__":
40 | parser = argparse.ArgumentParser()
41 | parser.add_argument("--base-model-path", type=str, required=True)
42 | parser.add_argument("--target-model-path", type=str, required=True)
43 | parser.add_argument("--delta-path", type=str, required=True)
44 |
45 | args = parser.parse_args()
46 |
47 | apply_delta(args.base_model_path, args.target_model_path, args.delta_path)
48 |
--------------------------------------------------------------------------------
/llava/model/consolidate.py:
--------------------------------------------------------------------------------
1 | """
2 | Usage:
3 | python3 -m llava.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate
4 | """
5 |
6 | import argparse
7 |
8 | import torch
9 | from transformers import AutoTokenizer, AutoModelForCausalLM
10 | from llava.model import *
11 | from llava.model.utils import auto_upgrade
12 |
13 |
14 | def consolidate_ckpt(src_path, dst_path):
15 | print("Loading model")
16 | auto_upgrade(src_path)
17 | src_model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)
18 | src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False)
19 | src_model.save_pretrained(dst_path)
20 | src_tokenizer.save_pretrained(dst_path)
21 |
22 |
23 | if __name__ == "__main__":
24 | parser = argparse.ArgumentParser()
25 | parser.add_argument("--src", type=str, required=True)
26 | parser.add_argument("--dst", type=str, required=True)
27 |
28 | args = parser.parse_args()
29 |
30 | consolidate_ckpt(args.src, args.dst)
31 |
--------------------------------------------------------------------------------
/llava/model/multimodal_encoder/builder.py:
--------------------------------------------------------------------------------
1 | import os
2 | from .imagebind import ImageBindWrapper
3 | from .open_clip_encoder import OpenCLIPVisionTower
4 | from .hf_vision import HFVisionTower
5 | from .siglip_encoder import SigLipVisionTower
6 | from .clip_encoder import CLIPVisionTower, CLIPVisionTowerS2
7 |
8 | # from .eva_clip.eva_clip_encoder import EvaClipVisionTower
9 | # from .dev_eva_clip.eva_vit import EvaViTWrapper
10 |
11 |
12 | def build_vision_tower(vision_tower_cfg, **kwargs):
13 | vision_tower = getattr(vision_tower_cfg, "mm_vision_tower", getattr(vision_tower_cfg, "vision_tower", None))
14 | is_absolute_path_exists = os.path.exists(vision_tower)
15 | use_s2 = getattr(vision_tower_cfg, "s2", False)
16 | if "siglip" in vision_tower:
17 | return SigLipVisionTower(vision_tower, vision_tower_cfg=vision_tower_cfg, **kwargs)
18 | elif is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion") or "ShareGPT4V" in vision_tower:
19 | if use_s2:
20 | return CLIPVisionTowerS2(vision_tower, args=vision_tower_cfg, **kwargs)
21 | else:
22 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
23 | elif vision_tower.startswith("hf:"):
24 | return HFVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
25 | elif vision_tower in ["imagebind_huge"]:
26 | return ImageBindWrapper(vision_tower, args=vision_tower_cfg, **kwargs)
27 | elif vision_tower.startswith("open_clip_hub"):
28 | return OpenCLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
29 | # elif "internal-eva" in vision_tower.lower() or "eva02" in vision_tower.lower():
30 | # return EvaClipVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
31 | # elif vision_tower in ["EVA-CLIP-8B", "EVA-CLIP-8B-plus"]:
32 | # return EvaViTWrapper(vision_tower, args=vision_tower_cfg, **kwargs)
33 |
34 | raise ValueError(f"Unknown vision tower: {vision_tower}")
35 |
--------------------------------------------------------------------------------
/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/__init__.py:
--------------------------------------------------------------------------------
1 | from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD
2 | from .factory import create_model, create_model_and_transforms, create_model_from_pretrained, get_tokenizer
3 | from .factory import list_models, add_model_config, get_model_config, load_checkpoint
4 | from .loss import ClipLoss
5 | from .model import CLIP, CustomCLIP, CLIPTextCfg, CLIPVisionCfg, convert_weights_to_lp, convert_weights_to_fp16, trace_model, get_cast_dtype
6 | from .openai import load_openai_model, list_openai_models
7 | from .pretrained import list_pretrained, list_pretrained_models_by_tag, list_pretrained_tags_by_model, get_pretrained_url, download_pretrained_from_url, is_pretrained_cfg, get_pretrained_cfg, download_pretrained
8 | from .tokenizer import SimpleTokenizer, tokenize
9 | from .transform import image_transform
10 |
--------------------------------------------------------------------------------
/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/bpe_simple_vocab_16e6.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/LLaVA-UHD/08c85e208cd12c825303619723e4fe20e64f7f58/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/bpe_simple_vocab_16e6.txt.gz
--------------------------------------------------------------------------------
/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/constants.py:
--------------------------------------------------------------------------------
1 | OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
2 | OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)
3 |
--------------------------------------------------------------------------------
/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA-CLIP-18B.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1536,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 48,
6 | "width": 5120,
7 | "head_width": 128,
8 | "mlp_ratio": 5,
9 | "patch_size": 14,
10 | "eva_model_name": "eva-clip-18b-14-x",
11 | "drop_path_rate": 0,
12 | "qkv_bias": false,
13 | "xattn": true,
14 | "postnorm": true,
15 | "fusedLN": false,
16 | "use_rms_norm": true
17 | },
18 | "text_cfg": {
19 | "context_length": 77,
20 | "vocab_size": 49408,
21 | "width": 1280,
22 | "heads": 20,
23 | "layers": 32,
24 | "xattn": false,
25 | "fusedLN": false
26 | }
27 | }
--------------------------------------------------------------------------------
/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA-CLIP-8B-plus.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1280,
3 | "vision_cfg": {
4 | "image_size": 448,
5 | "layers": 32,
6 | "width": 4096,
7 | "head_width": 128,
8 | "mlp_ratio": 5,
9 | "patch_size": 14,
10 | "eva_model_name": "eva-clip-8b-14-plus-x",
11 | "drop_path_rate": 0,
12 | "qkv_bias": false,
13 | "xattn": true,
14 | "postnorm": false,
15 | "fusedLN": false,
16 | "use_rms_norm": true
17 | },
18 | "text_cfg": {
19 | "context_length": 77,
20 | "vocab_size": 49408,
21 | "width": 1280,
22 | "heads": 20,
23 | "layers": 32,
24 | "xattn": false,
25 | "fusedLN": false
26 | }
27 | }
--------------------------------------------------------------------------------
/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA-CLIP-8B.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1280,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 32,
6 | "width": 4096,
7 | "head_width": 128,
8 | "mlp_ratio": 5,
9 | "patch_size": 14,
10 | "eva_model_name": "eva-clip-8b-14-x",
11 | "drop_path_rate": 0,
12 | "qkv_bias": false,
13 | "xattn": true,
14 | "postnorm": false,
15 | "fusedLN": false,
16 | "use_rms_norm": true
17 | },
18 | "text_cfg": {
19 | "context_length": 77,
20 | "vocab_size": 49408,
21 | "width": 1280,
22 | "heads": 20,
23 | "layers": 32,
24 | "xattn": false,
25 | "fusedLN": false
26 | }
27 | }
--------------------------------------------------------------------------------
/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA01-CLIP-B-16.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 768,
7 | "patch_size": 16,
8 | "eva_model_name": "eva-clip-b-16",
9 | "ls_init_value": 0.1,
10 | "drop_path_rate": 0.0
11 | },
12 | "text_cfg": {
13 | "context_length": 77,
14 | "vocab_size": 49408,
15 | "width": 512,
16 | "heads": 8,
17 | "layers": 12
18 | }
19 | }
--------------------------------------------------------------------------------
/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA01-CLIP-g-14-plus.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 40,
6 | "width": 1408,
7 | "head_width": 88,
8 | "mlp_ratio": 4.3637,
9 | "patch_size": 14,
10 | "eva_model_name": "eva-clip-g-14-x",
11 | "drop_path_rate": 0,
12 | "xattn": true,
13 | "fusedLN": true
14 | },
15 | "text_cfg": {
16 | "context_length": 77,
17 | "vocab_size": 49408,
18 | "width": 1024,
19 | "heads": 16,
20 | "layers": 24,
21 | "xattn": false,
22 | "fusedLN": true
23 | }
24 | }
--------------------------------------------------------------------------------
/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA01-CLIP-g-14.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 40,
6 | "width": 1408,
7 | "head_width": 88,
8 | "mlp_ratio": 4.3637,
9 | "patch_size": 14,
10 | "eva_model_name": "eva-clip-g-14-x",
11 | "drop_path_rate": 0.4,
12 | "xattn": true,
13 | "fusedLN": true
14 | },
15 | "text_cfg": {
16 | "context_length": 77,
17 | "vocab_size": 49408,
18 | "width": 768,
19 | "heads": 12,
20 | "layers": 12,
21 | "xattn": false,
22 | "fusedLN": true
23 | }
24 | }
--------------------------------------------------------------------------------
/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA02-CLIP-B-16.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 768,
7 | "head_width": 64,
8 | "patch_size": 16,
9 | "mlp_ratio": 2.6667,
10 | "eva_model_name": "eva-clip-b-16-X",
11 | "drop_path_rate": 0.0,
12 | "xattn": true,
13 | "fusedLN": true,
14 | "rope": true,
15 | "pt_hw_seq_len": 16,
16 | "intp_freq": true,
17 | "naiveswiglu": true,
18 | "subln": true
19 | },
20 | "text_cfg": {
21 | "context_length": 77,
22 | "vocab_size": 49408,
23 | "width": 512,
24 | "heads": 8,
25 | "layers": 12,
26 | "xattn": true,
27 | "fusedLN": true
28 | }
29 | }
--------------------------------------------------------------------------------
/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA02-CLIP-L-14-336.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "image_size": 336,
5 | "layers": 24,
6 | "width": 1024,
7 | "drop_path_rate": 0,
8 | "head_width": 64,
9 | "mlp_ratio": 2.6667,
10 | "patch_size": 14,
11 | "eva_model_name": "eva-clip-l-14-336",
12 | "xattn": true,
13 | "fusedLN": true,
14 | "rope": true,
15 | "pt_hw_seq_len": 16,
16 | "intp_freq": true,
17 | "naiveswiglu": true,
18 | "subln": true
19 | },
20 | "text_cfg": {
21 | "context_length": 77,
22 | "vocab_size": 49408,
23 | "width": 768,
24 | "heads": 12,
25 | "layers": 12,
26 | "xattn": false,
27 | "fusedLN": true
28 | }
29 | }
--------------------------------------------------------------------------------
/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA02-CLIP-L-14.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 24,
6 | "width": 1024,
7 | "drop_path_rate": 0,
8 | "head_width": 64,
9 | "mlp_ratio": 2.6667,
10 | "patch_size": 14,
11 | "eva_model_name": "eva-clip-l-14",
12 | "xattn": true,
13 | "fusedLN": true,
14 | "rope": true,
15 | "pt_hw_seq_len": 16,
16 | "intp_freq": true,
17 | "naiveswiglu": true,
18 | "subln": true
19 | },
20 | "text_cfg": {
21 | "context_length": 77,
22 | "vocab_size": 49408,
23 | "width": 768,
24 | "heads": 12,
25 | "layers": 12,
26 | "xattn": false,
27 | "fusedLN": true
28 | }
29 | }
--------------------------------------------------------------------------------
/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA02-CLIP-bigE-14-plus.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 64,
6 | "width": 1792,
7 | "head_width": 112,
8 | "mlp_ratio": 8.571428571428571,
9 | "patch_size": 14,
10 | "eva_model_name": "eva-clip-4b-14-x",
11 | "drop_path_rate": 0,
12 | "xattn": true,
13 | "postnorm": true,
14 | "fusedLN": true
15 | },
16 | "text_cfg": {
17 | "context_length": 77,
18 | "vocab_size": 49408,
19 | "width": 1280,
20 | "heads": 20,
21 | "layers": 32,
22 | "xattn": false,
23 | "fusedLN": true
24 | }
25 | }
26 |
--------------------------------------------------------------------------------
/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA02-CLIP-bigE-14.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 64,
6 | "width": 1792,
7 | "head_width": 112,
8 | "mlp_ratio": 8.571428571428571,
9 | "patch_size": 14,
10 | "eva_model_name": "eva-clip-4b-14-x",
11 | "drop_path_rate": 0,
12 | "xattn": true,
13 | "postnorm": true,
14 | "fusedLN": true
15 | },
16 | "text_cfg": {
17 | "context_length": 77,
18 | "vocab_size": 49408,
19 | "width": 1024,
20 | "heads": 16,
21 | "layers": 24,
22 | "xattn": false,
23 | "fusedLN": true
24 | }
25 | }
--------------------------------------------------------------------------------
/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/Internal-EVA02-CLIP-10B-14-448.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 448,
5 | "layers": 77,
6 | "width": 2304,
7 | "head_width": 144,
8 | "mlp_ratio": 10.9722,
9 | "patch_size": 14,
10 | "eva_model_name": "eva-clip-10b-14-x",
11 | "drop_path_rate": 0,
12 | "xattn": true,
13 | "postnorm": false,
14 | "fusedLN": true
15 | },
16 | "text_cfg": {
17 | "context_length": 77,
18 | "vocab_size": 49408,
19 | "width": 1280,
20 | "heads": 20,
21 | "layers": 32,
22 | "xattn": false,
23 | "fusedLN": true
24 | }
25 | }
26 |
--------------------------------------------------------------------------------
/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/Internal-EVA02-CLIP-10B-14.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 77,
6 | "width": 2304,
7 | "head_width": 144,
8 | "mlp_ratio": 10.9722,
9 | "patch_size": 14,
10 | "eva_model_name": "eva-clip-10b-14-x",
11 | "drop_path_rate": 0,
12 | "xattn": true,
13 | "postnorm": false,
14 | "fusedLN": true
15 | },
16 | "text_cfg": {
17 | "context_length": 77,
18 | "vocab_size": 49408,
19 | "width": 1280,
20 | "heads": 20,
21 | "layers": 32,
22 | "xattn": false,
23 | "fusedLN": true
24 | }
25 | }
26 |
--------------------------------------------------------------------------------
/llava/model/multimodal_encoder/eva_clip/factory.py:
--------------------------------------------------------------------------------
1 | import json
2 | import logging
3 | import os
4 | import pathlib
5 | import re
6 | from copy import deepcopy
7 | from pathlib import Path
8 | from typing import Optional, Tuple, Union, Dict, Any
9 | import torch
10 |
11 | _MODEL_CONFIG_PATHS = [Path(__file__).parent / "model_configs/"]
12 | _MODEL_CONFIGS = {} # dictionary (model_name: config) of model architecture configs
13 |
14 |
15 | def _natural_key(string_):
16 | return [int(s) if s.isdigit() else s for s in re.split(r"(\d+)", string_.lower())]
17 |
18 |
19 | def _rescan_model_configs():
20 | global _MODEL_CONFIGS
21 |
22 | config_ext = (".json",)
23 | config_files = []
24 | for config_path in _MODEL_CONFIG_PATHS:
25 | if config_path.is_file() and config_path.suffix in config_ext:
26 | config_files.append(config_path)
27 | elif config_path.is_dir():
28 | for ext in config_ext:
29 | config_files.extend(config_path.glob(f"*{ext}"))
30 |
31 | for cf in config_files:
32 | with open(cf, "r", encoding="utf8") as f:
33 | model_cfg = json.load(f)
34 | if all(a in model_cfg for a in ("embed_dim", "vision_cfg", "text_cfg")):
35 | _MODEL_CONFIGS[cf.stem] = model_cfg
36 |
37 | _MODEL_CONFIGS = dict(sorted(_MODEL_CONFIGS.items(), key=lambda x: _natural_key(x[0])))
38 |
39 |
40 | _rescan_model_configs() # initial populate of model config registry
41 |
42 |
43 | def list_models():
44 | """enumerate available model architectures based on config files"""
45 | return list(_MODEL_CONFIGS.keys())
46 |
47 |
48 | def add_model_config(path):
49 | """add model config path or file and update registry"""
50 | if not isinstance(path, Path):
51 | path = Path(path)
52 | _MODEL_CONFIG_PATHS.append(path)
53 | _rescan_model_configs()
54 |
55 |
56 | def get_model_config(model_name):
57 | if model_name in _MODEL_CONFIGS:
58 | return deepcopy(_MODEL_CONFIGS[model_name])
59 | else:
60 | return None
61 |
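A short sketch of how this config registry is meant to be used (illustrative, not repository code; the import path is assumed from the file location). The registry is populated at import time from the *.json files under model_configs/, keyed by file stem, and get_model_config returns a deep copy of the parsed dict:

# Illustrative only.
from llava.model.multimodal_encoder.eva_clip import factory  # assumed import path

print(factory.list_models())                         # e.g. [..., "EVA02-CLIP-L-14-336", ...]
cfg = factory.get_model_config("EVA02-CLIP-L-14-336")
print(cfg["embed_dim"], cfg["vision_cfg"]["image_size"])   # 768 336, per the config listed below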
--------------------------------------------------------------------------------
/llava/model/multimodal_encoder/eva_clip/model_configs/EVA-CLIP-18B.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1536,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 48,
6 | "width": 5120,
7 | "head_width": 128,
8 | "mlp_ratio": 5,
9 | "patch_size": 14,
10 | "eva_model_name": "eva-clip-18b-14-x",
11 | "drop_path_rate": 0,
12 | "qkv_bias": false,
13 | "xattn": true,
14 | "postnorm": true,
15 | "fusedLN": false,
16 | "use_rms_norm": true
17 | },
18 | "text_cfg": {
19 | "context_length": 77,
20 | "vocab_size": 49408,
21 | "width": 1280,
22 | "heads": 20,
23 | "layers": 32,
24 | "xattn": false,
25 | "fusedLN": false
26 | }
27 | }
--------------------------------------------------------------------------------
/llava/model/multimodal_encoder/eva_clip/model_configs/EVA-CLIP-8B-plus.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1280,
3 | "vision_cfg": {
4 | "image_size": 448,
5 | "layers": 32,
6 | "width": 4096,
7 | "head_width": 128,
8 | "mlp_ratio": 5,
9 | "patch_size": 14,
10 | "eva_model_name": "eva-clip-8b-14-plus-x",
11 | "drop_path_rate": 0,
12 | "qkv_bias": false,
13 | "xattn": true,
14 | "postnorm": false,
15 | "fusedLN": false,
16 | "use_rms_norm": true
17 | },
18 | "text_cfg": {
19 | "context_length": 77,
20 | "vocab_size": 49408,
21 | "width": 1280,
22 | "heads": 20,
23 | "layers": 32,
24 | "xattn": false,
25 | "fusedLN": false
26 | }
27 | }
--------------------------------------------------------------------------------
/llava/model/multimodal_encoder/eva_clip/model_configs/EVA-CLIP-8B.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1280,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 32,
6 | "width": 4096,
7 | "head_width": 128,
8 | "mlp_ratio": 5,
9 | "patch_size": 14,
10 | "eva_model_name": "eva-clip-8b-14-x",
11 | "drop_path_rate": 0,
12 | "qkv_bias": false,
13 | "xattn": true,
14 | "postnorm": false,
15 | "fusedLN": false,
16 | "use_rms_norm": true
17 | },
18 | "text_cfg": {
19 | "context_length": 77,
20 | "vocab_size": 49408,
21 | "width": 1280,
22 | "heads": 20,
23 | "layers": 32,
24 | "xattn": false,
25 | "fusedLN": false
26 | }
27 | }
--------------------------------------------------------------------------------
/llava/model/multimodal_encoder/eva_clip/model_configs/EVA01-CLIP-B-16.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 768,
7 | "patch_size": 16,
8 | "eva_model_name": "eva-clip-b-16",
9 | "ls_init_value": 0.1,
10 | "drop_path_rate": 0.0
11 | },
12 | "text_cfg": {
13 | "context_length": 77,
14 | "vocab_size": 49408,
15 | "width": 512,
16 | "heads": 8,
17 | "layers": 12
18 | }
19 | }
--------------------------------------------------------------------------------
/llava/model/multimodal_encoder/eva_clip/model_configs/EVA01-CLIP-g-14-plus.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 40,
6 | "width": 1408,
7 | "head_width": 88,
8 | "mlp_ratio": 4.3637,
9 | "patch_size": 14,
10 | "eva_model_name": "eva-clip-g-14-x",
11 | "drop_path_rate": 0,
12 | "xattn": true,
13 | "fusedLN": true
14 | },
15 | "text_cfg": {
16 | "context_length": 77,
17 | "vocab_size": 49408,
18 | "width": 1024,
19 | "heads": 16,
20 | "layers": 24,
21 | "xattn": false,
22 | "fusedLN": true
23 | }
24 | }
--------------------------------------------------------------------------------
/llava/model/multimodal_encoder/eva_clip/model_configs/EVA01-CLIP-g-14.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 40,
6 | "width": 1408,
7 | "head_width": 88,
8 | "mlp_ratio": 4.3637,
9 | "patch_size": 14,
10 | "eva_model_name": "eva-clip-g-14-x",
11 | "drop_path_rate": 0.4,
12 | "xattn": true,
13 | "fusedLN": true
14 | },
15 | "text_cfg": {
16 | "context_length": 77,
17 | "vocab_size": 49408,
18 | "width": 768,
19 | "heads": 12,
20 | "layers": 12,
21 | "xattn": false,
22 | "fusedLN": true
23 | }
24 | }
--------------------------------------------------------------------------------
/llava/model/multimodal_encoder/eva_clip/model_configs/EVA02-CLIP-B-16.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 512,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 12,
6 | "width": 768,
7 | "head_width": 64,
8 | "patch_size": 16,
9 | "mlp_ratio": 2.6667,
10 | "eva_model_name": "eva-clip-b-16-X",
11 | "drop_path_rate": 0.0,
12 | "xattn": true,
13 | "fusedLN": true,
14 | "rope": true,
15 | "pt_hw_seq_len": 16,
16 | "intp_freq": true,
17 | "naiveswiglu": true,
18 | "subln": true
19 | },
20 | "text_cfg": {
21 | "context_length": 77,
22 | "vocab_size": 49408,
23 | "width": 512,
24 | "heads": 8,
25 | "layers": 12,
26 | "xattn": true,
27 | "fusedLN": true
28 | }
29 | }
--------------------------------------------------------------------------------
/llava/model/multimodal_encoder/eva_clip/model_configs/EVA02-CLIP-L-14-336.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "image_size": 336,
5 | "layers": 24,
6 | "width": 1024,
7 | "drop_path_rate": 0,
8 | "head_width": 64,
9 | "mlp_ratio": 2.6667,
10 | "patch_size": 14,
11 | "eva_model_name": "eva-clip-l-14-336",
12 | "xattn": true,
13 | "fusedLN": true,
14 | "rope": true,
15 | "pt_hw_seq_len": 16,
16 | "intp_freq": true,
17 | "naiveswiglu": true,
18 | "subln": true
19 | },
20 | "text_cfg": {
21 | "context_length": 77,
22 | "vocab_size": 49408,
23 | "width": 768,
24 | "heads": 12,
25 | "layers": 12,
26 | "xattn": false,
27 | "fusedLN": true
28 | }
29 | }
--------------------------------------------------------------------------------
/llava/model/multimodal_encoder/eva_clip/model_configs/EVA02-CLIP-L-14.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 768,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 24,
6 | "width": 1024,
7 | "drop_path_rate": 0,
8 | "head_width": 64,
9 | "mlp_ratio": 2.6667,
10 | "patch_size": 14,
11 | "eva_model_name": "eva-clip-l-14",
12 | "xattn": true,
13 | "fusedLN": true,
14 | "rope": true,
15 | "pt_hw_seq_len": 16,
16 | "intp_freq": true,
17 | "naiveswiglu": true,
18 | "subln": true
19 | },
20 | "text_cfg": {
21 | "context_length": 77,
22 | "vocab_size": 49408,
23 | "width": 768,
24 | "heads": 12,
25 | "layers": 12,
26 | "xattn": false,
27 | "fusedLN": true
28 | }
29 | }
--------------------------------------------------------------------------------
/llava/model/multimodal_encoder/eva_clip/model_configs/EVA02-CLIP-bigE-14-plus.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 64,
6 | "width": 1792,
7 | "head_width": 112,
8 | "mlp_ratio": 8.571428571428571,
9 | "patch_size": 14,
10 | "eva_model_name": "eva-clip-4b-14-x",
11 | "drop_path_rate": 0,
12 | "xattn": true,
13 | "postnorm": true,
14 | "fusedLN": true
15 | },
16 | "text_cfg": {
17 | "context_length": 77,
18 | "vocab_size": 49408,
19 | "width": 1280,
20 | "heads": 20,
21 | "layers": 32,
22 | "xattn": false,
23 | "fusedLN": true
24 | }
25 | }
26 |
--------------------------------------------------------------------------------
/llava/model/multimodal_encoder/eva_clip/model_configs/EVA02-CLIP-bigE-14.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 64,
6 | "width": 1792,
7 | "head_width": 112,
8 | "mlp_ratio": 8.571428571428571,
9 | "patch_size": 14,
10 | "eva_model_name": "eva-clip-4b-14-x",
11 | "drop_path_rate": 0,
12 | "xattn": true,
13 | "postnorm": true,
14 | "fusedLN": true
15 | },
16 | "text_cfg": {
17 | "context_length": 77,
18 | "vocab_size": 49408,
19 | "width": 1024,
20 | "heads": 16,
21 | "layers": 24,
22 | "xattn": false,
23 | "fusedLN": true
24 | }
25 | }
--------------------------------------------------------------------------------
/llava/model/multimodal_encoder/eva_clip/model_configs/Internal-EVA02-CLIP-10B-14-448.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 448,
5 | "layers": 77,
6 | "width": 2304,
7 | "head_width": 144,
8 | "mlp_ratio": 10.9722,
9 | "patch_size": 14,
10 | "eva_model_name": "eva-clip-10b-14-x",
11 | "drop_path_rate": 0,
12 | "xattn": true,
13 | "postnorm": false,
14 | "fusedLN": true
15 | },
16 | "text_cfg": {
17 | "context_length": 77,
18 | "vocab_size": 49408,
19 | "width": 1280,
20 | "heads": 20,
21 | "layers": 32,
22 | "xattn": false,
23 | "fusedLN": true
24 | }
25 | }
26 |
--------------------------------------------------------------------------------
/llava/model/multimodal_encoder/eva_clip/model_configs/Internal-EVA02-CLIP-10B-14.json:
--------------------------------------------------------------------------------
1 | {
2 | "embed_dim": 1024,
3 | "vision_cfg": {
4 | "image_size": 224,
5 | "layers": 77,
6 | "width": 2304,
7 | "head_width": 144,
8 | "mlp_ratio": 10.9722,
9 | "patch_size": 14,
10 | "eva_model_name": "eva-clip-10b-14-x",
11 | "drop_path_rate": 0,
12 | "xattn": true,
13 | "postnorm": false,
14 | "fusedLN": true
15 | },
16 | "text_cfg": {
17 | "context_length": 77,
18 | "vocab_size": 49408,
19 | "width": 1280,
20 | "heads": 20,
21 | "layers": 32,
22 | "xattn": false,
23 | "fusedLN": true
24 | }
25 | }
26 |
--------------------------------------------------------------------------------
/llava/model/multimodal_projector/pooler_projector.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 |
4 | import math
5 |
6 | from transformers.models.clip.modeling_clip import CLIPVisionModel
7 |
8 |
9 | class PoolerProjector(nn.Module):
10 | def __init__(self, config, vision_cfg):
11 | super().__init__()
12 | self._config = config
13 | self.hw = vision_cfg.image_size // vision_cfg.patch_size
14 |
15 | self.conv_pool = nn.Conv2d(config.mm_hidden_size, config.hidden_size, kernel_size=2, stride=2)
16 |
17 | self.proj = nn.Sequential(
18 | nn.GELU(),
19 | nn.Linear(config.hidden_size, config.hidden_size),
20 | )
21 |
22 | def forward(self, x, *args, **kwargs):
23 | height = width = self.hw
24 | assert height * width == x.shape[1]
25 | x = x.view(x.shape[0], height, width, -1).permute(0, 3, 1, 2)
26 | x = self.conv_pool(x)
27 | x = x.flatten(2).transpose(1, 2)
28 | x = self.proj(x)
29 | return x
30 |
31 | @property
32 | def config(self):
33 | return {"mm_projector_type": "pooler"}
34 |
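A quick shape check for the module above (illustrative only; the 336px/14px geometry and the hidden sizes are assumptions, not values read from any config in this repository). With hw = 336 // 14 = 24, the projector receives 24*24 = 576 tokens and the stride-2 conv pool returns a 12*12 = 144-token grid:

# Illustrative only.
import torch
from types import SimpleNamespace

config = SimpleNamespace(mm_hidden_size=1024, hidden_size=4096)   # assumed sizes
vision_cfg = SimpleNamespace(image_size=336, patch_size=14)
proj = PoolerProjector(config, vision_cfg)
out = proj(torch.randn(2, 576, 1024))   # (batch, tokens, mm_hidden_size)
print(out.shape)                        # torch.Size([2, 144, 4096])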
--------------------------------------------------------------------------------
/llava/model/multimodal_resampler/builder.py:
--------------------------------------------------------------------------------
1 | import torch
2 |
3 | from .masked_drop import MaskedDrop
4 | from .spatial_pool import SpatialPool
5 | from .perceiver import PerceiverResampler
6 | from .qformer import Qformer
7 |
8 |
9 | class IdentityMap(torch.nn.Module):
10 | def __init__(self):
11 | super().__init__()
12 |
13 | def forward(self, x, *args, **kwargs):
14 | return x
15 |
16 | @property
17 | def config(self):
18 | return {"mm_resampler_type": None}
19 |
20 |
21 | def build_vision_resampler(model_args, delay_load=False, **kwargs):
22 | resampler_type = getattr(model_args, "mm_resampler_type", None)
23 | if resampler_type == "masked_drop":
24 | return MaskedDrop(model_args)
25 | elif resampler_type == "spatial_pool":
26 | return SpatialPool(model_args, **kwargs)
27 | elif resampler_type == "perceiver":
28 | return PerceiverResampler(model_args, **kwargs)
29 | elif resampler_type == "qformer":
30 | return Qformer(model_args, **kwargs)
31 | elif resampler_type is None:
32 | return IdentityMap()
33 |
34 | raise ValueError(f"Unknown resampler type: {resampler_type}")
35 |
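As with the vision-tower builder, the resampler is selected purely from mm_resampler_type; a small sketch (illustrative only, attribute values assumed):

# Illustrative only.
from types import SimpleNamespace

identity = build_vision_resampler(SimpleNamespace())   # mm_resampler_type unset -> IdentityMap
pool_args = SimpleNamespace(mm_resampler_type="spatial_pool",
                            mm_spatial_pool_mode="average",
                            mm_spatial_pool_stride=2)
# SpatialPool also needs the vision tower, forwarded through **kwargs:
# resampler = build_vision_resampler(pool_args, vision_tower=tower)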
--------------------------------------------------------------------------------
/llava/model/multimodal_resampler/spatial_pool.py:
--------------------------------------------------------------------------------
1 | import torch
2 | import torch.nn as nn
3 | import math
4 |
5 |
6 | class SpatialPool(nn.Module):
7 | def __init__(self, model_args, vision_tower):
8 | super().__init__()
9 |
10 | self.mode = model_args.mm_spatial_pool_mode
11 | self.stride = model_args.mm_spatial_pool_stride
12 | self.out_channels = getattr(model_args, "mm_spatial_pool_out_channels", vision_tower.hidden_size)
13 |
14 | if self.mode == "average":
15 | self.pool = nn.AvgPool2d(kernel_size=self.stride, stride=self.stride)
16 | elif self.mode == "max":
17 | self.pool = nn.MaxPool2d(kernel_size=self.stride, stride=self.stride)
18 | elif self.mode == "conv":
19 | self.pool = nn.Conv2d(in_channels=vision_tower.hidden_size, out_channels=self.out_channels, kernel_size=self.stride, stride=self.stride)
20 | else:
21 | raise ValueError(f"Unknown pooling mode: {self.mode}.")
22 |
23 | def forward(self, image_features, images, *args, **kwargs):
24 | ori_W = int(math.sqrt(image_features.shape[1] * images.shape[3] // images.shape[2]))
25 | ori_H = int(ori_W * images.shape[2] // images.shape[3])
26 |
27 | B, _, F = image_features.shape
28 |
29 | image_features_spatial = image_features.view(B, ori_H, ori_W, F).permute(0, 3, 1, 2)
30 | image_features_spatial_pool = self.pool(image_features_spatial)
31 |
32 | return image_features_spatial_pool.flatten(2).transpose(1, 2).contiguous()
33 |
34 | @property
35 | def config(self):
36 | return {
37 | "mm_resampler_type": "spatial_pool",
38 | "mm_spatial_pool_stride": self.stride,
39 | "mm_spatial_pool_mode": self.mode,
40 | "mm_spatial_pool_out_channels": self.out_channels,
41 | }
42 |
43 | @property
44 | def hidden_size(self):
45 | return self.out_channels
46 |
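As a worked example of the token reduction (encoder geometry assumed for illustration): a 336px CLIP-L/14 tower yields a 24 x 24 = 576-token grid, so with mm_spatial_pool_stride=2 the pooled grid is 12 x 12 and the resampler returns 144 tokens per image. hidden_size stays at the tower's width for the average/max modes and becomes mm_spatial_pool_out_channels only in the conv mode.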
--------------------------------------------------------------------------------
/llava/model/utils.py:
--------------------------------------------------------------------------------
1 | from transformers import AutoConfig
2 |
3 |
4 | def auto_upgrade(config):
5 | cfg = AutoConfig.from_pretrained(config)
6 | if "llava" in config and "llava" not in cfg.model_type:
7 | assert cfg.model_type == "llama"
8 | print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.")
9 | print("You must upgrade the checkpoint to the new code base (this can be done automatically).")
10 | confirm = input("Please confirm that you want to upgrade the checkpoint. [Y/N]")
11 | if confirm.lower() in ["y", "yes"]:
12 | print("Upgrading checkpoint...")
13 | assert len(cfg.architectures) == 1
14 | setattr(cfg.__class__, "model_type", "llava")
15 | cfg.architectures[0] = "LlavaLlamaForCausalLM"
16 | cfg.save_pretrained(config)
17 | print("Checkpoint upgraded.")
18 | else:
19 | print("Checkpoint upgrade aborted.")
20 | exit(1)
21 |
--------------------------------------------------------------------------------
/llava/serve/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/LLaVA-UHD/08c85e208cd12c825303619723e4fe20e64f7f58/llava/serve/__init__.py
--------------------------------------------------------------------------------
/llava/serve/examples/extreme_ironing.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/LLaVA-UHD/08c85e208cd12c825303619723e4fe20e64f7f58/llava/serve/examples/extreme_ironing.jpg
--------------------------------------------------------------------------------
/llava/serve/examples/waterview.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/LLaVA-UHD/08c85e208cd12c825303619723e4fe20e64f7f58/llava/serve/examples/waterview.jpg
--------------------------------------------------------------------------------
/llava/serve/register_worker.py:
--------------------------------------------------------------------------------
1 | """
2 | Manually register workers.
3 |
4 | Usage:
5 | python3 -m llava.serve.register_worker --controller-address http://localhost:21001 --worker-name http://localhost:21002
6 | """
7 |
8 | import argparse
9 |
10 | import requests
11 |
12 | if __name__ == "__main__":
13 | parser = argparse.ArgumentParser()
14 | parser.add_argument("--controller-address", type=str)
15 | parser.add_argument("--worker-name", type=str)
16 | parser.add_argument("--check-heart-beat", action="store_true")
17 | args = parser.parse_args()
18 |
19 | url = args.controller_address + "/register_worker"
20 | data = {
21 | "worker_name": args.worker_name,
22 | "check_heart_beat": args.check_heart_beat,
23 | "worker_status": None,
24 | }
25 | r = requests.post(url, json=data)
26 | assert r.status_code == 200
27 |
--------------------------------------------------------------------------------
/llava/serve/test_message.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import json
3 |
4 | import requests
5 |
6 | from llava.conversation import default_conversation
7 |
8 |
9 | def main():
10 | if args.worker_address:
11 | worker_addr = args.worker_address
12 | else:
13 | controller_addr = args.controller_address
14 | ret = requests.post(controller_addr + "/refresh_all_workers")
15 | ret = requests.post(controller_addr + "/list_models")
16 | models = ret.json()["models"]
17 | models.sort()
18 | print(f"Models: {models}")
19 |
20 | ret = requests.post(controller_addr + "/get_worker_address", json={"model": args.model_name})
21 | worker_addr = ret.json()["address"]
22 | print(f"worker_addr: {worker_addr}")
23 |
24 | if worker_addr == "":
25 | return
26 |
27 | conv = default_conversation.copy()
28 | conv.append_message(conv.roles[0], args.message)
29 | prompt = conv.get_prompt()
30 |
31 | headers = {"User-Agent": "LLaVA Client"}
32 | pload = {
33 | "model": args.model_name,
34 | "prompt": prompt,
35 | "max_new_tokens": args.max_new_tokens,
36 | "temperature": 0.7,
37 | "stop": conv.sep,
38 | }
39 | response = requests.post(worker_addr + "/worker_generate_stream", headers=headers, json=pload, stream=True)
40 |
41 | print(prompt.replace(conv.sep, "\n"), end="")
42 | for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False, delimiter=b"\0"):
43 | if chunk:
44 | data = json.loads(chunk.decode("utf-8"))
45 | output = data["text"].split(conv.sep)[-1]
46 | print(output, end="\r")
47 | print("")
48 |
49 |
50 | if __name__ == "__main__":
51 | parser = argparse.ArgumentParser()
52 | parser.add_argument("--controller-address", type=str, default="http://localhost:21001")
53 | parser.add_argument("--worker-address", type=str)
54 | parser.add_argument("--model-name", type=str, default="facebook/opt-350m")
55 | parser.add_argument("--max-new-tokens", type=int, default=32)
56 | parser.add_argument("--message", type=str, default="Tell me a story with more than 1000 words.")
57 | args = parser.parse_args()
58 |
59 | main()
60 |
--------------------------------------------------------------------------------
/llava/train/train_mem.py:
--------------------------------------------------------------------------------
1 | from llava.train.train import train
2 |
3 | if __name__ == "__main__":
4 | train()
5 |
--------------------------------------------------------------------------------
/playground/demo/xU25MMA2N4aVtYay.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/thunlp/LLaVA-UHD/08c85e208cd12c825303619723e4fe20e64f7f58/playground/demo/xU25MMA2N4aVtYay.mp4
--------------------------------------------------------------------------------
/playground/equal_splitter.py:
--------------------------------------------------------------------------------
1 | import json
2 | from math import ceil
3 |
4 |
5 | def split_json_file(input_file, n_splits):
6 | # Read the JSON file
7 | with open(input_file, "r") as file:
8 | data = json.load(file)
9 |
10 | # Calculate the size of each split
11 | total_items = len(data)
12 | items_per_split = ceil(total_items / n_splits)
13 |
14 | # Split the data and save into separate files
15 | for i in range(n_splits):
16 | start_index = i * items_per_split
17 | end_index = min((i + 1) * items_per_split, total_items)
18 | split_data = data[start_index:end_index]
19 |
20 | # Write the split data to a new JSON file
21 | with open(f"{input_file.split('.')[0]}_split_{i}.json", "w") as split_file:
22 | json.dump(split_data, split_file, indent=4)
23 |
24 |
25 | def main():
26 | import argparse
27 |
28 | parser = argparse.ArgumentParser(description="Split a JSON file into multiple parts.")
29 | parser.add_argument("--input_file", type=str, help="The JSON file to split")
30 | parser.add_argument("--n_splits", type=int, help="The number of splits")
31 |
32 | args = parser.parse_args()
33 |
34 | split_json_file(args.input_file, args.n_splits)
35 |
36 |
37 | if __name__ == "__main__":
38 | main()
39 |
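For example (file name and counts illustrative): splitting a list of 1,000 records into three parts gives items_per_split = ceil(1000 / 3) = 334, so the first two output files hold 334 items and the last holds 332:

# Illustrative only: "data.json" is a placeholder for any JSON file containing a list.
split_json_file("data.json", 3)
# -> data_split_0.json (334 items), data_split_1.json (334), data_split_2.json (332)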
--------------------------------------------------------------------------------
/playground/remove_mid_ckpt.py:
--------------------------------------------------------------------------------
1 | import os
2 | import shutil
3 | import glob
4 |
5 |
6 | def remove_checkpoints(directory, pattern):
7 | # Walk through the directory
8 | for root, dirs, files in os.walk(directory):
9 | # Use glob to find paths matching the pattern
10 | for file_path in glob.glob(os.path.join(root, pattern)):
11 | # Check if it is a directory
12 | if "llava-1.6-mistral-7b" in file_path:
13 | continue
14 | if os.path.isdir(file_path):
15 | # Remove the directory
16 | print(f"Removing {file_path}")
17 | input("Press Enter to continue...")
18 | shutil.rmtree(file_path)
19 | print(f"Removed directory: {file_path}")
20 | else:
21 | print(f"Removing {file_path}")
22 | input("Press Enter to continue...")
23 | # Remove the file
24 | os.remove(file_path)
25 | print(f"Removed file: {file_path}")
26 |
27 |
28 | # Directory containing the checkpoints
29 | directory = "/mnt/bn/vl-research/checkpoints/feng/"
30 |
31 | # Pattern to match in the file names
32 | pattern = "global_step*"
33 |
34 | # Call the function
35 | remove_checkpoints(directory, pattern)
36 |
--------------------------------------------------------------------------------
/scripts/archived/convert_gqa_for_eval.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import argparse
4 |
5 | parser = argparse.ArgumentParser()
6 | parser.add_argument("--src", type=str)
7 | parser.add_argument("--dst", type=str)
8 | args = parser.parse_args()
9 |
10 | all_answers = []
11 | for line_idx, line in enumerate(open(args.src)):
12 | res = json.loads(line)
13 | question_id = res["question_id"]
14 | text = res["text"].rstrip(".").lower()
15 | all_answers.append({"questionId": question_id, "prediction": text})
16 |
17 | with open(args.dst, "w") as f:
18 | json.dump(all_answers, f)
19 |
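Concretely (values illustrative), each prediction line from --src is lower-cased and stripped of a trailing period before being re-keyed for the GQA evaluator:

# Illustrative only.
import json
res = json.loads('{"question_id": "202218649", "text": "The sky."}')   # one input line
print({"questionId": res["question_id"], "prediction": res["text"].rstrip(".").lower()})
# {'questionId': '202218649', 'prediction': 'the sky'}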
--------------------------------------------------------------------------------
/scripts/archived/convert_mmvet_for_eval.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import argparse
4 |
5 | parser = argparse.ArgumentParser()
6 | parser.add_argument("--src", type=str)
7 | parser.add_argument("--dst", type=str)
8 | args = parser.parse_args()
9 |
10 | cur_result = {}
11 |
12 | for line in open(args.src):
13 | data = json.loads(line)
14 | qid = data["question_id"]
15 | cur_result[f"v1_{qid}"] = data["text"]
16 |
17 | with open(args.dst, "w") as f:
18 | json.dump(cur_result, f, indent=2)
19 |
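The output written to --dst is a single JSON object keyed by "v1_<question_id>"; for instance (values illustrative):

# Illustrative only.
import json
data = json.loads('{"question_id": 3, "text": "A red apple on a table."}')   # one input line
print({f"v1_{data['question_id']}": data["text"]})
# {'v1_3': 'A red apple on a table.'}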
--------------------------------------------------------------------------------
/scripts/archived/convert_vizwiz_for_submission.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | import json
4 |
5 | from llava.eval.m4c_evaluator import EvalAIAnswerProcessor
6 |
7 |
8 | def parse_args():
9 | parser = argparse.ArgumentParser()
10 | parser.add_argument("--annotation-file", type=str, required=True)
11 | parser.add_argument("--result-file", type=str, required=True)
12 | parser.add_argument("--result-upload-file", type=str, required=True)
13 | return parser.parse_args()
14 |
15 |
16 | if __name__ == "__main__":
17 |
18 | args = parse_args()
19 |
20 | os.makedirs(os.path.dirname(args.result_upload_file), exist_ok=True)
21 |
22 | results = []
23 | error_line = 0
24 | for line_idx, line in enumerate(open(args.result_file)):
25 | try:
26 | results.append(json.loads(line))
27 | except:
28 | error_line += 1
29 | results = {x["question_id"]: x["text"] for x in results}
30 | test_split = [json.loads(line) for line in open(args.annotation_file)]
31 | split_ids = set([x["question_id"] for x in test_split])
32 |
33 | print(f"total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}")
34 |
35 | all_answers = []
36 |
37 | answer_processor = EvalAIAnswerProcessor()
38 |
39 | for x in test_split:
40 | # import pdb; pdb.set_trace()
41 | assert x["question_id"] in results, print(x)
42 | all_answers.append({"image": x["image"], "answer": answer_processor(results[x["question_id"]])})
43 |
44 | with open(args.result_upload_file, "w") as f:
45 | json.dump(all_answers, f)
46 |
--------------------------------------------------------------------------------
/scripts/archived/convert_vqav2_for_submission.py:
--------------------------------------------------------------------------------
1 | import os
2 | import argparse
3 | import json
4 |
5 | from llava.eval.m4c_evaluator import EvalAIAnswerProcessor
6 |
7 |
8 | def parse_args():
9 | parser = argparse.ArgumentParser()
10 | parser.add_argument("--dir", type=str, default="./playground/data/eval/vqav2")
11 | parser.add_argument("--ckpt", type=str, required=True)
12 | parser.add_argument("--split", type=str, required=True)
13 | return parser.parse_args()
14 |
15 |
16 | if __name__ == "__main__":
17 |
18 | args = parse_args()
19 |
20 | src = os.path.join(args.dir, "answers", args.split, args.ckpt, "merge.jsonl")
21 | test_split = os.path.join(args.dir, "llava_vqav2_mscoco_test2015.jsonl")
22 | dst = os.path.join(args.dir, "answers_upload", args.split, f"{args.ckpt}.json")
23 | os.makedirs(os.path.dirname(dst), exist_ok=True)
24 |
25 | results = []
26 | error_line = 0
27 | for line_idx, line in enumerate(open(src)):
28 | try:
29 | results.append(json.loads(line))
30 | except:
31 | error_line += 1
32 |
33 | results = {x["question_id"]: x["text"] for x in results}
34 | test_split = [json.loads(line) for line in open(test_split)]
35 | split_ids = set([x["question_id"] for x in test_split])
36 |
37 | print(f"total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}")
38 |
39 | all_answers = []
40 |
41 | answer_processor = EvalAIAnswerProcessor()
42 |
43 | for x in test_split:
44 | if x["question_id"] not in results:
45 | all_answers.append({"question_id": x["question_id"], "answer": ""})
46 | else:
47 | all_answers.append({"question_id": x["question_id"], "answer": answer_processor(results[x["question_id"]])})
48 |
49 | with open(dst, "w") as f:
50 | json.dump(all_answers, f)
51 |
--------------------------------------------------------------------------------
/scripts/archived/entry_cmd.sh:
--------------------------------------------------------------------------------
1 | python3 -m pip install --upgrade pip;
2 |
3 | export http_proxy=http://sys-proxy-rd-relay.byted.org:8118;
4 | export https_proxy=http://sys-proxy-rd-relay.byted.org:8118;
5 |
6 | export HF_HUB_ENABLE_HF_TRANSFER="1";
7 |
8 | cd /mnt/bn/vl-research-boli01-cn/projects/zzz/lmms-eval;
9 | pip install -e .;
10 |
11 | cd /mnt/bn/vl-research-boli01-cn/projects/zzz/LLaVA_Next;
12 | pip install -e .;
13 |
14 | python3 -m pip install ninja;
15 | python3 -m pip install flash-attn --no-build-isolation;
16 |
17 | bash /mnt/bn/vl-research-boli01-cn/projects/zzz/LLaVA_Next/cn_scripts/vicuna/internal0.6m_finetune_llava1.6mix_7b_v0.2_unfreeze.sh
18 |
19 |
20 | accelerate launch --num_processes 8 --main_process_port 12345 -m lmms_eval \
21 | --model llava \
22 | --model_args pretrained="/mnt/bn/vl-research-boli01-cn/projects/zzz/LLaVA_Next/internal_project_checkpoints/llavanext-lmsys_vicuna-7b-v1.5-clip-vit-large-patch14-336-mlp2x_gelu-pretrain_internal0.6m_vicuna_v1_finetune_llava1.6_datamix_unfreezeVIS_1e" \
23 | --tasks ok_vqa,textcaps_val,mme_test,mmmu,cmmmu,coco2017_cap_val,vizwiz_vqa_val,ai2d,chartqa,pope \
24 | --batch_size 1 \
25 | --log_samples \
26 | --log_samples_suffix debug \
27 | --output_path ./logs/ \
28 | --wandb_args 'project=llava-next-lmms-eval,job_type=eval';
--------------------------------------------------------------------------------
/scripts/archived/finetune_full_schedule.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Uncomment and set the following variables correspondingly to run this script:
4 |
5 | ################## VICUNA ##################
6 | # PROMPT_VERSION=v1
7 | # MODEL_VERSION="vicuna-v1-3-7b"
8 | ################## VICUNA ##################
9 |
10 | ################## LLaMA-2 ##################
11 | # PROMPT_VERSION="llava_llama_2"
12 | # MODEL_VERSION="llama-2-7b-chat"
13 | ################## LLaMA-2 ##################
14 |
15 | deepspeed llava/train/train_mem.py \
16 | --deepspeed ./scripts/zero2.json \
17 | --model_name_or_path ./checkpoints/$MODEL_VERSION \
18 | --version $PROMPT_VERSION \
19 | --data_path ./playground/data/llava_instruct_158k.json \
20 | --image_folder /path/to/coco/train2017 \
21 | --vision_tower openai/clip-vit-large-patch14 \
22 | --pretrain_mm_mlp_adapter ./checkpoints/llava-$MODEL_VERSION-pretrain/mm_projector.bin \
23 | --mm_vision_select_layer -2 \
24 | --mm_use_im_start_end False \
25 | --mm_use_im_patch_token False \
26 | --bf16 True \
27 | --output_dir ./checkpoints/llava-$MODEL_VERSION-finetune \
28 | --num_train_epochs 3 \
29 | --per_device_train_batch_size 16 \
30 | --per_device_eval_batch_size 4 \
31 | --gradient_accumulation_steps 1 \
32 | --evaluation_strategy "no" \
33 | --save_strategy "steps" \
34 | --save_steps 50000 \
35 | --save_total_limit 1 \
36 | --learning_rate 2e-5 \
37 | --weight_decay 0. \
38 | --warmup_ratio 0.03 \
39 | --lr_scheduler_type "cosine" \
40 | --logging_steps 1 \
41 | --tf32 True \
42 | --model_max_length 2048 \
43 | --gradient_checkpointing True \
44 | --dataloader_num_workers 16 \
45 | --lazy_preprocess True \
46 | --report_to wandb
47 |
--------------------------------------------------------------------------------
/scripts/archived/finetune_lora.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Uncomment and set the following variables correspondingly to run this script:
4 |
5 | ################## VICUNA ##################
6 | # PROMPT_VERSION=v1
7 | # MODEL_VERSION="vicuna-v1-3-7b"
8 | ################## VICUNA ##################
9 |
10 | ################## LLaMA-2 ##################
11 | # PROMPT_VERSION="llava_llama_2"
12 | # MODEL_VERSION="llama-2-7b-chat"
13 | ################## LLaMA-2 ##################
14 |
15 | deepspeed llava/train/train_mem.py \
16 | --deepspeed ./scripts/zero2.json \
17 | --lora_enable True \
18 | --model_name_or_path ./checkpoints/$MODEL_VERSION \
19 | --version $PROMPT_VERSION \
20 | --data_path ./playground/data/llava_instruct_80k.json \
21 | --image_folder /path/to/coco/train2017 \
22 | --vision_tower openai/clip-vit-large-patch14 \
23 | --pretrain_mm_mlp_adapter ./checkpoints/llava-$MODEL_VERSION-pretrain/mm_projector.bin \
24 | --mm_vision_select_layer -2 \
25 | --mm_use_im_start_end False \
26 | --mm_use_im_patch_token False \
27 | --bf16 True \
28 | --output_dir ./checkpoints/llava-$MODEL_VERSION-finetune_lora \
29 | --num_train_epochs 1 \
30 | --per_device_train_batch_size 16 \
31 | --per_device_eval_batch_size 4 \
32 | --gradient_accumulation_steps 1 \
33 | --evaluation_strategy "no" \
34 | --save_strategy "steps" \
35 | --save_steps 50000 \
36 | --save_total_limit 1 \
37 | --learning_rate 2e-5 \
38 | --weight_decay 0. \
39 | --warmup_ratio 0.03 \
40 | --lr_scheduler_type "cosine" \
41 | --logging_steps 1 \
42 | --tf32 True \
43 | --model_max_length 2048 \
44 | --gradient_checkpointing True \
45 | --lazy_preprocess True \
46 | --dataloader_num_workers 16 \
47 | --report_to wandb
48 |
--------------------------------------------------------------------------------
/scripts/archived/finetune_qlora.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Uncomment and set the following variables correspondingly to run this script:
4 |
5 | ################## VICUNA ##################
6 | # PROMPT_VERSION=v1
7 | # MODEL_VERSION="vicuna-v1-3-7b"
8 | ################## VICUNA ##################
9 |
10 | ################## LLaMA-2 ##################
11 | # PROMPT_VERSION="llava_llama_2"
12 | # MODEL_VERSION="llama-2-7b-chat"
13 | ################## LLaMA-2 ##################
14 |
15 | deepspeed llava/train/train_mem.py \
16 | --deepspeed ./scripts/zero2.json \
17 | --lora_enable True \
18 | --bits 4 \
19 | --model_name_or_path ./checkpoints/$MODEL_VERSION \
20 | --version $PROMPT_VERSION \
21 | --data_path ./playground/data/llava_instruct_80k.json \
22 | --image_folder /path/to/coco/train2017 \
23 | --vision_tower openai/clip-vit-large-patch14 \
24 | --pretrain_mm_mlp_adapter ./checkpoints/llava-$MODEL_VERSION-pretrain/mm_projector.bin \
25 | --mm_vision_select_layer -2 \
26 | --mm_use_im_start_end False \
27 | --mm_use_im_patch_token False \
28 | --bf16 True \
29 | --output_dir ./checkpoints/llava-$MODEL_VERSION-finetune_lora \
30 | --num_train_epochs 1 \
31 | --per_device_train_batch_size 16 \
32 | --per_device_eval_batch_size 4 \
33 | --gradient_accumulation_steps 1 \
34 | --evaluation_strategy "no" \
35 | --save_strategy "steps" \
36 | --save_steps 50000 \
37 | --save_total_limit 1 \
38 | --learning_rate 2e-5 \
39 | --weight_decay 0. \
40 | --warmup_ratio 0.03 \
41 | --lr_scheduler_type "cosine" \
42 | --logging_steps 1 \
43 | --tf32 True \
44 | --model_max_length 2048 \
45 | --gradient_checkpointing True \
46 | --lazy_preprocess True \
47 | --dataloader_num_workers 16 \
48 | --report_to wandb
49 |
--------------------------------------------------------------------------------
/scripts/archived/finetune_sqa.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | deepspeed llava/train/train_mem.py \
4 | --deepspeed ./scripts/zero2.json \
5 | --model_name_or_path lmsys/vicuna-13b-v1.3 \
6 | --version $PROMPT_VERSION \
7 | --data_path /Data/ScienceQA/data/scienceqa/llava_train_QCM-LEA.json \
8 | --image_folder /Data/ScienceQA/data/scienceqa/images/train \
9 | --vision_tower openai/clip-vit-large-patch14 \
10 | --pretrain_mm_mlp_adapter ./checkpoints/huggingface/liuhaotian/llava-pretrain-vicuna-13b-v1.3/mm_projector.bin \
11 | --mm_vision_select_layer -2 \
12 | --mm_use_im_start_end False \
13 | --mm_use_im_patch_token False \
14 | --bf16 True \
15 | --output_dir ./checkpoints/llava-vicuna-13b-v1.3-pretrain_lcs558k_plain-ScienceQA_QCM_LEA-12e \
16 | --num_train_epochs 12 \
17 | --per_device_train_batch_size 16 \
18 | --per_device_eval_batch_size 4 \
19 | --gradient_accumulation_steps 1 \
20 | --evaluation_strategy "no" \
21 | --save_strategy "steps" \
22 | --save_steps 50000 \
23 | --save_total_limit 1 \
24 | --learning_rate 2e-5 \
25 | --weight_decay 0. \
26 | --warmup_ratio 0.03 \
27 | --lr_scheduler_type "cosine" \
28 | --logging_steps 1 \
29 | --tf32 True \
30 | --model_max_length 2048 \
31 | --gradient_checkpointing True \
32 | --dataloader_num_workers 16 \
33 | --lazy_preprocess True \
34 | --report_to wandb
35 |
--------------------------------------------------------------------------------
/scripts/archived/merge_lora_weights.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | from llava.model.builder import load_pretrained_model
3 | from llava.mm_utils import get_model_name_from_path
4 |
5 |
6 | def merge_lora(args):
7 | model_name = get_model_name_from_path(args.model_path)
8 | tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, device_map="cpu")
9 |
10 | model.save_pretrained(args.save_model_path)
11 | tokenizer.save_pretrained(args.save_model_path)
12 |
13 |
14 | if __name__ == "__main__":
15 | parser = argparse.ArgumentParser()
16 | parser.add_argument("--model-path", type=str, required=True)
17 | parser.add_argument("--model-base", type=str, required=True)
18 | parser.add_argument("--save-model-path", type=str, required=True)
19 |
20 | args = parser.parse_args()
21 |
22 | merge_lora(args)
23 |
--------------------------------------------------------------------------------
/scripts/archived/pretrain.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Uncomment and set the following variables correspondingly to run this script:
4 |
5 | # MODEL_VERSION=vicuna-v1-3-7b
6 | # MODEL_VERSION=llama-2-7b-chat
7 |
8 | ########### DO NOT CHANGE ###########
9 | ########### USE THIS FOR BOTH ###########
10 | PROMPT_VERSION=plain
11 | ########### DO NOT CHANGE ###########
12 |
13 | deepspeed llava/train/train_mem.py \
14 | --deepspeed ./scripts/zero2.json \
15 | --model_name_or_path ./checkpoints/$MODEL_VERSION \
16 | --version $PROMPT_VERSION \
17 | --data_path /path/to/pretrain_data.json \
18 | --image_folder /path/to/images \
19 | --vision_tower openai/clip-vit-large-patch14 \
20 | --tune_mm_mlp_adapter True \
21 | --mm_vision_select_layer -2 \
22 | --mm_use_im_start_end False \
23 | --mm_use_im_patch_token False \
24 | --bf16 True \
25 | --output_dir ./checkpoints/llava-$MODEL_VERSION-pretrain \
26 | --num_train_epochs 1 \
27 | --per_device_train_batch_size 16 \
28 | --per_device_eval_batch_size 4 \
29 | --gradient_accumulation_steps 1 \
30 | --evaluation_strategy "no" \
31 | --save_strategy "steps" \
32 | --save_steps 24000 \
33 | --learning_rate 2e-3 \
34 | --weight_decay 0. \
35 | --warmup_ratio 0.03 \
36 | --lr_scheduler_type "cosine" \
37 | --logging_steps 1 \
38 | --tf32 True \
39 | --model_max_length 2048 \
40 | --gradient_checkpointing True \
41 | --dataloader_num_workers 16 \
42 | --lazy_preprocess True \
43 | --report_to wandb
44 |
--------------------------------------------------------------------------------
/scripts/archived/quick_check.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import argparse
4 | from tqdm import tqdm
5 | import yaml
6 |
7 |
8 | def check_missing_images(json_path, images_folder):
9 | data = json.load(open(json_path, "r"))
10 | missing_data = []
11 |
12 | for i, d in enumerate(tqdm(data)):
13 | image = d["image"] if "image" in d else ""
14 | if image != "":
15 | path = os.path.join(images_folder, image)
16 | if not os.path.exists(path):
17 | print(f"Missing image: {path}")
18 | missing_data.append(d)
19 |
20 | return missing_data
21 |
22 |
23 | def read_yaml_to_llava_data(yaml_path, images_folder):
24 | print(f"Reading YAML file: {yaml_path}")
25 | with open(yaml_path, "r") as f:
26 | data = yaml.safe_load(f)
27 |
28 | llava_json_paths = data["datasets"]
29 | for item in llava_json_paths:
30 | json_path = item["json_path"]
31 | missing_data = check_missing_images(json_path, images_folder)
32 | if len(missing_data) > 0:
33 | print(f"Missing images in {json_path}:")
34 | for d in missing_data:
35 | print(d)
36 |
37 |
38 | def direct_check_llava_data(json_path, images_folder):
39 | missing_data = check_missing_images(json_path, images_folder)
40 | if len(missing_data) > 0:
41 | print(f"Missing images in {json_path}:")
42 | for d in missing_data:
43 | print(d)
44 |
45 |
46 | if __name__ == "__main__":
47 | parser = argparse.ArgumentParser(description="Check for missing images in dataset.")
48 | parser.add_argument("--yaml_path", type=str, default="", help="Path to the YAML file containing the dataset.")
49 | parser.add_argument("--json_path", type=str, default="", help="Path to the JSON file containing the dataset.")
50 | parser.add_argument("--images_folder", type=str, default="/mnt/bn/vl-research/data/llava_data", help="Path to the folder containing the images.")
51 |
52 | args = parser.parse_args()
53 |
54 | if args.json_path != "":
55 | direct_check_llava_data(args.json_path, args.images_folder)
56 | elif args.yaml_path != "":
57 | read_yaml_to_llava_data(args.yaml_path, args.images_folder)
58 |
--------------------------------------------------------------------------------
/scripts/archived/sqa_eval_batch.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | CHUNKS=8
4 | for IDX in {0..7}; do
5 | CUDA_VISIBLE_DEVICES=$IDX python -m llava.eval.model_vqa_science \
6 | --model-path liuhaotian/llava-lcs558k-scienceqa-vicuna-13b-v1.3 \
7 | --question-file ~/haotian/datasets/ScienceQA/data/scienceqa/llava_test_QCM-LEA.json \
8 | --image-folder ~/haotian/datasets/ScienceQA/data/scienceqa/images/test \
9 | --answers-file ./test_llava-13b-chunk${IDX}.jsonl \
10 | --num-chunks $CHUNKS \
11 | --chunk-idx $IDX \
12 | --conv-mode llava_v1 &
13 | done
14 |
--------------------------------------------------------------------------------
/scripts/archived/sqa_eval_gather.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | CHUNKS=8
4 | output_file="test_llava-13b.jsonl"
5 |
6 | # Clear out the output file if it exists.
7 | > "$output_file"
8 |
9 | # Loop through the indices and concatenate each file.
10 | for idx in $(seq 0 $((CHUNKS-1))); do
11 | cat "./test_llava-13b-chunk${idx}.jsonl" >> "$output_file"
12 | done
13 |
14 | python llava/eval/eval_science_qa.py \
15 | --base-dir ~/haotian/datasets/ScienceQA/data/scienceqa \
16 | --result-file ./test_llava-13b.jsonl \
17 | --output-file ./test_llava-13b_output.json \
18 | --output-result ./test_llava-13b_result.json
19 |
--------------------------------------------------------------------------------
/scripts/convert_gqa_for_eval.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import argparse
4 |
5 | parser = argparse.ArgumentParser()
6 | parser.add_argument("--src", type=str)
7 | parser.add_argument("--dst", type=str)
8 | args = parser.parse_args()
9 |
10 | all_answers = []
11 | for line_idx, line in enumerate(open(args.src)):
12 | res = json.loads(line)
13 | question_id = res['question_id']
14 | text = res['text'].rstrip('.').lower()
15 | all_answers.append({"questionId": question_id, "prediction": text})
16 |
17 | with open(args.dst, 'w') as f:
18 | json.dump(all_answers, f)
19 |
--------------------------------------------------------------------------------
/scripts/interleave/eval_all.sh:
--------------------------------------------------------------------------------
1 |
2 | # evaluate
3 | ./scripts/interleave/eval_interleave_3d.sh /path/to/ckpt /path/to/images multi_image_in_domain
4 | ./scripts/interleave/eval_interleave_3d.sh /path/to/ckpt /path/to/images multi_image_out_domain
5 | ./scripts/interleave/eval_interleave_3d.sh /path/to/ckpt /path/to/images multi_view_in_domain
--------------------------------------------------------------------------------
/scripts/interleave/eval_interleave_3d.sh:
--------------------------------------------------------------------------------
1 | alias python=python3
2 | CKPT_PATH=$1
3 | NAME=$(echo "$CKPT_PATH" | awk -F'/' '{print $NF}')
4 | echo $NAME
5 | ##### set images path
6 | DATA_PATH=$2
7 | EVAL_TYPE=$3
8 | JSON_PATH=$2/$3.json
9 | ############################### eval multi-image
10 | RESULT_NAME="logs/${NAME}/${EVAL_TYPE}"
11 | echo $RESULT_NAME
12 |
13 | mkdir -p logs/${NAME}
14 |
15 | file_path=${RESULT_NAME}/result.jsonl
16 |
17 | bash scripts/interleave/eval_multiprocess.sh \
18 | ${CKPT_PATH} \
19 | ${JSON_PATH} \
20 | ${RESULT_NAME} \
21 | ${DATA_PATH} \
22 | "" \
23 | 8 0
24 |
25 | python3 llava/eval/evaluate_interleave.py --result-dir ${RESULT_NAME}
26 |
27 |
28 |
29 |
--------------------------------------------------------------------------------
/scripts/interleave/eval_multiprocess.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Check if seven arguments are passed
4 | if [ "$#" -ne 7 ]; then
5 | echo "Usage: $0 <model_path> <question_path> <base_answer_path> <image_folder> <extra_prompt> <num_chunks> <temperature>"
6 | exit 1
7 | fi
8 |
9 | # Assign the command line arguments to variables
10 | model_path=$1
11 | question_path=$2
12 | base_answer_path=$3
13 | image_folder=$4
14 | extra_prompt=$5
15 | N=$6
16 | temperature=$7
17 |
18 | # Loop over each chunk/process
19 | for (( chunk_id=0; chunk_id<N; chunk_id++ ))
20 | do
21 |     # Run the inference for this chunk on its own GPU in the background.
22 |     CUDA_VISIBLE_DEVICES=$chunk_id python -m llava.eval.model_vqa \
23 |         --model-path ${model_path} \
24 |         --question-file ${question_path} \
25 |         --image-folder ${image_folder} \
26 |         --answers-file ${base_answer_path}/${chunk_id}.jsonl \
27 |         --num-chunks $N \
28 |         --chunk-idx $chunk_id \
29 |         --extra-prompt "${extra_prompt}" \
30 |         --temperature ${temperature} &
31 | done
32 |
33 | # Wait for all chunks to finish.
34 | wait
35 |
36 | # Clear out the merged result file if it exists.
37 | > "${base_answer_path}/result.jsonl"
38 |
39 | # Concatenate the per-chunk answers into a single result file.
40 | for ((i=0; i<N; i++))
41 | do
42 |     cat "${base_answer_path}/${i}.jsonl" >> "${base_answer_path}/result.jsonl"
43 | done
44 | # remove the unmerged files
45 | for (( chunk_id=0; chunk_id<N; chunk_id++ ))
46 | do
47 |     rm "${base_answer_path}/${chunk_id}.jsonl"
48 | done
--------------------------------------------------------------------------------
/scripts/v1_5/eval/ai2d.sh:
--------------------------------------------------------------------------------
39 | # Clear out the output file if it exists.
40 | > "$output_file"
41 |
42 | # Loop through the indices and concatenate each file.
43 | for IDX in $(seq 0 $((CHUNKS-1))); do
44 | cat ./playground/data/eval/ai2d/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file"
45 | done
46 |
47 | python -m llava.eval.eval_ai2d \
48 | --annotation-file ./playground/data/eval/ai2d/test_from_mova.jsonl \
49 | --result-file $output_file \
50 | --mid_result ./playground/data/eval/ai2d/mid_results/$CKPT.jsonl \
51 | --output_result ./exp_results/$CKPT/ai2d_result.jsonl
--------------------------------------------------------------------------------
/scripts/v1_5/eval/deepform.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
4 | IFS=',' read -ra GPULIST <<< "$gpu_list"
5 |
6 | CHUNKS=${#GPULIST[@]}
7 |
8 | CKPT=$1
9 | echo $CKPT
10 | SPLIT="test"
11 |
12 | for IDX in $(seq 0 $((CHUNKS-1))); do
13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \
14 | --model-path ./checkpoints_new/$CKPT \
15 | --question-file ./playground/data/eval/DeepForm/deep_questions.jsonl \
16 | --image-folder ./playground/data/ureader/DUE_Benchmark/DeepForm/page_pngs \
17 | --answers-file ./playground/data/eval/DeepForm/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \
18 | --num-chunks $CHUNKS \
19 | --chunk-idx $IDX \
20 | --temperature 0 \
21 | --num_beams 1 \
22 | --conv-mode vicuna_v1 &
23 | done
24 |
25 | wait
26 |
27 | output_file=./playground/data/eval/DeepForm/answers/$SPLIT/$CKPT/merge_slice.jsonl
28 |
29 | # Clear out the output file if it exists.
30 | > "$output_file"
31 |
32 | # Loop through the indices and concatenate each file.
33 | for IDX in $(seq 0 $((CHUNKS-1))); do
34 | cat ./playground/data/eval/DeepForm/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file"
35 | done
36 |
37 | python -m llava.eval.eval_docvqa \
38 | --annotation-file ./playground/data/eval/DeepForm/deep_annotations.jsonl \
39 | --result-file $output_file \
40 | --mid_result ./playground/data/eval/DeepForm/mid_results/$CKPT.jsonl \
41 | --output_result ./exp_results/$CKPT/deep_result.jsonl
--------------------------------------------------------------------------------
/scripts/v1_5/eval/docvqa_test.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | python -m mova.eval.model_vqa_loader \
4 | --model-path checkpoints/mova-8b \
5 | --question-file ./playground/data/eval/docvqa/test.jsonl \
6 | --image-folder ./playground/data/eval/docvqa/test/documents/ \
7 | --answers-file ./playground/data/eval/docvqa/answers/mova-8b.jsonl \
8 | --temperature 0 \
9 | --conv-mode mova_llama3
10 |
11 | python scripts/convert_docvqa_for_submission.py \
12 | --result-dir ./playground/data/eval/docvqa/answers \
13 | --upload_dir ./playground/data/eval/docvqa/upload_results \
14 | --experiment mova-8b
15 |
--------------------------------------------------------------------------------
/scripts/v1_5/eval/docvqa_val.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
4 | IFS=',' read -ra GPULIST <<< "$gpu_list"
5 |
6 | CHUNKS=${#GPULIST[@]}
7 |
8 | CKPT=$1
9 | echo $CKPT
10 | SPLIT="val"
11 |
12 | for IDX in $(seq 0 $((CHUNKS-1))); do
13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \
14 | --model-path ./checkpoints_new/$CKPT \
15 | --question-file ./playground/data/eval/docvqa/docvqa_questions.jsonl \
16 | --image-folder ./playground/data/ureader/DUE_Benchmark/DocVQA/pngs \
17 | --answers-file ./playground/data/eval/docvqa/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \
18 | --num-chunks $CHUNKS \
19 | --chunk-idx $IDX \
20 | --temperature 0 \
21 | --num_beams 1 \
22 | --conv-mode qwen_1_5 &
23 | done
24 |
25 | wait
26 |
27 | output_file=./playground/data/eval/docvqa/answers/$SPLIT/$CKPT/merge_slice.jsonl
28 |
29 | # Clear out the output file if it exists.
30 | > "$output_file"
31 |
32 | # Loop through the indices and concatenate each file.
33 | for IDX in $(seq 0 $((CHUNKS-1))); do
34 | cat ./playground/data/eval/docvqa/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file"
35 | done
36 |
37 | python -m llava.eval.eval_docvqa \
38 | --annotation-file ./playground/data/eval/docvqa/docvqa_annotations.jsonl \
39 | --result-file $output_file \
40 | --mid_result ./playground/data/eval/docvqa/mid_results/$CKPT.jsonl \
41 | --output_result ./exp_results/$CKPT/docvqa_result.jsonl
--------------------------------------------------------------------------------
/scripts/v1_5/eval/estvqa.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
4 | IFS=',' read -ra GPULIST <<< "$gpu_list"
5 |
6 | CHUNKS=${#GPULIST[@]}
7 |
8 | CKPT=$1
9 | echo $CKPT
10 | SPLIT="test"
11 |
12 | for IDX in $(seq 0 $((CHUNKS-1))); do
13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \
14 | --model-path ./checkpoints_new/$CKPT \
15 | --question-file ./playground/data/eval/ESTVQA/est_questions.jsonl \
16 | --image-folder ./playground/data/eval/ESTVQA/test \
17 | --answers-file ./playground/data/eval/ESTVQA/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \
18 | --num-chunks $CHUNKS \
19 | --chunk-idx $IDX \
20 | --temperature 0 \
21 | --num_beams 1 \
22 | --conv-mode vicuna_v1 &
23 | done
24 |
25 | wait
26 |
27 | output_file=./playground/data/eval/ESTVQA/answers/$SPLIT/$CKPT/merge_slice.jsonl
28 |
29 | # Clear out the output file if it exists.
30 | > "$output_file"
31 |
32 | # Loop through the indices and concatenate each file.
33 | for IDX in $(seq 0 $((CHUNKS-1))); do
34 | cat ./playground/data/eval/ESTVQA/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file"
35 | done
36 |
37 | python -m llava.eval.eval_docvqa \
38 | --annotation-file ./playground/data/eval/ESTVQA/est_annotations.jsonl \
39 | --result-file $output_file \
40 | --mid_result ./playground/data/eval/ESTVQA/mid_results/$CKPT.jsonl \
41 | --output_result ./exp_results/$CKPT/est_result.jsonl
--------------------------------------------------------------------------------
/scripts/v1_5/eval/gqa.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
4 | IFS=',' read -ra GPULIST <<< "$gpu_list"
5 |
6 | CHUNKS=${#GPULIST[@]}
7 |
8 | CKPT=$1
9 | echo $CKPT
10 | SPLIT="llava_gqa_testdev_balanced"
11 | GQADIR="./playground/data/eval/gqa/data"
12 |
13 | for IDX in $(seq 0 $((CHUNKS-1))); do
14 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \
15 | --model-path ./checkpoints_new/$CKPT \
16 | --question-file ./playground/data/eval/gqa/$SPLIT.jsonl \
17 | --image-folder ./playground/data/eval/gqa/data/images \
18 | --answers-file ./playground/data/eval/gqa/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \
19 | --num-chunks $CHUNKS \
20 | --chunk-idx $IDX \
21 | --temperature 0 \
22 | --num_beams 3 \
23 | --conv-mode qwen_1_5 &
24 | done
25 |
26 | wait
27 |
28 | output_file=./playground/data/eval/gqa/answers/$SPLIT/$CKPT/merge.jsonl
29 |
30 | # Clear out the output file if it exists.
31 | > "$output_file"
32 |
33 | # Loop through the indices and concatenate each file.
34 | for IDX in $(seq 0 $((CHUNKS-1))); do
35 | cat ./playground/data/eval/gqa/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file"
36 | done
37 |
38 | python scripts/convert_gqa_for_eval.py --src $output_file --dst $GQADIR/testdev_balanced_predictions.json
39 |
40 | cd $GQADIR
41 | python eval/eval.py --tier testdev_balanced
42 |
--------------------------------------------------------------------------------
/scripts/v1_5/eval/infographics.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
4 | IFS=',' read -ra GPULIST <<< "$gpu_list"
5 |
6 | CHUNKS=${#GPULIST[@]}
7 |
8 | CKPT=$1
9 | echo $CKPT
10 | SPLIT="test"
11 |
12 | for IDX in $(seq 0 $((CHUNKS-1))); do
13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \
14 | --model-path ./checkpoints_new/$CKPT \
15 | --question-file ./playground/data/eval/InfographicsVQA/info_questions.jsonl \
16 | --image-folder ./playground/data/ureader/DUE_Benchmark/InfographicsVQA/pngs \
17 | --answers-file ./playground/data/eval/InfographicsVQA/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \
18 | --num-chunks $CHUNKS \
19 | --chunk-idx $IDX \
20 | --temperature 0 \
21 | --num_beams 1 \
22 | --conv-mode vicuna_v1 &
23 | done
24 |
25 | wait
26 |
27 | output_file=./playground/data/eval/InfographicsVQA/answers/$SPLIT/$CKPT/merge_slice.jsonl
28 |
29 | # Clear out the output file if it exists.
30 | > "$output_file"
31 |
32 | # Loop through the indices and concatenate each file.
33 | for IDX in $(seq 0 $((CHUNKS-1))); do
34 | cat ./playground/data/eval/InfographicsVQA/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file"
35 | done
36 |
37 | python -m llava.eval.eval_docvqa \
38 | --annotation-file ./playground/data/eval/InfographicsVQA/info_annotations.jsonl \
39 | --result-file $output_file \
40 | --mid_result ./playground/data/eval/InfographicsVQA/mid_results/$CKPT.jsonl \
41 | --output_result ./exp_results/$CKPT/info_result.jsonl
--------------------------------------------------------------------------------
/scripts/v1_5/eval/llavabench.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | python -m llava.eval.model_vqa \
4 | --model-path liuhaotian/llava-v1.5-13b \
5 | --question-file ./playground/data/eval/llava-bench-in-the-wild/questions.jsonl \
6 | --image-folder ./playground/data/eval/llava-bench-in-the-wild/images \
7 | --answers-file ./playground/data/eval/llava-bench-in-the-wild/answers/llava-v1.5-13b.jsonl \
8 | --temperature 0 \
9 | --conv-mode vicuna_v1
10 |
11 | mkdir -p playground/data/eval/llava-bench-in-the-wild/reviews
12 |
13 | python llava/eval/eval_gpt_review_bench.py \
14 | --question playground/data/eval/llava-bench-in-the-wild/questions.jsonl \
15 | --context playground/data/eval/llava-bench-in-the-wild/context.jsonl \
16 | --rule llava/eval/table/rule.json \
17 | --answer-list \
18 | playground/data/eval/llava-bench-in-the-wild/answers_gpt4.jsonl \
19 | playground/data/eval/llava-bench-in-the-wild/answers/llava-v1.5-13b.jsonl \
20 | --output \
21 | playground/data/eval/llava-bench-in-the-wild/reviews/llava-v1.5-13b.jsonl
22 |
23 | python llava/eval/summarize_gpt_review.py -f playground/data/eval/llava-bench-in-the-wild/reviews/llava-v1.5-13b.jsonl
24 |
--------------------------------------------------------------------------------
/scripts/v1_5/eval/mmbench.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # SPLIT="mmbench_dev_20230712"
4 |
5 | # python -m llava.eval.model_vqa_mmbench \
6 | # --model-path liuhaotian/llava-v1.5-13b \
7 | # --question-file ./playground/data/eval/mmbench/$SPLIT.tsv \
8 | # --answers-file ./playground/data/eval/mmbench/answers/$SPLIT/llava-v1.5-13b.jsonl \
9 | # --single-pred-prompt \
10 | # --temperature 0 \
11 | # --conv-mode vicuna_v1
12 |
13 |
14 |
15 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
16 | IFS=',' read -ra GPULIST <<< "$gpu_list"
17 |
18 | CHUNKS=${#GPULIST[@]}
19 |
20 | CKPT=$1
21 | echo $CKPT
22 | SPLIT="mmbench_dev_20230712"
23 |
24 | for IDX in $(seq 0 $((CHUNKS-1))); do
25 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_mmbench \
26 | --model-path ./checkpoints_new/$CKPT \
27 | --question-file ./playground/data/eval/mmbench/$SPLIT.tsv \
28 | --answers-file ./playground/data/eval/mmbench/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \
29 | --num-chunks $CHUNKS \
30 | --chunk-idx $IDX \
31 | --temperature 0 \
32 | --single-pred-prompt \
33 | --num_beams 1 \
34 | --conv-mode vicuna_v1 &
35 | done
36 |
37 | wait
38 |
39 | output_file=./playground/data/eval/mmbench/answers/$SPLIT/$CKPT/merge.jsonl
40 |
41 | # Clear out the output file if it exists.
42 | > "$output_file"
43 |
44 | # Loop through the indices and concatenate each file.
45 | for IDX in $(seq 0 $((CHUNKS-1))); do
46 | cat ./playground/data/eval/mmbench/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file"
47 | done
48 |
49 |
50 | mkdir -p playground/data/eval/mmbench/answers_upload/$SPLIT
51 |
52 | python scripts/convert_mmbench_for_submission.py \
53 | --annotation-file ./playground/data/eval/mmbench/$SPLIT.tsv \
54 | --result-dir ./playground/data/eval/mmbench/answers/$SPLIT/$CKPT \
55 | --upload-dir ./playground/data/eval/mmbench/answers_upload/$SPLIT \
56 | --experiment merge
57 |
--------------------------------------------------------------------------------
/scripts/v1_5/eval/mmbench_cn.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # SPLIT="mmbench_dev_cn_20231003"
4 |
5 |
6 | # python -m llava.eval.model_vqa_mmbench \
7 | # --model-path liuhaotian/llava-v1.5-13b \
8 | # --question-file ./playground/data/eval/mmbench_cn/$SPLIT.tsv \
9 | # --answers-file ./playground/data/eval/mmbench_cn/answers/$SPLIT/llava-v1.5-13b.jsonl \
10 | # --lang cn \
11 | # --single-pred-prompt \
12 | # --temperature 0 \
13 | # --conv-mode vicuna_v1
14 |
15 |
16 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
17 | IFS=',' read -ra GPULIST <<< "$gpu_list"
18 |
19 | CHUNKS=${#GPULIST[@]}
20 |
21 | CKPT=$1
22 | echo $CKPT
23 | SPLIT="mmbench_dev_cn_20231003"
24 |
25 | for IDX in $(seq 0 $((CHUNKS-1))); do
26 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_mmbench \
27 | --model-path ./checkpoints_new/$CKPT \
28 | --question-file ./playground/data/eval/mmbench/$SPLIT.tsv \
29 | --answers-file ./playground/data/eval/mmbench/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \
30 | --lang cn \
31 | --single-pred-prompt \
32 | --num-chunks $CHUNKS \
33 | --chunk-idx $IDX \
34 | --temperature 0.2 \
35 | --num_beams 1 \
36 | --conv-mode vicuna_v1 &
37 | done
38 |
39 | wait
40 |
41 | output_file=./playground/data/eval/mmbench/answers/$SPLIT/$CKPT/merge.jsonl
42 |
43 | # Clear out the output file if it exists.
44 | > "$output_file"
45 |
46 | # Loop through the indices and concatenate each file.
47 | for IDX in $(seq 0 $((CHUNKS-1))); do
48 | cat ./playground/data/eval/mmbench/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file"
49 | done
50 |
51 | mkdir -p playground/data/eval/mmbench/answers_upload/$SPLIT
52 |
53 | python scripts/convert_mmbench_for_submission.py \
54 | --annotation-file ./playground/data/eval/mmbench/$SPLIT.tsv \
55 | --result-dir ./playground/data/eval/mmbench/answers/$SPLIT/$CKPT \
56 | --upload-dir ./playground/data/eval/mmbench/answers_upload/$SPLIT \
57 | --experiment merge
58 |
59 |
60 |
61 |
62 |
--------------------------------------------------------------------------------
/scripts/v1_5/eval/mme.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
4 | IFS=',' read -ra GPULIST <<< "$gpu_list"
5 |
6 | CHUNKS=${#GPULIST[@]}
7 |
8 | CKPT=$1
9 | echo $CKPT
10 | SPLIT="llava_mme"
11 |
12 |
13 | for IDX in $(seq 0 $((CHUNKS-1))); do
14 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \
15 | --model-path ./checkpoints_new/$CKPT \
16 | --question-file ./playground/data/eval/MME/$SPLIT.jsonl \
17 | --image-folder ./playground/data/eval/MME/MME_Benchmark_release_version \
18 | --answers-file ./playground/data/eval/MME/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \
19 | --num-chunks $CHUNKS \
20 | --chunk-idx $IDX \
21 | --temperature 0 \
22 | --num_beams 5 \
23 | --conv-mode qwen_1_5 &
24 | done
25 |
26 | wait
27 |
28 | output_file=./playground/data/eval/MME/answers/$SPLIT/$CKPT/merge.jsonl
29 |
30 | # Clear out the output file if it exists.
31 | > "$output_file"
32 |
33 | # Loop through the indices and concatenate each file.
34 | for IDX in $(seq 0 $((CHUNKS-1))); do
35 | cat ./playground/data/eval/MME/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file"
36 | done
37 |
38 |
39 | cp $output_file ./playground/data/eval/MME/answers/$CKPT.jsonl
40 |
41 | cd ./playground/data/eval/MME
42 |
43 | python convert_answer_to_mme.py --experiment $CKPT
44 |
45 | cd eval_tool
46 |
47 | python calculation.py --results_dir answers/$CKPT
48 |
49 |
50 |
51 |
52 | # python -m llava.eval.model_vqa_loader \
53 | # --model-path ./checkpoints_new/$CKPT \
54 | # --question-file ./playground/data/eval/MME/$SPLIT.jsonl \
55 | # --image-folder ./playground/data/eval/MME/MME_Benchmark_release_version \
56 | # --answers-file ./playground/data/eval/MME/answers/$CKPT.jsonl \
57 | # --temperature 0 \
58 | # --num_beams 3 \
59 | # --conv-mode vicuna_v1
60 |
--------------------------------------------------------------------------------
/scripts/v1_5/eval/mmvet.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 |
4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
5 | IFS=',' read -ra GPULIST <<< "$gpu_list"
6 |
7 | CHUNKS=${#GPULIST[@]}
8 |
9 | CKPT=$1
10 | echo $CKPT
11 | SPLIT="llava-mm-vet"
12 |
13 | # python -m llava.eval.model_vqa \
14 | # --model-path liuhaotian/llava-v1.5-13b \
15 | # --question-file ./playground/data/eval/mm-vet/llava-mm-vet.jsonl \
16 | # --image-folder ./playground/data/eval/mm-vet/images \
17 | # --answers-file ./playground/data/eval/mm-vet/answers/llava-v1.5-13b.jsonl \
18 | # --temperature 0 \
19 | # --conv-mode vicuna_v1
20 |
21 | for IDX in $(seq 0 $((CHUNKS-1))); do
22 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \
23 | --model-path ./checkpoints_new/$CKPT \
24 | --question-file ./playground/data/eval/mm-vet/$SPLIT.jsonl \
25 | --image-folder ./playground/data/eval/mm-vet/images \
26 | --answers-file ./playground/data/eval/mm-vet/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \
27 | --num-chunks $CHUNKS \
28 | --chunk-idx $IDX \
29 | --temperature 0 \
30 | --num_beams 5 \
31 | --conv-mode vicuna_v1 &
32 | done
33 |
34 | wait
35 |
36 | output_file=./playground/data/eval/mm-vet/answers/$SPLIT/$CKPT/merge.jsonl
37 |
38 | # Clear out the output file if it exists.
39 | > "$output_file"
40 |
41 | # Loop through the indices and concatenate each file.
42 | for IDX in $(seq 0 $((CHUNKS-1))); do
43 | cat ./playground/data/eval/mm-vet/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file"
44 | done
45 |
46 |
47 | mkdir -p ./playground/data/eval/mm-vet/results
48 |
49 | python scripts/convert_mmvet_for_eval.py \
50 | --src $output_file \
51 | --dst ./playground/data/eval/mm-vet/results/$CKPT.json
52 |
53 |
--------------------------------------------------------------------------------
/scripts/v1_5/eval/pope.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
4 | IFS=',' read -ra GPULIST <<< "$gpu_list"
5 |
6 | CHUNKS=${#GPULIST[@]}
7 |
8 | CKPT=$1
9 | echo $CKPT
10 | SPLIT="llava_pope_test_my"
11 |
12 | for IDX in $(seq 0 $((CHUNKS-1))); do
13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \
14 | --model-path ./checkpoints_new/$CKPT \
15 | --question-file ./playground/data/eval/pope/$SPLIT.jsonl \
16 | --image-folder ./playground/data/eval/pope/val2014 \
17 | --answers-file ./playground/data/eval/pope/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \
18 | --num-chunks $CHUNKS \
19 | --chunk-idx $IDX \
20 | --temperature 0 \
21 | --num_beams 1 \
22 | --conv-mode vicuna_v1 &
23 | done
24 |
25 | wait
26 |
27 | output_file=./playground/data/eval/pope/answers/$SPLIT/$CKPT/merge_slice.jsonl
28 |
29 | # Clear out the output file if it exists.
30 | > "$output_file"
31 |
32 | # Loop through the indices and concatenate each file.
33 | for IDX in $(seq 0 $((CHUNKS-1))); do
34 | cat ./playground/data/eval/pope/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file"
35 | done
36 |
37 | python llava/eval/eval_pope.py \
38 | --annotation-dir ./playground/data/eval/pope/coco \
39 | --question-file ./playground/data/eval/pope/$SPLIT.jsonl \
40 | --result-file ./playground/data/eval/pope/answers/$SPLIT/$CKPT/merge_slice.jsonl
41 |
42 |
43 | # CKPT="llava-v1.5-adapt"
44 |
45 | # python -m llava.eval.model_vqa_loader \
46 | # --model-path ./checkpoints_new/$CKPT \
47 | # --question-file ./playground/data/eval/pope/llava_pope_test.jsonl \
48 | # --image-folder ./playground/data/eval/pope/val2014 \
49 | # --answers-file ./playground/data/eval/pope/answers/$CKPT.jsonl \
50 | # --temperature 0 \
51 | # --conv-mode vicuna_v1
52 |
53 | # python llava/eval/eval_pope.py \
54 | # --annotation-dir ./playground/data/eval/pope/coco \
55 | # --question-file ./playground/data/eval/pope/llava_pope_test.jsonl \
56 | # --result-file ./playground/data/eval/pope/answers/$CKPT.jsonl
57 |
--------------------------------------------------------------------------------
/scripts/v1_5/eval/qbench.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | if [ "$1" = "dev" ]; then
4 | echo "Evaluating in 'dev' split."
5 | elif [ "$1" = "test" ]; then
6 | echo "Evaluating in 'test' split."
7 | else
8 | echo "Unknown split, please choose between 'dev' and 'test'."
9 | exit 1
10 | fi
11 |
12 | python -m llava.eval.model_vqa_qbench \
13 | --model-path liuhaotian/llava-v1.5-13b \
14 | --image-folder ./playground/data/eval/qbench/images_llvisionqa/ \
15 | --questions-file ./playground/data/eval/qbench/llvisionqa_$1.json \
16 | --answers-file ./playground/data/eval/qbench/llvisionqa_$1_answers.jsonl \
17 | --conv-mode llava_v1 \
18 | --lang en
19 |
--------------------------------------------------------------------------------
/scripts/v1_5/eval/qbench_zh.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | if [ "$1" = "dev" ]; then
4 | ZH_SPLIT="验证集"
5 | echo "Evaluating in 'dev' split."
6 | elif [ "$1" = "test" ]; then
7 | ZH_SPLIT="测试集"
8 | echo "Evaluating in 'test' split."
9 | else
10 | echo "Unknown split, please choose between 'dev' and 'test'."
11 | exit 1
12 | fi
13 |
14 | python -m llava.eval.model_vqa_qbench \
15 | --model-path liuhaotian/llava-v1.5-13b \
16 | --image-folder ./playground/data/eval/qbench/images_llvisionqa/ \
17 | --questions-file ./playground/data/eval/qbench/质衡-问答-$ZH_SPLIT.json \
18 | --answers-file ./playground/data/eval/qbench/llvisionqa_zh_$1_answers.jsonl \
19 | --conv-mode llava_v1 \
20 | --lang zh
21 |
--------------------------------------------------------------------------------
/scripts/v1_5/eval/rec.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
4 | IFS=',' read -ra GPULIST <<< "$gpu_list"
5 |
6 | CHUNKS=${#GPULIST[@]}
7 |
8 | CKPT=$1
9 | echo $CKPT
10 | SPLIT="llava_ref3_test_2017"
11 |
12 |
13 | for IDX in $(seq 0 $((CHUNKS-1))); do
14 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \
15 | --model-path ./checkpoints_new/$CKPT \
16 | --question-file ./playground/data/eval/rec/$SPLIT.jsonl \
17 | --image-folder ./playground/data/coco/train2017 \
18 | --answers-file ./playground/data/eval/rec/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \
19 | --num-chunks $CHUNKS \
20 | --chunk-idx $IDX \
21 | --temperature 0 \
22 | --num_beams 3 \
23 | --conv-mode vicuna_v1 &
24 | done
25 |
26 | wait
27 |
28 | output_file=./playground/data/eval/rec/answers/$SPLIT/$CKPT/merge.jsonl
29 |
30 | # Clear out the output file if it exists.
31 | > "$output_file"
32 |
33 | # Loop through the indices and concatenate each file.
34 | for IDX in $(seq 0 $((CHUNKS-1))); do
35 | cat ./playground/data/eval/rec/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file"
36 | done
37 |
38 | python -m llava.eval.eval_rec \
39 | --annotation-file ./playground/data/eval/rec/llava_ref3_labels.jsonl \
40 | --question-file ./playground/data/eval/rec/$SPLIT.jsonl \
41 | --result-file ./playground/data/eval/rec/answers/$SPLIT/$CKPT/merge.jsonl
--------------------------------------------------------------------------------
/scripts/v1_5/eval/textvqa.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 |
4 | # # multiple evaluation
5 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
6 | IFS=',' read -ra GPULIST <<< "$gpu_list"
7 |
8 | CHUNKS=${#GPULIST[@]}
9 |
10 | CKPT=$1
11 | echo $CKPT
12 | SPLIT="llava_textvqa_val_v051_ocr"
13 |
14 | for IDX in $(seq 0 $((CHUNKS-1))); do
15 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \
16 | --model-path ./checkpoints_new/$CKPT \
17 | --question-file ./playground/data/eval/textvqa/$SPLIT.jsonl \
18 | --image-folder ./playground/data/eval/textvqa/train_images \
19 | --answers-file ./playground/data/eval/textvqa/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \
20 | --num-chunks $CHUNKS \
21 | --chunk-idx $IDX \
22 | --temperature 0 \
23 | --num_beams 3 \
24 | --conv-mode qwen_1_5 &
25 | done
26 |
27 | wait
28 |
29 | output_file=./playground/data/eval/textvqa/answers/$SPLIT/$CKPT/merge_slice.jsonl
30 |
31 | # Clear out the output file if it exists.
32 | > "$output_file"
33 |
34 | # Loop through the indices and concatenate each file.
35 | for IDX in $(seq 0 $((CHUNKS-1))); do
36 | cat ./playground/data/eval/textvqa/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file"
37 | done
38 |
39 | python -m llava.eval.eval_textvqa \
40 | --annotation-file ./playground/data/eval/textvqa/TextVQA_0.5.1_val.json \
41 | --result-file ./playground/data/eval/textvqa/answers/$SPLIT/$CKPT/merge_slice.jsonl
42 |
43 | # python -m llava.eval.model_vqa_loader \
44 | # --model-path liuhaotian/llava-v1.5-13b \
45 | # --question-file ./playground/data/eval/textvqa/llava_textvqa_val_v051_ocr.jsonl \
46 | # --image-folder ./playground/data/eval/textvqa/train_images \
47 | # --answers-file ./playground/data/eval/textvqa/answers/llava-v1.5-13b.jsonl \
48 | # --temperature 0 \
49 | # --conv-mode vicuna_v1
50 |
51 | # python -m llava.eval.eval_textvqa \
52 | # --annotation-file ./playground/data/eval/textvqa/TextVQA_0.5.1_val.json \
53 | # --result-file ./playground/data/eval/textvqa/answers/llava-v1.5-13b.jsonl
54 |
--------------------------------------------------------------------------------
/scripts/v1_5/eval/vizwiz.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 |
4 | # python -m llava.eval.model_vqa_loader \
5 | # --model-path liuhaotian/llava-v1.5-13b \
6 | # --question-file ./playground/data/eval/vizwiz/llava_test.jsonl \
7 | # --image-folder ./playground/data/eval/vizwiz/test \
8 | # --answers-file ./playground/data/eval/vizwiz/answers/llava-v1.5-13b.jsonl \
9 | # --temperature 0 \
10 | # --conv-mode vicuna_v1
11 |
12 | # # multiple evaluation
13 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
14 | IFS=',' read -ra GPULIST <<< "$gpu_list"
15 |
16 | CHUNKS=${#GPULIST[@]}
17 |
18 | CKPT=$1
19 | echo $CKPT
20 | SPLIT="llava_test"
21 |
22 | for IDX in $(seq 0 $((CHUNKS-1))); do
23 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \
24 | --model-path ./checkpoints_new/$CKPT \
25 | --question-file ./playground/data/eval/vizwiz/$SPLIT.jsonl \
26 | --image-folder ./playground/data/eval/vizwiz/test \
27 | --answers-file ./playground/data/eval/vizwiz/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \
28 | --num-chunks $CHUNKS \
29 | --chunk-idx $IDX \
30 | --temperature 0 \
31 | --num_beams 3 \
32 | --conv-mode vicuna_v1 &
33 | done
34 |
35 | wait
36 |
37 | output_file=./playground/data/eval/vizwiz/answers/$SPLIT/$CKPT/merge_slice.jsonl
38 |
39 | # Clear out the output file if it exists.
40 | > "$output_file"
41 |
42 | # Loop through the indices and concatenate each file.
43 | for IDX in $(seq 0 $((CHUNKS-1))); do
44 | cat ./playground/data/eval/vizwiz/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file"
45 | done
46 |
47 |
48 | python scripts/convert_vizwiz_for_submission.py \
49 | --annotation-file ./playground/data/eval/vizwiz/$SPLIT.jsonl \
50 | --result-file $output_file \
51 | --result-upload-file ./playground/data/eval/vizwiz/answers_upload/$CKPT.json
52 |
--------------------------------------------------------------------------------
/scripts/v1_5/eval/vqav2.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}"
4 | IFS=',' read -ra GPULIST <<< "$gpu_list"
5 |
6 | CHUNKS=${#GPULIST[@]}
7 |
8 | CKPT=$1
9 | echo $CKPT
10 | SPLIT="llava_vqav2_mscoco_test-dev2015"
11 |
12 | for IDX in $(seq 0 $((CHUNKS-1))); do
13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \
14 | --model-path ./checkpoints_new/$CKPT \
15 | --question-file ./playground/data/eval/vqav2/$SPLIT.jsonl \
16 | --image-folder ./playground/data/eval/vqav2/test2015 \
17 | --answers-file ./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \
18 | --num-chunks $CHUNKS \
19 | --chunk-idx $IDX \
20 | --temperature 0 \
21 | --num_beams 3 \
22 | --conv-mode vicuna_v1 &
23 | done
24 |
25 | wait
26 |
27 | output_file=./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/merge.jsonl
28 |
29 | # Clear out the output file if it exists.
30 | > "$output_file"
31 |
32 | # Loop through the indices and concatenate each file.
33 | for IDX in $(seq 0 $((CHUNKS-1))); do
34 | cat ./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file"
35 | done
36 |
37 | python scripts/convert_vqav2_for_submission.py --split $SPLIT --ckpt $CKPT
38 |
39 |
--------------------------------------------------------------------------------
/scripts/video/demo/video_demo.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | ROOT_DIR="/mnt/bn/vl-research/workspace/yhzhang/LLaVA-NeXT"
3 |
4 | if [ ! -e $ROOT_DIR ]; then
5 | echo "The root dir does not exist. Exiting the script."
6 | exit 1
7 | fi
8 |
9 | cd $ROOT_DIR
10 |
11 | export PYTHONWARNINGS=ignore
12 | export TOKENIZERS_PARALLELISM=false
13 |
14 | CKPT=$1
15 | CONV_MODE=$2
16 | FRAMES=$3
17 | POOL_STRIDE=$4
18 | POOL_MODE=$5
19 | NEWLINE_POSITION=$6
20 | OVERWRITE=$7
21 | VIDEO_PATH=$8
22 |
23 |
24 | if [ "$OVERWRITE" = False ]; then
25 | SAVE_DIR=$(basename $CKPT)_${CONV_MODE}_frames_${FRAMES}_stride_${POOL_STRIDE}_overwrite_${OVERWRITE}
26 |
27 | else
28 | SAVE_DIR=$(basename $CKPT)_${CONV_MODE}_frames_${FRAMES}_stride_${POOL_STRIDE}
29 | fi
30 |
31 | python3 playground/demo/video_demo.py \
32 | --model-path $CKPT \
33 | --video_path ${VIDEO_PATH} \
34 | --output_dir ./work_dirs/video_demo/$SAVE_DIR \
35 | --output_name pred \
36 | --chunk-idx $(($IDX - 1)) \
37 | --overwrite ${OVERWRITE} \
38 | --mm_spatial_pool_stride ${POOL_STRIDE:-4} \
39 | --for_get_frames_num $FRAMES \
40 | --conv-mode $CONV_MODE \
41 | --mm_spatial_pool_mode ${POOL_MODE:-average} \
42 | --mm_newline_position ${NEWLINE_POSITION:-grid} \
43 | --prompt "Please provide a detailed description of the video, focusing on the main subjects, their actions, the background scenes."
--------------------------------------------------------------------------------
/scripts/video/eval/video_detail_description_eval_only.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | ROOT_DIR="/path/to/LLaVA-NeXT-Video"  # set this to the repository root
3 |
4 | if [ ! -e $ROOT_DIR ]; then
5 | echo "The root dir does not exist. Exiting the script."
6 | exit 1
7 | fi
8 |
9 | cd $ROOT_DIR
10 |
11 | export PYTHONWARNINGS=ignore
12 | export TOKENIZERS_PARALLELISM=false
13 |
14 | OPENAIKEY="INPUT YOUR OPENAI API KEY"
15 |
16 | SAVE_DIR=$1
17 |
18 | python3 llava/eval/evaluate_benchmark_video_detail_description.py \
19 | --pred_path ./work_dirs/eval_video_detail_description/$SAVE_DIR/pred.json \
20 | --output_dir ./work_dirs/eval_video_detail_description/$SAVE_DIR/detail_results \
21 | --output_json ./work_dirs/eval_video_detail_description/$SAVE_DIR/detail_results.json \
22 | --num_chunks 1 \
23 | --num_tasks 16 \
24 | --api_key $OPENAIKEY \
--------------------------------------------------------------------------------
/scripts/zero2.json:
--------------------------------------------------------------------------------
1 | {
2 | "fp16": {
3 | "enabled": "auto",
4 | "loss_scale": 0,
5 | "loss_scale_window": 1000,
6 | "initial_scale_power": 16,
7 | "hysteresis": 2,
8 | "min_loss_scale": 1
9 | },
10 | "bf16": {
11 | "enabled": "auto"
12 | },
13 | "optimizer": {
14 | "type": "AdamW",
15 | "params": {
16 | "lr": "auto",
17 | "betas": "auto",
18 | "eps": "auto",
19 | "weight_decay": "auto"
20 | }
21 | },
22 | "zero_optimization": {
23 | "stage": 2,
24 | "offload_optimizer": {
25 | "device": "none",
26 | "pin_memory": true
27 | },
28 | "allgather_partitions": true,
29 | "allgather_bucket_size": 2e8,
30 | "overlap_comm": false,
31 | "reduce_scatter": true,
32 | "reduce_bucket_size": 2e8,
33 | "contiguous_gradients": true
34 | },
35 | "gradient_accumulation_steps": "auto",
36 | "gradient_clipping": "auto",
37 | "steps_per_print": 100,
38 | "train_batch_size": "auto",
39 | "train_micro_batch_size_per_gpu": "auto",
40 | "wall_clock_breakdown": false
41 | }
--------------------------------------------------------------------------------
/scripts/zero2_fused_adamw.json:
--------------------------------------------------------------------------------
1 | {
2 | "fp16": {
3 | "enabled": "auto",
4 | "loss_scale": 0,
5 | "loss_scale_window": 1000,
6 | "initial_scale_power": 16,
7 | "hysteresis": 2,
8 | "min_loss_scale": 1
9 | },
10 | "bf16": {
11 | "enabled": "auto"
12 | },
13 | "optimizer": {
14 | "type": "AdamW",
15 | "params": {
16 | "lr": "auto",
17 | "betas": "auto",
18 | "eps": "auto",
19 | "weight_decay": "auto"
20 | }
21 | },
22 | "zero_optimization": {
23 | "stage": 2,
24 | "offload_optimizer": {
25 | "device": "none",
26 | "pin_memory": true
27 | },
28 | "allgather_partitions": true,
29 | "allgather_bucket_size": 2e8,
30 | "overlap_comm": true,
31 | "reduce_scatter": true,
32 | "reduce_bucket_size": 2e8,
33 | "contiguous_gradients": true
34 | },
35 | "gradient_accumulation_steps": "auto",
36 | "gradient_clipping": "auto",
37 | "steps_per_print": 100,
38 | "train_batch_size": "auto",
39 | "train_micro_batch_size_per_gpu": "auto",
40 | "wall_clock_breakdown": false
41 | }
--------------------------------------------------------------------------------
/scripts/zero2_offload.json:
--------------------------------------------------------------------------------
1 | {
2 | "fp16": {
3 | "enabled": "auto",
4 | "loss_scale": 0,
5 | "loss_scale_window": 1000,
6 | "initial_scale_power": 16,
7 | "hysteresis": 2,
8 | "min_loss_scale": 1
9 | },
10 | "bf16": {
11 | "enabled": "auto"
12 | },
13 | "train_micro_batch_size_per_gpu": "auto",
14 | "train_batch_size": "auto",
15 | "gradient_accumulation_steps": "auto",
16 | "zero_optimization": {
17 | "stage": 2,
18 | "offload_optimizer": {
19 | "device": "cpu",
20 | "pin_memory": true
21 | },
22 | "offload_param": {
23 | "device": "cpu",
24 | "pin_memory": true
25 | },
26 | "overlap_comm": true,
27 | "contiguous_gradients": true,
28 | "sub_group_size": 1e9,
29 | "reduce_bucket_size": "auto"
30 | }
31 | }
--------------------------------------------------------------------------------
/scripts/zero2_old.json:
--------------------------------------------------------------------------------
1 | {
2 | "fp16": {
3 | "enabled": "auto",
4 | "loss_scale": 0,
5 | "loss_scale_window": 1000,
6 | "initial_scale_power": 16,
7 | "hysteresis": 2,
8 | "min_loss_scale": 1
9 | },
10 | "bf16": {
11 | "enabled": "auto"
12 | },
13 | "train_micro_batch_size_per_gpu": "auto",
14 | "train_batch_size": "auto",
15 | "gradient_accumulation_steps": "auto",
16 | "zero_optimization": {
17 | "stage": 2,
18 | "overlap_comm": false,
19 | "contiguous_gradients": true,
20 | "sub_group_size": 1e9,
21 | "reduce_bucket_size": "auto"
22 | }
23 | }
--------------------------------------------------------------------------------
/scripts/zero3.json:
--------------------------------------------------------------------------------
1 | {
2 | "fp16": {
3 | "enabled": "auto",
4 | "loss_scale": 0,
5 | "loss_scale_window": 1000,
6 | "initial_scale_power": 16,
7 | "hysteresis": 2,
8 | "min_loss_scale": 1
9 | },
10 | "bf16": {
11 | "enabled": "auto"
12 | },
13 |
14 | "zero_optimization": {
15 | "stage": 3,
16 | "offload_optimizer": {
17 | "device": "none",
18 | "pin_memory": true
19 | },
20 | "offload_param": {
21 | "device": "none",
22 | "pin_memory": true
23 | },
24 | "overlap_comm": true,
25 | "contiguous_gradients": true,
26 | "sub_group_size": 1e9,
27 | "reduce_bucket_size": "auto",
28 | "stage3_prefetch_bucket_size": "auto",
29 | "stage3_param_persistence_threshold": "auto",
30 | "stage3_max_live_parameters": 1e9,
31 | "stage3_max_reuse_distance": 1e9,
32 | "stage3_gather_16bit_weights_on_model_save": true
33 | },
34 |
35 | "gradient_accumulation_steps": "auto",
36 | "gradient_clipping": "auto",
37 | "steps_per_print": 100,
38 | "train_batch_size": "auto",
39 | "train_micro_batch_size_per_gpu": "auto",
40 | "wall_clock_breakdown": false
41 | }
--------------------------------------------------------------------------------
/scripts/zero3_offload.json:
--------------------------------------------------------------------------------
1 | {
2 | "fp16": {
3 | "enabled": "auto",
4 | "loss_scale": 0,
5 | "loss_scale_window": 1000,
6 | "initial_scale_power": 16,
7 | "hysteresis": 2,
8 | "min_loss_scale": 1
9 | },
10 | "bf16": {
11 | "enabled": "auto"
12 | },
13 | "optimizer": {
14 | "type": "AdamW",
15 | "params": {
16 | "lr": "auto",
17 | "betas": "auto",
18 | "eps": "auto",
19 | "weight_decay": "auto"
20 | }
21 | },
22 | "zero_optimization": {
23 | "stage": 3,
24 | "offload_optimizer": {
25 | "device": "cpu",
26 | "pin_memory": true
27 | },
28 | "offload_param": {
29 | "device": "cpu",
30 | "pin_memory": true
31 | },
32 | "overlap_comm": true,
33 | "contiguous_gradients": true,
34 | "sub_group_size": 1e9,
35 | "reduce_bucket_size": "auto",
36 | "stage3_prefetch_bucket_size": "auto",
37 | "stage3_param_persistence_threshold": "auto",
38 | "stage3_max_live_parameters": 1e9,
39 | "stage3_max_reuse_distance": 1e9,
40 | "gather_16bit_weights_on_model_save": true
41 | },
42 | "gradient_accumulation_steps": "auto",
43 | "gradient_clipping": "auto",
44 | "train_batch_size": "auto",
45 | "train_micro_batch_size_per_gpu": "auto",
46 | "steps_per_print": 1e5,
47 | "wall_clock_breakdown": false
48 | }
--------------------------------------------------------------------------------
/scripts/zero3pp.json:
--------------------------------------------------------------------------------
1 | {
2 | "fp16": {
3 | "enabled": "auto",
4 | "loss_scale": 0,
5 | "loss_scale_window": 1000,
6 | "initial_scale_power": 16,
7 | "hysteresis": 2,
8 | "min_loss_scale": 1
9 | },
10 | "bf16": {
11 | "enabled": "auto"
12 | },
13 | "optimizer": {
14 | "type": "AdamW",
15 | "params": {
16 | "lr": "auto",
17 | "betas": "auto",
18 | "eps": "auto",
19 | "weight_decay": "auto"
20 | }
21 | },
22 |
23 | "zero_optimization": {
24 | "stage": 3,
25 | "offload_optimizer": {
26 | "device": "none",
27 | "pin_memory": true
28 | },
29 | "offload_param": {
30 | "device": "none",
31 | "pin_memory": true
32 | },
33 | "overlap_comm": true,
34 | "contiguous_gradients": true,
35 | "zero_quantized_weights": true,
36 | "zero_hpz_partition_size": 16,
37 | "zero_quantized_gradients": true,
38 | "sub_group_size": 1e9,
39 | "reduce_bucket_size": "auto",
40 | "stage3_prefetch_bucket_size": "auto",
41 | "stage3_param_persistence_threshold": "auto",
42 | "stage3_max_live_parameters": 1e9,
43 | "stage3_max_reuse_distance": 1e9,
44 | "stage3_gather_16bit_weights_on_model_save": true
45 | },
46 |
47 | "gradient_accumulation_steps": "auto",
48 | "gradient_clipping": "auto",
49 | "steps_per_print": 100,
50 | "train_batch_size": "auto",
51 | "train_micro_batch_size_per_gpu": "auto",
52 | "wall_clock_breakdown": false
53 | }
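
The zero2*/zero3* JSON files above are DeepSpeed configurations; their "auto" entries are resolved by the Hugging Face Trainer at launch time. As a rough illustration only (the output directory and config path below are placeholders, not files defined by this repository), such a config is typically passed in through TrainingArguments:

# Illustrative sketch, not a file from this repository: consuming one of the
# DeepSpeed JSON configs above via transformers' Trainer arguments.
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./checkpoints/example",   # placeholder
    deepspeed="scripts/zero3.json",       # "auto" fields are filled by the Trainer
    bf16=True,
)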
--------------------------------------------------------------------------------
/trl/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 |
3 | __version__ = "0.7.11.dev0"
4 |
5 | from .core import set_seed
6 | from .environment import TextEnvironment, TextHistory
7 | from .extras import BestOfNSampler
8 | from .import_utils import (
9 | is_bitsandbytes_available,
10 | is_diffusers_available,
11 | is_npu_available,
12 | is_peft_available,
13 | is_wandb_available,
14 | is_xpu_available,
15 | )
16 | from .models import (
17 | AutoModelForCausalLMWithValueHead,
18 | AutoModelForSeq2SeqLMWithValueHead,
19 | PreTrainedModelWrapper,
20 | create_reference_model,
21 | setup_chat_format,
22 | )
23 | from .trainer import (
24 | DataCollatorForCompletionOnlyLM,
25 | DPOTrainer,
26 | IterativeSFTTrainer,
27 | ModelConfig,
28 | PPOConfig,
29 | PPOTrainer,
30 | RewardConfig,
31 | RewardTrainer,
32 | SFTTrainer,
33 | )
34 | from .trainer.utils import get_kbit_device_map, get_peft_config, get_quantization_config
35 |
36 |
37 | if is_diffusers_available():
38 | from .models import (
39 | DDPOPipelineOutput,
40 | DDPOSchedulerOutput,
41 | DDPOStableDiffusionPipeline,
42 | DefaultDDPOStableDiffusionPipeline,
43 | )
44 | from .trainer import DDPOConfig, DDPOTrainer
45 |
--------------------------------------------------------------------------------
/trl/environment/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 |
3 | from .base_environment import TextEnvironment, TextHistory
4 |
--------------------------------------------------------------------------------
/trl/extras/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 |
3 | # Copyright 2022 The HuggingFace Team. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | from .best_of_n_sampler import BestOfNSampler
17 |
--------------------------------------------------------------------------------
/trl/models/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 |
3 | # Copyright 2022 The HuggingFace Team. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 | from .modeling_base import PreTrainedModelWrapper, create_reference_model
17 | from .modeling_value_head import AutoModelForCausalLMWithValueHead, AutoModelForSeq2SeqLMWithValueHead
18 | from .utils import setup_chat_format
19 |
20 |
21 | SUPPORTED_ARCHITECTURES = (
22 | AutoModelForCausalLMWithValueHead,
23 | AutoModelForSeq2SeqLMWithValueHead,
24 | )
25 |
26 | from ..import_utils import is_diffusers_available
27 |
28 |
29 | if is_diffusers_available():
30 | from .modeling_sd_base import (
31 | DDPOPipelineOutput,
32 | DDPOSchedulerOutput,
33 | DDPOStableDiffusionPipeline,
34 | DefaultDDPOStableDiffusionPipeline,
35 | )
36 |
--------------------------------------------------------------------------------
/trl/trainer/__init__.py:
--------------------------------------------------------------------------------
1 | # flake8: noqa
2 |
3 | # Copyright 2022 The HuggingFace Team. All rights reserved.
4 | #
5 | # Licensed under the Apache License, Version 2.0 (the "License");
6 | # you may not use this file except in compliance with the License.
7 | # You may obtain a copy of the License at
8 | #
9 | # http://www.apache.org/licenses/LICENSE-2.0
10 | #
11 | # Unless required by applicable law or agreed to in writing, software
12 | # distributed under the License is distributed on an "AS IS" BASIS,
13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 | # See the License for the specific language governing permissions and
15 | # limitations under the License.
16 |
17 | # There is a circular import in the PPOTrainer if we let isort sort these
18 | # isort: off
19 | from .utils import (
20 | AdaptiveKLController,
21 | FixedKLController,
22 | ConstantLengthDataset,
23 | DataCollatorForCompletionOnlyLM,
24 | RunningMoments,
25 | disable_dropout_in_model,
26 | peft_module_casting_to_bf16,
27 | )
28 |
29 | # isort: on
30 |
31 | from ..import_utils import is_diffusers_available
32 | from .base import BaseTrainer
33 | from .ddpo_config import DDPOConfig
34 |
35 |
36 | if is_diffusers_available():
37 | from .ddpo_trainer import DDPOTrainer
38 |
39 | from .dpo_trainer import DPOTrainer
40 | from .iterative_sft_trainer import IterativeSFTTrainer
41 | from .model_config import ModelConfig
42 | from .ppo_config import PPOConfig
43 | from .ppo_trainer import PPOTrainer
44 | from .reward_config import RewardConfig
45 | from .reward_trainer import RewardTrainer, compute_accuracy
46 | from .sft_trainer import SFTTrainer
47 |
--------------------------------------------------------------------------------
/trl/trainer/base.py:
--------------------------------------------------------------------------------
1 | # Copyright 2022 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from huggingface_hub import PyTorchModelHubMixin
16 |
17 |
18 | class BaseTrainer(PyTorchModelHubMixin):
19 | r"""
20 | Base class for all trainers - this base class implements the basic functions that we
21 | need for a trainer.
22 |
23 | The trainer needs to have the following functions:
24 | - step: takes in a batch of data and performs a step of training
25 | - loss: takes in a batch of data and returns the loss
26 | - compute_rewards: takes in a batch of data and returns the rewards
27 | - _build_models_and_tokenizer: builds the models and tokenizer
28 | - _build_dataset: builds the dataset
29 | Each user is expected to implement their own trainer class that inherits from this base
30 | if they want to use a new training algorithm.
31 | """
32 |
33 | def __init__(self, config):
34 | self.config = config
35 |
36 | def step(self, *args):
37 | raise NotImplementedError("Not implemented")
38 |
39 | def loss(self, *args):
40 | raise NotImplementedError("Not implemented")
41 |
42 | def compute_rewards(self, *args):
43 | raise NotImplementedError("Not implemented")
44 |
45 | def _save_pretrained(self, save_directory):
46 | raise NotImplementedError("Not implemented")
47 |
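
For orientation only (this sketch is not part of the repository), a subclass following the contract described in the docstring above might look like:

# Hypothetical example: a trivial trainer that fills in the methods
# BaseTrainer leaves unimplemented.
from trl.trainer.base import BaseTrainer


class ConstantRewardTrainer(BaseTrainer):
    def step(self, batch):
        # Perform one (no-op) training step on a batch of data.
        return {"loss": 0.0}

    def loss(self, batch):
        # Return a dummy scalar loss for the batch.
        return 0.0

    def compute_rewards(self, batch):
        # Give every sample in the batch the same constant reward.
        return [1.0 for _ in batch]

    def _save_pretrained(self, save_directory):
        # A real trainer would serialize model weights here.
        pass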
--------------------------------------------------------------------------------
/trl/trainer/reward_config.py:
--------------------------------------------------------------------------------
1 | # Copyright 2024 The HuggingFace Team. All rights reserved.
2 | #
3 | # Licensed under the Apache License, Version 2.0 (the "License");
4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at
6 | #
7 | # http://www.apache.org/licenses/LICENSE-2.0
8 | #
9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 |
15 | from dataclasses import dataclass
16 | from typing import Optional
17 |
18 | from transformers import TrainingArguments
19 |
20 |
21 | @dataclass
22 | class RewardConfig(TrainingArguments):
23 | """
24 | RewardConfig collects all training arguments related to the [`RewardTrainer`] class.
25 |
26 | Using [`HfArgumentParser`] we can turn this class into
27 | [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the
28 | command line.
29 |
30 | Parameters:
31 | max_length (`int`, *optional*, defaults to `None`):
32 | The maximum length of the sequences in the batch. This argument is required if you want to use the default data collator.
33 | gradient_checkpointing (`bool`, *optional*, defaults to `True`):
34 | If True, use gradient checkpointing to save memory at the expense of slower backward pass.
35 | """
36 |
37 | max_length: Optional[int] = None
38 | """The maximum length of the sequences in the batch. This argument is required if you want to use the default data collator."""
39 |
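As the docstring notes, HfArgumentParser can expose these fields on the command line; a minimal sketch (illustrative, not part of the repository):

# Illustrative only: turn RewardConfig into CLI arguments, e.g.
#   python train_reward.py --output_dir ./reward-model --max_length 512
from transformers import HfArgumentParser
from trl import RewardConfig

parser = HfArgumentParser(RewardConfig)
(reward_config,) = parser.parse_args_into_dataclasses()
print(reward_config.output_dir, reward_config.max_length)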
--------------------------------------------------------------------------------
/vdim-pretrain.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 | python featup/train_vdim_upsampler.py
--------------------------------------------------------------------------------