├── .gitignore ├── LICENSE ├── README.md ├── VLMEvalKit ├── LICENSE ├── README.md ├── assets │ ├── LOGO.svg │ └── apple.jpg ├── docs │ ├── en │ │ ├── .readthedocs.yaml │ │ ├── Makefile │ │ ├── _static │ │ │ ├── css │ │ │ │ └── readthedocs.css │ │ │ ├── image │ │ │ │ ├── logo.svg │ │ │ │ └── logo_icon.svg │ │ │ └── js │ │ │ │ └── custom.js │ │ ├── _templates │ │ │ ├── 404.html │ │ │ ├── autosummary │ │ │ │ └── class.rst │ │ │ └── callable.rst │ │ ├── advanced_guides │ │ │ ├── Contributors.md │ │ │ └── Development.md │ │ ├── conf.py │ │ ├── docutils.conf │ │ ├── get_started │ │ │ └── Quickstart.md │ │ └── index.rst │ ├── ja │ │ └── README_ja.md │ └── zh-CN │ │ ├── .readthedocs.yaml │ │ ├── Makefile │ │ ├── README_zh-CN.md │ │ ├── _static │ │ ├── css │ │ │ └── readthedocs.css │ │ ├── image │ │ │ ├── logo.svg │ │ │ └── logo_icon.svg │ │ └── js │ │ │ └── custom.js │ │ ├── _templates │ │ ├── 404.html │ │ ├── autosummary │ │ │ └── class.rst │ │ └── callable.rst │ │ ├── advanced_guides │ │ └── Development.md │ │ ├── conf.py │ │ ├── cp_origin_docs.sh │ │ ├── docutils.conf │ │ ├── get_started │ │ └── Quickstart.md │ │ └── index.rst ├── eval.sh ├── requirements.txt ├── requirements │ └── docs.txt ├── run.py ├── scripts │ ├── AI2D_preproc.ipynb │ ├── apires_scan.py │ ├── auto_run.py │ ├── cover.sh │ ├── mmb_eval_gradio.py │ ├── run.sh │ ├── srun.sh │ ├── summarize.py │ └── visualize.ipynb ├── setup.py └── vlmeval │ ├── __init__.py │ ├── api │ ├── __init__.py │ ├── base.py │ ├── bluelm_v_api.py │ ├── claude.py │ ├── cloudwalk.py │ ├── gemini.py │ ├── glm_vision.py │ ├── gpt.py │ ├── hf_chat_model.py │ ├── hunyuan.py │ ├── qwen_api.py │ ├── qwen_vl_api.py │ ├── reka.py │ ├── sensechat_vision.py │ └── stepai.py │ ├── config.py │ ├── dataset │ ├── __init__.py │ ├── dude.py │ ├── image_base.py │ ├── image_caption.py │ ├── image_mcq.py │ ├── image_mt.py │ ├── image_vqa.py │ ├── image_yorn.py │ ├── mmbench_video.py │ ├── mmlongbench.py │ ├── mvbench.py │ ├── slidevqa.py │ ├── text_base.py │ ├── text_mcq.py │ ├── utils │ │ ├── __init__.py │ │ ├── crpe.py │ │ ├── hrbench.py │ │ ├── judge_util.py │ │ ├── llavabench.py │ │ ├── mathv.py │ │ ├── mathverse.py │ │ ├── mathvista.py │ │ ├── mmbench_video.py │ │ ├── mmdu.py │ │ ├── mmvet.py │ │ ├── multiple_choice.py │ │ ├── mvbench.py │ │ ├── ocrbench.py │ │ ├── tablevqabench.py │ │ ├── videomme.py │ │ ├── vqa_eval.py │ │ └── yorn.py │ ├── vcr.py │ ├── video_base.py │ └── videomme.py │ ├── inference.py │ ├── inference_mt.py │ ├── inference_video.py │ ├── smp │ ├── __init__.py │ ├── file.py │ ├── log.py │ ├── misc.py │ └── vlm.py │ ├── tools.py │ ├── utils │ ├── __init__.py │ ├── matching_util.py │ ├── mp_util.py │ └── result_transfer.py │ └── vlm │ ├── __init__.py │ ├── base.py │ ├── bunnyllama3.py │ ├── cambrian.py │ ├── chameleon.py │ ├── cogvlm.py │ ├── deepseek_vl.py │ ├── eagle_x.py │ ├── emu.py │ ├── idefics.py │ ├── instructblip.py │ ├── internvl_chat.py │ ├── llama_vision.py │ ├── llava │ ├── __init__.py │ ├── llava.py │ └── llava_xtuner.py │ ├── llava_uhd.py │ ├── llava_uhd2.py │ ├── mantis.py │ ├── mgm.py │ ├── minicpm_v.py │ ├── minigpt4.py │ ├── minimonkey.py │ ├── misc │ ├── blip2_instruct_vicuna13b.yaml │ ├── blip2_instruct_vicuna7b.yaml │ ├── minigpt4_13b_eval.yaml │ ├── minigpt4_7b_eval.yaml │ └── minigptv2_eval.yaml │ ├── mixsense.py │ ├── mmalaya.py │ ├── monkey.py │ ├── moondream.py │ ├── mplug_owl2.py │ ├── mplug_owl3.py │ ├── omchat.py │ ├── omnilmm.py │ ├── open_flamingo.py │ ├── ovis.py │ ├── paligemma.py │ ├── pandagpt.py │ ├── parrot.py │ ├── 
phi3_vision.py │ ├── pixtral.py │ ├── qh_360vl.py │ ├── qwen2_vl │ ├── __init__.py │ ├── model.py │ └── prompt.py │ ├── qwen_vl.py │ ├── rbdash.py │ ├── slime.py │ ├── transcore_m.py │ ├── video_llm │ ├── __init__.py │ ├── chat_uni_vi.py │ ├── configs │ │ ├── llama_vid │ │ │ └── processor │ │ │ │ └── clip-patch14-224 │ │ │ │ ├── config.json │ │ │ │ └── preprocessor_config.json │ │ └── videochat2_hd.json │ ├── llama_vid.py │ ├── pllava.py │ ├── video_chatgpt.py │ ├── video_llava.py │ └── videochat2.py │ ├── vila.py │ ├── visualglm.py │ ├── vxverse.py │ ├── wemm.py │ ├── xcomposer │ ├── __init__.py │ ├── sharecaptioner.py │ ├── xcomposer.py │ ├── xcomposer2.py │ ├── xcomposer2_4KHD.py │ └── xcomposer2d5.py │ ├── xgen_mm.py │ └── yi_vl.py ├── cog.yaml ├── doc ├── HiWin.png ├── arch.png └── pyramid.png ├── eval.sh ├── featup ├── __init__.py ├── adaptive_conv_cuda │ ├── __init__.py │ ├── adaptive_conv.cpp │ ├── adaptive_conv.py │ ├── adaptive_conv_cuda.cpp │ └── adaptive_conv_kernel.cu ├── configs │ └── vdim_upsampler.yaml ├── datasets │ ├── COCO.py │ ├── DAVIS.py │ ├── DOC.py │ ├── DocSceneText.py │ ├── EmbeddingFile.py │ ├── HTML.py │ ├── HighResEmbs.py │ ├── ImageNetSubset.py │ ├── JitteredImage.py │ ├── SCENE.py │ ├── SampleImage.py │ ├── __init__.py │ └── util.py ├── downsamplers.py ├── featurizers │ ├── CLIP.py │ ├── CLIPLarge.py │ ├── ClipEncoder.py │ ├── DINO.py │ ├── DINOv2.py │ ├── DeepLabV3.py │ ├── MAE.py │ ├── MIDAS.py │ ├── MaskCLIP.py │ ├── ResNet.py │ ├── __init__.py │ ├── dinov2 │ │ ├── __init__.py │ │ └── layers │ │ │ ├── __init__.py │ │ │ ├── attention.py │ │ │ ├── block.py │ │ │ ├── dino_head.py │ │ │ ├── drop_path.py │ │ │ ├── layer_scale.py │ │ │ ├── mlp.py │ │ │ ├── patch_embed.py │ │ │ └── swiglu_ffn.py │ ├── maskclip │ │ ├── README.md │ │ ├── __init__.py │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ ├── clip.py │ │ ├── interpolate.py │ │ ├── model.py │ │ └── simple_tokenizer.py │ ├── modules │ │ ├── __init__.py │ │ ├── layers.py │ │ ├── resnet.py │ │ └── vgg.py │ └── util.py ├── layers.py ├── losses.py ├── plotting.py ├── train_vdim_upsampler.py ├── upsamplers.py └── util.py ├── install.sh ├── llava ├── __init__.py ├── constants.py ├── conversation.py ├── eval │ ├── eval_ai2d.py │ ├── eval_chartqa.py │ ├── eval_docvqa.py │ ├── eval_gpt_review.py │ ├── eval_gpt_review_bench.py │ ├── eval_gpt_review_visual.py │ ├── eval_pope.py │ ├── eval_rec.py │ ├── eval_science_qa.py │ ├── eval_science_qa_gpt4.py │ ├── eval_science_qa_gpt4_requery.py │ ├── eval_textvqa.py │ ├── evaluate_interleave.py │ ├── generate_webpage_data_from_table.py │ ├── m4c_evaluator.py │ ├── model_qa.py │ ├── model_vqa.py │ ├── model_vqa_loader.py │ ├── model_vqa_mmbench.py │ ├── model_vqa_science.py │ ├── qa_baseline_gpt35.py │ ├── run_llava.py │ └── summarize_gpt_review.py ├── mm_utils.py ├── model │ ├── __init__.py │ ├── apply_delta.py │ ├── builder.py │ ├── builder_new.bk │ ├── consolidate.py │ ├── language_model │ │ ├── llava_gemma.py │ │ ├── llava_llama.py │ │ ├── llava_mistral.py │ │ ├── llava_mixtral.py │ │ ├── llava_mpt.py │ │ ├── llava_qwen.py │ │ ├── llava_qwen_moe.py │ │ └── modeling_llama.py │ ├── llava_arch.py │ ├── make_delta.py │ ├── multimodal_encoder │ │ ├── adapt_clip_vision_model.py │ │ ├── builder.py │ │ ├── clip_encoder.py │ │ ├── dev_eva_clip │ │ │ ├── eva_clip │ │ │ │ ├── __init__.py │ │ │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ │ │ ├── constants.py │ │ │ │ ├── eva_vit_model.py │ │ │ │ ├── factory.py │ │ │ │ ├── hf_configs.py │ │ │ │ ├── hf_model.py │ │ │ │ ├── loss.py │ │ │ │ ├── model.py 
│ │ │ │ ├── model_configs │ │ │ │ │ ├── EVA-CLIP-18B.json │ │ │ │ │ ├── EVA-CLIP-8B-plus.json │ │ │ │ │ ├── EVA-CLIP-8B.json │ │ │ │ │ ├── EVA01-CLIP-B-16.json │ │ │ │ │ ├── EVA01-CLIP-g-14-plus.json │ │ │ │ │ ├── EVA01-CLIP-g-14.json │ │ │ │ │ ├── EVA02-CLIP-B-16.json │ │ │ │ │ ├── EVA02-CLIP-L-14-336.json │ │ │ │ │ ├── EVA02-CLIP-L-14.json │ │ │ │ │ ├── EVA02-CLIP-bigE-14-plus.json │ │ │ │ │ ├── EVA02-CLIP-bigE-14.json │ │ │ │ │ ├── Internal-EVA02-CLIP-10B-14-448.json │ │ │ │ │ └── Internal-EVA02-CLIP-10B-14.json │ │ │ │ ├── modified_resnet.py │ │ │ │ ├── openai.py │ │ │ │ ├── pretrained.py │ │ │ │ ├── rope.py │ │ │ │ ├── timm_model.py │ │ │ │ ├── tokenizer.py │ │ │ │ ├── transform.py │ │ │ │ ├── transformer.py │ │ │ │ └── utils.py │ │ │ └── eva_vit.py │ │ ├── eva_clip │ │ │ ├── eva_clip_encoder.py │ │ │ ├── eva_clip_processors.py │ │ │ ├── eva_vit.py │ │ │ ├── factory.py │ │ │ └── model_configs │ │ │ │ ├── EVA-CLIP-18B.json │ │ │ │ ├── EVA-CLIP-8B-plus.json │ │ │ │ ├── EVA-CLIP-8B.json │ │ │ │ ├── EVA01-CLIP-B-16.json │ │ │ │ ├── EVA01-CLIP-g-14-plus.json │ │ │ │ ├── EVA01-CLIP-g-14.json │ │ │ │ ├── EVA02-CLIP-B-16.json │ │ │ │ ├── EVA02-CLIP-L-14-336.json │ │ │ │ ├── EVA02-CLIP-L-14.json │ │ │ │ ├── EVA02-CLIP-bigE-14-plus.json │ │ │ │ ├── EVA02-CLIP-bigE-14.json │ │ │ │ ├── Internal-EVA02-CLIP-10B-14-448.json │ │ │ │ └── Internal-EVA02-CLIP-10B-14.json │ │ ├── hf_vision.py │ │ ├── hubconf.py │ │ ├── imagebind.py │ │ ├── open_clip_encoder.py │ │ └── siglip_encoder.py │ ├── multimodal_projector │ │ ├── adapt_spatial_resampler.py │ │ ├── builder.py │ │ ├── mlp.py │ │ ├── mlp_v2.py │ │ ├── percive_sampler.py │ │ ├── pooler_projector.py │ │ └── uhd_v1_resampler.py │ ├── multimodal_resampler │ │ ├── builder.py │ │ ├── masked_drop.py │ │ ├── perceiver.py │ │ ├── qformer.py │ │ └── spatial_pool.py │ └── utils.py ├── serve │ ├── __init__.py │ ├── cli.py │ ├── controller.py │ ├── examples │ │ ├── extreme_ironing.jpg │ │ └── waterview.jpg │ ├── gradio_multi_image.py │ ├── gradio_web_server.py │ ├── model_worker.py │ ├── register_worker.py │ ├── sglang_worker.py │ └── test_message.py ├── slice_process.py ├── train │ ├── llama_flash_attn_monkey_patch.py │ ├── llava_trainer.py │ ├── llava_trainer_eval.py │ ├── train.py │ ├── train_dpo.py │ └── train_mem.py └── utils.py ├── model-train.sh ├── playground ├── 2d_hist.py ├── data_checker.py ├── demo │ ├── video_demo.py │ └── xU25MMA2N4aVtYay.mp4 ├── equal_splitter.py ├── remove_mid_ckpt.py ├── sgl_llava_inference_multinode.py └── upload_data.py ├── scripts ├── archived │ ├── convert_gqa_for_eval.py │ ├── convert_mmvet_for_eval.py │ ├── convert_sqa_to_llava.py │ ├── convert_sqa_to_llava_base_prompt.py │ ├── convert_vizwiz_for_submission.py │ ├── convert_vqav2_for_submission.py │ ├── data_info.py │ ├── dpo_data_info.py │ ├── entry_cmd.sh │ ├── finetune.sh │ ├── finetune_1.5.sh │ ├── finetune_full_schedule.sh │ ├── finetune_lora.sh │ ├── finetune_mixtral.sh │ ├── finetune_mixtral_1.5.sh │ ├── finetune_mixtral_1.6_336px_anyres.sh │ ├── finetune_mixtral_1.6_336px_anyres_freeze_vision.sh │ ├── finetune_mixtral_1.6_336px_anyres_lmms_eval.sh │ ├── finetune_mixtral_copy.sh │ ├── finetune_qlora.sh │ ├── finetune_sqa.sh │ ├── merge_lora_weights.py │ ├── pretrain.sh │ ├── quick_check.py │ ├── sqa_eval_batch.sh │ └── sqa_eval_gather.sh ├── convert_gqa_for_eval.py ├── interleave │ ├── eval_all.sh │ ├── eval_interleave_3d.sh │ └── eval_multiprocess.sh ├── qwen.py ├── summarize_data.py ├── train │ ├── README.md │ ├── direct_finetune_clip.sh │ ├── 
direct_finetune_siglip_a4.sh │ ├── dpo.sh │ ├── dpo_ov7b.sh │ ├── finetune_ov.sh │ ├── finetune_si.sh │ ├── mid_stage.yaml │ ├── onevision.yaml │ ├── pretrain_clip.sh │ ├── pretrain_siglip.sh │ └── single_image.yaml ├── v1_5 │ └── eval │ │ ├── ai2d.sh │ │ ├── chartqa.sh │ │ ├── deepform.sh │ │ ├── docvqa_test.sh │ │ ├── docvqa_val.sh │ │ ├── estvqa.sh │ │ ├── gqa.sh │ │ ├── infographics.sh │ │ ├── llavabench.sh │ │ ├── mmbench.sh │ │ ├── mmbench_cn.sh │ │ ├── mme.sh │ │ ├── mmvet.sh │ │ ├── pope.sh │ │ ├── qbench.sh │ │ ├── qbench_zh.sh │ │ ├── rec.sh │ │ ├── sqa.sh │ │ ├── textvqa.sh │ │ ├── vizwiz.sh │ │ └── vqav2.sh ├── video │ ├── demo │ │ └── video_demo.sh │ ├── eval │ │ ├── activitynet_eval.sh │ │ ├── video_chatgpt_benchmark_eval_shard.sh │ │ ├── video_description_from_t2v.sh │ │ ├── video_detail_description_eval_only.sh │ │ └── video_detail_description_eval_shard.sh │ └── train │ │ ├── SO400M_Qwen2_72B_ov_to_video_am9.sh │ │ ├── SO400M_Qwen2_7B_ov_to_video_am9.sh │ │ └── exp.yaml ├── zero2.json ├── zero2_fused_adamw.json ├── zero2_offload.json ├── zero2_old.json ├── zero3.json ├── zero3_offload.json └── zero3pp.json ├── setup.py ├── trl ├── __init__.py ├── core.py ├── environment │ ├── __init__.py │ └── base_environment.py ├── extras │ ├── __init__.py │ ├── best_of_n_sampler.py │ └── dataset_formatting.py ├── import_utils.py ├── models │ ├── __init__.py │ ├── modeling_base.py │ ├── modeling_sd_base.py │ ├── modeling_value_head.py │ └── utils.py └── trainer │ ├── __init__.py │ ├── base.py │ ├── ddpo_config.py │ ├── ddpo_trainer.py │ ├── dpo_trainer.py │ ├── iterative_sft_trainer.py │ ├── model_config.py │ ├── ppo_config.py │ ├── ppo_trainer.py │ ├── reward_config.py │ ├── reward_trainer.py │ ├── sft_trainer.py │ └── utils.py ├── uhdv2-qwen2.sh ├── uhdv2-vicuna13b.sh └── vdim-pretrain.sh /VLMEvalKit/assets/apple.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/LLaVA-UHD/08c85e208cd12c825303619723e4fe20e64f7f58/VLMEvalKit/assets/apple.jpg -------------------------------------------------------------------------------- /VLMEvalKit/docs/en/.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | # Set the version of Python and other tools you might need 4 | build: 5 | os: ubuntu-22.04 6 | tools: 7 | python: "3.8" 8 | 9 | formats: 10 | - epub 11 | 12 | sphinx: 13 | configuration: docs/en/conf.py 14 | 15 | python: 16 | install: 17 | - requirements: requirements/docs.txt 18 | -------------------------------------------------------------------------------- /VLMEvalKit/docs/en/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /VLMEvalKit/docs/en/_static/css/readthedocs.css: -------------------------------------------------------------------------------- 1 | .header-logo { 2 | background-image: url("../image/logo.svg"); 3 | background-size: 275px 80px; 4 | height: 80px; 5 | width: 275px; 6 | } 7 | 8 | 9 | @media screen and (min-width: 1100px) { 10 | .header-logo { 11 | top: -25px; 12 | } 13 | } 14 | 15 | pre { 16 | white-space: pre; 17 | } 18 | 19 | @media screen and (min-width: 2000px) { 20 | .pytorch-content-left { 21 | width: 1200px; 22 | margin-left: 30px; 23 | } 24 | article.pytorch-article { 25 | max-width: 1200px; 26 | } 27 | .pytorch-breadcrumbs-wrapper { 28 | width: 1200px; 29 | } 30 | .pytorch-right-menu.scrolling-fixed { 31 | position: fixed; 32 | top: 45px; 33 | left: 1580px; 34 | } 35 | } 36 | 37 | 38 | article.pytorch-article section code { 39 | padding: .2em .4em; 40 | background-color: #f3f4f7; 41 | border-radius: 5px; 42 | } 43 | 44 | /* Disable the change in tables */ 45 | article.pytorch-article section table code { 46 | padding: unset; 47 | background-color: unset; 48 | border-radius: unset; 49 | } 50 | 51 | table.autosummary td { 52 | width: 50% 53 | } 54 | 55 | img.align-center { 56 | display: block; 57 | margin-left: auto; 58 | margin-right: auto; 59 | } 60 | 61 | article.pytorch-article p.rubric { 62 | font-weight: bold; 63 | } 64 | -------------------------------------------------------------------------------- /VLMEvalKit/docs/en/_static/image/logo_icon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /VLMEvalKit/docs/en/_static/js/custom.js: -------------------------------------------------------------------------------- 1 | var collapsedSections = []; 2 | 3 | $(document).ready(function () { 4 | $('.model-summary').DataTable({ 5 | "stateSave": false, 6 | "lengthChange": false, 7 | "pageLength": 20, 8 | "order": [] 9 | }); 10 | }); 11 | -------------------------------------------------------------------------------- /VLMEvalKit/docs/en/_templates/404.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | 3 | {% block body %} 4 | 5 |
Page Not Found
6 |
7 | The page you are looking for cannot be found.
8 |
9 |
10 | If you just switched documentation versions, it is likely that the page you were on is moved. You can look for it in
11 | the content table left, or go to the homepage.
12 |
13 | 17 | 18 | {% endblock %} 19 | -------------------------------------------------------------------------------- /VLMEvalKit/docs/en/_templates/autosummary/class.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | .. currentmodule:: {{ module }} 4 | 5 | 6 | {{ name | underline}} 7 | 8 | .. autoclass:: {{ name }} 9 | :members: 10 | 11 | .. 12 | autogenerated from _templates/autosummary/class.rst 13 | note it does not have :inherited-members: 14 | -------------------------------------------------------------------------------- /VLMEvalKit/docs/en/_templates/callable.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | .. currentmodule:: {{ module }} 4 | 5 | 6 | {{ name | underline}} 7 | 8 | .. autoclass:: {{ name }} 9 | :members: 10 | :special-members: __call__ 11 | 12 | .. 13 | autogenerated from _templates/callable.rst 14 | note it does not have :inherited-members: 15 | -------------------------------------------------------------------------------- /VLMEvalKit/docs/en/advanced_guides/Contributors.md: -------------------------------------------------------------------------------- 1 | # Contributors 2 | 3 | ## Contributors w. 3+ Major Contributions 4 | 5 | > In this section, we list all the contributors who have made significant contributions (3+) to the development of VLMEvalKit. 6 | 7 | New Qualified Contributors (2024.09): 8 | 9 | 1. [amitbcp](https://github.com/amitbcp): The contributor helped support MUIRBench, Phi-3.5, Idefics3, VILA, and xGen-MM 10 | 2. [czczup](https://github.com/czczup): The contributor helped support the InternVL Series (V1.5, Mini-InternVL, V2, etc.) 11 | 3. [DseidLi](https://github.com/DseidLi): The contributor helped support LLaVA-OneVision, GQA, and developed the readthedocs site for VLMEvalKit 12 | 4. [mayubo2333](https://github.com/mayubo2333): The contributor helped support MMLongBench, SlideVQA, and DUDE 13 | 5. [sun-hailong](https://github.com/sun-hailong): The contributor helped support A-OKVQA, Parrot, MMMB, and MTL-MMBench 14 | 6. [PhoenixZ810](https://github.com/PhoenixZ810): The contributor helped support Video-ChatGPT, Chat-UniVI, and Llama-VID 15 | 7. [Cuiunbo](https://github.com/Cuiunbo): The contributor helped support OmniLMM-12B, MiniCPM-V Series (V1, V2, V2.5) 16 | 17 | ## Full Contributor List 18 | 19 | > In this section, we list all the contributors as well as their corresponding contributions to the development of VLMEvalKit. 20 | 21 | TBD. 22 | -------------------------------------------------------------------------------- /VLMEvalKit/docs/en/docutils.conf: -------------------------------------------------------------------------------- 1 | [html writers] 2 | table_style: colwidths-auto 3 | -------------------------------------------------------------------------------- /VLMEvalKit/docs/en/index.rst: -------------------------------------------------------------------------------- 1 | Welcome to the VLMEvalKit Tutorial! 2 | ========================================== 3 | 4 | VLMEvalKit Getting Started Guide 5 | ------------------------------- 6 | 7 | To help users get started quickly, we recommend the following process: 8 | 9 | - For users who want to use VLMEvalKit, we recommend reading the "Start Your First Step" section to set up the environment and start a mini-experiment to familiarize yourself with the process. 
10 | 11 | - If you want to customize more modules, such as adding datasets and models, we provide an "Advanced Tutorial." 12 | 13 | We always welcome users' PRs (Pull Requests) and Issues to improve VLMEvalKit! 14 | 15 | .. _Start Your First Step: 16 | .. toctree:: 17 | :maxdepth: 1 18 | :caption: Start Your First Step 19 | 20 | get_started/Quickstart.md 21 | 22 | 23 | .. .. _Tutorials: 24 | .. .. toctree:: 25 | .. :maxdepth: 1 26 | .. :caption: Tutorials 27 | 28 | .. user_guides/framework_overview.md 29 | 30 | .. _Advanced Tutorial: 31 | .. toctree:: 32 | :maxdepth: 1 33 | :caption: Advanced Tutorial 34 | 35 | advanced_guides/Development.md 36 | 37 | .. .. _Other Notes: 38 | .. .. toctree:: 39 | .. :maxdepth: 1 40 | .. :caption: Other Notes 41 | 42 | .. notes/contribution_guide.md 43 | 44 | Index and Tables 45 | ================== 46 | 47 | * :ref:`genindex` 48 | * :ref:`search` 49 | -------------------------------------------------------------------------------- /VLMEvalKit/docs/zh-CN/.readthedocs.yaml: -------------------------------------------------------------------------------- 1 | version: 2 2 | 3 | # Set the version of Python and other tools you might need 4 | build: 5 | os: ubuntu-22.04 6 | tools: 7 | python: "3.8" 8 | 9 | formats: 10 | - epub 11 | 12 | sphinx: 13 | configuration: docs/zh-CN/conf.py 14 | 15 | python: 16 | install: 17 | - requirements: requirements/docs.txt 18 | -------------------------------------------------------------------------------- /VLMEvalKit/docs/zh-CN/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line, and also 5 | # from the environment for the first two. 6 | SPHINXOPTS ?= 7 | SPHINXBUILD ?= sphinx-build 8 | SOURCEDIR = . 9 | BUILDDIR = _build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 21 | -------------------------------------------------------------------------------- /VLMEvalKit/docs/zh-CN/_static/css/readthedocs.css: -------------------------------------------------------------------------------- 1 | .header-logo { 2 | background-image: url("../image/logo.svg"); 3 | background-size: 275px 80px; 4 | height: 80px; 5 | width: 275px; 6 | } 7 | 8 | 9 | @media screen and (min-width: 1100px) { 10 | .header-logo { 11 | top: -25px; 12 | } 13 | } 14 | 15 | pre { 16 | white-space: pre; 17 | } 18 | 19 | @media screen and (min-width: 2000px) { 20 | .pytorch-content-left { 21 | width: 1200px; 22 | margin-left: 30px; 23 | } 24 | article.pytorch-article { 25 | max-width: 1200px; 26 | } 27 | .pytorch-breadcrumbs-wrapper { 28 | width: 1200px; 29 | } 30 | .pytorch-right-menu.scrolling-fixed { 31 | position: fixed; 32 | top: 45px; 33 | left: 1580px; 34 | } 35 | } 36 | 37 | 38 | article.pytorch-article section code { 39 | padding: .2em .4em; 40 | background-color: #f3f4f7; 41 | border-radius: 5px; 42 | } 43 | 44 | /* Disable the change in tables */ 45 | article.pytorch-article section table code { 46 | padding: unset; 47 | background-color: unset; 48 | border-radius: unset; 49 | } 50 | 51 | table.autosummary td { 52 | width: 50% 53 | } 54 | 55 | img.align-center { 56 | display: block; 57 | margin-left: auto; 58 | margin-right: auto; 59 | } 60 | 61 | article.pytorch-article p.rubric { 62 | font-weight: bold; 63 | } 64 | -------------------------------------------------------------------------------- /VLMEvalKit/docs/zh-CN/_static/image/logo_icon.svg: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | -------------------------------------------------------------------------------- /VLMEvalKit/docs/zh-CN/_static/js/custom.js: -------------------------------------------------------------------------------- 1 | var collapsedSections = []; 2 | 3 | $(document).ready(function () { 4 | $('.model-summary').DataTable({ 5 | "stateSave": false, 6 | "lengthChange": false, 7 | "pageLength": 20, 8 | "order": [] 9 | }); 10 | }); 11 | -------------------------------------------------------------------------------- /VLMEvalKit/docs/zh-CN/_templates/404.html: -------------------------------------------------------------------------------- 1 | {% extends "layout.html" %} 2 | 3 | {% block body %} 4 | 5 |
Page Not Found
6 |
7 | The page you are looking for cannot be found.
8 |
9 |
10 | If you just switched documentation versions, it is likely that the page you were on is moved. You can look for it in
11 | the content table left, or go to the homepage.
12 |
13 | 17 | 18 | {% endblock %} 19 | -------------------------------------------------------------------------------- /VLMEvalKit/docs/zh-CN/_templates/autosummary/class.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | .. currentmodule:: {{ module }} 4 | 5 | 6 | {{ name | underline}} 7 | 8 | .. autoclass:: {{ name }} 9 | :members: 10 | 11 | .. 12 | autogenerated from _templates/autosummary/class.rst 13 | note it does not have :inherited-members: 14 | -------------------------------------------------------------------------------- /VLMEvalKit/docs/zh-CN/_templates/callable.rst: -------------------------------------------------------------------------------- 1 | .. role:: hidden 2 | :class: hidden-section 3 | .. currentmodule:: {{ module }} 4 | 5 | 6 | {{ name | underline}} 7 | 8 | .. autoclass:: {{ name }} 9 | :members: 10 | :special-members: __call__ 11 | 12 | .. 13 | autogenerated from _templates/callable.rst 14 | note it does not have :inherited-members: 15 | -------------------------------------------------------------------------------- /VLMEvalKit/docs/zh-CN/cp_origin_docs.sh: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env bash 2 | 3 | # Copy *.md files from docs/ if it doesn't have a Chinese translation 4 | 5 | for filename in $(find ../en/ -name '*.md' -printf "%P\n"); 6 | do 7 | mkdir -p $(dirname $filename) 8 | cp -n ../en/$filename ./$filename 9 | done 10 | -------------------------------------------------------------------------------- /VLMEvalKit/docs/zh-CN/docutils.conf: -------------------------------------------------------------------------------- 1 | [html writers] 2 | table_style: colwidths-auto 3 | -------------------------------------------------------------------------------- /VLMEvalKit/docs/zh-CN/index.rst: -------------------------------------------------------------------------------- 1 | 欢迎来到 VLMEvalKit 中文教程! 2 | ========================================== 3 | 4 | VLMEvalKit 上手路线 5 | ------------------------------- 6 | 7 | 为了用户能够快速上手,我们推荐以下流程: 8 | 9 | - 对于想要使用 VLMEvalKit 的用户,我们推荐先阅读 开始你的第一步_ 部分来设置环境,并启动一个迷你实验熟悉流程。 10 | 11 | - 若您想进行更多模块的自定义,例如增加数据集和模型,我们提供了 进阶教程_ 。 12 | 13 | 我们始终非常欢迎用户的 PRs 和 Issues 来完善 VLMEvalKit! 14 | 15 | .. _开始你的第一步: 16 | .. toctree:: 17 | :maxdepth: 1 18 | :caption: 开始你的第一步 19 | 20 | get_started/Quickstart.md 21 | 22 | 23 | .. .. _教程: 24 | .. .. toctree:: 25 | .. :maxdepth: 1 26 | .. :caption: 教程 27 | 28 | .. user_guides/framework_overview.md 29 | 30 | .. _进阶教程: 31 | .. toctree:: 32 | :maxdepth: 1 33 | :caption: 进阶教程 34 | 35 | advanced_guides/Development.md 36 | 37 | .. .. _其他说明: 38 | .. .. toctree:: 39 | .. :maxdepth: 1 40 | .. :caption: 其他说明 41 | 42 | .. 
notes/contribution_guide.md 43 | 44 | 索引与表格 45 | ================== 46 | 47 | * :ref:`genindex` 48 | * :ref:`search` 49 | -------------------------------------------------------------------------------- /VLMEvalKit/eval.sh: -------------------------------------------------------------------------------- 1 | export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 2 | torchrun --nproc-per-node=8 run.py --data OCRBench MMMU_DEV_VAL SEEDBench_IMG MMBench_TEST_EN RealWorldQA HRBench4K --model llava_uhd2 --verbose -------------------------------------------------------------------------------- /VLMEvalKit/requirements.txt: -------------------------------------------------------------------------------- 1 | decord 2 | gradio 3 | huggingface_hub 4 | imageio 5 | matplotlib 6 | moviepy 7 | numpy>=1.23.4 8 | omegaconf 9 | openai==1.3.5 10 | opencv-python>=4.4.0.46 11 | openpyxl 12 | pandas 13 | peft 14 | pillow 15 | portalocker 16 | protobuf 17 | python-dotenv 18 | requests 19 | rich 20 | sentencepiece 21 | setuptools 22 | sty 23 | tabulate 24 | tiktoken 25 | timeout-decorator 26 | torch>=2.0.1 27 | tqdm 28 | transformers 29 | typing_extensions==4.7.1 30 | validators 31 | xlsxwriter 32 | -------------------------------------------------------------------------------- /VLMEvalKit/requirements/docs.txt: -------------------------------------------------------------------------------- 1 | docutils==0.18.1 2 | modelindex 3 | myst-parser 4 | -e git+https://github.com/open-compass/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme 5 | sphinx==6.1.3 6 | sphinx-copybutton 7 | sphinx-design 8 | sphinx-notfound-page 9 | sphinx-tabs 10 | sphinxcontrib-jquery 11 | tabulate 12 | -------------------------------------------------------------------------------- /VLMEvalKit/scripts/auto_run.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from vlmeval.smp import * 3 | from vlmeval.config import supported_VLM 4 | 5 | def is_api(x): 6 | return getattr(supported_VLM[x].func, 'is_api', False) 7 | 8 | models = list(supported_VLM) 9 | models = [x for x in models if 'fs' not in x] 10 | models = [x for x in models if not is_api(x)] 11 | exclude_list = ['cogvlm-grounding-generalist', 'emu2'] 12 | models = [x for x in models if x not in exclude_list] 13 | 14 | def is_large(x): 15 | return '80b' in x or 'emu2' in x or '34B' in x 16 | 17 | small_models = [x for x in models if not is_large(x)] 18 | large_models = [x for x in models if is_large(x)] 19 | models = small_models + large_models 20 | 21 | parser = argparse.ArgumentParser() 22 | parser.add_argument('--data', type=str, nargs='+', required=True) 23 | args = parser.parse_args() 24 | 25 | # Skip some models 26 | models = [x for x in models if not listinstr(['MiniGPT', 'grounding-generalist'], x)] 27 | 28 | for m in models: 29 | unknown_datasets = [x for x in args.data if not osp.exists(f'{m}/{m}_{x}.xlsx')] 30 | if len(unknown_datasets) == 0: 31 | continue 32 | dataset_str = ' '.join(unknown_datasets) 33 | if '80b' in m: 34 | cmd = f'python run.py --data {dataset_str} --model {m}' 35 | else: 36 | cmd = f'bash run.sh --data {dataset_str} --model {m}' 37 | print(cmd) 38 | os.system(cmd) -------------------------------------------------------------------------------- /VLMEvalKit/scripts/cover.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) 3 | cp $DIR/../config.py $DIR/../vlmeval/ 4 | cp $DIR/../misc/* 
$DIR/../vlmeval/vlm/misc/ -------------------------------------------------------------------------------- /VLMEvalKit/scripts/run.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | export GPU=$(nvidia-smi --list-gpus | wc -l) 4 | torchrun --nproc-per-node=$GPU run.py ${@:1} -------------------------------------------------------------------------------- /VLMEvalKit/scripts/srun.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | set -x 3 | srun -n1 --ntasks-per-node=1 --partition $1 --gres=gpu:8 --quotatype=reserved --job-name vlmeval --cpus-per-task=64 torchrun --nproc-per-node=8 run.py ${@:2} -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | import torch 3 | except ImportError: 4 | pass 5 | 6 | from .smp import * 7 | from .api import * 8 | from .dataset import * 9 | from .utils import * 10 | from .vlm import * 11 | from .config import * 12 | from .tools import cli 13 | 14 | load_env() 15 | 16 | __version__ = '0.2rc1' 17 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/api/__init__.py: -------------------------------------------------------------------------------- 1 | from .gpt import OpenAIWrapper, GPT4V 2 | from .hf_chat_model import HFChatModel 3 | from .gemini import GeminiWrapper, GeminiProVision 4 | from .qwen_vl_api import QwenVLWrapper, QwenVLAPI, Qwen2VLAPI 5 | from .qwen_api import QwenAPI 6 | from .claude import Claude_Wrapper, Claude3V 7 | from .reka import Reka 8 | from .glm_vision import GLMVisionAPI 9 | from .cloudwalk import CWWrapper 10 | from .sensechat_vision import SenseChatVisionAPI 11 | from .hunyuan import HunyuanVision 12 | from .bluelm_v_api import BlueLMWrapper, BlueLM_V_API 13 | 14 | 15 | __all__ = [ 16 | 'OpenAIWrapper', 'HFChatModel', 'GeminiWrapper', 'GPT4V', 17 | 'GeminiProVision', 'QwenVLWrapper', 'QwenVLAPI', 'QwenAPI', 18 | 'Claude3V', 'Claude_Wrapper', 'Reka', 'GLMVisionAPI', 19 | 'CWWrapper', 'SenseChatVisionAPI', 'HunyuanVision', 'Qwen2VLAPI', 20 | 'BlueLMWrapper', 'BlueLM_V_API', 21 | ] 22 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/api/reka.py: -------------------------------------------------------------------------------- 1 | from vlmeval.smp import * 2 | from vlmeval.api.base import BaseAPI 3 | from time import sleep 4 | import mimetypes 5 | 6 | 7 | class Reka_Wrapper(BaseAPI): 8 | 9 | is_api: bool = True 10 | INTERLEAVE: bool = False 11 | 12 | def __init__(self, 13 | model: str = 'reka-flash-20240226', 14 | key: str = None, 15 | retry: int = 10, 16 | wait: int = 3, 17 | system_prompt: str = None, 18 | verbose: bool = True, 19 | temperature: float = 0, 20 | max_tokens: int = 1024, 21 | **kwargs): 22 | 23 | try: 24 | import reka 25 | except ImportError: 26 | raise ImportError('Please install reka by running "pip install reka-api"') 27 | 28 | self.model = model 29 | default_kwargs = dict(temperature=temperature, request_output_len=max_tokens) 30 | default_kwargs.update(kwargs) 31 | self.kwargs = default_kwargs 32 | if key is not None: 33 | self.key = key 34 | else: 35 | self.key = os.environ.get('REKA_API_KEY', '') 36 | super().__init__(retry=retry, wait=wait, verbose=verbose, system_prompt=system_prompt, **kwargs) 37 | 38 | def generate_inner(self, inputs, 
**kwargs) -> str:
39 |         import reka
40 |         reka.API_KEY = self.key
41 |         dataset = kwargs.pop('dataset', None)
42 |         prompt, image_path = self.message_to_promptimg(inputs, dataset=dataset)
43 |         image_b64 = encode_image_file_to_base64(image_path)
44 |
45 |         response = reka.chat(
46 |             model_name=self.model,
47 |             human=prompt,
48 |             media_url=f'data:image/jpeg;base64,{image_b64}',
49 |             **self.kwargs)
50 |
51 |         try:
52 |             return 0, response['text'], response
53 |         except:
54 |             return -1, self.fail_msg, response
55 |
56 |
57 | class Reka(Reka_Wrapper):
58 |
59 |     def generate(self, message, dataset=None):
60 |         return super(Reka_Wrapper, self).generate(message)
61 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/dataset/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .judge_util import build_judge, DEBUG_MESSAGE
2 | from .multiple_choice import extract_answer_from_item, prefetch_answer
3 | from .vqa_eval import levenshtein_distance
4 |
5 |
6 | __all__ = [
7 |     'build_judge', 'extract_answer_from_item', 'prefetch_answer',
8 |     'levenshtein_distance', 'DEBUG_MESSAGE'
9 | ]
10 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/dataset/utils/crpe.py:
--------------------------------------------------------------------------------
1 | import json
2 | import argparse
3 | from collections import defaultdict
4 |
5 |
6 | def is_correct(predict, answer):
7 |     # predict is the ground-truth answer; answer is the prediction
8 |     if len(answer) == 1:
9 |         return answer[0] == predict[0]
10 |     elif len(answer) != 1 and answer[0] in ['A', 'B', 'C', 'D']:
11 |         return answer[0] == predict[0]
12 |     elif len(answer) != 1 and answer[0] not in ['A', 'B', 'C', 'D']:
13 |         return predict[4:].lower() in answer.lower()
14 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/dataset/utils/hrbench.py:
--------------------------------------------------------------------------------
1 | from ...smp import *
2 | import os
3 |
4 |
5 | def report_acc_hrbench(df):
6 |     cycle_group = df.groupby('cycle_category')
7 |     result_dic = defaultdict(list)
8 |     avg_dic = defaultdict(int)
9 |
10 |     count = 0
11 |     for key, data_value in cycle_group:
12 |         count += 1
13 |         _, resp_dic = hrbench_score(data_value)
14 |
15 |         for task_type, accuracy in resp_dic.items():
16 |             result_dic['cycle'].append(key)
17 |             result_dic['type'].append(task_type)
18 |             result_dic['accuracy'].append(accuracy)
19 |
20 |             avg_dic[task_type] += accuracy
21 |     for task_type, accuracy in avg_dic.items():
22 |         result_dic['cycle'].append('Average')
23 |         result_dic['type'].append(task_type)
24 |         result_dic['accuracy'].append(accuracy / count)
25 |     result_pd = pd.DataFrame(result_dic)
26 |
27 |     return result_pd
28 |
29 |
30 | def hrbench_score(data):
31 |     ret = defaultdict(list)
32 |     resp_dic = {}
33 |     category_list = set(data['category'])
34 |     score_dict = defaultdict(list)
35 |
36 |     for i in range(len(data)):
37 |         d = data.iloc[i]
38 |         category = d['category']
39 |         gpt_score = d['hit']
40 |         score_dict[category].append(gpt_score)
41 |         score_dict['all'].append(gpt_score)
42 |
43 |     all_acc = np.mean(score_dict['all'])
44 |     ret['type'].append('all')
45 |     ret['acc'].append(all_acc)
46 |     resp_dic['all'] = all_acc
47 |     for cate in category_list:
48 |         acc = np.mean(score_dict[cate])
49 |         ret['type'].append(cate)
50 |         ret['acc'].append(acc)
51 |
52 |         resp_dic[cate] = acc
53 |
54 |     return pd.DataFrame(ret), resp_dic
55 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/dataset/utils/judge_util.py:
--------------------------------------------------------------------------------
1 | import os
2 | from ...api import OpenAIWrapper
3 | from ...smp import load_env
4 |
5 | INTERNAL = os.environ.get('INTERNAL', 0)
6 |
7 |
8 | def build_judge(**kwargs):
9 |     model = kwargs.pop('model', None)
10 |     kwargs.pop('nproc', None)
11 |     load_env()
12 |     LOCAL_LLM = os.environ.get('LOCAL_LLM', None)
13 |     if LOCAL_LLM is None:
14 |         model_map = {
15 |             'gpt-4-turbo': 'gpt-4-1106-preview',
16 |             'gpt-4-0613': 'gpt-4-0613',
17 |             'gpt-4-0125': 'gpt-4-0125-preview',
18 |             'gpt-4-0409': 'gpt-4-turbo-2024-04-09',
19 |             'chatgpt-1106': 'gpt-3.5-turbo-1106',
20 |             'chatgpt-0125': 'gpt-3.5-turbo-0125',
21 |             'gpt-4o': 'gpt-4o-2024-05-13',
22 |             'gpt-4o-mini': 'gpt-4o-mini-2024-07-18',
23 |         }
24 |         model_version = model_map[model]
25 |     else:
26 |         model_version = LOCAL_LLM
27 |     model = OpenAIWrapper(model_version, **kwargs)
28 |     return model
29 |
30 |
31 | DEBUG_MESSAGE = """
32 | To debug the OpenAI API, you can try the following scripts in python:
33 | ```python
34 | from vlmeval.api import OpenAIWrapper
35 | model = OpenAIWrapper('gpt-4-1106-preview', verbose=True)
36 | msgs = [dict(type='text', value='Hello!')]
37 | code, answer, resp = model.generate_inner(msgs)
38 | print(code, answer, resp)
39 | ```
40 | You can see the specific error if the API call fails.
41 | """
42 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/smp/__init__.py:
--------------------------------------------------------------------------------
1 | from .file import *
2 | from .vlm import *
3 | from .misc import *
4 | from .log import *
5 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/smp/log.py:
--------------------------------------------------------------------------------
1 | import logging
2 |
3 | logger_initialized = {}
4 |
5 |
6 | def get_logger(name, log_file=None, log_level=logging.INFO, file_mode='w'):
7 |     logger = logging.getLogger(name)
8 |     if name in logger_initialized:
9 |         return logger
10 |
11 |     for logger_name in logger_initialized:
12 |         if name.startswith(logger_name):
13 |             return logger
14 |
15 |     stream_handler = logging.StreamHandler()
16 |     handlers = [stream_handler]
17 |
18 |     try:
19 |         import torch.distributed as dist
20 |         if dist.is_available() and dist.is_initialized():
21 |             rank = dist.get_rank()
22 |         else:
23 |             rank = 0
24 |     except ImportError:
25 |         rank = 0
26 |
27 |     if rank == 0 and log_file is not None:
28 |         file_handler = logging.FileHandler(log_file, file_mode)
29 |         handlers.append(file_handler)
30 |
31 |     formatter = logging.Formatter(
32 |         '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
33 |     for handler in handlers:
34 |         handler.setFormatter(formatter)
35 |         handler.setLevel(log_level)
36 |         logger.addHandler(handler)
37 |
38 |     if rank == 0:
39 |         logger.setLevel(log_level)
40 |     else:
41 |         logger.setLevel(logging.ERROR)
42 |
43 |     logger_initialized[name] = True
44 |     return logger
45 |
--------------------------------------------------------------------------------
/VLMEvalKit/vlmeval/utils/__init__.py:
--------------------------------------------------------------------------------
1 | from .matching_util import can_infer, can_infer_option, can_infer_text
2 | from .mp_util import track_progress_rich
3 |
4 |
5 | __all__ = [
6 |     'can_infer', 'can_infer_option', 'can_infer_text',
'track_progress_rich', 7 | ] 8 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/utils/matching_util.py: -------------------------------------------------------------------------------- 1 | import string 2 | import copy as cp 3 | import os 4 | from ..smp import * 5 | 6 | 7 | def can_infer_option(answer, choices): 8 | verbose = os.environ.get('VERBOSE', 0) 9 | # Choices is a dictionary 10 | if 'Failed to obtain answer via API' in answer: 11 | return False 12 | 13 | reject_to_answer = [ 14 | "Sorry, I can't help with images of people yet.", 15 | "I can't process this file.", 16 | "I'm sorry, but without the image provided", 17 | 'Cannot determine the answer' 18 | ] 19 | for err in reject_to_answer: 20 | if err in answer: 21 | return 'Z' 22 | 23 | def count_choice(splits, choices, prefix='', suffix=''): 24 | cnt = 0 25 | for c in choices: 26 | if prefix + c + suffix in splits: 27 | cnt += 1 28 | return cnt 29 | 30 | answer_mod = cp.copy(answer) 31 | chars = '.()[],:;!*#{}' 32 | for c in chars: 33 | answer_mod = answer_mod.replace(c, ' ') 34 | 35 | splits = [x.strip() for x in answer_mod.split()] 36 | count = count_choice(splits, choices) 37 | 38 | if count == 1: 39 | for ch in choices: 40 | if 'A' in splits and len(splits) > 3 and verbose: 41 | logger = get_logger('Evaluation') 42 | logger.info(f'A might be a quantifier in the string: {answer}.') 43 | return False 44 | if ch in splits: 45 | return ch 46 | elif count == 0 and count_choice(splits, {'Z', ''}) == 1: 47 | return 'Z' 48 | return False 49 | 50 | 51 | def can_infer_text(answer, choices): 52 | answer = answer.lower() 53 | assert isinstance(choices, dict) 54 | for k in choices: 55 | assert k in string.ascii_uppercase 56 | choices[k] = str(choices[k]).lower() 57 | cands = [] 58 | for k in choices: 59 | if choices[k] in answer: 60 | cands.append(k) 61 | if len(cands) == 1: 62 | return cands[0] 63 | return False 64 | 65 | 66 | def can_infer(answer, choices): 67 | answer = str(answer) 68 | copt = can_infer_option(answer, choices) 69 | return copt if copt else can_infer_text(answer, choices) 70 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/__init__.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | torch.set_grad_enabled(False) 4 | torch.manual_seed(1234) 5 | from .base import BaseModel 6 | from .cogvlm import CogVlm, GLM4v 7 | from .emu import Emu 8 | from .eagle_x import Eagle 9 | from .idefics import IDEFICS, IDEFICS2 10 | from .instructblip import InstructBLIP 11 | from .llava import LLaVA, LLaVA_Next, LLaVA_XTuner, LLaVA_Next2, LLaVA_OneVision 12 | from .minicpm_v import MiniCPM_V, MiniCPM_Llama3_V, MiniCPM_V_2_6 13 | from .minigpt4 import MiniGPT4 14 | from .mmalaya import MMAlaya, MMAlaya2 15 | from .monkey import Monkey, MonkeyChat 16 | from .moondream import Moondream1, Moondream2 17 | from .minimonkey import MiniMonkey 18 | from .mplug_owl2 import mPLUG_Owl2 19 | from .omnilmm import OmniLMM12B 20 | from .open_flamingo import OpenFlamingo 21 | from .pandagpt import PandaGPT 22 | from .qwen_vl import QwenVL, QwenVLChat 23 | from .qwen2_vl import Qwen2VLChat 24 | from .transcore_m import TransCoreM 25 | from .visualglm import VisualGLM 26 | from .xcomposer import ShareCaptioner, XComposer, XComposer2, XComposer2_4KHD, XComposer2d5 27 | from .yi_vl import Yi_VL 28 | from .internvl_chat import InternVLChat 29 | from .deepseek_vl import DeepSeekVL 30 | from .mgm 
import Mini_Gemini 31 | from .bunnyllama3 import BunnyLLama3 32 | from .vxverse import VXVERSE 33 | from .paligemma import PaliGemma 34 | from .qh_360vl import QH_360VL 35 | from .phi3_vision import Phi3Vision, Phi3_5Vision 36 | from .wemm import WeMM 37 | from .cambrian import Cambrian 38 | from .chameleon import Chameleon 39 | from .video_llm import VideoLLaVA, VideoLLaVA_HF, Chatunivi, VideoChatGPT, LLaMAVID, VideoChat2_HD, PLLaVA 40 | from .vila import VILA 41 | from .ovis import Ovis, Ovis1_6 42 | from .mantis import Mantis 43 | from .mixsense import LLama3Mixsense 44 | from .parrot import Parrot 45 | from .omchat import OmChat 46 | from .rbdash import RBDash 47 | from .xgen_mm import XGenMM 48 | from .slime import SliME 49 | from .mplug_owl3 import mPLUG_Owl3 50 | from .pixtral import Pixtral 51 | from .llama_vision import llama_vision 52 | from .llava_uhd import LLaVA_UHD 53 | from .llava_uhd2 import LLaVA_UHD2 -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/chameleon.py: -------------------------------------------------------------------------------- 1 | import os.path as osp 2 | import warnings 3 | from .base import BaseModel 4 | from ..smp import * 5 | from PIL import Image 6 | import torch 7 | 8 | 9 | class Chameleon(BaseModel): 10 | 11 | INSTALL_REQ = False 12 | INTERLEAVE = True 13 | 14 | def __init__(self, model_path='facebook/chameleon-7b', **kwargs): 15 | try: 16 | from transformers import ChameleonProcessor, ChameleonForConditionalGeneration 17 | except: 18 | warnings.warn('Please install the latest transformers.') 19 | 20 | processor = ChameleonProcessor.from_pretrained(model_path) 21 | model = ChameleonForConditionalGeneration.from_pretrained(model_path, torch_dtype=torch.bfloat16) 22 | 23 | self.model = model.cuda().eval() 24 | self.processor = processor 25 | 26 | def generate_inner(self, message, dataset=None): 27 | content, images = '', [] 28 | for x in message: 29 | if x['type'] == 'text': 30 | content += x['value'] 31 | elif x['type'] == 'image': 32 | content += '\n' 33 | images.append(Image.open(x['value'])) 34 | 35 | inputs = self.processor( 36 | text=[content], 37 | images=images, 38 | padding=True, 39 | return_tensors='pt' 40 | ).to(device='cuda', dtype=torch.bfloat16) 41 | generate_ids = self.model.generate(**inputs, max_new_tokens=512) 42 | input_token_len = inputs.input_ids.shape[1] 43 | text = self.processor.batch_decode( 44 | generate_ids[:, input_token_len:], 45 | skip_special_tokens=True, 46 | clean_up_tokenization_spaces=False 47 | )[0] 48 | return text 49 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/instructblip.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from PIL import Image 3 | import os.path as osp 4 | import sys 5 | from .base import BaseModel 6 | from ..smp import * 7 | 8 | 9 | class InstructBLIP(BaseModel): 10 | 11 | INSTALL_REQ = True 12 | INTERLEAVE = False 13 | 14 | def __init__(self, name): 15 | self.config_map = { 16 | 'instructblip_7b': 'misc/blip2_instruct_vicuna7b.yaml', 17 | 'instructblip_13b': 'misc/blip2_instruct_vicuna13b.yaml', 18 | } 19 | 20 | self.file_path = __file__ 21 | config_root = osp.dirname(self.file_path) 22 | 23 | try: 24 | from lavis.models import load_preprocess 25 | from omegaconf import OmegaConf 26 | from lavis.common.registry import registry 27 | except: 28 | warnings.warn('Please install lavis before using InstructBLIP. 
') 29 | sys.exit(-1) 30 | 31 | assert name in self.config_map 32 | cfg_path = osp.join(config_root, self.config_map[name]) 33 | cfg = OmegaConf.load(cfg_path) 34 | 35 | model_cfg = cfg.model 36 | assert osp.exists(model_cfg.llm_model) or splitlen(model_cfg.llm_model) == 2 37 | model_cls = registry.get_model_class(name='blip2_vicuna_instruct') 38 | model = model_cls.from_config(model_cfg) 39 | model.eval() 40 | 41 | self.device = torch.device('cuda') if torch.cuda.is_available() else 'cpu' 42 | device = self.device 43 | model.to(device) 44 | self.model = model 45 | self.kwargs = {'max_length': 512} 46 | 47 | preprocess_cfg = cfg.preprocess 48 | vis_processors, _ = load_preprocess(preprocess_cfg) 49 | self.vis_processors = vis_processors 50 | 51 | def generate_inner(self, message, dataset=None): 52 | prompt, image_path = self.message_to_promptimg(message, dataset=dataset) 53 | vis_processors = self.vis_processors 54 | raw_image = Image.open(image_path).convert('RGB') 55 | image_tensor = vis_processors['eval'](raw_image).unsqueeze(0).to(self.device) 56 | outputs = self.model.generate(dict(image=image_tensor, prompt=prompt)) 57 | return outputs[0] 58 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/llava/__init__.py: -------------------------------------------------------------------------------- 1 | from .llava import LLaVA, LLaVA_Next, LLaVA_Next2, LLaVA_OneVision 2 | from .llava_xtuner import LLaVA_XTuner 3 | 4 | __all__ = ['LLaVA', 'LLaVA_Next', 'LLaVA_XTuner', 'LLaVA_Next2', 'LLaVA_OneVision'] 5 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/misc/blip2_instruct_vicuna13b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: instruct_vicuna13b 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna13b_trimmed.pth" 12 | finetuned: "" 13 | 14 | # vit encoder 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # path to Vicuna checkpoint 25 | llm_model: "Please set the path to your vicuna-13b-v1.1" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip2_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/misc/blip2_instruct_vicuna7b.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | model: 7 | arch: instruct_vicuna7b 8 | load_finetuned: False 9 | load_pretrained: True 10 | 11 | pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna7b_trimmed.pth" 12 | finetuned: "" 13 | 14 | # vit encoder 15 | image_size: 224 16 | drop_path_rate: 0 17 | use_grad_checkpoint: False 18 | vit_precision: "fp16" 19 | freeze_vit: True 20 | 21 | # Q-Former 22 | num_query_token: 32 23 | 24 | # path to Vicuna checkpoint 25 | llm_model: "Please set the path to your vicuna-7b-v1.1" 26 | 27 | # generation configs 28 | prompt: "" 29 | 30 | 31 | preprocess: 32 | vis_processor: 33 | train: 34 | name: "blip2_image_train" 35 | image_size: 224 36 | eval: 37 | name: "blip_image_eval" 38 | image_size: 224 39 | text_processor: 40 | train: 41 | name: "blip_caption" 42 | eval: 43 | name: "blip_caption" 44 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/misc/minigpt4_13b_eval.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: minigpt4 3 | model_type: pretrain_vicuna_7b 4 | max_txt_len: 160 5 | end_sym: "###" 6 | low_resource: True 7 | prompt_template: '###Human: {} ###Assistant: ' 8 | ckpt: "please set this value to the path of pretrained checkpoint" 9 | 10 | # vit encoder 11 | image_size: 224 12 | drop_path_rate: 0 13 | use_grad_checkpoint: False 14 | vit_precision: "fp16" 15 | freeze_vit: True 16 | freeze_qformer: True 17 | 18 | # Q-Former 19 | num_query_token: 32 20 | 21 | # generation configs 22 | prompt: "" 23 | 24 | llama_model: "please set this value to the path of vicuna-13b-v0" 25 | 26 | datasets: 27 | cc_sbu_align: 28 | vis_processor: 29 | train: 30 | name: "blip2_image_eval" 31 | image_size: 224 32 | text_processor: 33 | train: 34 | name: "blip_caption" 35 | 36 | run: 37 | task: image_text_pretrain 38 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/misc/minigpt4_7b_eval.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: minigpt4 3 | model_type: pretrain_vicuna_7b 4 | max_txt_len: 160 5 | end_sym: "###" 6 | low_resource: True 7 | prompt_template: '###Human: {} ###Assistant: ' 8 | ckpt: "please set this value to the path of pretrained checkpoint" 9 | 10 | # vit encoder 11 | image_size: 224 12 | drop_path_rate: 0 13 | use_grad_checkpoint: False 14 | vit_precision: "fp16" 15 | freeze_vit: True 16 | freeze_qformer: True 17 | 18 | # Q-Former 19 | num_query_token: 32 20 | 21 | # generation configs 22 | prompt: "" 23 | 24 | llama_model: "please set this value to the path of vicuna-7b-v0" 25 | 26 | 27 | datasets: 28 | cc_sbu_align: 29 | vis_processor: 30 | train: 31 | name: "blip2_image_eval" 32 | image_size: 224 33 | text_processor: 34 | train: 35 | name: "blip_caption" 36 | 37 | run: 38 | task: image_text_pretrain 39 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/misc/minigptv2_eval.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: minigpt_v2 3 | model_type: pretrain 4 | max_txt_len: 160 5 | end_sym: "" 6 | low_resource: True 7 | prompt_template: '[INST] {} [/INST]' 8 | ckpt: "please set this value to the path of pretrained 
checkpoint" 9 | lora_r: 64 10 | lora_alpha: 16 11 | 12 | # vit encoder 13 | image_size: 448 14 | drop_path_rate: 0 15 | use_grad_checkpoint: False 16 | vit_precision: "fp16" 17 | freeze_vit: True 18 | 19 | # generation configs 20 | prompt: "" 21 | 22 | # LLM 23 | llama_model: "please set this value to the path of llama2-chat-7b" 24 | 25 | datasets: 26 | cc_sbu_align: 27 | vis_processor: 28 | train: 29 | name: "blip2_image_eval" 30 | image_size: 448 31 | text_processor: 32 | train: 33 | name: "blip_caption" 34 | 35 | run: 36 | task: image_text_pretrain 37 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/mixsense.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import transformers 3 | from transformers import AutoModelForCausalLM, AutoTokenizer 4 | from PIL import Image 5 | import warnings 6 | 7 | from .base import BaseModel 8 | from ..smp import * 9 | 10 | 11 | class LLama3Mixsense(BaseModel): 12 | 13 | INSTALL_REQ = False 14 | INTERLEAVE = False 15 | 16 | def __init__(self, model_path='Zero-Vision/Llama-3-MixSenseV1_1', **kwargs): 17 | assert model_path is not None 18 | transformers.logging.set_verbosity_error() 19 | transformers.logging.disable_progress_bar() 20 | warnings.filterwarnings('ignore') 21 | self.tokenizer = AutoTokenizer.from_pretrained( 22 | model_path, trust_remote_code=True 23 | ) 24 | self.model = AutoModelForCausalLM.from_pretrained( 25 | model_path, trust_remote_code=True 26 | ).to('cuda').eval() 27 | self.kwargs = kwargs 28 | 29 | def generate_inner(self, message, dataset=None): 30 | prompt, image_path = self.message_to_promptimg(message) 31 | input_ids = self.model.text_process(prompt, self.tokenizer).to(device='cuda') 32 | image = Image.open(image_path).convert('RGB') 33 | image_tensor = self.model.image_process([image]).to(dtype=self.model.dtype, device='cuda') 34 | # generate 35 | with torch.inference_mode(): 36 | output_ids = self.model.generate( 37 | input_ids, 38 | images=image_tensor, 39 | max_new_tokens=2048, 40 | use_cache=True, 41 | eos_token_id=[ 42 | self.tokenizer.eos_token_id, 43 | self.tokenizer.convert_tokens_to_ids(['<|eot_id|>'])[0], 44 | ], 45 | ) 46 | return self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip() 47 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/paligemma.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | import torch 3 | 4 | from .base import BaseModel 5 | from ..smp import * 6 | 7 | 8 | class PaliGemma(BaseModel): 9 | INSTALL_REQ = False 10 | INTERLEAVE = False 11 | 12 | def __init__(self, model_path='google/paligemma-3b-mix-448', **kwargs): 13 | try: 14 | from transformers import AutoProcessor, PaliGemmaForConditionalGeneration 15 | except: 16 | warnings.warn('Please install the latest version transformers.') 17 | sys.exit(-1) 18 | model = PaliGemmaForConditionalGeneration.from_pretrained( 19 | model_path, 20 | torch_dtype=torch.bfloat16, 21 | device_map='cpu', 22 | revision='bfloat16', 23 | ).eval() 24 | self.model = model.cuda() 25 | self.processor = AutoProcessor.from_pretrained(model_path) 26 | self.kwargs = kwargs 27 | 28 | def generate_inner(self, message, dataset=None): 29 | prompt, image_path = self.message_to_promptimg(message, dataset=dataset) 30 | image = Image.open(image_path).convert('RGB') 31 | 32 | model_inputs = self.processor( 33 | text=prompt, images=image, 
return_tensors='pt' 34 | ).to('cuda') 35 | input_len = model_inputs['input_ids'].shape[-1] 36 | 37 | with torch.inference_mode(): 38 | generation = self.model.generate( 39 | **model_inputs, max_new_tokens=512, do_sample=False 40 | ) 41 | generation = generation[0][input_len:] 42 | res = self.processor.decode(generation, skip_special_tokens=True) 43 | return res 44 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/qwen2_vl/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import Qwen2VLChat 2 | from .prompt import Qwen2VLPromptMixin 3 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/video_llm/__init__.py: -------------------------------------------------------------------------------- 1 | from .video_llava import VideoLLaVA, VideoLLaVA_HF 2 | from .videochat2 import VideoChat2_HD 3 | from .chat_uni_vi import Chatunivi 4 | from .video_chatgpt import VideoChatGPT 5 | from .llama_vid import LLaMAVID 6 | from .pllava import PLLaVA 7 | 8 | __all__ = ['VideoLLaVA', 'VideoLLaVA_HF', 'Chatunivi', 'VideoChatGPT', 'LLaMAVID', 'VideoChat2_HD', 'PLLaVA'] 9 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/video_llm/configs/llama_vid/processor/clip-patch14-224/preprocessor_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "crop_size": 224, 3 | "do_center_crop": true, 4 | "do_normalize": true, 5 | "do_resize": true, 6 | "feature_extractor_type": "CLIPFeatureExtractor", 7 | "image_mean": [ 8 | 0.48145466, 9 | 0.4578275, 10 | 0.40821073 11 | ], 12 | "image_std": [ 13 | 0.26862954, 14 | 0.26130258, 15 | 0.27577711 16 | ], 17 | "resample": 3, 18 | "size": 224 19 | } 20 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/video_llm/configs/videochat2_hd.json: -------------------------------------------------------------------------------- 1 | { 2 | "model": { 3 | "model_cls": "VideoChat2_it_hd_mistral", 4 | "vit_blip_model_path": "OpenGVLab/videochat2", 5 | "mistral_model_path": "mistralai/Mistral-7B-Instruct-v0.2", 6 | "videochat2_model_path": "OpenGVLab/VideoChat2_stage2_Mistral_7B", 7 | "freeze_vit": false, 8 | "freeze_qformer": false, 9 | "max_txt_len": 512, 10 | "low_resource": false, 11 | "vision_encoder": { 12 | "name": "vit_l14", 13 | "img_size": 224, 14 | "patch_size": 16, 15 | "d_model": 1024, 16 | "encoder_embed_dim": 1024, 17 | "encoder_depth": 24, 18 | "encoder_num_heads": 16, 19 | "drop_path_rate": 0.0, 20 | "num_frames": 8, 21 | "tubelet_size": 1, 22 | "use_checkpoint": true, 23 | "checkpoint_num": 18, 24 | "pretrained": "", 25 | "return_index": -2, 26 | "vit_add_ln": true, 27 | "ckpt_num_frame": 4 28 | }, 29 | "num_query_token": 32, 30 | "qformer_hidden_dropout_prob": 0.1, 31 | "qformer_attention_probs_dropout_prob": 0.1, 32 | "qformer_drop_path_rate": 0.2, 33 | "extra_num_query_token": 64, 34 | "qformer_text_input": true, 35 | "system": "", 36 | "start_token": "", 38 | "add_second_msg": true, 39 | "img_start_token": "", 40 | "img_end_token": "", 41 | "random_shuffle": true, 42 | "return_question_instruction": false, 43 | "use_flash_attention": true, 44 | "use_lora": false, 45 | "lora_r": 16, 46 | "lora_alpha": 32, 47 | "lora_dropout": 0.1, 48 | "dynamic_config": { 49 | "local_size": 224, 50 | "hd_num": 6, 51 | "padding": false, 52 | "add_global": true 53 | } 
54 | }, 55 | "device": "cuda" 56 | } 57 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/visualglm.py: -------------------------------------------------------------------------------- 1 | import warnings 2 | from .base import BaseModel 3 | from ..smp import * 4 | 5 | 6 | class VisualGLM(BaseModel): 7 | 8 | INSTALL_REQ = False 9 | INTERLEAVE = False 10 | 11 | def __init__(self, model_path='THUDM/visualglm-6b', **kwargs): 12 | try: 13 | import sat 14 | except: 15 | warnings.warn('Please install SwissArmyTransformer to use VisualGLM') 16 | assert model_path is not None 17 | self.model_path = model_path 18 | 19 | from transformers import AutoModel 20 | from transformers import AutoTokenizer 21 | self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) 22 | model = AutoModel.from_pretrained(model_path, trust_remote_code=True).half().cuda() 23 | self.model = model 24 | self.kwargs = kwargs 25 | warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ') 26 | 27 | def generate_inner(self, message, dataset=None): 28 | prompt, image_path = self.message_to_promptimg(message, dataset=dataset) 29 | output, _ = self.model.chat( 30 | image_path=image_path, 31 | tokenizer=self.tokenizer, 32 | query=prompt, 33 | history=[], 34 | **self.kwargs 35 | ) 36 | return output 37 | -------------------------------------------------------------------------------- /VLMEvalKit/vlmeval/vlm/xcomposer/__init__.py: -------------------------------------------------------------------------------- 1 | from .sharecaptioner import ShareCaptioner 2 | from .xcomposer import XComposer 3 | from .xcomposer2 import XComposer2 4 | from .xcomposer2_4KHD import XComposer2_4KHD 5 | from .xcomposer2d5 import XComposer2d5 6 | 7 | __all__ = ['ShareCaptioner', 'XComposer', 'XComposer2', 'XComposer2_4KHD', 'XComposer2d5'] 8 | -------------------------------------------------------------------------------- /cog.yaml: -------------------------------------------------------------------------------- 1 | # Configuration for Cog ⚙️ 2 | # Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md 3 | 4 | build: 5 | gpu: true 6 | 7 | python_version: "3.11" 8 | 9 | python_packages: 10 | - "torch==2.0.1" 11 | - "accelerate==0.21.0" 12 | - "bitsandbytes==0.41.0" 13 | - "deepspeed==0.9.5" 14 | - "einops-exts==0.0.4" 15 | - "einops==0.6.1" 16 | - "gradio==3.35.2" 17 | - "gradio_client==0.2.9" 18 | - "httpx==0.24.0" 19 | - "markdown2==2.4.10" 20 | - "numpy==1.26.0" 21 | - "peft==0.4.0" 22 | - "scikit-learn==1.2.2" 23 | - "sentencepiece==0.1.99" 24 | - "shortuuid==1.0.11" 25 | - "timm==0.6.13" 26 | - "tokenizers==0.13.3" 27 | - "torch==2.0.1" 28 | - "torchvision==0.15.2" 29 | - "transformers==4.31.0" 30 | - "wandb==0.15.12" 31 | - "wavedrom==2.0.3.post3" 32 | - "Pygments==2.16.1" 33 | run: 34 | - curl -o /usr/local/bin/pget -L "https://github.com/replicate/pget/releases/download/v0.0.3/pget" && chmod +x /usr/local/bin/pget 35 | 36 | # predict.py defines how predictions are run on your model 37 | predict: "predict.py:Predictor" 38 | -------------------------------------------------------------------------------- /doc/HiWin.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/LLaVA-UHD/08c85e208cd12c825303619723e4fe20e64f7f58/doc/HiWin.png -------------------------------------------------------------------------------- /doc/arch.png: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/LLaVA-UHD/08c85e208cd12c825303619723e4fe20e64f7f58/doc/arch.png -------------------------------------------------------------------------------- /doc/pyramid.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/LLaVA-UHD/08c85e208cd12c825303619723e4fe20e64f7f58/doc/pyramid.png -------------------------------------------------------------------------------- /eval.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # mkdir -p "./exp_results/$1" 4 | # echo 'made a dir ./exp_results/'$1 5 | 6 | NVIDIA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 7 | 8 | CUDA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES bash scripts/v1_5/eval/mme.sh $1 #> ./exp_results/$1/mme_result.log 9 | echo 'mme done' 10 | 11 | CUDA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES bash scripts/v1_5/eval/ai2d.sh $1 #> ./exp_results/$1/ai2d_result.log 12 | echo 'ai2d done' 13 | 14 | CUDA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES bash scripts/v1_5/eval/docvqa_val.sh $1 #> ./exp_results/$1/docvqa_eval_result.log 15 | echo 'doc done' 16 | 17 | CUDA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES bash scripts/v1_5/eval/chartqa.sh $1 #> ./exp_results/$1/chartqa_result.log 18 | echo 'chart done' 19 | 20 | # traditional 21 | CUDA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES bash scripts/v1_5/eval/textvqa.sh $1 #> ./exp_results/$1/textvqa_result.log 22 | echo 'textvqa done' 23 | 24 | CUDA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES bash scripts/v1_5/eval/gqa.sh $1 #> ./exp_results/$1/gqa_result.log 25 | echo 'gqa done' 26 | 27 | CUDA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES bash scripts/v1_5/eval/sqa.sh $1 #> ./exp_results/$1/scienceqa_result.log 28 | echo 'sqa done' 29 | 30 | echo 'All eval done, exiting successfully.' 
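Usage note for eval.sh: the script takes one positional argument, the experiment name, and forwards it as $1 to every per-benchmark script (invocation: bash eval.sh my_experiment, where my_experiment is a placeholder run name). Below is a minimal Python sketch of the same loop, assuming the scripts/v1_5/eval/*.sh layout used above; the function name, default GPU list, and the placeholder experiment name are illustrative only and not part of the repository.

# Minimal sketch (illustrative): a Python equivalent of the eval.sh loop above.
# The benchmark names and script paths come from eval.sh; everything else is assumed.
import os
import subprocess

BENCHMARKS = ["mme", "ai2d", "docvqa_val", "chartqa", "textvqa", "gqa", "sqa"]

def run_all_evals(exp_name, gpus="0,1,2,3,4,5,6,7"):
    env = dict(os.environ, CUDA_VISIBLE_DEVICES=gpus)
    for bench in BENCHMARKS:
        # mirrors: CUDA_VISIBLE_DEVICES=$NVIDIA_VISIBLE_DEVICES bash scripts/v1_5/eval/{bench}.sh $1
        subprocess.run(["bash", f"scripts/v1_5/eval/{bench}.sh", exp_name], env=env, check=True)
        print(f"{bench} done")
    print("All eval done, exiting successfully.")

if __name__ == "__main__":
    run_all_evals("my_experiment")  # placeholder experiment name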
-------------------------------------------------------------------------------- /featup/__init__.py: -------------------------------------------------------------------------------- 1 | from featup.upsamplers import JBULearnedRange -------------------------------------------------------------------------------- /featup/adaptive_conv_cuda/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/LLaVA-UHD/08c85e208cd12c825303619723e4fe20e64f7f58/featup/adaptive_conv_cuda/__init__.py -------------------------------------------------------------------------------- /featup/adaptive_conv_cuda/adaptive_conv.py: -------------------------------------------------------------------------------- 1 | from torch.autograd import Function 2 | import torch 3 | 4 | import adaptive_conv_cuda_impl as cuda_impl 5 | import adaptive_conv_cpp_impl as cpp_impl 6 | 7 | torch.manual_seed(42) 8 | 9 | 10 | class AdaptiveConv(Function): 11 | 12 | @staticmethod 13 | def forward(ctx, input, filters): 14 | ctx.save_for_backward(filters, input) 15 | b, h2, w2, f1, f2 = filters.shape 16 | assert f1 == f2 17 | 18 | if input.is_cuda: 19 | assert filters.is_cuda 20 | result = cuda_impl.forward(input, filters) 21 | else: 22 | result = cpp_impl.forward(input, filters) 23 | 24 | return result 25 | 26 | @staticmethod 27 | def backward(ctx, grad_output): 28 | filters, input = ctx.saved_tensors 29 | grad_input = grad_filters = None 30 | b, h2, w2, f1, f2 = filters.shape 31 | assert f1 == f2 32 | 33 | grad_output = grad_output.contiguous() 34 | if grad_output.is_cuda: 35 | assert input.is_cuda 36 | assert filters.is_cuda 37 | if ctx.needs_input_grad[0]: 38 | grad_input = cuda_impl.grad_input(grad_output, filters) 39 | if ctx.needs_input_grad[1]: 40 | grad_filters = cuda_impl.grad_filters(grad_output, input) 41 | else: 42 | if ctx.needs_input_grad[0]: 43 | grad_input = cpp_impl.grad_input(grad_output, filters) 44 | if ctx.needs_input_grad[1]: 45 | grad_filters = cpp_impl.grad_filters(grad_output, input) 46 | 47 | return grad_input, grad_filters 48 | -------------------------------------------------------------------------------- /featup/adaptive_conv_cuda/adaptive_conv_cuda.cpp: -------------------------------------------------------------------------------- 1 | #include <torch/extension.h> 2 | using torch::Tensor; 3 | 4 | // CUDA forward declarations 5 | 6 | Tensor adaptive_conv_cuda_forward(Tensor input, Tensor filters); 7 | Tensor adaptive_conv_cuda_grad_input(Tensor grad_output, Tensor filters); 8 | Tensor adaptive_conv_cuda_grad_filters(Tensor grad_output, Tensor input); 9 | 10 | // C++ interface 11 | 12 | // NOTE: AT_ASSERT has become AT_CHECK on master after 0.4.
13 | #define CHECK_CUDA(x) AT_ASSERTM(x.type().is_cuda(), #x " must be a CUDA tensor") 14 | #define CHECK_CONTIGUOUS(x) AT_ASSERTM(x.is_contiguous(), #x " must be contiguous") 15 | #define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) 16 | 17 | Tensor adaptive_conv_forward(Tensor input, Tensor filters) { 18 | //CHECK_INPUT(input); 19 | //CHECK_INPUT(filters); 20 | return adaptive_conv_cuda_forward(input, filters); 21 | } 22 | 23 | Tensor adaptive_conv_grad_input(Tensor grad_output, Tensor filters) { 24 | //CHECK_INPUT(grad_output); 25 | //CHECK_INPUT(filters); 26 | return adaptive_conv_cuda_grad_input(grad_output, filters); 27 | } 28 | 29 | Tensor adaptive_conv_grad_filters(Tensor grad_output, Tensor input) { 30 | //CHECK_INPUT(grad_output); 31 | //CHECK_INPUT(input); 32 | return adaptive_conv_cuda_grad_filters(grad_output, input); 33 | } 34 | 35 | PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { 36 | m.def("forward", &adaptive_conv_forward, "adaptive_conv forward"); 37 | m.def("grad_input", &adaptive_conv_grad_input, "adaptive_conv grad_input"); 38 | m.def("grad_filters", &adaptive_conv_grad_filters, "adaptive_conv grad_filters"); 39 | } 40 | -------------------------------------------------------------------------------- /featup/configs/vdim_upsampler.yaml: -------------------------------------------------------------------------------- 1 | # Environment Args 2 | output_root: '.' 3 | pytorch_data_dir: './datasets' 4 | submitting_to_aml: false 5 | 6 | # Dataset args 7 | train_dataset: "cocostuff" 8 | val_dataset: "coco_validation50" 9 | res: 336 #224 or 336 10 | 11 | # Model Args 12 | model_type: "clip-large" #vit or clip-large 13 | activation_type: "token" 14 | is_norm: False 15 | is_high_res: False 16 | dim: 1024 #384 or 1024 17 | 18 | # Upsampling args 19 | outlier_detection: True 20 | upsampler_type: "jbu_4x_stack" 21 | downsampler_type: "attention" 22 | max_pad: 30 23 | max_zoom: 2 24 | n_jitters: 2 25 | random_projection: 30 26 | crf_weight: 0.001 27 | filter_ent_weight: 0.0 28 | tv_weight: 0.0 29 | 30 | implicit_sup_weight: 1.0 31 | 32 | # Training args 33 | batch_size: 2 34 | epochs: 1 35 | num_gpus: 8 36 | num_workers: 24 37 | lr: 1e-3 38 | 39 | # No need to change 40 | hydra: 41 | run: 42 | dir: "." 43 | output_subdir: ~ 44 | 45 | -------------------------------------------------------------------------------- /featup/datasets/DAVIS.py: -------------------------------------------------------------------------------- 1 | from torchvision import transforms 2 | import os 3 | from PIL import Image 4 | from torch.utils.data import Dataset 5 | 6 | 7 | class DAVIS(Dataset): 8 | def __init__(self, root, video_name, transform=None): 9 | """ 10 | Args: 11 | root (string): Directory with all the videos. 12 | video_name (string): Name of the specific video. 13 | transform (callable, optional): Optional transform to be applied on a sample. 
14 | """ 15 | self.root_dir = os.path.join(root, "DAVIS/JPEGImages/480p/", video_name) 16 | self.frames = os.listdir(self.root_dir) 17 | self.transform = transform 18 | 19 | def __len__(self): 20 | return len(self.frames) 21 | 22 | def __getitem__(self, idx): 23 | img_path = os.path.join(self.root_dir, self.frames[idx]) 24 | image = Image.open(img_path).convert("RGB") 25 | 26 | if self.transform: 27 | image = self.transform(image) 28 | 29 | return {"img": image, "img_path": img_path} 30 | 31 | 32 | if __name__ == "__main__": 33 | transform = transforms.Compose([ 34 | transforms.Resize((256, 256)), 35 | transforms.ToTensor() 36 | ]) 37 | 38 | davis_dataset = DAVIS(root='/pytorch-data', video_name="motocross-jump", transform=transform) 39 | 40 | frames = davis_dataset[0] 41 | 42 | print("here") 43 | -------------------------------------------------------------------------------- /featup/datasets/DOC.py: -------------------------------------------------------------------------------- 1 | from os.path import join 2 | 3 | import numpy as np 4 | import torch 5 | import torch.multiprocessing 6 | from PIL import Image 7 | from torch.utils.data import Dataset 8 | 9 | class Doc(Dataset): 10 | def __init__(self, 11 | root, 12 | split, 13 | transform, 14 | target_transform, 15 | subset=None): 16 | super(Doc, self).__init__() 17 | self.split = split 18 | self.root = join(root, "Doc") 19 | self.transform = transform 20 | self.label_transform = target_transform 21 | self.subset = subset 22 | 23 | if self.subset is None: 24 | self.image_list = "Doc20000.txt" 25 | elif self.subset == 'Doc_validation50': 26 | self.image_list = "Doc_validation50.txt" 27 | 28 | assert self.split in ["train", "val", "train+val"] 29 | split_dirs = { 30 | "train": ["train"], 31 | "val": ["val"], 32 | "train+val": ["train", "val"] 33 | } 34 | 35 | self.image_files = [] 36 | for split_dir in split_dirs[self.split]: 37 | with open(join(self.root, "curated", self.image_list), "r") as f: 38 | img_names = [fn.rstrip() for fn in f.readlines()] 39 | for img_name in img_names: 40 | self.image_files.append(join(self.root, "images", img_name)) 41 | 42 | def __len__(self): 43 | return len(self.image_files) 44 | 45 | def __getitem__(self, index): 46 | image_path = self.image_files[index] 47 | batch = {} 48 | img = self.transform(Image.open(image_path).convert("RGB")) 49 | batch["img"] = img 50 | batch["img_path"] = image_path 51 | return batch 52 | -------------------------------------------------------------------------------- /featup/datasets/DocSceneText.py: -------------------------------------------------------------------------------- 1 | from os.path import join 2 | 3 | import numpy as np 4 | import torch 5 | import torch.multiprocessing 6 | from PIL import Image 7 | from torch.utils.data import Dataset 8 | 9 | class DocSceneText(Dataset): 10 | def __init__(self, 11 | root, 12 | split, 13 | transform, 14 | target_transform, 15 | subset=None): 16 | super(DocSceneText, self).__init__() 17 | self.split = split 18 | self.root = join(root, "224DocSceneText") 19 | self.transform = transform 20 | self.label_transform = target_transform 21 | self.subset = subset 22 | 23 | if self.subset is None: 24 | self.image_list = "224docSceneText.txt" 25 | 26 | self.image_files = [] 27 | with open(join(self.root, "curated", self.image_list), "r") as f: 28 | img_names = [fn.rstrip() for fn in f.readlines()] 29 | for img_name in img_names: 30 | self.image_files.append(join(self.root, img_name)) 31 | 32 | def __len__(self): 33 | return len(self.image_files) 
34 | 35 | def __getitem__(self, index): 36 | image_path = self.image_files[index] 37 | batch = {} 38 | img = self.transform(Image.open(image_path).convert("RGB")) 39 | batch["img"] = img 40 | batch["img_path"] = image_path 41 | return batch 42 | -------------------------------------------------------------------------------- /featup/datasets/EmbeddingFile.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from torch.utils.data import Dataset 3 | 4 | 5 | class EmbeddingFile(Dataset): 6 | """ 7 | modified from: https://pytorch.org/docs/stable/_modules/torchvision/datasets/folder.html#ImageFolder 8 | uses cached directory listing if available rather than walking directory 9 | Attributes: 10 | classes (list): List of the class names. 11 | class_to_idx (dict): Dict with items (class_name, class_index). 12 | samples (list): List of (sample path, class_index) tuples 13 | targets (list): The class_index value for each image in the dataset 14 | """ 15 | 16 | def __init__(self, file, with_images=False): 17 | super(Dataset, self).__init__() 18 | self.file = file 19 | loaded = np.load(file) 20 | self.feats = loaded["feats"] 21 | self.labels = loaded["labels"] 22 | self.imgs = None 23 | self.with_images = with_images 24 | if with_images: 25 | self.imgs = loaded["images"] 26 | else: 27 | self.imgs = None 28 | 29 | def dim(self): 30 | return self.feats.shape[1] 31 | 32 | def num_classes(self): 33 | return self.labels.max() + 1 34 | 35 | def __getitem__(self, index): 36 | if self.imgs is not None: 37 | return self.feats[index], self.labels[index], self.imgs[index] 38 | return self.feats[index], self.labels[index] 39 | 40 | def __len__(self): 41 | return len(self.labels) 42 | 43 | 44 | class EmbeddingAndImage(Dataset): 45 | def __init__(self, file, dataset): 46 | super(Dataset, self).__init__() 47 | self.file = file 48 | loaded = np.load(file) 49 | self.feats = loaded["feats"] 50 | self.labels = loaded["labels"] 51 | self.imgs = dataset 52 | 53 | def dim(self): 54 | return self.feats.shape[1] 55 | 56 | def num_classes(self): 57 | return self.labels.max() + 1 58 | 59 | def __getitem__(self, index): 60 | return self.feats[index], self.labels[index], self.imgs[index] 61 | 62 | def __len__(self): 63 | return len(self.labels) 64 | -------------------------------------------------------------------------------- /featup/datasets/HTML.py: -------------------------------------------------------------------------------- 1 | from os.path import join 2 | 3 | import numpy as np 4 | import torch 5 | import torch.multiprocessing 6 | from PIL import Image 7 | from torch.utils.data import Dataset 8 | 9 | class HTML(Dataset): 10 | def __init__(self, 11 | root, 12 | split, 13 | transform, 14 | target_transform, 15 | subset=None): 16 | super(HTML, self).__init__() 17 | self.split = split 18 | self.root = join(root, "HTML") 19 | self.transform = transform 20 | self.label_transform = target_transform 21 | self.subset = subset 22 | 23 | if self.subset is None: 24 | self.image_list = "HTML20000.txt" 25 | elif self.subset == 'HTML_validation50': 26 | self.image_list = "HTML_validation50.txt" 27 | 28 | assert self.split in ["train", "val", "train+val"] 29 | split_dirs = { 30 | "train": ["train"], 31 | "val": ["val"], 32 | "train+val": ["train", "val"] 33 | } 34 | 35 | self.image_files = [] 36 | for split_dir in split_dirs[self.split]: 37 | with open(join(self.root, "curated", self.image_list), "r") as f: 38 | img_names = [fn.rstrip() for fn in f.readlines()] 39 | for 
img_name in img_names: 40 | self.image_files.append(join(self.root, "images", img_name)) 41 | 42 | def __len__(self): 43 | return len(self.image_files) 44 | 45 | def __getitem__(self, index): 46 | image_path = self.image_files[index] 47 | batch = {} 48 | img = self.transform(Image.open(image_path).convert("RGB")) 49 | batch["img"] = img 50 | batch["img_path"] = image_path 51 | return batch 52 | -------------------------------------------------------------------------------- /featup/datasets/JitteredImage.py: -------------------------------------------------------------------------------- 1 | import random 2 | 3 | import torch 4 | import torch.nn.functional as F 5 | from torch.utils.data import Dataset 6 | 7 | 8 | def apply_jitter(img, max_pad, transform_params): 9 | h, w = img.shape[2:] 10 | 11 | padded = F.pad(img, [max_pad] * 4, mode="reflect") 12 | 13 | zoom = transform_params["zoom"].item() 14 | x = transform_params["x"].item() 15 | y = transform_params["y"].item() 16 | flip = transform_params["flip"].item() 17 | 18 | if zoom > 1.0: 19 | zoomed = F.interpolate(padded, scale_factor=zoom, mode="bilinear") 20 | else: 21 | zoomed = padded 22 | 23 | cropped = zoomed[:, :, x:h + x, y:w + y] 24 | 25 | if flip: 26 | return torch.flip(cropped, [3]) 27 | else: 28 | return cropped 29 | 30 | 31 | def sample_transform(use_flips, max_pad, max_zoom, h, w): 32 | if use_flips: 33 | flip = random.random() > .5 34 | else: 35 | flip = False 36 | 37 | apply_zoom = random.random() > .5 38 | if apply_zoom: 39 | zoom = random.random() * (max_zoom - 1) + 1 40 | else: 41 | zoom = 1.0 42 | 43 | valid_area_h = (int((h + max_pad * 2) * zoom) - h) + 1 44 | valid_area_w = (int((w + max_pad * 2) * zoom) - w) + 1 45 | 46 | return { 47 | "x": torch.tensor(torch.randint(0, valid_area_h, ()).item()), 48 | "y": torch.tensor(torch.randint(0, valid_area_w, ()).item()), 49 | "zoom": torch.tensor(zoom), 50 | "flip": torch.tensor(flip) 51 | } 52 | 53 | 54 | class JitteredImage(Dataset): 55 | 56 | def __init__(self, img, length, use_flips, max_zoom, max_pad): 57 | self.img = img 58 | self.length = length 59 | self.use_flips = use_flips 60 | self.max_zoom = max_zoom 61 | self.max_pad = max_pad 62 | 63 | def __len__(self): 64 | return self.length 65 | 66 | def __getitem__(self, item): 67 | h, w = self.img.shape[2:] 68 | transform_params = sample_transform(self.use_flips, self.max_pad, self.max_zoom, h, w) 69 | return apply_jitter(self.img, self.max_pad, transform_params).squeeze(0), transform_params 70 | -------------------------------------------------------------------------------- /featup/datasets/SCENE.py: -------------------------------------------------------------------------------- 1 | from os.path import join 2 | 3 | import numpy as np 4 | import torch 5 | import torch.multiprocessing 6 | from PIL import Image 7 | from torch.utils.data import Dataset 8 | 9 | class Scene(Dataset): 10 | def __init__(self, 11 | root, 12 | split, 13 | transform, 14 | target_transform, 15 | subset=None): 16 | super(Scene, self).__init__() 17 | self.split = split 18 | self.root = join(root, "Scene") 19 | self.transform = transform 20 | self.label_transform = target_transform 21 | self.subset = subset 22 | 23 | if self.subset is None: 24 | self.image_list = "Scenepuretext.txt" 25 | elif self.subset == 'Scene_validation50': 26 | self.image_list = "Scene_validation50.txt" 27 | 28 | assert self.split in ["train", "val", "train+val"] 29 | split_dirs = { 30 | "train": ["train"], 31 | "val": ["val"], 32 | "train+val": ["train", "val"] 33 | } 34 | 35 
| self.image_files = [] 36 | for split_dir in split_dirs[self.split]: 37 | with open(join(self.root, "curated", self.image_list), "r") as f: 38 | img_names = [fn.rstrip() for fn in f.readlines()] 39 | for img_name in img_names: 40 | self.image_files.append(join(self.root, "puretext", img_name)) 41 | 42 | def __len__(self): 43 | return len(self.image_files) 44 | 45 | def __getitem__(self, index): 46 | image_path = self.image_files[index] 47 | batch = {} 48 | img = self.transform(Image.open(image_path).convert("RGB")) 49 | batch["img"] = img 50 | batch["img_path"] = image_path 51 | return batch 52 | -------------------------------------------------------------------------------- /featup/datasets/SampleImage.py: -------------------------------------------------------------------------------- 1 | from PIL import Image 2 | from torch.utils.data import Dataset 3 | 4 | 5 | class SampleImage(Dataset): 6 | def __init__(self, paths, transform, **kwargs): 7 | self.paths = paths 8 | self.transform = transform 9 | 10 | def __getitem__(self, idx): 11 | image_path = self.paths[idx] 12 | image = Image.open(image_path).convert('RGB') 13 | if self.transform is not None: 14 | image = self.transform(image) 15 | batch = { 16 | "img": image, 17 | "img_path": image_path 18 | } 19 | return batch 20 | 21 | def __len__(self): 22 | return len(self.paths) 23 | -------------------------------------------------------------------------------- /featup/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/LLaVA-UHD/08c85e208cd12c825303619723e4fe20e64f7f58/featup/datasets/__init__.py -------------------------------------------------------------------------------- /featup/featurizers/CLIP.py: -------------------------------------------------------------------------------- 1 | import clip 2 | import torch 3 | from torch import nn 4 | import os 5 | 6 | class CLIPFeaturizer(nn.Module): 7 | 8 | def __init__(self): 9 | super().__init__() 10 | self.model, self.preprocess = clip.load( 11 | "ViT-B/16", 12 | download_root=os.getenv('TORCH_HOME', os.path.join(os.path.expanduser('~'), '.cache', 'torch')) 13 | ) 14 | self.model.eval() 15 | 16 | def get_cls_token(self, img): 17 | return self.model.encode_image(img).to(torch.float32) 18 | 19 | def forward(self, img): 20 | features = self.model.get_visual_features(img, include_cls=False).to(torch.float32) 21 | return features 22 | 23 | 24 | if __name__ == "__main__": 25 | import torchvision.transforms as T 26 | from PIL import Image 27 | #from shared import norm, crop_to_divisor 28 | 29 | device = "cuda" if torch.cuda.is_available() else "cpu" 30 | 31 | image = Image.open("/home/god/playground/FeatUp/sample-images/bird_full.jpg") 32 | load_size = 224 # * 3 33 | transform = T.Compose([ 34 | T.Resize(load_size, Image.BILINEAR), 35 | T.CenterCrop(load_size), 36 | T.ToTensor() 37 | #lambda x: crop_to_divisor(x, 16), 38 | #norm 39 | ]) 40 | 41 | model = CLIPFeaturizer().cuda() 42 | 43 | token = model.get_cls_token(transform(image).cuda().unsqueeze(0)) #torch.Size([1, 768]) 44 | results = model(transform(image).cuda().unsqueeze(0)) #torch.Size([1, 768, 24, 24]) 45 | 46 | print(clip.available_models()) 47 | -------------------------------------------------------------------------------- /featup/featurizers/CLIPLarge.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | from PIL import Image 4 | import torchvision.transforms as T 5 
| from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig 6 | from featup.util import norm 7 | from torchvision.transforms import InterpolationMode 8 | 9 | 10 | #CLIP-ViT-L/14 336 pixel 11 | class CLIPLargeFeaturizer(nn.Module): 12 | 13 | def __init__(self): 14 | super().__init__() 15 | vision_tower_name = 'openai/clip-vit-large-patch14-336' 16 | self.preprocess = CLIPImageProcessor.from_pretrained(vision_tower_name) 17 | self.model = CLIPVisionModel.from_pretrained(vision_tower_name) 18 | self.model.requires_grad_(False) 19 | 20 | def get_cls_token(self, img): 21 | return self.model(img).to(torch.float32).last_hidden_state 22 | 23 | def forward(self, img): 24 | outputs = self.model(img) 25 | last_hidden_states = outputs.last_hidden_state 26 | without_class = last_hidden_states[:, 1:] 27 | #torch.Size([1, 576, 1024]) 28 | features = without_class.permute(0,2,1) 29 | #[1, 1024, 24, 24] 30 | features = features.reshape(len(features), features.shape[1], 24, 24) 31 | return features.to(torch.float32) 32 | 33 | if __name__ == '__main__': 34 | vision_tower_name = 'openai/clip-vit-large-patch14-336' 35 | image = Image.open("/home/god/playground/FeatUp/sample-images/bird_full.jpg") 36 | 37 | transformTest = T.Resize(336, InterpolationMode.BILINEAR) 38 | 39 | test_image = transformTest(image.convert("RGB")) 40 | 41 | 42 | transform = T.Compose([ 43 | T.Resize(336, InterpolationMode.BILINEAR), 44 | T.CenterCrop(336), 45 | T.ToTensor(), 46 | norm]) 47 | 48 | #torch.Size([3, 336, 336]) 49 | transformed_image = transform(image.convert("RGB")).unsqueeze(0).to("cuda") 50 | 51 | 52 | model = CLIPLargeFeaturizer().cuda() 53 | 54 | features = model(transformed_image) 55 | 56 | print(features.shape) 57 | #torch.Size([1, 1024, 24, 24]) 58 | #torch.Size([1, 768, 24, 24]) 59 | -------------------------------------------------------------------------------- /featup/featurizers/DeepLabV3.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | 4 | class DeepLabV3Featurizer(nn.Module): 5 | def __init__(self, model): 6 | super().__init__() 7 | self.model = model 8 | 9 | def get_cls_token(self, img): 10 | return self.model.forward(img) 11 | 12 | def forward(self, img, layer_num=-1): 13 | return self.model.backbone(img)['out'] 14 | -------------------------------------------------------------------------------- /featup/featurizers/MaskCLIP.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from torch import nn 3 | import os 4 | 5 | from featup.featurizers.maskclip import clip 6 | 7 | 8 | class MaskCLIPFeaturizer(nn.Module): 9 | 10 | def __init__(self): 11 | super().__init__() 12 | self.model, self.preprocess = clip.load( 13 | "ViT-B/16", 14 | download_root=os.getenv('TORCH_HOME', os.path.join(os.path.expanduser('~'), '.cache', 'torch')) 15 | ) 16 | self.model.eval() 17 | self.patch_size = self.model.visual.patch_size 18 | 19 | def forward(self, img): 20 | b, _, input_size_h, input_size_w = img.shape 21 | patch_h = input_size_h // self.patch_size 22 | patch_w = input_size_w // self.patch_size 23 | features = self.model.get_patch_encodings(img).to(torch.float32) 24 | return features.reshape(b, patch_h, patch_w, -1).permute(0, 3, 1, 2) 25 | 26 | 27 | if __name__ == "__main__": 28 | import torchvision.transforms as T 29 | from PIL import Image 30 | from featup.util import norm, unnorm, crop_to_divisor 31 | 32 | device = "cuda" if torch.cuda.is_available() else "cpu" 33 | 34 | image = 
Image.open("../samples/lex1.jpg") 35 | load_size = 224 # * 3 36 | transform = T.Compose([ 37 | T.Resize(load_size, Image.BILINEAR), 38 | # T.CenterCrop(load_size), 39 | T.ToTensor(), 40 | lambda x: crop_to_divisor(x, 16), 41 | norm]) 42 | 43 | model = MaskCLIPFeaturizer().cuda() 44 | 45 | results = model(transform(image).cuda().unsqueeze(0)) 46 | 47 | print(clip.available_models()) 48 | -------------------------------------------------------------------------------- /featup/featurizers/ResNet.py: -------------------------------------------------------------------------------- 1 | from torch import nn 2 | 3 | 4 | class ResNetFeaturizer(nn.Module): 5 | def __init__(self, model): 6 | super().__init__() 7 | self.model = model 8 | 9 | def get_cls_token(self, img): 10 | return self.model.forward(img) 11 | 12 | def get_layer(self, img, layer_num): 13 | return self.model.get_layer(img, layer_num) 14 | 15 | def forward(self, img, layer_num=-1): 16 | return self.model.get_layer(img, layer_num) 17 | -------------------------------------------------------------------------------- /featup/featurizers/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/LLaVA-UHD/08c85e208cd12c825303619723e4fe20e64f7f58/featup/featurizers/__init__.py -------------------------------------------------------------------------------- /featup/featurizers/dinov2/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/LLaVA-UHD/08c85e208cd12c825303619723e4fe20e64f7f58/featup/featurizers/dinov2/__init__.py -------------------------------------------------------------------------------- /featup/featurizers/dinov2/layers/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | from .dino_head import DINOHead 7 | from .mlp import Mlp 8 | from .patch_embed import PatchEmbed 9 | from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused 10 | from .block import NestedTensorBlock 11 | from .attention import MemEffAttention 12 | -------------------------------------------------------------------------------- /featup/featurizers/dinov2/layers/dino_head.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 
5 | 6 | import torch 7 | import torch.nn as nn 8 | from torch.nn.init import trunc_normal_ 9 | from torch.nn.utils import weight_norm 10 | 11 | 12 | class DINOHead(nn.Module): 13 | def __init__( 14 | self, 15 | in_dim, 16 | out_dim, 17 | use_bn=False, 18 | nlayers=3, 19 | hidden_dim=2048, 20 | bottleneck_dim=256, 21 | mlp_bias=True, 22 | ): 23 | super().__init__() 24 | nlayers = max(nlayers, 1) 25 | self.mlp = _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=hidden_dim, use_bn=use_bn, bias=mlp_bias) 26 | self.apply(self._init_weights) 27 | self.last_layer = weight_norm(nn.Linear(bottleneck_dim, out_dim, bias=False)) 28 | self.last_layer.weight_g.data.fill_(1) 29 | 30 | def _init_weights(self, m): 31 | if isinstance(m, nn.Linear): 32 | trunc_normal_(m.weight, std=0.02) 33 | if isinstance(m, nn.Linear) and m.bias is not None: 34 | nn.init.constant_(m.bias, 0) 35 | 36 | def forward(self, x): 37 | x = self.mlp(x) 38 | eps = 1e-6 if x.dtype == torch.float16 else 1e-12 39 | x = nn.functional.normalize(x, dim=-1, p=2, eps=eps) 40 | x = self.last_layer(x) 41 | return x 42 | 43 | 44 | def _build_mlp(nlayers, in_dim, bottleneck_dim, hidden_dim=None, use_bn=False, bias=True): 45 | if nlayers == 1: 46 | return nn.Linear(in_dim, bottleneck_dim, bias=bias) 47 | else: 48 | layers = [nn.Linear(in_dim, hidden_dim, bias=bias)] 49 | if use_bn: 50 | layers.append(nn.BatchNorm1d(hidden_dim)) 51 | layers.append(nn.GELU()) 52 | for _ in range(nlayers - 2): 53 | layers.append(nn.Linear(hidden_dim, hidden_dim, bias=bias)) 54 | if use_bn: 55 | layers.append(nn.BatchNorm1d(hidden_dim)) 56 | layers.append(nn.GELU()) 57 | layers.append(nn.Linear(hidden_dim, bottleneck_dim, bias=bias)) 58 | return nn.Sequential(*layers) 59 | -------------------------------------------------------------------------------- /featup/featurizers/dinov2/layers/drop_path.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | # References: 7 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 8 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py 9 | 10 | 11 | from torch import nn 12 | 13 | 14 | def drop_path(x, drop_prob: float = 0.0, training: bool = False): 15 | if drop_prob == 0.0 or not training: 16 | return x 17 | keep_prob = 1 - drop_prob 18 | shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets 19 | random_tensor = x.new_empty(shape).bernoulli_(keep_prob) 20 | if keep_prob > 0.0: 21 | random_tensor.div_(keep_prob) 22 | output = x * random_tensor 23 | return output 24 | 25 | 26 | class DropPath(nn.Module): 27 | """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" 28 | 29 | def __init__(self, drop_prob=None): 30 | super(DropPath, self).__init__() 31 | self.drop_prob = drop_prob 32 | 33 | def forward(self, x): 34 | return drop_path(x, self.drop_prob, self.training) 35 | -------------------------------------------------------------------------------- /featup/featurizers/dinov2/layers/layer_scale.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 
2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | # Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py#L103-L110 7 | 8 | from typing import Union 9 | 10 | import torch 11 | from torch import Tensor 12 | from torch import nn 13 | 14 | 15 | class LayerScale(nn.Module): 16 | def __init__( 17 | self, 18 | dim: int, 19 | init_values: Union[float, Tensor] = 1e-5, 20 | inplace: bool = False, 21 | ) -> None: 22 | super().__init__() 23 | self.inplace = inplace 24 | self.gamma = nn.Parameter(init_values * torch.ones(dim)) 25 | 26 | def forward(self, x: Tensor) -> Tensor: 27 | return x.mul_(self.gamma) if self.inplace else x * self.gamma 28 | -------------------------------------------------------------------------------- /featup/featurizers/dinov2/layers/mlp.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # 3 | # This source code is licensed under the Apache License, Version 2.0 4 | # found in the LICENSE file in the root directory of this source tree. 5 | 6 | # References: 7 | # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py 8 | # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py 9 | 10 | 11 | from typing import Callable, Optional 12 | 13 | from torch import Tensor, nn 14 | 15 | 16 | class Mlp(nn.Module): 17 | def __init__( 18 | self, 19 | in_features: int, 20 | hidden_features: Optional[int] = None, 21 | out_features: Optional[int] = None, 22 | act_layer: Callable[..., nn.Module] = nn.GELU, 23 | drop: float = 0.0, 24 | bias: bool = True, 25 | ) -> None: 26 | super().__init__() 27 | out_features = out_features or in_features 28 | hidden_features = hidden_features or in_features 29 | self.fc1 = nn.Linear(in_features, hidden_features, bias=bias) 30 | self.act = act_layer() 31 | self.fc2 = nn.Linear(hidden_features, out_features, bias=bias) 32 | self.drop = nn.Dropout(drop) 33 | 34 | def forward(self, x: Tensor) -> Tensor: 35 | x = self.fc1(x) 36 | x = self.act(x) 37 | x = self.drop(x) 38 | x = self.fc2(x) 39 | x = self.drop(x) 40 | return x 41 | -------------------------------------------------------------------------------- /featup/featurizers/maskclip/README.md: -------------------------------------------------------------------------------- 1 | # CLIP 2 | Modified version of [CLIP](https://github.com/openai/CLIP) with support for dense patch-level feature extraction 3 | (based on [MaskCLIP](https://arxiv.org/abs/2112.01071) parametrization) and interpolation of the positional encoding. 
4 | -------------------------------------------------------------------------------- /featup/featurizers/maskclip/__init__.py: -------------------------------------------------------------------------------- 1 | from .clip import * 2 | 3 | """ 4 | Modified from https://github.com/openai/CLIP 5 | """ 6 | -------------------------------------------------------------------------------- /featup/featurizers/maskclip/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/LLaVA-UHD/08c85e208cd12c825303619723e4fe20e64f7f58/featup/featurizers/maskclip/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /featup/featurizers/modules/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/LLaVA-UHD/08c85e208cd12c825303619723e4fe20e64f7f58/featup/featurizers/modules/__init__.py -------------------------------------------------------------------------------- /featup/plotting.py: -------------------------------------------------------------------------------- 1 | import matplotlib.pyplot as plt 2 | from featup.util import pca, remove_axes 3 | from featup.featurizers.maskclip.clip import tokenize 4 | from pytorch_lightning import seed_everything 5 | import torch 6 | import torch.nn.functional as F 7 | 8 | 9 | @torch.no_grad() 10 | def plot_feats(image, lr, hr): 11 | assert len(image.shape) == len(lr.shape) == len(hr.shape) == 3 12 | seed_everything(0) 13 | [lr_feats_pca, hr_feats_pca], _ = pca([lr.unsqueeze(0), hr.unsqueeze(0)]) 14 | fig, ax = plt.subplots(1, 3, figsize=(15, 5)) 15 | ax[0].imshow(image.permute(1, 2, 0).detach().cpu()) 16 | ax[0].set_title("Image") 17 | ax[1].imshow(lr_feats_pca[0].permute(1, 2, 0).detach().cpu()) 18 | ax[1].set_title("Original Features") 19 | ax[2].imshow(hr_feats_pca[0].permute(1, 2, 0).detach().cpu()) 20 | ax[2].set_title("Upsampled Features") 21 | remove_axes(ax) 22 | plt.show() 23 | 24 | 25 | @torch.no_grad() 26 | def plot_lang_heatmaps(model, image, lr_feats, hr_feats, text_query): 27 | assert len(image.shape) == len(lr_feats.shape) == len(hr_feats.shape) == 3 28 | fig, ax = plt.subplots(1, 3, figsize=(15, 5)) 29 | cmap = plt.get_cmap("turbo") 30 | 31 | # encode query 32 | text = tokenize(text_query).to(lr_feats.device) 33 | text_feats = model.model.encode_text(text).squeeze().to(torch.float32) 34 | assert len(text_feats.shape) == 1 35 | 36 | lr_sims = torch.einsum( 37 | "chw,c->hw", F.normalize(lr_feats.to(torch.float32), dim=0), F.normalize(text_feats, dim=0)) 38 | hr_sims = torch.einsum( 39 | "chw,c->hw", F.normalize(hr_feats.to(torch.float32), dim=0), F.normalize(text_feats, dim=0)) 40 | 41 | lr_sims_norm = (lr_sims - lr_sims.min()) / (lr_sims.max() - lr_sims.min()) 42 | hr_sims_norm = (hr_sims - hr_sims.min()) / (hr_sims.max() - hr_sims.min()) 43 | lr_heatmap = cmap(lr_sims_norm.cpu().numpy()) 44 | hr_heatmap = cmap(hr_sims_norm.cpu().numpy()) 45 | 46 | ax[0].imshow(image.permute(1, 2, 0).detach().cpu()) 47 | ax[0].set_title("Image") 48 | ax[1].imshow(lr_heatmap) 49 | ax[1].set_title(f"Original Similarity to \"{text_query}\"") 50 | ax[2].imshow(hr_heatmap) 51 | ax[2].set_title(f"Upsampled Similarity to \"{text_query}\"") 52 | remove_axes(ax) 53 | 54 | return plt.show() 55 | -------------------------------------------------------------------------------- /install.sh: 
-------------------------------------------------------------------------------- 1 | #install 2 | pip install imgaug 3 | pip install openpyxl 4 | 5 | pip install --upgrade pip # enable PEP 660 support 6 | pip install torch==2.1.2 7 | pip install -e . 8 | 9 | pip install -e ".[train]" 10 | pip install flash-attn --no-build-isolation 11 | -------------------------------------------------------------------------------- /llava/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import LlavaLlamaForCausalLM 2 | -------------------------------------------------------------------------------- /llava/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "." 5 | 6 | # Model Constants 7 | IGNORE_INDEX = -100 8 | IMAGE_TOKEN_INDEX = -200 9 | DEFAULT_IMAGE_TOKEN = "<image>" 10 | DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>" 11 | DEFAULT_IM_START_TOKEN = "<im_start>" 12 | DEFAULT_IM_END_TOKEN = "<im_end>" 13 | -------------------------------------------------------------------------------- /llava/model/__init__.py: -------------------------------------------------------------------------------- 1 | import os 2 | 3 | AVAILABLE_MODELS = { 4 | "llava_llama": "LlavaLlamaForCausalLM, LlavaConfig", 5 | "llava_qwen": "LlavaQwenForCausalLM, LlavaQwenConfig", 6 | "llava_mistral": "LlavaMistralForCausalLM, LlavaMistralConfig", 7 | "llava_mixtral": "LlavaMixtralForCausalLM, LlavaMixtralConfig", 8 | # "llava_qwen_moe": "LlavaQwenMoeForCausalLM, LlavaQwenMoeConfig", 9 | # Add other models as needed 10 | } 11 | 12 | for model_name, model_classes in AVAILABLE_MODELS.items(): 13 | try: 14 | exec(f"from .language_model.{model_name} import {model_classes}") 15 | except Exception as e: 16 | print(f"Failed to import {model_name} from llava.language_model.{model_name}.
Error: {e}") 17 | -------------------------------------------------------------------------------- /llava/model/apply_delta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m fastchat.model.apply_delta --base ~/model_weights/llama-7b --target ~/model_weights/vicuna-7b --delta lmsys/vicuna-7b-delta 4 | """ 5 | 6 | import argparse 7 | 8 | import torch 9 | from tqdm import tqdm 10 | from transformers import AutoTokenizer, AutoModelForCausalLM 11 | from llava import LlavaLlamaForCausalLM 12 | 13 | 14 | def apply_delta(base_model_path, target_model_path, delta_path): 15 | print("Loading base model") 16 | base = AutoModelForCausalLM.from_pretrained(base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | 18 | print("Loading delta") 19 | delta = LlavaLlamaForCausalLM.from_pretrained(delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 20 | delta_tokenizer = AutoTokenizer.from_pretrained(delta_path) 21 | 22 | print("Applying delta") 23 | for name, param in tqdm(delta.state_dict().items(), desc="Applying delta"): 24 | if name not in base.state_dict(): 25 | assert name in ["model.mm_projector.weight", "model.mm_projector.bias"], f"{name} not in base model" 26 | continue 27 | if param.data.shape == base.state_dict()[name].shape: 28 | param.data += base.state_dict()[name] 29 | else: 30 | assert name in ["model.embed_tokens.weight", "lm_head.weight"], f"{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}" 31 | bparam = base.state_dict()[name] 32 | param.data[: bparam.shape[0], : bparam.shape[1]] += bparam 33 | 34 | print("Saving target model") 35 | delta.save_pretrained(target_model_path) 36 | delta_tokenizer.save_pretrained(target_model_path) 37 | 38 | 39 | if __name__ == "__main__": 40 | parser = argparse.ArgumentParser() 41 | parser.add_argument("--base-model-path", type=str, required=True) 42 | parser.add_argument("--target-model-path", type=str, required=True) 43 | parser.add_argument("--delta-path", type=str, required=True) 44 | 45 | args = parser.parse_args() 46 | 47 | apply_delta(args.base_model_path, args.target_model_path, args.delta_path) 48 | -------------------------------------------------------------------------------- /llava/model/consolidate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m llava.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate 4 | """ 5 | 6 | import argparse 7 | 8 | import torch 9 | from transformers import AutoTokenizer, AutoModelForCausalLM 10 | from llava.model import * 11 | from llava.model.utils import auto_upgrade 12 | 13 | 14 | def consolidate_ckpt(src_path, dst_path): 15 | print("Loading model") 16 | auto_upgrade(src_path) 17 | src_model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 18 | src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False) 19 | src_model.save_pretrained(dst_path) 20 | src_tokenizer.save_pretrained(dst_path) 21 | 22 | 23 | if __name__ == "__main__": 24 | parser = argparse.ArgumentParser() 25 | parser.add_argument("--src", type=str, required=True) 26 | parser.add_argument("--dst", type=str, required=True) 27 | 28 | args = parser.parse_args() 29 | 30 | consolidate_ckpt(args.src, args.dst) 31 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/builder.py: 
-------------------------------------------------------------------------------- 1 | import os 2 | from .imagebind import ImageBindWrapper 3 | from .open_clip_encoder import OpenCLIPVisionTower 4 | from .hf_vision import HFVisionTower 5 | from .siglip_encoder import SigLipVisionTower 6 | from .clip_encoder import CLIPVisionTower, CLIPVisionTowerS2 7 | 8 | # from .eva_clip.eva_clip_encoder import EvaClipVisionTower 9 | # from .dev_eva_clip.eva_vit import EvaViTWrapper 10 | 11 | 12 | def build_vision_tower(vision_tower_cfg, **kwargs): 13 | vision_tower = getattr(vision_tower_cfg, "mm_vision_tower", getattr(vision_tower_cfg, "vision_tower", None)) 14 | is_absolute_path_exists = os.path.exists(vision_tower) 15 | use_s2 = getattr(vision_tower_cfg, "s2", False) 16 | if "siglip" in vision_tower: 17 | return SigLipVisionTower(vision_tower, vision_tower_cfg=vision_tower_cfg, **kwargs) 18 | elif is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion") or "ShareGPT4V" in vision_tower: 19 | if use_s2: 20 | return CLIPVisionTowerS2(vision_tower, args=vision_tower_cfg, **kwargs) 21 | else: 22 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 23 | elif vision_tower.startswith("hf:"): 24 | return HFVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 25 | elif vision_tower in ["imagebind_huge"]: 26 | return ImageBindWrapper(vision_tower, args=vision_tower_cfg, **kwargs) 27 | elif vision_tower.startswith("open_clip_hub"): 28 | return OpenCLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 29 | # elif "internal-eva" in vision_tower.lower() or "eva02" in vision_tower.lower(): 30 | # return EvaClipVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 31 | # elif vision_tower in ["EVA-CLIP-8B", "EVA-CLIP-8B-plus"]: 32 | # return EvaViTWrapper(vision_tower, args=vision_tower_cfg, **kwargs) 33 | 34 | raise ValueError(f"Unknown vision tower: {vision_tower}") 35 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/__init__.py: -------------------------------------------------------------------------------- 1 | from .constants import OPENAI_DATASET_MEAN, OPENAI_DATASET_STD 2 | from .factory import create_model, create_model_and_transforms, create_model_from_pretrained, get_tokenizer 3 | from .factory import list_models, add_model_config, get_model_config, load_checkpoint 4 | from .loss import ClipLoss 5 | from .model import CLIP, CustomCLIP, CLIPTextCfg, CLIPVisionCfg, convert_weights_to_lp, convert_weights_to_fp16, trace_model, get_cast_dtype 6 | from .openai import load_openai_model, list_openai_models 7 | from .pretrained import list_pretrained, list_pretrained_models_by_tag, list_pretrained_tags_by_model, get_pretrained_url, download_pretrained_from_url, is_pretrained_cfg, get_pretrained_cfg, download_pretrained 8 | from .tokenizer import SimpleTokenizer, tokenize 9 | from .transform import image_transform 10 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/LLaVA-UHD/08c85e208cd12c825303619723e4fe20e64f7f58/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- 
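Usage note for build_vision_tower (from llava/model/multimodal_encoder/builder.py above): the function dispatches purely on the mm_vision_tower / vision_tower string plus the optional s2 flag, so switching between SigLIP, CLIP (with or without S2), HF-hub, ImageBind, and OpenCLIP encoders is a config change rather than a code change. The snippet below is a minimal sketch under assumptions: it presumes the repo is installed as the llava package, and the extra fields (mm_vision_select_layer, mm_vision_select_feature) follow upstream LLaVA conventions and are not read by builder.py itself, so treat them and the printed class name as assumptions.

# Minimal sketch, assuming CLIPVisionTower accepts upstream-LLaVA-style config fields.
from types import SimpleNamespace

from llava.model.multimodal_encoder.builder import build_vision_tower

cfg = SimpleNamespace(
    mm_vision_tower="openai/clip-vit-large-patch14-336",  # startswith("openai") -> CLIP branch
    s2=False,                                             # True would select CLIPVisionTowerS2 instead
    mm_vision_select_layer=-2,                            # assumed field, not read by builder.py
    mm_vision_select_feature="patch",                     # assumed field, not read by builder.py
)

tower = build_vision_tower(cfg)
print(type(tower).__name__)  # expected here: CLIPVisionTower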
/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/constants.py: -------------------------------------------------------------------------------- 1 | OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073) 2 | OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711) 3 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA-CLIP-18B.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1536, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 48, 6 | "width": 5120, 7 | "head_width": 128, 8 | "mlp_ratio": 5, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-18b-14-x", 11 | "drop_path_rate": 0, 12 | "qkv_bias": false, 13 | "xattn": true, 14 | "postnorm": true, 15 | "fusedLN": false, 16 | "use_rms_norm": true 17 | }, 18 | "text_cfg": { 19 | "context_length": 77, 20 | "vocab_size": 49408, 21 | "width": 1280, 22 | "heads": 20, 23 | "layers": 32, 24 | "xattn": false, 25 | "fusedLN": false 26 | } 27 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA-CLIP-8B-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1280, 3 | "vision_cfg": { 4 | "image_size": 448, 5 | "layers": 32, 6 | "width": 4096, 7 | "head_width": 128, 8 | "mlp_ratio": 5, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-8b-14-plus-x", 11 | "drop_path_rate": 0, 12 | "qkv_bias": false, 13 | "xattn": true, 14 | "postnorm": false, 15 | "fusedLN": false, 16 | "use_rms_norm": true 17 | }, 18 | "text_cfg": { 19 | "context_length": 77, 20 | "vocab_size": 49408, 21 | "width": 1280, 22 | "heads": 20, 23 | "layers": 32, 24 | "xattn": false, 25 | "fusedLN": false 26 | } 27 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA-CLIP-8B.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1280, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 4096, 7 | "head_width": 128, 8 | "mlp_ratio": 5, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-8b-14-x", 11 | "drop_path_rate": 0, 12 | "qkv_bias": false, 13 | "xattn": true, 14 | "postnorm": false, 15 | "fusedLN": false, 16 | "use_rms_norm": true 17 | }, 18 | "text_cfg": { 19 | "context_length": 77, 20 | "vocab_size": 49408, 21 | "width": 1280, 22 | "heads": 20, 23 | "layers": 32, 24 | "xattn": false, 25 | "fusedLN": false 26 | } 27 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA01-CLIP-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 16, 8 | "eva_model_name": "eva-clip-b-16", 9 | "ls_init_value": 0.1, 10 | "drop_path_rate": 0.0 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 512, 16 | "heads": 8, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA01-CLIP-g-14-plus.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 40, 6 | "width": 1408, 7 | "head_width": 88, 8 | "mlp_ratio": 4.3637, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-g-14-x", 11 | "drop_path_rate": 0, 12 | "xattn": true, 13 | "fusedLN": true 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 1024, 19 | "heads": 16, 20 | "layers": 24, 21 | "xattn": false, 22 | "fusedLN": true 23 | } 24 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA01-CLIP-g-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 40, 6 | "width": 1408, 7 | "head_width": 88, 8 | "mlp_ratio": 4.3637, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-g-14-x", 11 | "drop_path_rate": 0.4, 12 | "xattn": true, 13 | "fusedLN": true 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 768, 19 | "heads": 12, 20 | "layers": 12, 21 | "xattn": false, 22 | "fusedLN": true 23 | } 24 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA02-CLIP-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "head_width": 64, 8 | "patch_size": 16, 9 | "mlp_ratio": 2.6667, 10 | "eva_model_name": "eva-clip-b-16-X", 11 | "drop_path_rate": 0.0, 12 | "xattn": true, 13 | "fusedLN": true, 14 | "rope": true, 15 | "pt_hw_seq_len": 16, 16 | "intp_freq": true, 17 | "naiveswiglu": true, 18 | "subln": true 19 | }, 20 | "text_cfg": { 21 | "context_length": 77, 22 | "vocab_size": 49408, 23 | "width": 512, 24 | "heads": 8, 25 | "layers": 12, 26 | "xattn": true, 27 | "fusedLN": true 28 | } 29 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA02-CLIP-L-14-336.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 336, 5 | "layers": 24, 6 | "width": 1024, 7 | "drop_path_rate": 0, 8 | "head_width": 64, 9 | "mlp_ratio": 2.6667, 10 | "patch_size": 14, 11 | "eva_model_name": "eva-clip-l-14-336", 12 | "xattn": true, 13 | "fusedLN": true, 14 | "rope": true, 15 | "pt_hw_seq_len": 16, 16 | "intp_freq": true, 17 | "naiveswiglu": true, 18 | "subln": true 19 | }, 20 | "text_cfg": { 21 | "context_length": 77, 22 | "vocab_size": 49408, 23 | "width": 768, 24 | "heads": 12, 25 | "layers": 12, 26 | "xattn": false, 27 | "fusedLN": true 28 | } 29 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA02-CLIP-L-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "drop_path_rate": 0, 8 | "head_width": 64, 9 | "mlp_ratio": 2.6667, 10 | "patch_size": 14, 11 | "eva_model_name": "eva-clip-l-14", 12 | "xattn": true, 13 | "fusedLN": true, 14 | "rope": true, 15 | 
"pt_hw_seq_len": 16, 16 | "intp_freq": true, 17 | "naiveswiglu": true, 18 | "subln": true 19 | }, 20 | "text_cfg": { 21 | "context_length": 77, 22 | "vocab_size": 49408, 23 | "width": 768, 24 | "heads": 12, 25 | "layers": 12, 26 | "xattn": false, 27 | "fusedLN": true 28 | } 29 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA02-CLIP-bigE-14-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 64, 6 | "width": 1792, 7 | "head_width": 112, 8 | "mlp_ratio": 8.571428571428571, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-4b-14-x", 11 | "drop_path_rate": 0, 12 | "xattn": true, 13 | "postnorm": true, 14 | "fusedLN": true 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 1280, 20 | "heads": 20, 21 | "layers": 32, 22 | "xattn": false, 23 | "fusedLN": true 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA02-CLIP-bigE-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 64, 6 | "width": 1792, 7 | "head_width": 112, 8 | "mlp_ratio": 8.571428571428571, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-4b-14-x", 11 | "drop_path_rate": 0, 12 | "xattn": true, 13 | "postnorm": true, 14 | "fusedLN": true 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 1024, 20 | "heads": 16, 21 | "layers": 24, 22 | "xattn": false, 23 | "fusedLN": true 24 | } 25 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/Internal-EVA02-CLIP-10B-14-448.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 448, 5 | "layers": 77, 6 | "width": 2304, 7 | "head_width": 144, 8 | "mlp_ratio": 10.9722, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-10b-14-x", 11 | "drop_path_rate": 0, 12 | "xattn": true, 13 | "postnorm": false, 14 | "fusedLN": true 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 1280, 20 | "heads": 20, 21 | "layers": 32, 22 | "xattn": false, 23 | "fusedLN": true 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/Internal-EVA02-CLIP-10B-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 77, 6 | "width": 2304, 7 | "head_width": 144, 8 | "mlp_ratio": 10.9722, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-10b-14-x", 11 | "drop_path_rate": 0, 12 | "xattn": true, 13 | "postnorm": false, 14 | "fusedLN": true 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 1280, 20 | "heads": 20, 21 | "layers": 32, 22 | "xattn": false, 23 | "fusedLN": true 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/factory.py: 
-------------------------------------------------------------------------------- 1 | import json 2 | import logging 3 | import os 4 | import pathlib 5 | import re 6 | from copy import deepcopy 7 | from pathlib import Path 8 | from typing import Optional, Tuple, Union, Dict, Any 9 | import torch 10 | 11 | _MODEL_CONFIG_PATHS = [Path(__file__).parent / f"model_configs/"] 12 | _MODEL_CONFIGS = {} # directory (model_name: config) of model architecture configs 13 | 14 | 15 | def _natural_key(string_): 16 | return [int(s) if s.isdigit() else s for s in re.split(r"(\d+)", string_.lower())] 17 | 18 | 19 | def _rescan_model_configs(): 20 | global _MODEL_CONFIGS 21 | 22 | config_ext = (".json",) 23 | config_files = [] 24 | for config_path in _MODEL_CONFIG_PATHS: 25 | if config_path.is_file() and config_path.suffix in config_ext: 26 | config_files.append(config_path) 27 | elif config_path.is_dir(): 28 | for ext in config_ext: 29 | config_files.extend(config_path.glob(f"*{ext}")) 30 | 31 | for cf in config_files: 32 | with open(cf, "r", encoding="utf8") as f: 33 | model_cfg = json.load(f) 34 | if all(a in model_cfg for a in ("embed_dim", "vision_cfg", "text_cfg")): 35 | _MODEL_CONFIGS[cf.stem] = model_cfg 36 | 37 | _MODEL_CONFIGS = dict(sorted(_MODEL_CONFIGS.items(), key=lambda x: _natural_key(x[0]))) 38 | 39 | 40 | _rescan_model_configs() # initial populate of model config registry 41 | 42 | 43 | def list_models(): 44 | """enumerate available model architectures based on config files""" 45 | return list(_MODEL_CONFIGS.keys()) 46 | 47 | 48 | def add_model_config(path): 49 | """add model config path or file and update registry""" 50 | if not isinstance(path, Path): 51 | path = Path(path) 52 | _MODEL_CONFIG_PATHS.append(path) 53 | _rescan_model_configs() 54 | 55 | 56 | def get_model_config(model_name): 57 | if model_name in _MODEL_CONFIGS: 58 | return deepcopy(_MODEL_CONFIGS[model_name]) 59 | else: 60 | return None 61 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/model_configs/EVA-CLIP-18B.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1536, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 48, 6 | "width": 5120, 7 | "head_width": 128, 8 | "mlp_ratio": 5, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-18b-14-x", 11 | "drop_path_rate": 0, 12 | "qkv_bias": false, 13 | "xattn": true, 14 | "postnorm": true, 15 | "fusedLN": false, 16 | "use_rms_norm": true 17 | }, 18 | "text_cfg": { 19 | "context_length": 77, 20 | "vocab_size": 49408, 21 | "width": 1280, 22 | "heads": 20, 23 | "layers": 32, 24 | "xattn": false, 25 | "fusedLN": false 26 | } 27 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/model_configs/EVA-CLIP-8B-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1280, 3 | "vision_cfg": { 4 | "image_size": 448, 5 | "layers": 32, 6 | "width": 4096, 7 | "head_width": 128, 8 | "mlp_ratio": 5, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-8b-14-plus-x", 11 | "drop_path_rate": 0, 12 | "qkv_bias": false, 13 | "xattn": true, 14 | "postnorm": false, 15 | "fusedLN": false, 16 | "use_rms_norm": true 17 | }, 18 | "text_cfg": { 19 | "context_length": 77, 20 | "vocab_size": 49408, 21 | "width": 1280, 22 | "heads": 20, 23 | "layers": 32, 24 | "xattn": false, 25 | "fusedLN": false 26 | } 27 | } 
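The registry in eva_clip/factory.py above is what turns the model_configs/*.json files (EVA-CLIP-18B, EVA-CLIP-8B-plus, and so on) into queryable configs: _rescan_model_configs loads every .json whose top level contains embed_dim, vision_cfg and text_cfg and keys it by file stem. A short usage sketch follows; the import path is assumed from the repo layout, and the printed values are taken from the EVA-CLIP-18B config shown above.

from llava.model.multimodal_encoder.eva_clip.factory import (  # assumed import path
    add_model_config,
    get_model_config,
    list_models,
)

print(list_models())                    # stems of the bundled configs, e.g. 'EVA-CLIP-18B', 'EVA02-CLIP-L-14', ...

cfg = get_model_config("EVA-CLIP-18B")  # deep copy of the parsed JSON, or None if unknown
if cfg is not None:
    print(cfg["embed_dim"])             # 1536
    print(cfg["vision_cfg"]["layers"])  # 48

# Additional configs can be registered at runtime: add_model_config() appends a
# file or directory to _MODEL_CONFIG_PATHS and rescans the registry.
# add_model_config("/path/to/extra_configs")   # hypothetical path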
-------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/model_configs/EVA-CLIP-8B.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1280, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 32, 6 | "width": 4096, 7 | "head_width": 128, 8 | "mlp_ratio": 5, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-8b-14-x", 11 | "drop_path_rate": 0, 12 | "qkv_bias": false, 13 | "xattn": true, 14 | "postnorm": false, 15 | "fusedLN": false, 16 | "use_rms_norm": true 17 | }, 18 | "text_cfg": { 19 | "context_length": 77, 20 | "vocab_size": 49408, 21 | "width": 1280, 22 | "heads": 20, 23 | "layers": 32, 24 | "xattn": false, 25 | "fusedLN": false 26 | } 27 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/model_configs/EVA01-CLIP-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "patch_size": 16, 8 | "eva_model_name": "eva-clip-b-16", 9 | "ls_init_value": 0.1, 10 | "drop_path_rate": 0.0 11 | }, 12 | "text_cfg": { 13 | "context_length": 77, 14 | "vocab_size": 49408, 15 | "width": 512, 16 | "heads": 8, 17 | "layers": 12 18 | } 19 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/model_configs/EVA01-CLIP-g-14-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 40, 6 | "width": 1408, 7 | "head_width": 88, 8 | "mlp_ratio": 4.3637, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-g-14-x", 11 | "drop_path_rate": 0, 12 | "xattn": true, 13 | "fusedLN": true 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 1024, 19 | "heads": 16, 20 | "layers": 24, 21 | "xattn": false, 22 | "fusedLN": true 23 | } 24 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/model_configs/EVA01-CLIP-g-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 40, 6 | "width": 1408, 7 | "head_width": 88, 8 | "mlp_ratio": 4.3637, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-g-14-x", 11 | "drop_path_rate": 0.4, 12 | "xattn": true, 13 | "fusedLN": true 14 | }, 15 | "text_cfg": { 16 | "context_length": 77, 17 | "vocab_size": 49408, 18 | "width": 768, 19 | "heads": 12, 20 | "layers": 12, 21 | "xattn": false, 22 | "fusedLN": true 23 | } 24 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/model_configs/EVA02-CLIP-B-16.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 512, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 12, 6 | "width": 768, 7 | "head_width": 64, 8 | "patch_size": 16, 9 | "mlp_ratio": 2.6667, 10 | "eva_model_name": "eva-clip-b-16-X", 11 | "drop_path_rate": 0.0, 12 | "xattn": true, 13 | "fusedLN": true, 14 | "rope": true, 15 | "pt_hw_seq_len": 16, 16 | "intp_freq": true, 17 | "naiveswiglu": true, 18 | "subln": true 19 | }, 20 | "text_cfg": { 21 | "context_length": 77, 22 | "vocab_size": 
49408, 23 | "width": 512, 24 | "heads": 8, 25 | "layers": 12, 26 | "xattn": true, 27 | "fusedLN": true 28 | } 29 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/model_configs/EVA02-CLIP-L-14-336.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 336, 5 | "layers": 24, 6 | "width": 1024, 7 | "drop_path_rate": 0, 8 | "head_width": 64, 9 | "mlp_ratio": 2.6667, 10 | "patch_size": 14, 11 | "eva_model_name": "eva-clip-l-14-336", 12 | "xattn": true, 13 | "fusedLN": true, 14 | "rope": true, 15 | "pt_hw_seq_len": 16, 16 | "intp_freq": true, 17 | "naiveswiglu": true, 18 | "subln": true 19 | }, 20 | "text_cfg": { 21 | "context_length": 77, 22 | "vocab_size": 49408, 23 | "width": 768, 24 | "heads": 12, 25 | "layers": 12, 26 | "xattn": false, 27 | "fusedLN": true 28 | } 29 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/model_configs/EVA02-CLIP-L-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 768, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 24, 6 | "width": 1024, 7 | "drop_path_rate": 0, 8 | "head_width": 64, 9 | "mlp_ratio": 2.6667, 10 | "patch_size": 14, 11 | "eva_model_name": "eva-clip-l-14", 12 | "xattn": true, 13 | "fusedLN": true, 14 | "rope": true, 15 | "pt_hw_seq_len": 16, 16 | "intp_freq": true, 17 | "naiveswiglu": true, 18 | "subln": true 19 | }, 20 | "text_cfg": { 21 | "context_length": 77, 22 | "vocab_size": 49408, 23 | "width": 768, 24 | "heads": 12, 25 | "layers": 12, 26 | "xattn": false, 27 | "fusedLN": true 28 | } 29 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/model_configs/EVA02-CLIP-bigE-14-plus.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 64, 6 | "width": 1792, 7 | "head_width": 112, 8 | "mlp_ratio": 8.571428571428571, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-4b-14-x", 11 | "drop_path_rate": 0, 12 | "xattn": true, 13 | "postnorm": true, 14 | "fusedLN": true 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 1280, 20 | "heads": 20, 21 | "layers": 32, 22 | "xattn": false, 23 | "fusedLN": true 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/model_configs/EVA02-CLIP-bigE-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 64, 6 | "width": 1792, 7 | "head_width": 112, 8 | "mlp_ratio": 8.571428571428571, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-4b-14-x", 11 | "drop_path_rate": 0, 12 | "xattn": true, 13 | "postnorm": true, 14 | "fusedLN": true 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 1024, 20 | "heads": 16, 21 | "layers": 24, 22 | "xattn": false, 23 | "fusedLN": true 24 | } 25 | } -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/model_configs/Internal-EVA02-CLIP-10B-14-448.json: 
-------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 448, 5 | "layers": 77, 6 | "width": 2304, 7 | "head_width": 144, 8 | "mlp_ratio": 10.9722, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-10b-14-x", 11 | "drop_path_rate": 0, 12 | "xattn": true, 13 | "postnorm": false, 14 | "fusedLN": true 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 1280, 20 | "heads": 20, 21 | "layers": 32, 22 | "xattn": false, 23 | "fusedLN": true 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/model_configs/Internal-EVA02-CLIP-10B-14.json: -------------------------------------------------------------------------------- 1 | { 2 | "embed_dim": 1024, 3 | "vision_cfg": { 4 | "image_size": 224, 5 | "layers": 77, 6 | "width": 2304, 7 | "head_width": 144, 8 | "mlp_ratio": 10.9722, 9 | "patch_size": 14, 10 | "eva_model_name": "eva-clip-10b-14-x", 11 | "drop_path_rate": 0, 12 | "xattn": true, 13 | "postnorm": false, 14 | "fusedLN": true 15 | }, 16 | "text_cfg": { 17 | "context_length": 77, 18 | "vocab_size": 49408, 19 | "width": 1280, 20 | "heads": 20, 21 | "layers": 32, 22 | "xattn": false, 23 | "fusedLN": true 24 | } 25 | } 26 | -------------------------------------------------------------------------------- /llava/model/multimodal_projector/pooler_projector.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | 4 | import math 5 | 6 | from transformers.models.clip.modeling_clip import CLIPVisionModel 7 | 8 | 9 | class PoolerProjector(nn.Module): 10 | def __init__(self, config, vision_cfg): 11 | super().__init__() 12 | self._config = config 13 | self.hw = vision_cfg.image_size // vision_cfg.patch_size 14 | 15 | self.conv_pool = nn.Conv2d(config.mm_hidden_size, config.hidden_size, kernel_size=2, stride=2) 16 | 17 | self.proj = nn.Sequential( 18 | nn.GELU(), 19 | nn.Linear(config.hidden_size, config.hidden_size), 20 | ) 21 | 22 | def forward(self, x, *args, **kwargs): 23 | height = width = self.hw 24 | assert height * width == x.shape[1] 25 | x = x.view(x.shape[0], height, width, -1).permute(0, 3, 1, 2) 26 | x = self.conv_pool(x) 27 | x = x.flatten(2).transpose(1, 2) 28 | x = self.proj(x) 29 | return x 30 | 31 | @property 32 | def config(self): 33 | return {"mm_projector_type": "pooler"} 34 | -------------------------------------------------------------------------------- /llava/model/multimodal_resampler/builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from .masked_drop import MaskedDrop 4 | from .spatial_pool import SpatialPool 5 | from .perceiver import PerceiverResampler 6 | from .qformer import Qformer 7 | 8 | 9 | class IdentityMap(torch.nn.Module): 10 | def __init__(self): 11 | super().__init__() 12 | 13 | def forward(self, x, *args, **kwargs): 14 | return x 15 | 16 | @property 17 | def config(self): 18 | return {"mm_resampler_type": None} 19 | 20 | 21 | def build_vision_resampler(model_args, delay_load=False, **kwargs): 22 | resampler_type = getattr(model_args, "mm_resampler_type", None) 23 | if resampler_type == "masked_drop": 24 | return MaskedDrop(model_args) 25 | elif resampler_type == "spatial_pool": 26 | return SpatialPool(model_args, **kwargs) 27 | elif resampler_type == "perceiver": 28 | return PerceiverResampler(model_args, 
**kwargs) 29 | elif resampler_type == "qformer": 30 | return Qformer(model_args, **kwargs) 31 | elif resampler_type is None: 32 | return IdentityMap() 33 | 34 | raise ValueError(f"Unknown resampler type: {resampler_type}") 35 | -------------------------------------------------------------------------------- /llava/model/multimodal_resampler/spatial_pool.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import math 4 | 5 | 6 | class SpatialPool(nn.Module): 7 | def __init__(self, model_args, vision_tower): 8 | super().__init__() 9 | 10 | self.mode = model_args.mm_spatial_pool_mode 11 | self.stride = model_args.mm_spatial_pool_stride 12 | self.out_channels = getattr(model_args, "mm_spatial_pool_out_channels", vision_tower.hidden_size) 13 | 14 | if self.mode == "average": 15 | self.pool = nn.AvgPool2d(kernel_size=self.stride, stride=self.stride) 16 | elif self.mode == "max": 17 | self.pool = nn.MaxPool2d(kernel_size=self.stride, stride=self.stride) 18 | elif self.mode == "conv": 19 | self.pool = nn.Conv2d(in_channels=vision_tower.hidden_size, out_channels=self.out_channels, kernel_size=self.stride, stride=self.stride) 20 | else: 21 | raise ValueError(f"Unknown pooling mode: {self.pool}.") 22 | 23 | def forward(self, image_features, images, *args, **kwargs): 24 | ori_W = int(math.sqrt(image_features.shape[1] * images.shape[3] // images.shape[2])) 25 | ori_H = int(ori_W * images.shape[2] // images.shape[3]) 26 | 27 | B, _, F = image_features.shape 28 | 29 | image_features_spatial = image_features.view(B, ori_H, ori_H, F).permute(0, 3, 1, 2) 30 | image_features_spatial_pool = self.pool(image_features_spatial) 31 | 32 | return image_features_spatial_pool.flatten(2).transpose(1, 2).contiguous() 33 | 34 | @property 35 | def config(self): 36 | return { 37 | "mm_resampler_type": "spatial_pool", 38 | "mm_spatial_pool_stride": self.stride, 39 | "mm_spatial_pool_mode": self.mode, 40 | "mm_spatial_pool_out_channels": self.out_channels, 41 | } 42 | 43 | @property 44 | def hidden_size(self): 45 | return self.out_channels 46 | -------------------------------------------------------------------------------- /llava/model/utils.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoConfig 2 | 3 | 4 | def auto_upgrade(config): 5 | cfg = AutoConfig.from_pretrained(config) 6 | if "llava" in config and "llava" not in cfg.model_type: 7 | assert cfg.model_type == "llama" 8 | print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.") 9 | print("You must upgrade the checkpoint to the new code base (this can be done automatically).") 10 | confirm = input("Please confirm that you want to upgrade the checkpoint. 
[Y/N]") 11 | if confirm.lower() in ["y", "yes"]: 12 | print("Upgrading checkpoint...") 13 | assert len(cfg.architectures) == 1 14 | setattr(cfg.__class__, "model_type", "llava") 15 | cfg.architectures[0] = "LlavaLlamaForCausalLM" 16 | cfg.save_pretrained(config) 17 | print("Checkpoint upgraded.") 18 | else: 19 | print("Checkpoint upgrade aborted.") 20 | exit(1) 21 | -------------------------------------------------------------------------------- /llava/serve/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/LLaVA-UHD/08c85e208cd12c825303619723e4fe20e64f7f58/llava/serve/__init__.py -------------------------------------------------------------------------------- /llava/serve/examples/extreme_ironing.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/LLaVA-UHD/08c85e208cd12c825303619723e4fe20e64f7f58/llava/serve/examples/extreme_ironing.jpg -------------------------------------------------------------------------------- /llava/serve/examples/waterview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/LLaVA-UHD/08c85e208cd12c825303619723e4fe20e64f7f58/llava/serve/examples/waterview.jpg -------------------------------------------------------------------------------- /llava/serve/register_worker.py: -------------------------------------------------------------------------------- 1 | """ 2 | Manually register workers. 3 | 4 | Usage: 5 | python3 -m fastchat.serve.register_worker --controller http://localhost:21001 --worker-name http://localhost:21002 6 | """ 7 | 8 | import argparse 9 | 10 | import requests 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--controller-address", type=str) 15 | parser.add_argument("--worker-name", type=str) 16 | parser.add_argument("--check-heart-beat", action="store_true") 17 | args = parser.parse_args() 18 | 19 | url = args.controller_address + "/register_worker" 20 | data = { 21 | "worker_name": args.worker_name, 22 | "check_heart_beat": args.check_heart_beat, 23 | "worker_status": None, 24 | } 25 | r = requests.post(url, json=data) 26 | assert r.status_code == 200 27 | -------------------------------------------------------------------------------- /llava/serve/test_message.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | import requests 5 | 6 | from llava.conversation import default_conversation 7 | 8 | 9 | def main(): 10 | if args.worker_address: 11 | worker_addr = args.worker_address 12 | else: 13 | controller_addr = args.controller_address 14 | ret = requests.post(controller_addr + "/refresh_all_workers") 15 | ret = requests.post(controller_addr + "/list_models") 16 | models = ret.json()["models"] 17 | models.sort() 18 | print(f"Models: {models}") 19 | 20 | ret = requests.post(controller_addr + "/get_worker_address", json={"model": args.model_name}) 21 | worker_addr = ret.json()["address"] 22 | print(f"worker_addr: {worker_addr}") 23 | 24 | if worker_addr == "": 25 | return 26 | 27 | conv = default_conversation.copy() 28 | conv.append_message(conv.roles[0], args.message) 29 | prompt = conv.get_prompt() 30 | 31 | headers = {"User-Agent": "LLaVA Client"} 32 | pload = { 33 | "model": args.model_name, 34 | "prompt": prompt, 35 | "max_new_tokens": args.max_new_tokens, 36 | "temperature": 0.7, 37 | 
"stop": conv.sep, 38 | } 39 | response = requests.post(worker_addr + "/worker_generate_stream", headers=headers, json=pload, stream=True) 40 | 41 | print(prompt.replace(conv.sep, "\n"), end="") 42 | for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False, delimiter=b"\0"): 43 | if chunk: 44 | data = json.loads(chunk.decode("utf-8")) 45 | output = data["text"].split(conv.sep)[-1] 46 | print(output, end="\r") 47 | print("") 48 | 49 | 50 | if __name__ == "__main__": 51 | parser = argparse.ArgumentParser() 52 | parser.add_argument("--controller-address", type=str, default="http://localhost:21001") 53 | parser.add_argument("--worker-address", type=str) 54 | parser.add_argument("--model-name", type=str, default="facebook/opt-350m") 55 | parser.add_argument("--max-new-tokens", type=int, default=32) 56 | parser.add_argument("--message", type=str, default="Tell me a story with more than 1000 words.") 57 | args = parser.parse_args() 58 | 59 | main() 60 | -------------------------------------------------------------------------------- /llava/train/train_mem.py: -------------------------------------------------------------------------------- 1 | from llava.train.train import train 2 | 3 | if __name__ == "__main__": 4 | train() 5 | -------------------------------------------------------------------------------- /playground/demo/xU25MMA2N4aVtYay.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/thunlp/LLaVA-UHD/08c85e208cd12c825303619723e4fe20e64f7f58/playground/demo/xU25MMA2N4aVtYay.mp4 -------------------------------------------------------------------------------- /playground/equal_splitter.py: -------------------------------------------------------------------------------- 1 | import json 2 | from math import ceil 3 | 4 | 5 | def split_json_file(input_file, n_splits): 6 | # Read the JSON file 7 | with open(input_file, "r") as file: 8 | data = json.load(file) 9 | 10 | # Calculate the size of each split 11 | total_items = len(data) 12 | items_per_split = ceil(total_items / n_splits) 13 | 14 | # Split the data and save into separate files 15 | for i in range(n_splits): 16 | start_index = i * items_per_split 17 | end_index = min((i + 1) * items_per_split, total_items) 18 | split_data = data[start_index:end_index] 19 | 20 | # Write the split data to a new JSON file 21 | with open(f"{input_file.split('.')[0]}_split_{i}.json", "w") as split_file: 22 | json.dump(split_data, split_file, indent=4) 23 | 24 | 25 | def main(): 26 | import argparse 27 | 28 | parser = argparse.ArgumentParser(description="Split a JSON file into multiple parts.") 29 | parser.add_argument("--input_file", type=str, help="The JSON file to split") 30 | parser.add_argument("--n_splits", type=int, help="The number of splits") 31 | 32 | args = parser.parse_args() 33 | 34 | split_json_file(args.input_file, args.n_splits) 35 | 36 | 37 | if __name__ == "__main__": 38 | main() 39 | -------------------------------------------------------------------------------- /playground/remove_mid_ckpt.py: -------------------------------------------------------------------------------- 1 | import os 2 | import shutil 3 | import glob 4 | 5 | 6 | def remove_checkpoints(directory, pattern): 7 | # Walk through the directory 8 | for root, dirs, files in os.walk(directory): 9 | # Use glob to find paths matching the pattern 10 | for file_path in glob.glob(os.path.join(root, pattern)): 11 | # Check if it is a directory 12 | if "llava-1.6-mistral-7b" in file_path: 13 | continue 14 | 
if os.path.isdir(file_path): 15 | # Remove the directory 16 | print(f"Removing {file_path}") 17 | input("Press Enter to continue...") 18 | shutil.rmtree(file_path) 19 | print(f"Removed directory: {file_path}") 20 | else: 21 | print(f"Removing {file_path}") 22 | input("Press Enter to continue...") 23 | # Remove the file 24 | os.remove(file_path) 25 | print(f"Removed file: {file_path}") 26 | 27 | 28 | # Directory containing the checkpoints 29 | directory = "/mnt/bn/vl-research/checkpoints/feng/" 30 | 31 | # Pattern to match in the file names 32 | pattern = "global_step*" 33 | 34 | # Call the function 35 | remove_checkpoints(directory, pattern) 36 | -------------------------------------------------------------------------------- /scripts/archived/convert_gqa_for_eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--src", type=str) 7 | parser.add_argument("--dst", type=str) 8 | args = parser.parse_args() 9 | 10 | all_answers = [] 11 | for line_idx, line in enumerate(open(args.src)): 12 | res = json.loads(line) 13 | question_id = res["question_id"] 14 | text = res["text"].rstrip(".").lower() 15 | all_answers.append({"questionId": question_id, "prediction": text}) 16 | 17 | with open(args.dst, "w") as f: 18 | json.dump(all_answers, f) 19 | -------------------------------------------------------------------------------- /scripts/archived/convert_mmvet_for_eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--src", type=str) 7 | parser.add_argument("--dst", type=str) 8 | args = parser.parse_args() 9 | 10 | cur_result = {} 11 | 12 | for line in open(args.src): 13 | data = json.loads(line) 14 | qid = data["question_id"] 15 | cur_result[f"v1_{qid}"] = data["text"] 16 | 17 | with open(args.dst, "w") as f: 18 | json.dump(cur_result, f, indent=2) 19 | -------------------------------------------------------------------------------- /scripts/archived/convert_vizwiz_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | 5 | from llava.eval.m4c_evaluator import EvalAIAnswerProcessor 6 | 7 | 8 | def parse_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument("--annotation-file", type=str, required=True) 11 | parser.add_argument("--result-file", type=str, required=True) 12 | parser.add_argument("--result-upload-file", type=str, required=True) 13 | return parser.parse_args() 14 | 15 | 16 | if __name__ == "__main__": 17 | 18 | args = parse_args() 19 | 20 | os.makedirs(os.path.dirname(args.result_upload_file), exist_ok=True) 21 | 22 | results = [] 23 | error_line = 0 24 | for line_idx, line in enumerate(open(args.result_file)): 25 | try: 26 | results.append(json.loads(line)) 27 | except: 28 | error_line += 1 29 | results = {x["question_id"]: x["text"] for x in results} 30 | test_split = [json.loads(line) for line in open(args.annotation_file)] 31 | split_ids = set([x["question_id"] for x in test_split]) 32 | 33 | print(f"total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}") 34 | 35 | all_answers = [] 36 | 37 | answer_processor = EvalAIAnswerProcessor() 38 | 39 | for x in test_split: 40 | # import pdb; pdb.set_trace() 41 | assert x["question_id"] in 
results, print(x) 42 | all_answers.append({"image": x["image"], "answer": answer_processor(results[x["question_id"]])}) 43 | 44 | with open(args.result_upload_file, "w") as f: 45 | json.dump(all_answers, f) 46 | -------------------------------------------------------------------------------- /scripts/archived/convert_vqav2_for_submission.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import json 4 | 5 | from llava.eval.m4c_evaluator import EvalAIAnswerProcessor 6 | 7 | 8 | def parse_args(): 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument("--dir", type=str, default="./playground/data/eval/vqav2") 11 | parser.add_argument("--ckpt", type=str, required=True) 12 | parser.add_argument("--split", type=str, required=True) 13 | return parser.parse_args() 14 | 15 | 16 | if __name__ == "__main__": 17 | 18 | args = parse_args() 19 | 20 | src = os.path.join(args.dir, "answers", args.split, args.ckpt, "merge.jsonl") 21 | test_split = os.path.join(args.dir, "llava_vqav2_mscoco_test2015.jsonl") 22 | dst = os.path.join(args.dir, "answers_upload", args.split, f"{args.ckpt}.json") 23 | os.makedirs(os.path.dirname(dst), exist_ok=True) 24 | 25 | results = [] 26 | error_line = 0 27 | for line_idx, line in enumerate(open(src)): 28 | try: 29 | results.append(json.loads(line)) 30 | except: 31 | error_line += 1 32 | 33 | results = {x["question_id"]: x["text"] for x in results} 34 | test_split = [json.loads(line) for line in open(test_split)] 35 | split_ids = set([x["question_id"] for x in test_split]) 36 | 37 | print(f"total results: {len(results)}, total split: {len(test_split)}, error_line: {error_line}") 38 | 39 | all_answers = [] 40 | 41 | answer_processor = EvalAIAnswerProcessor() 42 | 43 | for x in test_split: 44 | if x["question_id"] not in results: 45 | all_answers.append({"question_id": x["question_id"], "answer": ""}) 46 | else: 47 | all_answers.append({"question_id": x["question_id"], "answer": answer_processor(results[x["question_id"]])}) 48 | 49 | with open(dst, "w") as f: 50 | json.dump(all_answers, open(dst, "w")) 51 | -------------------------------------------------------------------------------- /scripts/archived/entry_cmd.sh: -------------------------------------------------------------------------------- 1 | python3 -m pip install --upgrade pip; 2 | 3 | export http_proxy=http://sys-proxy-rd-relay.byted.org:8118; 4 | export https_proxy=http://sys-proxy-rd-relay.byted.org:8118; 5 | 6 | export HF_HUB_ENABLE_HF_TRANSFER="1"; 7 | 8 | cd /mnt/bn/vl-research-boli01-cn/projects/zzz/lmms-eval; 9 | pip install -e .; 10 | 11 | cd /mnt/bn/vl-research-boli01-cn/projects/zzz/LLaVA_Next; 12 | pip install -e .; 13 | 14 | python3 -m pip install ninja; 15 | python3 -m pip install flash-attn --no-build-isolation; 16 | 17 | bash /mnt/bn/vl-research-boli01-cn/projects/zzz/LLaVA_Next/cn_scripts/vicuna/internal0.6m_finetune_llava1.6mix_7b_v0.2_unfreeze.sh 18 | 19 | 20 | accelerate launch --num_processes 8 --main_process_port 12345 -m lmms_eval \ 21 | --model llava \ 22 | --model_args pretrained="/mnt/bn/vl-research-boli01-cn/projects/zzz/LLaVA_Next/internal_project_checkpoints/llavanext-lmsys_vicuna-7b-v1.5-clip-vit-large-patch14-336-mlp2x_gelu-pretrain_internal0.6m_vicuna_v1_finetune_llava1.6_datamix_unfreezeVIS_1e" \ 23 | --tasks ok_vqa,textcaps_val,mme_test,mmmu,cmmmu,coco2017_cap_val,vizwiz_vqa_val,ai2d,chartqa,pope \ 24 | --batch_size 1 \ 25 | --log_samples \ 26 | --log_samples_suffix debug \ 27 | --output_path ./logs/ \ 
28 | --wandb_args 'project=llava-next-lmms-eval,job_type=eval'; -------------------------------------------------------------------------------- /scripts/archived/finetune_full_schedule.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Uncomment and set the following variables correspondingly to run this script: 4 | 5 | ################## VICUNA ################## 6 | # PROMPT_VERSION=v1 7 | # MODEL_VERSION="vicuna-v1-3-7b" 8 | ################## VICUNA ################## 9 | 10 | ################## LLaMA-2 ################## 11 | # PROMPT_VERSION="llava_llama_2" 12 | # MODEL_VERSION="llama-2-7b-chat" 13 | ################## LLaMA-2 ################## 14 | 15 | deepspeed llava/train/train_mem.py \ 16 | --deepspeed ./scripts/zero2.json \ 17 | --model_name_or_path ./checkpoints/$MODEL_VERSION \ 18 | --version $PROMPT_VERSION \ 19 | --data_path ./playground/data/llava_instruct_158k.json \ 20 | --image_folder /path/to/coco/train2017 \ 21 | --vision_tower openai/clip-vit-large-patch14 \ 22 | --pretrain_mm_mlp_adapter ./checkpoints/llava-$MODEL_VERSION-pretrain/mm_projector.bin \ 23 | --mm_vision_select_layer -2 \ 24 | --mm_use_im_start_end False \ 25 | --mm_use_im_patch_token False \ 26 | --bf16 True \ 27 | --output_dir ./checkpoints/llava-$MODEL_VERSION-finetune \ 28 | --num_train_epochs 3 \ 29 | --per_device_train_batch_size 16 \ 30 | --per_device_eval_batch_size 4 \ 31 | --gradient_accumulation_steps 1 \ 32 | --evaluation_strategy "no" \ 33 | --save_strategy "steps" \ 34 | --save_steps 50000 \ 35 | --save_total_limit 1 \ 36 | --learning_rate 2e-5 \ 37 | --weight_decay 0. \ 38 | --warmup_ratio 0.03 \ 39 | --lr_scheduler_type "cosine" \ 40 | --logging_steps 1 \ 41 | --tf32 True \ 42 | --model_max_length 2048 \ 43 | --gradient_checkpointing True \ 44 | --dataloader_num_workers 16 \ 45 | --lazy_preprocess True \ 46 | --report_to wandb 47 | -------------------------------------------------------------------------------- /scripts/archived/finetune_lora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Uncomment and set the following variables correspondingly to run this script: 4 | 5 | ################## VICUNA ################## 6 | # PROMPT_VERSION=v1 7 | # MODEL_VERSION="vicuna-v1-3-7b" 8 | ################## VICUNA ################## 9 | 10 | ################## LLaMA-2 ################## 11 | # PROMPT_VERSION="llava_llama_2" 12 | # MODEL_VERSION="llama-2-7b-chat" 13 | ################## LLaMA-2 ################## 14 | 15 | deepspeed llava/train/train_mem.py \ 16 | --deepspeed ./scripts/zero2.json \ 17 | --lora_enable True \ 18 | --model_name_or_path ./checkpoints/$MODEL_VERSION \ 19 | --version $PROMPT_VERSION \ 20 | --data_path ./playground/data/llava_instruct_80k.json \ 21 | --image_folder /path/to/coco/train2017 \ 22 | --vision_tower openai/clip-vit-large-patch14 \ 23 | --pretrain_mm_mlp_adapter ./checkpoints/llava-$MODEL_VERSION-pretrain/mm_projector.bin \ 24 | --mm_vision_select_layer -2 \ 25 | --mm_use_im_start_end False \ 26 | --mm_use_im_patch_token False \ 27 | --bf16 True \ 28 | --output_dir ./checkpoints/llava-$MODEL_VERSION-finetune_lora \ 29 | --num_train_epochs 1 \ 30 | --per_device_train_batch_size 16 \ 31 | --per_device_eval_batch_size 4 \ 32 | --gradient_accumulation_steps 1 \ 33 | --evaluation_strategy "no" \ 34 | --save_strategy "steps" \ 35 | --save_steps 50000 \ 36 | --save_total_limit 1 \ 37 | --learning_rate 2e-5 \ 38 | --weight_decay 0. 
\ 39 | --warmup_ratio 0.03 \ 40 | --lr_scheduler_type "cosine" \ 41 | --logging_steps 1 \ 42 | --tf32 True \ 43 | --model_max_length 2048 \ 44 | --gradient_checkpointing True \ 45 | --lazy_preprocess True \ 46 | --dataloader_num_workers 16 \ 47 | --report_to wandb 48 | -------------------------------------------------------------------------------- /scripts/archived/finetune_qlora.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Uncomment and set the following variables correspondingly to run this script: 4 | 5 | ################## VICUNA ################## 6 | # PROMPT_VERSION=v1 7 | # MODEL_VERSION="vicuna-v1-3-7b" 8 | ################## VICUNA ################## 9 | 10 | ################## LLaMA-2 ################## 11 | # PROMPT_VERSION="llava_llama_2" 12 | # MODEL_VERSION="llama-2-7b-chat" 13 | ################## LLaMA-2 ################## 14 | 15 | deepspeed llava/train/train_mem.py \ 16 | --deepspeed ./scripts/zero2.json \ 17 | --lora_enable True \ 18 | --bits 4 \ 19 | --model_name_or_path ./checkpoints/$MODEL_VERSION \ 20 | --version $PROMPT_VERSION \ 21 | --data_path ./playground/data/llava_instruct_80k.json \ 22 | --image_folder /path/to/coco/train2017 \ 23 | --vision_tower openai/clip-vit-large-patch14 \ 24 | --pretrain_mm_mlp_adapter ./checkpoints/llava-$MODEL_VERSION-pretrain/mm_projector.bin \ 25 | --mm_vision_select_layer -2 \ 26 | --mm_use_im_start_end False \ 27 | --mm_use_im_patch_token False \ 28 | --bf16 True \ 29 | --output_dir ./checkpoints/llava-$MODEL_VERSION-finetune_lora \ 30 | --num_train_epochs 1 \ 31 | --per_device_train_batch_size 16 \ 32 | --per_device_eval_batch_size 4 \ 33 | --gradient_accumulation_steps 1 \ 34 | --evaluation_strategy "no" \ 35 | --save_strategy "steps" \ 36 | --save_steps 50000 \ 37 | --save_total_limit 1 \ 38 | --learning_rate 2e-5 \ 39 | --weight_decay 0. \ 40 | --warmup_ratio 0.03 \ 41 | --lr_scheduler_type "cosine" \ 42 | --logging_steps 1 \ 43 | --tf32 True \ 44 | --model_max_length 2048 \ 45 | --gradient_checkpointing True \ 46 | --lazy_preprocess True \ 47 | --dataloader_num_workers 16 \ 48 | --report_to wandb 49 | -------------------------------------------------------------------------------- /scripts/archived/finetune_sqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | deepspeed llava/train/train_mem.py \ 4 | --deepspeed ./scripts/zero2.json \ 5 | --model_name_or_path lmsys/vicuna-13b-v1.3 \ 6 | --version $PROMPT_VERSION \ 7 | --data_path /Data/ScienceQA/data/scienceqa/llava_train_QCM-LEA.json \ 8 | --image_folder /Data/ScienceQA/data/scienceqa/images/train \ 9 | --vision_tower openai/clip-vit-large-patch14 \ 10 | --pretrain_mm_mlp_adapter ./checkpoints/huggingface/liuhaotian/llava-pretrain-vicuna-13b-v1.3/mm_projector.bin \ 11 | --mm_vision_select_layer -2 \ 12 | --mm_use_im_start_end False \ 13 | --mm_use_im_patch_token False \ 14 | --bf16 True \ 15 | --output_dir ./checkpoints/llava-vicuna-13b-v1.3-pretrain_lcs558k_plain-ScienceQA_QCM_LEA-12e \ 16 | --num_train_epochs 12 \ 17 | --per_device_train_batch_size 16 \ 18 | --per_device_eval_batch_size 4 \ 19 | --gradient_accumulation_steps 1 \ 20 | --evaluation_strategy "no" \ 21 | --save_strategy "steps" \ 22 | --save_steps 50000 \ 23 | --save_total_limit 1 \ 24 | --learning_rate 2e-5 \ 25 | --weight_decay 0. 
\ 26 | --warmup_ratio 0.03 \ 27 | --lr_scheduler_type "cosine" \ 28 | --logging_steps 1 \ 29 | --tf32 True \ 30 | --model_max_length 2048 \ 31 | --gradient_checkpointing True \ 32 | --dataloader_num_workers 16 \ 33 | --lazy_preprocess True \ 34 | --report_to wandb 35 | -------------------------------------------------------------------------------- /scripts/archived/merge_lora_weights.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | from llava.model.builder import load_pretrained_model 3 | from llava.mm_utils import get_model_name_from_path 4 | 5 | 6 | def merge_lora(args): 7 | model_name = get_model_name_from_path(args.model_path) 8 | tokenizer, model, image_processor, context_len = load_pretrained_model(args.model_path, args.model_base, model_name, device_map="cpu") 9 | 10 | model.save_pretrained(args.save_model_path) 11 | tokenizer.save_pretrained(args.save_model_path) 12 | 13 | 14 | if __name__ == "__main__": 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument("--model-path", type=str, required=True) 17 | parser.add_argument("--model-base", type=str, required=True) 18 | parser.add_argument("--save-model-path", type=str, required=True) 19 | 20 | args = parser.parse_args() 21 | 22 | merge_lora(args) 23 | -------------------------------------------------------------------------------- /scripts/archived/pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Uncomment and set the following variables correspondingly to run this script: 4 | 5 | # MODEL_VERSION=vicuna-v1-3-7b 6 | # MODEL_VERSION=llama-2-7b-chat 7 | 8 | ########### DO NOT CHANGE ########### 9 | ########### USE THIS FOR BOTH ########### 10 | PROMPT_VERSION=plain 11 | ########### DO NOT CHANGE ########### 12 | 13 | deepspeed llava/train/train_mem.py \ 14 | --deepspeed ./scripts/zero2.json \ 15 | --model_name_or_path ./checkpoints/$MODEL_VERSION \ 16 | --version $PROMPT_VERSION \ 17 | --data_path /path/to/pretrain_data.json \ 18 | --image_folder /path/to/images \ 19 | --vision_tower openai/clip-vit-large-patch14 \ 20 | --tune_mm_mlp_adapter True \ 21 | --mm_vision_select_layer -2 \ 22 | --mm_use_im_start_end False \ 23 | --mm_use_im_patch_token False \ 24 | --bf16 True \ 25 | --output_dir ./checkpoints/llava-$MODEL_VERSION-pretrain \ 26 | --num_train_epochs 1 \ 27 | --per_device_train_batch_size 16 \ 28 | --per_device_eval_batch_size 4 \ 29 | --gradient_accumulation_steps 1 \ 30 | --evaluation_strategy "no" \ 31 | --save_strategy "steps" \ 32 | --save_steps 24000 \ 33 | --learning_rate 2e-3 \ 34 | --weight_decay 0. 
\ 35 | --warmup_ratio 0.03 \ 36 | --lr_scheduler_type "cosine" \ 37 | --logging_steps 1 \ 38 | --tf32 True \ 39 | --model_max_length 2048 \ 40 | --gradient_checkpointing True \ 41 | --dataloader_num_workers 16 \ 42 | --lazy_preprocess True \ 43 | --report_to wandb 44 | -------------------------------------------------------------------------------- /scripts/archived/quick_check.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import argparse 4 | from tqdm import tqdm 5 | import yaml 6 | 7 | 8 | def check_missing_images(json_path, images_folder): 9 | data = json.load(open(json_path, "r")) 10 | missing_data = [] 11 | 12 | for i, d in enumerate(tqdm(data)): 13 | image = d["image"] if "image" in d else "" 14 | if image != "": 15 | path = os.path.join(images_folder, image) 16 | if not os.path.exists(path): 17 | print(f"Missing image: {path}") 18 | missing_data.append(d) 19 | 20 | return missing_data 21 | 22 | 23 | def read_yaml_to_llava_data(yaml_path, images_folder): 24 | print(f"Reading YAML file: {yaml_path}") 25 | with open(yaml_path, "r") as f: 26 | data = yaml.safe_load(f) 27 | 28 | llava_json_paths = data["datasets"] 29 | for item in llava_json_paths: 30 | json_path = item["json_path"] 31 | missing_data = check_missing_images(json_path, images_folder) 32 | if len(missing_data) > 0: 33 | print(f"Missing images in {json_path}:") 34 | for d in missing_data: 35 | print(d) 36 | 37 | 38 | def direct_check_llava_data(json_path, images_folder): 39 | missing_data = check_missing_images(json_path, images_folder) 40 | if len(missing_data) > 0: 41 | print(f"Missing images in {json_path}:") 42 | for d in missing_data: 43 | print(d) 44 | 45 | 46 | if __name__ == "__main__": 47 | parser = argparse.ArgumentParser(description="Check for missing images in dataset.") 48 | parser.add_argument("--yaml_path", type=str, default="", help="Path to the YAML file containing the dataset.") 49 | parser.add_argument("--json_path", type=str, default="", help="Path to the JSON file containing the dataset.") 50 | parser.add_argument("--images_folder", type=str, default="/mnt/bn/vl-research/data/llava_data", help="Path to the folder containing the images.") 51 | 52 | args = parser.parse_args() 53 | 54 | if args.json_path != "": 55 | direct_check_llava_data(args.json_path, args.images_folder) 56 | elif args.yaml_path != "": 57 | read_yaml_to_llava_data(args.yaml_path, args.images_folder) 58 | -------------------------------------------------------------------------------- /scripts/archived/sqa_eval_batch.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CHUNKS=8 4 | for IDX in {0..7}; do 5 | CUDA_VISIBLE_DEVICES=$IDX python -m llava.eval.model_vqa_science \ 6 | --model-path liuhaotian/llava-lcs558k-scienceqa-vicuna-13b-v1.3 \ 7 | --question-file ~/haotian/datasets/ScienceQA/data/scienceqa/llava_test_QCM-LEA.json \ 8 | --image-folder ~/haotian/datasets/ScienceQA/data/scienceqa/images/test \ 9 | --answers-file ./test_llava-13b-chunk$CHUNKS_$IDX.jsonl \ 10 | --num-chunks $CHUNKS \ 11 | --chunk-idx $IDX \ 12 | --conv-mode llava_v1 & 13 | done 14 | -------------------------------------------------------------------------------- /scripts/archived/sqa_eval_gather.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | CHUNKS=8 4 | output_file="test_llava-13b.jsonl" 5 | 6 | # Clear out the output file if it exists. 
7 | > "$output_file" 8 | 9 | # Loop through the indices and concatenate each file. 10 | for idx in $(seq 0 $((CHUNKS-1))); do 11 | cat "./test_llava-13b-chunk${idx}.jsonl" >> "$output_file" 12 | done 13 | 14 | python llava/eval/eval_science_qa.py \ 15 | --base-dir ~/haotian/datasets/ScienceQA/data/scienceqa \ 16 | --result-file ./test_llava-13b.jsonl \ 17 | --output-file ./test_llava-13b_output.json \ 18 | --output-result ./test_llava-13b_result.json 19 | -------------------------------------------------------------------------------- /scripts/convert_gqa_for_eval.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | 5 | parser = argparse.ArgumentParser() 6 | parser.add_argument("--src", type=str) 7 | parser.add_argument("--dst", type=str) 8 | args = parser.parse_args() 9 | 10 | all_answers = [] 11 | for line_idx, line in enumerate(open(args.src)): 12 | res = json.loads(line) 13 | question_id = res['question_id'] 14 | text = res['text'].rstrip('.').lower() 15 | all_answers.append({"questionId": question_id, "prediction": text}) 16 | 17 | with open(args.dst, 'w') as f: 18 | json.dump(all_answers, f) 19 | -------------------------------------------------------------------------------- /scripts/interleave/eval_all.sh: -------------------------------------------------------------------------------- 1 | 2 | # evaluate 3 | ./scripts/interleave/eval_interleave_3d.sh /path/to/ckpt /path/to/images multi_image_in_domain 4 | ./scripts/interleave/eval_interleave_3d.sh /path/to/ckpt /path/to/images multi_image_out_domain 5 | ./scripts/interleave/eval_interleave_3d.sh /path/to/ckpt /path/to/images multi_view_in_domain -------------------------------------------------------------------------------- /scripts/interleave/eval_interleave_3d.sh: -------------------------------------------------------------------------------- 1 | alias python=python3 2 | CKPT_PATH=$1 3 | NAME=$(echo "$CKPT_PATH" | awk -F'/' '{print $NF}') 4 | echo $NAME 5 | ##### set images path 6 | DATA_PATH=$2 7 | EVAL_TYPE=$3 8 | JSON_PATH=$2/$3.json 9 | ############################### eval multi-image 10 | RESULT_NAME="logs/${NAME}/${EVAL_TYPE}" 11 | echo $RESULT_NAME 12 | 13 | mkdir -p logs/${NAME} 14 | 15 | file_path=${RESULT_NAME}/result.jsonl 16 | 17 | bash scripts/interleave/eval_multiprocess.sh \ 18 | ${CKPT_PATH} \ 19 | ${JSON_PATH} \ 20 | ${RESULT_NAME} \ 21 | ${DATA_PATH} \ 22 | "" \ 23 | 8 0 24 | 25 | python3 llava/eval/evaluate_interleave.py --result-dir ${RESULT_NAME} 26 | 27 | 28 | 29 | -------------------------------------------------------------------------------- /scripts/interleave/eval_multiprocess.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Check if three arguments are passed 4 | if [ "$#" -ne 7 ]; then 5 | echo "Usage: $0 " 6 | exit 1 7 | fi 8 | 9 | # Assign the command line arguments to variables 10 | model_path=$1 11 | question_path=$2 12 | base_answer_path=$3 13 | image_folder=$4 14 | extra_prompt=$5 15 | N=$6 16 | temperature=$7 17 | 18 | # Loop over each chunk/process 19 | for (( chunk_id=0; chunk_id "${base_answer_path}.jsonl" 42 | for ((i=0; i> "${base_answer_path}/result.jsonl" 45 | done 46 | # remove the unmerged files 47 | for (( chunk_id=0; chunk_id "$output_file" 41 | 42 | # Loop through the indices and concatenate each file. 
43 | for IDX in $(seq 0 $((CHUNKS-1))); do 44 | cat ./playground/data/eval/ai2d/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 45 | done 46 | 47 | python -m llava.eval.eval_ai2d \ 48 | --annotation-file ./playground/data/eval/ai2d/test_from_mova.jsonl \ 49 | --result-file $output_file \ 50 | --mid_result ./playground/data/eval/ai2d/mid_results/$CKPT.jsonl \ 51 | --output_result ./exp_results/$CKPT/ai2d_result.jsonl -------------------------------------------------------------------------------- /scripts/v1_5/eval/deepform.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT=$1 9 | echo $CKPT 10 | SPLIT="test" 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ 14 | --model-path ./checkpoints_new/$CKPT \ 15 | --question-file ./playground/data/eval/DeepForm/deep_questions.jsonl \ 16 | --image-folder ./playground/data/ureader/DUE_Benchmark/DeepForm/page_pngs \ 17 | --answers-file ./playground/data/eval/DeepForm/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \ 18 | --num-chunks $CHUNKS \ 19 | --chunk-idx $IDX \ 20 | --temperature 0 \ 21 | --num_beams 1 \ 22 | --conv-mode vicuna_v1 & 23 | done 24 | 25 | wait 26 | 27 | output_file=./playground/data/eval/DeepForm/answers/$SPLIT/$CKPT/merge_slice.jsonl 28 | 29 | # Clear out the output file if it exists. 30 | > "$output_file" 31 | 32 | # Loop through the indices and concatenate each file. 33 | for IDX in $(seq 0 $((CHUNKS-1))); do 34 | cat ./playground/data/eval/DeepForm/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 35 | done 36 | 37 | python -m llava.eval.eval_docvqa \ 38 | --annotation-file ./playground/data/eval/DeepForm/deep_annotations.jsonl \ 39 | --result-file $output_file \ 40 | --mid_result ./playground/data/eval/DeepForm/mid_results/$CKPT.jsonl \ 41 | --output_result ./exp_results/$CKPT/deep_result.jsonl -------------------------------------------------------------------------------- /scripts/v1_5/eval/docvqa_test.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m mova.eval.model_vqa_loader \ 4 | --model-path checkpoints/mova-8b \ 5 | --question-file ./playground/data/eval/docvqa/test.jsonl \ 6 | --image-folder ./playground/data/eval/docvqa/test/documents/ \ 7 | --answers-file ./playground/data/eval/docvqa/answers/mova-8b.jsonl \ 8 | --temperature 0 \ 9 | --conv-mode mova_llama3 10 | 11 | python scripts/convert_docvqa_for_submission.py \ 12 | --result-dir ./playground/data/eval/docvqa/answers \ 13 | --upload_dir ./playground/data/eval/docvqa/upload_results \ 14 | --experiment mova-8b 15 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/docvqa_val.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT=$1 9 | echo $CKPT 10 | SPLIT="val" 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ 14 | --model-path ./checkpoints_new/$CKPT \ 15 | --question-file ./playground/data/eval/docvqa/docvqa_questions.jsonl \ 16 | --image-folder ./playground/data/ureader/DUE_Benchmark/DocVQA/pngs \ 17 | 
--answers-file ./playground/data/eval/docvqa/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \ 18 | --num-chunks $CHUNKS \ 19 | --chunk-idx $IDX \ 20 | --temperature 0 \ 21 | --num_beams 1 \ 22 | --conv-mode qwen_1_5 & 23 | done 24 | 25 | wait 26 | 27 | output_file=./playground/data/eval/docvqa/answers/$SPLIT/$CKPT/merge_slice.jsonl 28 | 29 | # Clear out the output file if it exists. 30 | > "$output_file" 31 | 32 | # Loop through the indices and concatenate each file. 33 | for IDX in $(seq 0 $((CHUNKS-1))); do 34 | cat ./playground/data/eval/docvqa/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 35 | done 36 | 37 | python -m llava.eval.eval_docvqa \ 38 | --annotation-file ./playground/data/eval/docvqa/docvqa_annotations.jsonl \ 39 | --result-file $output_file \ 40 | --mid_result ./playground/data/eval/docvqa/mid_results/$CKPT.jsonl \ 41 | --output_result ./exp_results/$CKPT/docvqa_result.jsonl -------------------------------------------------------------------------------- /scripts/v1_5/eval/estvqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT=$1 9 | echo $CKPT 10 | SPLIT="test" 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ 14 | --model-path ./checkpoints_new/$CKPT \ 15 | --question-file ./playground/data/eval/ESTVQA/est_questions.jsonl \ 16 | --image-folder ./playground/data/eval/ESTVQA/test \ 17 | --answers-file ./playground/data/eval/ESTVQA/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \ 18 | --num-chunks $CHUNKS \ 19 | --chunk-idx $IDX \ 20 | --temperature 0 \ 21 | --num_beams 1 \ 22 | --conv-mode vicuna_v1 & 23 | done 24 | 25 | wait 26 | 27 | output_file=./playground/data/eval/ESTVQA/answers/$SPLIT/$CKPT/merge_slice.jsonl 28 | 29 | # Clear out the output file if it exists. 30 | > "$output_file" 31 | 32 | # Loop through the indices and concatenate each file. 
33 | for IDX in $(seq 0 $((CHUNKS-1))); do 34 | cat ./playground/data/eval/ESTVQA/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 35 | done 36 | 37 | python -m llava.eval.eval_docvqa \ 38 | --annotation-file ./playground/data/eval/ESTVQA/est_annotations.jsonl \ 39 | --result-file $output_file \ 40 | --mid_result ./playground/data/eval/ESTVQA/mid_results/$CKPT.jsonl \ 41 | --output_result ./exp_results/$CKPT/est_result.jsonl -------------------------------------------------------------------------------- /scripts/v1_5/eval/gqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT=$1 9 | echo $CKPT 10 | SPLIT="llava_gqa_testdev_balanced" 11 | GQADIR="./playground/data/eval/gqa/data" 12 | 13 | for IDX in $(seq 0 $((CHUNKS-1))); do 14 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ 15 | --model-path ./checkpoints_new/$CKPT \ 16 | --question-file ./playground/data/eval/gqa/$SPLIT.jsonl \ 17 | --image-folder ./playground/data/eval/gqa/data/images \ 18 | --answers-file ./playground/data/eval/gqa/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \ 19 | --num-chunks $CHUNKS \ 20 | --chunk-idx $IDX \ 21 | --temperature 0 \ 22 | --num_beams 3 \ 23 | --conv-mode qwen_1_5 & 24 | done 25 | 26 | wait 27 | 28 | output_file=./playground/data/eval/gqa/answers/$SPLIT/$CKPT/merge.jsonl 29 | 30 | # Clear out the output file if it exists. 31 | > "$output_file" 32 | 33 | # Loop through the indices and concatenate each file. 34 | for IDX in $(seq 0 $((CHUNKS-1))); do 35 | cat ./playground/data/eval/gqa/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 36 | done 37 | 38 | python scripts/convert_gqa_for_eval.py --src $output_file --dst $GQADIR/testdev_balanced_predictions.json 39 | 40 | cd $GQADIR 41 | python eval/eval.py --tier testdev_balanced 42 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/infographics.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT=$1 9 | echo $CKPT 10 | SPLIT="test" 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ 14 | --model-path ./checkpoints_new/$CKPT \ 15 | --question-file ./playground/data/eval/InfographicsVQA/info_questions.jsonl \ 16 | --image-folder ./playground/data/ureader/DUE_Benchmark/InfographicsVQA/pngs \ 17 | --answers-file ./playground/data/eval/InfographicsVQA/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \ 18 | --num-chunks $CHUNKS \ 19 | --chunk-idx $IDX \ 20 | --temperature 0 \ 21 | --num_beams 1 \ 22 | --conv-mode vicuna_v1 & 23 | done 24 | 25 | wait 26 | 27 | output_file=./playground/data/eval/InfographicsVQA/answers/$SPLIT/$CKPT/merge_slice.jsonl 28 | 29 | # Clear out the output file if it exists. 30 | > "$output_file" 31 | 32 | # Loop through the indices and concatenate each file. 
33 | for IDX in $(seq 0 $((CHUNKS-1))); do 34 | cat ./playground/data/eval/InfographicsVQA/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 35 | done 36 | 37 | python -m llava.eval.eval_docvqa \ 38 | --annotation-file ./playground/data/eval/InfographicsVQA/info_annotations.jsonl \ 39 | --result-file $output_file \ 40 | --mid_result ./playground/data/eval/InfographicsVQA/mid_results/$CKPT.jsonl \ 41 | --output_result ./exp_results/$CKPT/info_result.jsonl -------------------------------------------------------------------------------- /scripts/v1_5/eval/llavabench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | python -m llava.eval.model_vqa \ 4 | --model-path liuhaotian/llava-v1.5-13b \ 5 | --question-file ./playground/data/eval/llava-bench-in-the-wild/questions.jsonl \ 6 | --image-folder ./playground/data/eval/llava-bench-in-the-wild/images \ 7 | --answers-file ./playground/data/eval/llava-bench-in-the-wild/answers/llava-v1.5-13b.jsonl \ 8 | --temperature 0 \ 9 | --conv-mode vicuna_v1 10 | 11 | mkdir -p playground/data/eval/llava-bench-in-the-wild/reviews 12 | 13 | python llava/eval/eval_gpt_review_bench.py \ 14 | --question playground/data/eval/llava-bench-in-the-wild/questions.jsonl \ 15 | --context playground/data/eval/llava-bench-in-the-wild/context.jsonl \ 16 | --rule llava/eval/table/rule.json \ 17 | --answer-list \ 18 | playground/data/eval/llava-bench-in-the-wild/answers_gpt4.jsonl \ 19 | playground/data/eval/llava-bench-in-the-wild/answers/llava-v1.5-13b.jsonl \ 20 | --output \ 21 | playground/data/eval/llava-bench-in-the-wild/reviews/llava-v1.5-13b.jsonl 22 | 23 | python llava/eval/summarize_gpt_review.py -f playground/data/eval/llava-bench-in-the-wild/reviews/llava-v1.5-13b.jsonl 24 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/mmbench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # SPLIT="mmbench_dev_20230712" 4 | 5 | # python -m llava.eval.model_vqa_mmbench \ 6 | # --model-path liuhaotian/llava-v1.5-13b \ 7 | # --question-file ./playground/data/eval/mmbench/$SPLIT.tsv \ 8 | # --answers-file ./playground/data/eval/mmbench/answers/$SPLIT/llava-v1.5-13b.jsonl \ 9 | # --single-pred-prompt \ 10 | # --temperature 0 \ 11 | # --conv-mode vicuna_v1 12 | 13 | 14 | 15 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 16 | IFS=',' read -ra GPULIST <<< "$gpu_list" 17 | 18 | CHUNKS=${#GPULIST[@]} 19 | 20 | CKPT=$1 21 | echo $CKPT 22 | SPLIT="mmbench_dev_20230712" 23 | 24 | for IDX in $(seq 0 $((CHUNKS-1))); do 25 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_mmbench \ 26 | --model-path ./checkpoints_new/$CKPT \ 27 | --question-file ./playground/data/eval/mmbench/$SPLIT.tsv \ 28 | --answers-file ./playground/data/eval/mmbench/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \ 29 | --num-chunks $CHUNKS \ 30 | --chunk-idx $IDX \ 31 | --temperature 0 \ 32 | --single-pred-prompt \ 33 | --num_beams 1 \ 34 | --conv-mode vicuna_v1 & 35 | done 36 | 37 | wait 38 | 39 | output_file=./playground/data/eval/mmbench/answers/$SPLIT/$CKPT/merge.jsonl 40 | 41 | # Clear out the output file if it exists. 42 | > "$output_file" 43 | 44 | # Loop through the indices and concatenate each file. 
45 | for IDX in $(seq 0 $((CHUNKS-1))); do 46 | cat ./playground/data/eval/mmbench/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 47 | done 48 | 49 | 50 | mkdir -p playground/data/eval/mmbench/answers_upload/$SPLIT 51 | 52 | python scripts/convert_mmbench_for_submission.py \ 53 | --annotation-file ./playground/data/eval/mmbench/$SPLIT.tsv \ 54 | --result-dir ./playground/data/eval/mmbench/answers/$SPLIT/$CKPT \ 55 | --upload-dir ./playground/data/eval/mmbench/answers_upload/$SPLIT \ 56 | --experiment merge 57 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/mmbench_cn.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # SPLIT="mmbench_dev_cn_20231003" 4 | 5 | 6 | # python -m llava.eval.model_vqa_mmbench \ 7 | # --model-path liuhaotian/llava-v1.5-13b \ 8 | # --question-file ./playground/data/eval/mmbench_cn/$SPLIT.tsv \ 9 | # --answers-file ./playground/data/eval/mmbench_cn/answers/$SPLIT/llava-v1.5-13b.jsonl \ 10 | # --lang cn \ 11 | # --single-pred-prompt \ 12 | # --temperature 0 \ 13 | # --conv-mode vicuna_v1 14 | 15 | 16 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 17 | IFS=',' read -ra GPULIST <<< "$gpu_list" 18 | 19 | CHUNKS=${#GPULIST[@]} 20 | 21 | CKPT=$1 22 | echo $CKPT 23 | SPLIT="mmbench_dev_cn_20231003" 24 | 25 | for IDX in $(seq 0 $((CHUNKS-1))); do 26 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_mmbench \ 27 | --model-path ./checkpoints_new/$CKPT \ 28 | --question-file ./playground/data/eval/mmbench/$SPLIT.tsv \ 29 | --answers-file ./playground/data/eval/mmbench/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \ 30 | --lang cn \ 31 | --single-pred-prompt \ 32 | --num-chunks $CHUNKS \ 33 | --chunk-idx $IDX \ 34 | --temperature 0.2 \ 35 | --num_beams 1 \ 36 | --conv-mode vicuna_v1 & 37 | done 38 | 39 | wait 40 | 41 | output_file=./playground/data/eval/mmbench/answers/$SPLIT/$CKPT/merge.jsonl 42 | 43 | # Clear out the output file if it exists. 44 | > "$output_file" 45 | 46 | # Loop through the indices and concatenate each file. 
47 | for IDX in $(seq 0 $((CHUNKS-1))); do 48 | cat ./playground/data/eval/mmbench/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 49 | done 50 | 51 | mkdir -p playground/data/eval/mmbench/answers_upload/$SPLIT 52 | 53 | python scripts/convert_mmbench_for_submission.py \ 54 | --annotation-file ./playground/data/eval/mmbench/$SPLIT.tsv \ 55 | --result-dir ./playground/data/eval/mmbench/answers/$SPLIT/$CKPT \ 56 | --upload-dir ./playground/data/eval/mmbench/answers_upload/$SPLIT \ 57 | --experiment merge 58 | 59 | 60 | 61 | 62 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/mme.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT=$1 9 | echo $CKPT 10 | SPLIT="llava_mme" 11 | 12 | 13 | for IDX in $(seq 0 $((CHUNKS-1))); do 14 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ 15 | --model-path ./checkpoints_new/$CKPT \ 16 | --question-file ./playground/data/eval/MME/$SPLIT.jsonl \ 17 | --image-folder ./playground/data/eval/MME/MME_Benchmark_release_version \ 18 | --answers-file ./playground/data/eval/MME/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \ 19 | --num-chunks $CHUNKS \ 20 | --chunk-idx $IDX \ 21 | --temperature 0 \ 22 | --num_beams 5 \ 23 | --conv-mode qwen_1_5 & 24 | done 25 | 26 | wait 27 | 28 | output_file=./playground/data/eval/MME/answers/$SPLIT/$CKPT/merge.jsonl 29 | 30 | # Clear out the output file if it exists. 31 | > "$output_file" 32 | 33 | # Loop through the indices and concatenate each file. 34 | for IDX in $(seq 0 $((CHUNKS-1))); do 35 | cat ./playground/data/eval/MME/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 36 | done 37 | 38 | 39 | cp $output_file ./playground/data/eval/MME/answers/$CKPT.jsonl 40 | 41 | cd ./playground/data/eval/MME 42 | 43 | python convert_answer_to_mme.py --experiment $CKPT 44 | 45 | cd eval_tool 46 | 47 | python calculation.py --results_dir answers/$CKPT 48 | 49 | 50 | 51 | 52 | # python -m llava.eval.model_vqa_loader \ 53 | # --model-path ./checkpoints_new/$CKPT \ 54 | # --question-file ./playground/data/eval/MME/$SPLIT.jsonl \ 55 | # --image-folder ./playground/data/eval/MME/MME_Benchmark_release_version \ 56 | # --answers-file ./playground/data/eval/MME/answers/$CKPT.jsonl \ 57 | # --temperature 0 \ 58 | # --num_beams 3 \ 59 | # --conv-mode vicuna_v1 60 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/mmvet.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 5 | IFS=',' read -ra GPULIST <<< "$gpu_list" 6 | 7 | CHUNKS=${#GPULIST[@]} 8 | 9 | CKPT=$1 10 | echo $CKPT 11 | SPLIT="llava-mm-vet" 12 | 13 | # python -m llava.eval.model_vqa \ 14 | # --model-path liuhaotian/llava-v1.5-13b \ 15 | # --question-file ./playground/data/eval/mm-vet/llava-mm-vet.jsonl \ 16 | # --image-folder ./playground/data/eval/mm-vet/images \ 17 | # --answers-file ./playground/data/eval/mm-vet/answers/llava-v1.5-13b.jsonl \ 18 | # --temperature 0 \ 19 | # --conv-mode vicuna_v1 20 | 21 | for IDX in $(seq 0 $((CHUNKS-1))); do 22 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ 23 | --model-path ./checkpoints_new/$CKPT \ 24 | --question-file ./playground/data/eval/mm-vet/$SPLIT.jsonl \ 25 | --image-folder 
./playground/data/eval/mm-vet/images \ 26 | --answers-file ./playground/data/eval/mm-vet/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \ 27 | --num-chunks $CHUNKS \ 28 | --chunk-idx $IDX \ 29 | --temperature 0 \ 30 | --num_beams 5 \ 31 | --conv-mode vicuna_v1 & 32 | done 33 | 34 | wait 35 | 36 | output_file=./playground/data/eval/mm-vet/answers/$SPLIT/$CKPT/merge.jsonl 37 | 38 | # Clear out the output file if it exists. 39 | > "$output_file" 40 | 41 | # Loop through the indices and concatenate each file. 42 | for IDX in $(seq 0 $((CHUNKS-1))); do 43 | cat ./playground/data/eval/mm-vet/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 44 | done 45 | 46 | 47 | mkdir -p ./playground/data/eval/mm-vet/results 48 | 49 | python scripts/convert_mmvet_for_eval.py \ 50 | --src $output_file \ 51 | --dst ./playground/data/eval/mm-vet/results/$CKPT.json 52 | 53 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/pope.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT=$1 9 | echo $CKPT 10 | SPLIT="llava_pope_test_my" 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ 14 | --model-path ./checkpoints_new/$CKPT \ 15 | --question-file ./playground/data/eval/pope/$SPLIT.jsonl \ 16 | --image-folder ./playground/data/eval/pope/val2014 \ 17 | --answers-file ./playground/data/eval/pope/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \ 18 | --num-chunks $CHUNKS \ 19 | --chunk-idx $IDX \ 20 | --temperature 0 \ 21 | --num_beams 1 \ 22 | --conv-mode vicuna_v1 & 23 | done 24 | 25 | wait 26 | 27 | output_file=./playground/data/eval/pope/answers/$SPLIT/$CKPT/merge_slice.jsonl 28 | 29 | # Clear out the output file if it exists. 30 | > "$output_file" 31 | 32 | # Loop through the indices and concatenate each file. 33 | for IDX in $(seq 0 $((CHUNKS-1))); do 34 | cat ./playground/data/eval/pope/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 35 | done 36 | 37 | python llava/eval/eval_pope.py \ 38 | --annotation-dir ./playground/data/eval/pope/coco \ 39 | --question-file ./playground/data/eval/pope/$SPLIT.jsonl \ 40 | --result-file ./playground/data/eval/pope/answers/$SPLIT/$CKPT/merge_slice.jsonl 41 | 42 | 43 | # CKPT="llava-v1.5-adapt" 44 | 45 | # python -m llava.eval.model_vqa_loader \ 46 | # --model-path ./checkpoints_new/$CKPT \ 47 | # --question-file ./playground/data/eval/pope/llava_pope_test.jsonl \ 48 | # --image-folder ./playground/data/eval/pope/val2014 \ 49 | # --answers-file ./playground/data/eval/pope/answers/$CKPT.jsonl \ 50 | # --temperature 0 \ 51 | # --conv-mode vicuna_v1 52 | 53 | # python llava/eval/eval_pope.py \ 54 | # --annotation-dir ./playground/data/eval/pope/coco \ 55 | # --question-file ./playground/data/eval/pope/llava_pope_test.jsonl \ 56 | # --result-file ./playground/data/eval/pope/answers/$CKPT.jsonl 57 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/qbench.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "$1" = "dev" ]; then 4 | echo "Evaluating in 'dev' split." 5 | elif [ "$1" = "test" ]; then 6 | echo "Evaluating in 'test' split." 7 | else 8 | echo "Unknown split, please choose between 'dev' and 'test'." 
9 | exit 1 10 | fi 11 | 12 | python -m llava.eval.model_vqa_qbench \ 13 | --model-path liuhaotian/llava-v1.5-13b \ 14 | --image-folder ./playground/data/eval/qbench/images_llvisionqa/ \ 15 | --questions-file ./playground/data/eval/qbench/llvisionqa_$1.json \ 16 | --answers-file ./playground/data/eval/qbench/llvisionqa_$1_answers.jsonl \ 17 | --conv-mode llava_v1 \ 18 | --lang en 19 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/qbench_zh.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | if [ "$1" = "dev" ]; then 4 | ZH_SPLIT="验证集" 5 | echo "Evaluating in 'dev' split." 6 | elif [ "$1" = "test" ]; then 7 | ZH_SPLIT="测试集" 8 | echo "Evaluating in 'test' split." 9 | else 10 | echo "Unknown split, please choose between 'dev' and 'test'." 11 | exit 1 12 | fi 13 | 14 | python -m llava.eval.model_vqa_qbench \ 15 | --model-path liuhaotian/llava-v1.5-13b \ 16 | --image-folder ./playground/data/eval/qbench/images_llvisionqa/ \ 17 | --questions-file ./playground/data/eval/qbench/质衡-问答-$ZH_SPLIT.json \ 18 | --answers-file ./playground/data/eval/qbench/llvisionqa_zh_$1_answers.jsonl \ 19 | --conv-mode llava_v1 \ 20 | --lang zh 21 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/rec.sh: -------------------------------------------------------------------------------- 1 | # #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT=$1 9 | echo $CKPT 10 | SPLIT="llava_ref3_test_2017" 11 | 12 | 13 | for IDX in $(seq 0 $((CHUNKS-1))); do 14 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ 15 | --model-path ./checkpoints_new/$CKPT \ 16 | --question-file ./playground/data/eval/rec/$SPLIT.jsonl \ 17 | --image-folder ./playground/data/coco/train2017 \ 18 | --answers-file ./playground/data/eval/rec/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \ 19 | --num-chunks $CHUNKS \ 20 | --chunk-idx $IDX \ 21 | --temperature 0 \ 22 | --num_beams 3 \ 23 | --conv-mode vicuna_v1 & 24 | done 25 | 26 | wait 27 | 28 | output_file=./playground/data/eval/rec/answers/$SPLIT/$CKPT/merge.jsonl 29 | 30 | # Clear out the output file if it exists. 31 | > "$output_file" 32 | 33 | # Loop through the indices and concatenate each file. 
34 | for IDX in $(seq 0 $((CHUNKS-1))); do 35 | cat ./playground/data/eval/rec/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 36 | done 37 | 38 | python -m llava.eval.eval_rec \ 39 | --annotation-file ./playground/data/eval/rec/llava_ref3_labels.jsonl \ 40 | --question-file ./playground/data/eval/rec/$SPLIT.jsonl \ 41 | --result-file ./playground/data/eval/rec/answers/$SPLIT/$CKPT/merge.jsonl -------------------------------------------------------------------------------- /scripts/v1_5/eval/textvqa.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | # # multiple evaluation 5 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 6 | IFS=',' read -ra GPULIST <<< "$gpu_list" 7 | 8 | CHUNKS=${#GPULIST[@]} 9 | 10 | CKPT=$1 11 | echo $CKPT 12 | SPLIT="llava_textvqa_val_v051_ocr" 13 | 14 | for IDX in $(seq 0 $((CHUNKS-1))); do 15 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ 16 | --model-path ./checkpoints_new/$CKPT \ 17 | --question-file ./playground/data/eval/textvqa/$SPLIT.jsonl \ 18 | --image-folder ./playground/data/eval/textvqa/train_images \ 19 | --answers-file ./playground/data/eval/textvqa/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \ 20 | --num-chunks $CHUNKS \ 21 | --chunk-idx $IDX \ 22 | --temperature 0 \ 23 | --num_beams 3 \ 24 | --conv-mode qwen_1_5 & 25 | done 26 | 27 | wait 28 | 29 | output_file=./playground/data/eval/textvqa/answers/$SPLIT/$CKPT/merge_slice.jsonl 30 | 31 | # Clear out the output file if it exists. 32 | > "$output_file" 33 | 34 | # Loop through the indices and concatenate each file. 35 | for IDX in $(seq 0 $((CHUNKS-1))); do 36 | cat ./playground/data/eval/textvqa/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 37 | done 38 | 39 | python -m llava.eval.eval_textvqa \ 40 | --annotation-file ./playground/data/eval/textvqa/TextVQA_0.5.1_val.json \ 41 | --result-file ./playground/data/eval/textvqa/answers/$SPLIT/$CKPT/merge_slice.jsonl 42 | 43 | # python -m llava.eval.model_vqa_loader \ 44 | # --model-path liuhaotian/llava-v1.5-13b \ 45 | # --question-file ./playground/data/eval/textvqa/llava_textvqa_val_v051_ocr.jsonl \ 46 | # --image-folder ./playground/data/eval/textvqa/train_images \ 47 | # --answers-file ./playground/data/eval/textvqa/answers/llava-v1.5-13b.jsonl \ 48 | # --temperature 0 \ 49 | # --conv-mode vicuna_v1 50 | 51 | # python -m llava.eval.eval_textvqa \ 52 | # --annotation-file ./playground/data/eval/textvqa/TextVQA_0.5.1_val.json \ 53 | # --result-file ./playground/data/eval/textvqa/answers/llava-v1.5-13b.jsonl 54 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/vizwiz.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | 4 | # python -m llava.eval.model_vqa_loader \ 5 | # --model-path liuhaotian/llava-v1.5-13b \ 6 | # --question-file ./playground/data/eval/vizwiz/llava_test.jsonl \ 7 | # --image-folder ./playground/data/eval/vizwiz/test \ 8 | # --answers-file ./playground/data/eval/vizwiz/answers/llava-v1.5-13b.jsonl \ 9 | # --temperature 0 \ 10 | # --conv-mode vicuna_v1 11 | 12 | # # multiple evaluation 13 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 14 | IFS=',' read -ra GPULIST <<< "$gpu_list" 15 | 16 | CHUNKS=${#GPULIST[@]} 17 | 18 | CKPT=$1 19 | echo $CKPT 20 | SPLIT="llava_test" 21 | 22 | for IDX in $(seq 0 $((CHUNKS-1))); do 23 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ 24 | --model-path 
./checkpoints_new/$CKPT \ 25 | --question-file ./playground/data/eval/vizwiz/$SPLIT.jsonl \ 26 | --image-folder ./playground/data/eval/vizwiz/test \ 27 | --answers-file ./playground/data/eval/vizwiz/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \ 28 | --num-chunks $CHUNKS \ 29 | --chunk-idx $IDX \ 30 | --temperature 0 \ 31 | --num_beams 3 \ 32 | --conv-mode vicuna_v1 & 33 | done 34 | 35 | wait 36 | 37 | output_file=./playground/data/eval/vizwiz/answers/$SPLIT/$CKPT/merge_slice.jsonl 38 | 39 | # Clear out the output file if it exists. 40 | > "$output_file" 41 | 42 | # Loop through the indices and concatenate each file. 43 | for IDX in $(seq 0 $((CHUNKS-1))); do 44 | cat ./playground/data/eval/vizwiz/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 45 | done 46 | 47 | 48 | python scripts/convert_vizwiz_for_submission.py \ 49 | --annotation-file ./playground/data/eval/vizwiz/$SPLIT.jsonl \ 50 | --result-file $output_file \ 51 | --result-upload-file ./playground/data/eval/vizwiz/answers_upload/$CKPT.json 52 | -------------------------------------------------------------------------------- /scripts/v1_5/eval/vqav2.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | gpu_list="${CUDA_VISIBLE_DEVICES:-0}" 4 | IFS=',' read -ra GPULIST <<< "$gpu_list" 5 | 6 | CHUNKS=${#GPULIST[@]} 7 | 8 | CKPT=$1 9 | echo $CKPT 10 | SPLIT="llava_vqav2_mscoco_test-dev2015" 11 | 12 | for IDX in $(seq 0 $((CHUNKS-1))); do 13 | CUDA_VISIBLE_DEVICES=${GPULIST[$IDX]} python -m llava.eval.model_vqa_loader \ 14 | --model-path ./checkpoints_new/$CKPT \ 15 | --question-file ./playground/data/eval/vqav2/$SPLIT.jsonl \ 16 | --image-folder ./playground/data/eval/vqav2/test2015 \ 17 | --answers-file ./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl \ 18 | --num-chunks $CHUNKS \ 19 | --chunk-idx $IDX \ 20 | --temperature 0 \ 21 | --num_beams 3 \ 22 | --conv-mode vicuna_v1 & 23 | done 24 | 25 | wait 26 | 27 | output_file=./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/merge.jsonl 28 | 29 | # Clear out the output file if it exists. 30 | > "$output_file" 31 | 32 | # Loop through the indices and concatenate each file. 33 | for IDX in $(seq 0 $((CHUNKS-1))); do 34 | cat ./playground/data/eval/vqav2/answers/$SPLIT/$CKPT/${CHUNKS}_${IDX}.jsonl >> "$output_file" 35 | done 36 | 37 | python scripts/convert_vqav2_for_submission.py --split $SPLIT --ckpt $CKPT 38 | 39 | -------------------------------------------------------------------------------- /scripts/video/demo/video_demo.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ROOT_DIR="/mnt/bn/vl-research/workspace/yhzhang/LLaVA-NeXT" 3 | 4 | if [ ! -e $ROOT_DIR ]; then 5 | echo "The root dir does not exist. Exiting the script." 
6 | exit 1 7 | fi 8 | 9 | cd $ROOT_DIR 10 | 11 | export PYTHONWARNINGS=ignore 12 | export TOKENIZERS_PARALLELISM=false 13 | 14 | CKPT=$1 15 | CONV_MODE=$2 16 | FRAMES=$3 17 | POOL_STRIDE=$4 18 | POOL_MODE=$5 19 | NEWLINE_POSITION=$6 20 | OVERWRITE=$7 21 | VIDEO_PATH=$8 22 | 23 | 24 | if [ "$OVERWRITE" = False ]; then 25 | SAVE_DIR=$(basename $CKPT)_${CONV_MODE}_frames_${FRAMES}_stride_${POOL_STRIDE}_overwrite_${OVERWRITE} 26 | 27 | else 28 | SAVE_DIR=$(basename $CKPT)_${CONV_MODE}_frames_${FRAMES}_stride_${POOL_STRIDE} 29 | fi 30 | 31 | python3 playground/demo/video_demo.py \ 32 | --model-path $CKPT \ 33 | --video_path ${VIDEO_PATH} \ 34 | --output_dir ./work_dirs/video_demo/$SAVE_DIR \ 35 | --output_name pred \ 36 | --chunk-idx $(($IDX - 1)) \ 37 | --overwrite ${OVERWRITE} \ 38 | --mm_spatial_pool_stride ${POOL_STRIDE:-4} \ 39 | --for_get_frames_num $FRAMES \ 40 | --conv-mode $CONV_MODE \ 41 | --mm_spatial_pool_mode ${POOL_MODE:-average} \ 42 | --mm_newline_position ${NEWLINE_POSITION:-grid} \ 43 | --prompt "Please provide a detailed description of the video, focusing on the main subjects, their actions, the background scenes." -------------------------------------------------------------------------------- /scripts/video/eval/video_detail_description_eval_only.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | ROOT_DIR="root to LLaVA-NeXT-Video" 3 | 4 | if [ ! -e $ROOT_DIR ]; then 5 | echo "The root dir does not exist. Exiting the script." 6 | exit 1 7 | fi 8 | 9 | cd $ROOT_DIR 10 | 11 | export PYTHONWARNINGS=ignore 12 | export TOKENIZERS_PARALLELISM=false 13 | 14 | OPENAIKEY="INPUT YOUR OPENAI API" 15 | 16 | SAVE_DIR=$1 17 | 18 | python3 llava/eval/evaluate_benchmark_video_detail_description.py \ 19 | --pred_path ./work_dirs/eval_video_detail_description/$SAVE_DIR/pred.json \ 20 | --output_dir ./work_dirs/eval_video_detail_description/$SAVE_DIR/detail_results \ 21 | --output_json ./work_dirs/eval_video_detail_description/$SAVE_DIR/detail_results.json \ 22 | --num_chunks 1 \ 23 | --num_tasks 16 \ 24 | --api_key $OPENAIKEY \ -------------------------------------------------------------------------------- /scripts/zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | "zero_optimization": { 23 | "stage": 2, 24 | "offload_optimizer": { 25 | "device": "none", 26 | "pin_memory": true 27 | }, 28 | "allgather_partitions": true, 29 | "allgather_bucket_size": 2e8, 30 | "overlap_comm": false, 31 | "reduce_scatter": true, 32 | "reduce_bucket_size": 2e8, 33 | "contiguous_gradients": true 34 | }, 35 | "gradient_accumulation_steps": "auto", 36 | "gradient_clipping": "auto", 37 | "steps_per_print": 100, 38 | "train_batch_size": "auto", 39 | "train_micro_batch_size_per_gpu": "auto", 40 | "wall_clock_breakdown": false 41 | } -------------------------------------------------------------------------------- /scripts/zero2_fused_adamw.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | 
"initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | "zero_optimization": { 23 | "stage": 2, 24 | "offload_optimizer": { 25 | "device": "none", 26 | "pin_memory": true 27 | }, 28 | "allgather_partitions": true, 29 | "allgather_bucket_size": 2e8, 30 | "overlap_comm": true, 31 | "reduce_scatter": true, 32 | "reduce_bucket_size": 2e8, 33 | "contiguous_gradients": true 34 | }, 35 | "gradient_accumulation_steps": "auto", 36 | "gradient_clipping": "auto", 37 | "steps_per_print": 100, 38 | "train_batch_size": "auto", 39 | "train_micro_batch_size_per_gpu": "auto", 40 | "wall_clock_breakdown": false 41 | } -------------------------------------------------------------------------------- /scripts/zero2_offload.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 2, 18 | "offload_optimizer": { 19 | "device": "cpu", 20 | "pin_memory": true 21 | }, 22 | "offload_param": { 23 | "device": "cpu", 24 | "pin_memory": true 25 | }, 26 | "overlap_comm": true, 27 | "contiguous_gradients": true, 28 | "sub_group_size": 1e9, 29 | "reduce_bucket_size": "auto" 30 | } 31 | } -------------------------------------------------------------------------------- /scripts/zero2_old.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 2, 18 | "overlap_comm": false, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto" 22 | } 23 | } -------------------------------------------------------------------------------- /scripts/zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | 14 | "zero_optimization": { 15 | "stage": 3, 16 | "offload_optimizer": { 17 | "device": "none", 18 | "pin_memory": true 19 | }, 20 | "offload_param": { 21 | "device": "none", 22 | "pin_memory": true 23 | }, 24 | "overlap_comm": true, 25 | "contiguous_gradients": true, 26 | "sub_group_size": 1e9, 27 | "reduce_bucket_size": "auto", 28 | "stage3_prefetch_bucket_size": "auto", 29 | "stage3_param_persistence_threshold": "auto", 30 | "stage3_max_live_parameters": 1e9, 31 | "stage3_max_reuse_distance": 1e9, 32 | "stage3_gather_16bit_weights_on_model_save": true 33 | }, 34 | 35 | "gradient_accumulation_steps": "auto", 36 | "gradient_clipping": "auto", 37 | "steps_per_print": 100, 38 | 
"train_batch_size": "auto", 39 | "train_micro_batch_size_per_gpu": "auto", 40 | "wall_clock_breakdown": false 41 | } -------------------------------------------------------------------------------- /scripts/zero3_offload.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | "zero_optimization": { 23 | "stage": 3, 24 | "offload_optimizer": { 25 | "device": "cpu", 26 | "pin_memory": true 27 | }, 28 | "offload_param": { 29 | "device": "cpu", 30 | "pin_memory": true 31 | }, 32 | "overlap_comm": true, 33 | "contiguous_gradients": true, 34 | "sub_group_size": 1e9, 35 | "reduce_bucket_size": "auto", 36 | "stage3_prefetch_bucket_size": "auto", 37 | "stage3_param_persistence_threshold": "auto", 38 | "stage3_max_live_parameters": 1e9, 39 | "stage3_max_reuse_distance": 1e9, 40 | "gather_16bit_weights_on_model_save": true 41 | }, 42 | "gradient_accumulation_steps": "auto", 43 | "gradient_clipping": "auto", 44 | "train_batch_size": "auto", 45 | "train_micro_batch_size_per_gpu": "auto", 46 | "steps_per_print": 1e5, 47 | "wall_clock_breakdown": false 48 | } -------------------------------------------------------------------------------- /scripts/zero3pp.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | 23 | "zero_optimization": { 24 | "stage": 3, 25 | "offload_optimizer": { 26 | "device": "none", 27 | "pin_memory": true 28 | }, 29 | "offload_param": { 30 | "device": "none", 31 | "pin_memory": true 32 | }, 33 | "overlap_comm": true, 34 | "contiguous_gradients": true, 35 | "zero_quantized_weights": true, 36 | "zero_hpz_partition_size": 16, 37 | "zero_quantized_gradients": true, 38 | "sub_group_size": 1e9, 39 | "reduce_bucket_size": "auto", 40 | "stage3_prefetch_bucket_size": "auto", 41 | "stage3_param_persistence_threshold": "auto", 42 | "stage3_max_live_parameters": 1e9, 43 | "stage3_max_reuse_distance": 1e9, 44 | "stage3_gather_16bit_weights_on_model_save": true 45 | }, 46 | 47 | "gradient_accumulation_steps": "auto", 48 | "gradient_clipping": "auto", 49 | "steps_per_print": 100, 50 | "train_batch_size": "auto", 51 | "train_micro_batch_size_per_gpu": "auto", 52 | "wall_clock_breakdown": false 53 | } -------------------------------------------------------------------------------- /trl/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | __version__ = "0.7.11.dev0" 4 | 5 | from .core import set_seed 6 | from .environment import TextEnvironment, TextHistory 7 | from .extras import BestOfNSampler 8 | from .import_utils import ( 9 | is_bitsandbytes_available, 10 | is_diffusers_available, 11 | is_npu_available, 12 | is_peft_available, 13 | is_wandb_available, 14 | is_xpu_available, 15 | ) 16 | from .models 
import ( 17 | AutoModelForCausalLMWithValueHead, 18 | AutoModelForSeq2SeqLMWithValueHead, 19 | PreTrainedModelWrapper, 20 | create_reference_model, 21 | setup_chat_format, 22 | ) 23 | from .trainer import ( 24 | DataCollatorForCompletionOnlyLM, 25 | DPOTrainer, 26 | IterativeSFTTrainer, 27 | ModelConfig, 28 | PPOConfig, 29 | PPOTrainer, 30 | RewardConfig, 31 | RewardTrainer, 32 | SFTTrainer, 33 | ) 34 | from .trainer.utils import get_kbit_device_map, get_peft_config, get_quantization_config 35 | 36 | 37 | if is_diffusers_available(): 38 | from .models import ( 39 | DDPOPipelineOutput, 40 | DDPOSchedulerOutput, 41 | DDPOStableDiffusionPipeline, 42 | DefaultDDPOStableDiffusionPipeline, 43 | ) 44 | from .trainer import DDPOConfig, DDPOTrainer 45 | -------------------------------------------------------------------------------- /trl/environment/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | from .base_environment import TextEnvironment, TextHistory 4 | -------------------------------------------------------------------------------- /trl/extras/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | # Copyright 2022 The HuggingFace Team. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | from .best_of_n_sampler import BestOfNSampler 17 | -------------------------------------------------------------------------------- /trl/models/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | # Copyright 2022 The HuggingFace Team. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 
16 | from .modeling_base import PreTrainedModelWrapper, create_reference_model 17 | from .modeling_value_head import AutoModelForCausalLMWithValueHead, AutoModelForSeq2SeqLMWithValueHead 18 | from .utils import setup_chat_format 19 | 20 | 21 | SUPPORTED_ARCHITECTURES = ( 22 | AutoModelForCausalLMWithValueHead, 23 | AutoModelForSeq2SeqLMWithValueHead, 24 | ) 25 | 26 | from ..import_utils import is_diffusers_available 27 | 28 | 29 | if is_diffusers_available(): 30 | from .modeling_sd_base import ( 31 | DDPOPipelineOutput, 32 | DDPOSchedulerOutput, 33 | DDPOStableDiffusionPipeline, 34 | DefaultDDPOStableDiffusionPipeline, 35 | ) 36 | -------------------------------------------------------------------------------- /trl/trainer/__init__.py: -------------------------------------------------------------------------------- 1 | # flake8: noqa 2 | 3 | # Copyright 2022 The HuggingFace Team. All rights reserved. 4 | # 5 | # Licensed under the Apache License, Version 2.0 (the "License"); 6 | # you may not use this file except in compliance with the License. 7 | # You may obtain a copy of the License at 8 | # 9 | # http://www.apache.org/licenses/LICENSE-2.0 10 | # 11 | # Unless required by applicable law or agreed to in writing, software 12 | # distributed under the License is distributed on an "AS IS" BASIS, 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 14 | # See the License for the specific language governing permissions and 15 | # limitations under the License. 16 | 17 | # There is a circular import in the PPOTrainer if we let isort sort these 18 | # isort: off 19 | from .utils import ( 20 | AdaptiveKLController, 21 | FixedKLController, 22 | ConstantLengthDataset, 23 | DataCollatorForCompletionOnlyLM, 24 | RunningMoments, 25 | disable_dropout_in_model, 26 | peft_module_casting_to_bf16, 27 | ) 28 | 29 | # isort: on 30 | 31 | from ..import_utils import is_diffusers_available 32 | from .base import BaseTrainer 33 | from .ddpo_config import DDPOConfig 34 | 35 | 36 | if is_diffusers_available(): 37 | from .ddpo_trainer import DDPOTrainer 38 | 39 | from .dpo_trainer import DPOTrainer 40 | from .iterative_sft_trainer import IterativeSFTTrainer 41 | from .model_config import ModelConfig 42 | from .ppo_config import PPOConfig 43 | from .ppo_trainer import PPOTrainer 44 | from .reward_config import RewardConfig 45 | from .reward_trainer import RewardTrainer, compute_accuracy 46 | from .sft_trainer import SFTTrainer 47 | -------------------------------------------------------------------------------- /trl/trainer/base.py: -------------------------------------------------------------------------------- 1 | # Copyright 2022 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | from huggingface_hub import PyTorchModelHubMixin 16 | 17 | 18 | class BaseTrainer(PyTorchModelHubMixin): 19 | r""" 20 | Base class for all trainers - this base class implements the basic functions that we 21 | need for a trainer. 22 | 23 | The trainer needs to have the following functions: 24 | - step: takes in a batch of data and performs a step of training 25 | - loss: takes in a batch of data and returns the loss 26 | - compute_rewards: takes in a batch of data and returns the rewards 27 | - _build_models_and_tokenizer: builds the models and tokenizer 28 | - _build_dataset: builds the dataset 29 | Each user is expected to implement their own trainer class that inherits from this base 30 | if they want to use a new training algorithm. 31 | """ 32 | 33 | def __init__(self, config): 34 | self.config = config 35 | 36 | def step(self, *args): 37 | raise NotImplementedError("Not implemented") 38 | 39 | def loss(self, *args): 40 | raise NotImplementedError("Not implemented") 41 | 42 | def compute_rewards(self, *args): 43 | raise NotImplementedError("Not implemented") 44 | 45 | def _save_pretrained(self, save_directory): 46 | raise NotImplementedError("Not implemented") 47 | -------------------------------------------------------------------------------- /trl/trainer/reward_config.py: -------------------------------------------------------------------------------- 1 | # Copyright 2024 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from dataclasses import dataclass 16 | from typing import Optional 17 | 18 | from transformers import TrainingArguments 19 | 20 | 21 | @dataclass 22 | class RewardConfig(TrainingArguments): 23 | """ 24 | RewardConfig collects all training arguments related to the [`RewardTrainer`] class. 25 | 26 | Using [`HfArgumentParser`] we can turn this class into 27 | [argparse](https://docs.python.org/3/library/argparse#module-argparse) arguments that can be specified on the 28 | command line. 29 | 30 | Parameters: 31 | max_length (`int`, *optional*, defaults to `None`): 32 | The maximum length of the sequences in the batch. This argument is required if you want to use the default data collator. 33 | gradient_checkpointing (`bool`, *optional*, defaults to `True`): 34 | If True, use gradient checkpointing to save memory at the expense of slower backward pass. 35 | """ 36 | 37 | max_length: Optional[int] = None 38 | """The maximum length of the sequences in the batch. This argument is required if you want to use the default data collator.""" 39 | -------------------------------------------------------------------------------- /vdim-pretrain.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | python featup/train_vdim_upsampler.py --------------------------------------------------------------------------------
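The BaseTrainer docstring in trl/trainer/base.py above spells out the hooks a concrete trainer is expected to provide (step, loss, compute_rewards, plus a _save_pretrained override). The sketch below shows what a minimal subclass might look like; the ToyTrainerConfig dataclass and the toy loss/reward logic are illustrative assumptions, not part of this repository.

from dataclasses import dataclass

import torch

from trl.trainer import BaseTrainer  # re-exported via trl/trainer/__init__.py


@dataclass
class ToyTrainerConfig:
    # Hypothetical config object; BaseTrainer only stores it on self.config.
    learning_rate: float = 1e-5


class ToyTrainer(BaseTrainer):
    """Fills in the hooks that BaseTrainer leaves unimplemented."""

    def step(self, queries, responses, rewards):
        # A real trainer would run an optimisation step here; we only log.
        return {"mean_reward": torch.stack(rewards).mean().item()}

    def loss(self, logits, labels):
        # Plain token-level cross-entropy as a stand-in loss.
        return torch.nn.functional.cross_entropy(
            logits.view(-1, logits.size(-1)), labels.view(-1)
        )

    def compute_rewards(self, scores):
        # Identity mapping; PPO-style trainers would add KL penalties here.
        return scores


trainer = ToyTrainer(ToyTrainerConfig())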
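Similarly, the RewardConfig docstring in trl/trainer/reward_config.py notes that HfArgumentParser can expose its fields as command-line flags. A small sketch of that pattern follows; the output directory and max_length values are only examples.

from transformers import HfArgumentParser

from trl import RewardConfig  # exported from trl/__init__.py

parser = HfArgumentParser(RewardConfig)
# Parse from an explicit list here; omit `args` to read sys.argv instead.
(reward_config,) = parser.parse_args_into_dataclasses(
    args=["--output_dir", "reward_model_out", "--max_length", "512"]
)
print(reward_config.max_length)  # 512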