├── .gitignore ├── LICENSE ├── README.md ├── assets ├── figure_acchitecture.png ├── logo.png ├── vis_1.jpg ├── vis_1.png ├── vis_2.jpg ├── vis_2.png ├── vis_3.jpg └── vis_3.png ├── llava ├── __init__.py ├── constants.py ├── conversation.py ├── eval │ ├── evaluate_interleave.py │ └── model_vqa.py ├── mm_utils.py ├── model │ ├── __init__.py │ ├── apply_delta.py │ ├── builder.py │ ├── consolidate.py │ ├── language_model │ │ ├── llava_llama.py │ │ ├── llava_qwen.py │ │ └── modeling_llama.py │ ├── llava_arch.py │ ├── make_delta.py │ ├── multimodal_encoder │ │ ├── builder.py │ │ ├── clip_encoder.py │ │ ├── dev_eva_clip │ │ │ ├── eva_clip │ │ │ │ ├── __init__.py │ │ │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ │ │ ├── constants.py │ │ │ │ ├── eva_vit_model.py │ │ │ │ ├── factory.py │ │ │ │ ├── hf_configs.py │ │ │ │ ├── hf_model.py │ │ │ │ ├── loss.py │ │ │ │ ├── model.py │ │ │ │ ├── model_configs │ │ │ │ │ ├── EVA-CLIP-18B.json │ │ │ │ │ ├── EVA-CLIP-8B-plus.json │ │ │ │ │ ├── EVA-CLIP-8B.json │ │ │ │ │ ├── EVA01-CLIP-B-16.json │ │ │ │ │ ├── EVA01-CLIP-g-14-plus.json │ │ │ │ │ ├── EVA01-CLIP-g-14.json │ │ │ │ │ ├── EVA02-CLIP-B-16.json │ │ │ │ │ ├── EVA02-CLIP-L-14-336.json │ │ │ │ │ ├── EVA02-CLIP-L-14.json │ │ │ │ │ ├── EVA02-CLIP-bigE-14-plus.json │ │ │ │ │ ├── EVA02-CLIP-bigE-14.json │ │ │ │ │ ├── Internal-EVA02-CLIP-10B-14-448.json │ │ │ │ │ └── Internal-EVA02-CLIP-10B-14.json │ │ │ │ ├── modified_resnet.py │ │ │ │ ├── openai.py │ │ │ │ ├── pretrained.py │ │ │ │ ├── rope.py │ │ │ │ ├── timm_model.py │ │ │ │ ├── tokenizer.py │ │ │ │ ├── transform.py │ │ │ │ ├── transformer.py │ │ │ │ └── utils.py │ │ │ └── eva_vit.py │ │ ├── eva_clip │ │ │ ├── eva_clip_encoder.py │ │ │ ├── eva_clip_processors.py │ │ │ ├── eva_vit.py │ │ │ ├── factory.py │ │ │ └── model_configs │ │ │ │ ├── EVA-CLIP-18B.json │ │ │ │ ├── EVA-CLIP-8B-plus.json │ │ │ │ ├── EVA-CLIP-8B.json │ │ │ │ ├── EVA01-CLIP-B-16.json │ │ │ │ ├── EVA01-CLIP-g-14-plus.json │ │ │ │ ├── EVA01-CLIP-g-14.json │ │ │ │ ├── EVA02-CLIP-B-16.json │ │ │ │ ├── EVA02-CLIP-L-14-336.json │ │ │ │ ├── EVA02-CLIP-L-14.json │ │ │ │ ├── EVA02-CLIP-bigE-14-plus.json │ │ │ │ ├── EVA02-CLIP-bigE-14.json │ │ │ │ ├── Internal-EVA02-CLIP-10B-14-448.json │ │ │ │ └── Internal-EVA02-CLIP-10B-14.json │ │ ├── hf_vision.py │ │ ├── imagebind.py │ │ ├── mlcd │ │ │ └── vit_rope2d_hf.py │ │ ├── mlcd_encoder.py │ │ ├── open_clip_encoder.py │ │ ├── qwen2_5_encoder.py │ │ └── siglip_encoder.py │ ├── multimodal_projector │ │ ├── builder.py │ │ └── pooler_projector.py │ ├── multimodal_resampler │ │ ├── builder.py │ │ ├── masked_drop.py │ │ ├── perceiver.py │ │ ├── qformer.py │ │ └── spatial_pool.py │ └── utils.py ├── serve │ ├── __init__.py │ ├── cli.py │ ├── controller.py │ ├── examples │ │ ├── extreme_ironing.jpg │ │ └── waterview.jpg │ ├── gradio_multi_image.py │ ├── gradio_web_server.py │ ├── model_worker.py │ ├── register_worker.py │ ├── sglang_worker.py │ └── test_message.py ├── train │ ├── llama_flash_attn_monkey_patch.py │ ├── llava_trainer.py │ ├── llava_trainer_eval.py │ ├── train.py │ ├── train_dpo.py │ └── train_mem.py └── utils.py ├── pyproject.toml └── scripts ├── archived ├── convert_gqa_for_eval.py ├── convert_mmvet_for_eval.py ├── convert_sqa_to_llava.py ├── convert_sqa_to_llava_base_prompt.py ├── convert_vizwiz_for_submission.py ├── convert_vqav2_for_submission.py ├── data_info.py ├── dpo_data_info.py ├── entry_cmd.sh ├── finetune.sh ├── finetune_1.5.sh ├── finetune_full_schedule.sh ├── finetune_lora.sh ├── finetune_mixtral.sh ├── finetune_mixtral_1.5.sh ├── finetune_mixtral_1.6_336px_anyres.sh ├── finetune_mixtral_1.6_336px_anyres_freeze_vision.sh ├── finetune_mixtral_1.6_336px_anyres_lmms_eval.sh ├── finetune_mixtral_copy.sh ├── finetune_qlora.sh ├── finetune_sqa.sh ├── merge_lora_weights.py ├── pretrain.sh ├── quick_check.py ├── sqa_eval_batch.sh └── sqa_eval_gather.sh ├── interleave ├── eval_all.sh ├── eval_interleave_3d.sh └── eval_multiprocess.sh ├── qwen.py ├── summarize_data.py ├── train ├── README.md ├── stage1.5_caption_clip_qwen_victor.sh ├── stage1.5_caption_siglip_qwen_victor.sh ├── stage1_pretrain_clip.sh ├── stage1_pretrain_siglip.sh ├── stage2_finetune_clip_qwen.sh └── stage2_finetune_siglip_qwen.sh ├── video ├── demo │ └── video_demo.sh ├── eval │ ├── activitynet_eval.sh │ ├── video_chatgpt_benchmark_eval_shard.sh │ ├── video_description_from_t2v.sh │ ├── video_detail_description_eval_only.sh │ └── video_detail_description_eval_shard.sh └── train │ ├── SO400M_Qwen2_72B_ov_to_video_am9.sh │ ├── SO400M_Qwen2_7B_ov_to_video_am9.sh │ └── exp.yaml ├── zero2.json ├── zero2_fused_adamw.json ├── zero2_offload.json ├── zero3.json ├── zero3_offload.json └── zero3pp.json /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/.gitignore -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/README.md -------------------------------------------------------------------------------- /assets/figure_acchitecture.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/assets/figure_acchitecture.png -------------------------------------------------------------------------------- /assets/logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/assets/logo.png -------------------------------------------------------------------------------- /assets/vis_1.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/assets/vis_1.jpg -------------------------------------------------------------------------------- /assets/vis_1.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/assets/vis_1.png -------------------------------------------------------------------------------- /assets/vis_2.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/assets/vis_2.jpg -------------------------------------------------------------------------------- /assets/vis_2.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/assets/vis_2.png -------------------------------------------------------------------------------- /assets/vis_3.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/assets/vis_3.jpg -------------------------------------------------------------------------------- /assets/vis_3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/assets/vis_3.png -------------------------------------------------------------------------------- /llava/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import LlavaLlamaForCausalLM 2 | -------------------------------------------------------------------------------- /llava/constants.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/constants.py -------------------------------------------------------------------------------- /llava/conversation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/conversation.py -------------------------------------------------------------------------------- /llava/eval/evaluate_interleave.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/eval/evaluate_interleave.py -------------------------------------------------------------------------------- /llava/eval/model_vqa.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/eval/model_vqa.py -------------------------------------------------------------------------------- /llava/mm_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/mm_utils.py -------------------------------------------------------------------------------- /llava/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/__init__.py -------------------------------------------------------------------------------- /llava/model/apply_delta.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/apply_delta.py -------------------------------------------------------------------------------- /llava/model/builder.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/builder.py -------------------------------------------------------------------------------- /llava/model/consolidate.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/consolidate.py -------------------------------------------------------------------------------- /llava/model/language_model/llava_llama.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/language_model/llava_llama.py -------------------------------------------------------------------------------- /llava/model/language_model/llava_qwen.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/language_model/llava_qwen.py -------------------------------------------------------------------------------- /llava/model/language_model/modeling_llama.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/language_model/modeling_llama.py -------------------------------------------------------------------------------- /llava/model/llava_arch.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/llava_arch.py -------------------------------------------------------------------------------- /llava/model/make_delta.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/make_delta.py -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/builder.py -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/clip_encoder.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/clip_encoder.py -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/__init__.py -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/constants.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/constants.py -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/eva_vit_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/eva_vit_model.py -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/factory.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/factory.py -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/hf_configs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/hf_configs.py -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/hf_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/hf_model.py -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/loss.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/loss.py -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model.py -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA-CLIP-18B.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA-CLIP-18B.json -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA-CLIP-8B-plus.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA-CLIP-8B-plus.json -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA-CLIP-8B.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA-CLIP-8B.json -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA01-CLIP-B-16.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA01-CLIP-B-16.json -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA01-CLIP-g-14-plus.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA01-CLIP-g-14-plus.json -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA01-CLIP-g-14.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA01-CLIP-g-14.json -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA02-CLIP-B-16.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA02-CLIP-B-16.json -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA02-CLIP-L-14-336.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA02-CLIP-L-14-336.json -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA02-CLIP-L-14.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA02-CLIP-L-14.json -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA02-CLIP-bigE-14-plus.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA02-CLIP-bigE-14-plus.json -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA02-CLIP-bigE-14.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/EVA02-CLIP-bigE-14.json -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/Internal-EVA02-CLIP-10B-14-448.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/Internal-EVA02-CLIP-10B-14-448.json -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/Internal-EVA02-CLIP-10B-14.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/model_configs/Internal-EVA02-CLIP-10B-14.json -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/modified_resnet.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/modified_resnet.py -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/openai.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/openai.py -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/pretrained.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/pretrained.py -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/rope.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/rope.py -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/timm_model.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/timm_model.py -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/tokenizer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/tokenizer.py -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/transform.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/transform.py -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/transformer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/transformer.py -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_clip/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/dev_eva_clip/eva_clip/utils.py -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/dev_eva_clip/eva_vit.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/dev_eva_clip/eva_vit.py -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/eva_clip_encoder.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/eva_clip/eva_clip_encoder.py -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/eva_clip_processors.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/eva_clip/eva_clip_processors.py -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/eva_vit.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/eva_clip/eva_vit.py -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/factory.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/eva_clip/factory.py -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/model_configs/EVA-CLIP-18B.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/eva_clip/model_configs/EVA-CLIP-18B.json -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/model_configs/EVA-CLIP-8B-plus.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/eva_clip/model_configs/EVA-CLIP-8B-plus.json -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/model_configs/EVA-CLIP-8B.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/eva_clip/model_configs/EVA-CLIP-8B.json -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/model_configs/EVA01-CLIP-B-16.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/eva_clip/model_configs/EVA01-CLIP-B-16.json -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/model_configs/EVA01-CLIP-g-14-plus.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/eva_clip/model_configs/EVA01-CLIP-g-14-plus.json -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/model_configs/EVA01-CLIP-g-14.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/eva_clip/model_configs/EVA01-CLIP-g-14.json -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/model_configs/EVA02-CLIP-B-16.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/eva_clip/model_configs/EVA02-CLIP-B-16.json -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/model_configs/EVA02-CLIP-L-14-336.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/eva_clip/model_configs/EVA02-CLIP-L-14-336.json -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/model_configs/EVA02-CLIP-L-14.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/eva_clip/model_configs/EVA02-CLIP-L-14.json -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/model_configs/EVA02-CLIP-bigE-14-plus.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/eva_clip/model_configs/EVA02-CLIP-bigE-14-plus.json -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/model_configs/EVA02-CLIP-bigE-14.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/eva_clip/model_configs/EVA02-CLIP-bigE-14.json -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/model_configs/Internal-EVA02-CLIP-10B-14-448.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/eva_clip/model_configs/Internal-EVA02-CLIP-10B-14-448.json -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/eva_clip/model_configs/Internal-EVA02-CLIP-10B-14.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/eva_clip/model_configs/Internal-EVA02-CLIP-10B-14.json -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/hf_vision.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/hf_vision.py -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/imagebind.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/imagebind.py -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/mlcd/vit_rope2d_hf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/mlcd/vit_rope2d_hf.py -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/mlcd_encoder.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/mlcd_encoder.py -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/open_clip_encoder.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/open_clip_encoder.py -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/qwen2_5_encoder.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/qwen2_5_encoder.py -------------------------------------------------------------------------------- /llava/model/multimodal_encoder/siglip_encoder.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_encoder/siglip_encoder.py -------------------------------------------------------------------------------- /llava/model/multimodal_projector/builder.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_projector/builder.py -------------------------------------------------------------------------------- /llava/model/multimodal_projector/pooler_projector.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_projector/pooler_projector.py -------------------------------------------------------------------------------- /llava/model/multimodal_resampler/builder.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_resampler/builder.py -------------------------------------------------------------------------------- /llava/model/multimodal_resampler/masked_drop.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_resampler/masked_drop.py -------------------------------------------------------------------------------- /llava/model/multimodal_resampler/perceiver.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_resampler/perceiver.py -------------------------------------------------------------------------------- /llava/model/multimodal_resampler/qformer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_resampler/qformer.py -------------------------------------------------------------------------------- /llava/model/multimodal_resampler/spatial_pool.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/multimodal_resampler/spatial_pool.py -------------------------------------------------------------------------------- /llava/model/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/model/utils.py -------------------------------------------------------------------------------- /llava/serve/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /llava/serve/cli.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/serve/cli.py -------------------------------------------------------------------------------- /llava/serve/controller.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/serve/controller.py -------------------------------------------------------------------------------- /llava/serve/examples/extreme_ironing.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/serve/examples/extreme_ironing.jpg -------------------------------------------------------------------------------- /llava/serve/examples/waterview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/serve/examples/waterview.jpg -------------------------------------------------------------------------------- /llava/serve/gradio_multi_image.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/serve/gradio_multi_image.py -------------------------------------------------------------------------------- /llava/serve/gradio_web_server.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/serve/gradio_web_server.py -------------------------------------------------------------------------------- /llava/serve/model_worker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/serve/model_worker.py -------------------------------------------------------------------------------- /llava/serve/register_worker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/serve/register_worker.py -------------------------------------------------------------------------------- /llava/serve/sglang_worker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/serve/sglang_worker.py -------------------------------------------------------------------------------- /llava/serve/test_message.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/serve/test_message.py -------------------------------------------------------------------------------- /llava/train/llama_flash_attn_monkey_patch.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/train/llama_flash_attn_monkey_patch.py -------------------------------------------------------------------------------- /llava/train/llava_trainer.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/train/llava_trainer.py -------------------------------------------------------------------------------- /llava/train/llava_trainer_eval.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/train/llava_trainer_eval.py -------------------------------------------------------------------------------- /llava/train/train.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/train/train.py -------------------------------------------------------------------------------- /llava/train/train_dpo.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/train/train_dpo.py -------------------------------------------------------------------------------- /llava/train/train_mem.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/train/train_mem.py -------------------------------------------------------------------------------- /llava/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/llava/utils.py -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/pyproject.toml -------------------------------------------------------------------------------- /scripts/archived/convert_gqa_for_eval.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/scripts/archived/convert_gqa_for_eval.py -------------------------------------------------------------------------------- /scripts/archived/convert_mmvet_for_eval.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/scripts/archived/convert_mmvet_for_eval.py -------------------------------------------------------------------------------- /scripts/archived/convert_sqa_to_llava.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/scripts/archived/convert_sqa_to_llava.py -------------------------------------------------------------------------------- /scripts/archived/convert_sqa_to_llava_base_prompt.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/scripts/archived/convert_sqa_to_llava_base_prompt.py -------------------------------------------------------------------------------- /scripts/archived/convert_vizwiz_for_submission.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/scripts/archived/convert_vizwiz_for_submission.py -------------------------------------------------------------------------------- /scripts/archived/convert_vqav2_for_submission.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/scripts/archived/convert_vqav2_for_submission.py -------------------------------------------------------------------------------- /scripts/archived/data_info.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/scripts/archived/data_info.py -------------------------------------------------------------------------------- /scripts/archived/dpo_data_info.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/scripts/archived/dpo_data_info.py -------------------------------------------------------------------------------- /scripts/archived/entry_cmd.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/scripts/archived/entry_cmd.sh -------------------------------------------------------------------------------- /scripts/archived/finetune.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/scripts/archived/finetune.sh -------------------------------------------------------------------------------- /scripts/archived/finetune_1.5.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/scripts/archived/finetune_1.5.sh -------------------------------------------------------------------------------- /scripts/archived/finetune_full_schedule.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/scripts/archived/finetune_full_schedule.sh -------------------------------------------------------------------------------- /scripts/archived/finetune_lora.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/scripts/archived/finetune_lora.sh -------------------------------------------------------------------------------- /scripts/archived/finetune_mixtral.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/scripts/archived/finetune_mixtral.sh -------------------------------------------------------------------------------- /scripts/archived/finetune_mixtral_1.5.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/scripts/archived/finetune_mixtral_1.5.sh -------------------------------------------------------------------------------- /scripts/archived/finetune_mixtral_1.6_336px_anyres.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/scripts/archived/finetune_mixtral_1.6_336px_anyres.sh -------------------------------------------------------------------------------- /scripts/archived/finetune_mixtral_1.6_336px_anyres_freeze_vision.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/scripts/archived/finetune_mixtral_1.6_336px_anyres_freeze_vision.sh -------------------------------------------------------------------------------- /scripts/archived/finetune_mixtral_1.6_336px_anyres_lmms_eval.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/scripts/archived/finetune_mixtral_1.6_336px_anyres_lmms_eval.sh -------------------------------------------------------------------------------- /scripts/archived/finetune_mixtral_copy.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/scripts/archived/finetune_mixtral_copy.sh -------------------------------------------------------------------------------- /scripts/archived/finetune_qlora.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/scripts/archived/finetune_qlora.sh -------------------------------------------------------------------------------- /scripts/archived/finetune_sqa.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/scripts/archived/finetune_sqa.sh -------------------------------------------------------------------------------- /scripts/archived/merge_lora_weights.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/scripts/archived/merge_lora_weights.py -------------------------------------------------------------------------------- /scripts/archived/pretrain.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/scripts/archived/pretrain.sh -------------------------------------------------------------------------------- /scripts/archived/quick_check.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/scripts/archived/quick_check.py -------------------------------------------------------------------------------- /scripts/archived/sqa_eval_batch.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/scripts/archived/sqa_eval_batch.sh -------------------------------------------------------------------------------- /scripts/archived/sqa_eval_gather.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/scripts/archived/sqa_eval_gather.sh -------------------------------------------------------------------------------- /scripts/interleave/eval_all.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/scripts/interleave/eval_all.sh -------------------------------------------------------------------------------- /scripts/interleave/eval_interleave_3d.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/scripts/interleave/eval_interleave_3d.sh -------------------------------------------------------------------------------- /scripts/interleave/eval_multiprocess.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/scripts/interleave/eval_multiprocess.sh -------------------------------------------------------------------------------- /scripts/qwen.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/scripts/qwen.py -------------------------------------------------------------------------------- /scripts/summarize_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/scripts/summarize_data.py -------------------------------------------------------------------------------- /scripts/train/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/scripts/train/README.md -------------------------------------------------------------------------------- /scripts/train/stage1.5_caption_clip_qwen_victor.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/scripts/train/stage1.5_caption_clip_qwen_victor.sh -------------------------------------------------------------------------------- /scripts/train/stage1.5_caption_siglip_qwen_victor.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/scripts/train/stage1.5_caption_siglip_qwen_victor.sh -------------------------------------------------------------------------------- /scripts/train/stage1_pretrain_clip.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/scripts/train/stage1_pretrain_clip.sh -------------------------------------------------------------------------------- /scripts/train/stage1_pretrain_siglip.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/scripts/train/stage1_pretrain_siglip.sh -------------------------------------------------------------------------------- /scripts/train/stage2_finetune_clip_qwen.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/scripts/train/stage2_finetune_clip_qwen.sh -------------------------------------------------------------------------------- /scripts/train/stage2_finetune_siglip_qwen.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/scripts/train/stage2_finetune_siglip_qwen.sh -------------------------------------------------------------------------------- /scripts/video/demo/video_demo.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/scripts/video/demo/video_demo.sh -------------------------------------------------------------------------------- /scripts/video/eval/activitynet_eval.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/scripts/video/eval/activitynet_eval.sh -------------------------------------------------------------------------------- /scripts/video/eval/video_chatgpt_benchmark_eval_shard.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/scripts/video/eval/video_chatgpt_benchmark_eval_shard.sh -------------------------------------------------------------------------------- /scripts/video/eval/video_description_from_t2v.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/scripts/video/eval/video_description_from_t2v.sh -------------------------------------------------------------------------------- /scripts/video/eval/video_detail_description_eval_only.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/scripts/video/eval/video_detail_description_eval_only.sh -------------------------------------------------------------------------------- /scripts/video/eval/video_detail_description_eval_shard.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/scripts/video/eval/video_detail_description_eval_shard.sh -------------------------------------------------------------------------------- /scripts/video/train/SO400M_Qwen2_72B_ov_to_video_am9.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/scripts/video/train/SO400M_Qwen2_72B_ov_to_video_am9.sh -------------------------------------------------------------------------------- /scripts/video/train/SO400M_Qwen2_7B_ov_to_video_am9.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/scripts/video/train/SO400M_Qwen2_7B_ov_to_video_am9.sh -------------------------------------------------------------------------------- /scripts/video/train/exp.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/scripts/video/train/exp.yaml -------------------------------------------------------------------------------- /scripts/zero2.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/scripts/zero2.json -------------------------------------------------------------------------------- /scripts/zero2_fused_adamw.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/scripts/zero2_fused_adamw.json -------------------------------------------------------------------------------- /scripts/zero2_offload.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/scripts/zero2_offload.json -------------------------------------------------------------------------------- /scripts/zero3.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/scripts/zero3.json -------------------------------------------------------------------------------- /scripts/zero3_offload.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/scripts/zero3_offload.json -------------------------------------------------------------------------------- /scripts/zero3pp.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/deepglint/Victor/HEAD/scripts/zero3pp.json --------------------------------------------------------------------------------