├── .gitignore ├── INSTALLATION.md ├── LICENSE ├── README.md ├── assets └── teaser.png ├── baselines ├── base.py ├── config.json ├── gemini │ ├── __init__.py │ ├── extract_frames.py │ └── upload.py ├── gemini_modeling.py ├── gpt4o │ ├── __init__.py │ └── api_wrap.py ├── gpt4o_modeling.py ├── gpt4v │ ├── __init__.py │ └── api_wrap.py ├── gpt4v_modeling.py ├── llamavid │ ├── __init__.py │ ├── constants.py │ ├── conversation.py │ ├── model │ │ ├── __init__.py │ │ ├── builder.py │ │ ├── language_model │ │ │ └── llava_llama_vid.py │ │ ├── llamavid_arch.py │ │ ├── multimodal_encoder │ │ │ ├── builder.py │ │ │ ├── clip_encoder.py │ │ │ └── eva_vit.py │ │ ├── multimodal_projector │ │ │ └── builder.py │ │ └── qformer.py │ ├── processor │ │ ├── clip-patch14-224 │ │ │ ├── config.json │ │ │ └── preprocessor_config.json │ │ └── clip-patch14-336 │ │ │ ├── config.json │ │ │ └── preprocessor_config.json │ ├── serve │ │ ├── __init__.py │ │ ├── cli.py │ │ ├── controller.py │ │ ├── examples │ │ │ ├── Avatar.png │ │ │ ├── Avengers.jpg │ │ │ ├── Forrest_Gump.jpg │ │ │ ├── Interstellar.jpg │ │ │ ├── Titanic.jpg │ │ │ ├── extreme_ironing.jpg │ │ │ └── waterview.jpg │ │ ├── gradio_web_server.py │ │ ├── model_worker.py │ │ ├── model_worker_short.py │ │ ├── register_worker.py │ │ └── run_llamavid_movie.py │ └── train │ │ ├── llama_flash_attn_monkey_patch.py │ │ ├── llava_trainer.py │ │ ├── train.py │ │ └── train_mem.py ├── llamavid_modeling.py ├── llava │ ├── __init__.py │ ├── constants.py │ ├── conversation.py │ ├── eval │ │ ├── eval_gpt_review.py │ │ ├── eval_gpt_review_bench.py │ │ ├── eval_gpt_review_visual.py │ │ ├── eval_pope.py │ │ ├── eval_science_qa.py │ │ ├── eval_science_qa_gpt4.py │ │ ├── eval_science_qa_gpt4_requery.py │ │ ├── eval_textvqa.py │ │ ├── generate_webpage_data_from_table.py │ │ ├── m4c_evaluator.py │ │ ├── model_qa.py │ │ ├── model_vqa.py │ │ ├── model_vqa_loader.py │ │ ├── model_vqa_mmbench.py │ │ ├── model_vqa_science.py │ │ ├── qa_baseline_gpt35.py │ │ ├── run_llava.py │ │ ├── summarize_gpt_review.py │ │ ├── table │ │ │ ├── answer │ │ │ │ ├── answer_alpaca-13b.jsonl │ │ │ │ ├── answer_bard.jsonl │ │ │ │ ├── answer_gpt35.jsonl │ │ │ │ ├── answer_llama-13b.jsonl │ │ │ │ └── answer_vicuna-13b.jsonl │ │ │ ├── caps_boxes_coco2014_val_80.jsonl │ │ │ ├── model.jsonl │ │ │ ├── prompt.jsonl │ │ │ ├── question.jsonl │ │ │ ├── results │ │ │ │ ├── test_sqa_llava_13b_v0.json │ │ │ │ └── test_sqa_llava_lcs_558k_sqa_12e_vicuna_v1_3_13b.json │ │ │ ├── review │ │ │ │ ├── review_alpaca-13b_vicuna-13b.jsonl │ │ │ │ ├── review_bard_vicuna-13b.jsonl │ │ │ │ ├── review_gpt35_vicuna-13b.jsonl │ │ │ │ └── review_llama-13b_vicuna-13b.jsonl │ │ │ ├── reviewer.jsonl │ │ │ └── rule.json │ │ └── webpage │ │ │ ├── figures │ │ │ ├── alpaca.png │ │ │ ├── bard.jpg │ │ │ ├── chatgpt.svg │ │ │ ├── llama.jpg │ │ │ ├── swords_FILL0_wght300_GRAD0_opsz48.svg │ │ │ └── vicuna.jpeg │ │ │ ├── index.html │ │ │ ├── script.js │ │ │ └── styles.css │ ├── mm_utils.py │ ├── model │ │ ├── __init__.py │ │ ├── apply_delta.py │ │ ├── builder.py │ │ ├── consolidate.py │ │ ├── language_model │ │ │ ├── llava_llama.py │ │ │ ├── llava_mistral.py │ │ │ └── llava_mpt.py │ │ ├── llava_arch.py │ │ ├── make_delta.py │ │ ├── multimodal_encoder │ │ │ ├── builder.py │ │ │ └── clip_encoder.py │ │ ├── multimodal_projector │ │ │ └── builder.py │ │ └── utils.py │ ├── serve │ │ ├── __init__.py │ │ ├── cli.py │ │ ├── controller.py │ │ ├── examples │ │ │ ├── extreme_ironing.jpg │ │ │ └── waterview.jpg │ │ ├── gradio_web_server.py │ │ ├── model_worker.py │ │ 
├── register_worker.py │ │ ├── sglang_worker.py │ │ └── test_message.py │ ├── train │ │ ├── llama_flash_attn_monkey_patch.py │ │ ├── llama_xformers_attn_monkey_patch.py │ │ ├── llava_trainer.py │ │ ├── train.py │ │ ├── train_mem.py │ │ └── train_xformers.py │ └── utils.py ├── llava_modeling.py ├── llavanext_modeling.py ├── llavavid │ ├── __init__.py │ ├── constants.py │ ├── conversation.py │ ├── mm_utils.py │ ├── model │ │ ├── __init__.py │ │ ├── apply_delta.py │ │ ├── builder.py │ │ ├── consolidate.py │ │ ├── language_model │ │ │ ├── llava_llama.py │ │ │ ├── llava_mistral.py │ │ │ └── llava_mpt.py │ │ ├── llava_arch.py │ │ ├── make_delta.py │ │ ├── multimodal_encoder │ │ │ ├── builder.py │ │ │ └── clip_encoder.py │ │ ├── multimodal_projector │ │ │ └── builder.py │ │ ├── multimodal_resampler │ │ │ ├── builder.py │ │ │ └── spatial_pool.py │ │ └── utils.py │ └── utils.py ├── minigpt4 │ ├── __init__.py │ ├── common │ │ ├── __init__.py │ │ ├── config.py │ │ ├── dist_utils.py │ │ ├── eval_utils.py │ │ ├── gradcam.py │ │ ├── logger.py │ │ ├── optims.py │ │ ├── registry.py │ │ ├── utils.py │ │ └── vqa_tools │ │ │ ├── VQA │ │ │ ├── PythonEvaluationTools │ │ │ │ ├── vqaEvalDemo.py │ │ │ │ └── vqaEvaluation │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── vqaEval.py │ │ │ ├── PythonHelperTools │ │ │ │ ├── vqaDemo.py │ │ │ │ └── vqaTools │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── vqa.py │ │ │ └── README.md │ │ │ ├── __init__.py │ │ │ ├── aokvqa │ │ │ ├── LICENSE │ │ │ ├── README.md │ │ │ ├── data_scripts │ │ │ │ ├── build_vocab.py │ │ │ │ ├── encode_vocab_clip.py │ │ │ │ ├── extract_bert_features.py │ │ │ │ ├── extract_clip_features.py │ │ │ │ └── extract_resnet_features.py │ │ │ ├── environment.yml │ │ │ ├── evaluation │ │ │ │ ├── eval_predictions.py │ │ │ │ ├── load_aokvqa.py │ │ │ │ ├── prepare_predictions.py │ │ │ │ └── remap_predictions.py │ │ │ ├── gpt3 │ │ │ │ ├── README.md │ │ │ │ ├── caption_inputs.py │ │ │ │ ├── query_gpt3.py │ │ │ │ └── rationale_inputs.py │ │ │ ├── heuristics │ │ │ │ ├── README.md │ │ │ │ ├── most_common_answer.py │ │ │ │ ├── random_unweighted.py │ │ │ │ └── random_weighted.py │ │ │ ├── load_aokvqa.py │ │ │ └── transfer_experiments │ │ │ │ ├── README.md │ │ │ │ ├── predict.py │ │ │ │ └── train.py │ │ │ ├── vqa.py │ │ │ └── vqa_eval.py │ ├── configs │ │ ├── datasets │ │ │ ├── cc_sbu │ │ │ │ ├── align.yaml │ │ │ │ └── defaults.yaml │ │ │ ├── cmd_video │ │ │ │ └── default.yaml │ │ │ ├── laion │ │ │ │ └── defaults.yaml │ │ │ ├── template │ │ │ │ └── default.yaml │ │ │ ├── video_chatgpt │ │ │ │ └── default.yaml │ │ │ └── webvid │ │ │ │ └── default.yaml │ │ ├── default.yaml │ │ └── models │ │ │ ├── minigpt4.yaml │ │ │ └── minigpt4v.yaml │ ├── conversation │ │ ├── __init__.py │ │ └── conversation.py │ ├── datasets │ │ ├── __init__.py │ │ ├── builders │ │ │ ├── __init__.py │ │ │ ├── base_dataset_builder.py │ │ │ ├── image_text_pair_builder.py │ │ │ └── vqa_builder.py │ │ ├── data_utils.py │ │ └── datasets │ │ │ ├── __init__.py │ │ │ ├── aok_vqa_datasets.py │ │ │ ├── aok_vqa_reasoning_datasets.py │ │ │ ├── base_dataset.py │ │ │ ├── caption_datasets.py │ │ │ ├── caption_reasoning.py │ │ │ ├── cc_sbu_dataset.py │ │ │ ├── coco_caption.py │ │ │ ├── coco_vqa_datasets.py │ │ │ ├── cot.py │ │ │ ├── coyo_dataset.py │ │ │ ├── dataloader_utils.py │ │ │ ├── doc_dataset.py │ │ │ ├── gqa_datasets.py │ │ │ ├── grounded_caption_reasoning.py │ │ │ ├── grounded_detailed_image_caption_dataset.py │ │ │ ├── laion_dataset.py │ │ │ ├── llava_dataset.py │ │ │ ├── locna_dataset.py │ │ │ ├── lvis_dataset.py │ │ │ ├── 
nav_dataset.py │ │ │ ├── open_images.py │ │ │ ├── paint_dataset.py │ │ │ ├── reasoning_dataset.py │ │ │ ├── text_caps.py │ │ │ ├── textvqa_datasets.py │ │ │ ├── unnatural_instruction.py │ │ │ ├── vg_dataset.py │ │ │ ├── video_datasets.py │ │ │ └── vqa_datasets.py │ ├── mistral_test_config.yaml │ ├── models │ │ ├── Qformer.py │ │ ├── __init__.py │ │ ├── base_model.py │ │ ├── blip2.py │ │ ├── blip2_outputs.py │ │ ├── clip_vision_encoder.py │ │ ├── eva_vit.py │ │ ├── mini_gpt4_llama_v2.py │ │ ├── mistral.py │ │ ├── modeling_llama_v2.py │ │ ├── modeling_mistral.py │ │ └── policies │ │ │ ├── __init__.py │ │ │ ├── activation_checkpointing_functions.py │ │ │ ├── anyprecision_optimizer.py │ │ │ ├── fsdp_utils.py │ │ │ ├── mixed_precision.py │ │ │ └── wrapping.py │ ├── processors │ │ ├── __init__.py │ │ ├── base_processor.py │ │ ├── blip_processors.py │ │ └── randaugment.py │ ├── runners │ │ ├── __init__.py │ │ └── runner_base.py │ └── tasks │ │ ├── __init__.py │ │ ├── base_task.py │ │ ├── image_text_pretrain.py │ │ ├── vqa.py │ │ └── vqa_reading_comprehension.py ├── minigpt4video_modeling.py ├── pllava │ ├── models │ │ ├── __init__.py │ │ └── pllava │ │ │ ├── __init__.py │ │ │ ├── configuration_pllava.py │ │ │ ├── convert_pllava_weights_to_hf.py │ │ │ ├── modeling_pllava.py │ │ │ └── processing_pllava.py │ ├── tasks │ │ ├── eval │ │ │ ├── demo │ │ │ │ ├── __init__.py │ │ │ │ ├── pllava_demo.py │ │ │ │ ├── show_compare.py │ │ │ │ └── show_gallery.py │ │ │ ├── eval_utils.py │ │ │ ├── model_utils.py │ │ │ ├── mvbench │ │ │ │ ├── __init__.py │ │ │ │ └── pllava_eval_mvbench.py │ │ │ ├── recaption │ │ │ │ ├── __init__.py │ │ │ │ ├── pllava_recaption.py │ │ │ │ └── show_recaption.py │ │ │ ├── vcgbench │ │ │ │ ├── __init__.py │ │ │ │ ├── pllava_eval_vcgbench.py │ │ │ │ └── show_vcg.py │ │ │ └── videoqabench │ │ │ │ ├── __init__.py │ │ │ │ └── pllava_eval_videoqabench.py │ │ ├── shared_utils.py │ │ └── train │ │ │ ├── config_pllava_nframe.py │ │ │ ├── config_pllava_nframe_yiprompt.py │ │ │ ├── instruction_data.py │ │ │ └── train_pllava_nframe_accel.py │ └── utils │ │ ├── basic_utils.py │ │ ├── config.py │ │ ├── config_utils.py │ │ ├── distributed.py │ │ ├── easydict.py │ │ ├── logger.py │ │ ├── optimizer.py │ │ └── scheduler.py ├── pllava_modeling.py ├── share4video │ ├── __init__.py │ ├── constants.py │ ├── conversation.py │ ├── eval │ │ ├── evaluate_benchmark_1_correctness.py │ │ ├── evaluate_benchmark_2_detailed_orientation.py │ │ ├── evaluate_benchmark_3_context.py │ │ ├── evaluate_benchmark_4_temporal.py │ │ ├── evaluate_benchmark_5_consistency.py │ │ ├── model_vqa_loader.py │ │ ├── model_vqa_tempcompass.py │ │ ├── run_llava.py │ │ └── video │ │ │ ├── eval_mvbench.py │ │ │ ├── eval_vbench.py │ │ │ ├── general_utils.py │ │ │ ├── mvbench_utils.py │ │ │ └── vbench_utils.py │ ├── mm_utils.py │ ├── model │ │ ├── __init__.py │ │ ├── apply_delta.py │ │ ├── builder.py │ │ ├── consolidate.py │ │ ├── language_model │ │ │ ├── llava_llama.py │ │ │ ├── llava_mistral.py │ │ │ └── llava_mpt.py │ │ ├── llava_arch.py │ │ ├── make_delta.py │ │ ├── multimodal_encoder │ │ │ ├── builder.py │ │ │ ├── clip_encoder.py │ │ │ └── siglip_encoder.py │ │ ├── multimodal_projector │ │ │ └── builder.py │ │ └── utils.py │ ├── serve │ │ └── gradio_utils.py │ ├── train │ │ ├── llava_trainer.py │ │ ├── train.py │ │ └── train_mem.py │ ├── utils.py │ └── video_utils.py ├── sharegpt4video_modeling.py ├── valley │ ├── configs │ │ ├── deepspeed │ │ │ ├── config_zero2.json │ │ │ ├── config_zero3.json │ │ │ └── config_zero3_offload.json │ │ 
└── experiment │ │ │ ├── valley_stage1.yaml │ │ │ ├── valley_stage2.yaml │ │ │ ├── valley_stage2_lora.yaml │ │ │ └── valley_stage2_zero3.yaml │ ├── constants.py │ ├── conversation.py │ ├── data │ │ ├── dataset.py │ │ └── video_transform.py │ ├── inference │ │ ├── run_valley.py │ │ ├── run_valley_conv.py │ │ └── run_valley_llamma_v2.py │ ├── model │ │ ├── apply_delta.py │ │ ├── make_delta.py │ │ └── valley_model.py │ ├── train │ │ ├── train.py │ │ ├── train.sh │ │ └── trainner.py │ ├── util │ │ ├── config.py │ │ ├── data_util.py │ │ └── decode_img.py │ └── utils.py ├── valley_modeling.py ├── video_chat2 │ ├── configs │ │ ├── config.json │ │ ├── config_bert.json │ │ ├── data.py │ │ ├── instruction_data.py │ │ └── model.py │ ├── conversation.py │ ├── dataset │ │ ├── __init__.py │ │ ├── base_dataset.py │ │ ├── dataloader.py │ │ ├── it_dataset.py │ │ ├── pt_dataset.py │ │ ├── utils.py │ │ ├── video_transforms.py │ │ └── video_utils.py │ ├── models │ │ ├── __init__.py │ │ ├── bert │ │ │ ├── __init__.py │ │ │ ├── builder.py │ │ │ ├── tokenization_bert.py │ │ │ └── xbert.py │ │ ├── blip2 │ │ │ ├── Qformer.py │ │ │ ├── __init__.py │ │ │ ├── blip2.py │ │ │ ├── builder.py │ │ │ ├── modeling_llama.py │ │ │ ├── modeling_llama_mem.py │ │ │ ├── utils.py │ │ │ └── vit.py │ │ ├── criterions.py │ │ ├── utils.py │ │ ├── videochat2_it.py │ │ ├── videochat2_pt.py │ │ └── videochat2_qformer.py │ ├── prompts │ │ ├── concise_description.txt │ │ └── concise_image_description.txt │ ├── tasks │ │ ├── retrieval_utils.py │ │ ├── shared_utils.py │ │ ├── shared_utils_qformer.py │ │ ├── train_it.py │ │ ├── train_pt.py │ │ └── train_qformer.py │ └── utils │ │ ├── basic_utils.py │ │ ├── config.py │ │ ├── config_utils.py │ │ ├── distributed.py │ │ ├── easydict.py │ │ ├── logger.py │ │ ├── optimizer.py │ │ └── scheduler.py ├── video_chatgpt │ ├── __init__.py │ ├── constants.py │ ├── demo │ │ ├── __init__.py │ │ ├── chat.py │ │ ├── gradio_css.py │ │ ├── gradio_patch.py │ │ ├── template.py │ │ └── video_demo.py │ ├── eval │ │ ├── __init__.py │ │ ├── model_utils.py │ │ ├── run_inference_activitynet_qa.py │ │ ├── run_inference_benchmark_consistency.py │ │ └── run_inference_benchmark_general.py │ ├── inference.py │ ├── model │ │ ├── __init__.py │ │ ├── consolidate.py │ │ ├── make_delta.py │ │ ├── utils.py │ │ └── video_chatgpt.py │ ├── single_video_inference.py │ ├── train │ │ ├── llama_flash_attn_monkey_patch.py │ │ ├── llava_trainer.py │ │ ├── train.py │ │ └── train_mem.py │ ├── utils.py │ └── video_conversation.py ├── video_llama │ ├── __init__.py │ ├── common │ │ ├── __init__.py │ │ ├── config.py │ │ ├── dist_utils.py │ │ ├── gradcam.py │ │ ├── logger.py │ │ ├── optims.py │ │ ├── registry.py │ │ └── utils.py │ ├── configs │ │ ├── datasets │ │ │ ├── cc_sbu │ │ │ │ ├── align.yaml │ │ │ │ └── defaults.yaml │ │ │ ├── instruct │ │ │ │ ├── llava_instruct.yaml │ │ │ │ └── webvid_instruct.yaml │ │ │ ├── laion │ │ │ │ └── defaults.yaml │ │ │ └── webvid │ │ │ │ └── defaults.yaml │ │ ├── default.yaml │ │ └── models │ │ │ ├── minigpt4.yaml │ │ │ └── video_llama.yaml │ ├── conversation │ │ ├── __init__.py │ │ └── conversation_video.py │ ├── datasets │ │ ├── __init__.py │ │ ├── builders │ │ │ ├── __init__.py │ │ │ ├── base_dataset_builder.py │ │ │ ├── image_text_pair_builder.py │ │ │ ├── instruct_builder.py │ │ │ └── video_caption_builder.py │ │ ├── data_utils.py │ │ └── datasets │ │ │ ├── __init__.py │ │ │ ├── base_dataset.py │ │ │ ├── caption_datasets.py │ │ │ ├── cc_sbu_dataset.py │ │ │ ├── dataloader_utils.py │ │ │ ├── laion_dataset.py 
│ │ │ ├── llava_instruct_dataset.py │ │ │ ├── video_instruct_dataset.py │ │ │ └── webvid_datasets.py │ ├── models │ │ ├── ImageBind │ │ │ ├── .assets │ │ │ │ ├── bird_audio.wav │ │ │ │ ├── bird_image.jpg │ │ │ │ ├── car_audio.wav │ │ │ │ ├── car_image.jpg │ │ │ │ ├── dog_audio.wav │ │ │ │ └── dog_image.jpg │ │ │ ├── CODE_OF_CONDUCT.md │ │ │ ├── CONTRIBUTING.md │ │ │ ├── LICENSE │ │ │ ├── README.md │ │ │ ├── bpe │ │ │ │ └── bpe_simple_vocab_16e6.txt.gz │ │ │ ├── data.py │ │ │ ├── model_card.md │ │ │ ├── models │ │ │ │ ├── __init__.py │ │ │ │ ├── helpers.py │ │ │ │ ├── imagebind_model.py │ │ │ │ ├── multimodal_preprocessors.py │ │ │ │ └── transformer.py │ │ │ └── requirements.txt │ │ ├── Qformer.py │ │ ├── __init__.py │ │ ├── base_model.py │ │ ├── blip2.py │ │ ├── blip2_outputs.py │ │ ├── eva_vit.py │ │ ├── modeling_llama.py │ │ └── video_llama.py │ ├── processors │ │ ├── __init__.py │ │ ├── base_processor.py │ │ ├── blip_processors.py │ │ ├── functional_video.py │ │ ├── randaugment.py │ │ ├── transforms_video.py │ │ └── video_processor.py │ ├── runners │ │ ├── __init__.py │ │ ├── runner_base.py │ │ └── test.py │ ├── tasks │ │ ├── __init__.py │ │ ├── base_task.py │ │ ├── image_text_pretrain.py │ │ └── video_text_pretrain.py │ └── video_llama_eval_withaudio.yaml ├── videochat_modeling.py ├── videochatgpt_modeling.py ├── videolavit │ ├── __init__.py │ ├── conversation.py │ ├── models │ │ ├── __init__.py │ │ ├── modeling_3d_unet.py │ │ ├── modeling_decoder.py │ │ ├── modeling_motion_condition.py │ │ ├── modeling_motion_tokenizer.py │ │ ├── modeling_transformer_temporal.py │ │ ├── modeling_unet_3d_blocks.py │ │ ├── modeling_video_lavit_hf.py │ │ ├── modeling_visual_encoder.py │ │ ├── modeling_visual_tokenzier.py │ │ ├── transform.py │ │ ├── video_detokenizer.py │ │ ├── video_lavit_for_generation.py │ │ └── video_lavit_for_understanding.py │ └── utils.py ├── videolavit_modeling.py ├── videollama2 │ ├── __init__.py │ ├── constants.py │ ├── conversation.py │ ├── eval │ │ ├── eval_audio_TUT2017.py │ │ ├── eval_audio_clotho.py │ │ ├── eval_audio_clothoAQA.py │ │ ├── eval_audio_video_AVQA.py │ │ ├── eval_audio_video_AVSD.py │ │ ├── eval_audio_video_AVSSD.py │ │ ├── eval_audio_vocalsound.py │ │ ├── eval_video_cap_msvc_correctness.py │ │ ├── eval_video_cap_msvc_detailedness.py │ │ ├── eval_video_mcqa_mvbench.py │ │ ├── eval_video_mcqa_videomme.py │ │ ├── eval_video_oqa_activitynet.py │ │ ├── eval_video_oqa_vcgpt_1_correctness.py │ │ ├── eval_video_oqa_vcgpt_2_detailed_orientation.py │ │ ├── eval_video_oqa_vcgpt_3_context.py │ │ ├── eval_video_oqa_vcgpt_4_temporal.py │ │ ├── eval_video_oqa_vcgpt_5_consistency.py │ │ ├── inference_audio.py │ │ ├── inference_audio_video.py │ │ ├── inference_video_cap_msvc.py │ │ ├── inference_video_mcqa_egoschema.py │ │ ├── inference_video_mcqa_mvbench.py │ │ ├── inference_video_mcqa_perception_test_mcqa.py │ │ ├── inference_video_mcqa_videomme.py │ │ ├── inference_video_oqa_activitynet.py │ │ ├── inference_video_oqa_vcgpt_consistency.py │ │ └── inference_video_oqa_vcgpt_general.py │ ├── mm_utils.py │ ├── model │ │ ├── __init__.py │ │ ├── beats │ │ │ ├── BEATs.py │ │ │ ├── LICENSE_beats │ │ │ ├── Tokenizers.py │ │ │ ├── __init__.py │ │ │ ├── backbone.py │ │ │ ├── modules.py │ │ │ ├── quantizer.py │ │ │ └── weight_norm_fix.py │ │ ├── encoder.py │ │ ├── mel_filters.npz │ │ ├── projector.py │ │ ├── videollama2_arch.py │ │ ├── videollama2_gemma2.py │ │ ├── videollama2_llama.py │ │ ├── videollama2_mistral.py │ │ ├── videollama2_mixtral.py │ │ ├── videollama2_phi3.py │ │ └── 
videollama2_qwen2.py │ ├── serve │ │ ├── cli.py │ │ ├── controller.py │ │ ├── examples │ │ │ ├── bird-twitter-car.wav │ │ │ ├── desert.jpg │ │ │ ├── door.of.bar.raining2.wav │ │ │ ├── extreme_ironing.jpg │ │ │ └── waterview.jpg │ │ ├── gradio_web_server.py │ │ ├── gradio_web_server_adhoc.py │ │ ├── gradio_web_server_adhoc_av.py │ │ ├── model_worker.py │ │ ├── register_worker.py │ │ ├── sglang_worker.py │ │ └── test_message.py │ ├── train.py │ ├── utils.py │ └── videollama2_trainer.py ├── videollama2_modeling.py ├── videollama_modeling.py ├── videollava │ ├── __init__.py │ ├── constants.py │ ├── conversation.py │ ├── mm_utils.py │ ├── model │ │ ├── __init__.py │ │ ├── apply_delta.py │ │ ├── builder.py │ │ ├── consolidate.py │ │ ├── language_model │ │ │ ├── llava_llama.py │ │ │ ├── llava_mpt.py │ │ │ └── mpt │ │ │ │ ├── adapt_tokenizer.py │ │ │ │ ├── attention.py │ │ │ │ ├── blocks.py │ │ │ │ ├── configuration_mpt.py │ │ │ │ ├── custom_embedding.py │ │ │ │ ├── flash_attn_triton.py │ │ │ │ ├── hf_prefixlm_converter.py │ │ │ │ ├── meta_init_context.py │ │ │ │ ├── modeling_mpt.py │ │ │ │ ├── norm.py │ │ │ │ └── param_init_fns.py │ │ ├── llava_arch.py │ │ ├── make_delta.py │ │ ├── multimodal_encoder │ │ │ ├── builder.py │ │ │ ├── clip_encoder.py │ │ │ └── languagebind │ │ │ │ ├── __init__.py │ │ │ │ ├── audio │ │ │ │ ├── configuration_audio.py │ │ │ │ ├── modeling_audio.py │ │ │ │ ├── processing_audio.py │ │ │ │ └── tokenization_audio.py │ │ │ │ ├── depth │ │ │ │ ├── configuration_depth.py │ │ │ │ ├── modeling_depth.py │ │ │ │ ├── processing_depth.py │ │ │ │ └── tokenization_depth.py │ │ │ │ ├── image │ │ │ │ ├── configuration_image.py │ │ │ │ ├── modeling_image.py │ │ │ │ ├── processing_image.py │ │ │ │ └── tokenization_image.py │ │ │ │ ├── thermal │ │ │ │ ├── configuration_thermal.py │ │ │ │ ├── modeling_thermal.py │ │ │ │ ├── processing_thermal.py │ │ │ │ └── tokenization_thermal.py │ │ │ │ └── video │ │ │ │ ├── configuration_video.py │ │ │ │ ├── modeling_video.py │ │ │ │ ├── processing_video.py │ │ │ │ └── tokenization_video.py │ │ ├── multimodal_projector │ │ │ └── builder.py │ │ └── utils.py │ └── utils.py └── videollava_modeling.py ├── evaluations ├── evaluation.py ├── evaluation_bias.py ├── evaluation_bias_sep.py ├── evaluation_halluc.py ├── evaluation_pep.py ├── evaluation_pep_utils.py └── evaluation_utils.py ├── model_testing_zoo.py └── videohallucer_datasets ├── external_factual └── external_factual.json ├── external_nonfactual └── external_nonfactual.json ├── fact_detect ├── fact_detect.json ├── fact_detect_yn.json └── modify.py ├── interaction ├── conflict.jsonl ├── interaction.json └── stat.py ├── object_relation └── object_relation.json ├── semantic_detail └── semantic_detail.json └── temporal └── temporal.json /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Yuxuan Wang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /assets/teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrick-tssn/VideoHallucer/2c6152e61207fd00db1d6d8b72d4893533828867/assets/teaser.png -------------------------------------------------------------------------------- /baselines/base.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | class ViLLMBaseModel(torch.nn.Module): 4 | def __init__(self, model_path, device): 5 | super().__init__() 6 | self.device = device 7 | self.model_path = model_path 8 | 9 | def forward(self, instruction, videos): 10 | return self.generate(instruction, videos) 11 | 12 | def generate(self, instruction, videos): 13 | """ 14 | instruction: (str) a string of instruction 15 | videos: (list) a list of video paths 16 | Return: (str) a string of generated response 17 | """ 18 | raise NotImplementedError -------------------------------------------------------------------------------- /baselines/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "CKPT_DIR": "checkpoints", 3 | "DATA_DIR": "../videohallucer_datasets" 4 | } -------------------------------------------------------------------------------- /baselines/gemini/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrick-tssn/VideoHallucer/2c6152e61207fd00db1d6d8b72d4893533828867/baselines/gemini/__init__.py -------------------------------------------------------------------------------- /baselines/gemini/extract_frames.py: -------------------------------------------------------------------------------- 1 | import os, shutil, cv2 2 | 3 | def create_frame_output_dir(output_dir): 4 | if not os.path.exists(output_dir): 5 | os.makedirs(output_dir) 6 | else: 7 | shutil.rmtree(output_dir) 8 | os.makedirs(output_dir) 9 | 10 | def extract_frame_from_video(video_file_path, FRAME_EXTRACTION_DIRECTORY, FRAME_PREFIX, FPS=1): 11 | # print(f"Extracting {video_file_path} at 1 frame per second.
This might take a bit...") 12 | # print(video_file_path) 13 | create_frame_output_dir(FRAME_EXTRACTION_DIRECTORY) 14 | vidcap = cv2.VideoCapture(video_file_path) 15 | fps = vidcap.get(cv2.CAP_PROP_FPS) 16 | frame_duration = 1 / fps # Time interval between frames (in seconds) 17 | output_file_prefix = os.path.basename(video_file_path).replace('.', '_') 18 | frame_count = 0 19 | count = 0 20 | while vidcap.isOpened(): 21 | success, frame = vidcap.read() 22 | if not success: # End of video 23 | break 24 | if int(count / fps) == frame_count: # Extract a frame every second 25 | mins = frame_count // 60 26 | secs = frame_count % 60 27 | time_string = f"{mins:02d}:{secs:02d}" 28 | image_name = f"{output_file_prefix}{FRAME_PREFIX}{time_string}.jpg" 29 | output_filename = os.path.join(FRAME_EXTRACTION_DIRECTORY, image_name) 30 | cv2.imwrite(output_filename, frame) 31 | frame_count += 1 32 | count += 1 33 | vidcap.release() # Release the capture object 34 | # print(f"Completed video frame extraction!\n\nExtracted: {frame_count} frames") 35 | 36 | -------------------------------------------------------------------------------- /baselines/gemini/upload.py: -------------------------------------------------------------------------------- 1 | import os 2 | # import google.generativeai as genai 3 | 4 | class File: 5 | def __init__(self, file_path: str, frame_prefix: str, display_name: str = None): 6 | self.file_path = file_path 7 | if display_name: 8 | self.display_name = display_name 9 | self.timestamp = get_timestamp(file_path, frame_prefix) 10 | 11 | def set_file_response(self, response): 12 | self.response = response 13 | 14 | def get_timestamp(filename, FRAME_PREFIX): 15 | """Extracts the timestamp (as an 'MM:SS' string) from a filename with the format 16 | 'output_file_prefix_frame00:00.jpg'.
17 | """ 18 | parts = filename.split(FRAME_PREFIX) 19 | # print(parts) 20 | if len(parts) != 2: 21 | return None # Indicates the filename might be incorrectly formatted 22 | return parts[1].split('.')[0] 23 | 24 | def make_request(prompt, files): 25 | request = [prompt] 26 | for file in files: 27 | request.append(file.timestamp) 28 | request.append(file.response) 29 | return request 30 | -------------------------------------------------------------------------------- /baselines/gpt4o/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrick-tssn/VideoHallucer/2c6152e61207fd00db1d6d8b72d4893533828867/baselines/gpt4o/__init__.py -------------------------------------------------------------------------------- /baselines/gpt4o_modeling.py: -------------------------------------------------------------------------------- 1 | import os, shutil, cv2 2 | from gpt4o.api_wrap import OpenAIAPIWrapper 3 | 4 | from base import ViLLMBaseModel 5 | 6 | 7 | class GPT4O(ViLLMBaseModel): 8 | def __init__(self, model_args): 9 | super().__init__(model_args["model_path"], model_args["device"]) 10 | assert ( 11 | "model_path" in model_args 12 | and "device" in model_args 13 | ) 14 | 15 | self.model = OpenAIAPIWrapper() 16 | self.model_name = 'GPT4O' 17 | 18 | def generate(self, instruction, video_path): 19 | 20 | response, num_tokens = self.model.get_completion(instruction, video_path=video_path) 21 | response = response.strip() 22 | 23 | return response 24 | 25 | def create_frame_output_dir(output_dir): 26 | if not os.path.exists(output_dir): 27 | os.makedirs(output_dir) 28 | else: 29 | shutil.rmtree(output_dir) 30 | os.makedirs(output_dir) 31 | 32 | -------------------------------------------------------------------------------- /baselines/gpt4v/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrick-tssn/VideoHallucer/2c6152e61207fd00db1d6d8b72d4893533828867/baselines/gpt4v/__init__.py -------------------------------------------------------------------------------- /baselines/llamavid/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import LlavaLlamaAttForCausalLM 2 | -------------------------------------------------------------------------------- /baselines/llamavid/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "." 
5 | 6 | # Model Constants 7 | IGNORE_INDEX = -100 8 | IMAGE_TOKEN_INDEX = -200 9 | DEFAULT_IMAGE_TOKEN = "<image>" 10 | DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>" 11 | DEFAULT_IM_START_TOKEN = "<im_start>" 12 | DEFAULT_IM_END_TOKEN = "<im_end>" 13 | -------------------------------------------------------------------------------- /baselines/llamavid/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .language_model.llava_llama_vid import LlavaLlamaAttForCausalLM 2 | -------------------------------------------------------------------------------- /baselines/llamavid/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .clip_encoder import CLIPVisionTower 3 | from .eva_vit import EVAVisionTowerLavis 4 | 5 | def build_vision_tower(vision_tower_cfg, **kwargs): 6 | # vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', "./checkpoints/LLaMA-VID-7B/LAVIS/eva_vit_g.pth")) 7 | # vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', "./checkpoints/LLaMA-VID-7B/LAVIS/eva_vit_g.pth") 8 | vision_tower = "./checkpoints/LLaMA-VID-7B/LAVIS/eva_vit_g.pth" 9 | image_processor = getattr(vision_tower_cfg, 'image_processor', "./model_zoo/OpenAI/clip-vit-large-patch14") 10 | is_absolute_path_exists = os.path.exists(vision_tower) 11 | 12 | if not is_absolute_path_exists: 13 | raise ValueError(f'Vision tower not found: {vision_tower}') 14 | 15 | if "openai" in vision_tower.lower() or "laion" in vision_tower.lower(): 16 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 17 | elif "lavis" in vision_tower.lower() or "eva" in vision_tower.lower(): 18 | return EVAVisionTowerLavis(vision_tower, image_processor, args=vision_tower_cfg, **kwargs) 19 | else: 20 | raise ValueError(f'Unknown vision tower: {vision_tower}') 21 | 22 | -------------------------------------------------------------------------------- /baselines/llamavid/model/multimodal_projector/builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import re 4 | 5 | 6 | class IdentityMap(nn.Module): 7 | def __init__(self): 8 | super().__init__() 9 | 10 | def forward(self, x, *args, **kwargs): 11 | return x 12 | 13 | @property 14 | def config(self): 15 | return {"mm_projector_type": 'identity'} 16 | 17 | 18 | class SimpleResBlock(nn.Module): 19 | def __init__(self, channels): 20 | super().__init__() 21 | self.pre_norm = nn.LayerNorm(channels) 22 | 23 | self.proj = nn.Sequential( 24 | nn.Linear(channels, channels), 25 | nn.GELU(), 26 | nn.Linear(channels, channels) 27 | ) 28 | def forward(self, x): 29 | x = self.pre_norm(x) 30 | return x + self.proj(x) 31 | 32 | 33 | def build_vision_projector(config, delay_load=False, **kwargs): 34 | projector_type = getattr(config, 'mm_projector_type', 'linear') 35 | 36 | if projector_type == 'linear': 37 | return nn.Linear(config.mm_hidden_size, config.hidden_size) 38 | 39 | mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type) 40 | if mlp_gelu_match: 41 | mlp_depth = int(mlp_gelu_match.group(1)) 42 | modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)] 43 | for _ in range(1, mlp_depth): 44 | modules.append(nn.GELU()) 45 | modules.append(nn.Linear(config.hidden_size, config.hidden_size)) 46 | return nn.Sequential(*modules) 47 | 48 | if projector_type == 'identity': 49 | return IdentityMap() 50 | 51
| raise ValueError(f'Unknown projector type: {projector_type}') 52 | -------------------------------------------------------------------------------- /baselines/llamavid/processor/clip-patch14-224/preprocessor_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "crop_size": 224, 3 | "do_center_crop": true, 4 | "do_normalize": true, 5 | "do_resize": true, 6 | "feature_extractor_type": "CLIPFeatureExtractor", 7 | "image_mean": [ 8 | 0.48145466, 9 | 0.4578275, 10 | 0.40821073 11 | ], 12 | "image_std": [ 13 | 0.26862954, 14 | 0.26130258, 15 | 0.27577711 16 | ], 17 | "resample": 3, 18 | "size": 224 19 | } 20 | -------------------------------------------------------------------------------- /baselines/llamavid/processor/clip-patch14-336/preprocessor_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "crop_size": 336, 3 | "do_center_crop": true, 4 | "do_normalize": true, 5 | "do_resize": true, 6 | "feature_extractor_type": "CLIPFeatureExtractor", 7 | "image_mean": [ 8 | 0.48145466, 9 | 0.4578275, 10 | 0.40821073 11 | ], 12 | "image_std": [ 13 | 0.26862954, 14 | 0.26130258, 15 | 0.27577711 16 | ], 17 | "resample": 3, 18 | "size": 336 19 | } 20 | -------------------------------------------------------------------------------- /baselines/llamavid/serve/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrick-tssn/VideoHallucer/2c6152e61207fd00db1d6d8b72d4893533828867/baselines/llamavid/serve/__init__.py -------------------------------------------------------------------------------- /baselines/llamavid/serve/examples/Avatar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrick-tssn/VideoHallucer/2c6152e61207fd00db1d6d8b72d4893533828867/baselines/llamavid/serve/examples/Avatar.png -------------------------------------------------------------------------------- /baselines/llamavid/serve/examples/Avengers.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrick-tssn/VideoHallucer/2c6152e61207fd00db1d6d8b72d4893533828867/baselines/llamavid/serve/examples/Avengers.jpg -------------------------------------------------------------------------------- /baselines/llamavid/serve/examples/Forrest_Gump.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrick-tssn/VideoHallucer/2c6152e61207fd00db1d6d8b72d4893533828867/baselines/llamavid/serve/examples/Forrest_Gump.jpg -------------------------------------------------------------------------------- /baselines/llamavid/serve/examples/Interstellar.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrick-tssn/VideoHallucer/2c6152e61207fd00db1d6d8b72d4893533828867/baselines/llamavid/serve/examples/Interstellar.jpg -------------------------------------------------------------------------------- /baselines/llamavid/serve/examples/Titanic.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrick-tssn/VideoHallucer/2c6152e61207fd00db1d6d8b72d4893533828867/baselines/llamavid/serve/examples/Titanic.jpg -------------------------------------------------------------------------------- /baselines/llamavid/serve/examples/extreme_ironing.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrick-tssn/VideoHallucer/2c6152e61207fd00db1d6d8b72d4893533828867/baselines/llamavid/serve/examples/extreme_ironing.jpg -------------------------------------------------------------------------------- /baselines/llamavid/serve/examples/waterview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrick-tssn/VideoHallucer/2c6152e61207fd00db1d6d8b72d4893533828867/baselines/llamavid/serve/examples/waterview.jpg -------------------------------------------------------------------------------- /baselines/llamavid/serve/register_worker.py: -------------------------------------------------------------------------------- 1 | """ 2 | Manually register workers. 3 | 4 | Usage: 5 | python3 -m fastchat.serve.register_worker --controller http://localhost:21001 --worker-name http://localhost:21002 6 | """ 7 | 8 | import argparse 9 | 10 | import requests 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--controller-address", type=str) 15 | parser.add_argument("--worker-name", type=str) 16 | parser.add_argument("--check-heart-beat", action="store_true") 17 | args = parser.parse_args() 18 | 19 | url = args.controller_address + "/register_worker" 20 | data = { 21 | "worker_name": args.worker_name, 22 | "check_heart_beat": args.check_heart_beat, 23 | "worker_status": None, 24 | } 25 | r = requests.post(url, json=data) 26 | assert r.status_code == 200 27 | -------------------------------------------------------------------------------- /baselines/llamavid/train/train_mem.py: -------------------------------------------------------------------------------- 1 | # Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright: 2 | # Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright: 3 | # Make it more memory efficient by monkey patching the LLaMA model with FlashAttn. 4 | 5 | # Need to call this before importing transformers. 6 | from llamavid.train.llama_flash_attn_monkey_patch import replace_llama_attn_with_flash_attn 7 | 8 | replace_llama_attn_with_flash_attn() 9 | 10 | from llamavid.train.train import train 11 | 12 | if __name__ == "__main__": 13 | train() 14 | -------------------------------------------------------------------------------- /baselines/llava/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import LlavaLlamaForCausalLM 2 | -------------------------------------------------------------------------------- /baselines/llava/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "." 
5 | 6 | # Model Constants 7 | IGNORE_INDEX = -100 8 | IMAGE_TOKEN_INDEX = -200 9 | DEFAULT_IMAGE_TOKEN = "<image>" 10 | DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>" 11 | DEFAULT_IM_START_TOKEN = "<im_start>" 12 | DEFAULT_IM_END_TOKEN = "<im_end>" 13 | IMAGE_PLACEHOLDER = "<image-placeholder>" 14 | -------------------------------------------------------------------------------- /baselines/llava/eval/table/model.jsonl: -------------------------------------------------------------------------------- 1 | {"model_id": "vicuna-13b:20230322-clean-lang", "model_name": "vicuna-13b", "model_version": "20230322-clean-lang", "model_metadata": "vicuna-13b-20230322-clean-lang"} 2 | {"model_id": "alpaca-13b:v1", "model_name": "alpaca-13b", "model_version": "v1", "model_metadata": "alpaca-13b"} 3 | {"model_id": "llama-13b:v1", "model_name": "llama-13b", "model_version": "v1", "model_metadata": "hf-llama-13b"} 4 | {"model_id": "bard:20230327", "model_name": "bard", "model_version": "20230327", "model_metadata": "Google Bard 20230327"} 5 | {"model_id": "gpt-3.5-turbo:20230327", "model_name": "gpt-3.5-turbo", "model_version": "20230327", "model_metadata": "OpenAI ChatGPT gpt-3.5-turbo Chat Completion"} 6 | -------------------------------------------------------------------------------- /baselines/llava/eval/table/reviewer.jsonl: -------------------------------------------------------------------------------- 1 | {"reviewer_id": "gpt-4-0328-default", "prompt_id": 1, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for general questions"} 2 | {"reviewer_id": "gpt-4-0328-coding", "prompt_id": 2, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for coding questions"} 3 | {"reviewer_id": "gpt-4-0328-math", "prompt_id": 3, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for math questions"} 4 | {"reviewer_id": "gpt-4-0417-visual", "prompt_id": 4, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for math questions"} 5 | -------------------------------------------------------------------------------- /baselines/llava/eval/webpage/figures/alpaca.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrick-tssn/VideoHallucer/2c6152e61207fd00db1d6d8b72d4893533828867/baselines/llava/eval/webpage/figures/alpaca.png -------------------------------------------------------------------------------- /baselines/llava/eval/webpage/figures/bard.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrick-tssn/VideoHallucer/2c6152e61207fd00db1d6d8b72d4893533828867/baselines/llava/eval/webpage/figures/bard.jpg -------------------------------------------------------------------------------- /baselines/llava/eval/webpage/figures/chatgpt.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/llava/eval/webpage/figures/llama.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrick-tssn/VideoHallucer/2c6152e61207fd00db1d6d8b72d4893533828867/baselines/llava/eval/webpage/figures/llama.jpg -------------------------------------------------------------------------------- /baselines/llava/eval/webpage/figures/swords_FILL0_wght300_GRAD0_opsz48.svg: -------------------------------------------------------------------------------- 1 |
-------------------------------------------------------------------------------- /baselines/llava/eval/webpage/figures/vicuna.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrick-tssn/VideoHallucer/2c6152e61207fd00db1d6d8b72d4893533828867/baselines/llava/eval/webpage/figures/vicuna.jpeg -------------------------------------------------------------------------------- /baselines/llava/model/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | from .language_model.llava_llama import LlavaLlamaForCausalLM, LlavaConfig 3 | from .language_model.llava_mpt import LlavaMptForCausalLM, LlavaMptConfig 4 | from .language_model.llava_mistral import LlavaMistralForCausalLM, LlavaMistralConfig 5 | except: 6 | pass 7 | -------------------------------------------------------------------------------- /baselines/llava/model/apply_delta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m fastchat.model.apply_delta --base ~/model_weights/llama-7b --target ~/model_weights/vicuna-7b --delta lmsys/vicuna-7b-delta 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from tqdm import tqdm 9 | from transformers import AutoTokenizer, AutoModelForCausalLM 10 | from llava import LlavaLlamaForCausalLM 11 | 12 | 13 | def apply_delta(base_model_path, target_model_path, delta_path): 14 | print("Loading base model") 15 | base = AutoModelForCausalLM.from_pretrained( 16 | base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | 18 | print("Loading delta") 19 | delta = LlavaLlamaForCausalLM.from_pretrained(delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 20 | delta_tokenizer = AutoTokenizer.from_pretrained(delta_path) 21 | 22 | print("Applying delta") 23 | for name, param in tqdm(delta.state_dict().items(), desc="Applying delta"): 24 | if name not in base.state_dict(): 25 | assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model' 26 | continue 27 | if param.data.shape == base.state_dict()[name].shape: 28 | param.data += base.state_dict()[name] 29 | else: 30 | assert name in ['model.embed_tokens.weight', 'lm_head.weight'], \ 31 | f'{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}' 32 | bparam = base.state_dict()[name] 33 | param.data[:bparam.shape[0], :bparam.shape[1]] += bparam 34 | 35 | print("Saving target model") 36 | delta.save_pretrained(target_model_path) 37 | delta_tokenizer.save_pretrained(target_model_path) 38 | 39 | 40 | if __name__ == "__main__": 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument("--base-model-path", type=str, required=True) 43 | parser.add_argument("--target-model-path", type=str, required=True) 44 | parser.add_argument("--delta-path", type=str, required=True) 45 | 46 | args = parser.parse_args() 47 | 48 | apply_delta(args.base_model_path, args.target_model_path, args.delta_path) 49 | -------------------------------------------------------------------------------- /baselines/llava/model/consolidate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m llava.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from transformers import AutoTokenizer, AutoModelForCausalLM 9 | from llava.model import * 10 | from llava.model.utils import 
auto_upgrade 11 | 12 | 13 | def consolidate_ckpt(src_path, dst_path): 14 | print("Loading model") 15 | auto_upgrade(src_path) 16 | src_model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False) 18 | src_model.save_pretrained(dst_path) 19 | src_tokenizer.save_pretrained(dst_path) 20 | 21 | 22 | if __name__ == "__main__": 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--src", type=str, required=True) 25 | parser.add_argument("--dst", type=str, required=True) 26 | 27 | args = parser.parse_args() 28 | 29 | consolidate_ckpt(args.src, args.dst) 30 | -------------------------------------------------------------------------------- /baselines/llava/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .clip_encoder import CLIPVisionTower, CLIPVisionTowerS2 3 | 4 | 5 | def build_vision_tower(vision_tower_cfg, **kwargs): 6 | vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None)) 7 | is_absolute_path_exists = os.path.exists(vision_tower) 8 | use_s2 = getattr(vision_tower_cfg, 's2', False) 9 | local_file = "checkpoints/clip-vit-large-patch14-336" 10 | if os.path.exists(local_file): vision_tower = local_file 11 | if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion") or "ShareGPT4V" in vision_tower: 12 | if use_s2: 13 | return CLIPVisionTowerS2(vision_tower, args=vision_tower_cfg, **kwargs) 14 | else: 15 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 16 | 17 | raise ValueError(f'Unknown vision tower: {vision_tower}') 18 | -------------------------------------------------------------------------------- /baselines/llava/model/multimodal_projector/builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import re 4 | 5 | 6 | class IdentityMap(nn.Module): 7 | def __init__(self): 8 | super().__init__() 9 | 10 | def forward(self, x, *args, **kwargs): 11 | return x 12 | 13 | @property 14 | def config(self): 15 | return {"mm_projector_type": 'identity'} 16 | 17 | 18 | class SimpleResBlock(nn.Module): 19 | def __init__(self, channels): 20 | super().__init__() 21 | self.pre_norm = nn.LayerNorm(channels) 22 | 23 | self.proj = nn.Sequential( 24 | nn.Linear(channels, channels), 25 | nn.GELU(), 26 | nn.Linear(channels, channels) 27 | ) 28 | def forward(self, x): 29 | x = self.pre_norm(x) 30 | return x + self.proj(x) 31 | 32 | 33 | def build_vision_projector(config, delay_load=False, **kwargs): 34 | projector_type = getattr(config, 'mm_projector_type', 'linear') 35 | 36 | if projector_type == 'linear': 37 | return nn.Linear(config.mm_hidden_size, config.hidden_size) 38 | 39 | mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type) 40 | if mlp_gelu_match: 41 | mlp_depth = int(mlp_gelu_match.group(1)) 42 | modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)] 43 | for _ in range(1, mlp_depth): 44 | modules.append(nn.GELU()) 45 | modules.append(nn.Linear(config.hidden_size, config.hidden_size)) 46 | return nn.Sequential(*modules) 47 | 48 | if projector_type == 'identity': 49 | return IdentityMap() 50 | 51 | raise ValueError(f'Unknown projector type: {projector_type}') 52 | -------------------------------------------------------------------------------- 
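A minimal usage sketch for the projector builder above (not part of the repository). It assumes baselines/ is on PYTHONPATH; the SimpleNamespace config and the 1024/4096/576 sizes are illustrative stand-ins for the CLIP feature width, LLM hidden size, and patch-token count. With mm_projector_type="mlp2x_gelu", build_vision_projector returns a Linear -> GELU -> Linear stack.

from types import SimpleNamespace

import torch
from llava.model.multimodal_projector.builder import build_vision_projector

# Hypothetical config carrying only the attributes the builder reads.
cfg = SimpleNamespace(mm_projector_type="mlp2x_gelu", mm_hidden_size=1024, hidden_size=4096)
projector = build_vision_projector(cfg)  # Sequential(Linear(1024, 4096), GELU(), Linear(4096, 4096))

vision_feats = torch.randn(1, 576, 1024)  # e.g. a 24x24 grid of CLIP patch tokens
print(projector(vision_feats).shape)      # torch.Size([1, 576, 4096])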
/baselines/llava/model/utils.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoConfig 2 | 3 | 4 | def auto_upgrade(config): 5 | cfg = AutoConfig.from_pretrained(config) 6 | if 'llava' in config and 'llava' not in cfg.model_type: 7 | assert cfg.model_type == 'llama' 8 | print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.") 9 | print("You must upgrade the checkpoint to the new code base (this can be done automatically).") 10 | confirm = input("Please confirm that you want to upgrade the checkpoint. [Y/N]") 11 | if confirm.lower() in ["y", "yes"]: 12 | print("Upgrading checkpoint...") 13 | assert len(cfg.architectures) == 1 14 | setattr(cfg.__class__, "model_type", "llava") 15 | cfg.architectures[0] = 'LlavaLlamaForCausalLM' 16 | cfg.save_pretrained(config) 17 | print("Checkpoint upgraded.") 18 | else: 19 | print("Checkpoint upgrade aborted.") 20 | exit(1) 21 | -------------------------------------------------------------------------------- /baselines/llava/serve/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrick-tssn/VideoHallucer/2c6152e61207fd00db1d6d8b72d4893533828867/baselines/llava/serve/__init__.py -------------------------------------------------------------------------------- /baselines/llava/serve/examples/extreme_ironing.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrick-tssn/VideoHallucer/2c6152e61207fd00db1d6d8b72d4893533828867/baselines/llava/serve/examples/extreme_ironing.jpg -------------------------------------------------------------------------------- /baselines/llava/serve/examples/waterview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrick-tssn/VideoHallucer/2c6152e61207fd00db1d6d8b72d4893533828867/baselines/llava/serve/examples/waterview.jpg -------------------------------------------------------------------------------- /baselines/llava/serve/register_worker.py: -------------------------------------------------------------------------------- 1 | """ 2 | Manually register workers. 
3 | 4 | Usage: 5 | python3 -m fastchat.serve.register_worker --controller http://localhost:21001 --worker-name http://localhost:21002 6 | """ 7 | 8 | import argparse 9 | 10 | import requests 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--controller-address", type=str) 15 | parser.add_argument("--worker-name", type=str) 16 | parser.add_argument("--check-heart-beat", action="store_true") 17 | args = parser.parse_args() 18 | 19 | url = args.controller_address + "/register_worker" 20 | data = { 21 | "worker_name": args.worker_name, 22 | "check_heart_beat": args.check_heart_beat, 23 | "worker_status": None, 24 | } 25 | r = requests.post(url, json=data) 26 | assert r.status_code == 200 27 | -------------------------------------------------------------------------------- /baselines/llava/serve/test_message.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | import requests 5 | 6 | from llava.conversation import default_conversation 7 | 8 | 9 | def main(): 10 | if args.worker_address: 11 | worker_addr = args.worker_address 12 | else: 13 | controller_addr = args.controller_address 14 | ret = requests.post(controller_addr + "/refresh_all_workers") 15 | ret = requests.post(controller_addr + "/list_models") 16 | models = ret.json()["models"] 17 | models.sort() 18 | print(f"Models: {models}") 19 | 20 | ret = requests.post(controller_addr + "/get_worker_address", 21 | json={"model": args.model_name}) 22 | worker_addr = ret.json()["address"] 23 | print(f"worker_addr: {worker_addr}") 24 | 25 | if worker_addr == "": 26 | return 27 | 28 | conv = default_conversation.copy() 29 | conv.append_message(conv.roles[0], args.message) 30 | prompt = conv.get_prompt() 31 | 32 | headers = {"User-Agent": "LLaVA Client"} 33 | pload = { 34 | "model": args.model_name, 35 | "prompt": prompt, 36 | "max_new_tokens": args.max_new_tokens, 37 | "temperature": 0.7, 38 | "stop": conv.sep, 39 | } 40 | response = requests.post(worker_addr + "/worker_generate_stream", headers=headers, 41 | json=pload, stream=True) 42 | 43 | print(prompt.replace(conv.sep, "\n"), end="") 44 | for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False, delimiter=b"\0"): 45 | if chunk: 46 | data = json.loads(chunk.decode("utf-8")) 47 | output = data["text"].split(conv.sep)[-1] 48 | print(output, end="\r") 49 | print("") 50 | 51 | 52 | if __name__ == "__main__": 53 | parser = argparse.ArgumentParser() 54 | parser.add_argument("--controller-address", type=str, default="http://localhost:21001") 55 | parser.add_argument("--worker-address", type=str) 56 | parser.add_argument("--model-name", type=str, default="facebook/opt-350m") 57 | parser.add_argument("--max-new-tokens", type=int, default=32) 58 | parser.add_argument("--message", type=str, default= 59 | "Tell me a story with more than 1000 words.") 60 | args = parser.parse_args() 61 | 62 | main() 63 | -------------------------------------------------------------------------------- /baselines/llava/train/train_mem.py: -------------------------------------------------------------------------------- 1 | from llava.train.train import train 2 | 3 | if __name__ == "__main__": 4 | train(attn_implementation="flash_attention_2") 5 | -------------------------------------------------------------------------------- /baselines/llava/train/train_xformers.py: -------------------------------------------------------------------------------- 1 | # Make it more memory efficient by 
monkey patching the LLaMA model with xformers attention. 2 | 3 | # Need to call this before importing transformers. 4 | from llava.train.llama_xformers_attn_monkey_patch import ( 5 | replace_llama_attn_with_xformers_attn, 6 | ) 7 | 8 | replace_llama_attn_with_xformers_attn() 9 | 10 | from llava.train.train import train 11 | 12 | if __name__ == "__main__": 13 | train() 14 | -------------------------------------------------------------------------------- /baselines/llavavid/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import LlavaLlamaForCausalLM 2 | -------------------------------------------------------------------------------- /baselines/llavavid/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "." 5 | 6 | # Model Constants 7 | IGNORE_INDEX = -100 8 | IMAGE_TOKEN_INDEX = -200 9 | DEFAULT_IMAGE_TOKEN = "<image>" 10 | DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>" 11 | DEFAULT_IM_START_TOKEN = "<im_start>" 12 | DEFAULT_IM_END_TOKEN = "<im_end>" 13 | IMAGE_PLACEHOLDER = "<image-placeholder>" 14 | -------------------------------------------------------------------------------- /baselines/llavavid/model/__init__.py: -------------------------------------------------------------------------------- 1 | # try: 2 | from .language_model.llava_llama import LlavaLlamaForCausalLM, LlavaConfig 3 | from .language_model.llava_mpt import LlavaMptForCausalLM, LlavaMptConfig 4 | from .language_model.llava_mistral import LlavaMistralForCausalLM, LlavaMistralConfig 5 | # except: 6 | # pass 7 | -------------------------------------------------------------------------------- /baselines/llavavid/model/apply_delta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m fastchat.model.apply_delta --base ~/model_weights/llama-7b --target ~/model_weights/vicuna-7b --delta lmsys/vicuna-7b-delta 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from tqdm import tqdm 9 | from transformers import AutoTokenizer, AutoModelForCausalLM 10 | from llavavid import LlavaLlamaForCausalLM 11 | 12 | 13 | def apply_delta(base_model_path, target_model_path, delta_path): 14 | print("Loading base model") 15 | base = AutoModelForCausalLM.from_pretrained( 16 | base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | 18 | print("Loading delta") 19 | delta = LlavaLlamaForCausalLM.from_pretrained(delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 20 | delta_tokenizer = AutoTokenizer.from_pretrained(delta_path) 21 | 22 | print("Applying delta") 23 | for name, param in tqdm(delta.state_dict().items(), desc="Applying delta"): 24 | if name not in base.state_dict(): 25 | assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model' 26 | continue 27 | if param.data.shape == base.state_dict()[name].shape: 28 | param.data += base.state_dict()[name] 29 | else: 30 | assert name in ['model.embed_tokens.weight', 'lm_head.weight'], \ 31 | f'{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}' 32 | bparam = base.state_dict()[name] 33 | param.data[:bparam.shape[0], :bparam.shape[1]] += bparam 34 | 35 | print("Saving target model") 36 | delta.save_pretrained(target_model_path) 37 | delta_tokenizer.save_pretrained(target_model_path) 38 | 39 | 40 | if __name__ == "__main__": 41 | parser = argparse.ArgumentParser() 42 |
parser.add_argument("--base-model-path", type=str, required=True) 43 | parser.add_argument("--target-model-path", type=str, required=True) 44 | parser.add_argument("--delta-path", type=str, required=True) 45 | 46 | args = parser.parse_args() 47 | 48 | apply_delta(args.base_model_path, args.target_model_path, args.delta_path) 49 | -------------------------------------------------------------------------------- /baselines/llavavid/model/consolidate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m llava.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from transformers import AutoTokenizer, AutoModelForCausalLM 9 | from llavavid.model import * 10 | from llavavid.model.utils import auto_upgrade 11 | 12 | 13 | def consolidate_ckpt(src_path, dst_path): 14 | print("Loading model") 15 | auto_upgrade(src_path) 16 | src_model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False) 18 | src_model.save_pretrained(dst_path) 19 | src_tokenizer.save_pretrained(dst_path) 20 | 21 | 22 | if __name__ == "__main__": 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--src", type=str, required=True) 25 | parser.add_argument("--dst", type=str, required=True) 26 | 27 | args = parser.parse_args() 28 | 29 | consolidate_ckpt(args.src, args.dst) 30 | -------------------------------------------------------------------------------- /baselines/llavavid/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from llavavid.model.multimodal_encoder.clip_encoder import CLIPVisionTower 3 | 4 | 5 | def build_vision_tower(vision_tower_cfg, **kwargs): 6 | vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None)) 7 | is_absolute_path_exists = os.path.exists(vision_tower) 8 | if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion") or "ShareGPT4V" in vision_tower: 9 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 10 | 11 | raise ValueError(f'Unknown vision tower: {vision_tower}') 12 | -------------------------------------------------------------------------------- /baselines/llavavid/model/multimodal_projector/builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import re 4 | 5 | 6 | class IdentityMap(nn.Module): 7 | def __init__(self): 8 | super().__init__() 9 | 10 | def forward(self, x, *args, **kwargs): 11 | return x 12 | 13 | @property 14 | def config(self): 15 | return {"mm_projector_type": 'identity'} 16 | 17 | 18 | class SimpleResBlock(nn.Module): 19 | def __init__(self, channels): 20 | super().__init__() 21 | self.pre_norm = nn.LayerNorm(channels) 22 | 23 | self.proj = nn.Sequential( 24 | nn.Linear(channels, channels), 25 | nn.GELU(), 26 | nn.Linear(channels, channels) 27 | ) 28 | def forward(self, x): 29 | x = self.pre_norm(x) 30 | return x + self.proj(x) 31 | 32 | 33 | def build_vision_projector(config, delay_load=False, **kwargs): 34 | projector_type = getattr(config, 'mm_projector_type', 'linear') 35 | 36 | if projector_type == 'linear': 37 | return nn.Linear(config.mm_hidden_size, config.hidden_size) 38 | 39 | mlp_gelu_match = 
re.match(r'^mlp(\d+)x_gelu$', projector_type) 40 | if mlp_gelu_match: 41 | mlp_depth = int(mlp_gelu_match.group(1)) 42 | modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)] 43 | for _ in range(1, mlp_depth): 44 | modules.append(nn.GELU()) 45 | modules.append(nn.Linear(config.hidden_size, config.hidden_size)) 46 | return nn.Sequential(*modules) 47 | 48 | if projector_type == 'identity': 49 | return IdentityMap() 50 | 51 | raise ValueError(f'Unknown projector type: {projector_type}') 52 | -------------------------------------------------------------------------------- /baselines/llavavid/model/multimodal_resampler/builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from .spatial_pool import SpatialPool 4 | 5 | 6 | class IdentityMap(torch.nn.Module): 7 | def __init__(self): 8 | super().__init__() 9 | 10 | def forward(self, x, *args, **kwargs): 11 | return x 12 | 13 | @property 14 | def config(self): 15 | return {"mm_resampler_type": None} 16 | 17 | 18 | def build_vision_resampler(model_args, delay_load=False, **kwargs): 19 | resampler_type = getattr(model_args, "mm_resampler_type", None) 20 | if resampler_type == "spatial_pool": 21 | return SpatialPool(model_args, **kwargs) 22 | elif resampler_type is None: 23 | return IdentityMap() 24 | 25 | raise ValueError(f"Unknown resampler type: {resampler_type}") 26 | -------------------------------------------------------------------------------- /baselines/llavavid/model/multimodal_resampler/spatial_pool.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import math 4 | 5 | 6 | class SpatialPool(nn.Module): 7 | def __init__(self, model_args, vision_tower): 8 | super().__init__() 9 | 10 | self.mode = model_args.mm_spatial_pool_mode 11 | self.stride = model_args.mm_spatial_pool_stride 12 | # import pdb; pdb.set_trace() 13 | self.out_channels = getattr(model_args, "mm_spatial_pool_out_channels", vision_tower.hidden_size) 14 | 15 | if self.mode == "average": 16 | self.pool = nn.AvgPool2d(kernel_size=self.stride, stride=self.stride) 17 | elif self.mode == "max": 18 | self.pool = nn.MaxPool2d(kernel_size=self.stride, stride=self.stride) 19 | elif self.mode == "conv": 20 | self.pool = nn.Conv2d(in_channels=vision_tower.hidden_size, out_channels=self.out_channels, kernel_size=self.stride, stride=self.stride) 21 | else: 22 | raise ValueError(f"Unknown pooling mode: {self.mode}.") 23 | 24 | def forward(self, image_features, images, *args, **kwargs): 25 | # import pdb; pdb.set_trace() 26 | ori_W = int(math.sqrt(image_features.shape[1] * images.shape[3] // images.shape[2])) 27 | ori_H = int(ori_W * images.shape[2] // images.shape[3]) 28 | 29 | B, _, F = image_features.shape 30 | 31 | image_features_spatial = image_features.view(B, ori_H, ori_W, F).permute(0, 3, 1, 2) 32 | image_features_spatial_pool = self.pool(image_features_spatial) 33 | 34 | return image_features_spatial_pool.flatten(2).transpose(1, 2).contiguous() 35 | 36 | @property 37 | def config(self): 38 | return { 39 | "mm_resampler_type": "spatial_pool", 40 | "mm_spatial_pool_stride": self.stride, 41 | "mm_spatial_pool_mode": self.mode, 42 | "mm_spatial_pool_out_channels": self.out_channels, 43 | } 44 | 45 | @property 46 | def hidden_size(self): 47 | return self.out_channels 48 | -------------------------------------------------------------------------------- /baselines/llavavid/model/utils.py:
-------------------------------------------------------------------------------- 1 | from transformers import AutoConfig 2 | 3 | 4 | def auto_upgrade(config): 5 | cfg = AutoConfig.from_pretrained(config) 6 | if 'llava' in config and 'llava' not in cfg.model_type: 7 | assert cfg.model_type == 'llama' 8 | print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.") 9 | print("You must upgrade the checkpoint to the new code base (this can be done automatically).") 10 | confirm = input("Please confirm that you want to upgrade the checkpoint. [Y/N]") 11 | if confirm.lower() in ["y", "yes"]: 12 | print("Upgrading checkpoint...") 13 | assert len(cfg.architectures) == 1 14 | setattr(cfg.__class__, "model_type", "llava") 15 | cfg.architectures[0] = 'LlavaLlamaForCausalLM' 16 | cfg.save_pretrained(config) 17 | print("Checkpoint upgraded.") 18 | else: 19 | print("Checkpoint upgrade aborted.") 20 | exit(1) 21 | -------------------------------------------------------------------------------- /baselines/minigpt4/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import os 9 | import sys 10 | 11 | from omegaconf import OmegaConf 12 | 13 | from minigpt4.common.registry import registry 14 | 15 | from minigpt4.datasets.builders import * 16 | from minigpt4.models import * 17 | from minigpt4.processors import * 18 | from minigpt4.tasks import * 19 | 20 | 21 | root_dir = os.path.dirname(os.path.abspath(__file__)) 22 | default_cfg = OmegaConf.load(os.path.join(root_dir, "configs/default.yaml")) 23 | 24 | registry.register_path("library_root", root_dir) 25 | repo_root = os.path.join(root_dir, "..") 26 | registry.register_path("repo_root", repo_root) 27 | cache_root = os.path.join(repo_root, default_cfg.env.cache_root) 28 | registry.register_path("cache_root", cache_root) 29 | 30 | registry.register("MAX_INT", sys.maxsize) 31 | registry.register("SPLIT_NAMES", ["train", "val", "test"]) 32 | -------------------------------------------------------------------------------- /baselines/minigpt4/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrick-tssn/VideoHallucer/2c6152e61207fd00db1d6d8b72d4893533828867/baselines/minigpt4/common/__init__.py -------------------------------------------------------------------------------- /baselines/minigpt4/common/gradcam.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from matplotlib import pyplot as plt 3 | from scipy.ndimage import filters 4 | from skimage import transform as skimage_transform 5 | 6 | 7 | def getAttMap(img, attMap, blur=True, overlap=True): 8 | attMap -= attMap.min() 9 | if attMap.max() > 0: 10 | attMap /= attMap.max() 11 | attMap = skimage_transform.resize(attMap, (img.shape[:2]), order=3, mode="constant") 12 | if blur: 13 | attMap = filters.gaussian_filter(attMap, 0.02 * max(img.shape[:2])) 14 | attMap -= attMap.min() 15 | attMap /= attMap.max() 16 | cmap = plt.get_cmap("jet") 17 | attMapV = cmap(attMap) 18 | attMapV = np.delete(attMapV, 3, 2) 19 | if overlap: 20 | attMap = ( 21 | 1 * (1 - attMap**0.7).reshape(attMap.shape + (1,)) * img 22 | + (attMap**0.7).reshape(attMap.shape + 
(1,)) * attMapV 23 | ) 24 | return attMap 25 | -------------------------------------------------------------------------------- /baselines/minigpt4/common/vqa_tools/VQA/PythonEvaluationTools/vqaEvaluation/__init__.py: -------------------------------------------------------------------------------- 1 | author='aagrawal' 2 | -------------------------------------------------------------------------------- /baselines/minigpt4/common/vqa_tools/VQA/PythonHelperTools/vqaTools/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'aagrawal' 2 | -------------------------------------------------------------------------------- /baselines/minigpt4/common/vqa_tools/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | __author__ = "aagrawal" 9 | -------------------------------------------------------------------------------- /baselines/minigpt4/common/vqa_tools/aokvqa/data_scripts/build_vocab.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | from collections import Counter 4 | import pathlib 5 | 6 | from load_aokvqa import load_aokvqa 7 | 8 | 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--aokvqa-dir', type=pathlib.Path, required=True, dest='aokvqa_dir') 11 | parser.add_argument('--out', type=pathlib.Path, required=True, dest='output_file') 12 | args = parser.parse_args() 13 | 14 | 15 | # Build vocab from train set: correct choices + (direct answers appearing in >= 3 ) 16 | 17 | train_set = load_aokvqa(args.aokvqa_dir, 'train') 18 | 19 | vocab = [] 20 | all_choices = Counter() 21 | direct_answers = Counter() 22 | 23 | for i in train_set: 24 | vocab.append( i['choices'][i['correct_choice_idx']] ) 25 | all_choices.update(i['choices']) 26 | direct_answers.update(set(i['direct_answers'])) 27 | vocab += [k for k,v in all_choices.items() if v >= 3] 28 | vocab += [k for k,v in direct_answers.items() if v >= 3] 29 | 30 | vocab = sorted(set(vocab)) 31 | print(f"Vocab size: {len(vocab)}") 32 | 33 | # Save vocabulary Output 34 | 35 | with open(args.output_file, 'w') as f: 36 | for v in vocab: 37 | print(v, file=f) 38 | 39 | ## Check validation set coverage 40 | 41 | val_set = load_aokvqa(args.aokvqa_dir, 'val') 42 | 43 | val_acc = [v['choices'][v['correct_choice_idx']] in vocab for v in val_set] 44 | val_acc = sum(val_acc) / len(val_acc) * 100 45 | print(f"Val set coverage: {val_acc:.2f}" ) 46 | -------------------------------------------------------------------------------- /baselines/minigpt4/common/vqa_tools/aokvqa/data_scripts/encode_vocab_clip.py: -------------------------------------------------------------------------------- 1 | import json 2 | from tqdm import tqdm 3 | import argparse 4 | import pathlib 5 | 6 | import torch 7 | import clip 8 | 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--vocab', type=pathlib.Path, required=True, dest='vocab_file') 11 | parser.add_argument('--model-type', type=str, choices=['RN50', 'RN50x4', 'RN50x16', 'RN50x64', 'RN101', 'ViT-B/32', 'ViT-B/16', 'ViT-L/14', 'ViT-L/14@336px'], required=True, dest='model_type') 12 | parser.add_argument('--out', type=pathlib.Path, required=True, dest='output_file') 13 | args = 
parser.parse_args() 14 | 15 | assert args.output_file.suffix == '.pt' 16 | 17 | device = "cuda" if torch.cuda.is_available() else "cpu" 18 | model, preprocess = clip.load(args.model_type, device=device) 19 | 20 | with torch.no_grad(): 21 | a = open(args.vocab_file).read().splitlines() 22 | mc_text = clip.tokenize(a).to(device) 23 | mc_text_features = torch.stack([model.encode_text(mct.unsqueeze(0)).cpu() for mct in tqdm(mc_text)], dim=1)[0] 24 | mc_text_features = mc_text_features.float() 25 | model_name = args.model_type.replace('/', '-').replace('@', '-') 26 | torch.save(mc_text_features, args.output_file) 27 | -------------------------------------------------------------------------------- /baselines/minigpt4/common/vqa_tools/aokvqa/data_scripts/extract_bert_features.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import pathlib 4 | from tqdm import tqdm 5 | 6 | import torch 7 | from transformers import AutoTokenizer, AutoModel 8 | 9 | from load_aokvqa import load_aokvqa 10 | 11 | 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('--aokvqa-dir', type=pathlib.Path, required=True, dest='aokvqa_dir') 14 | parser.add_argument('--split', type=str, choices=['train', 'val', 'test'], required=True) 15 | parser.add_argument('--out', type=pathlib.Path, required=True, dest='output_file') 16 | args = parser.parse_args() 17 | 18 | assert args.output_file.suffix == '.pt' 19 | 20 | ## Load dataset 21 | 22 | dataset = load_aokvqa(args.aokvqa_dir, args.split) 23 | 24 | ## Load model 25 | 26 | tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens') 27 | model = AutoModel.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens') 28 | device = "cuda" if torch.cuda.is_available() else "cpu" 29 | model = model.to(device) 30 | model.eval() 31 | 32 | def mean_pooling(model_output, attention_mask): 33 | token_embeddings = model_output[0] # First element of model_output contains all token embeddings 34 | input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() 35 | return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9) 36 | 37 | ## Encoding loop 38 | 39 | with torch.no_grad(): 40 | embeddings = {} 41 | 42 | for d in tqdm(dataset): 43 | encoded_input = tokenizer([d['question']], padding=True, return_tensors='pt') 44 | encoded_input = {k:v.to(device) for k,v in encoded_input.items()} 45 | e = mean_pooling(model(**encoded_input), encoded_input['attention_mask']) 46 | embeddings[d['question_id']] = { 47 | 'question' : e[0].cpu() 48 | } 49 | 50 | torch.save(embeddings, args.output_file) 51 | -------------------------------------------------------------------------------- /baselines/minigpt4/common/vqa_tools/aokvqa/data_scripts/extract_clip_features.py: -------------------------------------------------------------------------------- 1 | import os 2 | from PIL import Image 3 | from tqdm import tqdm 4 | import argparse 5 | import pathlib 6 | 7 | import torch 8 | import clip 9 | 10 | from load_aokvqa import load_aokvqa, get_coco_path 11 | 12 | 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument('--aokvqa-dir', type=pathlib.Path, required=True, dest='aokvqa_dir') 15 | parser.add_argument('--coco-dir', type=pathlib.Path, required=True, dest='coco_dir') 16 | parser.add_argument('--split', type=str, choices=['train', 'val', 'test'], required=True) 17 | parser.add_argument('--model-type', 
type=str, choices=['RN50', 'RN50x4', 'RN50x16', 'RN50x64', 'RN101', 'ViT-B/32', 'ViT-B/16', 'ViT-L/14', 'ViT-L/14@336px'], required=True, dest='model_type') 18 | parser.add_argument('--out', type=pathlib.Path, required=True, dest='output_file') 19 | args = parser.parse_args() 20 | 21 | assert args.output_file.suffix == '.pt' 22 | 23 | ## Load dataset 24 | 25 | dataset = load_aokvqa(args.aokvqa_dir, args.split) 26 | 27 | ## Load model 28 | 29 | device = "cuda" if torch.cuda.is_available() else "cpu" 30 | model, preprocess = clip.load(args.model_type, device=device) 31 | 32 | ## Encoding loop 33 | 34 | with torch.no_grad(): 35 | embeddings = {} 36 | 37 | for d in tqdm(dataset): 38 | q = d["question"] 39 | q_text = clip.tokenize(q).to(device) 40 | q_text_features = model.encode_text(q_text) 41 | 42 | img = Image.open(get_coco_path(args.split, d['image_id'], args.coco_dir)) 43 | img = preprocess(img).unsqueeze(0).to(device) 44 | image_features = model.encode_image(img) 45 | 46 | embeddings[d['question_id']] = { 47 | 'question' : q_text_features[0].float().cpu(), 48 | 'image' : image_features[0].float().cpu(), 49 | } 50 | 51 | torch.save(embeddings, args.output_file) 52 | -------------------------------------------------------------------------------- /baselines/minigpt4/common/vqa_tools/aokvqa/data_scripts/extract_resnet_features.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import pathlib 4 | from tqdm import tqdm 5 | from PIL import Image 6 | 7 | import torch 8 | import torch.nn as nn 9 | from torchvision import models 10 | from torchvision import transforms as T 11 | 12 | from load_aokvqa import load_aokvqa, get_coco_path 13 | 14 | 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument('--aokvqa-dir', type=pathlib.Path, required=True, dest='aokvqa_dir') 17 | parser.add_argument('--coco-dir', type=pathlib.Path, required=True, dest='coco_dir') 18 | parser.add_argument('--split', type=str, choices=['train', 'val', 'test'], required=True) 19 | parser.add_argument('--out', type=pathlib.Path, required=True, dest='output_file') 20 | args = parser.parse_args() 21 | 22 | assert args.output_file.suffix == '.pt' 23 | 24 | ## Load dataset 25 | 26 | dataset = load_aokvqa(args.aokvqa_dir, args.split) 27 | 28 | ## Load model 29 | 30 | resnet_preprocess = T.Compose([ 31 | T.Resize(size=224, interpolation=T.InterpolationMode.BICUBIC), 32 | T.CenterCrop(size=(224, 224)), 33 | T.ToTensor(), 34 | T.Normalize( 35 | mean=[0.485, 0.456, 0.406], 36 | std=[0.229, 0.224, 0.225] 37 | ) 38 | ]) 39 | 40 | device = "cuda" if torch.cuda.is_available() else "cpu" 41 | 42 | resnet_model = models.resnet50(pretrained=True) 43 | resnet_model = torch.nn.Sequential( 44 | *list(resnet_model.children())[:-1], 45 | nn.Flatten() 46 | ) # strip classification layer 47 | resnet_model = resnet_model.to(device) 48 | 49 | ## Encoding loop 50 | 51 | with torch.no_grad(): 52 | embeddings = {} 53 | 54 | for d in tqdm(dataset): 55 | img = Image.open(get_coco_path(args.split, d['image_id'], args.coco_dir)).convert('RGB') 56 | resnet_input = resnet_preprocess(img).unsqueeze(0).to(device) 57 | resnet_features = resnet_model(resnet_input) 58 | embeddings[d['question_id']] = { 59 | 'image' : resnet_features[0].cpu() 60 | } 61 | 62 | torch.save(embeddings, args.output_file) 63 | -------------------------------------------------------------------------------- /baselines/minigpt4/common/vqa_tools/aokvqa/environment.yml: 
-------------------------------------------------------------------------------- 1 | name: aokvqa 2 | channels: 3 | - pytorch 4 | - nvidia 5 | - huggingface 6 | - conda-forge 7 | - defaults 8 | dependencies: 9 | - python=3.7 10 | - cudatoolkit=11.3 11 | - numpy=1.21.6 12 | - pytorch=1.11.0 13 | - torchvision=0.12.0 14 | - pytorch-lightning=1.6.3 15 | - torchmetrics=0.8.1 16 | - gdown=4.4.0 17 | - pip=22.0.4 18 | - pip: 19 | - argparse==1.4.0 20 | - Pillow==9.0.1 21 | - tensorboard==2.9.0 22 | - ftfy==6.1.1 23 | - regex==2022.3.15 24 | - tqdm==4.64.0 25 | - clip @ git+https://github.com/openai/CLIP.git@b46f5ac7587d2e1862f8b7b1573179d80dcdd620 26 | - openai==0.18.1 27 | - nltk==3.7 28 | - sacrebleu==2.0.0 29 | - sacremoses==0.0.53 30 | - sentence-transformers==2.2.0 31 | - datasets==2.1.0 32 | - tokenizers==0.10.3 33 | - transformers==4.10.3 34 | 35 | # Next: resolve conflict between sentence-transfomers and pytorch-lightning 36 | # pip uninstall sentencepiece 37 | -------------------------------------------------------------------------------- /baselines/minigpt4/common/vqa_tools/aokvqa/evaluation/load_aokvqa.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | 4 | 5 | def load_aokvqa(aokvqa_dir, split, version='v1p0'): 6 | assert split in ['train', 'val', 'test', 'test_w_ans'] 7 | dataset = json.load(open( 8 | os.path.join(aokvqa_dir, f"aokvqa_{version}_{split}.json") 9 | )) 10 | return dataset 11 | 12 | def get_coco_path(split, image_id, coco_dir): 13 | return os.path.join(coco_dir, f"{split}2017", f"{image_id:012}.jpg") 14 | -------------------------------------------------------------------------------- /baselines/minigpt4/common/vqa_tools/aokvqa/evaluation/prepare_predictions.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pathlib 3 | import json 4 | 5 | from load_aokvqa import load_aokvqa 6 | 7 | 8 | if __name__ == '__main__': 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--aokvqa-dir', type=pathlib.Path, required=True, dest='aokvqa_dir') 11 | parser.add_argument('--split', type=str, choices=['train', 'val', 'test'], required=True) 12 | parser.add_argument('--mc', type=argparse.FileType('r'), dest='mc_pred_file') 13 | parser.add_argument('--da', type=argparse.FileType('r'), dest='da_pred_file') 14 | parser.add_argument('--out', type=argparse.FileType('w'), dest='output_file') 15 | args = parser.parse_args() 16 | assert args.mc_pred_file or args.da_pred_file 17 | 18 | dataset = load_aokvqa(args.aokvqa_dir, args.split) 19 | mc_preds = json.load(args.mc_pred_file) if args.mc_pred_file else None 20 | da_preds = json.load(args.da_pred_file) if args.da_pred_file else None 21 | predictions = {} 22 | 23 | for d in dataset: 24 | q = d['question_id'] 25 | predictions[q] = {} 26 | if mc_preds and q in mc_preds.keys(): 27 | predictions[q]['multiple_choice'] = mc_preds[q] 28 | if da_preds and q in da_preds.keys(): 29 | predictions[q]['direct_answer'] = da_preds[q] 30 | 31 | json.dump(predictions, args.output_file) 32 | -------------------------------------------------------------------------------- /baselines/minigpt4/common/vqa_tools/aokvqa/evaluation/remap_predictions.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pathlib 3 | import json 4 | from tqdm import tqdm 5 | 6 | from sentence_transformers import SentenceTransformer 7 | from sentence_transformers.util import cos_sim 8 | 
9 | from load_aokvqa import load_aokvqa 10 | 11 | 12 | def map_to_choices(dataset, predictions, device='cpu'): 13 | if isinstance(dataset, list): 14 | dataset = { dataset[i]['question_id'] : dataset[i] for i in range(len(dataset)) } 15 | 16 | if all([p in dataset[q]['choices'] for q, p in predictions.items()]): 17 | return predictions 18 | 19 | model = SentenceTransformer('sentence-transformers/average_word_embeddings_glove.6B.300d') 20 | model.to(device) 21 | for q in tqdm(predictions.keys()): 22 | choices = dataset[q]['choices'] 23 | if predictions[q] not in choices: 24 | choice_embeddings = model.encode([predictions[q]] + choices, convert_to_tensor=True) 25 | a_idx = cos_sim(choice_embeddings[0], choice_embeddings[1:]).argmax().item() 26 | predictions[q] = choices[a_idx] 27 | 28 | return predictions 29 | 30 | 31 | if __name__ == '__main__': 32 | parser = argparse.ArgumentParser() 33 | parser.add_argument('--aokvqa-dir', type=pathlib.Path, required=True, dest='aokvqa_dir') 34 | parser.add_argument('--split', type=str, choices=['train', 'val', 'test'], required=True) 35 | parser.add_argument('--pred', type=argparse.FileType('r'), required=True, dest='prediction_file') 36 | parser.add_argument('--out', type=argparse.FileType('w'), required=True, dest='output_file') 37 | args = parser.parse_args() 38 | 39 | 40 | dataset = load_aokvqa(args.aokvqa_dir, args.split) 41 | predictions = json.load(args.prediction_file) 42 | predictions = map_to_choices(dataset, predictions) 43 | 44 | json.dump(predictions, args.output_file) 45 | -------------------------------------------------------------------------------- /baselines/minigpt4/common/vqa_tools/aokvqa/gpt3/README.md: -------------------------------------------------------------------------------- 1 | ## Querying GPT-3 2 | 3 | To follow our experiments which use GPT-3, you must have access to the [OpenAI API](https://openai.com/api/) (at cost). Please retrieve your [organization](https://beta.openai.com/account/org-settings) and [API](https://beta.openai.com/account/api-keys) keys and set them in your environment variables. 4 | 5 | ```bash 6 | export OPENAI_ORG=.... 7 | export OPENAI_API_KEY=... 
8 | ``` 9 | 10 | For producing predictions for both DA and MC settings, run: 11 | ```bash 12 | python gpt3/query_gpt3.py --aokvqa-dir ${AOKVQA_DIR} --split val --out ${PREDS_DIR}/gpt3_val-da.json 13 | python remap_predictions.py --aokvqa-dir ${AOKVQA_DIR} --split val --pred ${PREDS_DIR}/gpt3_val-da.json --out ${PREDS_DIR}/gpt3_val-mc.json 14 | ``` 15 | -------------------------------------------------------------------------------- /baselines/minigpt4/common/vqa_tools/aokvqa/gpt3/caption_inputs.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import pathlib 5 | 6 | from load_aokvqa import load_aokvqa 7 | 8 | 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--aokvqa-dir', type=pathlib.Path, required=True, dest='aokvqa_dir') 11 | parser.add_argument('--coco-dir', type=pathlib.Path, required=True, dest='coco_dir') 12 | parser.add_argument('--split', type=str, choices=['train', 'val'], required=True) 13 | parser.add_argument('--out', type=argparse.FileType('w'), required=True, dest='output_file') 14 | args = parser.parse_args() 15 | 16 | aokvqa_set = load_aokvqa(args.aokvqa_dir, args.split) 17 | 18 | coco_captions = json.load(open(os.path.join(args.coco_dir, 'annotations', f'captions_{args.split}2017.json')))['annotations'] 19 | coco_captions = {c['image_id'] : c['caption'] for c in coco_captions} 20 | 21 | captions = { d['question_id'] : coco_captions[d['image_id']] for d in aokvqa_set } 22 | 23 | json.dump(captions, args.output_file) 24 | -------------------------------------------------------------------------------- /baselines/minigpt4/common/vqa_tools/aokvqa/gpt3/rationale_inputs.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | import pathlib 4 | 5 | from load_aokvqa import load_aokvqa 6 | 7 | 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('--aokvqa-dir', type=pathlib.Path, required=True, dest='aokvqa_dir') 10 | parser.add_argument('--split', type=str, choices=['train', 'val', 'test_w_ans'], required=True) 11 | parser.add_argument('--out', type=argparse.FileType('w'), required=True, dest='output_file') 12 | args = parser.parse_args() 13 | 14 | aokvqa_set = load_aokvqa(args.aokvqa_dir, args.split) 15 | rationales = {d['question_id'] : d['rationales'][0] for d in aokvqa_set} 16 | json.dump(rationales, args.output_file) 17 | -------------------------------------------------------------------------------- /baselines/minigpt4/common/vqa_tools/aokvqa/heuristics/README.md: -------------------------------------------------------------------------------- 1 | ## Heuristics 2 | 3 | ```bash 4 | # These scripts accept the same arguments. 
5 | # heuristics/random_unweighted.py 6 | # heuristics/random_weighted.py 7 | # heuristics/most_common_answer.py 8 | 9 | python heuristics/random_unweighted.py --aokvqa-dir ${AOKVQA_DIR} --split val --mc --out ${PREDS_DIR}/random-unweighted_val-mc.json 10 | # Exclude --mc for the direct answer setting 11 | ``` 12 | -------------------------------------------------------------------------------- /baselines/minigpt4/common/vqa_tools/aokvqa/heuristics/most_common_answer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import pathlib 5 | from collections import Counter 6 | 7 | from load_aokvqa import load_aokvqa 8 | 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--aokvqa-dir', type=pathlib.Path, required=True, dest='aokvqa_dir') 12 | parser.add_argument('--split', type=str, choices=['train', 'val', 'test'], required=True) 13 | parser.add_argument('--mc', action='store_true', dest='multiple_choice') 14 | parser.add_argument('--out', type=argparse.FileType('w'), required=True, dest='output_file') 15 | args = parser.parse_args() 16 | 17 | 18 | train_set = load_aokvqa(args.aokvqa_dir, 'train') 19 | train_freq = dict(Counter( 20 | [d['choices'][d['correct_choice_idx']] for d in train_set] 21 | )) 22 | most_common_answer = max(train_freq.keys(), key=train_freq.get) 23 | 24 | ## 25 | 26 | eval_set = load_aokvqa(args.aokvqa_dir, args.split) 27 | 28 | predictions = {} 29 | 30 | for d in eval_set: 31 | q = d['question_id'] 32 | predictions[q] = most_common_answer 33 | 34 | if args.multiple_choice: 35 | choices = [c for c in d['choices'] if c in train_freq.keys()] 36 | if len(choices) > 0: 37 | predictions[q] = max(choices, key=train_freq.get) 38 | 39 | json.dump(predictions, args.output_file) 40 | -------------------------------------------------------------------------------- /baselines/minigpt4/common/vqa_tools/aokvqa/heuristics/random_unweighted.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from random import seed, sample 4 | import argparse 5 | import pathlib 6 | 7 | from load_aokvqa import load_aokvqa 8 | 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--aokvqa-dir', type=pathlib.Path, required=True, dest='aokvqa_dir') 12 | parser.add_argument('--split', type=str, choices=['train', 'val', 'test'], required=True) 13 | parser.add_argument('--mc', action='store_true', dest='multiple_choice') 14 | parser.add_argument('--out', type=argparse.FileType('w'), required=True, dest='output_file') 15 | args = parser.parse_args() 16 | 17 | seed(0) 18 | 19 | train_set = load_aokvqa(args.aokvqa_dir, 'train') 20 | 21 | if args.multiple_choice is False: 22 | choices = list(set( 23 | [d['choices'][d['correct_choice_idx']] for d in train_set] 24 | )) 25 | 26 | ## 27 | 28 | predictions = {} 29 | 30 | eval_set = load_aokvqa(args.aokvqa_dir, args.split) 31 | 32 | for d in eval_set: 33 | q = d['question_id'] 34 | if args.multiple_choice: 35 | choices = d['choices'] 36 | predictions[q] = sample(choices, 1)[0] 37 | 38 | json.dump(predictions, args.output_file) 39 | -------------------------------------------------------------------------------- /baselines/minigpt4/common/vqa_tools/aokvqa/heuristics/random_weighted.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import numpy as np 4 | import argparse 5 | import pathlib 6 | from collections import 
Counter 7 | 8 | from load_aokvqa import load_aokvqa 9 | 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('--aokvqa-dir', type=pathlib.Path, required=True, dest='aokvqa_dir') 13 | parser.add_argument('--split', type=str, choices=['train', 'val', 'test'], required=True) 14 | parser.add_argument('--mc', action='store_true', dest='multiple_choice') 15 | parser.add_argument('--out', type=argparse.FileType('w'), required=True, dest='output_file') 16 | args = parser.parse_args() 17 | 18 | np.random.seed(0) 19 | 20 | train_set = load_aokvqa(args.aokvqa_dir, 'train') 21 | train_freq = dict(Counter( 22 | [d['choices'][d['correct_choice_idx']] for d in train_set] 23 | )) 24 | 25 | if args.multiple_choice is False: 26 | choices = list(train_freq.keys()) 27 | probs = [f / len(train_set) for f in train_freq.values()] 28 | 29 | ## 30 | 31 | predictions = {} 32 | 33 | eval_set = load_aokvqa(args.aokvqa_dir, args.split) 34 | 35 | for d in eval_set: 36 | if args.multiple_choice: 37 | choices = d['choices'] 38 | probs = [train_freq.get(c, 0) for c in choices] 39 | if probs == [0, 0, 0, 0]: 40 | probs = [1, 1, 1, 1] 41 | probs = [p / sum(probs) for p in probs] 42 | 43 | q = d['question_id'] 44 | predictions[q] = np.random.choice(choices, size=1, p=probs)[0] 45 | 46 | json.dump(predictions, args.output_file) 47 | -------------------------------------------------------------------------------- /baselines/minigpt4/common/vqa_tools/aokvqa/load_aokvqa.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | 4 | 5 | def load_aokvqa(aokvqa_dir, split, version='v1p0'): 6 | assert split in ['train', 'val', 'test', 'test_w_ans'] 7 | dataset = json.load(open( 8 | os.path.join(aokvqa_dir, f"aokvqa_{version}_{split}.json") 9 | )) 10 | return dataset 11 | 12 | def get_coco_path(split, image_id, coco_dir): 13 | return os.path.join(coco_dir, f"{split}2017", f"{image_id:012}.jpg") 14 | -------------------------------------------------------------------------------- /baselines/minigpt4/common/vqa_tools/aokvqa/transfer_experiments/README.md: -------------------------------------------------------------------------------- 1 | ## Transfer Learning Experiments 2 | 3 | We use the following training/prediction scripts for the classifier, zero-shot, and contrastive experiments in Table 3. 4 | 5 | ```bash 6 | ## Training 7 | python transfer_experiments/train.py --aokvqa-dir ${AOKVQA_DIR} --vocab ${AOKVQA_DIR}/large_vocab_train.csv --log-dir ${LOG_DIR} 8 | 9 | --backbone clip --clip-model-type ViT-B/32 --train-features ${FEATURES_DIR}/clip-ViT-B-32_train.pt --val-features ${FEATURES_DIR}/clip-ViT-B-32_val.pt 10 | --inputs question # OR --inputs image # OR --inputs question image 11 | # OR 12 | --backbone resnet --train-features ${FEATURES_DIR}/resnet_train.pt --val-features ${FEATURES_DIR}/resnet_val.pt --inputs image 13 | # OR 14 | --backbone bert --train-features ${FEATURES_DIR}/bert_train.pt --val-features ${FEATURES_DIR}/bert_val.pt --inputs question 15 | 16 | --objective classifier 17 | # OR 18 | --objective contrastive --vocab-features ${FEATURE_DIR}/clip-ViT-B-32_large_vocab.pt 19 | ``` 20 | 21 | You can make predictions for CLIP zero-shot or from a classifier/contrastive checkpoint trained above. 
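For intuition, the zero-shot route is essentially nearest-neighbour search in CLIP embedding space. The snippet below is a minimal sketch, not a file from this repo: it assumes the feature dictionaries written by `data_scripts/extract_clip_features.py` and the vocab tensor written by `data_scripts/encode_vocab_clip.py`, and it only approximates the direct-answer scoring; per-question choice restriction and remapping are handled by `transfer_experiments/predict.py` and `remap_predictions.py`.

```python
import json
import torch

def clip_zero_shot_predict(features_pt, vocab_pt, vocab_txt, out_json):
    # {question_id: {'question': Tensor(dim), 'image': Tensor(dim)}}, as saved by extract_clip_features.py
    feats = torch.load(features_pt)
    # (vocab_size, dim) CLIP text embeddings, as saved by encode_vocab_clip.py
    vocab_feats = torch.load(vocab_pt).float()
    vocab = open(vocab_txt).read().splitlines()
    vocab_feats = vocab_feats / vocab_feats.norm(dim=-1, keepdim=True)

    predictions = {}
    for qid, f in feats.items():
        query = (f['question'] + f['image']).float()  # fuse question and image embeddings
        query = query / query.norm()
        scores = vocab_feats @ query                  # cosine similarity against the answer vocabulary
        predictions[qid] = vocab[scores.argmax().item()]

    with open(out_json, 'w') as fp:
        json.dump(predictions, fp)
```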
22 | 23 | ```bash 24 | ## Predicting 25 | python transfer_experiments/predict.py --aokvqa-dir ${AOKVQA_DIR} --out ${PREDS_DIR}/clip-classifier_val-mc.json 26 | 27 | --split val # or test 28 | --features ${FEATURE_DIR}/clip-ViT-B-32_val.pt # adjust for backbone and eval split 29 | 30 | --ckpt path/to/model.ckpt 31 | # OR 32 | --zero-shot --clip-model-type ViT-B/32 33 | --inputs question # OR --inputs image # OR --inputs question image 34 | 35 | --mc # Multiple-choice. Exclude for direct-answer. 36 | 37 | # IF classifier OR direct-answer 38 | --vocab ${AOKVQA_DIR}/large_vocab_train.csv 39 | # IF contrastive/zero-shot AND direct-answer 40 | --vocab-features ${FEATURES_DIR}/clip-ViT-B-32_large_vocab.pt 41 | ``` 42 | -------------------------------------------------------------------------------- /baselines/minigpt4/configs/datasets/cc_sbu/align.yaml: -------------------------------------------------------------------------------- 1 | datasets: 2 | cc_sbu_align: 3 | data_type: images 4 | build_info: 5 | # storage: "/ibex/project/c2090/datasets/cc_sbu_align" 6 | storage: "path/to/cc_sbu_align/dataset" 7 | -------------------------------------------------------------------------------- /baselines/minigpt4/configs/datasets/cc_sbu/defaults.yaml: -------------------------------------------------------------------------------- 1 | datasets: 2 | cc_sbu: 3 | data_type: images 4 | build_info: 5 | storage: /ibex/project/c2133/blip_dataset/cc3m_256/cc3m_cc12m_sbu/{00000..01255}.tar 6 | -------------------------------------------------------------------------------- /baselines/minigpt4/configs/datasets/cmd_video/default.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | cmd_video: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | vis_root: path/to/videos/ 14 | ann_paths: [path/to/annotations.json] 15 | subtitles_path: path/to/subtitles_folder # folder that contains subtitles of .vtt format 16 | model_name: 'llama2' # Language Model Name (available: llama2, mistral) 17 | -------------------------------------------------------------------------------- /baselines/minigpt4/configs/datasets/laion/defaults.yaml: -------------------------------------------------------------------------------- 1 | datasets: 2 | laion: 3 | data_type: images 4 | build_info: 5 | storage: /ibex/project/c2133/blip_dataset/laion_1b/laion_gpu/{00000..10488}.tar 6 | -------------------------------------------------------------------------------- /baselines/minigpt4/configs/datasets/template/default.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | dataset_name: # same as the name of the train_config yaml file 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # let it be images for now even if it is videos 10 | 11 | build_info: # this is the information needed to build the dataset 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | ann_paths: [path/to/annotations_json] # list of paths to annotation files 14 | vis_root: path/to/videos_folder 15 | subtitles_path: path/to/subtitles_folder 16 | model_name: 'llama2' # Language Model Name (available: llama2, mistral) -------------------------------------------------------------------------------- /baselines/minigpt4/configs/datasets/video_chatgpt/default.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | video_chatgpt: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | ann_paths: [path/to/annotations_json] # list of paths to annotation files 14 | vis_root: path/to/videos_folder 15 | subtitles_path: path/to/subtitles_folder # folder that contains subtitles of .vtt format 16 | model_name: 'llama2' # Language Model Name (available: llama2, mistral) -------------------------------------------------------------------------------- /baselines/minigpt4/configs/datasets/webvid/default.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | webvid: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | ann_paths: [path/to/annotations.json] 14 | vis_root: path/to/videos/ 15 | subtitles_path: path/to/subtitles_folder/ # folder that contains subtitles of .vtt format 16 | model_name: 'llama2' # Language Model Name (available: llama2, mistral) 17 | -------------------------------------------------------------------------------- /baselines/minigpt4/configs/default.yaml: -------------------------------------------------------------------------------- 1 | env: 2 | # For default users 3 | # cache_root: "cache" 4 | # For internal use with persistent storage 5 | cache_root: "/export/home/.cache/minigpt4" 6 | -------------------------------------------------------------------------------- /baselines/minigpt4/configs/models/minigpt4.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: mini_gpt4_1 3 | 4 | # vit encoder 5 | image_size: 224 6 | drop_path_rate: 0 7 | use_grad_checkpoint: False 8 | vit_precision: "fp16" 9 | freeze_vit: True 10 | freeze_qformer: True 11 | model_type: "vit_h" 12 | device: "cuda" 13 | 14 | # Q-Former 15 | num_query_token: 32 16 | 17 | # Vicuna 18 | llama_model: "lmsys/vicuna-13b-v1.1" 19 | 20 | # generation configs 21 | prompt: "" 22 | 23 | preprocess: 24 | vis_processor: 25 | train: 26 | name: "blip2_image_train" 27 | image_size: 224 28 | eval: 29 | name: "blip2_image_eval" 30 | image_size: 224 31 | text_processor: 32 | train: 33 | name: "blip_caption" 34 | eval: 35 | name: "blip_caption" 36 | -------------------------------------------------------------------------------- /baselines/minigpt4/configs/models/minigpt4v.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: mini_gpt4v 3 | 4 | # vit encoder 5 | image_size: 224 6 | drop_path_rate: 0 7 | use_grad_checkpoint: False 8 | vit_precision: "fp16" 9 | freeze_vit: True 10 | freeze_qformer: True 11 | model_type: "vit_h" 12 | device: "cuda" 13 | 14 | # Q-Former 15 | num_query_token: 32 16 | 17 | # Vicuna 18 | llama_model: "lmsys/vicuna-13b-v1.1" 19 | 20 | # generation configs 21 | prompt: "" 22 | 23 | preprocess: 24 | vis_processor: 25 | train: 26 | name: "blip2_image_train" 27 | image_size: 224 28 | eval: 29 | name: "blip2_image_eval" 30 | image_size: 224 31 | text_processor: 32 | train: 33 | name: "blip_caption" 34 | eval: 35 | name: "blip_caption" 36 | -------------------------------------------------------------------------------- /baselines/minigpt4/conversation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrick-tssn/VideoHallucer/2c6152e61207fd00db1d6d8b72d4893533828867/baselines/minigpt4/conversation/__init__.py -------------------------------------------------------------------------------- /baselines/minigpt4/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrick-tssn/VideoHallucer/2c6152e61207fd00db1d6d8b72d4893533828867/baselines/minigpt4/datasets/__init__.py -------------------------------------------------------------------------------- 
/baselines/minigpt4/datasets/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrick-tssn/VideoHallucer/2c6152e61207fd00db1d6d8b72d4893533828867/baselines/minigpt4/datasets/datasets/__init__.py -------------------------------------------------------------------------------- /baselines/minigpt4/datasets/datasets/cc_sbu_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | from PIL import Image 3 | import webdataset as wds 4 | from minigpt4.datasets.datasets.base_dataset import BaseDataset 5 | from minigpt4.datasets.datasets.caption_datasets import CaptionDataset 6 | 7 | 8 | class CCSBUDataset(BaseDataset): 9 | def __init__(self, vis_processor, text_processor, location): 10 | super().__init__(vis_processor=vis_processor, text_processor=text_processor) 11 | 12 | self.inner_dataset = wds.DataPipeline( 13 | wds.ResampledShards(location), 14 | wds.tarfile_to_samples(handler=wds.warn_and_continue), 15 | wds.shuffle(1000, handler=wds.warn_and_continue), 16 | wds.decode("pilrgb", handler=wds.warn_and_continue), 17 | wds.to_tuple("jpg", "json", handler=wds.warn_and_continue), 18 | wds.map_tuple(self.vis_processor, handler=wds.warn_and_continue), 19 | wds.map(self.to_dict, handler=wds.warn_and_continue), 20 | ) 21 | 22 | def to_dict(self, sample): 23 | return { 24 | "image": sample[0], 25 | "answer": self.text_processor(sample[1]["caption"]), 26 | } 27 | 28 | 29 | class CCSBUAlignDataset(CaptionDataset): 30 | 31 | def __getitem__(self, index): 32 | 33 | # TODO this assumes image input, not general enough 34 | ann = self.annotation[index] 35 | 36 | img_file = '{}.jpg'.format(ann["image_id"]) 37 | image_path = os.path.join(self.vis_root, img_file) 38 | image = Image.open(image_path).convert("RGB") 39 | 40 | image = self.vis_processor(image) 41 | caption = ann["caption"] 42 | 43 | return { 44 | "image": image, 45 | "answer": caption, 46 | "image_id": self.img_ids[ann["image_id"]], 47 | } 48 | -------------------------------------------------------------------------------- /baselines/minigpt4/datasets/datasets/cot.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import pickle 4 | import random 5 | import time 6 | import itertools 7 | 8 | import numpy as np 9 | from PIL import Image 10 | import skimage.io as io 11 | import matplotlib.pyplot as plt 12 | from matplotlib.collections import PatchCollection 13 | from matplotlib.patches import Polygon, Rectangle 14 | from torch.utils.data import Dataset 15 | import webdataset as wds 16 | 17 | from minigpt4.datasets.datasets.base_dataset import BaseDataset 18 | from minigpt4.datasets.datasets.caption_datasets import CaptionDataset 19 | 20 | 21 | class CoTDataset(Dataset): 22 | def __init__(self, text_processor, ann_path): 23 | """ 24 | vis_root (string): Root directory of images (e.g. 
coco/images/) 25 | ann_root (string): directory to store the annotation file 26 | """ 27 | 28 | self.text_processor = text_processor 29 | 30 | with open(ann_path, 'r') as f: 31 | self.ann = json.load(f) 32 | 33 | def __len__(self): 34 | return len(self.ann) 35 | 36 | def __getitem__(self, index): 37 | info = self.ann[index] 38 | input = info["inputs"] 39 | target = info["targets"] 40 | return { 41 | "instruction_input": input, 42 | "answer": target, 43 | } 44 | -------------------------------------------------------------------------------- /baselines/minigpt4/datasets/datasets/grounded_detailed_image_caption_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import pickle 4 | import random 5 | import time 6 | import itertools 7 | 8 | import numpy as np 9 | from PIL import Image 10 | import skimage.io as io 11 | import matplotlib.pyplot as plt 12 | from matplotlib.collections import PatchCollection 13 | from matplotlib.patches import Polygon, Rectangle 14 | from torch.utils.data import Dataset 15 | import webdataset as wds 16 | 17 | from minigpt4.datasets.datasets.base_dataset import BaseDataset 18 | from minigpt4.datasets.datasets.caption_datasets import CaptionDataset 19 | 20 | 21 | class GroundedDetailDataset(Dataset): 22 | def __init__(self, vis_processor, text_processor, vis_root, ann_path): 23 | """ 24 | vis_root (string): Root directory of images (e.g. coco/images/) 25 | ann_root (string): directory to store the annotation file 26 | """ 27 | self.vis_root = vis_root 28 | 29 | self.vis_processor = vis_processor 30 | self.text_processor = text_processor 31 | 32 | self.instruction_pool = [ 33 | '[grounding] please describe this image in details', 34 | '[grounding] describe this image as detailed as possible', 35 | '[grounding] summarize this image in details', 36 | '[grounding] give a thorough description of what you see in this image', 37 | ] 38 | 39 | with open(ann_path, 'r') as f: 40 | self.ann = json.load(f) 41 | 42 | def __len__(self): 43 | return len(self.ann) 44 | 45 | def __getitem__(self, index): 46 | info = self.ann[index] 47 | 48 | image_file = 'COCO_train2014_{}.jpg'.format(info['image_id']) 49 | image_path = os.path.join(self.vis_root, image_file) 50 | image = Image.open(image_path).convert("RGB") 51 | image = self.vis_processor(image) 52 | 53 | answer = info['grounded_caption'] 54 | 55 | instruction = random.choice(self.instruction_pool) 56 | 57 | instruction = " {} ".format(instruction) 58 | 59 | return { 60 | "image": image, 61 | "instruction_input": instruction, 62 | "answer": answer, 63 | "image_id": info['image_id'], 64 | } 65 | -------------------------------------------------------------------------------- /baselines/minigpt4/datasets/datasets/reasoning_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import pickle 4 | import random 5 | import time 6 | import itertools 7 | 8 | import numpy as np 9 | from PIL import Image 10 | import skimage.io as io 11 | import matplotlib.pyplot as plt 12 | from matplotlib.collections import PatchCollection 13 | from matplotlib.patches import Polygon, Rectangle 14 | from torch.utils.data import Dataset 15 | import webdataset as wds 16 | 17 | from minigpt4.datasets.datasets.base_dataset import BaseDataset 18 | from minigpt4.datasets.datasets.caption_datasets import CaptionDataset 19 | 20 | 21 | 22 | class ReasoningDataset(Dataset): 23 | def __init__(self, vis_processor, 
text_processor, vis_root, ann_path): 24 | """ 25 | vis_root (string): Root directory of images (e.g. coco/images/) 26 | ann_root (string): directory to store the annotation file 27 | """ 28 | self.vis_root = vis_root 29 | 30 | self.vis_processor = vis_processor 31 | self.text_processor = text_processor 32 | self.data = json.load(open(ann_path)) 33 | 34 | # self.data = self.create_data(ann_path) 35 | 36 | # def create_data(self, ann_path): 37 | # # processed_data = [] 38 | # with open(ann_path, 'r') as f: 39 | # data = json.load(f) 40 | 41 | # return processed_data 42 | 43 | def __len__(self): 44 | return len(self.data) 45 | 46 | def __getitem__(self, index): 47 | sample = self.data[index] 48 | image_id = sample["image_id"]+".jpg" 49 | question = sample["question"] 50 | answer = sample["answer"] 51 | 52 | 53 | image = Image.open(os.path.join(self.vis_root, image_id)).convert("RGB") 54 | image = self.vis_processor(image) 55 | 56 | instruction = ' {} '.format(question) 57 | 58 | return { 59 | "image": image, 60 | "instruction_input": instruction, 61 | "answer": answer 62 | } 63 | 64 | 65 | -------------------------------------------------------------------------------- /baselines/minigpt4/datasets/datasets/unnatural_instruction.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import pickle 4 | import random 5 | import time 6 | import itertools 7 | 8 | import numpy as np 9 | from PIL import Image 10 | import skimage.io as io 11 | import matplotlib.pyplot as plt 12 | from matplotlib.collections import PatchCollection 13 | from matplotlib.patches import Polygon, Rectangle 14 | from torch.utils.data import Dataset 15 | import webdataset as wds 16 | 17 | from minigpt4.datasets.datasets.base_dataset import BaseDataset 18 | from minigpt4.datasets.datasets.caption_datasets import CaptionDataset 19 | 20 | 21 | class UnnaturalDataset(Dataset): 22 | def __init__(self, text_processor, ann_path): 23 | """ 24 | vis_root (string): Root directory of images (e.g. 
coco/images/) 25 | ann_root (string): directory to store the annotation file 26 | """ 27 | self.text_processor = text_processor 28 | 29 | with open(ann_path, 'r') as f: 30 | self.ann = json.load(f) 31 | 32 | # with open(ann_path, 'r') as f: 33 | # for data in f.readlines(): 34 | # data = json.loads(data) 35 | # self.ann.append(data) 36 | 37 | def __len__(self): 38 | return len(self.ann) 39 | 40 | def __getitem__(self, index): 41 | info = self.ann[index]["instances"][0] 42 | instruction = info["instruction_with_input"] 43 | constraints = info["constraints"] 44 | answer = info["output"] 45 | if constraints != None: 46 | instruction = instruction+" "+constraints 47 | 48 | return { 49 | # "image":None, 50 | "instruction_input": instruction, 51 | "answer": answer, 52 | } 53 | -------------------------------------------------------------------------------- /baselines/minigpt4/mistral_test_config.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: mini_gpt4_llama_v2 3 | model_type: pretrain_vicuna 4 | freeze_vit: True 5 | freeze_qformer: True 6 | max_txt_len: 512 7 | low_resource: True 8 | image_size: 224 9 | end_sym: "" 10 | llama_model: "checkpoints/MiniGPT4-Video/Mistral-7B-Instruct-v0.2" 11 | ckpt: "checkpoints/video_mistral_all_checkpoint_last.pth" 12 | use_grad_checkpoint: True 13 | chat_template: True 14 | lora_r: 64 15 | lora_alpha: 16 16 | length: 50 17 | use_grad_checkpoint_llm: True 18 | max_context_len: 7200 19 | 20 | 21 | datasets: 22 | video_chatgpt: #99378 row - 13224 video 23 | batch_size: 1 24 | vis_processor: 25 | train: 26 | name: "blip2_image_train" 27 | image_size: 224 28 | text_processor: 29 | train: 30 | name: "blip_caption" 31 | sample_ratio: 200 32 | 33 | 34 | run: 35 | task: image_text_pretrain 36 | seed: 42 37 | amp: True -------------------------------------------------------------------------------- /baselines/minigpt4/models/mistral.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoModelForCausalLM, AutoTokenizer 2 | 3 | device = "cuda" # the device to load the model onto 4 | 5 | model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2") 6 | tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2") 7 | 8 | messages = [ 9 | {"role": "user", "content": "What is your favourite condiment?"}, 10 | {"role": "assistant", "content": "Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!"}, 11 | {"role": "user", "content": "Do you have mayonnaise recipes?"} 12 | ] 13 | p="Well, I'm quite partial to a good squeeze of fresh lemon juice." 14 | encoded_input = tokenizer(p, return_tensors='pt') 15 | embeds = model.model.embed_tokens(encoded_input.input_ids) 16 | print(embeds.shape) 17 | 18 | 19 | encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt") 20 | model_inputs = encodeds.to(device) 21 | model.to(device) 22 | 23 | generated_ids = model.generate(model_inputs, max_new_tokens=1000, do_sample=True) 24 | decoded = tokenizer.batch_decode(generated_ids) 25 | print(decoded[0]) 26 | -------------------------------------------------------------------------------- /baselines/minigpt4/models/policies/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 
2 | # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. 3 | 4 | from .mixed_precision import * 5 | from .wrapping import * 6 | from .activation_checkpointing_functions import apply_fsdp_checkpointing 7 | from .anyprecision_optimizer import AnyPrecisionAdamW 8 | from .fsdp_utils import fsdp_auto_wrap_policy -------------------------------------------------------------------------------- /baselines/minigpt4/models/policies/activation_checkpointing_functions.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. 3 | 4 | import torch 5 | import os 6 | import torch.distributed as dist 7 | from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import ( 8 | checkpoint_wrapper, 9 | CheckpointImpl, 10 | apply_activation_checkpointing, 11 | ) 12 | 13 | from transformers.models.t5.modeling_t5 import T5Block 14 | from transformers.models.llama.modeling_llama import LlamaDecoderLayer 15 | from functools import partial 16 | 17 | non_reentrant_wrapper = partial( 18 | checkpoint_wrapper, 19 | checkpoint_impl=CheckpointImpl.NO_REENTRANT, 20 | ) 21 | 22 | check_fn = lambda submodule: isinstance(submodule, LlamaDecoderLayer) 23 | 24 | 25 | def apply_fsdp_checkpointing(model): 26 | """apply activation checkpointing to model 27 | returns None as model is updated directly 28 | """ 29 | print(f"--> applying fsdp activation checkpointing...") 30 | 31 | apply_activation_checkpointing( 32 | model, checkpoint_wrapper_fn=non_reentrant_wrapper, check_fn=check_fn 33 | ) 34 | -------------------------------------------------------------------------------- /baselines/minigpt4/models/policies/fsdp_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
3 | 4 | def fsdp_auto_wrap_policy(model, transformer_layer_name): 5 | import functools 6 | import os 7 | 8 | from accelerate import FullyShardedDataParallelPlugin 9 | from transformers.models.t5.modeling_t5 import T5Block 10 | from torch.distributed.fsdp.wrap import _or_policy, lambda_auto_wrap_policy, transformer_auto_wrap_policy 11 | 12 | from peft.tuners import PrefixEncoder, PromptEmbedding, PromptEncoder 13 | 14 | def lambda_policy_fn(module): 15 | if ( 16 | len(list(module.named_children())) == 0 17 | and getattr(module, "weight", None) is not None 18 | and module.weight.requires_grad 19 | ): 20 | return True 21 | return False 22 | 23 | lambda_policy = functools.partial(lambda_auto_wrap_policy, lambda_fn=lambda_policy_fn) 24 | transformer_wrap_policy = functools.partial( 25 | transformer_auto_wrap_policy, 26 | transformer_layer_cls=( 27 | PrefixEncoder, 28 | PromptEncoder, 29 | PromptEmbedding, 30 | transformer_layer_name, 31 | # FullyShardedDataParallelPlugin.get_module_class_from_name( 32 | # model, transformer_layer_name 33 | # ), 34 | ), 35 | ) 36 | 37 | auto_wrap_policy = functools.partial(_or_policy, policies=[lambda_policy, transformer_wrap_policy]) 38 | return auto_wrap_policy -------------------------------------------------------------------------------- /baselines/minigpt4/models/policies/mixed_precision.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. 3 | 4 | import torch 5 | 6 | from torch.distributed.fsdp import ( 7 | # FullyShardedDataParallel as FSDP, 8 | # CPUOffload, 9 | MixedPrecision, 10 | # BackwardPrefetch, 11 | # ShardingStrategy, 12 | ) 13 | 14 | # requires grad scaler in main loop 15 | fpSixteen = MixedPrecision( 16 | param_dtype=torch.float16, 17 | # Gradient communication precision. 18 | reduce_dtype=torch.float16, 19 | # Buffer precision. 20 | buffer_dtype=torch.float16, 21 | ) 22 | 23 | bfSixteen = MixedPrecision( 24 | param_dtype=torch.bfloat16, 25 | # Gradient communication precision. 26 | reduce_dtype=torch.bfloat16, 27 | # Buffer precision. 28 | buffer_dtype=torch.bfloat16, 29 | cast_forward_inputs=True, 30 | ) 31 | 32 | bfSixteen_mixed = MixedPrecision( 33 | param_dtype=torch.float32, 34 | reduce_dtype=torch.bfloat16, 35 | buffer_dtype=torch.bfloat16, 36 | ) 37 | 38 | fp32_policy = MixedPrecision( 39 | param_dtype=torch.float32, 40 | reduce_dtype=torch.float32, 41 | buffer_dtype=torch.float32, 42 | ) 43 | -------------------------------------------------------------------------------- /baselines/minigpt4/models/policies/wrapping.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. 
3 | 4 | import torch.distributed as dist 5 | import torch.nn as nn 6 | import torch 7 | 8 | from transformers.models.llama.modeling_llama import LlamaDecoderLayer 9 | 10 | from torch.distributed.fsdp.fully_sharded_data_parallel import ( 11 | FullyShardedDataParallel as FSDP, 12 | CPUOffload, 13 | BackwardPrefetch, 14 | MixedPrecision, 15 | ) 16 | from torch.distributed.fsdp.wrap import ( 17 | transformer_auto_wrap_policy, 18 | size_based_auto_wrap_policy, 19 | enable_wrap, 20 | wrap, 21 | ) 22 | 23 | import functools 24 | from typing import Type 25 | 26 | 27 | def get_size_policy(min_params=1e8): 28 | num_wrap_policy = functools.partial( 29 | size_based_auto_wrap_policy, min_num_params=min_params 30 | ) 31 | return num_wrap_policy 32 | 33 | 34 | def get_llama_wrapper(): 35 | """we register our main layer class and use the fsdp transformer wrapping policy 36 | ensures embedding layers are in the root fsdp unit for shared access and that fsdp units map to transformer layers 37 | """ 38 | # ==== use new transformer wrapper 39 | 40 | llama_auto_wrap_policy = functools.partial( 41 | transformer_auto_wrap_policy, 42 | transformer_layer_cls={ 43 | LlamaDecoderLayer, 44 | }, 45 | ) 46 | 47 | return llama_auto_wrap_policy 48 | -------------------------------------------------------------------------------- /baselines/minigpt4/processors/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from minigpt4.processors.base_processor import BaseProcessor 9 | from minigpt4.processors.blip_processors import ( 10 | Blip2ImageTrainProcessor, 11 | Blip2ImageEvalProcessor, 12 | BlipCaptionProcessor, 13 | ) 14 | 15 | from minigpt4.common.registry import registry 16 | 17 | __all__ = [ 18 | "BaseProcessor", 19 | "Blip2ImageTrainProcessor", 20 | "Blip2ImageEvalProcessor", 21 | "BlipCaptionProcessor", 22 | ] 23 | 24 | 25 | def load_processor(name, cfg=None): 26 | """ 27 | Example 28 | 29 | >>> processor = load_processor("alpro_video_train", cfg=None) 30 | """ 31 | processor = registry.get_processor_class(name).from_config(cfg) 32 | 33 | return processor 34 | -------------------------------------------------------------------------------- /baselines/minigpt4/processors/base_processor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from omegaconf import OmegaConf 9 | 10 | 11 | class BaseProcessor: 12 | def __init__(self): 13 | self.transform = lambda x: x 14 | return 15 | 16 | def __call__(self, item): 17 | return self.transform(item) 18 | 19 | @classmethod 20 | def from_config(cls, cfg=None): 21 | return cls() 22 | 23 | def build(self, **kwargs): 24 | cfg = OmegaConf.create(kwargs) 25 | 26 | return self.from_config(cfg) 27 | -------------------------------------------------------------------------------- /baselines/minigpt4/runners/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from minigpt4.runners.runner_base import RunnerBase 9 | 10 | __all__ = ["RunnerBase"] 11 | -------------------------------------------------------------------------------- /baselines/minigpt4/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from minigpt4.common.registry import registry 9 | from minigpt4.tasks.base_task import BaseTask 10 | from minigpt4.tasks.image_text_pretrain import ImageTextPretrainTask 11 | 12 | from minigpt4.tasks.vqa import VQATask, GQATask 13 | from minigpt4.tasks.vqa_reading_comprehension import VQARCTask, GQARCTask 14 | 15 | 16 | def setup_task(cfg): 17 | assert "task" in cfg.run_cfg, "Task name must be provided." 18 | 19 | task_name = cfg.run_cfg.task 20 | task = registry.get_task_class(task_name).setup_task(cfg=cfg) 21 | assert task is not None, "Task {} not properly registered.".format(task_name) 22 | 23 | return task 24 | 25 | 26 | __all__ = [ 27 | "BaseTask", 28 | "ImageTextPretrainTask", 29 | "VQATask", 30 | "GQATask", 31 | "VQARCTask", 32 | "GQARCTask", 33 | ] 34 | -------------------------------------------------------------------------------- /baselines/minigpt4/tasks/image_text_pretrain.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from minigpt4.common.registry import registry 9 | from minigpt4.tasks.base_task import BaseTask 10 | 11 | 12 | @registry.register_task("image_text_pretrain") 13 | class ImageTextPretrainTask(BaseTask): 14 | def __init__(self): 15 | super().__init__() 16 | 17 | # def evaluation(self, model, data_loader, cuda_enabled=True): 18 | # pass 19 | -------------------------------------------------------------------------------- /baselines/pllava/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrick-tssn/VideoHallucer/2c6152e61207fd00db1d6d8b72d4893533828867/baselines/pllava/models/__init__.py -------------------------------------------------------------------------------- /baselines/pllava/models/pllava/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | from typing import TYPE_CHECKING 15 | 16 | from transformers.utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available 17 | 18 | 19 | _import_structure = {"configuration_pllava": ["PLLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP", "PllavaConfig"]} 20 | 21 | try: 22 | if not is_torch_available(): 23 | raise OptionalDependencyNotAvailable() 24 | except OptionalDependencyNotAvailable: 25 | pass 26 | else: 27 | _import_structure["modeling_pllava"] = [ 28 | "PLLAVA_PRETRAINED_MODEL_ARCHIVE_LIST", 29 | "PllavaForConditionalGeneration", 30 | "PllavaPreTrainedModel", 31 | ] 32 | _import_structure["processing_pllava"] = ["PllavaProcessor"] 33 | 34 | 35 | if TYPE_CHECKING: 36 | from .configuration_pllava import PLLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP, PllavaConfig 37 | 38 | try: 39 | if not is_torch_available(): 40 | raise OptionalDependencyNotAvailable() 41 | except OptionalDependencyNotAvailable: 42 | pass 43 | else: 44 | from .modeling_pllava import ( 45 | PLLAVA_PRETRAINED_MODEL_ARCHIVE_LIST, 46 | PllavaForConditionalGeneration, 47 | PllavaPreTrainedModel, 48 | ) 49 | from .processing_pllava import PllavaProcessor 50 | 51 | 52 | else: 53 | import sys 54 | 55 | sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure) 56 | -------------------------------------------------------------------------------- /baselines/pllava/models/pllava/convert_pllava_weights_to_hf.py: -------------------------------------------------------------------------------- 1 | # Not yet -------------------------------------------------------------------------------- /baselines/pllava/tasks/eval/demo/__init__.py: -------------------------------------------------------------------------------- 1 | import gradio as gr 2 | from gradio.themes.utils import colors, fonts, sizes 3 | 4 | 5 | pllava_theme = gr.themes.Monochrome( 6 | text_size="sm", 7 | spacing_size="sm", 8 | primary_hue=gr.themes.Color(c100="#f5f5f5", c200="#e5e5e5", c300="#d4d4d4", c400="#a3a3a3", c50="#fafafa", c500="#737373", c600="#525252", c700="#404040", c800="#262626", c900="#171717", c950="#000000"), 9 | secondary_hue=gr.themes.Color(c100="#f5f5f5", c200="#e5e5e5", c300="#d4d4d4", c400="#a3a3a3", c50="#fafafa", c500="#737373", c600="#525252", c700="#404040", c800="#262626", c900="#171717", c950="#000000"), 10 | neutral_hue=gr.themes.Color(c100="#f5f5f5", c200="#e5e5e5", c300="#d4d4d4", c400="#a3a3a3", c50="#fafafa", c500="#737373", c600="#525252", c700="#404040", c800="#262626", c900="#171717", c950="#000000"), 11 | ).set( 12 | background_fill_primary_dark='*primary_950', 13 | background_fill_secondary_dark='*neutral_950' 14 | ) 15 | 16 | -------------------------------------------------------------------------------- /baselines/pllava/tasks/eval/recaption/show_recaption.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | import gradio as gr 4 | 5 | from tasks.eval.recaption import load_results 6 | import json 7 | 8 | # example = videogallery().example_inputs() 9 | 10 | 11 | def parse_args(): 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument( 14 | '--save_path', 15 | required=True, 16 | ) 17 | args = parser.parse_args() 18 | return args 19 | 20 | 21 | args = parse_args() 22 | result_list = load_results(args.save_path) 23 | 24 | 25 | def show(result_index, ): 26 | info = result_list[result_index] 27 | video_path = info['video_path'] 28 | info_str = json.dumps(info, indent=4) 29 | return video_path, info_str 30 | 31 | 32 | 33 | from 
tasks.eval.recaption import load_results 34 | 35 | with gr.Blocks() as demo: 36 | gr.Markdown("# Showing of what has came out.") 37 | gr.Markdown(f"From Saved Results {args.save_path}") 38 | with gr.Row(): 39 | with gr.Column(1): 40 | show_video = gr.Video(interactive=False) 41 | 42 | with gr.Column(): 43 | result_index = gr.Slider(0, len(result_list), step=1) 44 | info = gr.Text(interactive=False) 45 | 46 | result_index.change(show, [result_index], [show_video, info]) 47 | 48 | 49 | 50 | 51 | 52 | demo.launch(share=True) 53 | -------------------------------------------------------------------------------- /baselines/pllava/tasks/eval/vcgbench/show_vcg.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | import gradio as gr 4 | 5 | from tasks.eval.vcgbench import load_results 6 | import json 7 | 8 | # example = videogallery().example_inputs() 9 | 10 | 11 | def parse_args(): 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument( 14 | '--save_path', 15 | required=True, 16 | ) 17 | args = parser.parse_args() 18 | return args 19 | 20 | 21 | args = parse_args() 22 | result_list = load_results(args.save_path) 23 | 24 | 25 | def show(result_index, ): 26 | info = result_list[result_index] 27 | video_path = info['video_path'] 28 | info_str = json.dumps(info, indent=4) 29 | return video_path, info_str 30 | 31 | with gr.Blocks() as demo: 32 | gr.Markdown( 33 | f"# Showing The Results from {args.save_path}" 34 | ) 35 | with gr.Row(): 36 | with gr.Column(): 37 | show_video = gr.Video(interactive=False) 38 | 39 | with gr.Column(): 40 | result_index = gr.Slider(0, len(result_list), step=1) 41 | info = gr.Text(interactive=False) 42 | 43 | result_index.change(show, [result_index], [show_video, info]) 44 | 45 | demo.launch(share=True) 46 | -------------------------------------------------------------------------------- /baselines/pllava/tasks/shared_utils.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import logging 3 | import os 4 | import os.path as osp 5 | from os.path import join 6 | 7 | import torch 8 | from torch.utils.data import ConcatDataset, DataLoader 9 | 10 | from utils.optimizer import create_optimizer 11 | from utils.scheduler import create_scheduler 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | def get_media_types(datasources): 17 | """get the media types for for all the dataloaders. 18 | 19 | Args: 20 | datasources (List): List of dataloaders or datasets. 21 | 22 | Returns: List. The media_types. 23 | 24 | """ 25 | if isinstance(datasources[0], DataLoader): 26 | datasets = [dataloader.dataset for dataloader in datasources] 27 | else: 28 | datasets = datasources 29 | media_types = [ 30 | dataset.datasets[0].media_type 31 | if isinstance(dataset, ConcatDataset) 32 | else dataset.media_type 33 | for dataset in datasets 34 | ] 35 | 36 | return media_types 37 | -------------------------------------------------------------------------------- /baselines/pllava/utils/config_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import sys 4 | from os.path import dirname, join 5 | 6 | from utils.config import Config 7 | from utils.distributed import init_distributed_mode, is_main_process 8 | from utils.logger import setup_logger 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | def setup_config(): 14 | """Conbine yaml config and command line config with OmegaConf. 
15 | Also converts types, e.g., `'None'` (str) --> `None` (None) 16 | """ 17 | config = Config.get_config() 18 | if config.debug: 19 | config.wandb.enable = False 20 | return config 21 | 22 | 23 | def setup_evaluate_config(config): 24 | """setup evaluation default settings, e.g., disable wandb""" 25 | assert config.evaluate 26 | config.wandb.enable = False 27 | if config.output_dir is None: 28 | config.output_dir = join(dirname(config.pretrained_path), "eval") 29 | return config 30 | 31 | 32 | def setup_output_dir(output_dir, excludes=["code"]): 33 | """ensure we are not overwriting an existing/non-empty output dir""" 34 | if not os.path.exists(output_dir): 35 | os.makedirs(output_dir, exist_ok=False) 36 | else: 37 | existing_dirs_files = os.listdir(output_dir) # list 38 | remaining = set(existing_dirs_files) - set(excludes) 39 | remaining = [e for e in remaining if "slurm" not in e] 40 | remaining = [e for e in remaining if ".out" not in e] 41 | # assert len(remaining) == 0, f"remaining dirs or files: {remaining}" 42 | logger.warn(f"remaining dirs or files: {remaining}") 43 | 44 | 45 | def setup_main(): 46 | """ 47 | Setup config, logger, output_dir, etc. 48 | Shared for pretrain and all downstream tasks. 49 | """ 50 | config = setup_config() 51 | if hasattr(config, "evaluate") and config.evaluate: 52 | config = setup_evaluate_config(config) 53 | init_distributed_mode(config) 54 | 55 | if is_main_process(): 56 | setup_output_dir(config.output_dir, excludes=["code"]) 57 | setup_logger(output=config.output_dir, color=True, name="vindlu") 58 | logger.info(f"config: {Config.pretty_text(config)}") 59 | Config.dump(config, os.path.join(config.output_dir, "config.json")) 60 | return config 61 | -------------------------------------------------------------------------------- /baselines/share4video/__init__.py: -------------------------------------------------------------------------------- 1 | # from .model import LlavaLlamaForCausalLM 2 | -------------------------------------------------------------------------------- /baselines/share4video/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "."
5 | 6 | # Model Constants 7 | IGNORE_INDEX = -100 8 | IMAGE_TOKEN_INDEX = -200 9 | DEFAULT_IMAGE_TOKEN = "<image>" 10 | DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>" 11 | DEFAULT_IM_START_TOKEN = "<im_start>" 12 | DEFAULT_IM_END_TOKEN = "<im_end>" 13 | IMAGE_PLACEHOLDER = "<image-placeholder>" 14 | -------------------------------------------------------------------------------- /baselines/share4video/model/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | from .language_model.llava_llama import LlavaLlamaForCausalLM, LlavaConfig 3 | from .language_model.llava_mpt import LlavaMptForCausalLM, LlavaMptConfig 4 | from .language_model.llava_mistral import LlavaMistralForCausalLM, LlavaMistralConfig 5 | except: 6 | pass 7 | -------------------------------------------------------------------------------- /baselines/share4video/model/apply_delta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m fastchat.model.apply_delta --base ~/model_weights/llama-7b --target ~/model_weights/vicuna-7b --delta lmsys/vicuna-7b-delta 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from tqdm import tqdm 9 | from transformers import AutoTokenizer, AutoModelForCausalLM 10 | from llava import LlavaLlamaForCausalLM 11 | 12 | 13 | def apply_delta(base_model_path, target_model_path, delta_path): 14 | print("Loading base model") 15 | base = AutoModelForCausalLM.from_pretrained( 16 | base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | 18 | print("Loading delta") 19 | delta = LlavaLlamaForCausalLM.from_pretrained(delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 20 | delta_tokenizer = AutoTokenizer.from_pretrained(delta_path) 21 | 22 | print("Applying delta") 23 | for name, param in tqdm(delta.state_dict().items(), desc="Applying delta"): 24 | if name not in base.state_dict(): 25 | assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model' 26 | continue 27 | if param.data.shape == base.state_dict()[name].shape: 28 | param.data += base.state_dict()[name] 29 | else: 30 | assert name in ['model.embed_tokens.weight', 'lm_head.weight'], \ 31 | f'{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}' 32 | bparam = base.state_dict()[name] 33 | param.data[:bparam.shape[0], :bparam.shape[1]] += bparam 34 | 35 | print("Saving target model") 36 | delta.save_pretrained(target_model_path) 37 | delta_tokenizer.save_pretrained(target_model_path) 38 | 39 | 40 | if __name__ == "__main__": 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument("--base-model-path", type=str, required=True) 43 | parser.add_argument("--target-model-path", type=str, required=True) 44 | parser.add_argument("--delta-path", type=str, required=True) 45 | 46 | args = parser.parse_args() 47 | 48 | apply_delta(args.base_model_path, args.target_model_path, args.delta_path) 49 | -------------------------------------------------------------------------------- /baselines/share4video/model/consolidate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m llava.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from transformers import AutoTokenizer, AutoModelForCausalLM 9 | from llava.model import * 10 | from llava.model.utils import auto_upgrade 11 | 12 | 13 | def consolidate_ckpt(src_path, dst_path): 14 | print("Loading model") 15 |
auto_upgrade(src_path) 16 | src_model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False) 18 | src_model.save_pretrained(dst_path) 19 | src_tokenizer.save_pretrained(dst_path) 20 | 21 | 22 | if __name__ == "__main__": 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--src", type=str, required=True) 25 | parser.add_argument("--dst", type=str, required=True) 26 | 27 | args = parser.parse_args() 28 | 29 | consolidate_ckpt(args.src, args.dst) 30 | -------------------------------------------------------------------------------- /baselines/share4video/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .clip_encoder import CLIPVisionTower, CLIPVisionTowerS2 3 | from .siglip_encoder import SigLipVisionTower 4 | 5 | 6 | def build_vision_tower(vision_tower_cfg, **kwargs): 7 | vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None)) 8 | is_absolute_path_exists = os.path.exists(vision_tower) 9 | use_s2 = getattr(vision_tower_cfg, 's2', False) 10 | if 'siglip' not in vision_tower.lower(): 11 | if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion") or "ShareGPT4V" in vision_tower: 12 | if use_s2: 13 | return CLIPVisionTowerS2(vision_tower, args=vision_tower_cfg, **kwargs) 14 | else: 15 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 16 | else: 17 | if is_absolute_path_exists or vision_tower.startswith("google") or vision_tower.startswith('bczhou'): 18 | return SigLipVisionTower(vision_tower, vision_tower_cfg, **kwargs) 19 | 20 | raise ValueError(f'Unknown vision tower: {vision_tower}') 21 | -------------------------------------------------------------------------------- /baselines/share4video/model/multimodal_projector/builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import re 4 | 5 | 6 | class IdentityMap(nn.Module): 7 | def __init__(self): 8 | super().__init__() 9 | 10 | def forward(self, x, *args, **kwargs): 11 | return x 12 | 13 | @property 14 | def config(self): 15 | return {"mm_projector_type": 'identity'} 16 | 17 | 18 | class SimpleResBlock(nn.Module): 19 | def __init__(self, channels): 20 | super().__init__() 21 | self.pre_norm = nn.LayerNorm(channels) 22 | 23 | self.proj = nn.Sequential( 24 | nn.Linear(channels, channels), 25 | nn.GELU(), 26 | nn.Linear(channels, channels) 27 | ) 28 | def forward(self, x): 29 | x = self.pre_norm(x) 30 | return x + self.proj(x) 31 | 32 | 33 | def build_vision_projector(config, delay_load=False, **kwargs): 34 | projector_type = getattr(config, 'mm_projector_type', 'linear') 35 | 36 | if projector_type == 'linear': 37 | return nn.Linear(config.mm_hidden_size, config.hidden_size) 38 | 39 | mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type) 40 | if mlp_gelu_match: 41 | mlp_depth = int(mlp_gelu_match.group(1)) 42 | modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)] 43 | for _ in range(1, mlp_depth): 44 | modules.append(nn.GELU()) 45 | modules.append(nn.Linear(config.hidden_size, config.hidden_size)) 46 | return nn.Sequential(*modules) 47 | 48 | if projector_type == 'identity': 49 | return IdentityMap() 50 | 51 | raise ValueError(f'Unknown projector type: {projector_type}') 52 | 
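A quick usage sketch for the build_vision_projector factory shown directly above. This is illustrative only: the import path, the config fields, and all sizes below are assumptions for the sketch, not values taken from this repository's configs.

import torch
from types import SimpleNamespace
# Assumed import path for the factory defined in multimodal_projector/builder.py above:
# from share4video.model.multimodal_projector.builder import build_vision_projector

# Hypothetical config: mm_hidden_size is the vision-feature width, hidden_size the LLM width;
# "mlp2x_gelu" matches the ^mlp(\d+)x_gelu$ regex and yields Linear -> GELU -> Linear.
cfg = SimpleNamespace(mm_projector_type="mlp2x_gelu", mm_hidden_size=1024, hidden_size=4096)
projector = build_vision_projector(cfg)

vision_feats = torch.randn(1, 576, cfg.mm_hidden_size)  # e.g. 24x24 patch tokens from the vision tower
print(projector(vision_feats).shape)                    # torch.Size([1, 576, 4096])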
-------------------------------------------------------------------------------- /baselines/share4video/model/utils.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoConfig 2 | 3 | 4 | def auto_upgrade(config): 5 | cfg = AutoConfig.from_pretrained(config) 6 | if 'llava' in config and 'llava' not in cfg.model_type: 7 | assert cfg.model_type == 'llama' 8 | print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.") 9 | print("You must upgrade the checkpoint to the new code base (this can be done automatically).") 10 | confirm = input("Please confirm that you want to upgrade the checkpoint. [Y/N]") 11 | if confirm.lower() in ["y", "yes"]: 12 | print("Upgrading checkpoint...") 13 | assert len(cfg.architectures) == 1 14 | setattr(cfg.__class__, "model_type", "llava") 15 | cfg.architectures[0] = 'LlavaLlamaForCausalLM' 16 | cfg.save_pretrained(config) 17 | print("Checkpoint upgraded.") 18 | else: 19 | print("Checkpoint upgrade aborted.") 20 | exit(1) 21 | -------------------------------------------------------------------------------- /baselines/share4video/train/train_mem.py: -------------------------------------------------------------------------------- 1 | from llava.train.train import train 2 | 3 | if __name__ == "__main__": 4 | train(attn_implementation="flash_attention_2") 5 | -------------------------------------------------------------------------------- /baselines/valley/configs/deepspeed/config_zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | 14 | "zero_optimization": { 15 | "stage": 2, 16 | "allgather_partitions": true, 17 | "allgather_bucket_size": 5e8, 18 | "overlap_comm": true, 19 | "reduce_scatter": true, 20 | "reduce_bucket_size": 5e8, 21 | "contiguous_gradients": true 22 | }, 23 | 24 | "gradient_accumulation_steps": "auto", 25 | "gradient_clipping": "auto", 26 | "steps_per_print": 2000, 27 | "train_batch_size": "auto", 28 | "train_micro_batch_size_per_gpu": "auto", 29 | "wall_clock_breakdown": false 30 | } -------------------------------------------------------------------------------- /baselines/valley/configs/deepspeed/config_zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 3, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto", 22 | "stage3_prefetch_bucket_size": "auto", 23 | "stage3_param_persistence_threshold": "auto", 24 | "stage3_max_live_parameters": 1e9, 25 | "stage3_max_reuse_distance": 1e9, 26 | "stage3_gather_16bit_weights_on_model_save": true 27 | } 28 | } -------------------------------------------------------------------------------- /baselines/valley/configs/deepspeed/config_zero3_offload.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 
3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | "scheduler": { 23 | "type": "WarmupLR", 24 | "params": { 25 | "warmup_min_lr": "auto", 26 | "warmup_max_lr": "auto", 27 | "warmup_num_steps": "auto" 28 | } 29 | }, 30 | "zero_optimization": { 31 | "stage": 3, 32 | "offload_optimizer": { 33 | "device": "cpu", 34 | "pin_memory": true 35 | }, 36 | "offload_param": { 37 | "device": "cpu", 38 | "pin_memory": true 39 | }, 40 | "overlap_comm": true, 41 | "contiguous_gradients": true, 42 | "sub_group_size": 1e9, 43 | "reduce_bucket_size": "auto", 44 | "stage3_prefetch_bucket_size": "auto", 45 | "stage3_param_persistence_threshold": "auto", 46 | "stage3_max_live_parameters": 1e9, 47 | "stage3_max_reuse_distance": 1e9, 48 | "gather_16bit_weights_on_model_save": true 49 | }, 50 | "gradient_accumulation_steps": "auto", 51 | "gradient_clipping": "auto", 52 | "train_batch_size": "auto", 53 | "train_micro_batch_size_per_gpu": "auto", 54 | "steps_per_print": 1e5, 55 | "wall_clock_breakdown": false 56 | } -------------------------------------------------------------------------------- /baselines/valley/configs/experiment/valley_stage1.yaml: -------------------------------------------------------------------------------- 1 | model_name_or_path: Path/to/opensource/LLM 2 | data_path: Path/to/LLaVA-CC3M-Pretrain-595K/chat.json 3 | image_folder: Path/to/LLaVA-CC3M-Pretrain-595K/image_new 4 | video_data_path: Path/to/webvid_703K/chat.json 5 | video_folder: Path/to/webvid_703K/videos 6 | output_dir: Path/to/model/out/dir 7 | # experiment name 8 | project_name: valley 9 | run_name: valley_stage1 10 | 11 | # Whether to make the system prompt a mask in the label, and others do not mask 12 | only_mask_system: False 13 | # system prompt style 14 | conv_mode: v1 15 | # wether freeze backbone 16 | freeze_backbone: True 17 | # wether tune multimodal projection layer 18 | tune_mm_mlp_adapter: True 19 | # wether lora 20 | lora: False 21 | # wether multimodal 22 | is_multimodal: True 23 | 24 | num_train_epochs: 1 25 | per_device_train_batch_size: 16 26 | save_strategy: steps 27 | save_steps: 2400 28 | learning_rate: 2e-3 29 | gradient_checkpointing: True 30 | 31 | # wether do fast epoch 32 | fast_epoch: False 33 | 34 | vision_tower: openai/clip-vit-large-patch14 35 | mm_vision_select_layer: -2 36 | mm_use_im_start_end: True 37 | lazy_preprocess: True 38 | bf16: False 39 | fp16: True 40 | tf32: False 41 | per_device_eval_batch_size: 1 42 | gradient_accumulation_steps: 1 43 | evaluation_strategy: "no" 44 | save_total_limit: 1 45 | weight_decay: 0. 
46 | warmup_ratio: 0.03 47 | lr_scheduler_type: cosine 48 | logging_steps: 1 49 | model_max_length: 2048 50 | adam_beta1: 0.9 51 | adam_beta2: 0.95 52 | deepspeed: valley/configs/deepspeed/config_zero2.json 53 | report_to: wandb -------------------------------------------------------------------------------- /baselines/valley/configs/experiment/valley_stage2.yaml: -------------------------------------------------------------------------------- 1 | model_name_or_path: Path/ to/ pretrain/ valley/ from/ stage1 2 | data_path: Path/ to/ LLaVA-Instruct-150K/ llava_instruct_150k.json 3 | image_folder: Path/ to/ COCO/ train2014 4 | video_data_path: /Path/ to/ Valley-Instruct/ valley_instruct_73k.json 5 | video_folder: Path/ to/ Valley-Instruct/ videos 6 | output_dir: Model/ Output/ path 7 | prediction_file_name: Model/ Output/ path/ eval_result.jsonl # evaluation file output path 8 | # experiment name 9 | project_name: valley 10 | run_name: valley_stage2 11 | # Whether to make the system prompt a mask in the label, and others do not mask 12 | only_mask_system: False 13 | # system prompt style 14 | conv_mode: v1 15 | # wether freeze backbone 16 | freeze_backbone: False 17 | # wether tune multimodal projection layer 18 | tune_mm_mlp_adapter: True 19 | # wether lora 20 | lora: False 21 | # wether multimodal 22 | is_multimodal: True 23 | 24 | num_train_epochs: 3 25 | per_device_train_batch_size: 1 26 | per_device_eval_batch_size: 1 # must 1 27 | save_strategy: steps 28 | save_steps: 3000 29 | evaluation_strategy: 'no' 30 | eval_steps: 3000 31 | eval_num: 600 32 | use_legacy_prediction_loop: True 33 | predict_with_generate: True 34 | prediction_loss_only: False 35 | generation_max_length: 1536 36 | learning_rate: 2e-5 37 | gradient_checkpointing: True 38 | 39 | # wether do fast epoch 40 | fast_epoch: False 41 | 42 | vision_tower: openai/clip-vit-large-patch14 43 | mm_vision_select_layer: -2 44 | mm_use_im_start_end: True 45 | lazy_preprocess: True 46 | bf16: True 47 | fp16: False 48 | tf32: False 49 | gradient_accumulation_steps: 1 50 | weight_decay: 0. 
51 | warmup_ratio: 0.03 52 | lr_scheduler_type: cosine 53 | logging_steps: 1 54 | model_max_length: 2048 55 | deepspeed: valley/configs/deepspeed/config_zero2.json 56 | report_to: wandb -------------------------------------------------------------------------------- /baselines/valley/configs/experiment/valley_stage2_lora.yaml: -------------------------------------------------------------------------------- 1 | model_name_or_path: Path/ to/ pretrain/ valley/ from/ stage1 2 | data_path: Path/ to/ LLaVA-Instruct-150K/ llava_instruct_150k.json 3 | image_folder: Path/ to/ COCO/ train2014 4 | video_data_path: /Path/ to/ Valley-Instruct/ valley_instruct_73k.json 5 | video_folder: Path/ to/ Valley-Instruct/ videos 6 | output_dir: Model/ Output/ path 7 | prediction_file_name: Model/ Output/ path/ eval_result.jsonl # evaluation file output path 8 | # experiment name 9 | project_name: valley 10 | run_name: valley_stage2_lora 11 | # Whether to make the system prompt a mask in the label, and others do not mask 12 | only_mask_system: False 13 | # system prompt style 14 | conv_mode: v1 15 | # wether freeze backbone 16 | freeze_backbone: False 17 | # wether tune multimodal projection layer 18 | tune_mm_mlp_adapter: True 19 | # wether lora 20 | lora: True 21 | # wether multimodal 22 | is_multimodal: True 23 | 24 | num_train_epochs: 3 25 | per_device_train_batch_size: 4 26 | save_strategy: 'no' 27 | lora_save_strategy: steps # if do lora training, turn on this button, to only save lora weight. support ['steps','epochs','no'] 28 | save_steps: 5000 29 | learning_rate: 5e-4 30 | gradient_checkpointing: True 31 | 32 | # wether do fast epoch 33 | fast_epoch: False 34 | 35 | vision_tower: openai/clip-vit-large-patch14 36 | mm_vision_select_layer: -2 37 | mm_use_im_start_end: True 38 | lazy_preprocess: True 39 | bf16: False 40 | fp16: True 41 | tf32: False 42 | per_device_eval_batch_size: 1 43 | gradient_accumulation_steps: 1 44 | evaluation_strategy: "no" 45 | save_total_limit: 3 46 | weight_decay: 0. 
47 | warmup_ratio: 0.03 48 | lr_scheduler_type: cosine 49 | logging_steps: 1 50 | model_max_length: 2048 51 | adam_beta1: 0.9 52 | adam_beta2: 0.95 53 | deepspeed: valley/configs/deepspeed/config_zero2.json 54 | report_to: wandb -------------------------------------------------------------------------------- /baselines/valley/configs/experiment/valley_stage2_zero3.yaml: -------------------------------------------------------------------------------- 1 | model_name_or_path: Path/ to/ pretrain/ valley/ from/ stage1 2 | data_path: Path/ to/ LLaVA-Instruct-150K/ llava_instruct_150k.json 3 | image_folder: Path/ to/ COCO/ train2014 4 | video_data_path: /Path/ to/ Valley-Instruct/ valley_instruct_73k.json 5 | video_folder: Path/ to/ Valley-Instruct/ videos 6 | output_dir: Model/ Output/ path 7 | prediction_file_name: Model/ Output/ path/ eval_result.jsonl # evaluation file output path 8 | # experiment name 9 | project_name: valley2 10 | run_name: valley_stage2_zero3 11 | # Whether to make the system prompt a mask in the label, and others do not mask 12 | only_mask_system: False 13 | # system prompt style 14 | conv_mode: v1 15 | # wether freeze backbone 16 | freeze_backbone: False 17 | # wether tune multimodal projection layer 18 | tune_mm_mlp_adapter: True 19 | # wether freeze multimodal projection layer 20 | freeze_mm_mlp_adapter: False 21 | # wether lora 22 | lora: False 23 | # wether multimodal 24 | is_multimodal: True 25 | 26 | num_train_epochs: 3 27 | per_device_train_batch_size: 1 # zero3 must 1 28 | per_device_eval_batch_size: 1 # must 1 29 | save_strategy: steps 30 | save_steps: 3000 31 | evaluation_strategy: "no" 32 | eval_steps: 3000 33 | eval_num: 600 34 | use_legacy_prediction_loop: True 35 | predict_with_generate: True 36 | prediction_loss_only: False 37 | generation_max_length: 1536 38 | learning_rate: 2e-5 39 | gradient_checkpointing: True 40 | 41 | # wether do fast epoch 42 | fast_epoch: False 43 | 44 | vision_tower: openai/clip-vit-large-patch14 45 | mm_vision_select_layer: -2 46 | mm_use_im_start_end: True 47 | lazy_preprocess: True 48 | bf16: False 49 | fp16: True 50 | tf32: False 51 | gradient_accumulation_steps: 1 52 | weight_decay: 0. 53 | warmup_ratio: 0.03 54 | lr_scheduler_type: cosine 55 | logging_steps: 1 56 | model_max_length: 2048 57 | deepspeed: valley/configs/deepspeed/config_zero3.json 58 | report_to: wandb -------------------------------------------------------------------------------- /baselines/valley/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "." 
5 | -------------------------------------------------------------------------------- /baselines/valley/model/apply_delta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m fastchat.model.apply_delta --base ~/model_weights/llama-7b --target ~/model_weights/vicuna-7b --delta lmsys/vicuna-7b-delta 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from tqdm import tqdm 9 | from transformers import AutoTokenizer, AutoModelForCausalLM 10 | from valley import ValleyLlamaForCausalLM 11 | 12 | 13 | def apply_delta(base_model_path, target_model_path, delta_path): 14 | print("Loading base model") 15 | base = AutoModelForCausalLM.from_pretrained( 16 | base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | 18 | print("Loading delta") 19 | delta = ValleyLlamaForCausalLM.from_pretrained(delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 20 | delta_tokenizer = AutoTokenizer.from_pretrained(delta_path) 21 | 22 | print("Applying delta") 23 | for name, param in tqdm(delta.state_dict().items(), desc="Applying delta"): 24 | if name not in base.state_dict(): 25 | assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'] or 'vision_tower' in name, f'{name} not in base model' 26 | continue 27 | if param.data.shape == base.state_dict()[name].shape: 28 | param.data += base.state_dict()[name] 29 | else: 30 | assert name in ['model.embed_tokens.weight', 'lm_head.weight'], \ 31 | f'{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}' 32 | bparam = base.state_dict()[name] 33 | param.data[:bparam.shape[0], :bparam.shape[1]] += bparam 34 | 35 | print("Saving target model") 36 | delta.save_pretrained(target_model_path) 37 | delta_tokenizer.save_pretrained(target_model_path) 38 | 39 | 40 | if __name__ == "__main__": 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument("--base-model-path", type=str, required=True) 43 | parser.add_argument("--target-model-path", type=str, required=True) 44 | parser.add_argument("--delta-path", type=str, required=True) 45 | 46 | args = parser.parse_args() 47 | 48 | apply_delta(args.base_model_path, args.target_model_path, args.delta_path) 49 | -------------------------------------------------------------------------------- /baselines/valley/train/train.sh: -------------------------------------------------------------------------------- 1 | torchrun --nproc_per_node 8 --nnodes 1 --node_rank=0 --master_addr 10.192.24.78 --master_port 10404 valley/train/train.py --conf $1 -------------------------------------------------------------------------------- /baselines/valley/util/config.py: -------------------------------------------------------------------------------- 1 | IGNORE_INDEX = -100 2 | DEFAULT_PAD_TOKEN = "[PAD]" 3 | DEFAULT_EOS_TOKEN = "</s>" 4 | DEFAULT_BOS_TOKEN = "<s>" 5 | DEFAULT_UNK_TOKEN = "<unk>" 6 | DEFAULT_IMAGE_TOKEN = "<image>" 7 | DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>" 8 | DEFAULT_IM_START_TOKEN = "<im_start>" 9 | DEFAULT_IM_END_TOKEN = "<im_end>" 10 | DEFAULT_VIDEO_TOKEN = "