├── .gitignore ├── INSTALLATION.md ├── LICENSE ├── README.md ├── assets └── teaser.png ├── baselines ├── base.py ├── config.json ├── gemini │ ├── __init__.py │ ├── extract_frames.py │ └── upload.py ├── gemini_modeling.py ├── gpt4o │ ├── __init__.py │ └── api_wrap.py ├── gpt4o_modeling.py ├── gpt4v │ ├── __init__.py │ └── api_wrap.py ├── gpt4v_modeling.py ├── llamavid │ ├── __init__.py │ ├── constants.py │ ├── conversation.py │ ├── model │ │ ├── __init__.py │ │ ├── builder.py │ │ ├── language_model │ │ │ └── llava_llama_vid.py │ │ ├── llamavid_arch.py │ │ ├── multimodal_encoder │ │ │ ├── builder.py │ │ │ ├── clip_encoder.py │ │ │ └── eva_vit.py │ │ ├── multimodal_projector │ │ │ └── builder.py │ │ └── qformer.py │ ├── processor │ │ ├── clip-patch14-224 │ │ │ ├── config.json │ │ │ └── preprocessor_config.json │ │ └── clip-patch14-336 │ │ │ ├── config.json │ │ │ └── preprocessor_config.json │ ├── serve │ │ ├── __init__.py │ │ ├── cli.py │ │ ├── controller.py │ │ ├── examples │ │ │ ├── Avatar.png │ │ │ ├── Avengers.jpg │ │ │ ├── Forrest_Gump.jpg │ │ │ ├── Interstellar.jpg │ │ │ ├── Titanic.jpg │ │ │ ├── extreme_ironing.jpg │ │ │ └── waterview.jpg │ │ ├── gradio_web_server.py │ │ ├── model_worker.py │ │ ├── model_worker_short.py │ │ ├── register_worker.py │ │ └── run_llamavid_movie.py │ └── train │ │ ├── llama_flash_attn_monkey_patch.py │ │ ├── llava_trainer.py │ │ ├── train.py │ │ └── train_mem.py ├── llamavid_modeling.py ├── llava │ ├── __init__.py │ ├── constants.py │ ├── conversation.py │ ├── eval │ │ ├── eval_gpt_review.py │ │ ├── eval_gpt_review_bench.py │ │ ├── eval_gpt_review_visual.py │ │ ├── eval_pope.py │ │ ├── eval_science_qa.py │ │ ├── eval_science_qa_gpt4.py │ │ ├── eval_science_qa_gpt4_requery.py │ │ ├── eval_textvqa.py │ │ ├── generate_webpage_data_from_table.py │ │ ├── m4c_evaluator.py │ │ ├── model_qa.py │ │ ├── model_vqa.py │ │ ├── model_vqa_loader.py │ │ ├── model_vqa_mmbench.py │ │ ├── model_vqa_science.py │ │ ├── qa_baseline_gpt35.py │ │ ├── run_llava.py │ │ ├── summarize_gpt_review.py │ │ ├── table │ │ │ ├── answer │ │ │ │ ├── answer_alpaca-13b.jsonl │ │ │ │ ├── answer_bard.jsonl │ │ │ │ ├── answer_gpt35.jsonl │ │ │ │ ├── answer_llama-13b.jsonl │ │ │ │ └── answer_vicuna-13b.jsonl │ │ │ ├── caps_boxes_coco2014_val_80.jsonl │ │ │ ├── model.jsonl │ │ │ ├── prompt.jsonl │ │ │ ├── question.jsonl │ │ │ ├── results │ │ │ │ ├── test_sqa_llava_13b_v0.json │ │ │ │ └── test_sqa_llava_lcs_558k_sqa_12e_vicuna_v1_3_13b.json │ │ │ ├── review │ │ │ │ ├── review_alpaca-13b_vicuna-13b.jsonl │ │ │ │ ├── review_bard_vicuna-13b.jsonl │ │ │ │ ├── review_gpt35_vicuna-13b.jsonl │ │ │ │ └── review_llama-13b_vicuna-13b.jsonl │ │ │ ├── reviewer.jsonl │ │ │ └── rule.json │ │ └── webpage │ │ │ ├── figures │ │ │ ├── alpaca.png │ │ │ ├── bard.jpg │ │ │ ├── chatgpt.svg │ │ │ ├── llama.jpg │ │ │ ├── swords_FILL0_wght300_GRAD0_opsz48.svg │ │ │ └── vicuna.jpeg │ │ │ ├── index.html │ │ │ ├── script.js │ │ │ └── styles.css │ ├── mm_utils.py │ ├── model │ │ ├── __init__.py │ │ ├── apply_delta.py │ │ ├── builder.py │ │ ├── consolidate.py │ │ ├── language_model │ │ │ ├── llava_llama.py │ │ │ ├── llava_mistral.py │ │ │ └── llava_mpt.py │ │ ├── llava_arch.py │ │ ├── make_delta.py │ │ ├── multimodal_encoder │ │ │ ├── builder.py │ │ │ └── clip_encoder.py │ │ ├── multimodal_projector │ │ │ └── builder.py │ │ └── utils.py │ ├── serve │ │ ├── __init__.py │ │ ├── cli.py │ │ ├── controller.py │ │ ├── examples │ │ │ ├── extreme_ironing.jpg │ │ │ └── waterview.jpg │ │ ├── gradio_web_server.py │ │ ├── model_worker.py │ │ 
├── register_worker.py │ │ ├── sglang_worker.py │ │ └── test_message.py │ ├── train │ │ ├── llama_flash_attn_monkey_patch.py │ │ ├── llama_xformers_attn_monkey_patch.py │ │ ├── llava_trainer.py │ │ ├── train.py │ │ ├── train_mem.py │ │ └── train_xformers.py │ └── utils.py ├── llava_modeling.py ├── llavanext_modeling.py ├── llavavid │ ├── __init__.py │ ├── constants.py │ ├── conversation.py │ ├── mm_utils.py │ ├── model │ │ ├── __init__.py │ │ ├── apply_delta.py │ │ ├── builder.py │ │ ├── consolidate.py │ │ ├── language_model │ │ │ ├── llava_llama.py │ │ │ ├── llava_mistral.py │ │ │ └── llava_mpt.py │ │ ├── llava_arch.py │ │ ├── make_delta.py │ │ ├── multimodal_encoder │ │ │ ├── builder.py │ │ │ └── clip_encoder.py │ │ ├── multimodal_projector │ │ │ └── builder.py │ │ ├── multimodal_resampler │ │ │ ├── builder.py │ │ │ └── spatial_pool.py │ │ └── utils.py │ └── utils.py ├── minigpt4 │ ├── __init__.py │ ├── common │ │ ├── __init__.py │ │ ├── config.py │ │ ├── dist_utils.py │ │ ├── eval_utils.py │ │ ├── gradcam.py │ │ ├── logger.py │ │ ├── optims.py │ │ ├── registry.py │ │ ├── utils.py │ │ └── vqa_tools │ │ │ ├── VQA │ │ │ ├── PythonEvaluationTools │ │ │ │ ├── vqaEvalDemo.py │ │ │ │ └── vqaEvaluation │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── vqaEval.py │ │ │ ├── PythonHelperTools │ │ │ │ ├── vqaDemo.py │ │ │ │ └── vqaTools │ │ │ │ │ ├── __init__.py │ │ │ │ │ └── vqa.py │ │ │ └── README.md │ │ │ ├── __init__.py │ │ │ ├── aokvqa │ │ │ ├── LICENSE │ │ │ ├── README.md │ │ │ ├── data_scripts │ │ │ │ ├── build_vocab.py │ │ │ │ ├── encode_vocab_clip.py │ │ │ │ ├── extract_bert_features.py │ │ │ │ ├── extract_clip_features.py │ │ │ │ └── extract_resnet_features.py │ │ │ ├── environment.yml │ │ │ ├── evaluation │ │ │ │ ├── eval_predictions.py │ │ │ │ ├── load_aokvqa.py │ │ │ │ ├── prepare_predictions.py │ │ │ │ └── remap_predictions.py │ │ │ ├── gpt3 │ │ │ │ ├── README.md │ │ │ │ ├── caption_inputs.py │ │ │ │ ├── query_gpt3.py │ │ │ │ └── rationale_inputs.py │ │ │ ├── heuristics │ │ │ │ ├── README.md │ │ │ │ ├── most_common_answer.py │ │ │ │ ├── random_unweighted.py │ │ │ │ └── random_weighted.py │ │ │ ├── load_aokvqa.py │ │ │ └── transfer_experiments │ │ │ │ ├── README.md │ │ │ │ ├── predict.py │ │ │ │ └── train.py │ │ │ ├── vqa.py │ │ │ └── vqa_eval.py │ ├── configs │ │ ├── datasets │ │ │ ├── cc_sbu │ │ │ │ ├── align.yaml │ │ │ │ └── defaults.yaml │ │ │ ├── cmd_video │ │ │ │ └── default.yaml │ │ │ ├── laion │ │ │ │ └── defaults.yaml │ │ │ ├── template │ │ │ │ └── default.yaml │ │ │ ├── video_chatgpt │ │ │ │ └── default.yaml │ │ │ └── webvid │ │ │ │ └── default.yaml │ │ ├── default.yaml │ │ └── models │ │ │ ├── minigpt4.yaml │ │ │ └── minigpt4v.yaml │ ├── conversation │ │ ├── __init__.py │ │ └── conversation.py │ ├── datasets │ │ ├── __init__.py │ │ ├── builders │ │ │ ├── __init__.py │ │ │ ├── base_dataset_builder.py │ │ │ ├── image_text_pair_builder.py │ │ │ └── vqa_builder.py │ │ ├── data_utils.py │ │ └── datasets │ │ │ ├── __init__.py │ │ │ ├── aok_vqa_datasets.py │ │ │ ├── aok_vqa_reasoning_datasets.py │ │ │ ├── base_dataset.py │ │ │ ├── caption_datasets.py │ │ │ ├── caption_reasoning.py │ │ │ ├── cc_sbu_dataset.py │ │ │ ├── coco_caption.py │ │ │ ├── coco_vqa_datasets.py │ │ │ ├── cot.py │ │ │ ├── coyo_dataset.py │ │ │ ├── dataloader_utils.py │ │ │ ├── doc_dataset.py │ │ │ ├── gqa_datasets.py │ │ │ ├── grounded_caption_reasoning.py │ │ │ ├── grounded_detailed_image_caption_dataset.py │ │ │ ├── laion_dataset.py │ │ │ ├── llava_dataset.py │ │ │ ├── locna_dataset.py │ │ │ ├── lvis_dataset.py │ │ │ ├── 
nav_dataset.py │ │ │ ├── open_images.py │ │ │ ├── paint_dataset.py │ │ │ ├── reasoning_dataset.py │ │ │ ├── text_caps.py │ │ │ ├── textvqa_datasets.py │ │ │ ├── unnatural_instruction.py │ │ │ ├── vg_dataset.py │ │ │ ├── video_datasets.py │ │ │ └── vqa_datasets.py │ ├── mistral_test_config.yaml │ ├── models │ │ ├── Qformer.py │ │ ├── __init__.py │ │ ├── base_model.py │ │ ├── blip2.py │ │ ├── blip2_outputs.py │ │ ├── clip_vision_encoder.py │ │ ├── eva_vit.py │ │ ├── mini_gpt4_llama_v2.py │ │ ├── mistral.py │ │ ├── modeling_llama_v2.py │ │ ├── modeling_mistral.py │ │ └── policies │ │ │ ├── __init__.py │ │ │ ├── activation_checkpointing_functions.py │ │ │ ├── anyprecision_optimizer.py │ │ │ ├── fsdp_utils.py │ │ │ ├── mixed_precision.py │ │ │ └── wrapping.py │ ├── processors │ │ ├── __init__.py │ │ ├── base_processor.py │ │ ├── blip_processors.py │ │ └── randaugment.py │ ├── runners │ │ ├── __init__.py │ │ └── runner_base.py │ └── tasks │ │ ├── __init__.py │ │ ├── base_task.py │ │ ├── image_text_pretrain.py │ │ ├── vqa.py │ │ └── vqa_reading_comprehension.py ├── minigpt4video_modeling.py ├── pllava │ ├── models │ │ ├── __init__.py │ │ └── pllava │ │ │ ├── __init__.py │ │ │ ├── configuration_pllava.py │ │ │ ├── convert_pllava_weights_to_hf.py │ │ │ ├── modeling_pllava.py │ │ │ └── processing_pllava.py │ ├── tasks │ │ ├── eval │ │ │ ├── demo │ │ │ │ ├── __init__.py │ │ │ │ ├── pllava_demo.py │ │ │ │ ├── show_compare.py │ │ │ │ └── show_gallery.py │ │ │ ├── eval_utils.py │ │ │ ├── model_utils.py │ │ │ ├── mvbench │ │ │ │ ├── __init__.py │ │ │ │ └── pllava_eval_mvbench.py │ │ │ ├── recaption │ │ │ │ ├── __init__.py │ │ │ │ ├── pllava_recaption.py │ │ │ │ └── show_recaption.py │ │ │ ├── vcgbench │ │ │ │ ├── __init__.py │ │ │ │ ├── pllava_eval_vcgbench.py │ │ │ │ └── show_vcg.py │ │ │ └── videoqabench │ │ │ │ ├── __init__.py │ │ │ │ └── pllava_eval_videoqabench.py │ │ ├── shared_utils.py │ │ └── train │ │ │ ├── config_pllava_nframe.py │ │ │ ├── config_pllava_nframe_yiprompt.py │ │ │ ├── instruction_data.py │ │ │ └── train_pllava_nframe_accel.py │ └── utils │ │ ├── basic_utils.py │ │ ├── config.py │ │ ├── config_utils.py │ │ ├── distributed.py │ │ ├── easydict.py │ │ ├── logger.py │ │ ├── optimizer.py │ │ └── scheduler.py ├── pllava_modeling.py ├── share4video │ ├── __init__.py │ ├── constants.py │ ├── conversation.py │ ├── eval │ │ ├── evaluate_benchmark_1_correctness.py │ │ ├── evaluate_benchmark_2_detailed_orientation.py │ │ ├── evaluate_benchmark_3_context.py │ │ ├── evaluate_benchmark_4_temporal.py │ │ ├── evaluate_benchmark_5_consistency.py │ │ ├── model_vqa_loader.py │ │ ├── model_vqa_tempcompass.py │ │ ├── run_llava.py │ │ └── video │ │ │ ├── eval_mvbench.py │ │ │ ├── eval_vbench.py │ │ │ ├── general_utils.py │ │ │ ├── mvbench_utils.py │ │ │ └── vbench_utils.py │ ├── mm_utils.py │ ├── model │ │ ├── __init__.py │ │ ├── apply_delta.py │ │ ├── builder.py │ │ ├── consolidate.py │ │ ├── language_model │ │ │ ├── llava_llama.py │ │ │ ├── llava_mistral.py │ │ │ └── llava_mpt.py │ │ ├── llava_arch.py │ │ ├── make_delta.py │ │ ├── multimodal_encoder │ │ │ ├── builder.py │ │ │ ├── clip_encoder.py │ │ │ └── siglip_encoder.py │ │ ├── multimodal_projector │ │ │ └── builder.py │ │ └── utils.py │ ├── serve │ │ └── gradio_utils.py │ ├── train │ │ ├── llava_trainer.py │ │ ├── train.py │ │ └── train_mem.py │ ├── utils.py │ └── video_utils.py ├── sharegpt4video_modeling.py ├── valley │ ├── configs │ │ ├── deepspeed │ │ │ ├── config_zero2.json │ │ │ ├── config_zero3.json │ │ │ └── config_zero3_offload.json │ │ 
└── experiment │ │ │ ├── valley_stage1.yaml │ │ │ ├── valley_stage2.yaml │ │ │ ├── valley_stage2_lora.yaml │ │ │ └── valley_stage2_zero3.yaml │ ├── constants.py │ ├── conversation.py │ ├── data │ │ ├── dataset.py │ │ └── video_transform.py │ ├── inference │ │ ├── run_valley.py │ │ ├── run_valley_conv.py │ │ └── run_valley_llamma_v2.py │ ├── model │ │ ├── apply_delta.py │ │ ├── make_delta.py │ │ └── valley_model.py │ ├── train │ │ ├── train.py │ │ ├── train.sh │ │ └── trainner.py │ ├── util │ │ ├── config.py │ │ ├── data_util.py │ │ └── decode_img.py │ └── utils.py ├── valley_modeling.py ├── video_chat2 │ ├── configs │ │ ├── config.json │ │ ├── config_bert.json │ │ ├── data.py │ │ ├── instruction_data.py │ │ └── model.py │ ├── conversation.py │ ├── dataset │ │ ├── __init__.py │ │ ├── base_dataset.py │ │ ├── dataloader.py │ │ ├── it_dataset.py │ │ ├── pt_dataset.py │ │ ├── utils.py │ │ ├── video_transforms.py │ │ └── video_utils.py │ ├── models │ │ ├── __init__.py │ │ ├── bert │ │ │ ├── __init__.py │ │ │ ├── builder.py │ │ │ ├── tokenization_bert.py │ │ │ └── xbert.py │ │ ├── blip2 │ │ │ ├── Qformer.py │ │ │ ├── __init__.py │ │ │ ├── blip2.py │ │ │ ├── builder.py │ │ │ ├── modeling_llama.py │ │ │ ├── modeling_llama_mem.py │ │ │ ├── utils.py │ │ │ └── vit.py │ │ ├── criterions.py │ │ ├── utils.py │ │ ├── videochat2_it.py │ │ ├── videochat2_pt.py │ │ └── videochat2_qformer.py │ ├── prompts │ │ ├── concise_description.txt │ │ └── concise_image_description.txt │ ├── tasks │ │ ├── retrieval_utils.py │ │ ├── shared_utils.py │ │ ├── shared_utils_qformer.py │ │ ├── train_it.py │ │ ├── train_pt.py │ │ └── train_qformer.py │ └── utils │ │ ├── basic_utils.py │ │ ├── config.py │ │ ├── config_utils.py │ │ ├── distributed.py │ │ ├── easydict.py │ │ ├── logger.py │ │ ├── optimizer.py │ │ └── scheduler.py ├── video_chatgpt │ ├── __init__.py │ ├── constants.py │ ├── demo │ │ ├── __init__.py │ │ ├── chat.py │ │ ├── gradio_css.py │ │ ├── gradio_patch.py │ │ ├── template.py │ │ └── video_demo.py │ ├── eval │ │ ├── __init__.py │ │ ├── model_utils.py │ │ ├── run_inference_activitynet_qa.py │ │ ├── run_inference_benchmark_consistency.py │ │ └── run_inference_benchmark_general.py │ ├── inference.py │ ├── model │ │ ├── __init__.py │ │ ├── consolidate.py │ │ ├── make_delta.py │ │ ├── utils.py │ │ └── video_chatgpt.py │ ├── single_video_inference.py │ ├── train │ │ ├── llama_flash_attn_monkey_patch.py │ │ ├── llava_trainer.py │ │ ├── train.py │ │ └── train_mem.py │ ├── utils.py │ └── video_conversation.py ├── video_llama │ ├── __init__.py │ ├── common │ │ ├── __init__.py │ │ ├── config.py │ │ ├── dist_utils.py │ │ ├── gradcam.py │ │ ├── logger.py │ │ ├── optims.py │ │ ├── registry.py │ │ └── utils.py │ ├── configs │ │ ├── datasets │ │ │ ├── cc_sbu │ │ │ │ ├── align.yaml │ │ │ │ └── defaults.yaml │ │ │ ├── instruct │ │ │ │ ├── llava_instruct.yaml │ │ │ │ └── webvid_instruct.yaml │ │ │ ├── laion │ │ │ │ └── defaults.yaml │ │ │ └── webvid │ │ │ │ └── defaults.yaml │ │ ├── default.yaml │ │ └── models │ │ │ ├── minigpt4.yaml │ │ │ └── video_llama.yaml │ ├── conversation │ │ ├── __init__.py │ │ └── conversation_video.py │ ├── datasets │ │ ├── __init__.py │ │ ├── builders │ │ │ ├── __init__.py │ │ │ ├── base_dataset_builder.py │ │ │ ├── image_text_pair_builder.py │ │ │ ├── instruct_builder.py │ │ │ └── video_caption_builder.py │ │ ├── data_utils.py │ │ └── datasets │ │ │ ├── __init__.py │ │ │ ├── base_dataset.py │ │ │ ├── caption_datasets.py │ │ │ ├── cc_sbu_dataset.py │ │ │ ├── dataloader_utils.py │ │ │ ├── laion_dataset.py 
│ │ │ ├── llava_instruct_dataset.py │ │ │ ├── video_instruct_dataset.py │ │ │ └── webvid_datasets.py │ ├── models │ │ ├── ImageBind │ │ │ ├── .assets │ │ │ │ ├── bird_audio.wav │ │ │ │ ├── bird_image.jpg │ │ │ │ ├── car_audio.wav │ │ │ │ ├── car_image.jpg │ │ │ │ ├── dog_audio.wav │ │ │ │ └── dog_image.jpg │ │ │ ├── CODE_OF_CONDUCT.md │ │ │ ├── CONTRIBUTING.md │ │ │ ├── LICENSE │ │ │ ├── README.md │ │ │ ├── bpe │ │ │ │ └── bpe_simple_vocab_16e6.txt.gz │ │ │ ├── data.py │ │ │ ├── model_card.md │ │ │ ├── models │ │ │ │ ├── __init__.py │ │ │ │ ├── helpers.py │ │ │ │ ├── imagebind_model.py │ │ │ │ ├── multimodal_preprocessors.py │ │ │ │ └── transformer.py │ │ │ └── requirements.txt │ │ ├── Qformer.py │ │ ├── __init__.py │ │ ├── base_model.py │ │ ├── blip2.py │ │ ├── blip2_outputs.py │ │ ├── eva_vit.py │ │ ├── modeling_llama.py │ │ └── video_llama.py │ ├── processors │ │ ├── __init__.py │ │ ├── base_processor.py │ │ ├── blip_processors.py │ │ ├── functional_video.py │ │ ├── randaugment.py │ │ ├── transforms_video.py │ │ └── video_processor.py │ ├── runners │ │ ├── __init__.py │ │ ├── runner_base.py │ │ └── test.py │ ├── tasks │ │ ├── __init__.py │ │ ├── base_task.py │ │ ├── image_text_pretrain.py │ │ └── video_text_pretrain.py │ └── video_llama_eval_withaudio.yaml ├── videochat_modeling.py ├── videochatgpt_modeling.py ├── videolavit │ ├── __init__.py │ ├── conversation.py │ ├── models │ │ ├── __init__.py │ │ ├── modeling_3d_unet.py │ │ ├── modeling_decoder.py │ │ ├── modeling_motion_condition.py │ │ ├── modeling_motion_tokenizer.py │ │ ├── modeling_transformer_temporal.py │ │ ├── modeling_unet_3d_blocks.py │ │ ├── modeling_video_lavit_hf.py │ │ ├── modeling_visual_encoder.py │ │ ├── modeling_visual_tokenzier.py │ │ ├── transform.py │ │ ├── video_detokenizer.py │ │ ├── video_lavit_for_generation.py │ │ └── video_lavit_for_understanding.py │ └── utils.py ├── videolavit_modeling.py ├── videollama2 │ ├── __init__.py │ ├── constants.py │ ├── conversation.py │ ├── eval │ │ ├── eval_audio_TUT2017.py │ │ ├── eval_audio_clotho.py │ │ ├── eval_audio_clothoAQA.py │ │ ├── eval_audio_video_AVQA.py │ │ ├── eval_audio_video_AVSD.py │ │ ├── eval_audio_video_AVSSD.py │ │ ├── eval_audio_vocalsound.py │ │ ├── eval_video_cap_msvc_correctness.py │ │ ├── eval_video_cap_msvc_detailedness.py │ │ ├── eval_video_mcqa_mvbench.py │ │ ├── eval_video_mcqa_videomme.py │ │ ├── eval_video_oqa_activitynet.py │ │ ├── eval_video_oqa_vcgpt_1_correctness.py │ │ ├── eval_video_oqa_vcgpt_2_detailed_orientation.py │ │ ├── eval_video_oqa_vcgpt_3_context.py │ │ ├── eval_video_oqa_vcgpt_4_temporal.py │ │ ├── eval_video_oqa_vcgpt_5_consistency.py │ │ ├── inference_audio.py │ │ ├── inference_audio_video.py │ │ ├── inference_video_cap_msvc.py │ │ ├── inference_video_mcqa_egoschema.py │ │ ├── inference_video_mcqa_mvbench.py │ │ ├── inference_video_mcqa_perception_test_mcqa.py │ │ ├── inference_video_mcqa_videomme.py │ │ ├── inference_video_oqa_activitynet.py │ │ ├── inference_video_oqa_vcgpt_consistency.py │ │ └── inference_video_oqa_vcgpt_general.py │ ├── mm_utils.py │ ├── model │ │ ├── __init__.py │ │ ├── beats │ │ │ ├── BEATs.py │ │ │ ├── LICENSE_beats │ │ │ ├── Tokenizers.py │ │ │ ├── __init__.py │ │ │ ├── backbone.py │ │ │ ├── modules.py │ │ │ ├── quantizer.py │ │ │ └── weight_norm_fix.py │ │ ├── encoder.py │ │ ├── mel_filters.npz │ │ ├── projector.py │ │ ├── videollama2_arch.py │ │ ├── videollama2_gemma2.py │ │ ├── videollama2_llama.py │ │ ├── videollama2_mistral.py │ │ ├── videollama2_mixtral.py │ │ ├── videollama2_phi3.py │ │ └── 
videollama2_qwen2.py │ ├── serve │ │ ├── cli.py │ │ ├── controller.py │ │ ├── examples │ │ │ ├── bird-twitter-car.wav │ │ │ ├── desert.jpg │ │ │ ├── door.of.bar.raining2.wav │ │ │ ├── extreme_ironing.jpg │ │ │ └── waterview.jpg │ │ ├── gradio_web_server.py │ │ ├── gradio_web_server_adhoc.py │ │ ├── gradio_web_server_adhoc_av.py │ │ ├── model_worker.py │ │ ├── register_worker.py │ │ ├── sglang_worker.py │ │ └── test_message.py │ ├── train.py │ ├── utils.py │ └── videollama2_trainer.py ├── videollama2_modeling.py ├── videollama_modeling.py ├── videollava │ ├── __init__.py │ ├── constants.py │ ├── conversation.py │ ├── mm_utils.py │ ├── model │ │ ├── __init__.py │ │ ├── apply_delta.py │ │ ├── builder.py │ │ ├── consolidate.py │ │ ├── language_model │ │ │ ├── llava_llama.py │ │ │ ├── llava_mpt.py │ │ │ └── mpt │ │ │ │ ├── adapt_tokenizer.py │ │ │ │ ├── attention.py │ │ │ │ ├── blocks.py │ │ │ │ ├── configuration_mpt.py │ │ │ │ ├── custom_embedding.py │ │ │ │ ├── flash_attn_triton.py │ │ │ │ ├── hf_prefixlm_converter.py │ │ │ │ ├── meta_init_context.py │ │ │ │ ├── modeling_mpt.py │ │ │ │ ├── norm.py │ │ │ │ └── param_init_fns.py │ │ ├── llava_arch.py │ │ ├── make_delta.py │ │ ├── multimodal_encoder │ │ │ ├── builder.py │ │ │ ├── clip_encoder.py │ │ │ └── languagebind │ │ │ │ ├── __init__.py │ │ │ │ ├── audio │ │ │ │ ├── configuration_audio.py │ │ │ │ ├── modeling_audio.py │ │ │ │ ├── processing_audio.py │ │ │ │ └── tokenization_audio.py │ │ │ │ ├── depth │ │ │ │ ├── configuration_depth.py │ │ │ │ ├── modeling_depth.py │ │ │ │ ├── processing_depth.py │ │ │ │ └── tokenization_depth.py │ │ │ │ ├── image │ │ │ │ ├── configuration_image.py │ │ │ │ ├── modeling_image.py │ │ │ │ ├── processing_image.py │ │ │ │ └── tokenization_image.py │ │ │ │ ├── thermal │ │ │ │ ├── configuration_thermal.py │ │ │ │ ├── modeling_thermal.py │ │ │ │ ├── processing_thermal.py │ │ │ │ └── tokenization_thermal.py │ │ │ │ └── video │ │ │ │ ├── configuration_video.py │ │ │ │ ├── modeling_video.py │ │ │ │ ├── processing_video.py │ │ │ │ └── tokenization_video.py │ │ ├── multimodal_projector │ │ │ └── builder.py │ │ └── utils.py │ └── utils.py └── videollava_modeling.py ├── evaluations ├── evaluation.py ├── evaluation_bias.py ├── evaluation_bias_sep.py ├── evaluation_halluc.py ├── evaluation_pep.py ├── evaluation_pep_utils.py └── evaluation_utils.py ├── model_testing_zoo.py └── videohallucer_datasets ├── external_factual └── external_factual.json ├── external_nonfactual └── external_nonfactual.json ├── fact_detect ├── fact_detect.json ├── fact_detect_yn.json └── modify.py ├── interaction ├── conflict.jsonl ├── interaction.json └── stat.py ├── object_relation └── object_relation.json ├── semantic_detail └── semantic_detail.json └── temporal └── temporal.json /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Yuxuan Wang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /assets/teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrick-tssn/VideoHallucer/2c6152e61207fd00db1d6d8b72d4893533828867/assets/teaser.png -------------------------------------------------------------------------------- /baselines/base.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | class ViLLMBaseModel(torch.nn.Module): 4 | def __init__(self, model_path, device): 5 | super().__init__() 6 | self.device = device 7 | self.model_path = model_path 8 | 9 | def forward(self, instruction, videos): 10 | return self.generate(instruction, videos) 11 | 12 | def generate(self, instruction, videos): 13 | """ 14 | instruction: (str) a string of instruction 15 | videos: (list) a list of video paths 16 | Return: (str) a string of generated response 17 | """ 18 | raise NotImplementedError -------------------------------------------------------------------------------- /baselines/config.json: -------------------------------------------------------------------------------- 1 | { 2 | "CKPT_DIR": "checkpoints", 3 | "DATA_DIR": "../videohallucer_datasets" 4 | } -------------------------------------------------------------------------------- /baselines/gemini/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrick-tssn/VideoHallucer/2c6152e61207fd00db1d6d8b72d4893533828867/baselines/gemini/__init__.py -------------------------------------------------------------------------------- /baselines/gemini/extract_frames.py: -------------------------------------------------------------------------------- 1 | import os, shutil, cv2 2 | 3 | def create_frame_output_dir(output_dir): 4 | if not os.path.exists(output_dir): 5 | os.makedirs(output_dir) 6 | else: 7 | shutil.rmtree(output_dir) 8 | os.makedirs(output_dir) 9 | 10 | def extract_frame_from_video(video_file_path, FRAME_EXTRACTION_DIRECTORY, FRAME_PREFIX, FPS=1): 11 | # print(f"Extracting {video_file_path} at 1 frame per second.
This might take a bit...") 12 | # print(video_file_path) 13 | create_frame_output_dir(FRAME_EXTRACTION_DIRECTORY) 14 | vidcap = cv2.VideoCapture(video_file_path) 15 | fps = vidcap.get(cv2.CAP_PROP_FPS) 16 | frame_duration = 1 / fps # Time interval between frames (in seconds) 17 | output_file_prefix = os.path.basename(video_file_path).replace('.', '_') 18 | frame_count = 0 19 | count = 0 20 | while vidcap.isOpened(): 21 | success, frame = vidcap.read() 22 | if not success: # End of video 23 | break 24 | if int(count / fps) == frame_count: # Extract a frame every second 25 | mins = frame_count // 60 26 | secs = frame_count % 60 27 | time_string = f"{mins:02d}:{secs:02d}" 28 | image_name = f"{output_file_prefix}{FRAME_PREFIX}{time_string}.jpg" 29 | output_filename = os.path.join(FRAME_EXTRACTION_DIRECTORY, image_name) 30 | cv2.imwrite(output_filename, frame) 31 | frame_count += 1 32 | count += 1 33 | vidcap.release() # Release the capture object 34 | # print(f"Completed video frame extraction!\n\nExtracted: {frame_count} frames") 35 | 36 | -------------------------------------------------------------------------------- /baselines/gemini/upload.py: -------------------------------------------------------------------------------- 1 | import os 2 | # import google.generativeai as genai 3 | 4 | class File: 5 | def __init__(self, file_path: str, frame_prefix: str, display_name: str = None): 6 | self.file_path = file_path 7 | if display_name: 8 | self.display_name = display_name 9 | self.timestamp = get_timestamp(file_path, frame_prefix) 10 | 11 | def set_file_response(self, response): 12 | self.response = response 13 | 14 | def get_timestamp(filename, FRAME_PREFIX): 15 | """Extracts the timestamp (as an 'MM:SS' string) from a filename with the format 16 | 'output_file_prefix_frame00:00.jpg'.
17 | """ 18 | parts = filename.split(FRAME_PREFIX) 19 | # print(parts) 20 | if len(parts) != 2: 21 | return None # Indicates the filename might be incorrectly formatted 22 | return parts[1].split('.')[0] 23 | 24 | def make_request(prompt, files): 25 | request = [prompt] 26 | for file in files: 27 | request.append(file.timestamp) 28 | request.append(file.response) 29 | return request 30 | -------------------------------------------------------------------------------- /baselines/gpt4o/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrick-tssn/VideoHallucer/2c6152e61207fd00db1d6d8b72d4893533828867/baselines/gpt4o/__init__.py -------------------------------------------------------------------------------- /baselines/gpt4o_modeling.py: -------------------------------------------------------------------------------- 1 | import os, shutil, cv2 2 | from gpt4o.api_wrap import OpenAIAPIWrapper 3 | 4 | from base import ViLLMBaseModel 5 | 6 | 7 | class GPT4O(ViLLMBaseModel): 8 | def __init__(self, model_args): 9 | super().__init__(model_args["model_path"], model_args["device"]) 10 | assert ( 11 | "model_path" in model_args 12 | and "device" in model_args 13 | ) 14 | 15 | self.model = OpenAIAPIWrapper() 16 | self.model_name = 'GPT4O' 17 | 18 | def generate(self, instruction, video_path): 19 | 20 | response, num_tokens = self.model.get_completion(instruction, video_path=video_path) 21 | response = response.strip() 22 | 23 | return response 24 | 25 | def create_frame_output_dir(output_dir): 26 | if not os.path.exists(output_dir): 27 | os.makedirs(output_dir) 28 | else: 29 | shutil.rmtree(output_dir) 30 | os.makedirs(output_dir) 31 | 32 | -------------------------------------------------------------------------------- /baselines/gpt4v/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrick-tssn/VideoHallucer/2c6152e61207fd00db1d6d8b72d4893533828867/baselines/gpt4v/__init__.py -------------------------------------------------------------------------------- /baselines/llamavid/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import LlavaLlamaAttForCausalLM 2 | -------------------------------------------------------------------------------- /baselines/llamavid/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "." 
5 | 6 | # Model Constants 7 | IGNORE_INDEX = -100 8 | IMAGE_TOKEN_INDEX = -200 9 | DEFAULT_IMAGE_TOKEN = "<image>" 10 | DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>" 11 | DEFAULT_IM_START_TOKEN = "<im_start>" 12 | DEFAULT_IM_END_TOKEN = "<im_end>" 13 | -------------------------------------------------------------------------------- /baselines/llamavid/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .language_model.llava_llama_vid import LlavaLlamaAttForCausalLM 2 | -------------------------------------------------------------------------------- /baselines/llamavid/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .clip_encoder import CLIPVisionTower 3 | from .eva_vit import EVAVisionTowerLavis 4 | 5 | def build_vision_tower(vision_tower_cfg, **kwargs): 6 | # vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', "./checkpoints/LLaMA-VID-7B/LAVIS/eva_vit_g.pth")) 7 | # vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', "./checkpoints/LLaMA-VID-7B/LAVIS/eva_vit_g.pth") 8 | vision_tower = "./checkpoints/LLaMA-VID-7B/LAVIS/eva_vit_g.pth" 9 | image_processor = getattr(vision_tower_cfg, 'image_processor', "./model_zoo/OpenAI/clip-vit-large-patch14") 10 | is_absolute_path_exists = os.path.exists(vision_tower) 11 | 12 | if not is_absolute_path_exists: 13 | raise ValueError(f'Vision tower not found: {vision_tower}') 14 | 15 | if "openai" in vision_tower.lower() or "laion" in vision_tower.lower(): 16 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 17 | elif "lavis" in vision_tower.lower() or "eva" in vision_tower.lower(): 18 | return EVAVisionTowerLavis(vision_tower, image_processor, args=vision_tower_cfg, **kwargs) 19 | else: 20 | raise ValueError(f'Unknown vision tower: {vision_tower}') 21 | 22 | -------------------------------------------------------------------------------- /baselines/llamavid/model/multimodal_projector/builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import re 4 | 5 | 6 | class IdentityMap(nn.Module): 7 | def __init__(self): 8 | super().__init__() 9 | 10 | def forward(self, x, *args, **kwargs): 11 | return x 12 | 13 | @property 14 | def config(self): 15 | return {"mm_projector_type": 'identity'} 16 | 17 | 18 | class SimpleResBlock(nn.Module): 19 | def __init__(self, channels): 20 | super().__init__() 21 | self.pre_norm = nn.LayerNorm(channels) 22 | 23 | self.proj = nn.Sequential( 24 | nn.Linear(channels, channels), 25 | nn.GELU(), 26 | nn.Linear(channels, channels) 27 | ) 28 | def forward(self, x): 29 | x = self.pre_norm(x) 30 | return x + self.proj(x) 31 | 32 | 33 | def build_vision_projector(config, delay_load=False, **kwargs): 34 | projector_type = getattr(config, 'mm_projector_type', 'linear') 35 | 36 | if projector_type == 'linear': 37 | return nn.Linear(config.mm_hidden_size, config.hidden_size) 38 | 39 | mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type) 40 | if mlp_gelu_match: 41 | mlp_depth = int(mlp_gelu_match.group(1)) 42 | modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)] 43 | for _ in range(1, mlp_depth): 44 | modules.append(nn.GELU()) 45 | modules.append(nn.Linear(config.hidden_size, config.hidden_size)) 46 | return nn.Sequential(*modules) 47 | 48 | if projector_type == 'identity': 49 | return IdentityMap() 50 | 51
| raise ValueError(f'Unknown projector type: {projector_type}') 52 | -------------------------------------------------------------------------------- /baselines/llamavid/processor/clip-patch14-224/preprocessor_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "crop_size": 224, 3 | "do_center_crop": true, 4 | "do_normalize": true, 5 | "do_resize": true, 6 | "feature_extractor_type": "CLIPFeatureExtractor", 7 | "image_mean": [ 8 | 0.48145466, 9 | 0.4578275, 10 | 0.40821073 11 | ], 12 | "image_std": [ 13 | 0.26862954, 14 | 0.26130258, 15 | 0.27577711 16 | ], 17 | "resample": 3, 18 | "size": 224 19 | } 20 | -------------------------------------------------------------------------------- /baselines/llamavid/processor/clip-patch14-336/preprocessor_config.json: -------------------------------------------------------------------------------- 1 | { 2 | "crop_size": 336, 3 | "do_center_crop": true, 4 | "do_normalize": true, 5 | "do_resize": true, 6 | "feature_extractor_type": "CLIPFeatureExtractor", 7 | "image_mean": [ 8 | 0.48145466, 9 | 0.4578275, 10 | 0.40821073 11 | ], 12 | "image_std": [ 13 | 0.26862954, 14 | 0.26130258, 15 | 0.27577711 16 | ], 17 | "resample": 3, 18 | "size": 336 19 | } 20 | -------------------------------------------------------------------------------- /baselines/llamavid/serve/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrick-tssn/VideoHallucer/2c6152e61207fd00db1d6d8b72d4893533828867/baselines/llamavid/serve/__init__.py -------------------------------------------------------------------------------- /baselines/llamavid/serve/examples/Avatar.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrick-tssn/VideoHallucer/2c6152e61207fd00db1d6d8b72d4893533828867/baselines/llamavid/serve/examples/Avatar.png -------------------------------------------------------------------------------- /baselines/llamavid/serve/examples/Avengers.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrick-tssn/VideoHallucer/2c6152e61207fd00db1d6d8b72d4893533828867/baselines/llamavid/serve/examples/Avengers.jpg -------------------------------------------------------------------------------- /baselines/llamavid/serve/examples/Forrest_Gump.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrick-tssn/VideoHallucer/2c6152e61207fd00db1d6d8b72d4893533828867/baselines/llamavid/serve/examples/Forrest_Gump.jpg -------------------------------------------------------------------------------- /baselines/llamavid/serve/examples/Interstellar.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrick-tssn/VideoHallucer/2c6152e61207fd00db1d6d8b72d4893533828867/baselines/llamavid/serve/examples/Interstellar.jpg -------------------------------------------------------------------------------- /baselines/llamavid/serve/examples/Titanic.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrick-tssn/VideoHallucer/2c6152e61207fd00db1d6d8b72d4893533828867/baselines/llamavid/serve/examples/Titanic.jpg -------------------------------------------------------------------------------- /baselines/llamavid/serve/examples/extreme_ironing.jpg: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrick-tssn/VideoHallucer/2c6152e61207fd00db1d6d8b72d4893533828867/baselines/llamavid/serve/examples/extreme_ironing.jpg -------------------------------------------------------------------------------- /baselines/llamavid/serve/examples/waterview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrick-tssn/VideoHallucer/2c6152e61207fd00db1d6d8b72d4893533828867/baselines/llamavid/serve/examples/waterview.jpg -------------------------------------------------------------------------------- /baselines/llamavid/serve/register_worker.py: -------------------------------------------------------------------------------- 1 | """ 2 | Manually register workers. 3 | 4 | Usage: 5 | python3 -m fastchat.serve.register_worker --controller http://localhost:21001 --worker-name http://localhost:21002 6 | """ 7 | 8 | import argparse 9 | 10 | import requests 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--controller-address", type=str) 15 | parser.add_argument("--worker-name", type=str) 16 | parser.add_argument("--check-heart-beat", action="store_true") 17 | args = parser.parse_args() 18 | 19 | url = args.controller_address + "/register_worker" 20 | data = { 21 | "worker_name": args.worker_name, 22 | "check_heart_beat": args.check_heart_beat, 23 | "worker_status": None, 24 | } 25 | r = requests.post(url, json=data) 26 | assert r.status_code == 200 27 | -------------------------------------------------------------------------------- /baselines/llamavid/train/train_mem.py: -------------------------------------------------------------------------------- 1 | # Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright: 2 | # Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright: 3 | # Make it more memory efficient by monkey patching the LLaMA model with FlashAttn. 4 | 5 | # Need to call this before importing transformers. 6 | from llamavid.train.llama_flash_attn_monkey_patch import replace_llama_attn_with_flash_attn 7 | 8 | replace_llama_attn_with_flash_attn() 9 | 10 | from llamavid.train.train import train 11 | 12 | if __name__ == "__main__": 13 | train() 14 | -------------------------------------------------------------------------------- /baselines/llava/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import LlavaLlamaForCausalLM 2 | -------------------------------------------------------------------------------- /baselines/llava/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "." 
5 | 6 | # Model Constants 7 | IGNORE_INDEX = -100 8 | IMAGE_TOKEN_INDEX = -200 9 | DEFAULT_IMAGE_TOKEN = "<image>" 10 | DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>" 11 | DEFAULT_IM_START_TOKEN = "<im_start>" 12 | DEFAULT_IM_END_TOKEN = "<im_end>" 13 | IMAGE_PLACEHOLDER = "<image-placeholder>" 14 | -------------------------------------------------------------------------------- /baselines/llava/eval/table/model.jsonl: -------------------------------------------------------------------------------- 1 | {"model_id": "vicuna-13b:20230322-clean-lang", "model_name": "vicuna-13b", "model_version": "20230322-clean-lang", "model_metadata": "vicuna-13b-20230322-clean-lang"} 2 | {"model_id": "alpaca-13b:v1", "model_name": "alpaca-13b", "model_version": "v1", "model_metadata": "alpaca-13b"} 3 | {"model_id": "llama-13b:v1", "model_name": "llama-13b", "model_version": "v1", "model_metadata": "hf-llama-13b"} 4 | {"model_id": "bard:20230327", "model_name": "bard", "model_version": "20230327", "model_metadata": "Google Bard 20230327"} 5 | {"model_id": "gpt-3.5-turbo:20230327", "model_name": "gpt-3.5-turbo", "model_version": "20230327", "model_metadata": "OpenAI ChatGPT gpt-3.5-turbo Chat Completion"} 6 | -------------------------------------------------------------------------------- /baselines/llava/eval/table/reviewer.jsonl: -------------------------------------------------------------------------------- 1 | {"reviewer_id": "gpt-4-0328-default", "prompt_id": 1, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for general questions"} 2 | {"reviewer_id": "gpt-4-0328-coding", "prompt_id": 2, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for coding questions"} 3 | {"reviewer_id": "gpt-4-0328-math", "prompt_id": 3, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for math questions"} 4 | {"reviewer_id": "gpt-4-0417-visual", "prompt_id": 4, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for math questions"} 5 | -------------------------------------------------------------------------------- /baselines/llava/eval/webpage/figures/alpaca.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrick-tssn/VideoHallucer/2c6152e61207fd00db1d6d8b72d4893533828867/baselines/llava/eval/webpage/figures/alpaca.png -------------------------------------------------------------------------------- /baselines/llava/eval/webpage/figures/bard.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrick-tssn/VideoHallucer/2c6152e61207fd00db1d6d8b72d4893533828867/baselines/llava/eval/webpage/figures/bard.jpg -------------------------------------------------------------------------------- /baselines/llava/eval/webpage/figures/chatgpt.svg: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /baselines/llava/eval/webpage/figures/llama.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrick-tssn/VideoHallucer/2c6152e61207fd00db1d6d8b72d4893533828867/baselines/llava/eval/webpage/figures/llama.jpg -------------------------------------------------------------------------------- /baselines/llava/eval/webpage/figures/swords_FILL0_wght300_GRAD0_opsz48.svg: -------------------------------------------------------------------------------- 1 |
-------------------------------------------------------------------------------- /baselines/llava/eval/webpage/figures/vicuna.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrick-tssn/VideoHallucer/2c6152e61207fd00db1d6d8b72d4893533828867/baselines/llava/eval/webpage/figures/vicuna.jpeg -------------------------------------------------------------------------------- /baselines/llava/model/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | from .language_model.llava_llama import LlavaLlamaForCausalLM, LlavaConfig 3 | from .language_model.llava_mpt import LlavaMptForCausalLM, LlavaMptConfig 4 | from .language_model.llava_mistral import LlavaMistralForCausalLM, LlavaMistralConfig 5 | except: 6 | pass 7 | -------------------------------------------------------------------------------- /baselines/llava/model/apply_delta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m fastchat.model.apply_delta --base ~/model_weights/llama-7b --target ~/model_weights/vicuna-7b --delta lmsys/vicuna-7b-delta 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from tqdm import tqdm 9 | from transformers import AutoTokenizer, AutoModelForCausalLM 10 | from llava import LlavaLlamaForCausalLM 11 | 12 | 13 | def apply_delta(base_model_path, target_model_path, delta_path): 14 | print("Loading base model") 15 | base = AutoModelForCausalLM.from_pretrained( 16 | base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | 18 | print("Loading delta") 19 | delta = LlavaLlamaForCausalLM.from_pretrained(delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 20 | delta_tokenizer = AutoTokenizer.from_pretrained(delta_path) 21 | 22 | print("Applying delta") 23 | for name, param in tqdm(delta.state_dict().items(), desc="Applying delta"): 24 | if name not in base.state_dict(): 25 | assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model' 26 | continue 27 | if param.data.shape == base.state_dict()[name].shape: 28 | param.data += base.state_dict()[name] 29 | else: 30 | assert name in ['model.embed_tokens.weight', 'lm_head.weight'], \ 31 | f'{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}' 32 | bparam = base.state_dict()[name] 33 | param.data[:bparam.shape[0], :bparam.shape[1]] += bparam 34 | 35 | print("Saving target model") 36 | delta.save_pretrained(target_model_path) 37 | delta_tokenizer.save_pretrained(target_model_path) 38 | 39 | 40 | if __name__ == "__main__": 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument("--base-model-path", type=str, required=True) 43 | parser.add_argument("--target-model-path", type=str, required=True) 44 | parser.add_argument("--delta-path", type=str, required=True) 45 | 46 | args = parser.parse_args() 47 | 48 | apply_delta(args.base_model_path, args.target_model_path, args.delta_path) 49 | -------------------------------------------------------------------------------- /baselines/llava/model/consolidate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m llava.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from transformers import AutoTokenizer, AutoModelForCausalLM 9 | from llava.model import * 10 | from llava.model.utils import 
auto_upgrade 11 | 12 | 13 | def consolidate_ckpt(src_path, dst_path): 14 | print("Loading model") 15 | auto_upgrade(src_path) 16 | src_model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False) 18 | src_model.save_pretrained(dst_path) 19 | src_tokenizer.save_pretrained(dst_path) 20 | 21 | 22 | if __name__ == "__main__": 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--src", type=str, required=True) 25 | parser.add_argument("--dst", type=str, required=True) 26 | 27 | args = parser.parse_args() 28 | 29 | consolidate_ckpt(args.src, args.dst) 30 | -------------------------------------------------------------------------------- /baselines/llava/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .clip_encoder import CLIPVisionTower, CLIPVisionTowerS2 3 | 4 | 5 | def build_vision_tower(vision_tower_cfg, **kwargs): 6 | vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None)) 7 | is_absolute_path_exists = os.path.exists(vision_tower) 8 | use_s2 = getattr(vision_tower_cfg, 's2', False) 9 | local_file = "checkpoints/clip-vit-large-patch14-336" 10 | if os.path.exists(local_file): vision_tower = local_file 11 | if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion") or "ShareGPT4V" in vision_tower: 12 | if use_s2: 13 | return CLIPVisionTowerS2(vision_tower, args=vision_tower_cfg, **kwargs) 14 | else: 15 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 16 | 17 | raise ValueError(f'Unknown vision tower: {vision_tower}') 18 | -------------------------------------------------------------------------------- /baselines/llava/model/multimodal_projector/builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import re 4 | 5 | 6 | class IdentityMap(nn.Module): 7 | def __init__(self): 8 | super().__init__() 9 | 10 | def forward(self, x, *args, **kwargs): 11 | return x 12 | 13 | @property 14 | def config(self): 15 | return {"mm_projector_type": 'identity'} 16 | 17 | 18 | class SimpleResBlock(nn.Module): 19 | def __init__(self, channels): 20 | super().__init__() 21 | self.pre_norm = nn.LayerNorm(channels) 22 | 23 | self.proj = nn.Sequential( 24 | nn.Linear(channels, channels), 25 | nn.GELU(), 26 | nn.Linear(channels, channels) 27 | ) 28 | def forward(self, x): 29 | x = self.pre_norm(x) 30 | return x + self.proj(x) 31 | 32 | 33 | def build_vision_projector(config, delay_load=False, **kwargs): 34 | projector_type = getattr(config, 'mm_projector_type', 'linear') 35 | 36 | if projector_type == 'linear': 37 | return nn.Linear(config.mm_hidden_size, config.hidden_size) 38 | 39 | mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type) 40 | if mlp_gelu_match: 41 | mlp_depth = int(mlp_gelu_match.group(1)) 42 | modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)] 43 | for _ in range(1, mlp_depth): 44 | modules.append(nn.GELU()) 45 | modules.append(nn.Linear(config.hidden_size, config.hidden_size)) 46 | return nn.Sequential(*modules) 47 | 48 | if projector_type == 'identity': 49 | return IdentityMap() 50 | 51 | raise ValueError(f'Unknown projector type: {projector_type}') 52 | -------------------------------------------------------------------------------- 
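A minimal usage sketch for the projector builder above (not part of the repository). It assumes baselines/ is on PYTHONPATH; the SimpleNamespace config and the 1024/4096/576 sizes are illustrative stand-ins for the CLIP feature width, LLM hidden size, and patch-token count. With mm_projector_type="mlp2x_gelu", build_vision_projector returns a Linear -> GELU -> Linear stack.

from types import SimpleNamespace

import torch
from llava.model.multimodal_projector.builder import build_vision_projector

# Hypothetical config carrying only the attributes the builder reads.
cfg = SimpleNamespace(mm_projector_type="mlp2x_gelu", mm_hidden_size=1024, hidden_size=4096)
projector = build_vision_projector(cfg)  # Sequential(Linear(1024, 4096), GELU(), Linear(4096, 4096))

vision_feats = torch.randn(1, 576, 1024)  # e.g. a 24x24 grid of CLIP patch tokens
print(projector(vision_feats).shape)      # torch.Size([1, 576, 4096])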
/baselines/llava/model/utils.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoConfig 2 | 3 | 4 | def auto_upgrade(config): 5 | cfg = AutoConfig.from_pretrained(config) 6 | if 'llava' in config and 'llava' not in cfg.model_type: 7 | assert cfg.model_type == 'llama' 8 | print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.") 9 | print("You must upgrade the checkpoint to the new code base (this can be done automatically).") 10 | confirm = input("Please confirm that you want to upgrade the checkpoint. [Y/N]") 11 | if confirm.lower() in ["y", "yes"]: 12 | print("Upgrading checkpoint...") 13 | assert len(cfg.architectures) == 1 14 | setattr(cfg.__class__, "model_type", "llava") 15 | cfg.architectures[0] = 'LlavaLlamaForCausalLM' 16 | cfg.save_pretrained(config) 17 | print("Checkpoint upgraded.") 18 | else: 19 | print("Checkpoint upgrade aborted.") 20 | exit(1) 21 | -------------------------------------------------------------------------------- /baselines/llava/serve/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrick-tssn/VideoHallucer/2c6152e61207fd00db1d6d8b72d4893533828867/baselines/llava/serve/__init__.py -------------------------------------------------------------------------------- /baselines/llava/serve/examples/extreme_ironing.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrick-tssn/VideoHallucer/2c6152e61207fd00db1d6d8b72d4893533828867/baselines/llava/serve/examples/extreme_ironing.jpg -------------------------------------------------------------------------------- /baselines/llava/serve/examples/waterview.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrick-tssn/VideoHallucer/2c6152e61207fd00db1d6d8b72d4893533828867/baselines/llava/serve/examples/waterview.jpg -------------------------------------------------------------------------------- /baselines/llava/serve/register_worker.py: -------------------------------------------------------------------------------- 1 | """ 2 | Manually register workers. 
3 | 4 | Usage: 5 | python3 -m fastchat.serve.register_worker --controller http://localhost:21001 --worker-name http://localhost:21002 6 | """ 7 | 8 | import argparse 9 | 10 | import requests 11 | 12 | if __name__ == "__main__": 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument("--controller-address", type=str) 15 | parser.add_argument("--worker-name", type=str) 16 | parser.add_argument("--check-heart-beat", action="store_true") 17 | args = parser.parse_args() 18 | 19 | url = args.controller_address + "/register_worker" 20 | data = { 21 | "worker_name": args.worker_name, 22 | "check_heart_beat": args.check_heart_beat, 23 | "worker_status": None, 24 | } 25 | r = requests.post(url, json=data) 26 | assert r.status_code == 200 27 | -------------------------------------------------------------------------------- /baselines/llava/serve/test_message.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import json 3 | 4 | import requests 5 | 6 | from llava.conversation import default_conversation 7 | 8 | 9 | def main(): 10 | if args.worker_address: 11 | worker_addr = args.worker_address 12 | else: 13 | controller_addr = args.controller_address 14 | ret = requests.post(controller_addr + "/refresh_all_workers") 15 | ret = requests.post(controller_addr + "/list_models") 16 | models = ret.json()["models"] 17 | models.sort() 18 | print(f"Models: {models}") 19 | 20 | ret = requests.post(controller_addr + "/get_worker_address", 21 | json={"model": args.model_name}) 22 | worker_addr = ret.json()["address"] 23 | print(f"worker_addr: {worker_addr}") 24 | 25 | if worker_addr == "": 26 | return 27 | 28 | conv = default_conversation.copy() 29 | conv.append_message(conv.roles[0], args.message) 30 | prompt = conv.get_prompt() 31 | 32 | headers = {"User-Agent": "LLaVA Client"} 33 | pload = { 34 | "model": args.model_name, 35 | "prompt": prompt, 36 | "max_new_tokens": args.max_new_tokens, 37 | "temperature": 0.7, 38 | "stop": conv.sep, 39 | } 40 | response = requests.post(worker_addr + "/worker_generate_stream", headers=headers, 41 | json=pload, stream=True) 42 | 43 | print(prompt.replace(conv.sep, "\n"), end="") 44 | for chunk in response.iter_lines(chunk_size=8192, decode_unicode=False, delimiter=b"\0"): 45 | if chunk: 46 | data = json.loads(chunk.decode("utf-8")) 47 | output = data["text"].split(conv.sep)[-1] 48 | print(output, end="\r") 49 | print("") 50 | 51 | 52 | if __name__ == "__main__": 53 | parser = argparse.ArgumentParser() 54 | parser.add_argument("--controller-address", type=str, default="http://localhost:21001") 55 | parser.add_argument("--worker-address", type=str) 56 | parser.add_argument("--model-name", type=str, default="facebook/opt-350m") 57 | parser.add_argument("--max-new-tokens", type=int, default=32) 58 | parser.add_argument("--message", type=str, default= 59 | "Tell me a story with more than 1000 words.") 60 | args = parser.parse_args() 61 | 62 | main() 63 | -------------------------------------------------------------------------------- /baselines/llava/train/train_mem.py: -------------------------------------------------------------------------------- 1 | from llava.train.train import train 2 | 3 | if __name__ == "__main__": 4 | train(attn_implementation="flash_attention_2") 5 | -------------------------------------------------------------------------------- /baselines/llava/train/train_xformers.py: -------------------------------------------------------------------------------- 1 | # Make it more memory efficient by 
monkey patching the LLaMA model with xformers attention. 2 | 3 | # Need to call this before importing transformers. 4 | from llava.train.llama_xformers_attn_monkey_patch import ( 5 | replace_llama_attn_with_xformers_attn, 6 | ) 7 | 8 | replace_llama_attn_with_xformers_attn() 9 | 10 | from llava.train.train import train 11 | 12 | if __name__ == "__main__": 13 | train() 14 | -------------------------------------------------------------------------------- /baselines/llavavid/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import LlavaLlamaForCausalLM 2 | -------------------------------------------------------------------------------- /baselines/llavavid/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "." 5 | 6 | # Model Constants 7 | IGNORE_INDEX = -100 8 | IMAGE_TOKEN_INDEX = -200 9 | DEFAULT_IMAGE_TOKEN = "<image>" 10 | DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>" 11 | DEFAULT_IM_START_TOKEN = "<im_start>" 12 | DEFAULT_IM_END_TOKEN = "<im_end>" 13 | IMAGE_PLACEHOLDER = "<image-placeholder>" 14 | -------------------------------------------------------------------------------- /baselines/llavavid/model/__init__.py: -------------------------------------------------------------------------------- 1 | # try: 2 | from .language_model.llava_llama import LlavaLlamaForCausalLM, LlavaConfig 3 | from .language_model.llava_mpt import LlavaMptForCausalLM, LlavaMptConfig 4 | from .language_model.llava_mistral import LlavaMistralForCausalLM, LlavaMistralConfig 5 | # except: 6 | # pass 7 | -------------------------------------------------------------------------------- /baselines/llavavid/model/apply_delta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m fastchat.model.apply_delta --base ~/model_weights/llama-7b --target ~/model_weights/vicuna-7b --delta lmsys/vicuna-7b-delta 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from tqdm import tqdm 9 | from transformers import AutoTokenizer, AutoModelForCausalLM 10 | from llavavid import LlavaLlamaForCausalLM 11 | 12 | 13 | def apply_delta(base_model_path, target_model_path, delta_path): 14 | print("Loading base model") 15 | base = AutoModelForCausalLM.from_pretrained( 16 | base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | 18 | print("Loading delta") 19 | delta = LlavaLlamaForCausalLM.from_pretrained(delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 20 | delta_tokenizer = AutoTokenizer.from_pretrained(delta_path) 21 | 22 | print("Applying delta") 23 | for name, param in tqdm(delta.state_dict().items(), desc="Applying delta"): 24 | if name not in base.state_dict(): 25 | assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model' 26 | continue 27 | if param.data.shape == base.state_dict()[name].shape: 28 | param.data += base.state_dict()[name] 29 | else: 30 | assert name in ['model.embed_tokens.weight', 'lm_head.weight'], \ 31 | f'{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}' 32 | bparam = base.state_dict()[name] 33 | param.data[:bparam.shape[0], :bparam.shape[1]] += bparam 34 | 35 | print("Saving target model") 36 | delta.save_pretrained(target_model_path) 37 | delta_tokenizer.save_pretrained(target_model_path) 38 | 39 | 40 | if __name__ == "__main__": 41 | parser = argparse.ArgumentParser() 42 |
parser.add_argument("--base-model-path", type=str, required=True) 43 | parser.add_argument("--target-model-path", type=str, required=True) 44 | parser.add_argument("--delta-path", type=str, required=True) 45 | 46 | args = parser.parse_args() 47 | 48 | apply_delta(args.base_model_path, args.target_model_path, args.delta_path) 49 | -------------------------------------------------------------------------------- /baselines/llavavid/model/consolidate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m llava.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from transformers import AutoTokenizer, AutoModelForCausalLM 9 | from llavavid.model import * 10 | from llavavid.model.utils import auto_upgrade 11 | 12 | 13 | def consolidate_ckpt(src_path, dst_path): 14 | print("Loading model") 15 | auto_upgrade(src_path) 16 | src_model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False) 18 | src_model.save_pretrained(dst_path) 19 | src_tokenizer.save_pretrained(dst_path) 20 | 21 | 22 | if __name__ == "__main__": 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--src", type=str, required=True) 25 | parser.add_argument("--dst", type=str, required=True) 26 | 27 | args = parser.parse_args() 28 | 29 | consolidate_ckpt(args.src, args.dst) 30 | -------------------------------------------------------------------------------- /baselines/llavavid/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from llavavid.model.multimodal_encoder.clip_encoder import CLIPVisionTower 3 | 4 | 5 | def build_vision_tower(vision_tower_cfg, **kwargs): 6 | vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None)) 7 | is_absolute_path_exists = os.path.exists(vision_tower) 8 | if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion") or "ShareGPT4V" in vision_tower: 9 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 10 | 11 | raise ValueError(f'Unknown vision tower: {vision_tower}') 12 | -------------------------------------------------------------------------------- /baselines/llavavid/model/multimodal_projector/builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import re 4 | 5 | 6 | class IdentityMap(nn.Module): 7 | def __init__(self): 8 | super().__init__() 9 | 10 | def forward(self, x, *args, **kwargs): 11 | return x 12 | 13 | @property 14 | def config(self): 15 | return {"mm_projector_type": 'identity'} 16 | 17 | 18 | class SimpleResBlock(nn.Module): 19 | def __init__(self, channels): 20 | super().__init__() 21 | self.pre_norm = nn.LayerNorm(channels) 22 | 23 | self.proj = nn.Sequential( 24 | nn.Linear(channels, channels), 25 | nn.GELU(), 26 | nn.Linear(channels, channels) 27 | ) 28 | def forward(self, x): 29 | x = self.pre_norm(x) 30 | return x + self.proj(x) 31 | 32 | 33 | def build_vision_projector(config, delay_load=False, **kwargs): 34 | projector_type = getattr(config, 'mm_projector_type', 'linear') 35 | 36 | if projector_type == 'linear': 37 | return nn.Linear(config.mm_hidden_size, config.hidden_size) 38 | 39 | mlp_gelu_match = 
re.match(r'^mlp(\d+)x_gelu$', projector_type) 40 | if mlp_gelu_match: 41 | mlp_depth = int(mlp_gelu_match.group(1)) 42 | modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)] 43 | for _ in range(1, mlp_depth): 44 | modules.append(nn.GELU()) 45 | modules.append(nn.Linear(config.hidden_size, config.hidden_size)) 46 | return nn.Sequential(*modules) 47 | 48 | if projector_type == 'identity': 49 | return IdentityMap() 50 | 51 | raise ValueError(f'Unknown projector type: {projector_type}') 52 | -------------------------------------------------------------------------------- /baselines/llavavid/model/multimodal_resampler/builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | 3 | from .spatial_pool import SpatialPool 4 | 5 | 6 | class IdentityMap(torch.nn.Module): 7 | def __init__(self): 8 | super().__init__() 9 | 10 | def forward(self, x, *args, **kwargs): 11 | return x 12 | 13 | @property 14 | def config(self): 15 | return {"mm_resampler_type": None} 16 | 17 | 18 | def build_vision_resampler(model_args, delay_load=False, **kwargs): 19 | resampler_type = getattr(model_args, "mm_resampler_type", None) 20 | if resampler_type == "spatial_pool": 21 | return SpatialPool(model_args, **kwargs) 22 | elif resampler_type is None: 23 | return IdentityMap() 24 | 25 | raise ValueError(f"Unknown resampler type: {resampler_type}") 26 | -------------------------------------------------------------------------------- /baselines/llavavid/model/multimodal_resampler/spatial_pool.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import math 4 | 5 | 6 | class SpatialPool(nn.Module): 7 | def __init__(self, model_args, vision_tower): 8 | super().__init__() 9 | 10 | self.mode = model_args.mm_spatial_pool_mode 11 | self.stride = model_args.mm_spatial_pool_stride 12 | # import pdb; pdb.set_trace() 13 | self.out_channels = getattr(model_args, "mm_spatial_pool_out_channels", vision_tower.hidden_size) 14 | 15 | if self.mode == "average": 16 | self.pool = nn.AvgPool2d(kernel_size=self.stride, stride=self.stride) 17 | elif self.mode == "max": 18 | self.pool = nn.MaxPool2d(kernel_size=self.stride, stride=self.stride) 19 | elif self.mode == "conv": 20 | self.pool = nn.Conv2d(in_channels=vision_tower.hidden_size, out_channels=self.out_channels, kernel_size=self.stride, stride=self.stride) 21 | else: 22 | raise ValueError(f"Unknown pooling mode: {self.mode}.") 23 | 24 | def forward(self, image_features, images, *args, **kwargs): 25 | # import pdb; pdb.set_trace() 26 | ori_W = int(math.sqrt(image_features.shape[1] * images.shape[3] // images.shape[2])) 27 | ori_H = int(ori_W * images.shape[2] // images.shape[3]) 28 | 29 | B, _, F = image_features.shape 30 | 31 | image_features_spatial = image_features.view(B, ori_H, ori_W, F).permute(0, 3, 1, 2) 32 | image_features_spatial_pool = self.pool(image_features_spatial) 33 | 34 | return image_features_spatial_pool.flatten(2).transpose(1, 2).contiguous() 35 | 36 | @property 37 | def config(self): 38 | return { 39 | "mm_resampler_type": "spatial_pool", 40 | "mm_spatial_pool_stride": self.stride, 41 | "mm_spatial_pool_mode": self.mode, 42 | "mm_spatial_pool_out_channels": self.out_channels, 43 | } 44 | 45 | @property 46 | def hidden_size(self): 47 | return self.out_channels 48 | -------------------------------------------------------------------------------- /baselines/llavavid/model/utils.py:
-------------------------------------------------------------------------------- 1 | from transformers import AutoConfig 2 | 3 | 4 | def auto_upgrade(config): 5 | cfg = AutoConfig.from_pretrained(config) 6 | if 'llava' in config and 'llava' not in cfg.model_type: 7 | assert cfg.model_type == 'llama' 8 | print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.") 9 | print("You must upgrade the checkpoint to the new code base (this can be done automatically).") 10 | confirm = input("Please confirm that you want to upgrade the checkpoint. [Y/N]") 11 | if confirm.lower() in ["y", "yes"]: 12 | print("Upgrading checkpoint...") 13 | assert len(cfg.architectures) == 1 14 | setattr(cfg.__class__, "model_type", "llava") 15 | cfg.architectures[0] = 'LlavaLlamaForCausalLM' 16 | cfg.save_pretrained(config) 17 | print("Checkpoint upgraded.") 18 | else: 19 | print("Checkpoint upgrade aborted.") 20 | exit(1) 21 | -------------------------------------------------------------------------------- /baselines/minigpt4/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | import os 9 | import sys 10 | 11 | from omegaconf import OmegaConf 12 | 13 | from minigpt4.common.registry import registry 14 | 15 | from minigpt4.datasets.builders import * 16 | from minigpt4.models import * 17 | from minigpt4.processors import * 18 | from minigpt4.tasks import * 19 | 20 | 21 | root_dir = os.path.dirname(os.path.abspath(__file__)) 22 | default_cfg = OmegaConf.load(os.path.join(root_dir, "configs/default.yaml")) 23 | 24 | registry.register_path("library_root", root_dir) 25 | repo_root = os.path.join(root_dir, "..") 26 | registry.register_path("repo_root", repo_root) 27 | cache_root = os.path.join(repo_root, default_cfg.env.cache_root) 28 | registry.register_path("cache_root", cache_root) 29 | 30 | registry.register("MAX_INT", sys.maxsize) 31 | registry.register("SPLIT_NAMES", ["train", "val", "test"]) 32 | -------------------------------------------------------------------------------- /baselines/minigpt4/common/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrick-tssn/VideoHallucer/2c6152e61207fd00db1d6d8b72d4893533828867/baselines/minigpt4/common/__init__.py -------------------------------------------------------------------------------- /baselines/minigpt4/common/gradcam.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | from matplotlib import pyplot as plt 3 | from scipy.ndimage import filters 4 | from skimage import transform as skimage_transform 5 | 6 | 7 | def getAttMap(img, attMap, blur=True, overlap=True): 8 | attMap -= attMap.min() 9 | if attMap.max() > 0: 10 | attMap /= attMap.max() 11 | attMap = skimage_transform.resize(attMap, (img.shape[:2]), order=3, mode="constant") 12 | if blur: 13 | attMap = filters.gaussian_filter(attMap, 0.02 * max(img.shape[:2])) 14 | attMap -= attMap.min() 15 | attMap /= attMap.max() 16 | cmap = plt.get_cmap("jet") 17 | attMapV = cmap(attMap) 18 | attMapV = np.delete(attMapV, 3, 2) 19 | if overlap: 20 | attMap = ( 21 | 1 * (1 - attMap**0.7).reshape(attMap.shape + (1,)) * img 22 | + (attMap**0.7).reshape(attMap.shape + 
(1,)) * attMapV 23 | ) 24 | return attMap 25 | -------------------------------------------------------------------------------- /baselines/minigpt4/common/vqa_tools/VQA/PythonEvaluationTools/vqaEvaluation/__init__.py: -------------------------------------------------------------------------------- 1 | author='aagrawal' 2 | -------------------------------------------------------------------------------- /baselines/minigpt4/common/vqa_tools/VQA/PythonHelperTools/vqaTools/__init__.py: -------------------------------------------------------------------------------- 1 | __author__ = 'aagrawal' 2 | -------------------------------------------------------------------------------- /baselines/minigpt4/common/vqa_tools/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | __author__ = "aagrawal" 9 | -------------------------------------------------------------------------------- /baselines/minigpt4/common/vqa_tools/aokvqa/data_scripts/build_vocab.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | from collections import Counter 4 | import pathlib 5 | 6 | from load_aokvqa import load_aokvqa 7 | 8 | 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--aokvqa-dir', type=pathlib.Path, required=True, dest='aokvqa_dir') 11 | parser.add_argument('--out', type=pathlib.Path, required=True, dest='output_file') 12 | args = parser.parse_args() 13 | 14 | 15 | # Build vocab from train set: correct choices + (direct answers appearing in >= 3 ) 16 | 17 | train_set = load_aokvqa(args.aokvqa_dir, 'train') 18 | 19 | vocab = [] 20 | all_choices = Counter() 21 | direct_answers = Counter() 22 | 23 | for i in train_set: 24 | vocab.append( i['choices'][i['correct_choice_idx']] ) 25 | all_choices.update(i['choices']) 26 | direct_answers.update(set(i['direct_answers'])) 27 | vocab += [k for k,v in all_choices.items() if v >= 3] 28 | vocab += [k for k,v in direct_answers.items() if v >= 3] 29 | 30 | vocab = sorted(set(vocab)) 31 | print(f"Vocab size: {len(vocab)}") 32 | 33 | # Save vocabulary Output 34 | 35 | with open(args.output_file, 'w') as f: 36 | for v in vocab: 37 | print(v, file=f) 38 | 39 | ## Check validation set coverage 40 | 41 | val_set = load_aokvqa(args.aokvqa_dir, 'val') 42 | 43 | val_acc = [v['choices'][v['correct_choice_idx']] in vocab for v in val_set] 44 | val_acc = sum(val_acc) / len(val_acc) * 100 45 | print(f"Val set coverage: {val_acc:.2f}" ) 46 | -------------------------------------------------------------------------------- /baselines/minigpt4/common/vqa_tools/aokvqa/data_scripts/encode_vocab_clip.py: -------------------------------------------------------------------------------- 1 | import json 2 | from tqdm import tqdm 3 | import argparse 4 | import pathlib 5 | 6 | import torch 7 | import clip 8 | 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--vocab', type=pathlib.Path, required=True, dest='vocab_file') 11 | parser.add_argument('--model-type', type=str, choices=['RN50', 'RN50x4', 'RN50x16', 'RN50x64', 'RN101', 'ViT-B/32', 'ViT-B/16', 'ViT-L/14', 'ViT-L/14@336px'], required=True, dest='model_type') 12 | parser.add_argument('--out', type=pathlib.Path, required=True, dest='output_file') 13 | args = 
parser.parse_args() 14 | 15 | assert args.output_file.suffix == '.pt' 16 | 17 | device = "cuda" if torch.cuda.is_available() else "cpu" 18 | model, preprocess = clip.load(args.model_type, device=device) 19 | 20 | with torch.no_grad(): 21 | a = open(args.vocab_file).read().splitlines() 22 | mc_text = clip.tokenize(a).to(device) 23 | mc_text_features = torch.stack([model.encode_text(mct.unsqueeze(0)).cpu() for mct in tqdm(mc_text)], dim=1)[0] 24 | mc_text_features = mc_text_features.float() 25 | model_name = args.model_type.replace('/', '-').replace('@', '-') 26 | torch.save(mc_text_features, args.output_file) 27 | -------------------------------------------------------------------------------- /baselines/minigpt4/common/vqa_tools/aokvqa/data_scripts/extract_bert_features.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import pathlib 4 | from tqdm import tqdm 5 | 6 | import torch 7 | from transformers import AutoTokenizer, AutoModel 8 | 9 | from load_aokvqa import load_aokvqa 10 | 11 | 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument('--aokvqa-dir', type=pathlib.Path, required=True, dest='aokvqa_dir') 14 | parser.add_argument('--split', type=str, choices=['train', 'val', 'test'], required=True) 15 | parser.add_argument('--out', type=pathlib.Path, required=True, dest='output_file') 16 | args = parser.parse_args() 17 | 18 | assert args.output_file.suffix == '.pt' 19 | 20 | ## Load dataset 21 | 22 | dataset = load_aokvqa(args.aokvqa_dir, args.split) 23 | 24 | ## Load model 25 | 26 | tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens') 27 | model = AutoModel.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens') 28 | device = "cuda" if torch.cuda.is_available() else "cpu" 29 | model = model.to(device) 30 | model.eval() 31 | 32 | def mean_pooling(model_output, attention_mask): 33 | token_embeddings = model_output[0] # First element of model_output contains all token embeddings 34 | input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float() 35 | return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9) 36 | 37 | ## Encoding loop 38 | 39 | with torch.no_grad(): 40 | embeddings = {} 41 | 42 | for d in tqdm(dataset): 43 | encoded_input = tokenizer([d['question']], padding=True, return_tensors='pt') 44 | encoded_input = {k:v.to(device) for k,v in encoded_input.items()} 45 | e = mean_pooling(model(**encoded_input), encoded_input['attention_mask']) 46 | embeddings[d['question_id']] = { 47 | 'question' : e[0].cpu() 48 | } 49 | 50 | torch.save(embeddings, args.output_file) 51 | -------------------------------------------------------------------------------- /baselines/minigpt4/common/vqa_tools/aokvqa/data_scripts/extract_clip_features.py: -------------------------------------------------------------------------------- 1 | import os 2 | from PIL import Image 3 | from tqdm import tqdm 4 | import argparse 5 | import pathlib 6 | 7 | import torch 8 | import clip 9 | 10 | from load_aokvqa import load_aokvqa, get_coco_path 11 | 12 | 13 | parser = argparse.ArgumentParser() 14 | parser.add_argument('--aokvqa-dir', type=pathlib.Path, required=True, dest='aokvqa_dir') 15 | parser.add_argument('--coco-dir', type=pathlib.Path, required=True, dest='coco_dir') 16 | parser.add_argument('--split', type=str, choices=['train', 'val', 'test'], required=True) 17 | parser.add_argument('--model-type', 
type=str, choices=['RN50', 'RN50x4', 'RN50x16', 'RN50x64', 'RN101', 'ViT-B/32', 'ViT-B/16', 'ViT-L/14', 'ViT-L/14@336px'], required=True, dest='model_type') 18 | parser.add_argument('--out', type=pathlib.Path, required=True, dest='output_file') 19 | args = parser.parse_args() 20 | 21 | assert args.output_file.suffix == '.pt' 22 | 23 | ## Load dataset 24 | 25 | dataset = load_aokvqa(args.aokvqa_dir, args.split) 26 | 27 | ## Load model 28 | 29 | device = "cuda" if torch.cuda.is_available() else "cpu" 30 | model, preprocess = clip.load(args.model_type, device=device) 31 | 32 | ## Encoding loop 33 | 34 | with torch.no_grad(): 35 | embeddings = {} 36 | 37 | for d in tqdm(dataset): 38 | q = d["question"] 39 | q_text = clip.tokenize(q).to(device) 40 | q_text_features = model.encode_text(q_text) 41 | 42 | img = Image.open(get_coco_path(args.split, d['image_id'], args.coco_dir)) 43 | img = preprocess(img).unsqueeze(0).to(device) 44 | image_features = model.encode_image(img) 45 | 46 | embeddings[d['question_id']] = { 47 | 'question' : q_text_features[0].float().cpu(), 48 | 'image' : image_features[0].float().cpu(), 49 | } 50 | 51 | torch.save(embeddings, args.output_file) 52 | -------------------------------------------------------------------------------- /baselines/minigpt4/common/vqa_tools/aokvqa/data_scripts/extract_resnet_features.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | import pathlib 4 | from tqdm import tqdm 5 | from PIL import Image 6 | 7 | import torch 8 | import torch.nn as nn 9 | from torchvision import models 10 | from torchvision import transforms as T 11 | 12 | from load_aokvqa import load_aokvqa, get_coco_path 13 | 14 | 15 | parser = argparse.ArgumentParser() 16 | parser.add_argument('--aokvqa-dir', type=pathlib.Path, required=True, dest='aokvqa_dir') 17 | parser.add_argument('--coco-dir', type=pathlib.Path, required=True, dest='coco_dir') 18 | parser.add_argument('--split', type=str, choices=['train', 'val', 'test'], required=True) 19 | parser.add_argument('--out', type=pathlib.Path, required=True, dest='output_file') 20 | args = parser.parse_args() 21 | 22 | assert args.output_file.suffix == '.pt' 23 | 24 | ## Load dataset 25 | 26 | dataset = load_aokvqa(args.aokvqa_dir, args.split) 27 | 28 | ## Load model 29 | 30 | resnet_preprocess = T.Compose([ 31 | T.Resize(size=224, interpolation=T.InterpolationMode.BICUBIC), 32 | T.CenterCrop(size=(224, 224)), 33 | T.ToTensor(), 34 | T.Normalize( 35 | mean=[0.485, 0.456, 0.406], 36 | std=[0.229, 0.224, 0.225] 37 | ) 38 | ]) 39 | 40 | device = "cuda" if torch.cuda.is_available() else "cpu" 41 | 42 | resnet_model = models.resnet50(pretrained=True) 43 | resnet_model = torch.nn.Sequential( 44 | *list(resnet_model.children())[:-1], 45 | nn.Flatten() 46 | ) # strip classification layer 47 | resnet_model = resnet_model.to(device) 48 | 49 | ## Encoding loop 50 | 51 | with torch.no_grad(): 52 | embeddings = {} 53 | 54 | for d in tqdm(dataset): 55 | img = Image.open(get_coco_path(args.split, d['image_id'], args.coco_dir)).convert('RGB') 56 | resnet_input = resnet_preprocess(img).unsqueeze(0).to(device) 57 | resnet_features = resnet_model(resnet_input) 58 | embeddings[d['question_id']] = { 59 | 'image' : resnet_features[0].cpu() 60 | } 61 | 62 | torch.save(embeddings, args.output_file) 63 | -------------------------------------------------------------------------------- /baselines/minigpt4/common/vqa_tools/aokvqa/environment.yml: 
-------------------------------------------------------------------------------- 1 | name: aokvqa 2 | channels: 3 | - pytorch 4 | - nvidia 5 | - huggingface 6 | - conda-forge 7 | - defaults 8 | dependencies: 9 | - python=3.7 10 | - cudatoolkit=11.3 11 | - numpy=1.21.6 12 | - pytorch=1.11.0 13 | - torchvision=0.12.0 14 | - pytorch-lightning=1.6.3 15 | - torchmetrics=0.8.1 16 | - gdown=4.4.0 17 | - pip=22.0.4 18 | - pip: 19 | - argparse==1.4.0 20 | - Pillow==9.0.1 21 | - tensorboard==2.9.0 22 | - ftfy==6.1.1 23 | - regex==2022.3.15 24 | - tqdm==4.64.0 25 | - clip @ git+https://github.com/openai/CLIP.git@b46f5ac7587d2e1862f8b7b1573179d80dcdd620 26 | - openai==0.18.1 27 | - nltk==3.7 28 | - sacrebleu==2.0.0 29 | - sacremoses==0.0.53 30 | - sentence-transformers==2.2.0 31 | - datasets==2.1.0 32 | - tokenizers==0.10.3 33 | - transformers==4.10.3 34 | 35 | # Next: resolve conflict between sentence-transfomers and pytorch-lightning 36 | # pip uninstall sentencepiece 37 | -------------------------------------------------------------------------------- /baselines/minigpt4/common/vqa_tools/aokvqa/evaluation/load_aokvqa.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | 4 | 5 | def load_aokvqa(aokvqa_dir, split, version='v1p0'): 6 | assert split in ['train', 'val', 'test', 'test_w_ans'] 7 | dataset = json.load(open( 8 | os.path.join(aokvqa_dir, f"aokvqa_{version}_{split}.json") 9 | )) 10 | return dataset 11 | 12 | def get_coco_path(split, image_id, coco_dir): 13 | return os.path.join(coco_dir, f"{split}2017", f"{image_id:012}.jpg") 14 | -------------------------------------------------------------------------------- /baselines/minigpt4/common/vqa_tools/aokvqa/evaluation/prepare_predictions.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pathlib 3 | import json 4 | 5 | from load_aokvqa import load_aokvqa 6 | 7 | 8 | if __name__ == '__main__': 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--aokvqa-dir', type=pathlib.Path, required=True, dest='aokvqa_dir') 11 | parser.add_argument('--split', type=str, choices=['train', 'val', 'test'], required=True) 12 | parser.add_argument('--mc', type=argparse.FileType('r'), dest='mc_pred_file') 13 | parser.add_argument('--da', type=argparse.FileType('r'), dest='da_pred_file') 14 | parser.add_argument('--out', type=argparse.FileType('w'), dest='output_file') 15 | args = parser.parse_args() 16 | assert args.mc_pred_file or args.da_pred_file 17 | 18 | dataset = load_aokvqa(args.aokvqa_dir, args.split) 19 | mc_preds = json.load(args.mc_pred_file) if args.mc_pred_file else None 20 | da_preds = json.load(args.da_pred_file) if args.da_pred_file else None 21 | predictions = {} 22 | 23 | for d in dataset: 24 | q = d['question_id'] 25 | predictions[q] = {} 26 | if mc_preds and q in mc_preds.keys(): 27 | predictions[q]['multiple_choice'] = mc_preds[q] 28 | if da_preds and q in da_preds.keys(): 29 | predictions[q]['direct_answer'] = da_preds[q] 30 | 31 | json.dump(predictions, args.output_file) 32 | -------------------------------------------------------------------------------- /baselines/minigpt4/common/vqa_tools/aokvqa/evaluation/remap_predictions.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import pathlib 3 | import json 4 | from tqdm import tqdm 5 | 6 | from sentence_transformers import SentenceTransformer 7 | from sentence_transformers.util import cos_sim 8 | 
9 | from load_aokvqa import load_aokvqa 10 | 11 | 12 | def map_to_choices(dataset, predictions, device='cpu'): 13 | if isinstance(dataset, list): 14 | dataset = { dataset[i]['question_id'] : dataset[i] for i in range(len(dataset)) } 15 | 16 | if all([p in dataset[q]['choices'] for q, p in predictions.items()]): 17 | return predictions 18 | 19 | model = SentenceTransformer('sentence-transformers/average_word_embeddings_glove.6B.300d') 20 | model.to(device) 21 | for q in tqdm(predictions.keys()): 22 | choices = dataset[q]['choices'] 23 | if predictions[q] not in choices: 24 | choice_embeddings = model.encode([predictions[q]] + choices, convert_to_tensor=True) 25 | a_idx = cos_sim(choice_embeddings[0], choice_embeddings[1:]).argmax().item() 26 | predictions[q] = choices[a_idx] 27 | 28 | return predictions 29 | 30 | 31 | if __name__ == '__main__': 32 | parser = argparse.ArgumentParser() 33 | parser.add_argument('--aokvqa-dir', type=pathlib.Path, required=True, dest='aokvqa_dir') 34 | parser.add_argument('--split', type=str, choices=['train', 'val', 'test'], required=True) 35 | parser.add_argument('--pred', type=argparse.FileType('r'), required=True, dest='prediction_file') 36 | parser.add_argument('--out', type=argparse.FileType('w'), required=True, dest='output_file') 37 | args = parser.parse_args() 38 | 39 | 40 | dataset = load_aokvqa(args.aokvqa_dir, args.split) 41 | predictions = json.load(args.prediction_file) 42 | predictions = map_to_choices(dataset, predictions) 43 | 44 | json.dump(predictions, args.output_file) 45 | -------------------------------------------------------------------------------- /baselines/minigpt4/common/vqa_tools/aokvqa/gpt3/README.md: -------------------------------------------------------------------------------- 1 | ## Querying GPT-3 2 | 3 | To follow our experiments which use GPT-3, you must have access to the [OpenAI API](https://openai.com/api/) (at cost). Please retrieve your [organization](https://beta.openai.com/account/org-settings) and [API](https://beta.openai.com/account/api-keys) keys and set them in your environment variables. 4 | 5 | ```bash 6 | export OPENAI_ORG=.... 7 | export OPENAI_API_KEY=... 
8 | ``` 9 | 10 | For producing predictions for both DA and MC settings, run: 11 | ```bash 12 | python gpt3/query_gpt3.py --aokvqa-dir ${AOKVQA_DIR} --split val --out ${PREDS_DIR}/gpt3_val-da.json 13 | python remap_predictions.py --aokvqa-dir ${AOKVQA_DIR} --split val --pred ${PREDS_DIR}/gpt3_val-da.json --out ${PREDS_DIR}/gpt3_val-mc.json 14 | ``` 15 | -------------------------------------------------------------------------------- /baselines/minigpt4/common/vqa_tools/aokvqa/gpt3/caption_inputs.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import pathlib 5 | 6 | from load_aokvqa import load_aokvqa 7 | 8 | 9 | parser = argparse.ArgumentParser() 10 | parser.add_argument('--aokvqa-dir', type=pathlib.Path, required=True, dest='aokvqa_dir') 11 | parser.add_argument('--coco-dir', type=pathlib.Path, required=True, dest='coco_dir') 12 | parser.add_argument('--split', type=str, choices=['train', 'val'], required=True) 13 | parser.add_argument('--out', type=argparse.FileType('w'), required=True, dest='output_file') 14 | args = parser.parse_args() 15 | 16 | aokvqa_set = load_aokvqa(args.aokvqa_dir, args.split) 17 | 18 | coco_captions = json.load(open(os.path.join(args.coco_dir, 'annotations', f'captions_{args.split}2017.json')))['annotations'] 19 | coco_captions = {c['image_id'] : c['caption'] for c in coco_captions} 20 | 21 | captions = { d['question_id'] : coco_captions[d['image_id']] for d in aokvqa_set } 22 | 23 | json.dump(captions, args.output_file) 24 | -------------------------------------------------------------------------------- /baselines/minigpt4/common/vqa_tools/aokvqa/gpt3/rationale_inputs.py: -------------------------------------------------------------------------------- 1 | import json 2 | import argparse 3 | import pathlib 4 | 5 | from load_aokvqa import load_aokvqa 6 | 7 | 8 | parser = argparse.ArgumentParser() 9 | parser.add_argument('--aokvqa-dir', type=pathlib.Path, required=True, dest='aokvqa_dir') 10 | parser.add_argument('--split', type=str, choices=['train', 'val', 'test_w_ans'], required=True) 11 | parser.add_argument('--out', type=argparse.FileType('w'), required=True, dest='output_file') 12 | args = parser.parse_args() 13 | 14 | aokvqa_set = load_aokvqa(args.aokvqa_dir, args.split) 15 | rationales = {d['question_id'] : d['rationales'][0] for d in aokvqa_set} 16 | json.dump(rationales, args.output_file) 17 | -------------------------------------------------------------------------------- /baselines/minigpt4/common/vqa_tools/aokvqa/heuristics/README.md: -------------------------------------------------------------------------------- 1 | ## Heuristics 2 | 3 | ```bash 4 | # These scripts accept the same arguments. 
5 | # heuristics/random_unweighted.py 6 | # heuristics/random_weighted.py 7 | # heuristics/most_common_answer.py 8 | 9 | python heuristics/random_unweighted.py --aokvqa-dir ${AOKVQA_DIR} --split val --mc --out ${PREDS_DIR}/random-unweighted_val-mc.json 10 | # Exclude --mc for the direct answer setting 11 | ``` 12 | -------------------------------------------------------------------------------- /baselines/minigpt4/common/vqa_tools/aokvqa/heuristics/most_common_answer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import argparse 4 | import pathlib 5 | from collections import Counter 6 | 7 | from load_aokvqa import load_aokvqa 8 | 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--aokvqa-dir', type=pathlib.Path, required=True, dest='aokvqa_dir') 12 | parser.add_argument('--split', type=str, choices=['train', 'val', 'test'], required=True) 13 | parser.add_argument('--mc', action='store_true', dest='multiple_choice') 14 | parser.add_argument('--out', type=argparse.FileType('w'), required=True, dest='output_file') 15 | args = parser.parse_args() 16 | 17 | 18 | train_set = load_aokvqa(args.aokvqa_dir, 'train') 19 | train_freq = dict(Counter( 20 | [d['choices'][d['correct_choice_idx']] for d in train_set] 21 | )) 22 | most_common_answer = max(train_freq.keys(), key=train_freq.get) 23 | 24 | ## 25 | 26 | eval_set = load_aokvqa(args.aokvqa_dir, args.split) 27 | 28 | predictions = {} 29 | 30 | for d in eval_set: 31 | q = d['question_id'] 32 | predictions[q] = most_common_answer 33 | 34 | if args.multiple_choice: 35 | choices = [c for c in d['choices'] if c in train_freq.keys()] 36 | if len(choices) > 0: 37 | predictions[q] = max(choices, key=train_freq.get) 38 | 39 | json.dump(predictions, args.output_file) 40 | -------------------------------------------------------------------------------- /baselines/minigpt4/common/vqa_tools/aokvqa/heuristics/random_unweighted.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | from random import seed, sample 4 | import argparse 5 | import pathlib 6 | 7 | from load_aokvqa import load_aokvqa 8 | 9 | 10 | parser = argparse.ArgumentParser() 11 | parser.add_argument('--aokvqa-dir', type=pathlib.Path, required=True, dest='aokvqa_dir') 12 | parser.add_argument('--split', type=str, choices=['train', 'val', 'test'], required=True) 13 | parser.add_argument('--mc', action='store_true', dest='multiple_choice') 14 | parser.add_argument('--out', type=argparse.FileType('w'), required=True, dest='output_file') 15 | args = parser.parse_args() 16 | 17 | seed(0) 18 | 19 | train_set = load_aokvqa(args.aokvqa_dir, 'train') 20 | 21 | if args.multiple_choice is False: 22 | choices = list(set( 23 | [d['choices'][d['correct_choice_idx']] for d in train_set] 24 | )) 25 | 26 | ## 27 | 28 | predictions = {} 29 | 30 | eval_set = load_aokvqa(args.aokvqa_dir, args.split) 31 | 32 | for d in eval_set: 33 | q = d['question_id'] 34 | if args.multiple_choice: 35 | choices = d['choices'] 36 | predictions[q] = sample(choices, 1)[0] 37 | 38 | json.dump(predictions, args.output_file) 39 | -------------------------------------------------------------------------------- /baselines/minigpt4/common/vqa_tools/aokvqa/heuristics/random_weighted.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import numpy as np 4 | import argparse 5 | import pathlib 6 | from collections import 
Counter 7 | 8 | from load_aokvqa import load_aokvqa 9 | 10 | 11 | parser = argparse.ArgumentParser() 12 | parser.add_argument('--aokvqa-dir', type=pathlib.Path, required=True, dest='aokvqa_dir') 13 | parser.add_argument('--split', type=str, choices=['train', 'val', 'test'], required=True) 14 | parser.add_argument('--mc', action='store_true', dest='multiple_choice') 15 | parser.add_argument('--out', type=argparse.FileType('w'), required=True, dest='output_file') 16 | args = parser.parse_args() 17 | 18 | np.random.seed(0) 19 | 20 | train_set = load_aokvqa(args.aokvqa_dir, 'train') 21 | train_freq = dict(Counter( 22 | [d['choices'][d['correct_choice_idx']] for d in train_set] 23 | )) 24 | 25 | if args.multiple_choice is False: 26 | choices = list(train_freq.keys()) 27 | probs = [f / len(train_set) for f in train_freq.values()] 28 | 29 | ## 30 | 31 | predictions = {} 32 | 33 | eval_set = load_aokvqa(args.aokvqa_dir, args.split) 34 | 35 | for d in eval_set: 36 | if args.multiple_choice: 37 | choices = d['choices'] 38 | probs = [train_freq.get(c, 0) for c in choices] 39 | if probs == [0, 0, 0, 0]: 40 | probs = [1, 1, 1, 1] 41 | probs = [p / sum(probs) for p in probs] 42 | 43 | q = d['question_id'] 44 | predictions[q] = np.random.choice(choices, size=1, p=probs)[0] 45 | 46 | json.dump(predictions, args.output_file) 47 | -------------------------------------------------------------------------------- /baselines/minigpt4/common/vqa_tools/aokvqa/load_aokvqa.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | 4 | 5 | def load_aokvqa(aokvqa_dir, split, version='v1p0'): 6 | assert split in ['train', 'val', 'test', 'test_w_ans'] 7 | dataset = json.load(open( 8 | os.path.join(aokvqa_dir, f"aokvqa_{version}_{split}.json") 9 | )) 10 | return dataset 11 | 12 | def get_coco_path(split, image_id, coco_dir): 13 | return os.path.join(coco_dir, f"{split}2017", f"{image_id:012}.jpg") 14 | -------------------------------------------------------------------------------- /baselines/minigpt4/common/vqa_tools/aokvqa/transfer_experiments/README.md: -------------------------------------------------------------------------------- 1 | ## Transfer Learning Experiments 2 | 3 | We use the following training/prediction scripts for the classifier, zero-shot, and contrastive experiments in Table 3. 4 | 5 | ```bash 6 | ## Training 7 | python transfer_experiments/train.py --aokvqa-dir ${AOKVQA_DIR} --vocab ${AOKVQA_DIR}/large_vocab_train.csv --log-dir ${LOG_DIR} 8 | 9 | --backbone clip --clip-model-type ViT-B/32 --train-features ${FEATURES_DIR}/clip-ViT-B-32_train.pt --val-features ${FEATURES_DIR}/clip-ViT-B-32_val.pt 10 | --inputs question # OR --inputs image # OR --inputs question image 11 | # OR 12 | --backbone resnet --train-features ${FEATURES_DIR}/resnet_train.pt --val-features ${FEATURES_DIR}/resnet_val.pt --inputs image 13 | # OR 14 | --backbone bert --train-features ${FEATURES_DIR}/bert_train.pt --val-features ${FEATURES_DIR}/bert_val.pt --inputs question 15 | 16 | --objective classifier 17 | # OR 18 | --objective contrastive --vocab-features ${FEATURE_DIR}/clip-ViT-B-32_large_vocab.pt 19 | ``` 20 | 21 | You can make predictions for CLIP zero-shot or from a classifier/contrastive checkpoint trained above. 
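For intuition, the zero-shot route is essentially nearest-neighbour search in CLIP embedding space. The snippet below is a minimal sketch, not a file from this repo: it assumes the feature dictionaries written by `data_scripts/extract_clip_features.py` and the vocab tensor written by `data_scripts/encode_vocab_clip.py`, and it only approximates the direct-answer scoring; per-question choice restriction and remapping are handled by `transfer_experiments/predict.py` and `remap_predictions.py`.

```python
import json
import torch

def clip_zero_shot_predict(features_pt, vocab_pt, vocab_txt, out_json):
    # {question_id: {'question': Tensor(dim), 'image': Tensor(dim)}}, as saved by extract_clip_features.py
    feats = torch.load(features_pt)
    # (vocab_size, dim) CLIP text embeddings, as saved by encode_vocab_clip.py
    vocab_feats = torch.load(vocab_pt).float()
    vocab = open(vocab_txt).read().splitlines()
    vocab_feats = vocab_feats / vocab_feats.norm(dim=-1, keepdim=True)

    predictions = {}
    for qid, f in feats.items():
        query = (f['question'] + f['image']).float()  # fuse question and image embeddings
        query = query / query.norm()
        scores = vocab_feats @ query                  # cosine similarity against the answer vocabulary
        predictions[qid] = vocab[scores.argmax().item()]

    with open(out_json, 'w') as fp:
        json.dump(predictions, fp)
```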
22 | 23 | ```bash 24 | ## Predicting 25 | python transfer_experiments/predict.py --aokvqa-dir ${AOKVQA_DIR} --out ${PREDS_DIR}/clip-classifier_val-mc.json 26 | 27 | --split val # or test 28 | --features ${FEATURE_DIR}/clip-ViT-B-32_val.pt # adjust for backbone and eval split 29 | 30 | --ckpt path/to/model.ckpt 31 | # OR 32 | --zero-shot --clip-model-type ViT-B/32 33 | --inputs question # OR --inputs image # OR --inputs question image 34 | 35 | --mc # Multiple-choice. Exclude for direct-answer. 36 | 37 | # IF classifier OR direct-answer 38 | --vocab ${AOKVQA_DIR}/large_vocab_train.csv 39 | # IF contrastive/zero-shot AND direct-answer 40 | --vocab-features ${FEATURES_DIR}/clip-ViT-B-32_large_vocab.pt 41 | ``` 42 | -------------------------------------------------------------------------------- /baselines/minigpt4/configs/datasets/cc_sbu/align.yaml: -------------------------------------------------------------------------------- 1 | datasets: 2 | cc_sbu_align: 3 | data_type: images 4 | build_info: 5 | # storage: "/ibex/project/c2090/datasets/cc_sbu_align" 6 | storage: "path/to/cc_sbu_align/dataset" 7 | -------------------------------------------------------------------------------- /baselines/minigpt4/configs/datasets/cc_sbu/defaults.yaml: -------------------------------------------------------------------------------- 1 | datasets: 2 | cc_sbu: 3 | data_type: images 4 | build_info: 5 | storage: /ibex/project/c2133/blip_dataset/cc3m_256/cc3m_cc12m_sbu/{00000..01255}.tar 6 | -------------------------------------------------------------------------------- /baselines/minigpt4/configs/datasets/cmd_video/default.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | cmd_video: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | vis_root: path/to/videos/ 14 | ann_paths: [path/to/annotations.json] 15 | subtitles_path: path/to/subtitles_folder # folder that contains subtitles of .vtt format 16 | model_name: 'llama2' # Language Model Name (available: llama2, mistral) 17 | -------------------------------------------------------------------------------- /baselines/minigpt4/configs/datasets/laion/defaults.yaml: -------------------------------------------------------------------------------- 1 | datasets: 2 | laion: 3 | data_type: images 4 | build_info: 5 | storage: /ibex/project/c2133/blip_dataset/laion_1b/laion_gpu/{00000..10488}.tar 6 | -------------------------------------------------------------------------------- /baselines/minigpt4/configs/datasets/template/default.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | dataset_name: # same as the name of the train_config yaml file 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # let it be images for now even if it is videos 10 | 11 | build_info: # this is the information needed to build the dataset 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | ann_paths: [path/to/annotations_json] # list of paths to annotation files 14 | vis_root: path/to/videos_folder 15 | subtitles_path: path/to/subtitles_folder 16 | model_name: 'llama2' # Language Model Name (available: llama2, mistral) -------------------------------------------------------------------------------- /baselines/minigpt4/configs/datasets/video_chatgpt/default.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | video_chatgpt: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | ann_paths: [path/to/annotations_json] # list of paths to annotation files 14 | vis_root: path/to/videos_folder 15 | subtitles_path: path/to/subtitles_folder # folder that contains subtitles of .vtt format 16 | model_name: 'llama2' # Language Model Name (available: llama2, mistral) -------------------------------------------------------------------------------- /baselines/minigpt4/configs/datasets/webvid/default.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, salesforce.com, inc. 2 | # All rights reserved. 
3 | # SPDX-License-Identifier: BSD-3-Clause 4 | # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause 5 | 6 | datasets: 7 | webvid: 8 | # data_dir: ${env.data_dir}/datasets 9 | data_type: images # [images|videos|features] 10 | 11 | build_info: 12 | # Be careful not to append minus sign (-) before split to avoid itemizing 13 | ann_paths: [path/to/annotations.json] 14 | vis_root: path/to/videos/ 15 | subtitles_path: path/to/subtitles_folder/ # folder that contains subtitles of .vtt format 16 | model_name: 'llama2' # Language Model Name (available: llama2, mistral) 17 | -------------------------------------------------------------------------------- /baselines/minigpt4/configs/default.yaml: -------------------------------------------------------------------------------- 1 | env: 2 | # For default users 3 | # cache_root: "cache" 4 | # For internal use with persistent storage 5 | cache_root: "/export/home/.cache/minigpt4" 6 | -------------------------------------------------------------------------------- /baselines/minigpt4/configs/models/minigpt4.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: mini_gpt4_1 3 | 4 | # vit encoder 5 | image_size: 224 6 | drop_path_rate: 0 7 | use_grad_checkpoint: False 8 | vit_precision: "fp16" 9 | freeze_vit: True 10 | freeze_qformer: True 11 | model_type: "vit_h" 12 | device: "cuda" 13 | 14 | # Q-Former 15 | num_query_token: 32 16 | 17 | # Vicuna 18 | llama_model: "lmsys/vicuna-13b-v1.1" 19 | 20 | # generation configs 21 | prompt: "" 22 | 23 | preprocess: 24 | vis_processor: 25 | train: 26 | name: "blip2_image_train" 27 | image_size: 224 28 | eval: 29 | name: "blip2_image_eval" 30 | image_size: 224 31 | text_processor: 32 | train: 33 | name: "blip_caption" 34 | eval: 35 | name: "blip_caption" 36 | -------------------------------------------------------------------------------- /baselines/minigpt4/configs/models/minigpt4v.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: mini_gpt4v 3 | 4 | # vit encoder 5 | image_size: 224 6 | drop_path_rate: 0 7 | use_grad_checkpoint: False 8 | vit_precision: "fp16" 9 | freeze_vit: True 10 | freeze_qformer: True 11 | model_type: "vit_h" 12 | device: "cuda" 13 | 14 | # Q-Former 15 | num_query_token: 32 16 | 17 | # Vicuna 18 | llama_model: "lmsys/vicuna-13b-v1.1" 19 | 20 | # generation configs 21 | prompt: "" 22 | 23 | preprocess: 24 | vis_processor: 25 | train: 26 | name: "blip2_image_train" 27 | image_size: 224 28 | eval: 29 | name: "blip2_image_eval" 30 | image_size: 224 31 | text_processor: 32 | train: 33 | name: "blip_caption" 34 | eval: 35 | name: "blip_caption" 36 | -------------------------------------------------------------------------------- /baselines/minigpt4/conversation/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrick-tssn/VideoHallucer/2c6152e61207fd00db1d6d8b72d4893533828867/baselines/minigpt4/conversation/__init__.py -------------------------------------------------------------------------------- /baselines/minigpt4/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrick-tssn/VideoHallucer/2c6152e61207fd00db1d6d8b72d4893533828867/baselines/minigpt4/datasets/__init__.py -------------------------------------------------------------------------------- 
/baselines/minigpt4/datasets/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrick-tssn/VideoHallucer/2c6152e61207fd00db1d6d8b72d4893533828867/baselines/minigpt4/datasets/datasets/__init__.py -------------------------------------------------------------------------------- /baselines/minigpt4/datasets/datasets/cc_sbu_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | from PIL import Image 3 | import webdataset as wds 4 | from minigpt4.datasets.datasets.base_dataset import BaseDataset 5 | from minigpt4.datasets.datasets.caption_datasets import CaptionDataset 6 | 7 | 8 | class CCSBUDataset(BaseDataset): 9 | def __init__(self, vis_processor, text_processor, location): 10 | super().__init__(vis_processor=vis_processor, text_processor=text_processor) 11 | 12 | self.inner_dataset = wds.DataPipeline( 13 | wds.ResampledShards(location), 14 | wds.tarfile_to_samples(handler=wds.warn_and_continue), 15 | wds.shuffle(1000, handler=wds.warn_and_continue), 16 | wds.decode("pilrgb", handler=wds.warn_and_continue), 17 | wds.to_tuple("jpg", "json", handler=wds.warn_and_continue), 18 | wds.map_tuple(self.vis_processor, handler=wds.warn_and_continue), 19 | wds.map(self.to_dict, handler=wds.warn_and_continue), 20 | ) 21 | 22 | def to_dict(self, sample): 23 | return { 24 | "image": sample[0], 25 | "answer": self.text_processor(sample[1]["caption"]), 26 | } 27 | 28 | 29 | class CCSBUAlignDataset(CaptionDataset): 30 | 31 | def __getitem__(self, index): 32 | 33 | # TODO this assumes image input, not general enough 34 | ann = self.annotation[index] 35 | 36 | img_file = '{}.jpg'.format(ann["image_id"]) 37 | image_path = os.path.join(self.vis_root, img_file) 38 | image = Image.open(image_path).convert("RGB") 39 | 40 | image = self.vis_processor(image) 41 | caption = ann["caption"] 42 | 43 | return { 44 | "image": image, 45 | "answer": caption, 46 | "image_id": self.img_ids[ann["image_id"]], 47 | } 48 | -------------------------------------------------------------------------------- /baselines/minigpt4/datasets/datasets/cot.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import pickle 4 | import random 5 | import time 6 | import itertools 7 | 8 | import numpy as np 9 | from PIL import Image 10 | import skimage.io as io 11 | import matplotlib.pyplot as plt 12 | from matplotlib.collections import PatchCollection 13 | from matplotlib.patches import Polygon, Rectangle 14 | from torch.utils.data import Dataset 15 | import webdataset as wds 16 | 17 | from minigpt4.datasets.datasets.base_dataset import BaseDataset 18 | from minigpt4.datasets.datasets.caption_datasets import CaptionDataset 19 | 20 | 21 | class CoTDataset(Dataset): 22 | def __init__(self, text_processor, ann_path): 23 | """ 24 | vis_root (string): Root directory of images (e.g. 
coco/images/) 25 | ann_root (string): directory to store the annotation file 26 | """ 27 | 28 | self.text_processor = text_processor 29 | 30 | with open(ann_path, 'r') as f: 31 | self.ann = json.load(f) 32 | 33 | def __len__(self): 34 | return len(self.ann) 35 | 36 | def __getitem__(self, index): 37 | info = self.ann[index] 38 | input = info["inputs"] 39 | target = info["targets"] 40 | return { 41 | "instruction_input": input, 42 | "answer": target, 43 | } 44 | -------------------------------------------------------------------------------- /baselines/minigpt4/datasets/datasets/grounded_detailed_image_caption_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import pickle 4 | import random 5 | import time 6 | import itertools 7 | 8 | import numpy as np 9 | from PIL import Image 10 | import skimage.io as io 11 | import matplotlib.pyplot as plt 12 | from matplotlib.collections import PatchCollection 13 | from matplotlib.patches import Polygon, Rectangle 14 | from torch.utils.data import Dataset 15 | import webdataset as wds 16 | 17 | from minigpt4.datasets.datasets.base_dataset import BaseDataset 18 | from minigpt4.datasets.datasets.caption_datasets import CaptionDataset 19 | 20 | 21 | class GroundedDetailDataset(Dataset): 22 | def __init__(self, vis_processor, text_processor, vis_root, ann_path): 23 | """ 24 | vis_root (string): Root directory of images (e.g. coco/images/) 25 | ann_root (string): directory to store the annotation file 26 | """ 27 | self.vis_root = vis_root 28 | 29 | self.vis_processor = vis_processor 30 | self.text_processor = text_processor 31 | 32 | self.instruction_pool = [ 33 | '[grounding] please describe this image in details', 34 | '[grounding] describe this image as detailed as possible', 35 | '[grounding] summarize this image in details', 36 | '[grounding] give a thorough description of what you see in this image', 37 | ] 38 | 39 | with open(ann_path, 'r') as f: 40 | self.ann = json.load(f) 41 | 42 | def __len__(self): 43 | return len(self.ann) 44 | 45 | def __getitem__(self, index): 46 | info = self.ann[index] 47 | 48 | image_file = 'COCO_train2014_{}.jpg'.format(info['image_id']) 49 | image_path = os.path.join(self.vis_root, image_file) 50 | image = Image.open(image_path).convert("RGB") 51 | image = self.vis_processor(image) 52 | 53 | answer = info['grounded_caption'] 54 | 55 | instruction = random.choice(self.instruction_pool) 56 | 57 | instruction = " {} ".format(instruction) 58 | 59 | return { 60 | "image": image, 61 | "instruction_input": instruction, 62 | "answer": answer, 63 | "image_id": info['image_id'], 64 | } 65 | -------------------------------------------------------------------------------- /baselines/minigpt4/datasets/datasets/reasoning_dataset.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import pickle 4 | import random 5 | import time 6 | import itertools 7 | 8 | import numpy as np 9 | from PIL import Image 10 | import skimage.io as io 11 | import matplotlib.pyplot as plt 12 | from matplotlib.collections import PatchCollection 13 | from matplotlib.patches import Polygon, Rectangle 14 | from torch.utils.data import Dataset 15 | import webdataset as wds 16 | 17 | from minigpt4.datasets.datasets.base_dataset import BaseDataset 18 | from minigpt4.datasets.datasets.caption_datasets import CaptionDataset 19 | 20 | 21 | 22 | class ReasoningDataset(Dataset): 23 | def __init__(self, vis_processor, 
text_processor, vis_root, ann_path): 24 | """ 25 | vis_root (string): Root directory of images (e.g. coco/images/) 26 | ann_root (string): directory to store the annotation file 27 | """ 28 | self.vis_root = vis_root 29 | 30 | self.vis_processor = vis_processor 31 | self.text_processor = text_processor 32 | self.data = json.load(open(ann_path)) 33 | 34 | # self.data = self.create_data(ann_path) 35 | 36 | # def create_data(self, ann_path): 37 | # # processed_data = [] 38 | # with open(ann_path, 'r') as f: 39 | # data = json.load(f) 40 | 41 | # return processed_data 42 | 43 | def __len__(self): 44 | return len(self.data) 45 | 46 | def __getitem__(self, index): 47 | sample = self.data[index] 48 | image_id = sample["image_id"]+".jpg" 49 | question = sample["question"] 50 | answer = sample["answer"] 51 | 52 | 53 | image = Image.open(os.path.join(self.vis_root, image_id)).convert("RGB") 54 | image = self.vis_processor(image) 55 | 56 | instruction = ' {} '.format(question) 57 | 58 | return { 59 | "image": image, 60 | "instruction_input": instruction, 61 | "answer": answer 62 | } 63 | 64 | 65 | -------------------------------------------------------------------------------- /baselines/minigpt4/datasets/datasets/unnatural_instruction.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import pickle 4 | import random 5 | import time 6 | import itertools 7 | 8 | import numpy as np 9 | from PIL import Image 10 | import skimage.io as io 11 | import matplotlib.pyplot as plt 12 | from matplotlib.collections import PatchCollection 13 | from matplotlib.patches import Polygon, Rectangle 14 | from torch.utils.data import Dataset 15 | import webdataset as wds 16 | 17 | from minigpt4.datasets.datasets.base_dataset import BaseDataset 18 | from minigpt4.datasets.datasets.caption_datasets import CaptionDataset 19 | 20 | 21 | class UnnaturalDataset(Dataset): 22 | def __init__(self, text_processor, ann_path): 23 | """ 24 | vis_root (string): Root directory of images (e.g. 
coco/images/) 25 | ann_root (string): directory to store the annotation file 26 | """ 27 | self.text_processor = text_processor 28 | 29 | with open(ann_path, 'r') as f: 30 | self.ann = json.load(f) 31 | 32 | # with open(ann_path, 'r') as f: 33 | # for data in f.readlines(): 34 | # data = json.loads(data) 35 | # self.ann.append(data) 36 | 37 | def __len__(self): 38 | return len(self.ann) 39 | 40 | def __getitem__(self, index): 41 | info = self.ann[index]["instances"][0] 42 | instruction = info["instruction_with_input"] 43 | constraints = info["constraints"] 44 | answer = info["output"] 45 | if constraints != None: 46 | instruction = instruction+" "+constraints 47 | 48 | return { 49 | # "image":None, 50 | "instruction_input": instruction, 51 | "answer": answer, 52 | } 53 | -------------------------------------------------------------------------------- /baselines/minigpt4/mistral_test_config.yaml: -------------------------------------------------------------------------------- 1 | model: 2 | arch: mini_gpt4_llama_v2 3 | model_type: pretrain_vicuna 4 | freeze_vit: True 5 | freeze_qformer: True 6 | max_txt_len: 512 7 | low_resource: True 8 | image_size: 224 9 | end_sym: "" 10 | llama_model: "checkpoints/MiniGPT4-Video/Mistral-7B-Instruct-v0.2" 11 | ckpt: "checkpoints/video_mistral_all_checkpoint_last.pth" 12 | use_grad_checkpoint: True 13 | chat_template: True 14 | lora_r: 64 15 | lora_alpha: 16 16 | length: 50 17 | use_grad_checkpoint_llm: True 18 | max_context_len: 7200 19 | 20 | 21 | datasets: 22 | video_chatgpt: #99378 row - 13224 video 23 | batch_size: 1 24 | vis_processor: 25 | train: 26 | name: "blip2_image_train" 27 | image_size: 224 28 | text_processor: 29 | train: 30 | name: "blip_caption" 31 | sample_ratio: 200 32 | 33 | 34 | run: 35 | task: image_text_pretrain 36 | seed: 42 37 | amp: True -------------------------------------------------------------------------------- /baselines/minigpt4/models/mistral.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoModelForCausalLM, AutoTokenizer 2 | 3 | device = "cuda" # the device to load the model onto 4 | 5 | model = AutoModelForCausalLM.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2") 6 | tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2") 7 | 8 | messages = [ 9 | {"role": "user", "content": "What is your favourite condiment?"}, 10 | {"role": "assistant", "content": "Well, I'm quite partial to a good squeeze of fresh lemon juice. It adds just the right amount of zesty flavour to whatever I'm cooking up in the kitchen!"}, 11 | {"role": "user", "content": "Do you have mayonnaise recipes?"} 12 | ] 13 | p="Well, I'm quite partial to a good squeeze of fresh lemon juice." 14 | encoded_input = tokenizer(p, return_tensors='pt') 15 | embeds = model.model.embed_tokens(encoded_input.input_ids) 16 | print(embeds.shape) 17 | 18 | 19 | encodeds = tokenizer.apply_chat_template(messages, return_tensors="pt") 20 | model_inputs = encodeds.to(device) 21 | model.to(device) 22 | 23 | generated_ids = model.generate(model_inputs, max_new_tokens=1000, do_sample=True) 24 | decoded = tokenizer.batch_decode(generated_ids) 25 | print(decoded[0]) 26 | -------------------------------------------------------------------------------- /baselines/minigpt4/models/policies/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 
2 | # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. 3 | 4 | from .mixed_precision import * 5 | from .wrapping import * 6 | from .activation_checkpointing_functions import apply_fsdp_checkpointing 7 | from .anyprecision_optimizer import AnyPrecisionAdamW 8 | from .fsdp_utils import fsdp_auto_wrap_policy -------------------------------------------------------------------------------- /baselines/minigpt4/models/policies/activation_checkpointing_functions.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. 3 | 4 | import torch 5 | import os 6 | import torch.distributed as dist 7 | from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import ( 8 | checkpoint_wrapper, 9 | CheckpointImpl, 10 | apply_activation_checkpointing, 11 | ) 12 | 13 | from transformers.models.t5.modeling_t5 import T5Block 14 | from transformers.models.llama.modeling_llama import LlamaDecoderLayer 15 | from functools import partial 16 | 17 | non_reentrant_wrapper = partial( 18 | checkpoint_wrapper, 19 | checkpoint_impl=CheckpointImpl.NO_REENTRANT, 20 | ) 21 | 22 | check_fn = lambda submodule: isinstance(submodule, LlamaDecoderLayer) 23 | 24 | 25 | def apply_fsdp_checkpointing(model): 26 | """apply activation checkpointing to model 27 | returns None as model is updated directly 28 | """ 29 | print(f"--> applying fsdp activation checkpointing...") 30 | 31 | apply_activation_checkpointing( 32 | model, checkpoint_wrapper_fn=non_reentrant_wrapper, check_fn=check_fn 33 | ) 34 | -------------------------------------------------------------------------------- /baselines/minigpt4/models/policies/fsdp_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
3 | 4 | def fsdp_auto_wrap_policy(model, transformer_layer_name): 5 | import functools 6 | import os 7 | 8 | from accelerate import FullyShardedDataParallelPlugin 9 | from transformers.models.t5.modeling_t5 import T5Block 10 | from torch.distributed.fsdp.wrap import _or_policy, lambda_auto_wrap_policy, transformer_auto_wrap_policy 11 | 12 | from peft.tuners import PrefixEncoder, PromptEmbedding, PromptEncoder 13 | 14 | def lambda_policy_fn(module): 15 | if ( 16 | len(list(module.named_children())) == 0 17 | and getattr(module, "weight", None) is not None 18 | and module.weight.requires_grad 19 | ): 20 | return True 21 | return False 22 | 23 | lambda_policy = functools.partial(lambda_auto_wrap_policy, lambda_fn=lambda_policy_fn) 24 | transformer_wrap_policy = functools.partial( 25 | transformer_auto_wrap_policy, 26 | transformer_layer_cls=( 27 | PrefixEncoder, 28 | PromptEncoder, 29 | PromptEmbedding, 30 | transformer_layer_name, 31 | # FullyShardedDataParallelPlugin.get_module_class_from_name( 32 | # model, transformer_layer_name 33 | # ), 34 | ), 35 | ) 36 | 37 | auto_wrap_policy = functools.partial(_or_policy, policies=[lambda_policy, transformer_wrap_policy]) 38 | return auto_wrap_policy -------------------------------------------------------------------------------- /baselines/minigpt4/models/policies/mixed_precision.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. 3 | 4 | import torch 5 | 6 | from torch.distributed.fsdp import ( 7 | # FullyShardedDataParallel as FSDP, 8 | # CPUOffload, 9 | MixedPrecision, 10 | # BackwardPrefetch, 11 | # ShardingStrategy, 12 | ) 13 | 14 | # requires grad scaler in main loop 15 | fpSixteen = MixedPrecision( 16 | param_dtype=torch.float16, 17 | # Gradient communication precision. 18 | reduce_dtype=torch.float16, 19 | # Buffer precision. 20 | buffer_dtype=torch.float16, 21 | ) 22 | 23 | bfSixteen = MixedPrecision( 24 | param_dtype=torch.bfloat16, 25 | # Gradient communication precision. 26 | reduce_dtype=torch.bfloat16, 27 | # Buffer precision. 28 | buffer_dtype=torch.bfloat16, 29 | cast_forward_inputs=True, 30 | ) 31 | 32 | bfSixteen_mixed = MixedPrecision( 33 | param_dtype=torch.float32, 34 | reduce_dtype=torch.bfloat16, 35 | buffer_dtype=torch.bfloat16, 36 | ) 37 | 38 | fp32_policy = MixedPrecision( 39 | param_dtype=torch.float32, 40 | reduce_dtype=torch.float32, 41 | buffer_dtype=torch.float32, 42 | ) 43 | -------------------------------------------------------------------------------- /baselines/minigpt4/models/policies/wrapping.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement. 
3 | 4 | import torch.distributed as dist 5 | import torch.nn as nn 6 | import torch 7 | 8 | from transformers.models.llama.modeling_llama import LlamaDecoderLayer 9 | 10 | from torch.distributed.fsdp.fully_sharded_data_parallel import ( 11 | FullyShardedDataParallel as FSDP, 12 | CPUOffload, 13 | BackwardPrefetch, 14 | MixedPrecision, 15 | ) 16 | from torch.distributed.fsdp.wrap import ( 17 | transformer_auto_wrap_policy, 18 | size_based_auto_wrap_policy, 19 | enable_wrap, 20 | wrap, 21 | ) 22 | 23 | import functools 24 | from typing import Type 25 | 26 | 27 | def get_size_policy(min_params=1e8): 28 | num_wrap_policy = functools.partial( 29 | size_based_auto_wrap_policy, min_num_params=min_params 30 | ) 31 | return num_wrap_policy 32 | 33 | 34 | def get_llama_wrapper(): 35 | """we register our main layer class and use the fsdp transformer wrapping policy 36 | ensures embedding layers are in the root fsdp unit for shared access and that fsdp units map to transformer layers 37 | """ 38 | # ==== use new transformer wrapper 39 | 40 | llama_auto_wrap_policy = functools.partial( 41 | transformer_auto_wrap_policy, 42 | transformer_layer_cls={ 43 | LlamaDecoderLayer, 44 | }, 45 | ) 46 | 47 | return llama_auto_wrap_policy 48 | -------------------------------------------------------------------------------- /baselines/minigpt4/processors/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from minigpt4.processors.base_processor import BaseProcessor 9 | from minigpt4.processors.blip_processors import ( 10 | Blip2ImageTrainProcessor, 11 | Blip2ImageEvalProcessor, 12 | BlipCaptionProcessor, 13 | ) 14 | 15 | from minigpt4.common.registry import registry 16 | 17 | __all__ = [ 18 | "BaseProcessor", 19 | "Blip2ImageTrainProcessor", 20 | "Blip2ImageEvalProcessor", 21 | "BlipCaptionProcessor", 22 | ] 23 | 24 | 25 | def load_processor(name, cfg=None): 26 | """ 27 | Example 28 | 29 | >>> processor = load_processor("alpro_video_train", cfg=None) 30 | """ 31 | processor = registry.get_processor_class(name).from_config(cfg) 32 | 33 | return processor 34 | -------------------------------------------------------------------------------- /baselines/minigpt4/processors/base_processor.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from omegaconf import OmegaConf 9 | 10 | 11 | class BaseProcessor: 12 | def __init__(self): 13 | self.transform = lambda x: x 14 | return 15 | 16 | def __call__(self, item): 17 | return self.transform(item) 18 | 19 | @classmethod 20 | def from_config(cls, cfg=None): 21 | return cls() 22 | 23 | def build(self, **kwargs): 24 | cfg = OmegaConf.create(kwargs) 25 | 26 | return self.from_config(cfg) 27 | -------------------------------------------------------------------------------- /baselines/minigpt4/runners/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 
4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from minigpt4.runners.runner_base import RunnerBase 9 | 10 | __all__ = ["RunnerBase"] 11 | -------------------------------------------------------------------------------- /baselines/minigpt4/tasks/__init__.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from minigpt4.common.registry import registry 9 | from minigpt4.tasks.base_task import BaseTask 10 | from minigpt4.tasks.image_text_pretrain import ImageTextPretrainTask 11 | 12 | from minigpt4.tasks.vqa import VQATask, GQATask 13 | from minigpt4.tasks.vqa_reading_comprehension import VQARCTask, GQARCTask 14 | 15 | 16 | def setup_task(cfg): 17 | assert "task" in cfg.run_cfg, "Task name must be provided." 18 | 19 | task_name = cfg.run_cfg.task 20 | task = registry.get_task_class(task_name).setup_task(cfg=cfg) 21 | assert task is not None, "Task {} not properly registered.".format(task_name) 22 | 23 | return task 24 | 25 | 26 | __all__ = [ 27 | "BaseTask", 28 | "ImageTextPretrainTask", 29 | "VQATask", 30 | "GQATask", 31 | "VQARCTask", 32 | "GQARCTask", 33 | ] 34 | -------------------------------------------------------------------------------- /baselines/minigpt4/tasks/image_text_pretrain.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright (c) 2022, salesforce.com, inc. 3 | All rights reserved. 4 | SPDX-License-Identifier: BSD-3-Clause 5 | For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause 6 | """ 7 | 8 | from minigpt4.common.registry import registry 9 | from minigpt4.tasks.base_task import BaseTask 10 | 11 | 12 | @registry.register_task("image_text_pretrain") 13 | class ImageTextPretrainTask(BaseTask): 14 | def __init__(self): 15 | super().__init__() 16 | 17 | # def evaluation(self, model, data_loader, cuda_enabled=True): 18 | # pass 19 | -------------------------------------------------------------------------------- /baselines/pllava/models/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/patrick-tssn/VideoHallucer/2c6152e61207fd00db1d6d8b72d4893533828867/baselines/pllava/models/__init__.py -------------------------------------------------------------------------------- /baselines/pllava/models/pllava/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright 2023 The HuggingFace Team. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | from typing import TYPE_CHECKING 15 | 16 | from transformers.utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available 17 | 18 | 19 | _import_structure = {"configuration_pllava": ["PLLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP", "PllavaConfig"]} 20 | 21 | try: 22 | if not is_torch_available(): 23 | raise OptionalDependencyNotAvailable() 24 | except OptionalDependencyNotAvailable: 25 | pass 26 | else: 27 | _import_structure["modeling_pllava"] = [ 28 | "PLLAVA_PRETRAINED_MODEL_ARCHIVE_LIST", 29 | "PllavaForConditionalGeneration", 30 | "PllavaPreTrainedModel", 31 | ] 32 | _import_structure["processing_pllava"] = ["PllavaProcessor"] 33 | 34 | 35 | if TYPE_CHECKING: 36 | from .configuration_pllava import PLLAVA_PRETRAINED_CONFIG_ARCHIVE_MAP, PllavaConfig 37 | 38 | try: 39 | if not is_torch_available(): 40 | raise OptionalDependencyNotAvailable() 41 | except OptionalDependencyNotAvailable: 42 | pass 43 | else: 44 | from .modeling_pllava import ( 45 | PLLAVA_PRETRAINED_MODEL_ARCHIVE_LIST, 46 | PllavaForConditionalGeneration, 47 | PllavaPreTrainedModel, 48 | ) 49 | from .processing_pllava import PllavaProcessor 50 | 51 | 52 | else: 53 | import sys 54 | 55 | sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure) 56 | -------------------------------------------------------------------------------- /baselines/pllava/models/pllava/convert_pllava_weights_to_hf.py: -------------------------------------------------------------------------------- 1 | # Not yet -------------------------------------------------------------------------------- /baselines/pllava/tasks/eval/demo/__init__.py: -------------------------------------------------------------------------------- 1 | import gradio as gr 2 | from gradio.themes.utils import colors, fonts, sizes 3 | 4 | 5 | pllava_theme = gr.themes.Monochrome( 6 | text_size="sm", 7 | spacing_size="sm", 8 | primary_hue=gr.themes.Color(c100="#f5f5f5", c200="#e5e5e5", c300="#d4d4d4", c400="#a3a3a3", c50="#fafafa", c500="#737373", c600="#525252", c700="#404040", c800="#262626", c900="#171717", c950="#000000"), 9 | secondary_hue=gr.themes.Color(c100="#f5f5f5", c200="#e5e5e5", c300="#d4d4d4", c400="#a3a3a3", c50="#fafafa", c500="#737373", c600="#525252", c700="#404040", c800="#262626", c900="#171717", c950="#000000"), 10 | neutral_hue=gr.themes.Color(c100="#f5f5f5", c200="#e5e5e5", c300="#d4d4d4", c400="#a3a3a3", c50="#fafafa", c500="#737373", c600="#525252", c700="#404040", c800="#262626", c900="#171717", c950="#000000"), 11 | ).set( 12 | background_fill_primary_dark='*primary_950', 13 | background_fill_secondary_dark='*neutral_950' 14 | ) 15 | 16 | -------------------------------------------------------------------------------- /baselines/pllava/tasks/eval/recaption/show_recaption.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | import gradio as gr 4 | 5 | from tasks.eval.recaption import load_results 6 | import json 7 | 8 | # example = videogallery().example_inputs() 9 | 10 | 11 | def parse_args(): 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument( 14 | '--save_path', 15 | required=True, 16 | ) 17 | args = parser.parse_args() 18 | return args 19 | 20 | 21 | args = parse_args() 22 | result_list = load_results(args.save_path) 23 | 24 | 25 | def show(result_index, ): 26 | info = result_list[result_index] 27 | video_path = info['video_path'] 28 | info_str = json.dumps(info, indent=4) 29 | return video_path, info_str 30 | 31 | 32 | 33 | from 
tasks.eval.recaption import load_results 34 | 35 | with gr.Blocks() as demo: 36 | gr.Markdown("# Showing of what has came out.") 37 | gr.Markdown(f"From Saved Results {args.save_path}") 38 | with gr.Row(): 39 | with gr.Column(1): 40 | show_video = gr.Video(interactive=False) 41 | 42 | with gr.Column(): 43 | result_index = gr.Slider(0, len(result_list), step=1) 44 | info = gr.Text(interactive=False) 45 | 46 | result_index.change(show, [result_index], [show_video, info]) 47 | 48 | 49 | 50 | 51 | 52 | demo.launch(share=True) 53 | -------------------------------------------------------------------------------- /baselines/pllava/tasks/eval/vcgbench/show_vcg.py: -------------------------------------------------------------------------------- 1 | 2 | import argparse 3 | import gradio as gr 4 | 5 | from tasks.eval.vcgbench import load_results 6 | import json 7 | 8 | # example = videogallery().example_inputs() 9 | 10 | 11 | def parse_args(): 12 | parser = argparse.ArgumentParser() 13 | parser.add_argument( 14 | '--save_path', 15 | required=True, 16 | ) 17 | args = parser.parse_args() 18 | return args 19 | 20 | 21 | args = parse_args() 22 | result_list = load_results(args.save_path) 23 | 24 | 25 | def show(result_index, ): 26 | info = result_list[result_index] 27 | video_path = info['video_path'] 28 | info_str = json.dumps(info, indent=4) 29 | return video_path, info_str 30 | 31 | with gr.Blocks() as demo: 32 | gr.Markdown( 33 | f"# Showing The Results from {args.save_path}" 34 | ) 35 | with gr.Row(): 36 | with gr.Column(): 37 | show_video = gr.Video(interactive=False) 38 | 39 | with gr.Column(): 40 | result_index = gr.Slider(0, len(result_list), step=1) 41 | info = gr.Text(interactive=False) 42 | 43 | result_index.change(show, [result_index], [show_video, info]) 44 | 45 | demo.launch(share=True) 46 | -------------------------------------------------------------------------------- /baselines/pllava/tasks/shared_utils.py: -------------------------------------------------------------------------------- 1 | import copy 2 | import logging 3 | import os 4 | import os.path as osp 5 | from os.path import join 6 | 7 | import torch 8 | from torch.utils.data import ConcatDataset, DataLoader 9 | 10 | from utils.optimizer import create_optimizer 11 | from utils.scheduler import create_scheduler 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | 16 | def get_media_types(datasources): 17 | """get the media types for for all the dataloaders. 18 | 19 | Args: 20 | datasources (List): List of dataloaders or datasets. 21 | 22 | Returns: List. The media_types. 23 | 24 | """ 25 | if isinstance(datasources[0], DataLoader): 26 | datasets = [dataloader.dataset for dataloader in datasources] 27 | else: 28 | datasets = datasources 29 | media_types = [ 30 | dataset.datasets[0].media_type 31 | if isinstance(dataset, ConcatDataset) 32 | else dataset.media_type 33 | for dataset in datasets 34 | ] 35 | 36 | return media_types 37 | -------------------------------------------------------------------------------- /baselines/pllava/utils/config_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import sys 4 | from os.path import dirname, join 5 | 6 | from utils.config import Config 7 | from utils.distributed import init_distributed_mode, is_main_process 8 | from utils.logger import setup_logger 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | def setup_config(): 14 | """Conbine yaml config and command line config with OmegaConf. 
15 | Also converts types, e.g., `'None'` (str) --> `None` (None) 16 | """ 17 | config = Config.get_config() 18 | if config.debug: 19 | config.wandb.enable = False 20 | return config 21 | 22 | 23 | def setup_evaluate_config(config): 24 | """setup evaluation default settings, e.g., disable wandb""" 25 | assert config.evaluate 26 | config.wandb.enable = False 27 | if config.output_dir is None: 28 | config.output_dir = join(dirname(config.pretrained_path), "eval") 29 | return config 30 | 31 | 32 | def setup_output_dir(output_dir, excludes=["code"]): 33 | """ensure we are not overwriting an existing/non-empty output dir""" 34 | if not os.path.exists(output_dir): 35 | os.makedirs(output_dir, exist_ok=False) 36 | else: 37 | existing_dirs_files = os.listdir(output_dir) # list 38 | remaining = set(existing_dirs_files) - set(excludes) 39 | remaining = [e for e in remaining if "slurm" not in e] 40 | remaining = [e for e in remaining if ".out" not in e] 41 | # assert len(remaining) == 0, f"remaining dirs or files: {remaining}" 42 | logger.warn(f"remaining dirs or files: {remaining}") 43 | 44 | 45 | def setup_main(): 46 | """ 47 | Setup config, logger, output_dir, etc. 48 | Shared for pretrain and all downstream tasks. 49 | """ 50 | config = setup_config() 51 | if hasattr(config, "evaluate") and config.evaluate: 52 | config = setup_evaluate_config(config) 53 | init_distributed_mode(config) 54 | 55 | if is_main_process(): 56 | setup_output_dir(config.output_dir, excludes=["code"]) 57 | setup_logger(output=config.output_dir, color=True, name="vindlu") 58 | logger.info(f"config: {Config.pretty_text(config)}") 59 | Config.dump(config, os.path.join(config.output_dir, "config.json")) 60 | return config 61 | -------------------------------------------------------------------------------- /baselines/share4video/__init__.py: -------------------------------------------------------------------------------- 1 | # from .model import LlavaLlamaForCausalLM 2 | -------------------------------------------------------------------------------- /baselines/share4video/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "."
5 | 6 | # Model Constants 7 | IGNORE_INDEX = -100 8 | IMAGE_TOKEN_INDEX = -200 9 | DEFAULT_IMAGE_TOKEN = "<image>" 10 | DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>" 11 | DEFAULT_IM_START_TOKEN = "<im_start>" 12 | DEFAULT_IM_END_TOKEN = "<im_end>" 13 | IMAGE_PLACEHOLDER = "<image-placeholder>" 14 | -------------------------------------------------------------------------------- /baselines/share4video/model/__init__.py: -------------------------------------------------------------------------------- 1 | try: 2 | from .language_model.llava_llama import LlavaLlamaForCausalLM, LlavaConfig 3 | from .language_model.llava_mpt import LlavaMptForCausalLM, LlavaMptConfig 4 | from .language_model.llava_mistral import LlavaMistralForCausalLM, LlavaMistralConfig 5 | except: 6 | pass 7 | -------------------------------------------------------------------------------- /baselines/share4video/model/apply_delta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m fastchat.model.apply_delta --base ~/model_weights/llama-7b --target ~/model_weights/vicuna-7b --delta lmsys/vicuna-7b-delta 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from tqdm import tqdm 9 | from transformers import AutoTokenizer, AutoModelForCausalLM 10 | from llava import LlavaLlamaForCausalLM 11 | 12 | 13 | def apply_delta(base_model_path, target_model_path, delta_path): 14 | print("Loading base model") 15 | base = AutoModelForCausalLM.from_pretrained( 16 | base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | 18 | print("Loading delta") 19 | delta = LlavaLlamaForCausalLM.from_pretrained(delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 20 | delta_tokenizer = AutoTokenizer.from_pretrained(delta_path) 21 | 22 | print("Applying delta") 23 | for name, param in tqdm(delta.state_dict().items(), desc="Applying delta"): 24 | if name not in base.state_dict(): 25 | assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model' 26 | continue 27 | if param.data.shape == base.state_dict()[name].shape: 28 | param.data += base.state_dict()[name] 29 | else: 30 | assert name in ['model.embed_tokens.weight', 'lm_head.weight'], \ 31 | f'{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}' 32 | bparam = base.state_dict()[name] 33 | param.data[:bparam.shape[0], :bparam.shape[1]] += bparam 34 | 35 | print("Saving target model") 36 | delta.save_pretrained(target_model_path) 37 | delta_tokenizer.save_pretrained(target_model_path) 38 | 39 | 40 | if __name__ == "__main__": 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument("--base-model-path", type=str, required=True) 43 | parser.add_argument("--target-model-path", type=str, required=True) 44 | parser.add_argument("--delta-path", type=str, required=True) 45 | 46 | args = parser.parse_args() 47 | 48 | apply_delta(args.base_model_path, args.target_model_path, args.delta_path) 49 | -------------------------------------------------------------------------------- /baselines/share4video/model/consolidate.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m llava.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from transformers import AutoTokenizer, AutoModelForCausalLM 9 | from llava.model import * 10 | from llava.model.utils import auto_upgrade 11 | 12 | 13 | def consolidate_ckpt(src_path, dst_path): 14 | print("Loading model") 15 |
auto_upgrade(src_path) 16 | src_model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False) 18 | src_model.save_pretrained(dst_path) 19 | src_tokenizer.save_pretrained(dst_path) 20 | 21 | 22 | if __name__ == "__main__": 23 | parser = argparse.ArgumentParser() 24 | parser.add_argument("--src", type=str, required=True) 25 | parser.add_argument("--dst", type=str, required=True) 26 | 27 | args = parser.parse_args() 28 | 29 | consolidate_ckpt(args.src, args.dst) 30 | -------------------------------------------------------------------------------- /baselines/share4video/model/multimodal_encoder/builder.py: -------------------------------------------------------------------------------- 1 | import os 2 | from .clip_encoder import CLIPVisionTower, CLIPVisionTowerS2 3 | from .siglip_encoder import SigLipVisionTower 4 | 5 | 6 | def build_vision_tower(vision_tower_cfg, **kwargs): 7 | vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None)) 8 | is_absolute_path_exists = os.path.exists(vision_tower) 9 | use_s2 = getattr(vision_tower_cfg, 's2', False) 10 | if 'siglip' not in vision_tower.lower(): 11 | if is_absolute_path_exists or vision_tower.startswith("openai") or vision_tower.startswith("laion") or "ShareGPT4V" in vision_tower: 12 | if use_s2: 13 | return CLIPVisionTowerS2(vision_tower, args=vision_tower_cfg, **kwargs) 14 | else: 15 | return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs) 16 | else: 17 | if is_absolute_path_exists or vision_tower.startswith("google") or vision_tower.startswith('bczhou'): 18 | return SigLipVisionTower(vision_tower, vision_tower_cfg, **kwargs) 19 | 20 | raise ValueError(f'Unknown vision tower: {vision_tower}') 21 | -------------------------------------------------------------------------------- /baselines/share4video/model/multimodal_projector/builder.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import re 4 | 5 | 6 | class IdentityMap(nn.Module): 7 | def __init__(self): 8 | super().__init__() 9 | 10 | def forward(self, x, *args, **kwargs): 11 | return x 12 | 13 | @property 14 | def config(self): 15 | return {"mm_projector_type": 'identity'} 16 | 17 | 18 | class SimpleResBlock(nn.Module): 19 | def __init__(self, channels): 20 | super().__init__() 21 | self.pre_norm = nn.LayerNorm(channels) 22 | 23 | self.proj = nn.Sequential( 24 | nn.Linear(channels, channels), 25 | nn.GELU(), 26 | nn.Linear(channels, channels) 27 | ) 28 | def forward(self, x): 29 | x = self.pre_norm(x) 30 | return x + self.proj(x) 31 | 32 | 33 | def build_vision_projector(config, delay_load=False, **kwargs): 34 | projector_type = getattr(config, 'mm_projector_type', 'linear') 35 | 36 | if projector_type == 'linear': 37 | return nn.Linear(config.mm_hidden_size, config.hidden_size) 38 | 39 | mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type) 40 | if mlp_gelu_match: 41 | mlp_depth = int(mlp_gelu_match.group(1)) 42 | modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)] 43 | for _ in range(1, mlp_depth): 44 | modules.append(nn.GELU()) 45 | modules.append(nn.Linear(config.hidden_size, config.hidden_size)) 46 | return nn.Sequential(*modules) 47 | 48 | if projector_type == 'identity': 49 | return IdentityMap() 50 | 51 | raise ValueError(f'Unknown projector type: {projector_type}') 52 | 
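A quick usage sketch for the build_vision_projector factory shown directly above. This is illustrative only: the import path, the config fields, and all sizes below are assumptions for the sketch, not values taken from this repository's configs.

import torch
from types import SimpleNamespace
# Assumed import path for the factory defined in multimodal_projector/builder.py above:
# from share4video.model.multimodal_projector.builder import build_vision_projector

# Hypothetical config: mm_hidden_size is the vision-feature width, hidden_size the LLM width;
# "mlp2x_gelu" matches the ^mlp(\d+)x_gelu$ regex and yields Linear -> GELU -> Linear.
cfg = SimpleNamespace(mm_projector_type="mlp2x_gelu", mm_hidden_size=1024, hidden_size=4096)
projector = build_vision_projector(cfg)

vision_feats = torch.randn(1, 576, cfg.mm_hidden_size)  # e.g. 24x24 patch tokens from the vision tower
print(projector(vision_feats).shape)                    # torch.Size([1, 576, 4096])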
-------------------------------------------------------------------------------- /baselines/share4video/model/utils.py: -------------------------------------------------------------------------------- 1 | from transformers import AutoConfig 2 | 3 | 4 | def auto_upgrade(config): 5 | cfg = AutoConfig.from_pretrained(config) 6 | if 'llava' in config and 'llava' not in cfg.model_type: 7 | assert cfg.model_type == 'llama' 8 | print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.") 9 | print("You must upgrade the checkpoint to the new code base (this can be done automatically).") 10 | confirm = input("Please confirm that you want to upgrade the checkpoint. [Y/N]") 11 | if confirm.lower() in ["y", "yes"]: 12 | print("Upgrading checkpoint...") 13 | assert len(cfg.architectures) == 1 14 | setattr(cfg.__class__, "model_type", "llava") 15 | cfg.architectures[0] = 'LlavaLlamaForCausalLM' 16 | cfg.save_pretrained(config) 17 | print("Checkpoint upgraded.") 18 | else: 19 | print("Checkpoint upgrade aborted.") 20 | exit(1) 21 | -------------------------------------------------------------------------------- /baselines/share4video/train/train_mem.py: -------------------------------------------------------------------------------- 1 | from llava.train.train import train 2 | 3 | if __name__ == "__main__": 4 | train(attn_implementation="flash_attention_2") 5 | -------------------------------------------------------------------------------- /baselines/valley/configs/deepspeed/config_zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | 14 | "zero_optimization": { 15 | "stage": 2, 16 | "allgather_partitions": true, 17 | "allgather_bucket_size": 5e8, 18 | "overlap_comm": true, 19 | "reduce_scatter": true, 20 | "reduce_bucket_size": 5e8, 21 | "contiguous_gradients": true 22 | }, 23 | 24 | "gradient_accumulation_steps": "auto", 25 | "gradient_clipping": "auto", 26 | "steps_per_print": 2000, 27 | "train_batch_size": "auto", 28 | "train_micro_batch_size_per_gpu": "auto", 29 | "wall_clock_breakdown": false 30 | } -------------------------------------------------------------------------------- /baselines/valley/configs/deepspeed/config_zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 3, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto", 22 | "stage3_prefetch_bucket_size": "auto", 23 | "stage3_param_persistence_threshold": "auto", 24 | "stage3_max_live_parameters": 1e9, 25 | "stage3_max_reuse_distance": 1e9, 26 | "stage3_gather_16bit_weights_on_model_save": true 27 | } 28 | } -------------------------------------------------------------------------------- /baselines/valley/configs/deepspeed/config_zero3_offload.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 
3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | "scheduler": { 23 | "type": "WarmupLR", 24 | "params": { 25 | "warmup_min_lr": "auto", 26 | "warmup_max_lr": "auto", 27 | "warmup_num_steps": "auto" 28 | } 29 | }, 30 | "zero_optimization": { 31 | "stage": 3, 32 | "offload_optimizer": { 33 | "device": "cpu", 34 | "pin_memory": true 35 | }, 36 | "offload_param": { 37 | "device": "cpu", 38 | "pin_memory": true 39 | }, 40 | "overlap_comm": true, 41 | "contiguous_gradients": true, 42 | "sub_group_size": 1e9, 43 | "reduce_bucket_size": "auto", 44 | "stage3_prefetch_bucket_size": "auto", 45 | "stage3_param_persistence_threshold": "auto", 46 | "stage3_max_live_parameters": 1e9, 47 | "stage3_max_reuse_distance": 1e9, 48 | "gather_16bit_weights_on_model_save": true 49 | }, 50 | "gradient_accumulation_steps": "auto", 51 | "gradient_clipping": "auto", 52 | "train_batch_size": "auto", 53 | "train_micro_batch_size_per_gpu": "auto", 54 | "steps_per_print": 1e5, 55 | "wall_clock_breakdown": false 56 | } -------------------------------------------------------------------------------- /baselines/valley/configs/experiment/valley_stage1.yaml: -------------------------------------------------------------------------------- 1 | model_name_or_path: Path/to/opensource/LLM 2 | data_path: Path/to/LLaVA-CC3M-Pretrain-595K/chat.json 3 | image_folder: Path/to/LLaVA-CC3M-Pretrain-595K/image_new 4 | video_data_path: Path/to/webvid_703K/chat.json 5 | video_folder: Path/to/webvid_703K/videos 6 | output_dir: Path/to/model/out/dir 7 | # experiment name 8 | project_name: valley 9 | run_name: valley_stage1 10 | 11 | # Whether to make the system prompt a mask in the label, and others do not mask 12 | only_mask_system: False 13 | # system prompt style 14 | conv_mode: v1 15 | # wether freeze backbone 16 | freeze_backbone: True 17 | # wether tune multimodal projection layer 18 | tune_mm_mlp_adapter: True 19 | # wether lora 20 | lora: False 21 | # wether multimodal 22 | is_multimodal: True 23 | 24 | num_train_epochs: 1 25 | per_device_train_batch_size: 16 26 | save_strategy: steps 27 | save_steps: 2400 28 | learning_rate: 2e-3 29 | gradient_checkpointing: True 30 | 31 | # wether do fast epoch 32 | fast_epoch: False 33 | 34 | vision_tower: openai/clip-vit-large-patch14 35 | mm_vision_select_layer: -2 36 | mm_use_im_start_end: True 37 | lazy_preprocess: True 38 | bf16: False 39 | fp16: True 40 | tf32: False 41 | per_device_eval_batch_size: 1 42 | gradient_accumulation_steps: 1 43 | evaluation_strategy: "no" 44 | save_total_limit: 1 45 | weight_decay: 0. 
46 | warmup_ratio: 0.03 47 | lr_scheduler_type: cosine 48 | logging_steps: 1 49 | model_max_length: 2048 50 | adam_beta1: 0.9 51 | adam_beta2: 0.95 52 | deepspeed: valley/configs/deepspeed/config_zero2.json 53 | report_to: wandb -------------------------------------------------------------------------------- /baselines/valley/configs/experiment/valley_stage2.yaml: -------------------------------------------------------------------------------- 1 | model_name_or_path: Path/ to/ pretrain/ valley/ from/ stage1 2 | data_path: Path/ to/ LLaVA-Instruct-150K/ llava_instruct_150k.json 3 | image_folder: Path/ to/ COCO/ train2014 4 | video_data_path: /Path/ to/ Valley-Instruct/ valley_instruct_73k.json 5 | video_folder: Path/ to/ Valley-Instruct/ videos 6 | output_dir: Model/ Output/ path 7 | prediction_file_name: Model/ Output/ path/ eval_result.jsonl # evaluation file output path 8 | # experiment name 9 | project_name: valley 10 | run_name: valley_stage2 11 | # Whether to make the system prompt a mask in the label, and others do not mask 12 | only_mask_system: False 13 | # system prompt style 14 | conv_mode: v1 15 | # wether freeze backbone 16 | freeze_backbone: False 17 | # wether tune multimodal projection layer 18 | tune_mm_mlp_adapter: True 19 | # wether lora 20 | lora: False 21 | # wether multimodal 22 | is_multimodal: True 23 | 24 | num_train_epochs: 3 25 | per_device_train_batch_size: 1 26 | per_device_eval_batch_size: 1 # must 1 27 | save_strategy: steps 28 | save_steps: 3000 29 | evaluation_strategy: 'no' 30 | eval_steps: 3000 31 | eval_num: 600 32 | use_legacy_prediction_loop: True 33 | predict_with_generate: True 34 | prediction_loss_only: False 35 | generation_max_length: 1536 36 | learning_rate: 2e-5 37 | gradient_checkpointing: True 38 | 39 | # wether do fast epoch 40 | fast_epoch: False 41 | 42 | vision_tower: openai/clip-vit-large-patch14 43 | mm_vision_select_layer: -2 44 | mm_use_im_start_end: True 45 | lazy_preprocess: True 46 | bf16: True 47 | fp16: False 48 | tf32: False 49 | gradient_accumulation_steps: 1 50 | weight_decay: 0. 
51 | warmup_ratio: 0.03 52 | lr_scheduler_type: cosine 53 | logging_steps: 1 54 | model_max_length: 2048 55 | deepspeed: valley/configs/deepspeed/config_zero2.json 56 | report_to: wandb -------------------------------------------------------------------------------- /baselines/valley/configs/experiment/valley_stage2_lora.yaml: -------------------------------------------------------------------------------- 1 | model_name_or_path: Path/ to/ pretrain/ valley/ from/ stage1 2 | data_path: Path/ to/ LLaVA-Instruct-150K/ llava_instruct_150k.json 3 | image_folder: Path/ to/ COCO/ train2014 4 | video_data_path: /Path/ to/ Valley-Instruct/ valley_instruct_73k.json 5 | video_folder: Path/ to/ Valley-Instruct/ videos 6 | output_dir: Model/ Output/ path 7 | prediction_file_name: Model/ Output/ path/ eval_result.jsonl # evaluation file output path 8 | # experiment name 9 | project_name: valley 10 | run_name: valley_stage2_lora 11 | # Whether to make the system prompt a mask in the label, and others do not mask 12 | only_mask_system: False 13 | # system prompt style 14 | conv_mode: v1 15 | # wether freeze backbone 16 | freeze_backbone: False 17 | # wether tune multimodal projection layer 18 | tune_mm_mlp_adapter: True 19 | # wether lora 20 | lora: True 21 | # wether multimodal 22 | is_multimodal: True 23 | 24 | num_train_epochs: 3 25 | per_device_train_batch_size: 4 26 | save_strategy: 'no' 27 | lora_save_strategy: steps # if do lora training, turn on this button, to only save lora weight. support ['steps','epochs','no'] 28 | save_steps: 5000 29 | learning_rate: 5e-4 30 | gradient_checkpointing: True 31 | 32 | # wether do fast epoch 33 | fast_epoch: False 34 | 35 | vision_tower: openai/clip-vit-large-patch14 36 | mm_vision_select_layer: -2 37 | mm_use_im_start_end: True 38 | lazy_preprocess: True 39 | bf16: False 40 | fp16: True 41 | tf32: False 42 | per_device_eval_batch_size: 1 43 | gradient_accumulation_steps: 1 44 | evaluation_strategy: "no" 45 | save_total_limit: 3 46 | weight_decay: 0. 
47 | warmup_ratio: 0.03 48 | lr_scheduler_type: cosine 49 | logging_steps: 1 50 | model_max_length: 2048 51 | adam_beta1: 0.9 52 | adam_beta2: 0.95 53 | deepspeed: valley/configs/deepspeed/config_zero2.json 54 | report_to: wandb -------------------------------------------------------------------------------- /baselines/valley/configs/experiment/valley_stage2_zero3.yaml: -------------------------------------------------------------------------------- 1 | model_name_or_path: Path/ to/ pretrain/ valley/ from/ stage1 2 | data_path: Path/ to/ LLaVA-Instruct-150K/ llava_instruct_150k.json 3 | image_folder: Path/ to/ COCO/ train2014 4 | video_data_path: /Path/ to/ Valley-Instruct/ valley_instruct_73k.json 5 | video_folder: Path/ to/ Valley-Instruct/ videos 6 | output_dir: Model/ Output/ path 7 | prediction_file_name: Model/ Output/ path/ eval_result.jsonl # evaluation file output path 8 | # experiment name 9 | project_name: valley2 10 | run_name: valley_stage2_zero3 11 | # Whether to make the system prompt a mask in the label, and others do not mask 12 | only_mask_system: False 13 | # system prompt style 14 | conv_mode: v1 15 | # wether freeze backbone 16 | freeze_backbone: False 17 | # wether tune multimodal projection layer 18 | tune_mm_mlp_adapter: True 19 | # wether freeze multimodal projection layer 20 | freeze_mm_mlp_adapter: False 21 | # wether lora 22 | lora: False 23 | # wether multimodal 24 | is_multimodal: True 25 | 26 | num_train_epochs: 3 27 | per_device_train_batch_size: 1 # zero3 must 1 28 | per_device_eval_batch_size: 1 # must 1 29 | save_strategy: steps 30 | save_steps: 3000 31 | evaluation_strategy: "no" 32 | eval_steps: 3000 33 | eval_num: 600 34 | use_legacy_prediction_loop: True 35 | predict_with_generate: True 36 | prediction_loss_only: False 37 | generation_max_length: 1536 38 | learning_rate: 2e-5 39 | gradient_checkpointing: True 40 | 41 | # wether do fast epoch 42 | fast_epoch: False 43 | 44 | vision_tower: openai/clip-vit-large-patch14 45 | mm_vision_select_layer: -2 46 | mm_use_im_start_end: True 47 | lazy_preprocess: True 48 | bf16: False 49 | fp16: True 50 | tf32: False 51 | gradient_accumulation_steps: 1 52 | weight_decay: 0. 53 | warmup_ratio: 0.03 54 | lr_scheduler_type: cosine 55 | logging_steps: 1 56 | model_max_length: 2048 57 | deepspeed: valley/configs/deepspeed/config_zero3.json 58 | report_to: wandb -------------------------------------------------------------------------------- /baselines/valley/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "." 
5 | -------------------------------------------------------------------------------- /baselines/valley/model/apply_delta.py: -------------------------------------------------------------------------------- 1 | """ 2 | Usage: 3 | python3 -m fastchat.model.apply_delta --base ~/model_weights/llama-7b --target ~/model_weights/vicuna-7b --delta lmsys/vicuna-7b-delta 4 | """ 5 | import argparse 6 | 7 | import torch 8 | from tqdm import tqdm 9 | from transformers import AutoTokenizer, AutoModelForCausalLM 10 | from valley import ValleyLlamaForCausalLM 11 | 12 | 13 | def apply_delta(base_model_path, target_model_path, delta_path): 14 | print("Loading base model") 15 | base = AutoModelForCausalLM.from_pretrained( 16 | base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 17 | 18 | print("Loading delta") 19 | delta = ValleyLlamaForCausalLM.from_pretrained(delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True) 20 | delta_tokenizer = AutoTokenizer.from_pretrained(delta_path) 21 | 22 | print("Applying delta") 23 | for name, param in tqdm(delta.state_dict().items(), desc="Applying delta"): 24 | if name not in base.state_dict(): 25 | assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'] or 'vision_tower' in name, f'{name} not in base model' 26 | continue 27 | if param.data.shape == base.state_dict()[name].shape: 28 | param.data += base.state_dict()[name] 29 | else: 30 | assert name in ['model.embed_tokens.weight', 'lm_head.weight'], \ 31 | f'{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}' 32 | bparam = base.state_dict()[name] 33 | param.data[:bparam.shape[0], :bparam.shape[1]] += bparam 34 | 35 | print("Saving target model") 36 | delta.save_pretrained(target_model_path) 37 | delta_tokenizer.save_pretrained(target_model_path) 38 | 39 | 40 | if __name__ == "__main__": 41 | parser = argparse.ArgumentParser() 42 | parser.add_argument("--base-model-path", type=str, required=True) 43 | parser.add_argument("--target-model-path", type=str, required=True) 44 | parser.add_argument("--delta-path", type=str, required=True) 45 | 46 | args = parser.parse_args() 47 | 48 | apply_delta(args.base_model_path, args.target_model_path, args.delta_path) 49 | -------------------------------------------------------------------------------- /baselines/valley/train/train.sh: -------------------------------------------------------------------------------- 1 | torchrun --nproc_per_node 8 --nnodes 1 --node_rank=0 --master_addr 10.192.24.78 --master_port 10404 valley/train/train.py --conf $1 -------------------------------------------------------------------------------- /baselines/valley/util/config.py: -------------------------------------------------------------------------------- 1 | IGNORE_INDEX = -100 2 | DEFAULT_PAD_TOKEN = "[PAD]" 3 | DEFAULT_EOS_TOKEN = "</s>" 4 | DEFAULT_BOS_TOKEN = "<s>" 5 | DEFAULT_UNK_TOKEN = "<unk>" 6 | DEFAULT_IMAGE_TOKEN = "<image>" 7 | DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>" 8 | DEFAULT_IM_START_TOKEN = "<im_start>" 9 | DEFAULT_IM_END_TOKEN = "<im_end>" 10 | DEFAULT_VIDEO_TOKEN = "