├── .gitignore
├── README.md
├── docs
│   ├── 1-CLI_DEMO.md
│   ├── 2-Training.md
│   └── images
│       ├── figures
│       │   ├── 1-architecture.png
│       │   ├── audio-qual.png
│       │   ├── comparison-prev_versions.png
│       │   ├── grounding-qual.png
│       │   ├── quant_grounding.png
│       │   ├── quant_our_benchmark.png
│       │   ├── quant_zero_shot.png
│       │   └── teaser.png
│       └── logos
│           ├── IVAL_logo.png
│           ├── MBZUAI_logo.png
│           ├── Oryx_logo.png
│           └── logo.png
├── grounding_evaluation
│   ├── README.md
│   ├── datasets
│   │   ├── hcstvg_dataset.py
│   │   ├── hcstvg_dataset_extract_interrogative.py
│   │   ├── preproc_hcstvgv2.py
│   │   ├── preproc_vidstg.py
│   │   └── vidstg_dataset.py
│   ├── eval_grounding.py
│   ├── gen_qualitative_results.py
│   ├── grounding_new_api.py
│   └── util
│       ├── box_ops.py
│       ├── dist.py
│       ├── entity_matching_openai.py
│       ├── image_tagging.py
│       ├── image_transforms.py
│       ├── misc.py
│       └── result_utils.py
├── quantitative_evaluation
│   ├── README.md
│   ├── benchmark_dataset_generation
│   │   ├── generate_consistency_qa.py
│   │   ├── generate_correctness_detailed_context_qa.py
│   │   └── generate_temporal_qa.py
│   ├── evaluate_activitynet_qa.py
│   ├── evaluate_benchmark.sh
│   ├── evaluate_benchmark_1_correctness.py
│   ├── evaluate_benchmark_2_detailed_orientation.py
│   ├── evaluate_benchmark_3_context.py
│   ├── evaluate_benchmark_4_temporal.py
│   └── evaluate_benchmark_5_consistency.py
├── requirements.txt
├── scripts
│   ├── convert_instruction_json_to_training_format.py
│   ├── filter_for_missing_videos.py
│   └── save_spatio_temporal_clip_features.py
└── video_chatgpt
    ├── __init__.py
    ├── audio_transcript
    │   ├── __init__.py
    │   └── transcribe.py
    ├── chat.py
    ├── constants.py
    ├── eval
    │   ├── __init__.py
    │   ├── model_utils.py
    │   ├── run_inference_benchmark_consistency.py
    │   ├── run_inference_benchmark_general.py
    │   ├── run_inference_qa_activitynet.py
    │   ├── run_inference_qa_msrvtt.py
    │   ├── run_inference_qa_msvd.py
    │   └── run_inference_qa_tgif.py
    ├── inference.py
    ├── model
    │   ├── __init__.py
    │   ├── consolidate.py
    │   ├── make_delta.py
    │   ├── multimodal_projector
    │   │   ├── __init__.py
    │   │   └── builder.py
    │   ├── utils.py
    │   └── video_chatgpt.py
    ├── train
    │   ├── llama_flash_attn_monkey_patch.py
    │   ├── llava_trainer.py
    │   ├── train.py
    │   └── train_mem.py
    ├── utils.py
    └── video_conversation.py

/.gitignore:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/.gitignore

/README.md:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/README.md

/docs/1-CLI_DEMO.md:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/docs/1-CLI_DEMO.md

/docs/2-Training.md:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/docs/2-Training.md

/docs/images/figures/1-architecture.png:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/docs/images/figures/1-architecture.png

/docs/images/figures/audio-qual.png:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/docs/images/figures/audio-qual.png

/docs/images/figures/comparison-prev_versions.png:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/docs/images/figures/comparison-prev_versions.png

/docs/images/figures/grounding-qual.png:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/docs/images/figures/grounding-qual.png

/docs/images/figures/quant_grounding.png:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/docs/images/figures/quant_grounding.png

/docs/images/figures/quant_our_benchmark.png:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/docs/images/figures/quant_our_benchmark.png

/docs/images/figures/quant_zero_shot.png:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/docs/images/figures/quant_zero_shot.png

/docs/images/figures/teaser.png:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/docs/images/figures/teaser.png

/docs/images/logos/IVAL_logo.png:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/docs/images/logos/IVAL_logo.png

/docs/images/logos/MBZUAI_logo.png:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/docs/images/logos/MBZUAI_logo.png

/docs/images/logos/Oryx_logo.png:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/docs/images/logos/Oryx_logo.png

/docs/images/logos/logo.png:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/docs/images/logos/logo.png

/grounding_evaluation/README.md:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/grounding_evaluation/README.md

/grounding_evaluation/datasets/hcstvg_dataset.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/grounding_evaluation/datasets/hcstvg_dataset.py

/grounding_evaluation/datasets/hcstvg_dataset_extract_interrogative.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/grounding_evaluation/datasets/hcstvg_dataset_extract_interrogative.py

/grounding_evaluation/datasets/preproc_hcstvgv2.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/grounding_evaluation/datasets/preproc_hcstvgv2.py

/grounding_evaluation/datasets/preproc_vidstg.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/grounding_evaluation/datasets/preproc_vidstg.py

/grounding_evaluation/datasets/vidstg_dataset.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/grounding_evaluation/datasets/vidstg_dataset.py

/grounding_evaluation/eval_grounding.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/grounding_evaluation/eval_grounding.py

/grounding_evaluation/gen_qualitative_results.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/grounding_evaluation/gen_qualitative_results.py

/grounding_evaluation/grounding_new_api.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/grounding_evaluation/grounding_new_api.py

/grounding_evaluation/util/box_ops.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/grounding_evaluation/util/box_ops.py

/grounding_evaluation/util/dist.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/grounding_evaluation/util/dist.py

/grounding_evaluation/util/entity_matching_openai.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/grounding_evaluation/util/entity_matching_openai.py

/grounding_evaluation/util/image_tagging.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/grounding_evaluation/util/image_tagging.py

/grounding_evaluation/util/image_transforms.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/grounding_evaluation/util/image_transforms.py

/grounding_evaluation/util/misc.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/grounding_evaluation/util/misc.py

/grounding_evaluation/util/result_utils.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/grounding_evaluation/util/result_utils.py

/quantitative_evaluation/README.md:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/quantitative_evaluation/README.md

/quantitative_evaluation/benchmark_dataset_generation/generate_consistency_qa.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/quantitative_evaluation/benchmark_dataset_generation/generate_consistency_qa.py

/quantitative_evaluation/benchmark_dataset_generation/generate_correctness_detailed_context_qa.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/quantitative_evaluation/benchmark_dataset_generation/generate_correctness_detailed_context_qa.py

/quantitative_evaluation/benchmark_dataset_generation/generate_temporal_qa.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/quantitative_evaluation/benchmark_dataset_generation/generate_temporal_qa.py

/quantitative_evaluation/evaluate_activitynet_qa.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/quantitative_evaluation/evaluate_activitynet_qa.py

/quantitative_evaluation/evaluate_benchmark.sh:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/quantitative_evaluation/evaluate_benchmark.sh

/quantitative_evaluation/evaluate_benchmark_1_correctness.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/quantitative_evaluation/evaluate_benchmark_1_correctness.py

/quantitative_evaluation/evaluate_benchmark_2_detailed_orientation.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/quantitative_evaluation/evaluate_benchmark_2_detailed_orientation.py

/quantitative_evaluation/evaluate_benchmark_3_context.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/quantitative_evaluation/evaluate_benchmark_3_context.py

/quantitative_evaluation/evaluate_benchmark_4_temporal.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/quantitative_evaluation/evaluate_benchmark_4_temporal.py

/quantitative_evaluation/evaluate_benchmark_5_consistency.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/quantitative_evaluation/evaluate_benchmark_5_consistency.py

/requirements.txt:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/requirements.txt

/scripts/convert_instruction_json_to_training_format.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/scripts/convert_instruction_json_to_training_format.py

/scripts/filter_for_missing_videos.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/scripts/filter_for_missing_videos.py

/scripts/save_spatio_temporal_clip_features.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/scripts/save_spatio_temporal_clip_features.py

/video_chatgpt/__init__.py:
from .model import VideoChatGPTLlamaForCausalLM
(a short import sketch for this export is given at the end of this listing)

/video_chatgpt/audio_transcript/__init__.py:
(empty file)

/video_chatgpt/audio_transcript/transcribe.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/video_chatgpt/audio_transcript/transcribe.py

/video_chatgpt/chat.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/video_chatgpt/chat.py

/video_chatgpt/constants.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/video_chatgpt/constants.py

/video_chatgpt/eval/__init__.py:
(empty file)

/video_chatgpt/eval/model_utils.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/video_chatgpt/eval/model_utils.py

/video_chatgpt/eval/run_inference_benchmark_consistency.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/video_chatgpt/eval/run_inference_benchmark_consistency.py

/video_chatgpt/eval/run_inference_benchmark_general.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/video_chatgpt/eval/run_inference_benchmark_general.py

/video_chatgpt/eval/run_inference_qa_activitynet.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/video_chatgpt/eval/run_inference_qa_activitynet.py

/video_chatgpt/eval/run_inference_qa_msrvtt.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/video_chatgpt/eval/run_inference_qa_msrvtt.py

/video_chatgpt/eval/run_inference_qa_msvd.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/video_chatgpt/eval/run_inference_qa_msvd.py

/video_chatgpt/eval/run_inference_qa_tgif.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/video_chatgpt/eval/run_inference_qa_tgif.py

/video_chatgpt/inference.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/video_chatgpt/inference.py

/video_chatgpt/model/__init__.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/video_chatgpt/model/__init__.py

/video_chatgpt/model/consolidate.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/video_chatgpt/model/consolidate.py

/video_chatgpt/model/make_delta.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/video_chatgpt/model/make_delta.py

/video_chatgpt/model/multimodal_projector/__init__.py:
from . import *

/video_chatgpt/model/multimodal_projector/builder.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/video_chatgpt/model/multimodal_projector/builder.py

/video_chatgpt/model/utils.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/video_chatgpt/model/utils.py

/video_chatgpt/model/video_chatgpt.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/video_chatgpt/model/video_chatgpt.py

/video_chatgpt/train/llama_flash_attn_monkey_patch.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/video_chatgpt/train/llama_flash_attn_monkey_patch.py

/video_chatgpt/train/llava_trainer.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/video_chatgpt/train/llava_trainer.py

/video_chatgpt/train/train.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/video_chatgpt/train/train.py

/video_chatgpt/train/train_mem.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/video_chatgpt/train/train_mem.py

/video_chatgpt/utils.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/video_chatgpt/utils.py

/video_chatgpt/video_conversation.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/video_chatgpt/video_conversation.py
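
The only file bodies inlined in this dump are the small __init__.py files; every other entry is a raw URL. As a minimal sketch of what the video_chatgpt/__init__.py re-export means for a caller, the lines below show how the model class could be imported. This is not taken from the repository's documentation: the checkpoint path is a placeholder, and the Hugging Face-style from_pretrained interface is an assumption based on the class being a LLaMA-based causal LM.

# Hypothetical usage sketch (assumes the repository root is on PYTHONPATH).
# Because video_chatgpt/__init__.py does `from .model import VideoChatGPTLlamaForCausalLM`,
# the class can be imported straight from the package:
from video_chatgpt import VideoChatGPTLlamaForCausalLM
# (equivalently: from video_chatgpt.model import VideoChatGPTLlamaForCausalLM)

# Assumption: the class follows the Hugging Face PreTrainedModel interface,
# so loading weights would presumably look like this; the path is a placeholder.
model = VideoChatGPTLlamaForCausalLM.from_pretrained("path/to/video-chatgpt-weights")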