├── .gitignore
├── README.md
├── docs
│   ├── 1-CLI_DEMO.md
│   ├── 2-Training.md
│   └── images
│       ├── figures
│       │   ├── 1-architecture.png
│       │   ├── audio-qual.png
│       │   ├── comparison-prev_versions.png
│       │   ├── grounding-qual.png
│       │   ├── quant_grounding.png
│       │   ├── quant_our_benchmark.png
│       │   ├── quant_zero_shot.png
│       │   └── teaser.png
│       └── logos
│           ├── IVAL_logo.png
│           ├── MBZUAI_logo.png
│           ├── Oryx_logo.png
│           └── logo.png
├── grounding_evaluation
│   ├── README.md
│   ├── datasets
│   │   ├── hcstvg_dataset.py
│   │   ├── hcstvg_dataset_extract_interrogative.py
│   │   ├── preproc_hcstvgv2.py
│   │   ├── preproc_vidstg.py
│   │   └── vidstg_dataset.py
│   ├── eval_grounding.py
│   ├── gen_qualitative_results.py
│   ├── grounding_new_api.py
│   └── util
│       ├── box_ops.py
│       ├── dist.py
│       ├── entity_matching_openai.py
│       ├── image_tagging.py
│       ├── image_transforms.py
│       ├── misc.py
│       └── result_utils.py
├── quantitative_evaluation
│   ├── README.md
│   ├── benchmark_dataset_generation
│   │   ├── generate_consistency_qa.py
│   │   ├── generate_correctness_detailed_context_qa.py
│   │   └── generate_temporal_qa.py
│   ├── evaluate_activitynet_qa.py
│   ├── evaluate_benchmark.sh
│   ├── evaluate_benchmark_1_correctness.py
│   ├── evaluate_benchmark_2_detailed_orientation.py
│   ├── evaluate_benchmark_3_context.py
│   ├── evaluate_benchmark_4_temporal.py
│   └── evaluate_benchmark_5_consistency.py
├── requirements.txt
├── scripts
│   ├── convert_instruction_json_to_training_format.py
│   ├── filter_for_missing_videos.py
│   └── save_spatio_temporal_clip_features.py
└── video_chatgpt
    ├── __init__.py
    ├── audio_transcript
    │   ├── __init__.py
    │   └── transcribe.py
    ├── chat.py
    ├── constants.py
    ├── eval
    │   ├── __init__.py
    │   ├── model_utils.py
    │   ├── run_inference_benchmark_consistency.py
    │   ├── run_inference_benchmark_general.py
    │   ├── run_inference_qa_activitynet.py
    │   ├── run_inference_qa_msrvtt.py
    │   ├── run_inference_qa_msvd.py
    │   └── run_inference_qa_tgif.py
    ├── inference.py
    ├── model
    │   ├── __init__.py
    │   ├── consolidate.py
    │   ├── make_delta.py
    │   ├── multimodal_projector
    │   │   ├── __init__.py
    │   │   └── builder.py
    │   ├── utils.py
    │   └── video_chatgpt.py
    ├── train
    │   ├── llama_flash_attn_monkey_patch.py
    │   ├── llava_trainer.py
    │   ├── train.py
    │   └── train_mem.py
    ├── utils.py
    └── video_conversation.py

/.gitignore:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/.gitignore

/README.md:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/README.md

/docs/1-CLI_DEMO.md:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/docs/1-CLI_DEMO.md

/docs/2-Training.md:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/docs/2-Training.md

/docs/images/figures/1-architecture.png:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/docs/images/figures/1-architecture.png

/docs/images/figures/audio-qual.png:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/docs/images/figures/audio-qual.png

/docs/images/figures/comparison-prev_versions.png:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/docs/images/figures/comparison-prev_versions.png

/docs/images/figures/grounding-qual.png:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/docs/images/figures/grounding-qual.png

/docs/images/figures/quant_grounding.png:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/docs/images/figures/quant_grounding.png

/docs/images/figures/quant_our_benchmark.png:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/docs/images/figures/quant_our_benchmark.png

/docs/images/figures/quant_zero_shot.png:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/docs/images/figures/quant_zero_shot.png

/docs/images/figures/teaser.png:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/docs/images/figures/teaser.png

/docs/images/logos/IVAL_logo.png:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/docs/images/logos/IVAL_logo.png

/docs/images/logos/MBZUAI_logo.png:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/docs/images/logos/MBZUAI_logo.png

/docs/images/logos/Oryx_logo.png:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/docs/images/logos/Oryx_logo.png

/docs/images/logos/logo.png:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/docs/images/logos/logo.png

/grounding_evaluation/README.md:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/grounding_evaluation/README.md

/grounding_evaluation/datasets/hcstvg_dataset.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/grounding_evaluation/datasets/hcstvg_dataset.py

/grounding_evaluation/datasets/hcstvg_dataset_extract_interrogative.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/grounding_evaluation/datasets/hcstvg_dataset_extract_interrogative.py

/grounding_evaluation/datasets/preproc_hcstvgv2.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/grounding_evaluation/datasets/preproc_hcstvgv2.py

/grounding_evaluation/datasets/preproc_vidstg.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/grounding_evaluation/datasets/preproc_vidstg.py

/grounding_evaluation/datasets/vidstg_dataset.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/grounding_evaluation/datasets/vidstg_dataset.py

/grounding_evaluation/eval_grounding.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/grounding_evaluation/eval_grounding.py

/grounding_evaluation/gen_qualitative_results.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/grounding_evaluation/gen_qualitative_results.py

/grounding_evaluation/grounding_new_api.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/grounding_evaluation/grounding_new_api.py

/grounding_evaluation/util/box_ops.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/grounding_evaluation/util/box_ops.py

/grounding_evaluation/util/dist.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/grounding_evaluation/util/dist.py

/grounding_evaluation/util/entity_matching_openai.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/grounding_evaluation/util/entity_matching_openai.py

/grounding_evaluation/util/image_tagging.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/grounding_evaluation/util/image_tagging.py

/grounding_evaluation/util/image_transforms.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/grounding_evaluation/util/image_transforms.py

/grounding_evaluation/util/misc.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/grounding_evaluation/util/misc.py

/grounding_evaluation/util/result_utils.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/grounding_evaluation/util/result_utils.py

/quantitative_evaluation/README.md:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/quantitative_evaluation/README.md

/quantitative_evaluation/benchmark_dataset_generation/generate_consistency_qa.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/quantitative_evaluation/benchmark_dataset_generation/generate_consistency_qa.py

/quantitative_evaluation/benchmark_dataset_generation/generate_correctness_detailed_context_qa.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/quantitative_evaluation/benchmark_dataset_generation/generate_correctness_detailed_context_qa.py

/quantitative_evaluation/benchmark_dataset_generation/generate_temporal_qa.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/quantitative_evaluation/benchmark_dataset_generation/generate_temporal_qa.py

/quantitative_evaluation/evaluate_activitynet_qa.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/quantitative_evaluation/evaluate_activitynet_qa.py

/quantitative_evaluation/evaluate_benchmark.sh:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/quantitative_evaluation/evaluate_benchmark.sh

/quantitative_evaluation/evaluate_benchmark_1_correctness.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/quantitative_evaluation/evaluate_benchmark_1_correctness.py

/quantitative_evaluation/evaluate_benchmark_2_detailed_orientation.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/quantitative_evaluation/evaluate_benchmark_2_detailed_orientation.py

/quantitative_evaluation/evaluate_benchmark_3_context.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/quantitative_evaluation/evaluate_benchmark_3_context.py

/quantitative_evaluation/evaluate_benchmark_4_temporal.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/quantitative_evaluation/evaluate_benchmark_4_temporal.py

/quantitative_evaluation/evaluate_benchmark_5_consistency.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/quantitative_evaluation/evaluate_benchmark_5_consistency.py

/requirements.txt:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/requirements.txt

/scripts/convert_instruction_json_to_training_format.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/scripts/convert_instruction_json_to_training_format.py

/scripts/filter_for_missing_videos.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/scripts/filter_for_missing_videos.py

/scripts/save_spatio_temporal_clip_features.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/scripts/save_spatio_temporal_clip_features.py

/video_chatgpt/__init__.py:
from .model import VideoChatGPTLlamaForCausalLM
(a short import sketch for this export is given at the end of this listing)

/video_chatgpt/audio_transcript/__init__.py:
(empty file)

/video_chatgpt/audio_transcript/transcribe.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/video_chatgpt/audio_transcript/transcribe.py

/video_chatgpt/chat.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/video_chatgpt/chat.py

/video_chatgpt/constants.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/video_chatgpt/constants.py

/video_chatgpt/eval/__init__.py:
(empty file)

/video_chatgpt/eval/model_utils.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/video_chatgpt/eval/model_utils.py

/video_chatgpt/eval/run_inference_benchmark_consistency.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/video_chatgpt/eval/run_inference_benchmark_consistency.py

/video_chatgpt/eval/run_inference_benchmark_general.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/video_chatgpt/eval/run_inference_benchmark_general.py

/video_chatgpt/eval/run_inference_qa_activitynet.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/video_chatgpt/eval/run_inference_qa_activitynet.py

/video_chatgpt/eval/run_inference_qa_msrvtt.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/video_chatgpt/eval/run_inference_qa_msrvtt.py

/video_chatgpt/eval/run_inference_qa_msvd.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/video_chatgpt/eval/run_inference_qa_msvd.py

/video_chatgpt/eval/run_inference_qa_tgif.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/video_chatgpt/eval/run_inference_qa_tgif.py

/video_chatgpt/inference.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/video_chatgpt/inference.py

/video_chatgpt/model/__init__.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/video_chatgpt/model/__init__.py

/video_chatgpt/model/consolidate.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/video_chatgpt/model/consolidate.py

/video_chatgpt/model/make_delta.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/video_chatgpt/model/make_delta.py

/video_chatgpt/model/multimodal_projector/__init__.py:
from . import *

/video_chatgpt/model/multimodal_projector/builder.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/video_chatgpt/model/multimodal_projector/builder.py

/video_chatgpt/model/utils.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/video_chatgpt/model/utils.py

/video_chatgpt/model/video_chatgpt.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/video_chatgpt/model/video_chatgpt.py

/video_chatgpt/train/llama_flash_attn_monkey_patch.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/video_chatgpt/train/llama_flash_attn_monkey_patch.py

/video_chatgpt/train/llava_trainer.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/video_chatgpt/train/llava_trainer.py

/video_chatgpt/train/train.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/video_chatgpt/train/train.py

/video_chatgpt/train/train_mem.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/video_chatgpt/train/train_mem.py

/video_chatgpt/utils.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/video_chatgpt/utils.py

/video_chatgpt/video_conversation.py:
https://raw.githubusercontent.com/mbzuai-oryx/Video-LLaVA/HEAD/video_chatgpt/video_conversation.py
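
The only file bodies inlined in this dump are the small __init__.py files; every other entry is a raw URL. As a minimal sketch of what the video_chatgpt/__init__.py re-export means for a caller, the lines below show how the model class could be imported. This is not taken from the repository's documentation: the checkpoint path is a placeholder, and the Hugging Face-style from_pretrained interface is an assumption based on the class being a LLaMA-based causal LM.

# Hypothetical usage sketch (assumes the repository root is on PYTHONPATH).
# Because video_chatgpt/__init__.py does `from .model import VideoChatGPTLlamaForCausalLM`,
# the class can be imported straight from the package:
from video_chatgpt import VideoChatGPTLlamaForCausalLM
# (equivalently: from video_chatgpt.model import VideoChatGPTLlamaForCausalLM)

# Assumption: the class follows the Hugging Face PreTrainedModel interface,
# so loading weights would presumably look like this; the path is a placeholder.
model = VideoChatGPTLlamaForCausalLM.from_pretrained("path/to/video-chatgpt-weights")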