├── vista_llama ├── eval │ ├── __init__.py │ ├── run_inference_benchmark_general.py │ ├── run_inference_benchmark_consistency.py │ ├── model_utils.py │ ├── run_inference_qformer_nextqa.py │ └── run_inference_qformer_general.py ├── __init__.py ├── .ipynb_checkpoints │ └── Untitled-checkpoint.ipynb ├── model │ ├── __init__.py │ ├── consolidate.py │ ├── utils.py │ ├── make_delta.py │ ├── dist_utils.py │ ├── logger.py │ ├── base_model.py │ └── blip2.py ├── constants.py ├── train │ ├── train_mem.py │ ├── llava_trainer.py │ └── llama_flash_attn_monkey_patch.py ├── inference.py ├── utils.py └── video_conversation.py ├── MSRVTT-QA ├── readme.txt └── category.txt ├── requirements.txt ├── README.md ├── scripts ├── set_env.sh ├── convert_instruction_json_to_training_format.py ├── run_infer.sh ├── apply_delta.py ├── train_val_eval.sh ├── save_evaclip_features.py ├── save_spatio_temporal_clip_features.py └── save_patch_clip_features.py ├── quantitative_evaluation ├── evaluate_benchmark.sh ├── analyze_nextqa.py ├── benchmark_dataset_generation │ ├── generate_correctness_detailed_context_qa.py │ ├── generate_temporal_qa.py │ └── generate_consistency_qa.py ├── evaluate_benchmark_1_correctness.py ├── evaluate_activitynet_qa.py ├── evaluate_benchmark_4_temporal.py ├── evaluate_nextqa.py ├── evaluate_benchmark_2_detailed_orientation.py ├── evaluate_benchmark_3_context.py └── evaluate_benchmark_5_consistency.py ├── data ├── README.md └── generate_descriptive_qa.py └── test.yaml /vista_llama/eval/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /vista_llama/__init__.py: -------------------------------------------------------------------------------- 1 | from .model import VideoChatGPTLlamaForCausalLM 2 | -------------------------------------------------------------------------------- /vista_llama/.ipynb_checkpoints/Untitled-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [], 3 | "metadata": {}, 4 | "nbformat": 4, 5 | "nbformat_minor": 5 6 | } 7 | -------------------------------------------------------------------------------- /vista_llama/model/__init__.py: -------------------------------------------------------------------------------- 1 | from .video_chatgpt import VideoChatGPTLlamaForCausalLM, VideoChatGPTConfig 2 | from .vista_llama_qformer import VistaLLaMAQformerLlamaForCausalLM 3 | -------------------------------------------------------------------------------- /vista_llama/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "." 5 | 6 | 7 | # Defining model 8 | DEFAULT_VIDEO_TOKEN = "