├── InternVid ├── README.md ├── README_CN.md ├── demo.ipynb ├── div_sampling.py ├── test_viCLIP.py ├── utils │ ├── basic_utils.py │ ├── config.py │ ├── config_utils.py │ ├── distributed.py │ ├── easydict.py │ ├── logger.py │ ├── optimizer.py │ └── scheduler.py └── viclip │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-311.pyc │ ├── __init__.cpython-39.pyc │ ├── simple_tokenizer.cpython-311.pyc │ ├── simple_tokenizer.cpython-39.pyc │ ├── viclip.cpython-39.pyc │ ├── viclip_text.cpython-39.pyc │ └── viclip_vision.cpython-39.pyc │ ├── bpe_simple_vocab_16e6.txt.gz │ ├── simple_tokenizer.py │ ├── viclip.py │ ├── viclip_text.py │ └── viclip_vision.py ├── LICENSE ├── LaViLa ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── __pycache__ │ └── eval_narrator.cpython-39.pyc ├── clip_caption.py ├── datasets │ └── README.md ├── demo.py ├── demo_narrator.py ├── demo_narrator_3rd_person.py ├── docs │ ├── INSTALL.md │ ├── MODEL_ZOO.md │ └── PRETRAIN.md ├── eval_narrator.py ├── eval_zeroshot.py ├── lavila │ ├── data │ │ ├── __pycache__ │ │ │ ├── datasets.cpython-39.pyc │ │ │ └── video_transforms.cpython-39.pyc │ │ ├── datasets.py │ │ └── video_transforms.py │ ├── models │ │ ├── __pycache__ │ │ │ ├── coca.cpython-39.pyc │ │ │ ├── distributed_utils.cpython-39.pyc │ │ │ ├── gpt2_gated.cpython-39.pyc │ │ │ ├── loss.cpython-39.pyc │ │ │ ├── models.cpython-39.pyc │ │ │ ├── narrator.cpython-39.pyc │ │ │ ├── openai_clip.cpython-39.pyc │ │ │ ├── openai_model.cpython-39.pyc │ │ │ ├── timesformer.cpython-39.pyc │ │ │ ├── tokenizer.cpython-39.pyc │ │ │ └── utils.cpython-39.pyc │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ ├── coca.py │ │ ├── distributed_utils.py │ │ ├── gpt2_gated.py │ │ ├── loss.py │ │ ├── models.py │ │ ├── narrator.py │ │ ├── openai_clip.py │ │ ├── openai_model.py │ │ ├── timesformer.py │ │ ├── tokenizer.py │ │ └── utils.py │ └── utils │ │ ├── __pycache__ │ │ ├── distributed.cpython-39.pyc │ │ └── preprocess.cpython-39.pyc │ │ ├── distributed.py │ │ ├── evaluation.py │ │ ├── evaluation_charades.py │ │ ├── evaluation_egomcq.py │ │ ├── evaluation_ek100cls.py │ │ ├── evaluation_ek100mir.py │ │ ├── meter.py │ │ ├── preprocess.py │ │ ├── random.py │ │ └── scheduler.py ├── main_finetune_classification.py ├── main_finetune_retrieval.py ├── main_infer_narrator.py ├── main_pretrain.py ├── requirements.txt ├── run_with_submitit_finetune_classification.py ├── run_with_submitit_finetune_retrieval.py ├── run_with_submitit_infer_narrator.py ├── run_with_submitit_pretrain.py └── scripts │ ├── convert_egovlp_ckpt.py │ └── crop_and_resize_ego4d.sh ├── README.md ├── captioning.py ├── config └── default.yaml ├── database.py ├── demo.py ├── encoder.py ├── environment.yaml ├── imgs ├── demo.png └── teaser.png ├── inference.py ├── main.py ├── preprocess ├── boats │ ├── captions.json │ ├── reid.mp4 │ ├── reid.pkl │ ├── segment2id.json │ ├── segment_textual_embedding.pkl │ ├── segment_visual_embedding.pkl │ ├── tid2clip.pkl │ ├── tid2dinov2.pkl │ ├── tracking.pkl │ └── uid2clip.pkl ├── books │ ├── captions.json │ ├── reid.mp4 │ ├── reid.pkl │ ├── segment2id.json │ ├── segment_textual_embedding.pkl │ ├── segment_visual_embedding.pkl │ ├── tid2clip.pkl │ ├── tid2dinov2.pkl │ ├── tracking.pkl │ └── uid2clip.pkl ├── kitchen │ ├── captions.json │ ├── reid.mp4 │ ├── reid.pkl │ ├── segment2id.json │ ├── segment_0.mp4 │ ├── segment_1.mp4 │ ├── segment_18.mp4 │ ├── segment_3.mp4 │ ├── segment_8.mp4 │ ├── segment_textual_embedding.pkl │ ├── segment_visual_embedding.pkl │ ├── tid2clip.pkl │ ├── 
tid2dinov2.pkl │ ├── tracking.pkl │ └── uid2clip.pkl ├── painting │ ├── captions.json │ ├── reid.mp4 │ ├── reid.pkl │ ├── segment2id.json │ ├── segment_83.mp4 │ ├── segment_85.mp4 │ ├── segment_textual_embedding.pkl │ ├── segment_visual_embedding.pkl │ ├── tid2clip.pkl │ ├── tid2dinov2.pkl │ ├── tracking.pkl │ └── uid2clip.pkl └── talking │ ├── captions.json │ ├── reid.mp4 │ ├── reid.pkl │ ├── segment2id.json │ ├── segment_10.mp4 │ ├── segment_11.mp4 │ ├── segment_9.mp4 │ ├── segment_textual_embedding.pkl │ ├── segment_visual_embedding.pkl │ ├── tid2clip.pkl │ ├── tid2dinov2.pkl │ ├── tracking.pkl │ └── uid2clip.pkl ├── prompts ├── database_query_prompt.txt ├── multiple_choice_prompt.txt └── prompt.txt ├── reid.py ├── sample_videos ├── boats.mp4 ├── books.mp4 ├── kitchen.mp4 ├── painting.mp4 └── talking.mp4 ├── segment_feature.py ├── tools.py ├── tracking.py ├── utils.py └── video-llava.py /InternVid/README_CN.md: -------------------------------------------------------------------------------- 1 | # InternVid \[[论文](https://arxiv.org/pdf/2307.06942.pdf)\] 2 | 3 | [![数据集](https://img.shields.io/badge/%F0%9F%A4%97%20InternVid-Dataset-blue)](https://huggingface.co/datasets/OpenGVLab/InternVid) | [![模型](https://img.shields.io/badge/%F0%9F%A4%97%20ViCLIP-Model-purple)](https://huggingface.co/OpenGVLab/ViCLIP) 4 | 5 | \[[English verision](README.md)\] 6 | 7 | # :fire: 新闻 8 | 我们很高兴宣布部分发布一个大规模的视频文本数据集,旨在促进多模态理解和生成。作为此次发布的一部分,我们提供了该数据集的[子集](https://huggingface.co/datasets/OpenGVLab/InternVid)包含1000万个视频剪辑。此外,我们还提供了一个使用ViT-L架构在这个子集上训练的[ViCLIP](https://huggingface.co/OpenGVLab/ViCLIP)。该模型在Kinetics上实现了SOTA的零样本动作识别性能。 9 | 10 | 我们提供了示例代码,阐明如何使用ViClip的过程,在[demo.ipynb](https://github.com/OpenGVLab/InternVideo/blob/main/Data/InternVid/demo.ipynb)中有详述。 11 | 12 | 请关注我们的更新! 13 | 14 | # 简介 15 | 16 | **数据** 17 | 18 | 我们从16个流行类别中收集了各种百分比的视频。为了确保多样性,我们选择了来自不同语言的国家的视频,而非依赖于一个主导语言环境。我们采样的国家包括英国、美国、澳大利亚、日本、韩国、中国、俄罗斯和法国等。在时长方面,每个视频平均持续351.9秒。几乎一半(49%)的视频时长不超过五分钟,而四分之一(26%)的视频时长在五到十分钟之间。只有8%的视频超过20分钟。在策划的视频中,85%是高分辨率(720P),其余15%的分辨率从360P至720P不等。虽然低分辨率的视频在内容生成任务中可能表现不如高分辨率的视频,但只要配有适当的字幕,它们仍可用于视频-语言表示学习。 19 | 20 | ![b469e00b43d46a6b3f89899483abcf6](https://github.com/OpenGVLab/InternVideo/assets/43169235/7d6aca7d-362a-425d-9ef2-ec0189491b52) 21 | 22 | InternVid展示了在分割剪辑级别上具有不同剪辑时长和字幕长度的多样性。美学分数和剪辑-字幕相似度均匀分布。大部分剪辑的长度在0-10秒之间,占所有剪辑的85%。大约一半的剪辑字幕含有10-20个单词,而三分之一的剪辑字幕含有少于10个单词。大约11%的剪辑具有超过20个单词的长字幕。 23 | 24 | ![429af4993adb77478c000c865ae5a1b](https://github.com/OpenGVLab/InternVideo/assets/43169235/f64588c3-81e8-43de-b771-46500474d2ff) 25 | 26 | **ViCLIP: 一个简单的用于转移视频-文本表示的视频CLIP** 27 | 28 | 基于CLIP, 我们构建了一个简单的视频-文本预训练基线ViCLIP。它由视频编码器(ViT)和文本编码器组成,如下所示。这两个模块都是从相应的CLIP组件初始化的。我们将视频编码器中的原生注意力更新为时空注意力,同时保持其他设计元素不变。为了高效学习,我们在预训练中对视频进行了掩蔽处理。 29 | 30 | 87c6263cc4aceee72cc8e37085a8109 31 | 32 | 33 | # 数据 & 模型库 34 | 35 | ### 预训练数据 & 模型 36 | 37 |
38 | 39 | | 模型 | 训练数据 | 描述 | 40 | | :-----------------: | :----------------------: | :---------------------------------------------------------------------------------------------------: | 41 | | ViCLIP-L-14 \[[HuggingFace](https://huggingface.co/OpenGVLab/ViCLIP) \| [Aliyun](https://pjlab-gvm-data.oss-cn-shanghai.aliyuncs.com/internvideo/viclip/ViClip-InternVid-10M-FLT.pth)\] | InternVid-10M-FLT \[[HuggingFace](https://huggingface.co/datasets/OpenGVLab/InternVid) \| [OpenDataLab](https://opendatalab.com/shepshep/InternVid)\] | | 42 |
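下面是一个最小的检索示例(a minimal sketch),基于本仓库 `viclip/__init__.py` 提供的辅助函数;完整示例请参考 `demo.ipynb` 和 `test_viCLIP.py`。其中视频与权重路径仅为占位符,且 `retrieve_text` 默认假定使用 CUDA 设备(也可显式传入 `device`):

```python
import cv2
from viclip import get_viclip, retrieve_text, _frame_from_video

# Decode all frames of a short clip (path is illustrative).
video = cv2.VideoCapture('example1.mp4')
frames = [f for f in _frame_from_video(video)]

# Load ViCLIP-L and its tokenizer from a local checkpoint (path is illustrative).
m = get_viclip(size='l', pretrain='ViClip-InternVid-10M-FLT.pth')

text_candidates = [
    "A playful dog runs through the snowy yard.",
    "A person cooks dinner in a kitchen.",
]
ret_texts, probs = retrieve_text(frames, text_candidates, models=m, topk=2)
for t, p in zip(ret_texts, probs):
    print(f'{p:.4f} {t}')
```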
43 | 44 | 45 | ## Citation 46 | 47 | 如果您发现这项工作对您的研究有所帮助,请考虑引用InternVid。您的肯定将极大地帮助我们继续为研究社区贡献资源。 48 | 49 | ``` 50 | @article{wang2023internvid, 51 | title={InternVid: A Large-scale Video-Text Dataset for Multimodal Understanding and Generation}, 52 | author={Wang, Yi and He, Yinan and Li, Yizhuo and Li, Kunchang and Yu, Jiashuo and Ma, Xin and Chen, Xinyuan and Wang, Yaohui and Luo, Ping and Liu, Ziwei and Wang, Yali and Wang, Limin and Qiao, Yu}, 53 | journal={arXiv preprint arXiv:2307.06942}, 54 | year={2023} 55 | } 56 | 57 | @article{wang2022internvideo, 58 | title={InternVideo: General Video Foundation Models via Generative and Discriminative Learning}, 59 | author={Wang, Yi and Li, Kunchang and Li, Yizhuo and He, Yinan and Huang, Bingkun and Zhao, Zhiyu and Zhang, Hongjie and Xu, Jilan and Liu, Yi and Wang, Zun and Xing, Sen and Chen, Guo and Pan, Junting and Yu, Jiashuo and Wang, Yali and Wang, Limin and Qiao, Yu}, 60 | journal={arXiv preprint arXiv:2212.03191}, 61 | year={2022} 62 | } 63 | ``` -------------------------------------------------------------------------------- /InternVid/div_sampling.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | import json 3 | import random 4 | import numpy as np 5 | data = json.load(open("/path/to/to_sample")) 6 | video_id = set([x["video"].split("/")[-1][:11] for x in data]) 7 | video_id_counter = Counter([x["video"].split("/")[-1][:11] for x in data]) 8 | sampling_weights = [1.0 / video_id_counter[x["video"].split("/")[-1][:11]] for x in data] 9 | np.random.seed(42) 10 | sampling_weights = np.array(sampling_weights) 11 | sampling_weights = sampling_weights / sampling_weights.sum() 12 | sampled_index = np.random.choice(len(data), 10647458, replace=False, p=sampling_weights) 13 | data = [data[i] for i in sampled_index] 14 | json.dump(data, open("/path/to/sampled", "w")) -------------------------------------------------------------------------------- /InternVid/test_viCLIP.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import cv2 4 | 5 | from viclip import get_viclip, retrieve_text, _frame_from_video 6 | video = cv2.VideoCapture('Data/InternVid/example1.mp4') 7 | frames = [x for x in _frame_from_video(video)] 8 | print('frames', frames) 9 | # modify xxx to the path of the pretrained model 10 | model_cfgs = { 11 | 'viclip-l-internvid-10m-flt': { 12 | 'size': 'l', 13 | 'pretrained': '/home/yue/data/ViClip-InternVid-10M-FLT.pth', 14 | }, 15 | 'viclip-l-internvid-200m': { 16 | 'size': 'l', 17 | 'pretrained': 'xxx/ViCLIP-L_InternVid-200M.pth', 18 | }, 19 | 'viclip-b-internvid-10m-flt': { 20 | 'size': 'b', 21 | 'pretrained': 'xxx/ViCLIP-B_InternVid-FLT-10M.pth', 22 | }, 23 | 'viclip-b-internvid-200m': { 24 | 'size': 'b', 25 | 'pretrained': 'xxx/ViCLIP-B_InternVid-200M.pth', 26 | }, 27 | } 28 | 29 | text_candidates = ["A playful dog and its owner wrestle in the snowy yard, chasing each other with joyous abandon.", 30 | "A man in a gray coat walks through the snowy landscape, pulling a sleigh loaded with toys.", 31 | "A person dressed in a blue jacket shovels the snow-covered pavement outside their house.", 32 | "A pet dog excitedly runs through the snowy yard, chasing a toy thrown by its owner.", 33 | "A person stands on the snowy floor, pushing a sled loaded with blankets, preparing for a fun-filled ride.", 34 | "A man in a gray hat and coat walks through the snowy yard, carefully navigating around the 
trees.", 35 | "A playful dog slides down a snowy hill, wagging its tail with delight.", 36 | "A person in a blue jacket walks their pet on a leash, enjoying a peaceful winter walk among the trees.", 37 | "A man in a gray sweater plays fetch with his dog in the snowy yard, throwing a toy and watching it run.", 38 | "A person bundled up in a blanket walks through the snowy landscape, enjoying the serene winter scenery."] 39 | 40 | cfg = model_cfgs['viclip-l-internvid-10m-flt'] 41 | model_l = get_viclip(cfg['size'], cfg['pretrained']) 42 | print('a') 43 | texts, probs = retrieve_text(frames, text_candidates, models=model_l, topk=5) 44 | -------------------------------------------------------------------------------- /InternVid/utils/config_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import sys 4 | from os.path import dirname, join 5 | 6 | from utils.config import Config 7 | from utils.distributed import init_distributed_mode, is_main_process 8 | from utils.logger import setup_logger 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | def setup_config(): 14 | """Conbine yaml config and command line config with OmegaConf. 15 | Also converts types, e.g., `'None'` (str) --> `None` (None) 16 | """ 17 | config = Config.get_config() 18 | if config.debug: 19 | config.wandb.enable = False 20 | return config 21 | 22 | 23 | def setup_evaluate_config(config): 24 | """setup evaluation default settings, e.g., disable wandb""" 25 | assert config.evaluate 26 | config.wandb.enable = False 27 | if config.output_dir is None: 28 | config.output_dir = join(dirname(config.pretrained_path), "eval") 29 | return config 30 | 31 | 32 | def setup_output_dir(output_dir, excludes=["code"]): 33 | """ensure not overwritting an exisiting/non-empty output dir""" 34 | if not os.path.exists(output_dir): 35 | os.makedirs(output_dir, exist_ok=False) 36 | else: 37 | existing_dirs_files = os.listdir(output_dir) # list 38 | remaining = set(existing_dirs_files) - set(excludes) 39 | remaining = [e for e in remaining if "slurm" not in e] 40 | remaining = [e for e in remaining if ".out" not in e] 41 | # assert len(remaining) == 0, f"remaining dirs or files: {remaining}" 42 | logger.warn(f"remaining dirs or files: {remaining}") 43 | 44 | 45 | def setup_main(): 46 | """ 47 | Setup config, logger, output_dir, etc. 48 | Shared for pretrain and all downstream tasks. 
49 | """ 50 | config = setup_config() 51 | if hasattr(config, "evaluate") and config.evaluate: 52 | config = setup_evaluate_config(config) 53 | init_distributed_mode(config) 54 | 55 | if is_main_process(): 56 | setup_output_dir(config.output_dir, excludes=["code"]) 57 | setup_logger(output=config.output_dir, color=True, name="vindlu") 58 | logger.info(f"config: {Config.pretty_text(config)}") 59 | Config.dump(config, os.path.join(config.output_dir, "config.json")) 60 | return config 61 | -------------------------------------------------------------------------------- /InternVid/utils/distributed.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/InternVid/utils/distributed.py -------------------------------------------------------------------------------- /InternVid/utils/easydict.py: -------------------------------------------------------------------------------- 1 | class EasyDict(dict): 2 | """ 3 | Get attributes 4 | 5 | >>> d = EasyDict({'foo':3}) 6 | >>> d['foo'] 7 | 3 8 | >>> d.foo 9 | 3 10 | >>> d.bar 11 | Traceback (most recent call last): 12 | ... 13 | AttributeError: 'EasyDict' object has no attribute 'bar' 14 | 15 | Works recursively 16 | 17 | >>> d = EasyDict({'foo':3, 'bar':{'x':1, 'y':2}}) 18 | >>> isinstance(d.bar, dict) 19 | True 20 | >>> d.bar.x 21 | 1 22 | 23 | Bullet-proof 24 | 25 | >>> EasyDict({}) 26 | {} 27 | >>> EasyDict(d={}) 28 | {} 29 | >>> EasyDict(None) 30 | {} 31 | >>> d = {'a': 1} 32 | >>> EasyDict(**d) 33 | {'a': 1} 34 | 35 | Set attributes 36 | 37 | >>> d = EasyDict() 38 | >>> d.foo = 3 39 | >>> d.foo 40 | 3 41 | >>> d.bar = {'prop': 'value'} 42 | >>> d.bar.prop 43 | 'value' 44 | >>> d 45 | {'foo': 3, 'bar': {'prop': 'value'}} 46 | >>> d.bar.prop = 'newer' 47 | >>> d.bar.prop 48 | 'newer' 49 | 50 | 51 | Values extraction 52 | 53 | >>> d = EasyDict({'foo':0, 'bar':[{'x':1, 'y':2}, {'x':3, 'y':4}]}) 54 | >>> isinstance(d.bar, list) 55 | True 56 | >>> from operator import attrgetter 57 | >>> map(attrgetter('x'), d.bar) 58 | [1, 3] 59 | >>> map(attrgetter('y'), d.bar) 60 | [2, 4] 61 | >>> d = EasyDict() 62 | >>> d.keys() 63 | [] 64 | >>> d = EasyDict(foo=3, bar=dict(x=1, y=2)) 65 | >>> d.foo 66 | 3 67 | >>> d.bar.x 68 | 1 69 | 70 | Still like a dict though 71 | 72 | >>> o = EasyDict({'clean':True}) 73 | >>> o.items() 74 | [('clean', True)] 75 | 76 | And like a class 77 | 78 | >>> class Flower(EasyDict): 79 | ... power = 1 80 | ... 81 | >>> f = Flower() 82 | >>> f.power 83 | 1 84 | >>> f = Flower({'height': 12}) 85 | >>> f.height 86 | 12 87 | >>> f['power'] 88 | 1 89 | >>> sorted(f.keys()) 90 | ['height', 'power'] 91 | 92 | update and pop items 93 | >>> d = EasyDict(a=1, b='2') 94 | >>> e = EasyDict(c=3.0, a=9.0) 95 | >>> d.update(e) 96 | >>> d.c 97 | 3.0 98 | >>> d['c'] 99 | 3.0 100 | >>> d.get('c') 101 | 3.0 102 | >>> d.update(a=4, b=4) 103 | >>> d.b 104 | 4 105 | >>> d.pop('a') 106 | 4 107 | >>> d.a 108 | Traceback (most recent call last): 109 | ... 
110 | AttributeError: 'EasyDict' object has no attribute 'a' 111 | """ 112 | 113 | def __init__(self, d=None, **kwargs): 114 | if d is None: 115 | d = {} 116 | if kwargs: 117 | d.update(**kwargs) 118 | for k, v in d.items(): 119 | setattr(self, k, v) 120 | # Class attributes 121 | for k in self.__class__.__dict__.keys(): 122 | if not (k.startswith("__") and k.endswith("__")) and not k in ("update", "pop"): 123 | setattr(self, k, getattr(self, k)) 124 | 125 | def __setattr__(self, name, value): 126 | if isinstance(value, (list, tuple)): 127 | value = [self.__class__(x) if isinstance(x, dict) else x for x in value] 128 | elif isinstance(value, dict) and not isinstance(value, self.__class__): 129 | value = self.__class__(value) 130 | super(EasyDict, self).__setattr__(name, value) 131 | super(EasyDict, self).__setitem__(name, value) 132 | 133 | __setitem__ = __setattr__ 134 | 135 | def update(self, e=None, **f): 136 | d = e or dict() 137 | d.update(f) 138 | for k in d: 139 | setattr(self, k, d[k]) 140 | 141 | def pop(self, k, d=None): 142 | if hasattr(self, k): 143 | delattr(self, k) 144 | return super(EasyDict, self).pop(k, d) 145 | 146 | 147 | if __name__ == "__main__": 148 | import doctest 149 | 150 | -------------------------------------------------------------------------------- /InternVid/utils/logger.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/InternVid/utils/logger.py -------------------------------------------------------------------------------- /InternVid/utils/optimizer.py: -------------------------------------------------------------------------------- 1 | """ Optimizer Factory w/ Custom Weight Decay 2 | Hacked together by / Copyright 2020 Ross Wightman 3 | """ 4 | import re 5 | import torch 6 | from torch import optim as optim 7 | from utils.distributed import is_main_process 8 | import logging 9 | logger = logging.getLogger(__name__) 10 | try: 11 | from apex.optimizers import FusedNovoGrad, FusedAdam, FusedLAMB, FusedSGD 12 | has_apex = True 13 | except ImportError: 14 | has_apex = False 15 | 16 | 17 | def add_weight_decay(model, weight_decay, no_decay_list=(), filter_bias_and_bn=True): 18 | named_param_tuples = [] 19 | for name, param in model.named_parameters(): 20 | if not param.requires_grad: 21 | continue # frozen weights 22 | if filter_bias_and_bn and (len(param.shape) == 1 or name.endswith(".bias")): 23 | named_param_tuples.append([name, param, 0]) 24 | elif name in no_decay_list: 25 | named_param_tuples.append([name, param, 0]) 26 | else: 27 | named_param_tuples.append([name, param, weight_decay]) 28 | return named_param_tuples 29 | 30 | 31 | def add_different_lr(named_param_tuples_or_model, diff_lr_names, diff_lr, default_lr): 32 | """use lr=diff_lr for modules named found in diff_lr_names, 33 | otherwise use lr=default_lr 34 | 35 | Args: 36 | named_param_tuples_or_model: List([name, param, weight_decay]), or nn.Module 37 | diff_lr_names: List(str) 38 | diff_lr: float 39 | default_lr: float 40 | Returns: 41 | named_param_tuples_with_lr: List([name, param, weight_decay, lr]) 42 | """ 43 | named_param_tuples_with_lr = [] 44 | logger.info(f"diff_names: {diff_lr_names}, diff_lr: {diff_lr}") 45 | for name, p, wd in named_param_tuples_or_model: 46 | use_diff_lr = False 47 | for diff_name in diff_lr_names: 48 | # if diff_name in name: 49 | if re.search(diff_name, name) is not None: 50 | logger.info(f"param {name} use different_lr: 
{diff_lr}") 51 | use_diff_lr = True 52 | break 53 | 54 | named_param_tuples_with_lr.append( 55 | [name, p, wd, diff_lr if use_diff_lr else default_lr] 56 | ) 57 | 58 | if is_main_process(): 59 | for name, _, wd, diff_lr in named_param_tuples_with_lr: 60 | logger.info(f"param {name}: wd: {wd}, lr: {diff_lr}") 61 | 62 | return named_param_tuples_with_lr 63 | 64 | 65 | def create_optimizer_params_group(named_param_tuples_with_lr): 66 | """named_param_tuples_with_lr: List([name, param, weight_decay, lr])""" 67 | group = {} 68 | for name, p, wd, lr in named_param_tuples_with_lr: 69 | if wd not in group: 70 | group[wd] = {} 71 | if lr not in group[wd]: 72 | group[wd][lr] = [] 73 | group[wd][lr].append(p) 74 | 75 | optimizer_params_group = [] 76 | for wd, lr_groups in group.items(): 77 | for lr, p in lr_groups.items(): 78 | optimizer_params_group.append(dict( 79 | params=p, 80 | weight_decay=wd, 81 | lr=lr 82 | )) 83 | logger.info(f"optimizer -- lr={lr} wd={wd} len(p)={len(p)}") 84 | return optimizer_params_group 85 | 86 | 87 | def create_optimizer(args, model, filter_bias_and_bn=True): 88 | opt_lower = args.opt.lower() 89 | weight_decay = args.weight_decay 90 | # check for modules that requires different lr 91 | if hasattr(args, "different_lr") and args.different_lr.enable: 92 | diff_lr_module_names = args.different_lr.module_names 93 | diff_lr = args.different_lr.lr 94 | else: 95 | diff_lr_module_names = [] 96 | diff_lr = None 97 | 98 | no_decay = {} 99 | if hasattr(model, 'no_weight_decay'): 100 | no_decay = model.no_weight_decay() 101 | named_param_tuples = add_weight_decay( 102 | model, weight_decay, no_decay, filter_bias_and_bn) 103 | named_param_tuples = add_different_lr( 104 | named_param_tuples, diff_lr_module_names, diff_lr, args.lr) 105 | parameters = create_optimizer_params_group(named_param_tuples) 106 | 107 | if 'fused' in opt_lower: 108 | assert has_apex and torch.cuda.is_available(), 'APEX and CUDA required for fused optimizers' 109 | 110 | opt_args = dict(lr=args.lr, weight_decay=weight_decay) 111 | if hasattr(args, 'opt_eps') and args.opt_eps is not None: 112 | opt_args['eps'] = args.opt_eps 113 | if hasattr(args, 'opt_betas') and args.opt_betas is not None: 114 | opt_args['betas'] = args.opt_betas 115 | if hasattr(args, 'opt_args') and args.opt_args is not None: 116 | opt_args.update(args.opt_args) 117 | 118 | opt_split = opt_lower.split('_') 119 | opt_lower = opt_split[-1] 120 | if opt_lower == 'sgd' or opt_lower == 'nesterov': 121 | opt_args.pop('eps', None) 122 | optimizer = optim.SGD(parameters, momentum=args.momentum, nesterov=True, **opt_args) 123 | elif opt_lower == 'momentum': 124 | opt_args.pop('eps', None) 125 | optimizer = optim.SGD(parameters, momentum=args.momentum, nesterov=False, **opt_args) 126 | elif opt_lower == 'adam': 127 | optimizer = optim.Adam(parameters, **opt_args) 128 | elif opt_lower == 'adamw': 129 | optimizer = optim.AdamW(parameters, **opt_args) 130 | else: 131 | assert False and "Invalid optimizer" 132 | raise ValueError 133 | return optimizer 134 | -------------------------------------------------------------------------------- /InternVid/utils/scheduler.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/InternVid/utils/scheduler.py -------------------------------------------------------------------------------- /InternVid/viclip/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .simple_tokenizer import SimpleTokenizer as _Tokenizer 2 | from .viclip import ViCLIP 3 | import torch 4 | import numpy as np 5 | import cv2 6 | import os 7 | 8 | 9 | def get_viclip(size='l', 10 | pretrain=os.path.join(os.path.dirname(os.path.abspath(__file__)), "ViClip-InternVid-10M-FLT.pth")): 11 | 12 | tokenizer = _Tokenizer() 13 | vclip = ViCLIP(tokenizer=tokenizer, size=size, pretrain=pretrain) 14 | m = {'viclip':vclip, 'tokenizer':tokenizer} 15 | 16 | return m 17 | 18 | def get_text_feat_dict(texts, clip, tokenizer, text_feat_d={}): 19 | for t in texts: 20 | feat = clip.get_text_features(t, tokenizer, text_feat_d) 21 | text_feat_d[t] = feat 22 | return text_feat_d 23 | 24 | def get_vid_feat(frames, clip): 25 | return clip.get_vid_features(frames) 26 | 27 | 28 | def _frame_from_video(video): 29 | while video.isOpened(): 30 | success, frame = video.read() 31 | if success: 32 | yield frame 33 | else: 34 | break 35 | 36 | v_mean = np.array([0.485, 0.456, 0.406]).reshape(1,1,3) 37 | v_std = np.array([0.229, 0.224, 0.225]).reshape(1,1,3) 38 | def normalize(data): 39 | return (data/255.0-v_mean)/v_std 40 | 41 | def frames2tensor(vid_list, fnum=8, target_size=(224, 224), device=torch.device('cuda')): 42 | assert(len(vid_list) >= fnum) 43 | step = len(vid_list) // fnum 44 | vid_list = vid_list[::step][:fnum] 45 | vid_list = [cv2.resize(x[:,:,::-1], target_size) for x in vid_list] 46 | vid_tube = [np.expand_dims(normalize(x), axis=(0, 1)) for x in vid_list] 47 | vid_tube = np.concatenate(vid_tube, axis=1) 48 | vid_tube = np.transpose(vid_tube, (0, 1, 4, 2, 3)) 49 | vid_tube = torch.from_numpy(vid_tube).to(device, non_blocking=True).float() 50 | return vid_tube 51 | 52 | def retrieve_text(frames, 53 | texts, 54 | models={'viclip':None, 55 | 'tokenizer':None}, 56 | topk=5, 57 | device=torch.device('cuda')): 58 | # clip, tokenizer = get_clip(name, model_cfg['size'], model_cfg['pretrained'], model_cfg['reload']) 59 | assert(type(models)==dict and models['viclip'] is not None and models['tokenizer'] is not None) 60 | clip, tokenizer = models['viclip'], models['tokenizer'] 61 | clip = clip.to(device) 62 | frames_tensor = frames2tensor(frames, device=device) 63 | vid_feat = get_vid_feat(frames_tensor, clip) 64 | 65 | text_feat_d = {} 66 | text_feat_d = get_text_feat_dict(texts, clip, tokenizer, text_feat_d) 67 | text_feats = [text_feat_d[t] for t in texts] 68 | text_feats_tensor = torch.cat(text_feats, 0) 69 | 70 | probs, idxs = clip.get_predict_label(vid_feat, text_feats_tensor, top=topk) 71 | 72 | ret_texts = [texts[i] for i in idxs.numpy()[0].tolist()] 73 | return ret_texts, probs.numpy()[0] -------------------------------------------------------------------------------- /InternVid/viclip/__pycache__/__init__.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/InternVid/viclip/__pycache__/__init__.cpython-311.pyc -------------------------------------------------------------------------------- /InternVid/viclip/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/InternVid/viclip/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- 
/InternVid/viclip/__pycache__/simple_tokenizer.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/InternVid/viclip/__pycache__/simple_tokenizer.cpython-311.pyc -------------------------------------------------------------------------------- /InternVid/viclip/__pycache__/simple_tokenizer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/InternVid/viclip/__pycache__/simple_tokenizer.cpython-39.pyc -------------------------------------------------------------------------------- /InternVid/viclip/__pycache__/viclip.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/InternVid/viclip/__pycache__/viclip.cpython-39.pyc -------------------------------------------------------------------------------- /InternVid/viclip/__pycache__/viclip_text.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/InternVid/viclip/__pycache__/viclip_text.cpython-39.pyc -------------------------------------------------------------------------------- /InternVid/viclip/__pycache__/viclip_vision.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/InternVid/viclip/__pycache__/viclip_vision.cpython-39.pyc -------------------------------------------------------------------------------- /InternVid/viclip/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/InternVid/viclip/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /InternVid/viclip/simple_tokenizer.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import html 3 | import os 4 | from functools import lru_cache 5 | 6 | import ftfy 7 | import regex as re 8 | 9 | 10 | @lru_cache() 11 | def default_bpe(): 12 | return os.path.join(os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz") 13 | # @lru_cache() 14 | # def default_bpe(): 15 | # return "bpe_simple_vocab_16e6.txt.gz" 16 | 17 | 18 | @lru_cache() 19 | def bytes_to_unicode(): 20 | """ 21 | Returns list of utf-8 byte and a corresponding list of unicode strings. 22 | The reversible bpe codes work on unicode strings. 23 | This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. 24 | When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. 25 | This is a signficant percentage of your normal, say, 32K bpe vocab. 26 | To avoid that, we want lookup tables between utf-8 bytes and unicode strings. 27 | And avoids mapping to whitespace/control characters the bpe code barfs on. 
28 | """ 29 | bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) 30 | cs = bs[:] 31 | n = 0 32 | for b in range(2**8): 33 | if b not in bs: 34 | bs.append(b) 35 | cs.append(2**8+n) 36 | n += 1 37 | cs = [chr(n) for n in cs] 38 | return dict(zip(bs, cs)) 39 | 40 | 41 | def get_pairs(word): 42 | """Return set of symbol pairs in a word. 43 | Word is represented as tuple of symbols (symbols being variable-length strings). 44 | """ 45 | pairs = set() 46 | prev_char = word[0] 47 | for char in word[1:]: 48 | pairs.add((prev_char, char)) 49 | prev_char = char 50 | return pairs 51 | 52 | 53 | def basic_clean(text): 54 | text = ftfy.fix_text(text) 55 | text = html.unescape(html.unescape(text)) 56 | return text.strip() 57 | 58 | 59 | def whitespace_clean(text): 60 | text = re.sub(r'\s+', ' ', text) 61 | text = text.strip() 62 | return text 63 | 64 | 65 | class SimpleTokenizer(object): 66 | def __init__(self, bpe_path: str = default_bpe()): 67 | self.byte_encoder = bytes_to_unicode() 68 | self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} 69 | merges = gzip.open(bpe_path).read().decode("utf-8").split('\n') 70 | merges = merges[1:49152-256-2+1] 71 | merges = [tuple(merge.split()) for merge in merges] 72 | vocab = list(bytes_to_unicode().values()) 73 | vocab = vocab + [v+'' for v in vocab] 74 | for merge in merges: 75 | vocab.append(''.join(merge)) 76 | vocab.extend(['<|startoftext|>', '<|endoftext|>']) 77 | self.encoder = dict(zip(vocab, range(len(vocab)))) 78 | self.decoder = {v: k for k, v in self.encoder.items()} 79 | self.bpe_ranks = dict(zip(merges, range(len(merges)))) 80 | self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'} 81 | self.pat = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE) 82 | 83 | def bpe(self, token): 84 | if token in self.cache: 85 | return self.cache[token] 86 | word = tuple(token[:-1]) + ( token[-1] + '',) 87 | pairs = get_pairs(word) 88 | 89 | if not pairs: 90 | return token+'' 91 | 92 | while True: 93 | bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf'))) 94 | if bigram not in self.bpe_ranks: 95 | break 96 | first, second = bigram 97 | new_word = [] 98 | i = 0 99 | while i < len(word): 100 | try: 101 | j = word.index(first, i) 102 | new_word.extend(word[i:j]) 103 | i = j 104 | except: 105 | new_word.extend(word[i:]) 106 | break 107 | 108 | if word[i] == first and i < len(word)-1 and word[i+1] == second: 109 | new_word.append(first+second) 110 | i += 2 111 | else: 112 | new_word.append(word[i]) 113 | i += 1 114 | new_word = tuple(new_word) 115 | word = new_word 116 | if len(word) == 1: 117 | break 118 | else: 119 | pairs = get_pairs(word) 120 | word = ' '.join(word) 121 | self.cache[token] = word 122 | return word 123 | 124 | def encode(self, text): 125 | bpe_tokens = [] 126 | text = whitespace_clean(basic_clean(text)).lower() 127 | for token in re.findall(self.pat, text): 128 | token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) 129 | bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' ')) 130 | return bpe_tokens 131 | 132 | def decode(self, tokens): 133 | text = ''.join([self.decoder[token] for token in tokens]) 134 | text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('', ' ') 135 | return text 136 | 
-------------------------------------------------------------------------------- /LaViLa/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | Facebook has adopted a Code of Conduct that we expect project participants to adhere to. 4 | Please read the [full text](https://code.fb.com/codeofconduct/) 5 | so that you can understand what actions will and will not be tolerated. -------------------------------------------------------------------------------- /LaViLa/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to LaViLa 2 | We want to make contributing to this project as easy and transparent as 3 | possible. 4 | 5 | ## Our Development Process 6 | Minor changes and improvements will be released on an ongoing basis. Larger changes (e.g., changesets implementing a new paper) will be released on a more periodic basis. 7 | 8 | ## Pull Requests 9 | We actively welcome your pull requests. 10 | 11 | 1. Fork the repo and create your branch from `main`. 12 | 2. If you've added code that should be tested, add tests. 13 | 3. If you've changed APIs, update the documentation. 14 | 4. Ensure the test suite passes. 15 | 5. Make sure your code lints. 16 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 17 | 18 | ## Contributor License Agreement ("CLA") 19 | In order to accept your pull request, we need you to submit a CLA. You only need 20 | to do this once to work on any of Facebook's open source projects. 21 | 22 | Complete your CLA here: 23 | 24 | ## Issues 25 | We use GitHub issues to track public bugs. Please ensure your description is 26 | clear and has sufficient instructions to be able to reproduce the issue. 27 | 28 | Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe 29 | disclosure of security bugs. In those cases, please go through the process 30 | outlined on that page and do not file a public issue. 31 | 32 | ## Coding Style 33 | * 4 spaces for indentation rather than tabs 34 | * 80 character line length 35 | * PEP8 formatting following [Black](https://black.readthedocs.io/en/stable/) 36 | 37 | ## License 38 | By contributing to LaViLa, you agree that your contributions will be licensed 39 | under the LICENSE file in the root directory of this source tree. 40 | -------------------------------------------------------------------------------- /LaViLa/LICENSE: -------------------------------------------------------------------------------- 1 | 2 | MIT License 3 | 4 | Copyright (c) Meta Platforms, Inc. and affiliates. 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | -------------------------------------------------------------------------------- /LaViLa/__pycache__/eval_narrator.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/LaViLa/__pycache__/eval_narrator.cpython-39.pyc -------------------------------------------------------------------------------- /LaViLa/clip_caption.py: -------------------------------------------------------------------------------- 1 | import decord 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | from collections import OrderedDict 5 | import time 6 | import torch 7 | import torchvision.transforms as transforms 8 | import torchvision.transforms._transforms_video as transforms_video 9 | import sys 10 | sys.path.insert(0, './') 11 | from lavila.data.video_transforms import Permute 12 | from lavila.data.datasets import get_frame_ids, video_loader_by_frames 13 | from lavila.models.models import VCLM_OPENAI_TIMESFORMER_BASE_GPT2 14 | from lavila.models.tokenizer import MyGPT2Tokenizer 15 | from base64 import b64encode 16 | import os 17 | import fnmatch 18 | import imageio 19 | import json 20 | import cv2 21 | 22 | 23 | ckpt_path = 'vclm_openai_timesformer_base_gpt2_base.pt_ego4d.jobid_319630.ep_0002.md5sum_68a71f.pth' 24 | ckpt = torch.load(ckpt_path, map_location='cpu') 25 | state_dict = OrderedDict() 26 | for k, v in ckpt['state_dict'].items(): 27 | state_dict[k.replace('module.', '')] = v 28 | 29 | # instantiate the model, and load the pre-trained weights 30 | model = VCLM_OPENAI_TIMESFORMER_BASE_GPT2( 31 | text_use_cls_token=False, 32 | project_embed_dim=256, 33 | gated_xattn=True, 34 | timesformer_gated_xattn=False, 35 | freeze_lm_vclm=False, 36 | freeze_visual_vclm=False, 37 | freeze_visual_vclm_temporal=False, 38 | num_frames=4, 39 | drop_path_rate=0. 
40 | ) 41 | 42 | model.load_state_dict(state_dict, strict=True) 43 | model.eval() 44 | tokenizer = MyGPT2Tokenizer('gpt2', add_bos=True) 45 | 46 | candidate_num = 5 47 | crop_size = 224 48 | val_transform = transforms.Compose([ 49 | Permute([3, 0, 1, 2]), 50 | transforms.Resize(crop_size), 51 | transforms.CenterCrop(crop_size), 52 | transforms_video.NormalizeVideo(mean=[108.3272985, 116.7460125, 104.09373615000001], std=[68.5005327, 66.6321579, 70.32316305]) 53 | ]) 54 | 55 | 56 | def decode_one(generated_ids, tokenizer): 57 | # get the index of 58 | if tokenizer.eos_token_id == tokenizer.bos_token_id: 59 | if tokenizer.eos_token_id in generated_ids[1:].tolist(): 60 | eos_id = generated_ids[1:].tolist().index(tokenizer.eos_token_id) + 1 61 | else: 62 | eos_id = len(generated_ids.tolist()) - 1 63 | elif tokenizer.eos_token_id in generated_ids.tolist(): 64 | eos_id = generated_ids.tolist().index(tokenizer.eos_token_id) 65 | else: 66 | eos_id = len(generated_ids.tolist()) - 1 67 | generated_text_str = tokenizer.tokenizer.decode(generated_ids[1:eos_id].tolist()) 68 | return generated_text_str 69 | 70 | 71 | def create_caption(frames): 72 | with torch.no_grad(): 73 | image_features = model.encode_image(frames) 74 | generated_text_ids, ppls = model.generate( 75 | image_features, 76 | tokenizer, 77 | target=None, # free-form generation 78 | max_text_length=77, 79 | top_k=None, 80 | top_p=0.95, # nucleus sampling 81 | num_return_sequences=candidate_num, # number of candidates: 10 82 | temperature=0.9, 83 | early_stopping=True, 84 | ) 85 | longest_sentence = "" 86 | for i in range(candidate_num): 87 | generated_text_str = decode_one(generated_text_ids[i], tokenizer) 88 | if len(generated_text_str) > len(longest_sentence): 89 | longest_sentence = generated_text_str 90 | return longest_sentence 91 | 92 | 93 | def captioning(frame_path, fps, caption_seconds=2, frames_per_caption=4): 94 | frame_interval = int(fps*caption_seconds/frames_per_caption) 95 | sequential_image_list = [] 96 | sequential_caption_list = dict() 97 | 98 | for root, dirs, files in os.walk(frame_path): 99 | for file in files: 100 | if fnmatch.fnmatch(file, '*.jpg'): 101 | sequential_image_list.append(file) 102 | 103 | sequential_image_list.sort() # ordered frame list 104 | 105 | start_frame = int(sequential_image_list[0].split('.')[0].split('_')[-1]) 106 | end_frame = int(sequential_image_list[-1].split('.')[0].split('_')[-1]) 107 | 108 | print(start_frame) 109 | print(end_frame) 110 | total_frames = end_frame-start_frame+1 111 | 112 | total_captions = total_frames//(fps*caption_seconds) 113 | IMAGE_NAME_PATTERN = "video_frame_{:07d}.jpg" 114 | 115 | 116 | for i in range(total_captions): 117 | print(i) 118 | caption_start_frame = start_frame + i * fps * caption_seconds 119 | caption_end_frame = start_frame + (i+1) * fps * caption_seconds 120 | input_frames = [] 121 | for j in range(frames_per_caption): 122 | frame_idx = caption_start_frame + j* frame_interval 123 | print('frame: ', frame_idx) 124 | frame_name = IMAGE_NAME_PATTERN.format(frame_idx) 125 | image_file = os.path.join(frame_path, frame_name) 126 | image = imageio.imread(image_file) 127 | input_frames.append(image) 128 | input_frames = torch.from_numpy(np.stack(input_frames, axis=0)).float() #[4, w, h, 3] 129 | #print("input_frames: ", input_frames) 130 | #print("input_frames.size: ", input_frames.size()) 131 | frames = val_transform(input_frames) 132 | frames = frames.unsqueeze(0) 133 | caption = create_caption(frames) 134 | time_stamps = 
"{}-{}".format(str(caption_start_frame), str(caption_end_frame)) 135 | sequential_caption_list[time_stamps] = caption 136 | 137 | with open(os.path.join(frame_path, 'captions.json'), 'w') as f: 138 | json.dump(sequential_caption_list, f) 139 | 140 | 141 | 142 | def captioning(frame_path, fps, caption_seconds=2, frames_per_caption=4): 143 | frame_interval = int(fps*caption_seconds/frames_per_caption) 144 | sequential_image_list = [] 145 | sequential_caption_list = dict() 146 | 147 | for root, dirs, files in os.walk(frame_path): 148 | for file in files: 149 | if fnmatch.fnmatch(file, '*.jpg'): 150 | sequential_image_list.append(file) 151 | 152 | sequential_image_list.sort() # ordered frame list 153 | 154 | start_frame = int(sequential_image_list[0].split('.')[0].split('_')[-1]) 155 | end_frame = int(sequential_image_list[-1].split('.')[0].split('_')[-1]) 156 | 157 | print(start_frame) 158 | print(end_frame) 159 | total_frames = end_frame-start_frame+1 160 | 161 | total_captions = total_frames//(fps*caption_seconds) 162 | IMAGE_NAME_PATTERN = "video_frame_{:07d}.jpg" 163 | 164 | 165 | for i in range(total_captions): 166 | print(i) 167 | caption_start_frame = start_frame + i * fps * caption_seconds 168 | caption_end_frame = start_frame + (i+1) * fps * caption_seconds 169 | input_frames = [] 170 | for j in range(frames_per_caption): 171 | frame_idx = caption_start_frame + j* frame_interval 172 | print('frame: ', frame_idx) 173 | frame_name = IMAGE_NAME_PATTERN.format(frame_idx) 174 | image_file = os.path.join(frame_path, frame_name) 175 | image = imageio.imread(image_file) 176 | input_frames.append(image) 177 | input_frames = torch.from_numpy(np.stack(input_frames, axis=0)).float() #[4, w, h, 3] 178 | #print("input_frames: ", input_frames) 179 | #print("input_frames.size: ", input_frames.size()) 180 | frames = val_transform(input_frames) 181 | frames = frames.unsqueeze(0) 182 | caption = create_caption(frames) 183 | time_stamps = "{}-{}".format(str(caption_start_frame), str(caption_end_frame)) 184 | sequential_caption_list[time_stamps] = caption 185 | 186 | with open(os.path.join(frame_path, 'captions.json'), 'w') as f: 187 | json.dump(sequential_caption_list, f) -------------------------------------------------------------------------------- /LaViLa/datasets/README.md: -------------------------------------------------------------------------------- 1 | # Preparing datasets for LAVILA 2 | 3 | Please download the (selected) datasets from the official websites and place or sim-link them under `$LAVILA_ROOT/datasets/`. 4 | 5 | ```bash 6 | $LAVILA_ROOT/datasets/ 7 | CharadesEgo/ 8 | EGTEA/ 9 | EK100/ 10 | Ego4D/ 11 | ``` 12 | 13 | ## Ego4D 14 | 1. Download [Ego4D videos](https://ego4d-data.org/docs/start-here/#download-data) (license is required). 15 | 16 | 2. Preprocess 17 | 18 | We cut each video into 5-minute-long chunks and resize the smaller size to be 288 pixels for faster IO. Please refer to [this script](scripts/crop_and_resize_ego4d.sh) for more details. 19 | 20 | 3. Download annotations 21 | 22 | a. Download [egomcq.json](https://drive.google.com/file/d/1-5iRYf4BCHmj4MYQYFRMY4bhsWJUN3rW/view) to `$LAVILA_ROOT/datasets/Ego4D` (if you want to evaluate EgoMCQ). 23 | 24 | b. Download [metadata for train split](https://dl.fbaipublicfiles.com/lavila/metadata/ego4d/ego4d_train.pkl) and [val split](https://dl.fbaipublicfiles.com/lavila/metadata/ego4d/ego4d_val.pkl) to `$LAVILA_ROOT/datasets/Ego4D` ((if you want to train LAVILA from scratch). 
25 | 26 | The fold should look like this: 27 | ```bash 28 | $LAVILA_ROOT/datasets/ 29 | Ego4D/ 30 | ego4d_train.pkl 31 | ego4d_val.pkl 32 | egomcq.json 33 | video_288px/ 34 | 000786a7-3f9d-4fe6-bfb3-045b368f7d44.mp4/ 35 | 0.mp4 36 | 300.mp4 37 | 000a3525-6c98-4650-aaab-be7d2c7b9402.mp4/ 38 | 0.mp4 39 | ... 40 | ``` 41 | 42 | 43 | ## EPIC-Kitchens-100 (EK-100) 44 | 45 | 1. Download annotations 46 | 47 | ```bash 48 | # Assume that you are under `datasets/EK100/` 49 | git clone https://github.com/epic-kitchens/epic-kitchens-100-annotations 50 | ``` 51 | 52 | 2. Download videos. 53 | 54 | a. For raw videos, please download them from [https://epic-kitchens.github.io/](https://epic-kitchens.github.io/). 55 | 56 | b. (Recommended) The raw videos are huge (~1 TB). As an alternative, please check out a [resized version](https://utexas.box.com/s/l7ij81ie5q07p9fdg0vtejihq61liln9). 57 | 58 | 3. (For EK-100 MIR) 59 | 60 | a. Generate the relevancy matrix of train/val splits using [the official code](https://github.com/mwray/Joint-Part-of-Speech-Embeddings). 61 | 62 | b. (Recommended) The generated result has some randomness. Therefore, we also provide the [replica of train split](https://dl.fbaipublicfiles.com/lavila/metadata/EK100/caption_relevancy_EPIC_100_retrieval_train.pkl) and [val split](https://dl.fbaipublicfiles.com/lavila/metadata/EK100/caption_relevancy_EPIC_100_retrieval_test.pkl). Please put them to the folder `$LAVILA_ROOT/datasets/EK100/epic-kitchens-100-annotations/retrieval_annotations/relevancy/`. 63 | 64 | 65 | The folder should look like this: 66 | ```bash 67 | $LAVILA_ROOT/datasets/ 68 | EK100/ 69 | epic-kitchens-100-annotations/ 70 | EPIC_100_train.csv 71 | EPIC_100_validation.csv 72 | ... 73 | retrieval_annotations/relevancy/ # this appears if you do 3. 74 | caption_relevancy_EPIC_100_retrieval_train.pkl 75 | caption_relevancy_EPIC_100_retrieval_test.pkl 76 | video_ht256px/ 77 | P01/ 78 | P01_01.MP4 79 | P01_02.MP4 80 | ... 81 | P01_19.MP4 82 | P02/ 83 | P02_01.MP4 84 | P02_02.MP4 85 | ... 86 | P02_15.MP4 87 | ... 88 | ``` 89 | 90 | ## CharadesEgo 91 | 92 | 1. Download annotations at [https://prior.allenai.org/projects/charades-ego](https://prior.allenai.org/projects/charades-ego). 93 | ```bash 94 | ### Annotations 95 | # Assume that you are under `datasets/CharadesEgo/` 96 | wget https://ai2-public-datasets.s3-us-west-2.amazonaws.com/charades/CharadesEgo.zip 97 | unzip CharadesEgo.zip && rm CharadesEgo.zip 98 | ``` 99 | 100 | 2. Download data (~11GB) at [https://prior.allenai.org/projects/charades-ego](https://prior.allenai.org/projects/charades-ego). 101 | ```bash 102 | ### Data 103 | wget https://ai2-public-datasets.s3-us-west-2.amazonaws.com/charades/CharadesEgo_v1_480.tar 104 | tar -xvf CharadesEgo_v1_480.tar # Or specify an external path using `-C` and sim-link it to here 105 | rm CharadesEgo_v1_480.tar 106 | ``` 107 | 108 | 3. (For fine-tuning CharadesEgo) Download two additional metadata files: [clip-level metadata (train)](https://dl.fbaipublicfiles.com/lavila/metadata/CharadesEgo/metadata_filtered_train.pkl) and [clip-level metadata (val)](https://dl.fbaipublicfiles.com/lavila/metadata/CharadesEgo/metadata_filtered_val.pkl). Put them to the folder `$LAVILA_ROOT/datasets/CharadesEgo/CharadesEgo/`. 109 | 110 | The folder should look like this: 111 | ```bash 112 | $LAVILA_ROOT/datasets/ 113 | CharadesEgo/ 114 | CharadesEgo/ 115 | CharadesEgo_v1_train_only1st.csv 116 | CharadesEgo_v1_test_only1st.csv 117 | ... 118 | metadata_filtered_train.pkl # this appears if you do 3. 
119 | metadata_filtered_val.pkl # this appears if you do 3. 120 | CharadesEgo_v1_480/ 121 | 005BU.mp4 122 | 005BUEGO.mp4 123 | ... 124 | ``` 125 | 126 | 127 | ## EGTEA 128 | 129 | 1. Visit [https://cbs.ic.gatech.edu/fpv/](https://cbs.ic.gatech.edu/fpv/). 130 | 131 | 2. Download `TRIMMED_ACTION_CLIPS` (~20GB) and `ACTION_ANNOTATIONS` and untar to the current folder `$LAVILA_ROOT/datasets/EGTEA`. 132 | 133 | ```bash 134 | unzip action_annotation.zip -d EGTEA/ && rm action_annotation.zip 135 | ``` 136 | 137 | The folder should look like this: 138 | ```bash 139 | $LAVILA_ROOT/datasets/ 140 | EGTEA/ 141 | train_split1.txt 142 | test_split1.txt 143 | cropped_clips/ 144 | OP01-R01-PastaSalad/ 145 | OP01-R01-PastaSalad-1002316-1004005-F024051-F024101.mp4 146 | OP01-R01-PastaSalad-1004110-1021110-F024057-F024548.mp4 147 | OP01-R01-PastaSalad-1022590-1024050-F024539-F024581.mp4 148 | ... 149 | OP01-R02-TurkeySandwich/ 150 | OP01-R02-TurkeySandwich-102320-105110-F002449-F002529.mp4 151 | OP01-R02-TurkeySandwich-105440-106460-F002528-F002558.mp4 152 | OP01-R02-TurkeySandwich-107332-133184-F002513-F003259.mp4 153 | ... 154 | ... 155 | ``` 156 | -------------------------------------------------------------------------------- /LaViLa/demo.py: -------------------------------------------------------------------------------- 1 | import decord 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | from collections import OrderedDict 5 | import time 6 | import torch 7 | import torchvision.transforms as transforms 8 | import torchvision.transforms._transforms_video as transforms_video 9 | 10 | import sys 11 | sys.path.insert(0, './') 12 | from lavila.data.video_transforms import Permute 13 | from lavila.data.datasets import get_frame_ids, video_loader_by_frames 14 | from lavila.models.models import VCLM_OPENAI_TIMESFORMER_BASE_GPT2 15 | from lavila.models.tokenizer import MyGPT2Tokenizer 16 | 17 | 18 | video_path = 'assets/3c0dffd0-e38e-4643-bc48-d513943dc20b_012_014.mp4' 19 | 20 | 21 | from base64 import b64encode 22 | 23 | 24 | # The video is represented by `num_seg=4` frames 25 | vr = decord.VideoReader(video_path) 26 | print("total length:", len(vr)) 27 | num_seg = 4 28 | frame_ids = get_frame_ids(0, len(vr), num_segments=num_seg, jitter=False) 29 | frames = video_loader_by_frames('./', video_path, frame_ids) 30 | print(frames) 31 | print('frames_size:', frames.size()) #[num_seg, w, h, 3] 32 | 33 | 34 | # display the subsampled frames 35 | # plt.figure(figsize=(16, 40)) 36 | # for i in range(num_seg): 37 | # plt.subplot(1, num_seg, i + 1) 38 | # plt.imshow(frames[i].cpu().numpy().astype(int)) 39 | # plt.axis('off') 40 | # plt.show() 41 | 42 | 43 | ckpt_path = 'vclm_openai_timesformer_base_gpt2_base.pt_ego4d.jobid_319630.ep_0002.md5sum_68a71f.pth' 44 | ckpt = torch.load(ckpt_path, map_location='cpu') 45 | state_dict = OrderedDict() 46 | for k, v in ckpt['state_dict'].items(): 47 | state_dict[k.replace('module.', '')] = v 48 | 49 | # instantiate the model, and load the pre-trained weights 50 | model = VCLM_OPENAI_TIMESFORMER_BASE_GPT2( 51 | text_use_cls_token=False, 52 | project_embed_dim=256, 53 | gated_xattn=True, 54 | timesformer_gated_xattn=False, 55 | freeze_lm_vclm=False, 56 | freeze_visual_vclm=False, 57 | freeze_visual_vclm_temporal=False, 58 | num_frames=4, 59 | drop_path_rate=0. 
60 | ) 61 | 62 | model.load_state_dict(state_dict, strict=True) 63 | 64 | num_params = sum(p.numel() for p in model.parameters()) 65 | print(f'model params: {num_params}') 66 | model.eval() 67 | #model.cuda() 68 | print('loaded into GPU') 69 | # transforms on input frames 70 | crop_size = 224 71 | val_transform = transforms.Compose([ 72 | Permute([3, 0, 1, 2]), 73 | transforms.Resize(crop_size), 74 | transforms.CenterCrop(crop_size), 75 | transforms_video.NormalizeVideo(mean=[108.3272985, 116.7460125, 104.09373615000001], std=[68.5005327, 66.6321579, 70.32316305]) 76 | ]) 77 | frames = val_transform(frames) 78 | print("frames shape before squeeze: ", frames.size()) #[3, 4, 224, 224] 79 | frames = frames.unsqueeze(0) # fake a batch dimension 80 | print("frames shape: ", frames.size()) #[1, 3, 4, 224, 224] 81 | 82 | tokenizer = MyGPT2Tokenizer('gpt2', add_bos=True) 83 | 84 | candidate_num = 5 85 | 86 | def decode_one(generated_ids, tokenizer): 87 | # get the index of 88 | if tokenizer.eos_token_id == tokenizer.bos_token_id: 89 | if tokenizer.eos_token_id in generated_ids[1:].tolist(): 90 | eos_id = generated_ids[1:].tolist().index(tokenizer.eos_token_id) + 1 91 | else: 92 | eos_id = len(generated_ids.tolist()) - 1 93 | elif tokenizer.eos_token_id in generated_ids.tolist(): 94 | eos_id = generated_ids.tolist().index(tokenizer.eos_token_id) 95 | else: 96 | eos_id = len(generated_ids.tolist()) - 1 97 | generated_text_str = tokenizer.tokenizer.decode(generated_ids[1:eos_id].tolist()) 98 | return generated_text_str 99 | 100 | 101 | 102 | start_time = time.time() 103 | for i in range(100): 104 | with torch.no_grad(): 105 | image_features = model.encode_image(frames) 106 | generated_text_ids, ppls = model.generate( 107 | image_features, 108 | tokenizer, 109 | target=None, # free-form generation 110 | max_text_length=77, 111 | top_k=None, 112 | top_p=0.95, # nucleus sampling 113 | num_return_sequences=candidate_num, # number of candidates: 10 114 | temperature=0.7, 115 | early_stopping=True, 116 | ) 117 | for i in range(candidate_num): 118 | generated_text_str = decode_one(generated_text_ids[i], tokenizer) 119 | print('{}: {}'.format(i, generated_text_str)) 120 | end_time = time.time() 121 | print(end_time-start_time) -------------------------------------------------------------------------------- /LaViLa/demo_narrator.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
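# Overview: this demo samples num_seg=4 frames from the input clip, downloads the
# TimeSformer-Large/336px + GPT-2 XL narrator checkpoint into modelzoo/ on first
# run, and prints 10 candidate narrations generated with nucleus sampling
# (top_p=0.95, temperature=0.7).
# Illustrative invocation (the clip path is a placeholder):
#   python demo_narrator.py --video-path assets/<your_clip>.mp4 --cuda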
6 | 7 | 8 | import argparse 9 | import os 10 | import urllib.request 11 | from collections import OrderedDict 12 | 13 | import torch 14 | import torchvision.transforms as transforms 15 | import torchvision.transforms._transforms_video as transforms_video 16 | import decord 17 | 18 | from lavila.data.video_transforms import Permute 19 | from lavila.data.datasets import get_frame_ids, video_loader_by_frames 20 | from lavila.models.models import VCLM_OPENAI_TIMESFORMER_LARGE_336PX_GPT2_XL 21 | from lavila.models.tokenizer import MyGPT2Tokenizer 22 | from eval_narrator import decode_one 23 | import cv2 24 | 25 | def main(args): 26 | 27 | vr = decord.VideoReader(args.video_path) 28 | num_seg = 4 29 | frame_ids = get_frame_ids(0, len(vr), num_segments=num_seg, jitter=False) 30 | print('frame_ids: ', frame_ids) 31 | frames = video_loader_by_frames('./', args.video_path, frame_ids) 32 | test_frame = frames[0].numpy() 33 | print(test_frame.shape) 34 | cv2.imwrite("test_frame.jpg", cv2.cvtColor(test_frame, cv2.COLOR_BGR2RGB)) 35 | ckpt_name = 'vclm_openai_timesformer_large_336px_gpt2_xl.pt_ego4d.jobid_246897.ep_0003.md5sum_443263.pth' 36 | ckpt_path = os.path.join('modelzoo/', ckpt_name) 37 | os.makedirs('modelzoo/', exist_ok=True) 38 | if not os.path.exists(ckpt_path): 39 | print('downloading model to {}'.format(ckpt_path)) 40 | urllib.request.urlretrieve('https://dl.fbaipublicfiles.com/lavila/checkpoints/narrator/{}'.format(ckpt_name), ckpt_path) 41 | ckpt = torch.load(ckpt_path, map_location='cpu') 42 | state_dict = OrderedDict() 43 | for k, v in ckpt['state_dict'].items(): 44 | state_dict[k.replace('module.', '')] = v 45 | 46 | # instantiate the model, and load the pre-trained weights 47 | model = VCLM_OPENAI_TIMESFORMER_LARGE_336PX_GPT2_XL( 48 | text_use_cls_token=False, 49 | project_embed_dim=256, 50 | gated_xattn=True, 51 | timesformer_gated_xattn=False, 52 | freeze_lm_vclm=False, # we use model.eval() anyway 53 | freeze_visual_vclm=False, # we use model.eval() anyway 54 | num_frames=4, 55 | drop_path_rate=0. 
56 | ) 57 | model.load_state_dict(state_dict, strict=True) 58 | if args.cuda: 59 | model.cuda() 60 | model.eval() 61 | 62 | # transforms on input frames 63 | crop_size = 336 64 | val_transform = transforms.Compose([ 65 | Permute([3, 0, 1, 2]), 66 | transforms.Resize(crop_size), 67 | transforms.CenterCrop(crop_size), 68 | transforms_video.NormalizeVideo(mean=[108.3272985, 116.7460125, 104.09373615000001], std=[68.5005327, 66.6321579, 70.32316305]) 69 | ]) 70 | frames = val_transform(frames) 71 | frames = frames.unsqueeze(0) # fake a batch dimension 72 | 73 | tokenizer = MyGPT2Tokenizer('gpt2-xl', add_bos=True) 74 | with torch.no_grad(): 75 | if args.cuda: 76 | frames = frames.cuda(non_blocking=True) 77 | image_features = model.encode_image(frames) 78 | generated_text_ids, ppls = model.generate( 79 | image_features, 80 | tokenizer, 81 | target=None, # free-form generation 82 | max_text_length=77, 83 | top_k=None, 84 | top_p=0.95, # nucleus sampling 85 | num_return_sequences=10, # number of candidates: 10 86 | temperature=0.7, 87 | early_stopping=True, 88 | ) 89 | 90 | for i in range(10): 91 | generated_text_str = decode_one(generated_text_ids[i], tokenizer) 92 | print('{}: {}'.format(i, generated_text_str)) 93 | 94 | 95 | if __name__ == '__main__': 96 | parser = argparse.ArgumentParser('lavila narrator demo') 97 | parser.add_argument('--cuda', default=True, action='store_true', help='use cuda') 98 | parser.add_argument('--video-path', default='assets/3c0dffd0-e38e-4643-bc48-d513943dc20b_012_014.mp4', type=str, help='video path') 99 | #parser.add_argument('--video-path', default='/home/yue/data/mount/fillipo/Datasets/Ego4d/v1/full_scale/0a3dc289-557f-4121-9bc7-521a2b5d3bb8.mp4', type=str, help='video path') 100 | args = parser.parse_args() 101 | main(args) 102 | -------------------------------------------------------------------------------- /LaViLa/demo_narrator_3rd_person.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | 8 | import argparse 9 | import os 10 | import urllib.request 11 | from collections import OrderedDict 12 | 13 | import torch 14 | import torchvision.transforms as transforms 15 | import torchvision.transforms._transforms_video as transforms_video 16 | import decord 17 | 18 | from lavila.data.video_transforms import Permute 19 | from lavila.data.datasets import get_frame_ids, video_loader_by_frames 20 | from lavila.models.models import VCLM_OPENAI_TIMESFORMER_LARGE_GPT2_XL 21 | from lavila.models.tokenizer import MyGPT2Tokenizer 22 | from eval_narrator import decode_one 23 | 24 | 25 | def main(args): 26 | 27 | vr = decord.VideoReader(args.video_path) 28 | num_seg = 4 29 | frame_ids = get_frame_ids(0, len(vr), num_segments=num_seg, jitter=False) 30 | frames = video_loader_by_frames('./', args.video_path, frame_ids) 31 | 32 | ckpt_name = 'vclm_openai_timesformer_large_gpt2_xl.pt_htm.jobid_341080.ep_0001.pth' 33 | ckpt_path = os.path.join('modelzoo/', ckpt_name) 34 | os.makedirs('modelzoo/', exist_ok=True) 35 | if not os.path.exists(ckpt_path): 36 | print('downloading model to {}'.format(ckpt_path)) 37 | urllib.request.urlretrieve('https://dl.fbaipublicfiles.com/lavila/checkpoints/narrator/htm_aa/{}'.format(ckpt_name), ckpt_path) 38 | ckpt = torch.load(ckpt_path, map_location='cpu') 39 | state_dict = OrderedDict() 40 | for k, v in ckpt['state_dict'].items(): 41 | state_dict[k.replace('module.', '')] = v 42 | 43 | # instantiate the model, and load the pre-trained weights 44 | model = VCLM_OPENAI_TIMESFORMER_LARGE_GPT2_XL( 45 | text_use_cls_token=False, 46 | project_embed_dim=256, 47 | gated_xattn=True, 48 | timesformer_gated_xattn=False, 49 | freeze_lm_vclm=False, # we use model.eval() anyway 50 | freeze_visual_vclm=False, # we use model.eval() anyway 51 | freeze_visual_vclm_temporal=False, 52 | num_frames=4, 53 | drop_path_rate=0. 
54 | ) 55 | model.load_state_dict(state_dict, strict=True) 56 | if args.cuda: 57 | model.cuda() 58 | model.eval() 59 | 60 | # transforms on input frames 61 | crop_size = 224 62 | val_transform = transforms.Compose([ 63 | Permute([3, 0, 1, 2]), 64 | transforms.Resize(crop_size), 65 | transforms.CenterCrop(crop_size), 66 | transforms_video.NormalizeVideo(mean=[108.3272985, 116.7460125, 104.09373615000001], std=[68.5005327, 66.6321579, 70.32316305]) 67 | ]) 68 | frames = val_transform(frames) 69 | frames = frames.unsqueeze(0) # fake a batch dimension 70 | 71 | tokenizer = MyGPT2Tokenizer('gpt2-xl', add_bos=True) 72 | with torch.no_grad(): 73 | if args.cuda: 74 | frames = frames.cuda(non_blocking=True) 75 | image_features = model.encode_image(frames) 76 | generated_text_ids, ppls = model.generate( 77 | image_features, 78 | tokenizer, 79 | target=None, # free-form generation 80 | max_text_length=77, 81 | top_k=None, 82 | top_p=0.95, # nucleus sampling 83 | num_return_sequences=10, # number of candidates: 10 84 | temperature=0.7, 85 | early_stopping=True, 86 | ) 87 | 88 | for i in range(10): 89 | generated_text_str = decode_one(generated_text_ids[i], tokenizer) 90 | print('{}: {}'.format(i, generated_text_str)) 91 | 92 | 93 | if __name__ == '__main__': 94 | parser = argparse.ArgumentParser('lavila narrator demo') 95 | parser.add_argument('--cuda', action='store_true', help='use cuda') 96 | parser.add_argument('--video-path', type=str, 97 | default='assets/mixkit-pastry-chef-cutting-a-loaf-into-slices-43015-medium.mp4') 98 | args = parser.parse_args() 99 | main(args) 100 | -------------------------------------------------------------------------------- /LaViLa/docs/INSTALL.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | ## Requirements 4 | 5 | 6 | ## Example conda environment setup 7 | 8 | ```bash 9 | conda create --name lavila python=3.8 -y 10 | conda activate lavila 11 | pip install -r requirements.txt 12 | ``` 13 | 14 | ## datasets 15 | If you want to train/evaluate on the datasets, please see [datasets/README.md](../datasets/README.md) to see how we prepare datasets for this project. 16 | -------------------------------------------------------------------------------- /LaViLa/docs/PRETRAIN.md: -------------------------------------------------------------------------------- 1 | # LAVILA Pretraining 2 | 3 | In this doc, we provide a step-by-step guide (with commands) to train LaViLa. 4 | Note that we recommend running the following job with four 8x V100 (32GB) nodes (or eight nodes for the larger backbone) using [submitit](https://github.com/facebookincubator/submitit). 5 | See how to install submitit at [here](./MODEL_ZOO.md#multi-node-training). 6 | 7 | 8 | ## Pre-training Dual-Encoder Baseline 9 | 10 | We first pre-train a dual-encoder baseline with human annotations on Ego4d clips. 11 | The goal is (1) to establish a comparable baseline for LAVILA, and (2) provide a video encoder for narrator (see below). 12 | We use a default batch size of 32 per gpu so that the total batch size for InfoNCE loss is `32*8*4=1024`. 13 | 14 |
Train a baseline dual-encoder (with TSF-B) 15 | 16 | ```bash 17 | python run_with_submitit_pretrain.py --model CLIP_OPENAI_TIMESFORMER_BASE \ 18 | --norm-embed --freeze-temperature \ 19 | --fix-lr --contrastive-use-vissl \ 20 | --nodes 4 --use_volta32 21 | ``` 22 |
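For reference, the InfoNCE objective mentioned above is the standard CLIP-style symmetric contrastive loss over video-text pairs (see `lavila/models/loss.py` for the implementation used here). The following is only a minimal sketch of that objective; the 1024 x 256 shapes are illustrative, not the exact training configuration:

```python
import torch
import torch.nn.functional as F


def clip_style_infonce(video_emb, text_emb, temperature=0.07):
    """Symmetric InfoNCE: matching video/text pairs sit on the diagonal."""
    video_emb = F.normalize(video_emb, dim=-1)
    text_emb = F.normalize(text_emb, dim=-1)
    logits = video_emb @ text_emb.t() / temperature  # (B, B) similarity matrix
    targets = torch.arange(video_emb.size(0))        # positive index = own row/column
    return 0.5 * (F.cross_entropy(logits, targets) + F.cross_entropy(logits.t(), targets))


# e.g. a global batch of 1024 clip-caption pairs embedded in a 256-d joint space
loss = clip_style_infonce(torch.randn(1024, 256), torch.randn(1024, 256))
```

Every other clip/caption in the batch acts as a negative, which is why the global batch size matters.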
23 | 24 | To fit a High-Resolution TimeSformer-Large with a sufficient batch size, we use [DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert), a memory-efficient text encoder, instead of the original CLIP text encoder. Additionally, we apply [gradient checkpointing](https://pytorch.org/docs/stable/checkpoint.html) and the [Zero Redundancy Optimizer (ZeRO)](https://arxiv.org/abs/1910.02054). 25 | 26 |
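Neither technique is specific to LaViLa. The sketch below only illustrates what the `--use-checkpoint` and `--use-zero` flags correspond to in plain PyTorch (a toy module and a single-process `gloo` group so the snippet runs standalone), not how the training scripts wire them up:

```python
import os
import torch
import torch.distributed as dist
from torch.distributed.optim import ZeroRedundancyOptimizer
from torch.utils.checkpoint import checkpoint

# a single-process group, only so that ZeroRedundancyOptimizer can be constructed
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
dist.init_process_group("gloo", rank=0, world_size=1)

block = torch.nn.Sequential(torch.nn.Linear(512, 2048), torch.nn.GELU(), torch.nn.Linear(2048, 512))
x = torch.randn(8, 512, requires_grad=True)

# gradient checkpointing: activations inside `block` are recomputed during backward
y = checkpoint(block, x)

# ZeRO: optimizer state is sharded across ranks instead of being replicated on every GPU
optimizer = ZeroRedundancyOptimizer(block.parameters(), optimizer_class=torch.optim.AdamW, lr=1e-4)
y.sum().backward()
optimizer.step()
dist.destroy_process_group()
```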
Train a baseline dual-encoder (with TSF-L@HR) 27 | 28 | ```bash 29 | python run_with_submitit_pretrain.py --model CLIP_OPENAI_TIMESFORMER_LARGE_336PX_DISTILBERT_BASE \ 30 | --batch-size 8 \ 31 | --use-checkpoint --use-zero \ 32 | --norm-embed --freeze-temperature \ 33 | --fix-lr --contrastive-use-vissl \ 34 | --nodes 8 --use_volta32 35 | ``` 36 |
37 | 38 | ## Training and Evaluating Narrator 39 | 40 | The narrator is a *visually conditioned* large language model (VCLM), which comprises a pre-trained video encoder (obtained above), a text decoder (GPT-2 family), and a few gated cross-attention modules that attend to visual information while captioning. Both the video encoder and the text decoder are kept frozen; only the cross-attention modules are learnable. 41 | 42 | Note that we turn off PyTorch's automatic mixed precision (AMP) when training the narrator, since we observe that training is unstable with AMP on. 43 | 44 | Also note that `$PATH` can be found in the `Vis. Encoder` column of [MODEL_ZOO.md#Narrator](./MODEL_ZOO.md#narrator). If you are using your own checkpoint (e.g. one pre-trained in the previous step), please make sure that the following keys have been dropped from the checkpoint: `epoch`, `optimizer`, and `scaler`. 45 | 46 |
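The gated cross-attention modules follow the familiar tanh-gating pattern (the repo's own versions live in `lavila/models/gpt2_gated.py` and `lavila/models/coca.py`): the cross-attended visual signal enters through a learnable gate that can start at zero, so the decoder initially behaves like the frozen GPT-2. A minimal sketch of the general idea, with illustrative dimensions rather than the actual model configuration:

```python
import torch
import torch.nn as nn


class GatedCrossAttentionBlock(nn.Module):
    """LM hidden states attend to visual tokens; a tanh gate (init 0) scales the result."""

    def __init__(self, dim, num_heads=8):
        super().__init__()
        self.attn = nn.MultiheadAttention(dim, num_heads, batch_first=True)
        self.gate = nn.Parameter(torch.zeros(1))  # tanh(0) = 0 -> identity mapping at init

    def forward(self, text_tokens, visual_tokens):
        attended, _ = self.attn(text_tokens, visual_tokens, visual_tokens)
        return text_tokens + torch.tanh(self.gate) * attended


block = GatedCrossAttentionBlock(dim=768)
txt = torch.randn(2, 16, 768)    # (batch, text_len, dim)
vis = torch.randn(2, 196, 768)   # (batch, num_visual_tokens, dim)
out = block(txt, vis)            # same shape as txt
```

During narrator training only blocks like this receive gradients; the encoder and decoder weights stay frozen.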
Train a baseline narrator (TSF-B as visual encoder and GPT-2 base as textual decoder) 47 | 48 | ```bash 49 | python run_with_submitit_pretrain.py \ 50 | --model VCLM_OPENAI_TIMESFORMER_BASE_GPT2 \ 51 | --gated-xattn --freeze-lm-vclm --freeze-visual-vclm --freeze-visual-vclm-temporal \ 52 | --fix-lr --batch-size 8 --clip-grad-value 1.0 --eval-freq 1 --disable-amp \ 53 | --nodes 4 --use_volta32 --resume $PATH # Eg. $PATH can be "modelzoo/clip_openai_timesformer_base.baseline.ep_0003.pth" 54 | ``` 55 | 56 |
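If you resume from a dual-encoder checkpoint you trained yourself, the trainer-state keys mentioned above can be stripped with a few lines; the path below is just the example from the command above, so substitute your own:

```python
import torch

ckpt_path = 'modelzoo/clip_openai_timesformer_base.baseline.ep_0003.pth'
ckpt = torch.load(ckpt_path, map_location='cpu')
for key in ('epoch', 'optimizer', 'scaler'):
    ckpt.pop(key, None)  # drop trainer state; keep only the model weights
torch.save(ckpt, ckpt_path.replace('.pth', '.weights_only.pth'))
```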
57 | 58 |
Train a strong narrator (TSF-L@HR as visual encoder and GPT-2 XL as textual decoder) 59 | 60 | ```bash 61 | python run_with_submitit_pretrain.py \ 62 | --model VCLM_OPENAI_TIMESFORMER_LARGE_336PX_GPT2_XL \ 63 | --gated-xattn --freeze-lm-vclm --freeze-visual-vclm --freeze-visual-vclm-temporal --use-checkpoint \ 64 | --fix-lr --batch-size 8 --clip-grad-value 1.0 --eval-freq 1 --disable-amp \ 65 | --nodes 4 --use_volta32 --resume $PATH # Eg. $PATH can be "modelzoo/clip_openai_timesformer_large_336px_distilbert_base.baseline.ep_0003.pth" 66 | ``` 67 |
68 | 69 |
Evaluate the narrator on Ego4D val split 70 | 71 | ```bash 72 | torchrun --nproc_per_node=1 eval_narrator.py \ 73 | --caption-top-p 0.95 --caption-temperature 0.7 \ 74 | --resume $VCLM_CHECKPOINT \ 75 | --eval-freq 10000 # evaluate on a 1/10000 subset of the Ego4D val split for fast evaluation 76 | ``` 77 | This will output some common NLG metrics, such as BLEU-x, METEOR, ROUGE_L, and CIDEr (using the human narrations as ground truth). 78 |
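The same metrics can also be computed outside of `eval_narrator.py` with the `nlg-eval` package pinned in `requirements.txt`. A minimal sketch with made-up narrations (the heavier embedding-based metrics are disabled):

```python
from nlgeval import NLGEval

hypotheses = ['C opens the fridge.', 'C cuts a tomato.']            # generated narrations
references = [['C opens the refrigerator.', 'C cuts the tomato.']]  # one set of ground-truth narrations

scorer = NLGEval(no_skipthoughts=True, no_glove=True)
print(scorer.compute_metrics(ref_list=references, hyp_list=hypotheses))  # Bleu_1..4, METEOR, ROUGE_L, CIDEr
```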
79 | 80 | ## Narrating video clips using LAVILA-Narrator 81 | 82 | 83 |
Infer the narrator 84 | 85 | ```bash 86 | python run_with_submitit_infer_narrator.py \ 87 | --metadata datasets/Ego4D/ego4d_train.pkl \ 88 | --batch-size 64 \ 89 | --resume $PATH --use-half \ 90 | --nodes 4 --use_volta32 91 | ``` 92 |
93 | 94 | It will generate a pickle file (`$output_dir/total.pkl`) which is a list of quintuples: `(video_uid: str, start_time: float, end_time: float, narration_list: List[str], NLL_list: List[float])`. 95 | 96 | For narrator-generated narrations on Ego4D ground-truth clips, we also provide a [replica](https://dl.fbaipublicfiles.com/lavila/metadata/ego4d/ego4d_train.narrator_63690737.return_10.pkl). Note that the narrator used here is our best-performing one. 97 | 98 | In addition, we can apply this narrator over the entire video for temporally dense auto-narration. We provide a [replica](https://dl.fbaipublicfiles.com/lavila/metadata/ego4d/ego4d_train.uncovered_all.narrator_63690737.return_5.pkl) (excluding the annotated clips). 99 | 100 | ## Rephrasing human narrations using LAVILA-Rephraser 101 | 102 | Rephraser is a standard LLM that can paraphrase narrations in existing clips. 103 | Specifically, we use an off-the-shelf T5-based paraphraser which is publicly available at [Hugging Face's model hub](https://huggingface.co/ramsrigouthamg/t5-large-paraphraser-diverse-high-quality). 104 | For more details, please refer to the [model card](https://huggingface.co/ramsrigouthamg/t5-large-paraphraser-diverse-high-quality). 105 | 106 | For rephrased human narrations on Ego4D ground-truth clips, we provide a [replica](https://dl.fbaipublicfiles.com/lavila/metadata/ego4d/ego4d_train.rephraser.no_punkt_top3.pkl). 107 | 108 | 109 | ## Pre-training LAVILA Dual-Encoder 110 | Now we are ready to pre-train the LAVILA dual-encoder by combining human annotations (augmented by Rephraser) and the Narrator-generated narrations. 111 | 112 |
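Before launching, it can be useful to sanity-check the narrator metadata. A minimal sketch, assuming the quintuple layout described earlier (the path is illustrative):

```python
import pickle

with open('output/total.pkl', 'rb') as f:  # or one of the replica .pkl files above
    entries = pickle.load(f)

video_uid, start_time, end_time, narrations, nlls = entries[0]
print(video_uid, start_time, end_time)
for text, nll in zip(narrations, nlls):
    print(f'{nll:.3f}  {text}')
```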
Training a LaViLa dual-encoder 113 | 114 | ```bash 115 | python run_with_submitit_pretrain.py --model CLIP_OPENAI_TIMESFORMER_BASE \ 116 | --metadata datasets/Ego4D/ego4d_train.rephraser.no_punkt_top3.pkl \ 117 | --metadata-aux datasets/Ego4D/ego4d_train.narrator_63690737.return_10.pkl \ 118 | --norm-embed --freeze-temperature \ 119 | --freeze-pseudo-temperature \ 120 | --fix-lr --contrastive-use-vissl \ 121 | --nodes 4 --use_volta32 # optionally also pass `datasets/Ego4D/ego4d_train.uncovered_all.narrator_63690737.return_5.pkl` to --metadata-aux 122 | ``` 123 |
124 | 125 | ## Down-stream Evaluation 126 | With the pre-trained dual-encoder at hand, we can now do zero-shot or fine-tuning evaluations on down-stream benchmarks. 127 | Please refer to [MODEL_ZOO.md](./MODEL_ZOO.md#zero-shot) for more details. 128 | -------------------------------------------------------------------------------- /LaViLa/lavila/data/__pycache__/datasets.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/LaViLa/lavila/data/__pycache__/datasets.cpython-39.pyc -------------------------------------------------------------------------------- /LaViLa/lavila/data/__pycache__/video_transforms.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/LaViLa/lavila/data/__pycache__/video_transforms.cpython-39.pyc -------------------------------------------------------------------------------- /LaViLa/lavila/data/video_transforms.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import math 8 | from typing import Sequence 9 | import torch 10 | import torch.nn as nn 11 | from torchvision import transforms 12 | 13 | 14 | class Permute(nn.Module): 15 | """ 16 | Permutation as an op 17 | """ 18 | 19 | def __init__(self, ordering): 20 | super().__init__() 21 | self.ordering = ordering 22 | 23 | def forward(self, frames): 24 | """ 25 | Args: 26 | frames in some ordering, by default (C, T, H, W) 27 | Returns: 28 | frames in the ordering that was specified 29 | """ 30 | return frames.permute(self.ordering) 31 | 32 | 33 | class TemporalCrop(nn.Module): 34 | """ 35 | Convert the video into smaller clips temporally. 36 | """ 37 | 38 | def __init__( 39 | self, frames_per_clip: int = 8, stride: int = 8, frame_stride: int = 1 40 | ): 41 | super().__init__() 42 | self.frames = frames_per_clip 43 | self.stride = stride 44 | self.frame_stride = frame_stride 45 | 46 | def forward(self, video): 47 | assert video.ndim == 4, "Must be (C, T, H, W)" 48 | res = [] 49 | for start in range( 50 | 0, video.size(1) - (self.frames * self.frame_stride) + 1, self.stride 51 | ): 52 | end = start + (self.frames) * self.frame_stride 53 | res.append(video[:, start: end: self.frame_stride, ...]) 54 | return res 55 | 56 | 57 | def crop_boxes(boxes, x_offset, y_offset): 58 | """ 59 | Perform crop on the bounding boxes given the offsets. 60 | Args: 61 | boxes (ndarray or None): bounding boxes to perform crop. The dimension 62 | is `num boxes` x 4. 63 | x_offset (int): cropping offset in the x axis. 64 | y_offset (int): cropping offset in the y axis. 65 | Returns: 66 | cropped_boxes (ndarray or None): the cropped boxes with dimension of 67 | `num boxes` x 4. 68 | """ 69 | cropped_boxes = boxes.copy() 70 | cropped_boxes[:, [0, 2]] = boxes[:, [0, 2]] - x_offset 71 | cropped_boxes[:, [1, 3]] = boxes[:, [1, 3]] - y_offset 72 | 73 | return cropped_boxes 74 | 75 | 76 | def uniform_crop(images, size, spatial_idx, boxes=None, scale_size=None): 77 | """ 78 | Perform uniform spatial sampling on the images and corresponding boxes. 79 | Args: 80 | images (tensor): images to perform uniform crop.
The dimension is 81 | `num frames` x `channel` x `height` x `width`. 82 | size (int): size of height and weight to crop the images. 83 | spatial_idx (int): 0, 1, or 2 for left, center, and right crop if width 84 | is larger than height. Or 0, 1, or 2 for top, center, and bottom 85 | crop if height is larger than width. 86 | boxes (ndarray or None): optional. Corresponding boxes to images. 87 | Dimension is `num boxes` x 4. 88 | scale_size (int): optinal. If not None, resize the images to scale_size before 89 | performing any crop. 90 | Returns: 91 | cropped (tensor): images with dimension of 92 | `num frames` x `channel` x `size` x `size`. 93 | cropped_boxes (ndarray or None): the cropped boxes with dimension of 94 | `num boxes` x 4. 95 | """ 96 | assert spatial_idx in [0, 1, 2] 97 | ndim = len(images.shape) 98 | if ndim == 3: 99 | images = images.unsqueeze(0) 100 | height = images.shape[2] 101 | width = images.shape[3] 102 | 103 | if scale_size is not None: 104 | if width <= height: 105 | width, height = scale_size, int(height / width * scale_size) 106 | else: 107 | width, height = int(width / height * scale_size), scale_size 108 | images = torch.nn.functional.interpolate( 109 | images, 110 | size=(height, width), 111 | mode="bilinear", 112 | align_corners=False, 113 | ) 114 | 115 | y_offset = int(math.ceil((height - size) / 2)) 116 | x_offset = int(math.ceil((width - size) / 2)) 117 | 118 | if height > width: 119 | if spatial_idx == 0: 120 | y_offset = 0 121 | elif spatial_idx == 2: 122 | y_offset = height - size 123 | else: 124 | if spatial_idx == 0: 125 | x_offset = 0 126 | elif spatial_idx == 2: 127 | x_offset = width - size 128 | cropped = images[:, :, y_offset: y_offset + size, x_offset: x_offset + size] 129 | cropped_boxes = crop_boxes(boxes, x_offset, y_offset) if boxes is not None else None 130 | if ndim == 3: 131 | cropped = cropped.squeeze(0) 132 | return cropped, cropped_boxes 133 | 134 | 135 | class SpatialCrop(nn.Module): 136 | """ 137 | Convert the video into 3 smaller clips spatially. Must be used after the 138 | temporal crops to get spatial crops, and should be used with 139 | -2 in the spatial crop at the slowfast augmentation stage (so full 140 | frames are passed in here). Will return a larger list with the 141 | 3x spatial crops as well. It's useful for 3x4 testing (eg in SwinT) 142 | or 3x10 testing in SlowFast etc. 143 | """ 144 | 145 | def __init__(self, crop_size: int = 224, num_crops: int = 3): 146 | super().__init__() 147 | self.crop_size = crop_size 148 | if num_crops == 6: 149 | self.crops_to_ext = [0, 1, 2] 150 | # I guess Swin uses 5 crops without flipping, but that doesn't 151 | # make sense given they first resize to 224 and take 224 crops. 152 | # (pg 6 of https://arxiv.org/pdf/2106.13230.pdf) 153 | # So I'm assuming we can use flipped crops and that will add sth.. 154 | self.flipped_crops_to_ext = [0, 1, 2] 155 | elif num_crops == 3: 156 | self.crops_to_ext = [0, 1, 2] 157 | self.flipped_crops_to_ext = [] 158 | elif num_crops == 1: 159 | self.crops_to_ext = [1] 160 | self.flipped_crops_to_ext = [] 161 | else: 162 | raise NotImplementedError( 163 | "Nothing else supported yet, " 164 | "slowfast only takes 0, 1, 2 as arguments" 165 | ) 166 | 167 | def forward(self, videos: Sequence[torch.Tensor]): 168 | """ 169 | Args: 170 | videos: A list of C, T, H, W videos. 171 | Returns: 172 | videos: A list with 3x the number of elements. Each video converted 173 | to C, T, H', W' by spatial cropping. 
174 | """ 175 | assert isinstance(videos, list), "Must be a list of videos after temporal crops" 176 | assert all([video.ndim == 4 for video in videos]), "Must be (C,T,H,W)" 177 | res = [] 178 | for video in videos: 179 | for spatial_idx in self.crops_to_ext: 180 | res.append(uniform_crop(video, self.crop_size, spatial_idx)[0]) 181 | if not self.flipped_crops_to_ext: 182 | continue 183 | flipped_video = transforms.functional.hflip(video) 184 | for spatial_idx in self.flipped_crops_to_ext: 185 | res.append(uniform_crop(flipped_video, self.crop_size, spatial_idx)[0]) 186 | return res 187 | -------------------------------------------------------------------------------- /LaViLa/lavila/models/__pycache__/coca.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/LaViLa/lavila/models/__pycache__/coca.cpython-39.pyc -------------------------------------------------------------------------------- /LaViLa/lavila/models/__pycache__/distributed_utils.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/LaViLa/lavila/models/__pycache__/distributed_utils.cpython-39.pyc -------------------------------------------------------------------------------- /LaViLa/lavila/models/__pycache__/gpt2_gated.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/LaViLa/lavila/models/__pycache__/gpt2_gated.cpython-39.pyc -------------------------------------------------------------------------------- /LaViLa/lavila/models/__pycache__/loss.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/LaViLa/lavila/models/__pycache__/loss.cpython-39.pyc -------------------------------------------------------------------------------- /LaViLa/lavila/models/__pycache__/models.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/LaViLa/lavila/models/__pycache__/models.cpython-39.pyc -------------------------------------------------------------------------------- /LaViLa/lavila/models/__pycache__/narrator.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/LaViLa/lavila/models/__pycache__/narrator.cpython-39.pyc -------------------------------------------------------------------------------- /LaViLa/lavila/models/__pycache__/openai_clip.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/LaViLa/lavila/models/__pycache__/openai_clip.cpython-39.pyc -------------------------------------------------------------------------------- /LaViLa/lavila/models/__pycache__/openai_model.cpython-39.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/LaViLa/lavila/models/__pycache__/openai_model.cpython-39.pyc -------------------------------------------------------------------------------- /LaViLa/lavila/models/__pycache__/timesformer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/LaViLa/lavila/models/__pycache__/timesformer.cpython-39.pyc -------------------------------------------------------------------------------- /LaViLa/lavila/models/__pycache__/tokenizer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/LaViLa/lavila/models/__pycache__/tokenizer.cpython-39.pyc -------------------------------------------------------------------------------- /LaViLa/lavila/models/__pycache__/utils.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/LaViLa/lavila/models/__pycache__/utils.cpython-39.pyc -------------------------------------------------------------------------------- /LaViLa/lavila/models/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/LaViLa/lavila/models/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /LaViLa/lavila/models/coca.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | # Part of the code is from https://github.com/lucidrains/CoCa-pytorch/blob/main/coca_pytorch/coca_pytorch.py 8 | # Modified by Yue Zhao 9 | # The original code is under MIT License 10 | 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | from torch import einsum 15 | from einops import rearrange 16 | 17 | 18 | def exists(val): 19 | return val is not None 20 | 21 | 22 | def default(val, d): 23 | return val if exists(val) else d 24 | 25 | 26 | # normalization 27 | # they use layernorm without bias, something that pytorch does not offer 28 | class LayerNorm(nn.Module): 29 | def __init__(self, dim): 30 | super().__init__() 31 | self.gamma = nn.Parameter(torch.ones(dim)) 32 | self.register_buffer("beta", torch.zeros(dim)) 33 | 34 | def forward(self, x): 35 | return F.layer_norm(x, x.shape[-1:], self.gamma, self.beta) 36 | 37 | 38 | class Residual(nn.Module): 39 | def __init__(self, fn): 40 | super().__init__() 41 | self.fn = fn 42 | 43 | def forward(self, x, *args, **kwargs): 44 | return self.fn(x, *args, **kwargs) + x 45 | 46 | 47 | # classic Noam Shazeer paper, except here they use SwiGLU instead of the more popular GEGLU for gating the feedforward 48 | # https://arxiv.org/abs/2002.05202 49 | class SwiGLU(nn.Module): 50 | def forward(self, x): 51 | x, gate = x.chunk(2, dim=-1) 52 | return F.silu(gate) * x 53 | 54 | 55 | class CrossAttention(nn.Module): 56 | def __init__( 57 | self, 58 | dim, 59 | *, 60 | context_dim=None, 61 | dim_head=64, 62 | heads=8, 63 | parallel_ff=False, 64 | ff_mult=4, 65 | norm_context=False 66 | ): 67 | super().__init__() 68 | self.heads = heads 69 | self.scale = dim_head ** -0.5 70 | inner_dim = heads * dim_head 71 | context_dim = default(context_dim, dim) 72 | 73 | self.norm = LayerNorm(dim) 74 | self.context_norm = LayerNorm(context_dim) if norm_context else nn.Identity() 75 | 76 | self.to_q = nn.Linear(dim, inner_dim, bias=False) 77 | self.to_kv = nn.Linear(context_dim, dim_head * 2, bias=False) 78 | self.to_out = nn.Linear(inner_dim, dim, bias=False) 79 | 80 | # whether to have parallel feedforward 81 | 82 | ff_inner_dim = ff_mult * dim 83 | 84 | self.ff = nn.Sequential( 85 | nn.Linear(dim, ff_inner_dim * 2, bias=False), 86 | SwiGLU(), 87 | nn.Linear(ff_inner_dim, dim, bias=False) 88 | ) if parallel_ff else None 89 | 90 | def forward(self, x, context): 91 | """ 92 | einstein notation 93 | b - batch 94 | h - heads 95 | n, i, j - sequence length (base sequence length, source, target) 96 | d - feature dimension 97 | """ 98 | 99 | # pre-layernorm, for queries and context 100 | x = self.norm(x) 101 | context = self.context_norm(context) 102 | 103 | # get queries 104 | q = self.to_q(x) 105 | q = rearrange(q, 'b n (h d) -> b h n d', h=self.heads) 106 | 107 | # scale 108 | q = q * self.scale 109 | 110 | # get key / values 111 | k, v = self.to_kv(context).chunk(2, dim=-1) 112 | 113 | # query / key similarity 114 | sim = einsum('b h i d, b j d -> b h i j', q, k) 115 | 116 | # attention 117 | sim = sim - sim.amax(dim=-1, keepdim=True) 118 | attn = sim.softmax(dim=-1) 119 | 120 | # aggregate 121 | out = einsum('b h i j, b j d -> b h i d', attn, v) 122 | 123 | # merge and combine heads 124 | out = rearrange(out, 'b h n d -> b n (h d)') 125 | out = self.to_out(out) 126 | 127 | # add parallel feedforward (for multimodal layers) 128 | if exists(self.ff): 129 | out = out + self.ff(x) 130 | 131 | return out 132 | -------------------------------------------------------------------------------- /LaViLa/lavila/models/distributed_utils.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | # Part of the code is from 7 | # `https://github.com/facebookresearch/vissl/blob/main/vissl/utils/distributed_utils.py` and 8 | # `https://github.com/facebookresearch/ClassyVision/blob/main/classy_vision/generic/distributed_util.py` 9 | # Modified by Yue Zhao 10 | # The original code is under MIT License 11 | 12 | import torch 13 | import torch.distributed as dist 14 | from typing import Tuple 15 | 16 | 17 | def convert_to_distributed_tensor(tensor: torch.Tensor) -> Tuple[torch.Tensor, str]: 18 | """ 19 | For some backends, such as NCCL, communication only works if the 20 | tensor is on the GPU. This helper function converts to the correct 21 | device and returns the tensor + original device. 22 | """ 23 | orig_device = "cpu" if not tensor.is_cuda else "gpu" 24 | if ( 25 | torch.distributed.is_available() 26 | and torch.distributed.get_backend() == torch.distributed.Backend.NCCL 27 | and not tensor.is_cuda 28 | ): 29 | tensor = tensor.cuda() 30 | return (tensor, orig_device) 31 | 32 | 33 | def convert_to_normal_tensor(tensor: torch.Tensor, orig_device: str) -> torch.Tensor: 34 | """ 35 | For some backends, such as NCCL, communication only works if the 36 | tensor is on the GPU. This converts the tensor back to original device. 37 | """ 38 | if tensor.is_cuda and orig_device == "cpu": 39 | tensor = tensor.cpu() 40 | return tensor 41 | 42 | 43 | def is_distributed_training_run() -> bool: 44 | return ( 45 | torch.distributed.is_available() 46 | and torch.distributed.is_initialized() 47 | and (torch.distributed.get_world_size() > 1) 48 | ) 49 | 50 | 51 | class GatherLayer(torch.autograd.Function): 52 | """ 53 | Gather tensors from all workers with support for backward propagation: 54 | This implementation does not cut the gradients as torch.distributed.all_gather does. 55 | """ 56 | 57 | @staticmethod 58 | def forward(ctx, x): 59 | output = [torch.zeros_like(x) for _ in range(dist.get_world_size())] 60 | dist.all_gather(output, x) 61 | return tuple(output) 62 | 63 | @staticmethod 64 | def backward(ctx, *grads): 65 | all_gradients = torch.stack(grads) 66 | dist.all_reduce(all_gradients) 67 | return all_gradients[dist.get_rank()] 68 | 69 | 70 | def gather_from_all(tensor: torch.Tensor) -> torch.Tensor: 71 | """ 72 | Similar to classy_vision.generic.distributed_util.gather_from_all 73 | except that it does not cut the gradients 74 | """ 75 | if tensor.ndim == 0: 76 | # 0 dim tensors cannot be gathered. so unsqueeze 77 | tensor = tensor.unsqueeze(0) 78 | 79 | if is_distributed_training_run(): 80 | tensor, orig_device = convert_to_distributed_tensor(tensor) 81 | gathered_tensors = GatherLayer.apply(tensor) 82 | gathered_tensors = [ 83 | convert_to_normal_tensor(_tensor, orig_device) 84 | for _tensor in gathered_tensors 85 | ] 86 | else: 87 | gathered_tensors = [tensor] 88 | gathered_tensor = torch.cat(gathered_tensors, 0) 89 | return gathered_tensor 90 | -------------------------------------------------------------------------------- /LaViLa/lavila/models/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 
3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from collections import OrderedDict 8 | import functools 9 | import torch 10 | import torch.nn.functional as F 11 | 12 | 13 | def inflate_positional_embeds( 14 | current_model_state_dict, new_state_dict, 15 | num_frames=4, 16 | load_temporal_fix='bilinear', 17 | ): 18 | # allow loading of timesformer with fewer num_frames 19 | curr_keys = list(current_model_state_dict.keys()) 20 | if 'visual.temporal_embed' in new_state_dict and 'visual.temporal_embed' in curr_keys: 21 | load_temporal_embed = new_state_dict['visual.temporal_embed'] 22 | load_num_frames = load_temporal_embed.shape[1] 23 | curr_num_frames = num_frames 24 | embed_dim = load_temporal_embed.shape[2] 25 | 26 | if load_num_frames != curr_num_frames: 27 | if load_num_frames > curr_num_frames: 28 | print(f'### loaded SpaceTimeTransformer model has MORE frames than current...' 29 | f'### loading weights, filling in the extras via {load_temporal_fix}') 30 | new_temporal_embed = load_temporal_embed[:, :curr_num_frames, :] 31 | else: 32 | print(f'### loaded SpaceTimeTransformer model has FEWER frames than current...' 33 | f'### loading weights, filling in the extras via {load_temporal_fix}') 34 | if load_temporal_fix == 'zeros': 35 | new_temporal_embed = torch.zeros([load_temporal_embed.shape[0], curr_num_frames, embed_dim]) 36 | new_temporal_embed[:, :load_num_frames] = load_temporal_embed 37 | elif load_temporal_fix in ['interp', 'bilinear']: 38 | # interpolate 39 | # unsqueeze so pytorch thinks its an image 40 | mode = 'nearest' 41 | if load_temporal_fix == 'bilinear': 42 | mode = 'bilinear' 43 | load_temporal_embed = load_temporal_embed.unsqueeze(0) 44 | new_temporal_embed = F.interpolate(load_temporal_embed, 45 | (curr_num_frames, embed_dim), mode=mode).squeeze(0) 46 | else: 47 | raise NotImplementedError 48 | new_state_dict['visual.temporal_embed'] = new_temporal_embed 49 | # allow loading with smaller spatial patches. 
assumes custom border crop, to append the 50 | # border patches to the input sequence 51 | if 'visual.pos_embed' in new_state_dict and 'visual.pos_embed' in curr_keys: 52 | load_pos_embed = new_state_dict['visual.pos_embed'] 53 | load_num_patches = load_pos_embed.shape[1] 54 | curr_pos_embed = current_model_state_dict['visual.pos_embed'] 55 | if load_num_patches != curr_pos_embed.shape[1]: 56 | raise NotImplementedError( 57 | 'Loading models with different spatial resolution / patch number not yet implemented, sorry.') 58 | 59 | return new_state_dict 60 | 61 | 62 | def rsetattr(obj, attr, val): 63 | pre, _, post = attr.rpartition('.') 64 | return setattr(rgetattr(obj, pre) if pre else obj, post, val) 65 | 66 | 67 | def rgetattr(obj, attr, *args): 68 | def _getattr(obj, attr): 69 | return getattr(obj, attr, *args) 70 | return functools.reduce(_getattr, [obj] + attr.split('.')) 71 | 72 | 73 | # util functions to convert CLIP-style model keys to TimeSformer-style 74 | def remap_keys(clip_state_dict, transformer_layers=12): 75 | remapped_state_dict = OrderedDict() 76 | key_mapping = { 77 | "class_embedding": "cls_token", 78 | "positional_embedding": "pos_embed", 79 | "conv1.weight": "patch_embed.proj.weight", 80 | "ln_pre.weight": "ln_pre.weight", 81 | "ln_pre.bias": "ln_pre.bias", 82 | "ln_post.weight": "norm.weight", 83 | "ln_post.bias": "norm.bias", 84 | } 85 | for layer in range(transformer_layers): 86 | key_mapping[f"transformer.resblocks.{layer}.attn.in_proj_weight"] = f"blocks.{layer}.attn.qkv.weight" 87 | key_mapping[f"transformer.resblocks.{layer}.attn.in_proj_bias"] = f"blocks.{layer}.attn.qkv.bias" 88 | key_mapping[f"transformer.resblocks.{layer}.attn.out_proj.weight"] = f"blocks.{layer}.attn.proj.weight" 89 | key_mapping[f"transformer.resblocks.{layer}.attn.out_proj.bias"] = f"blocks.{layer}.attn.proj.bias" 90 | key_mapping[f"transformer.resblocks.{layer}.ln_1.weight"] = f"blocks.{layer}.norm1.weight" 91 | key_mapping[f"transformer.resblocks.{layer}.ln_1.bias"] = f"blocks.{layer}.norm1.bias" 92 | key_mapping[f"transformer.resblocks.{layer}.mlp.c_fc.weight"] = f"blocks.{layer}.mlp.fc1.weight" 93 | key_mapping[f"transformer.resblocks.{layer}.mlp.c_fc.bias"] = f"blocks.{layer}.mlp.fc1.bias" 94 | key_mapping[f"transformer.resblocks.{layer}.mlp.c_proj.weight"] = f"blocks.{layer}.mlp.fc2.weight" 95 | key_mapping[f"transformer.resblocks.{layer}.mlp.c_proj.bias"] = f"blocks.{layer}.mlp.fc2.bias" 96 | key_mapping[f"transformer.resblocks.{layer}.ln_2.weight"] = f"blocks.{layer}.norm2.weight" 97 | key_mapping[f"transformer.resblocks.{layer}.ln_2.bias"] = f"blocks.{layer}.norm2.bias" 98 | 99 | for key in clip_state_dict: 100 | if key == 'proj': 101 | continue # due to possible dim mismatch, we load this later 102 | if key == "class_embedding": 103 | clip_state_dict[key] = clip_state_dict[key].unsqueeze(0).unsqueeze(0) 104 | if key == "positional_embedding": 105 | clip_state_dict[key] = clip_state_dict[key].unsqueeze(0) 106 | remapped_state_dict[key_mapping[key]] = clip_state_dict[key] 107 | 108 | return remapped_state_dict 109 | -------------------------------------------------------------------------------- /LaViLa/lavila/utils/__pycache__/distributed.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/LaViLa/lavila/utils/__pycache__/distributed.cpython-39.pyc -------------------------------------------------------------------------------- 
/LaViLa/lavila/utils/__pycache__/preprocess.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/LaViLa/lavila/utils/__pycache__/preprocess.cpython-39.pyc -------------------------------------------------------------------------------- /LaViLa/lavila/utils/distributed.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import os 8 | import shutil 9 | import torch 10 | import torch.distributed as dist 11 | 12 | 13 | def get_model(model): 14 | if isinstance(model, torch.nn.DataParallel) \ 15 | or isinstance(model, torch.nn.parallel.DistributedDataParallel): 16 | return model.module 17 | else: 18 | return model 19 | 20 | 21 | def setup_for_distributed(is_master): 22 | """ 23 | This function disables printing when not in master process 24 | """ 25 | import builtins as __builtin__ 26 | builtin_print = __builtin__.print 27 | 28 | def print(*args, **kwargs): 29 | force = kwargs.pop('force', False) 30 | if is_master or force: 31 | builtin_print(*args, **kwargs) 32 | 33 | __builtin__.print = print 34 | 35 | 36 | def is_dist_avail_and_initialized(): 37 | if not dist.is_available(): 38 | return False 39 | if not dist.is_initialized(): 40 | return False 41 | return True 42 | 43 | 44 | def get_world_size(): 45 | if not is_dist_avail_and_initialized(): 46 | return 1 47 | else: 48 | return dist.get_world_size() 49 | 50 | 51 | def get_rank(): 52 | if not is_dist_avail_and_initialized(): 53 | return 0 54 | return dist.get_rank() 55 | 56 | 57 | def is_main_process(): 58 | return get_rank() == 0 59 | 60 | 61 | def save_on_master(state, is_best, output_dir, is_epoch=True): 62 | if is_main_process(): 63 | ckpt_path = f'{output_dir}/checkpoint.pt' 64 | best_path = f'{output_dir}/checkpoint_best.pt' 65 | if is_best: 66 | torch.save(state, best_path) 67 | if is_epoch: 68 | if isinstance(state['epoch'], int): 69 | ckpt2_path = '{}/checkpoint_{:04d}.pt'.format(output_dir, state['epoch']) 70 | else: 71 | ckpt2_path = '{}/checkpoint_{:.4f}.pt'.format(output_dir, state['epoch']) 72 | torch.save(state, ckpt_path) 73 | shutil.copy(ckpt_path, ckpt2_path) 74 | 75 | 76 | def init_distributed_mode(args): 77 | if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ: 78 | args.rank = int(os.environ["RANK"]) 79 | args.world_size = int(os.environ['WORLD_SIZE']) 80 | args.gpu = int(os.environ['LOCAL_RANK']) 81 | elif 'SLURM_PROCID' in os.environ: 82 | args.rank = int(os.environ['SLURM_PROCID']) 83 | args.gpu = args.rank % torch.cuda.device_count() 84 | else: 85 | print('Not using distributed mode') 86 | args.distributed = False 87 | return 88 | 89 | args.distributed = True 90 | 91 | torch.cuda.set_device(args.gpu) 92 | args.dist_backend = 'nccl' 93 | print('| distributed init (rank {}): {}'.format( 94 | args.rank, args.dist_url), flush=True) 95 | torch.distributed.init_process_group( 96 | backend=args.dist_backend, 97 | init_method=args.dist_url, 98 | world_size=args.world_size, 99 | rank=args.rank 100 | ) 101 | torch.distributed.barrier() 102 | setup_for_distributed(args.rank == 0) 103 | -------------------------------------------------------------------------------- /LaViLa/lavila/utils/evaluation.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import numpy as np 8 | import torch 9 | 10 | 11 | def accuracy(output, target, topk=(1,)): 12 | """Computes the accuracy over the k top predictions for the specified values of k""" 13 | with torch.no_grad(): 14 | maxk = max(topk) 15 | batch_size = target.size(0) 16 | 17 | _, pred = output.topk(maxk, 1, True, True) 18 | pred = pred.t() 19 | correct = pred.eq(target.reshape(1, -1).expand_as(pred)) 20 | 21 | res = [] 22 | for k in topk: 23 | correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) 24 | res.append(correct_k.mul_(100.0 / batch_size)) 25 | return res 26 | 27 | 28 | def get_mean_accuracy(cm): 29 | list_acc = [] 30 | for i in range(len(cm)): 31 | acc = 0 32 | if cm[i, :].sum() > 0: 33 | acc = cm[i, i] / cm[i, :].sum() 34 | list_acc.append(acc) 35 | 36 | return 100 * np.mean(list_acc), 100 * np.trace(cm) / np.sum(cm) 37 | -------------------------------------------------------------------------------- /LaViLa/lavila/utils/evaluation_charades.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import numpy as np 8 | 9 | 10 | def compute_map(submission_array, gt_array): 11 | """ Returns mAP, weighted mAP, and AP array """ 12 | m_aps = [] 13 | n_classes = submission_array.shape[1] 14 | for oc_i in range(n_classes): 15 | sorted_idxs = np.argsort(-submission_array[:, oc_i]) 16 | tp = gt_array[:, oc_i][sorted_idxs] == 1 17 | fp = np.invert(tp) 18 | n_pos = tp.sum() 19 | if n_pos < 0.1: 20 | m_aps.append(float('nan')) 21 | continue 22 | fp.sum() 23 | f_pcs = np.cumsum(fp) 24 | t_pcs = np.cumsum(tp) 25 | prec = t_pcs / (f_pcs+t_pcs).astype(float) 26 | avg_prec = 0 27 | for i in range(submission_array.shape[0]): 28 | if tp[i]: 29 | avg_prec += prec[i] 30 | m_aps.append(avg_prec / n_pos.astype(float)) 31 | m_aps = np.array(m_aps) 32 | m_ap = np.mean(m_aps) 33 | w_ap = (m_aps * gt_array.sum(axis=0) / gt_array.sum().sum().astype(float)) 34 | return m_ap, w_ap, m_aps 35 | 36 | 37 | def charades_map(submission_array, gt_array): 38 | """ 39 | Approximate version of the charades evaluation function 40 | For precise numbers, use the submission file with the official matlab script 41 | """ 42 | fix = submission_array.copy() 43 | empty = np.sum(gt_array, axis=1) == 0 44 | fix[empty, :] = np.NINF 45 | return compute_map(fix, gt_array) 46 | 47 | 48 | def create_submission(video_list, predictions, out_file): 49 | assert len(video_list) == predictions.shape[0] 50 | with open(out_file, 'w') as f: 51 | for i, video_id in enumerate(video_list): 52 | pred_str = ' '.join(map(lambda x: str(x), predictions[i].tolist())) 53 | f.write('{} {}\n\n'.format(video_id, pred_str)) 54 | -------------------------------------------------------------------------------- /LaViLa/lavila/utils/evaluation_egomcq.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 
3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import torch 8 | 9 | 10 | def egomcq_accuracy_metrics(preds, labels, types): 11 | metrics = {} 12 | type_list = torch.unique(types) 13 | group_list = ["Intra-video", "Inter-video"] 14 | for type_i, group_i in zip(type_list, group_list): 15 | correct = 0 16 | total = 0 17 | for pred, label, type in zip(preds, labels, types): 18 | if type == type_i: 19 | pred_ = torch.argmax(pred) 20 | if pred_.item() == label.item(): 21 | correct += 1 22 | total += 1 23 | accuracy = correct/total 24 | metrics[group_i] = accuracy * 100 25 | return metrics 26 | -------------------------------------------------------------------------------- /LaViLa/lavila/utils/evaluation_ek100cls.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # Part of the code is from https://github.com/fpv-iplab/rulstm/blob/master/RULSTM/utils.py 8 | # Modified by Yue Zhao 9 | 10 | import numpy as np 11 | 12 | 13 | def get_marginal_indexes(actions, mode): 14 | """For each verb/noun retrieve the list of actions containing that verb/name 15 | Input: 16 | mode: "verb" or "noun" 17 | Output: 18 | a list of numpy array of indexes. If verb/noun 3 is contained in actions 2,8,19, 19 | then output[3] will be np.array([2,8,19]) 20 | """ 21 | vi = [] 22 | for v in range(actions[mode].max()+1): 23 | vals = actions[actions[mode] == v].index.values 24 | if len(vals) > 0: 25 | vi.append(vals) 26 | else: 27 | vi.append(np.array([0])) 28 | return vi 29 | 30 | 31 | def marginalize(probs, indexes): 32 | mprobs = [] 33 | for ilist in indexes: 34 | mprobs.append(probs[:, ilist].sum(1)) 35 | return np.array(mprobs).T 36 | -------------------------------------------------------------------------------- /LaViLa/lavila/utils/meter.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | import torch 8 | import torch.distributed as dist 9 | from lavila.utils import distributed as dist_utils 10 | 11 | 12 | class AverageMeter(object): 13 | """Computes and stores the average and current value""" 14 | def __init__(self, name, fmt=':f'): 15 | self.name = name 16 | self.fmt = fmt 17 | self.reset() 18 | 19 | def reset(self): 20 | self.val = 0 21 | self.avg = 0 22 | self.sum = 0 23 | self.count = 0 24 | 25 | def update(self, val, n=1): 26 | self.val = val 27 | self.sum += val * n 28 | self.count += n 29 | self.avg = self.sum / self.count 30 | 31 | def synchronize(self): 32 | if not dist_utils.is_dist_avail_and_initialized(): 33 | return 34 | t = torch.tensor([self.sum, self.count], dtype=torch.float64, device='cuda') 35 | dist.barrier() 36 | dist.all_reduce(t) 37 | t = t.tolist() 38 | self.sum = int(t[0]) 39 | self.count = t[1] 40 | self.avg = self.sum / self.count 41 | 42 | def __str__(self): 43 | fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' 44 | return fmtstr.format(**self.__dict__) 45 | 46 | 47 | class ProgressMeter(object): 48 | def __init__(self, num_batches, meters, prefix=""): 49 | self.batch_fmtstr = self._get_batch_fmtstr(num_batches) 50 | self.meters = meters 51 | self.prefix = prefix 52 | 53 | def display(self, batch): 54 | entries = [self.prefix + self.batch_fmtstr.format(batch)] 55 | entries += [str(meter) for meter in self.meters] 56 | print('\t'.join(entries)) 57 | 58 | def synchronize(self): 59 | for meter in self.meters: 60 | meter.synchronize() 61 | 62 | def _get_batch_fmtstr(self, num_batches): 63 | num_digits = len(str(num_batches // 1)) 64 | fmt = '{:' + str(num_digits) + 'd}' 65 | return '[' + fmt + '/' + fmt.format(num_batches) + ']' 66 | -------------------------------------------------------------------------------- /LaViLa/lavila/utils/preprocess.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | import csv 8 | 9 | from lavila.models.tokenizer import MyBertTokenizer, MyDistilBertTokenizer, MyGPT2Tokenizer, SimpleTokenizer 10 | 11 | 12 | def generate_label_map(dataset): 13 | if dataset == 'ek100_cls': 14 | print("Preprocess ek100 action label space") 15 | vn_list = [] 16 | mapping_vn2narration = {} 17 | for f in [ 18 | 'datasets/EK100/epic-kitchens-100-annotations/EPIC_100_train.csv', 19 | 'datasets/EK100/epic-kitchens-100-annotations/EPIC_100_validation.csv', 20 | ]: 21 | csv_reader = csv.reader(open(f)) 22 | _ = next(csv_reader) # skip the header 23 | for row in csv_reader: 24 | vn = '{}:{}'.format(int(row[10]), int(row[12])) 25 | narration = row[8] 26 | if vn not in vn_list: 27 | vn_list.append(vn) 28 | if vn not in mapping_vn2narration: 29 | mapping_vn2narration[vn] = [narration] 30 | else: 31 | mapping_vn2narration[vn].append(narration) 32 | # mapping_vn2narration[vn] = [narration] 33 | vn_list = sorted(vn_list) 34 | print('# of action= {}'.format(len(vn_list))) 35 | mapping_vn2act = {vn: i for i, vn in enumerate(vn_list)} 36 | labels = [list(set(mapping_vn2narration[vn_list[i]])) for i in range(len(mapping_vn2act))] 37 | print(labels[:5]) 38 | elif dataset == 'charades_ego': 39 | print("=> preprocessing charades_ego action label space") 40 | vn_list = [] 41 | labels = [] 42 | with open('datasets/CharadesEgo/CharadesEgo/Charades_v1_classes.txt') as f: 43 | csv_reader = csv.reader(f) 44 | for row in csv_reader: 45 | vn = row[0][:4] 46 | vn_list.append(vn) 47 | narration = row[0][5:] 48 | labels.append(narration) 49 | mapping_vn2act = {vn: i for i, vn in enumerate(vn_list)} 50 | print(labels[:5]) 51 | elif dataset == 'egtea': 52 | print("=> preprocessing egtea action label space") 53 | labels = [] 54 | with open('datasets/EGTEA/action_idx.txt') as f: 55 | for row in f: 56 | row = row.strip() 57 | narration = ' '.join(row.split(' ')[:-1]) 58 | labels.append(narration.replace('_', ' ').lower()) 59 | # labels.append(narration) 60 | mapping_vn2act = {label: i for i, label in enumerate(labels)} 61 | print(len(labels), labels[:5]) 62 | else: 63 | raise NotImplementedError 64 | return labels, mapping_vn2act 65 | 66 | 67 | def generate_tokenizer(model): 68 | if model.endswith('DISTILBERT_BASE'): 69 | tokenizer = MyDistilBertTokenizer('distilbert-base-uncased') 70 | elif model.endswith('BERT_BASE'): 71 | tokenizer = MyBertTokenizer('bert-base-uncased') 72 | elif model.endswith('BERT_LARGE'): 73 | tokenizer = MyBertTokenizer('bert-large-uncased') 74 | elif model.endswith('GPT2'): 75 | tokenizer = MyGPT2Tokenizer('gpt2', add_bos=True) 76 | elif model.endswith('GPT2_MEDIUM'): 77 | tokenizer = MyGPT2Tokenizer('gpt2-medium', add_bos=True) 78 | elif model.endswith('GPT2_LARGE'): 79 | tokenizer = MyGPT2Tokenizer('gpt2-large', add_bos=True) 80 | elif model.endswith('GPT2_XL'): 81 | tokenizer = MyGPT2Tokenizer('gpt2-xl', add_bos=True) 82 | else: 83 | print("Using SimpleTokenizer because of model '{}'. " 84 | "Please check if this is what you want".format(model)) 85 | tokenizer = SimpleTokenizer() 86 | return tokenizer 87 | -------------------------------------------------------------------------------- /LaViLa/lavila/utils/random.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | import random 8 | import numpy as np 9 | import torch 10 | 11 | 12 | def random_seed(seed=42, rank=0): 13 | torch.manual_seed(seed + rank) 14 | np.random.seed(seed + rank) 15 | random.seed(seed + rank) 16 | -------------------------------------------------------------------------------- /LaViLa/lavila/utils/scheduler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import numpy as np 8 | 9 | 10 | def cosine_scheduler(base_value, final_value, epochs, niter_per_ep, warmup_epochs=0, start_warmup_value=0): 11 | warmup_schedule = np.array([]) 12 | warmup_iters = warmup_epochs * niter_per_ep 13 | if warmup_epochs > 0: 14 | warmup_schedule = np.linspace(start_warmup_value, base_value, warmup_iters) 15 | 16 | iters = np.arange(epochs * niter_per_ep - warmup_iters) 17 | schedule = final_value + 0.5 * (base_value - final_value) * (1 + np.cos(np.pi * iters / len(iters))) 18 | 19 | schedule = np.concatenate((warmup_schedule, schedule)) 20 | assert len(schedule) == epochs * niter_per_ep 21 | return schedule 22 | -------------------------------------------------------------------------------- /LaViLa/requirements.txt: -------------------------------------------------------------------------------- 1 | timm==0.5.4 2 | torch==1.10.1 3 | torchvision==0.11.2 4 | decord==0.6.0 5 | einops==0.4.1 6 | pandas==1.4.2 7 | pytorchvideo==0.1.5 8 | transformers==4.27 9 | ftfy==4.4.3 10 | spacy==3.4.1 11 | scikit-learn==1.1.1 12 | git+https://github.com/Maluuba/nlg-eval.git@master 13 | -------------------------------------------------------------------------------- /LaViLa/run_with_submitit_finetune_classification.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | """ 7 | A script to run multinode training with submitit. 8 | """ 9 | import argparse 10 | import os 11 | import uuid 12 | from pathlib import Path 13 | 14 | import main_finetune_classification as main_finetune 15 | import submitit 16 | 17 | 18 | def parse_args(): 19 | parser = main_finetune.get_args_parser() 20 | parser = argparse.ArgumentParser("Submitit for lavila fine-tuning", parents=[parser]) 21 | parser.add_argument("--ngpus", default=8, type=int, help="Number of gpus to request on each node") 22 | parser.add_argument("--nodes", default=8, type=int, help="Number of nodes to request") 23 | parser.add_argument("--timeout", default=2880, type=int, help="Duration of the job") 24 | parser.add_argument("--job_dir", default="", type=str, help="Job dir. Leave empty for automatic.") 25 | 26 | parser.add_argument("--partition", default="learnlab", type=str, help="Partition where to submit") 27 | parser.add_argument("--use_volta32", action='store_true', help="Big models? Use this") 28 | parser.add_argument('--comment', default="", type=str, 29 | help='Comment to pass to scheduler, e.g. 
priority message') 30 | return parser.parse_args() 31 | 32 | 33 | def get_shared_folder() -> Path: 34 | user = os.getenv("USER") 35 | if Path("/checkpoint/").is_dir(): 36 | p = Path(f"/checkpoint/{user}/experiments/lavila_ft") 37 | p.mkdir(exist_ok=True) 38 | return p 39 | raise RuntimeError("No shared folder available") 40 | 41 | 42 | def get_init_file(): 43 | # Init file must not exist, but it's parent dir must exist. 44 | os.makedirs(str(get_shared_folder()), exist_ok=True) 45 | init_file = get_shared_folder() / f"{uuid.uuid4().hex}_init" 46 | if init_file.exists(): 47 | os.remove(str(init_file)) 48 | return init_file 49 | 50 | 51 | class Trainer(object): 52 | def __init__(self, args): 53 | self.args = args 54 | 55 | def __call__(self): 56 | import main_finetune_classification as main_finetune 57 | 58 | self._setup_gpu_args() 59 | main_finetune.main(self.args) 60 | 61 | def checkpoint(self): 62 | import submitit 63 | 64 | self.args.dist_url = get_init_file().as_uri() 65 | print("Requeuing ", self.args) 66 | empty_trainer = type(self)(self.args) 67 | return submitit.helpers.DelayedSubmission(empty_trainer) 68 | 69 | def _setup_gpu_args(self): 70 | import submitit 71 | from pathlib import Path 72 | 73 | job_env = submitit.JobEnvironment() 74 | self.args.output_dir = Path(str(self.args.output_dir).replace("%j", str(job_env.job_id))) 75 | self.args.gpu = job_env.local_rank 76 | self.args.rank = job_env.global_rank 77 | self.args.world_size = job_env.num_tasks 78 | print(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}") 79 | 80 | 81 | def main(): 82 | args = parse_args() 83 | if args.job_dir == "": 84 | args.job_dir = get_shared_folder() / "%j" 85 | 86 | # Note that the folder will depend on the job_id, to easily track experiments 87 | executor = submitit.AutoExecutor(folder=args.job_dir, slurm_max_num_timeout=30) 88 | 89 | num_gpus_per_node = args.ngpus 90 | nodes = args.nodes 91 | timeout_min = args.timeout 92 | 93 | partition = args.partition 94 | kwargs = {} 95 | if args.use_volta32: 96 | kwargs['slurm_constraint'] = 'volta32gb' 97 | if args.comment: 98 | kwargs['slurm_comment'] = args.comment 99 | 100 | executor.update_parameters( 101 | mem_gb=40 * num_gpus_per_node, 102 | gpus_per_node=num_gpus_per_node, 103 | tasks_per_node=num_gpus_per_node, # one task per GPU 104 | cpus_per_task=10, 105 | nodes=nodes, 106 | timeout_min=timeout_min, # max is 60 * 72 107 | # Below are cluster dependent parameters 108 | slurm_partition=partition, 109 | slurm_signal_delay_s=120, 110 | **kwargs 111 | ) 112 | 113 | executor.update_parameters(name="lavila_ft") 114 | 115 | args.dist_url = get_init_file().as_uri() 116 | args.output_dir = args.job_dir 117 | 118 | trainer = Trainer(args) 119 | job = executor.submit(trainer) 120 | 121 | print("Submitted job_id:", job.job_id) 122 | 123 | 124 | if __name__ == "__main__": 125 | main() 126 | -------------------------------------------------------------------------------- /LaViLa/run_with_submitit_finetune_retrieval.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | """ 7 | A script to run multinode training with submitit. 
8 | """ 9 | import argparse 10 | import os 11 | import uuid 12 | from pathlib import Path 13 | 14 | import main_finetune_retrieval as main_finetune 15 | import submitit 16 | 17 | 18 | def parse_args(): 19 | parser = main_finetune.get_args_parser() 20 | parser = argparse.ArgumentParser("Submitit for lavila fine-tuning", parents=[parser]) 21 | parser.add_argument("--ngpus", default=8, type=int, help="Number of gpus to request on each node") 22 | parser.add_argument("--nodes", default=8, type=int, help="Number of nodes to request") 23 | parser.add_argument("--timeout", default=2880, type=int, help="Duration of the job") 24 | parser.add_argument("--job_dir", default="", type=str, help="Job dir. Leave empty for automatic.") 25 | 26 | parser.add_argument("--partition", default="learnlab", type=str, help="Partition where to submit") 27 | parser.add_argument("--use_volta32", action='store_true', help="Big models? Use this") 28 | parser.add_argument('--comment', default="", type=str, 29 | help='Comment to pass to scheduler, e.g. priority message') 30 | return parser.parse_args() 31 | 32 | 33 | def get_shared_folder() -> Path: 34 | user = os.getenv("USER") 35 | if Path("/checkpoint/").is_dir(): 36 | p = Path(f"/checkpoint/{user}/experiments/lavila_ft") 37 | p.mkdir(exist_ok=True) 38 | return p 39 | raise RuntimeError("No shared folder available") 40 | 41 | 42 | def get_init_file(): 43 | # Init file must not exist, but it's parent dir must exist. 44 | os.makedirs(str(get_shared_folder()), exist_ok=True) 45 | init_file = get_shared_folder() / f"{uuid.uuid4().hex}_init" 46 | if init_file.exists(): 47 | os.remove(str(init_file)) 48 | return init_file 49 | 50 | 51 | class Trainer(object): 52 | def __init__(self, args): 53 | self.args = args 54 | 55 | def __call__(self): 56 | import main_finetune_retrieval as main_finetune 57 | 58 | self._setup_gpu_args() 59 | main_finetune.main(self.args) 60 | 61 | def checkpoint(self): 62 | import submitit 63 | 64 | self.args.dist_url = get_init_file().as_uri() 65 | print("Requeuing ", self.args) 66 | empty_trainer = type(self)(self.args) 67 | return submitit.helpers.DelayedSubmission(empty_trainer) 68 | 69 | def _setup_gpu_args(self): 70 | import submitit 71 | from pathlib import Path 72 | 73 | job_env = submitit.JobEnvironment() 74 | self.args.output_dir = Path(str(self.args.output_dir).replace("%j", str(job_env.job_id))) 75 | self.args.gpu = job_env.local_rank 76 | self.args.rank = job_env.global_rank 77 | self.args.world_size = job_env.num_tasks 78 | print(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}") 79 | 80 | 81 | def main(): 82 | args = parse_args() 83 | if args.job_dir == "": 84 | args.job_dir = get_shared_folder() / "%j" 85 | 86 | # Note that the folder will depend on the job_id, to easily track experiments 87 | executor = submitit.AutoExecutor(folder=args.job_dir, slurm_max_num_timeout=30) 88 | 89 | num_gpus_per_node = args.ngpus 90 | nodes = args.nodes 91 | timeout_min = args.timeout 92 | 93 | partition = args.partition 94 | kwargs = {} 95 | if args.use_volta32: 96 | kwargs['slurm_constraint'] = 'volta32gb' 97 | if args.comment: 98 | kwargs['slurm_comment'] = args.comment 99 | 100 | executor.update_parameters( 101 | mem_gb=40 * num_gpus_per_node, 102 | gpus_per_node=num_gpus_per_node, 103 | tasks_per_node=num_gpus_per_node, # one task per GPU 104 | cpus_per_task=10, 105 | nodes=nodes, 106 | timeout_min=timeout_min, # max is 60 * 72 107 | # Below are cluster dependent parameters 108 | slurm_partition=partition, 109 | 
slurm_signal_delay_s=120, 110 | **kwargs 111 | ) 112 | 113 | executor.update_parameters(name="lavila_ft") 114 | 115 | args.dist_url = get_init_file().as_uri() 116 | args.output_dir = args.job_dir 117 | 118 | trainer = Trainer(args) 119 | job = executor.submit(trainer) 120 | 121 | print("Submitted job_id:", job.job_id) 122 | 123 | 124 | if __name__ == "__main__": 125 | main() 126 | -------------------------------------------------------------------------------- /LaViLa/run_with_submitit_infer_narrator.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Copyright (c) Meta Platforms, Inc. and affiliates. 4 | # All rights reserved. 5 | 6 | # This source code is licensed under the license found in the 7 | # LICENSE file in the root directory of this source tree. 8 | """ 9 | A script to run multinode training with submitit. 10 | """ 11 | import argparse 12 | import os 13 | import uuid 14 | from pathlib import Path 15 | 16 | import main_infer_narrator 17 | import submitit 18 | 19 | 20 | def parse_args(): 21 | parser = main_infer_narrator.get_args_parser() 22 | parser = argparse.ArgumentParser("Submitit for inferring narrator", parents=[parser]) 23 | parser.add_argument("--ngpus", default=8, type=int, help="Number of gpus to request on each node") 24 | parser.add_argument("--nodes", default=4, type=int, help="Number of nodes to request") 25 | parser.add_argument("--timeout", default=2880, type=int, help="Duration of the job") 26 | parser.add_argument("--job_dir", default="", type=str, help="Job dir. Leave empty for automatic.") 27 | 28 | parser.add_argument("--partition", default="learnlab", type=str, help="Partition where to submit") 29 | parser.add_argument("--use_volta32", action='store_true', help="Big models? Use this") 30 | parser.add_argument('--comment', default="", type=str, 31 | help='Comment to pass to scheduler, e.g. priority message') 32 | return parser.parse_args() 33 | 34 | 35 | def get_shared_folder() -> Path: 36 | user = os.getenv("USER") 37 | if Path("/checkpoint/").is_dir(): 38 | p = Path(f"/checkpoint/{user}/experiments/extract_caption") 39 | p.mkdir(exist_ok=True) 40 | return p 41 | raise RuntimeError("No shared folder available") 42 | 43 | 44 | def get_init_file(): 45 | # Init file must not exist, but it's parent dir must exist. 
46 | os.makedirs(str(get_shared_folder()), exist_ok=True) 47 | init_file = get_shared_folder() / f"{uuid.uuid4().hex}_init" 48 | if init_file.exists(): 49 | os.remove(str(init_file)) 50 | return init_file 51 | 52 | 53 | class Trainer(object): 54 | def __init__(self, args): 55 | self.args = args 56 | 57 | def __call__(self): 58 | import main_infer_narrator 59 | 60 | self._setup_gpu_args() 61 | main_infer_narrator.main(self.args) 62 | 63 | def checkpoint(self): 64 | import submitit 65 | 66 | self.args.dist_url = get_init_file().as_uri() 67 | print("Requeuing ", self.args) 68 | empty_trainer = type(self)(self.args) 69 | return submitit.helpers.DelayedSubmission(empty_trainer) 70 | 71 | def _setup_gpu_args(self): 72 | import submitit 73 | from pathlib import Path 74 | 75 | job_env = submitit.JobEnvironment() 76 | self.args.output_dir = Path(str(self.args.output_dir).replace("%j", str(job_env.job_id))) 77 | self.args.gpu = job_env.local_rank 78 | self.args.rank = job_env.global_rank 79 | self.args.world_size = job_env.num_tasks 80 | print(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}") 81 | 82 | 83 | def main(): 84 | args = parse_args() 85 | if args.job_dir == "": 86 | args.job_dir = get_shared_folder() / "%j" 87 | 88 | # Note that the folder will depend on the job_id, to easily track experiments 89 | executor = submitit.AutoExecutor(folder=args.job_dir, slurm_max_num_timeout=30) 90 | 91 | num_gpus_per_node = args.ngpus 92 | nodes = args.nodes 93 | timeout_min = args.timeout 94 | 95 | partition = args.partition 96 | kwargs = {} 97 | if args.use_volta32: 98 | kwargs['slurm_constraint'] = 'volta32gb' 99 | if args.comment: 100 | kwargs['slurm_comment'] = args.comment 101 | 102 | executor.update_parameters( 103 | mem_gb=55 * num_gpus_per_node, 104 | gpus_per_node=num_gpus_per_node, 105 | tasks_per_node=num_gpus_per_node, # one task per GPU 106 | cpus_per_task=10, 107 | nodes=nodes, 108 | timeout_min=timeout_min, # max is 60 * 72 109 | # Below are cluster dependent parameters 110 | slurm_partition=partition, 111 | slurm_signal_delay_s=120, 112 | **kwargs 113 | ) 114 | 115 | executor.update_parameters(name="infer_narrator") 116 | 117 | args.dist_url = get_init_file().as_uri() 118 | args.output_dir = args.job_dir 119 | 120 | trainer = Trainer(args) 121 | job = executor.submit(trainer) 122 | 123 | print("Submitted job_id:", job.job_id) 124 | 125 | 126 | if __name__ == "__main__": 127 | main() 128 | -------------------------------------------------------------------------------- /LaViLa/run_with_submitit_pretrain.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | """ 7 | A script to run multinode training with submitit. 
8 | """ 9 | import argparse 10 | import os 11 | import uuid 12 | from pathlib import Path 13 | 14 | import main_pretrain 15 | import submitit 16 | 17 | 18 | def parse_args(): 19 | parser = main_pretrain.get_args_parser() 20 | parser = argparse.ArgumentParser("Submitit for lavila pre-training", parents=[parser]) 21 | parser.add_argument("--ngpus", default=8, type=int, help="Number of gpus to request on each node") 22 | parser.add_argument("--nodes", default=8, type=int, help="Number of nodes to request") 23 | parser.add_argument("--timeout", default=2880, type=int, help="Duration of the job") 24 | parser.add_argument("--job_dir", default="", type=str, help="Job dir. Leave empty for automatic.") 25 | 26 | parser.add_argument("--partition", default="learnlab", type=str, help="Partition where to submit") 27 | parser.add_argument("--use_volta32", action='store_true', help="Big models? Use this") 28 | parser.add_argument('--comment', default="", type=str, 29 | help='Comment to pass to scheduler, e.g. priority message') 30 | return parser.parse_args() 31 | 32 | 33 | def get_shared_folder() -> Path: 34 | user = os.getenv("USER") 35 | if Path("/checkpoint/").is_dir(): 36 | p = Path(f"/checkpoint/{user}/experiments/lavila_pretrain") 37 | p.mkdir(exist_ok=True) 38 | return p 39 | raise RuntimeError("No shared folder available") 40 | 41 | 42 | def get_init_file(): 43 | # Init file must not exist, but it's parent dir must exist. 44 | os.makedirs(str(get_shared_folder()), exist_ok=True) 45 | init_file = get_shared_folder() / f"{uuid.uuid4().hex}_init" 46 | if init_file.exists(): 47 | os.remove(str(init_file)) 48 | return init_file 49 | 50 | 51 | class Trainer(object): 52 | def __init__(self, args): 53 | self.args = args 54 | 55 | def __call__(self): 56 | import main_pretrain 57 | 58 | self._setup_gpu_args() 59 | main_pretrain.main(self.args) 60 | 61 | def checkpoint(self): 62 | import submitit 63 | 64 | self.args.dist_url = get_init_file().as_uri() 65 | print("Requeuing ", self.args) 66 | empty_trainer = type(self)(self.args) 67 | return submitit.helpers.DelayedSubmission(empty_trainer) 68 | 69 | def _setup_gpu_args(self): 70 | import submitit 71 | from pathlib import Path 72 | 73 | job_env = submitit.JobEnvironment() 74 | self.args.output_dir = Path(str(self.args.output_dir).replace("%j", str(job_env.job_id))) 75 | self.args.gpu = job_env.local_rank 76 | self.args.rank = job_env.global_rank 77 | self.args.world_size = job_env.num_tasks 78 | print(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}") 79 | 80 | 81 | def main(): 82 | args = parse_args() 83 | if args.job_dir == "": 84 | args.job_dir = get_shared_folder() / "%j" 85 | 86 | # Note that the folder will depend on the job_id, to easily track experiments 87 | executor = submitit.AutoExecutor(folder=args.job_dir, slurm_max_num_timeout=30) 88 | 89 | num_gpus_per_node = args.ngpus 90 | nodes = args.nodes 91 | timeout_min = args.timeout 92 | 93 | partition = args.partition 94 | kwargs = {} 95 | if args.use_volta32: 96 | kwargs['slurm_constraint'] = 'volta32gb' 97 | if args.comment: 98 | kwargs['slurm_comment'] = args.comment 99 | 100 | executor.update_parameters( 101 | mem_gb=40 * num_gpus_per_node, 102 | gpus_per_node=num_gpus_per_node, 103 | tasks_per_node=num_gpus_per_node, # one task per GPU 104 | cpus_per_task=10, 105 | nodes=nodes, 106 | timeout_min=timeout_min, # max is 60 * 72 107 | # Below are cluster dependent parameters 108 | slurm_partition=partition, 109 | slurm_signal_delay_s=120, 110 | **kwargs 111 | ) 112 | 113 | 
executor.update_parameters(name="lavila_pretrain") 114 | 115 | args.dist_url = get_init_file().as_uri() 116 | args.output_dir = args.job_dir 117 | 118 | trainer = Trainer(args) 119 | job = executor.submit(trainer) 120 | 121 | print("Submitted job_id:", job.job_id) 122 | 123 | 124 | if __name__ == "__main__": 125 | main() 126 | -------------------------------------------------------------------------------- /LaViLa/scripts/convert_egovlp_ckpt.py: -------------------------------------------------------------------------------- 1 | # This source code is licensed under the license found in the 2 | # LICENSE file in the root directory of this source tree. 3 | 4 | ''' 5 | Usage: 6 | ```bash 7 | PYTHONPATH= python scripts/convert_egovlp_ckpt.py \ 8 | --input-ckpt \ 9 | --output-ckpt egovlp_converted.pth 10 | ``` 11 | ''' 12 | 13 | import argparse 14 | from collections import OrderedDict 15 | import torch 16 | 17 | 18 | def get_args_parser(): 19 | parser = argparse.ArgumentParser(description='Convert EgoVLP checkpoint', add_help=False) 20 | parser.add_argument('--input-ckpt', type=str) 21 | parser.add_argument('--output-ckpt', type=str) 22 | return parser 23 | 24 | 25 | def main(args): 26 | input_ckpt = torch.load(args.input_ckpt, map_location='cpu') 27 | input_ckpt = input_ckpt['state_dict'] 28 | output_ckpt = OrderedDict() 29 | for k in input_ckpt: 30 | if k.startswith('module.video_model'): 31 | output_ckpt[k.replace('module.video_model', 'module.visual')] = input_ckpt[k] 32 | elif k.startswith('module.text_model'): 33 | output_ckpt[k.replace('module.text_model', 'module.textual')] = input_ckpt[k] 34 | elif k.startswith('module.txt_proj'): 35 | output_ckpt[k.replace('module.txt_proj', 'module.text_projection')] = input_ckpt[k] 36 | elif k.startswith('module.vid_proj'): 37 | output_ckpt[k.replace('module.vid_proj', 'module.image_projection')] = input_ckpt[k] 38 | else: 39 | print(k) 40 | raise ValueError 41 | torch.save({ 42 | 'epoch': 0, 43 | 'state_dict': output_ckpt, 44 | 'best_acc1': 0, 45 | }, args.output_ckpt) 46 | 47 | 48 | if __name__ == '__main__': 49 | parser = argparse.ArgumentParser('Convert EgoVLP checkpoint', parents=[get_args_parser()]) 50 | args = parser.parse_args() 51 | main(args) 52 | -------------------------------------------------------------------------------- /LaViLa/scripts/crop_and_resize_ego4d.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | small_side=288 4 | cliplen_sec=300 5 | max_tries=5 6 | indir="/path/to/full-scale/videos/" 7 | outdir="/path/to/downscaled/videos/" 8 | 9 | cd $indir || exit 10 | all_videos=$(find . 
-iname "*.mp4") 11 | all_videos=( $all_videos ) # to array 12 | cd - 13 | 14 | for video in "${all_videos[@]}"; do 15 | W=$( ffprobe -v quiet -show_format -show_streams -show_entries stream=width "${indir}/${video}" | grep width ) 16 | W=${W#width=} 17 | H=$( ffprobe -v quiet -show_format -show_streams -show_entries stream=height "${indir}/${video}" | grep height ) 18 | H=${H#height=} 19 | # Set the smaller side to small_side 20 | # from https://superuser.com/a/624564 21 | if [ $W -gt $H ] && [ $H -gt ${small_side} ]; then 22 | scale_str="-filter:v scale=-1:${small_side}" 23 | elif [ $H -gt $W ] && [ $W -gt ${small_side} ]; then 24 | scale_str="-filter:v scale=${small_side}:-1" 25 | else 26 | # The small side is smaller than required size, so don't resize/distort the video 27 | scale_str="" 28 | fi 29 | vidlen_sec=$( ffprobe -v error -show_entries format=duration -of default=noprint_wrappers=1:nokey=1 "${indir}/${video}" ) 30 | mkdir -p "${outdir}/${video}" 31 | for st_sec in $(seq 0 ${cliplen_sec} ${vidlen_sec}); do 32 | outfpath=${outdir}/${video}/${st_sec}.mp4 33 | try=0 34 | while [ $try -le $max_tries ]; do 35 | ffmpeg -y -ss ${st_sec} -i "${indir}/${video}" ${scale_str} -t ${cliplen_sec} "${outfpath}" 36 | try=$(( $try + 1 )) 37 | write_errors=$( ffprobe -v error -i "${outfpath}" ) 38 | # If no errors detected by ffprobe, we are done 39 | if [ -z "$write_errors" ]; then 40 | echo $outfpath written successfully in $try tries! 41 | break 42 | fi 43 | done 44 | done 45 | echo "Converted ${video}" 46 | done 47 | 48 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

# VideoAgent: A Memory-augmented Multimodal Agent for Video Understanding (ECCV 2024)

2 | 3 | # Introduction 4 | This is the official code repository of [VideoAgent: A Memory-augmented Multimodal Agent for Video Understanding 5 | ](https://videoagent.github.io/). VideoAgent is a multi-modal agent that can understand an input video and answer the questions you ask about it. 6 | 7 | Given a video and a question, VideoAgent works in two phases: a memory construction phase and an inference phase. During the memory construction phase, structured information is extracted from the video and stored in the memory. During the inference phase, an LLM is prompted to use a set of tools that interact with the memory to answer the question. 8 |

9 | ![teaser](imgs/teaser.png) 10 |
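Programmatically, the two phases correspond to the `preprocess` and `ReActAgent` entry points in `main.py`; the sketch below simply mirrors the calls made in `demo.py`, and assumes the installation steps below are complete and the Video-LLaVA server described in the Usage section is already running.

```python
# Minimal sketch of the two-phase pipeline, mirroring the calls made in demo.py.
from main import preprocess, ReActAgent

video = "sample_videos/kitchen.mp4"

# Phase 1: memory construction -- extract structured information from the video
# (captions, tracking/re-ID, embeddings) into preprocess/kitchen/.
preprocess(video_path_list=[video], base_dir="preprocess", show_tracking=False)

# Phase 2: inference -- an LLM uses a set of tools over the constructed memory
# to answer the question.
answer, log = ReActAgent(
    video_path=video,
    question="Is there a microwave in the kitchen?",
    base_dir="preprocess",
    vqa_tool="videollava",                 # or "gpt-4v", as in config/default.yaml
    use_reid=True,
    openai_api_key="your-openai-api-key",  # set the real key in config/default.yaml
)
print(answer)
```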

11 | 12 | # Prerequisites 13 | This project is tested on Ubuntu 20.04 with an NVIDIA RTX 4090 (24GB). 14 | 15 | 16 | # Installation Guide 17 | Use the following command to create the environment named videoagent: 18 | ```sh 19 | conda env create -f environment.yaml 20 | ``` 21 | 22 | Create the environment of [Video-LLaVA](https://github.com/PKU-YuanGroup/Video-LLaVA) by running the following commands: 23 | ```sh 24 | git clone https://github.com/PKU-YuanGroup/Video-LLaVA 25 | cd Video-LLaVA 26 | conda create -n videollava python=3.10 -y 27 | conda activate videollava 28 | pip install --upgrade pip # enable PEP 660 support 29 | pip install -e . 30 | pip install -e ".[train]" 31 | pip install flash-attn --no-build-isolation 32 | pip install decord opencv-python git+https://github.com/facebookresearch/pytorchvideo.git@28fe037d212663c6a24f373b94cc5d478c8c1a1d 33 | ``` 34 | Note: only the conda environment named videollava is required for this project; the Video-LLaVA repository itself is not. You can clone the Video-LLaVA repository anywhere you like and build the videollava conda environment from there. 35 | 36 | Download ```cache_dir.zip``` and ```tool_models.zip``` from [here](https://zenodo.org/records/11031717) and unzip them into the ```VideoAgent``` directory. This will create two folders under ```VideoAgent```: ```cache_dir``` (the model weights of Video-LLaVA) and ```tool_models``` (the weights of all other models). 37 | 38 | # Usage 39 | Make sure you are under the VideoAgent directory. 40 | Enter your OpenAI API key in ```config/default.yaml```. 41 | 42 | First, open a terminal and run: 43 | ```sh 44 | conda activate videollava 45 | python video-llava.py 46 | ``` 47 | This will start a Video-LLaVA server process that handles the Visual Question Answering requests raised by VideoAgent. 48 | 49 | Once you see ```ready for connection!``` in the first process, open another terminal and run: 50 | ```sh 51 | conda activate videoagent 52 | python demo.py 53 | ``` 54 | This will create a Gradio demo shown as follows. 55 |

56 | ![demo](imgs/demo.png) 57 |

58 | You can choose the example videos for inference, or you can also upload your own videos and questions. Once submitted, VideoAgent will start processing your video and store the files under ```preprocess/your_video_name```. After processing the input video, it will answer your question. 59 | 60 | The results will provide: 61 | 1. the answer to the question 62 | 2. the replay with object re-ID of the input video 63 | 3. the inference log (chain-of-thought) of VideoAgent 64 | 65 | For batch inference, you can run 66 | ```sh 67 | conda activate videoagent 68 | python main.py 69 | ``` 70 | 71 | # Citation 72 | If you find our paper and code useful in your research, please consider giving a star ⭐ and citation 📝. 73 | ``` 74 | @inproceedings{fan2025videoagent, 75 | title={Videoagent: A memory-augmented multimodal agent for video understanding}, 76 | author={Fan, Yue and Ma, Xiaojian and Wu, Rujie and Du, Yuntao and Li, Jiaqi and Gao, Zhi and Li, Qing}, 77 | booktitle={European Conference on Computer Vision}, 78 | pages={75--92}, 79 | year={2025}, 80 | organization={Springer} 81 | } 82 | ``` 83 | 84 | -------------------------------------------------------------------------------- /captioning.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.insert(0, 'LaViLa/') 3 | import os 4 | import urllib.request 5 | from collections import OrderedDict 6 | import numpy as np 7 | import time 8 | import torch 9 | import torchvision.transforms as transforms 10 | import torchvision.transforms._transforms_video as transforms_video 11 | from LaViLa.lavila.data.video_transforms import Permute 12 | from LaViLa.lavila.models.models import VCLM_OPENAI_TIMESFORMER_LARGE_336PX_GPT2_XL 13 | from LaViLa.lavila.models.tokenizer import MyGPT2Tokenizer 14 | from LaViLa.eval_narrator import decode_one 15 | import json 16 | import cv2 17 | import pickle 18 | 19 | 20 | 21 | class Captioning: 22 | def __init__(self, video_path_list, base_dir='preprocess'): 23 | self.video_path_list = video_path_list 24 | self.seconds_per_caption = 2 # a caption covers 2 seconds 25 | self.frames_per_caption = 4 # a caption is generated from 4 frames in the 2-second segments 26 | self.base_dir = base_dir 27 | 28 | 29 | def generate_captions_for_all_videos(self): 30 | """create the captions for all videos""" 31 | start_time = time.time() 32 | crop_size = 336 33 | val_transform = transforms.Compose([ 34 | Permute([3, 0, 1, 2]), 35 | transforms.Resize(crop_size), 36 | transforms.CenterCrop(crop_size), 37 | transforms_video.NormalizeVideo(mean=[108.3272985, 116.7460125, 104.09373615000001], std=[68.5005327, 66.6321579, 70.32316305]) 38 | ]) 39 | ckpt_name = 'vclm_openai_timesformer_large_336px_gpt2_xl.pt_ego4d.jobid_246897.ep_0003.md5sum_443263.pth' 40 | ckpt_path = os.path.join('tool_models/LaViLa/', ckpt_name) 41 | if not os.path.exists(ckpt_path): 42 | print('downloading model to {}'.format(ckpt_path)) 43 | urllib.request.urlretrieve('https://dl.fbaipublicfiles.com/lavila/checkpoints/narrator/{}'.format(ckpt_name), ckpt_path) 44 | ckpt = torch.load(ckpt_path, map_location='cpu') 45 | state_dict = OrderedDict() 46 | for k, v in ckpt['state_dict'].items(): 47 | state_dict[k.replace('module.', '')] = v 48 | # instantiate the model, and load the pre-trained weights 49 | model = VCLM_OPENAI_TIMESFORMER_LARGE_336PX_GPT2_XL( 50 | text_use_cls_token=False, 51 | project_embed_dim=256, 52 | gated_xattn=True, 53 | timesformer_gated_xattn=False, 54 | freeze_lm_vclm=False, # we use model.eval() 
anyway 55 | freeze_visual_vclm=False, # we use model.eval() anyway 56 | num_frames=4, 57 | drop_path_rate=0. 58 | ) 59 | model.load_state_dict(state_dict, strict=True) 60 | model.cuda() 61 | model.eval() 62 | tokenizer = MyGPT2Tokenizer('gpt2-xl', add_bos=True) 63 | end_time = time.time() 64 | print(f'time for loading captioning model: {round(end_time-start_time, 3)} seconds') 65 | 66 | 67 | for video_path in self.video_path_list: 68 | cap = cv2.VideoCapture(video_path) 69 | if not cap.isOpened(): 70 | print("Error: Unable to open video file.") 71 | continue 72 | fps = round(cap.get(cv2.CAP_PROP_FPS)) 73 | total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) 74 | total_captions = total_frames//(fps*self.seconds_per_caption) 75 | frame_interval = fps*self.seconds_per_caption//self.frames_per_caption # the interval between two selected frames 76 | 77 | base_name = os.path.basename(video_path).replace(".mp4", "") 78 | video_dir = os.path.join(self.base_dir, base_name) 79 | if not os.path.exists(video_dir): 80 | os.makedirs(video_dir) 81 | 82 | captions = dict() 83 | start_time = time.time() 84 | cap.set(cv2.CAP_PROP_POS_FRAMES, 0) 85 | for caption_id in range(total_captions): 86 | frames = [] 87 | for i in range(self.frames_per_caption): # 4 frames are selected for generating the caption 88 | success, frame = cap.read() 89 | frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) 90 | frames.append(frame) 91 | for j in range(frame_interval-1): #skip other frames 92 | success, frame = cap.read() 93 | for i in range(fps*self.seconds_per_caption-frame_interval*self.frames_per_caption): 94 | success, frame = cap.read() #skip remaining frames 95 | frames = [torch.tensor(frame, dtype=torch.float32) for frame in frames] 96 | frames = torch.stack(frames, dim=0) 97 | frames = val_transform(frames) 98 | frames = frames.unsqueeze(0) 99 | 100 | with torch.no_grad(): 101 | input_frames = frames.cuda(non_blocking=True) 102 | image_features = model.encode_image(input_frames) 103 | generated_text_ids, ppls = model.generate( 104 | image_features, 105 | tokenizer, 106 | target=None, # free-form generation 107 | max_text_length=77, 108 | top_k=None, 109 | top_p=0.95, # nucleus sampling 110 | num_return_sequences=5, # number of candidates: 5 111 | temperature=0.7, 112 | early_stopping=True, 113 | ) 114 | text = "" 115 | length = -1 116 | for i in range(5): 117 | # select the longest candidate as the caption 118 | generated_text_str = decode_one(generated_text_ids[i], tokenizer) 119 | if len(generated_text_str) > length: 120 | length = len(generated_text_str) 121 | text = generated_text_str 122 | caption_start_frame = caption_id*fps*self.seconds_per_caption 123 | caption_end_frame = (caption_id+1)*fps*self.seconds_per_caption 124 | segment = "{}_{}".format(str(caption_start_frame), str(caption_end_frame)) 125 | captions[segment] = text 126 | print(f"id: {caption_id}, frame_interval: {segment}, caption: {text}") 127 | end_time = time.time() 128 | cap.release() 129 | print(f"captioning time for video {base_name}: {round(end_time-start_time, 3)} seconds") 130 | with open(os.path.join(video_dir, "captions.json"), 'w') as f: 131 | json.dump(captions, f) 132 | segments = list(captions) 133 | segment2id = dict() 134 | for segment in segments: 135 | segment2id[segment] = len(segment2id) 136 | with open(os.path.join(video_dir, "segment2id.json"), 'w') as f: 137 | json.dump(segment2id, f) 138 | 139 | def run(self): 140 | self.generate_captions_for_all_videos() 
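For reference, the captioning stage above can also be exercised on its own — a minimal sketch, assuming a CUDA GPU and that the LaViLa narrator checkpoint can be downloaded into `tool_models/LaViLa/`; in the full system this stage is normally driven by the `preprocess` pipeline rather than called directly:

```python
# Hypothetical standalone run of the Captioning stage defined above.
from captioning import Captioning

captioner = Captioning(
    video_path_list=["sample_videos/boats.mp4", "sample_videos/kitchen.mp4"],
    base_dir="preprocess",
)
# Writes captions.json and segment2id.json under preprocess/<video_name>/,
# with one caption per 2-second segment generated from 4 sampled frames.
captioner.run()
```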
-------------------------------------------------------------------------------- /config/default.yaml: -------------------------------------------------------------------------------- 1 | openai_api_key: your_openai-api_key 2 | use_reid: true 3 | vqa_tool: videollava #videollava or gpt-4v 4 | base_dir: preprocess -------------------------------------------------------------------------------- /database.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import pickle 4 | from collections import defaultdict 5 | from encoder import encode_sentences 6 | from utils import compute_cosine_similarity, top_k_indices 7 | import numpy as np 8 | import sqlite3 9 | 10 | 11 | class DataBase: 12 | def __init__(self, video_path, base_dir='preprocess', use_reid=True): 13 | base_name = os.path.basename(video_path).replace(".mp4", "") 14 | self.video_dir = os.path.join(base_dir, base_name) 15 | self.use_reid = use_reid 16 | if self.use_reid: 17 | with open(os.path.join(self.video_dir, 'reid.pkl'), 'rb') as f: 18 | content = pickle.load(f) 19 | self.frame2uid, self.uid2frame, self.uid2category = content[0], content[1], content[2] 20 | with open(os.path.join(self.video_dir, 'uid2clip.pkl'), 'rb') as f: 21 | self.uid2emb = pickle.load(f) 22 | else: 23 | with open(os.path.join(self.video_dir, 'tracking.pkl'), 'rb') as f: 24 | content = pickle.load(f) 25 | self.frame2uid, self.uid2frame, self.uid2category = content[0], content[1], content[2] 26 | with open(os.path.join(self.video_dir, 'tid2clip.pkl'), 'rb') as f: 27 | self.uid2emb = pickle.load(f) 28 | 29 | with open(os.path.join(self.video_dir, 'segment2id.json')) as f: 30 | self.segment2id = json.load(f) 31 | self.segment_id2uids = defaultdict(set) 32 | for frame in self.frame2uid: 33 | segment_id = 0 34 | for segment in self.segment2id: 35 | start, end = segment.split('_') 36 | start, end = int(start), int(end) 37 | if start <= frame <= end: 38 | segment_id = self.segment2id[segment] 39 | break 40 | uids = list(self.frame2uid[frame]) 41 | self.segment_id2uids[segment_id].update(uids) 42 | 43 | if os.path.exists('database.db'): 44 | os.remove('database.db') 45 | connection = sqlite3.connect('database.db') 46 | cursor = connection.cursor() 47 | create_object = """ 48 | CREATE TABLE Objects( 49 | object_id INT, 50 | category VARCHAR(255), 51 | PRIMARY KEY (object_id) 52 | ); 53 | """ 54 | cursor.execute(create_object) 55 | create_segment = """ 56 | CREATE TABLE Segments( 57 | segment_id INT, 58 | PRIMARY KEY (segment_id) 59 | ); 60 | """ 61 | cursor.execute(create_segment) 62 | create_object_segment = """ 63 | CREATE TABLE Objects_Segments( 64 | object_id INT, 65 | segment_id INT, 66 | PRIMARY KEY (object_id, segment_id), 67 | FOREIGN KEY (object_id) REFERENCES Objects(object_id), 68 | FOREIGN KEY (segment_id) REFERENCES Segments(segment_id) 69 | ); 70 | """ 71 | cursor.execute(create_object_segment) 72 | connection.commit() 73 | 74 | insert_objects = [] 75 | for uid in self.uid2category: 76 | line = "INSERT INTO Objects (object_id, category) VALUES ({}, '{}')".format(str(uid), self.uid2category[uid]) 77 | #print(line) 78 | insert_objects.append(line) 79 | for s in insert_objects: 80 | cursor.execute(s) 81 | 82 | insert_segments = [] 83 | for segment in self.segment2id: 84 | segment_id = self.segment2id[segment] 85 | line = "INSERT INTO Segments (segment_id) VALUES ({})".format(str(segment_id)) 86 | #print(line) 87 | insert_segments.append(line) 88 | for s in insert_segments: 89 | 
cursor.execute(s) 90 | 91 | 92 | insert_object_segments = [] 93 | for segment_id in self.segment_id2uids: 94 | for uid in self.segment_id2uids[segment_id]: 95 | line = "INSERT INTO Objects_Segments (object_id, segment_id) VALUES ({}, {})".format(str(uid), str(segment_id)) 96 | #print(line) 97 | insert_object_segments.append(line) 98 | for s in insert_object_segments: 99 | cursor.execute(s) 100 | 101 | connection.commit() 102 | cursor.close() 103 | connection.close() 104 | 105 | 106 | def retrieve_candidate_objects(self, description): 107 | des_emb = encode_sentences([f"a photo of a {description}."], model_name='clip') 108 | scores = compute_cosine_similarity(des_emb, list(self.uid2emb.values())) 109 | indices = np.where(scores >= 0.26)[0] 110 | candidate_uids = [] 111 | for i in indices: 112 | candidate_uids.append(list(self.uid2emb)[i]) 113 | return candidate_uids 114 | 115 | 116 | def query_database(self, program): 117 | connection = sqlite3.connect('database.db') 118 | cursor = connection.cursor() 119 | try: 120 | cursor.execute(program) 121 | results = cursor.fetchall() 122 | return results 123 | except sqlite3.Error as e: 124 | return e -------------------------------------------------------------------------------- /demo.py: -------------------------------------------------------------------------------- 1 | import gradio as gr 2 | import openai 3 | from main import preprocess, ReActAgent 4 | from multiprocessing import Process 5 | import os 6 | import socket 7 | from omegaconf import OmegaConf 8 | 9 | config = OmegaConf.load('config/default.yaml') 10 | openai_api_key = config['openai_api_key'] 11 | use_reid = config['use_reid'] 12 | vqa_tool = config['vqa_tool'] 13 | base_dir = config['base_dir'] 14 | 15 | 16 | def ask_question(video_file, question): 17 | preprocess(video_path_list=[video_file], 18 | base_dir=base_dir, 19 | show_tracking=False) 20 | answer, log = ReActAgent(video_path=video_file, question=question, base_dir=base_dir, vqa_tool=vqa_tool, use_reid=use_reid, openai_api_key=openai_api_key) 21 | base_name = os.path.basename(video_file).replace(".mp4", "") 22 | reid_file = os.path.join("preprocess", base_name, "reid.mp4") 23 | return answer, reid_file, log 24 | 25 | 26 | with gr.Row(): 27 | # Define inputs 28 | with gr.Column(scale=6): 29 | video_input = gr.Video(label="Upload a video") 30 | question_input = gr.Textbox(label="Ask a question") 31 | 32 | 33 | # Define output 34 | with gr.Column(scale=6): 35 | output_text = gr.Textbox(label="Answer") 36 | output_reid = gr.Video(label="Video replay with object re-identifcation") 37 | output_log = gr.Textbox(label="Inference log") 38 | 39 | 40 | # Create Gradio interface 41 | gr.Interface( 42 | fn=ask_question, 43 | inputs=[video_input, question_input], 44 | outputs=[output_text, output_reid, output_log], 45 | title="VideoAgent", 46 | examples = [ 47 | [f"sample_videos/boats.mp4", "How many boats are there in the video?"], 48 | [f"sample_videos/talking.mp4", 49 | "From what clue do you know that the woman with black spectacles at the start of the video is married?"], 50 | [f"sample_videos/books.mp4", 51 | "Based on the actions observed, what could be a possible motivation or goal for what c is doing in the video?"], 52 | [f"sample_videos/painting.mp4", 53 | "What was the primary purpose of the cup of water in this video, and how did it contribute to the overall painting process?"], 54 | [f"sample_videos/kitchen.mp4", 55 | "Is there a microwave in the kitchen?"], 56 | ], 57 | description="""### This is the demo of 
[VideoAgent](https://videoagent.github.io/). 58 | 59 | Upload a video and ask a question to get an answer from the VideoAgent.""" 60 | 61 | ).launch(share=True) 62 | -------------------------------------------------------------------------------- /encoder.py: -------------------------------------------------------------------------------- 1 | import json 2 | import openai 3 | import numpy as np 4 | import pickle 5 | from sentence_transformers import SentenceTransformer 6 | import os 7 | from PIL import Image 8 | import clip 9 | import torch 10 | from openai import OpenAI 11 | import torchvision.transforms as T 12 | from PIL import Image 13 | from time import time 14 | 15 | 16 | sentence_models = ['text-embedding-ada-002', 'text-embedding-3-large', 'all-MiniLM-L6-v2', 'all-mpnet-base-v2', 'clip'] 17 | 18 | 19 | def encode_sentences(sentence_list, model_name): 20 | '''given a list of sentences, return the embeddings for them using the sentence encoder model''' 21 | assert model_name in sentence_models 22 | emb_list = [] 23 | if model_name in['text-embedding-ada-002', 'text-embedding-3-large']: #openai embedding requires api-key 24 | client = OpenAI() 25 | emb = client.embeddings.create(input=sentence_list, model=model_name) 26 | for i in range(len(sentence_list)): 27 | emb_list.append(np.array(emb.data[i].embedding).reshape(1, -1)) 28 | emb_list = np.concatenate(emb_list, axis=0) 29 | return emb_list 30 | elif model_name == 'clip': # clip embedding 31 | device = "cuda" if torch.cuda.is_available() else "cpu" 32 | model, transform = clip.load("ViT-B/32", device=device) 33 | with torch.no_grad(): 34 | for sentence in sentence_list: 35 | emb_list.append(model.encode_text(clip.tokenize([sentence]).to(device)).cpu().numpy()) 36 | emb_list = np.concatenate(emb_list, axis=0) 37 | return emb_list 38 | else: #sentence transformer encoder 39 | model = SentenceTransformer('sentence-transformers/'+model_name) 40 | num = len(sentence_list) 41 | batch_size = 10 42 | batch_num = num // batch_size 43 | with torch.no_grad(): 44 | for batch_id in range(batch_num): 45 | batch_sentences = sentence_list[batch_id*10: (batch_id+1)*10] 46 | emb_list.append(model.encode(batch_sentences)) 47 | if batch_num * 10 < num: #remaining <10 sentences 48 | remaining_sentences = sentence_list[batch_num*10: num] 49 | emb_list.append(model.encode(remaining_sentences)) 50 | return emb_list 51 | 52 | 53 | if __name__ == '__main__': 54 | encode_sentences(['hello!', 'what'], model_name='text-embedding-ada-002') 55 | 56 | -------------------------------------------------------------------------------- /environment.yaml: -------------------------------------------------------------------------------- 1 | name: videoagent 2 | channels: 3 | - conda-forge 4 | - defaults 5 | dependencies: 6 | - _libgcc_mutex=0.1=conda_forge 7 | - _openmp_mutex=4.5=2_gnu 8 | - bzip2=1.0.8=hd590300_5 9 | - ca-certificates=2023.11.17=hbcca054_0 10 | - ld_impl_linux-64=2.40=h41732ed_0 11 | - libffi=3.4.2=h7f98852_5 12 | - libgcc-ng=13.2.0=h807b86a_3 13 | - libgomp=13.2.0=h807b86a_3 14 | - libnsl=2.0.1=hd590300_0 15 | - libsqlite=3.44.2=h2797004_0 16 | - libuuid=2.38.1=h0b41bf4_0 17 | - libzlib=1.2.13=hd590300_5 18 | - ncurses=6.4=h59595ed_2 19 | - openssl=3.2.0=hd590300_1 20 | - pip=23.3.2=pyhd8ed1ab_0 21 | - python=3.9.18=h0755675_0_cpython 22 | - readline=8.2=h8228510_1 23 | - setuptools=68.2.2=pyhd8ed1ab_0 24 | - tk=8.6.13=noxft_h4845f30_101 25 | - tzdata=2023c=h71feb2d_0 26 | - wheel=0.42.0=pyhd8ed1ab_0 27 | - xz=5.2.6=h166bdaf_0 28 | - pip: 29 | - 
aiohttp==3.9.1 30 | - aiosignal==1.3.1 31 | - annotated-types==0.6.0 32 | - anyio==3.7.1 33 | - async-timeout==4.0.3 34 | - attrs==23.1.0 35 | - av==11.0.0 36 | - blis==0.7.11 37 | - catalogue==2.0.10 38 | - certifi==2023.11.17 39 | - charset-normalizer==3.3.2 40 | - click==8.1.7 41 | - cmake==3.28.1 42 | - confection==0.1.4 43 | - contourpy==1.2.0 44 | - cycler==0.12.1 45 | - cymem==2.0.8 46 | - cython==3.0.7 47 | - dataclasses-json==0.6.3 48 | - decord==0.6.0 49 | - distro==1.8.0 50 | - einops==0.4.1 51 | - exceptiongroup==1.2.0 52 | - fairscale==0.4.4 53 | - filelock==3.13.1 54 | - fonttools==4.47.0 55 | - frozenlist==1.4.1 56 | - fsspec==2023.12.2 57 | - ftfy==4.4.3 58 | - fvcore==0.1.5.post20221221 59 | - gradio==4.22.0 60 | - gradio-client==0.13.0 61 | - gensim==3.8.3 62 | - greenlet==3.0.3 63 | - h11==0.14.0 64 | - html5lib==1.1 65 | - httpcore==1.0.2 66 | - httpx==0.26.0 67 | - huggingface-hub==0.19.4 68 | - idna==3.6 69 | - imageio==2.33.1 70 | - importlib-resources==6.1.1 71 | - pip-install==1.3.5 72 | - iopath==0.1.10 73 | - jinja2==3.1.2 74 | - joblib==1.3.2 75 | - jsonpatch==1.33 76 | - jsonpointer==2.4 77 | - kiwisolver==1.4.5 78 | - langchain==0.1.2 79 | - langchain-community==0.0.14 80 | - langchain-core==0.1.14 81 | - langchain-openai==0.0.3 82 | - langchainhub==0.1.14 83 | - langcodes==3.3.0 84 | - langsmith==0.0.83 85 | - lapx==0.5.5 86 | - lit==17.0.6 87 | - markupsafe==2.1.3 88 | - marshmallow==3.20.2 89 | - matplotlib==3.8.2 90 | - mpmath==1.3.0 91 | - multidict==6.0.4 92 | - murmurhash==1.0.10 93 | - mypy-extensions==1.0.0 94 | - moviepy==1.0.3 95 | - networkx==3.2.1 96 | - nltk==3.8.1 97 | - numpy==1.26.2 98 | - nvidia-cublas-cu11==11.10.3.66 99 | - nvidia-cublas-cu12==12.1.3.1 100 | - nvidia-cuda-cupti-cu11==11.7.101 101 | - nvidia-cuda-cupti-cu12==12.1.105 102 | - nvidia-cuda-nvrtc-cu11==11.7.99 103 | - nvidia-cuda-nvrtc-cu12==12.1.105 104 | - nvidia-cuda-runtime-cu11==11.7.99 105 | - nvidia-cuda-runtime-cu12==12.1.105 106 | - nvidia-cudnn-cu11==8.5.0.96 107 | - nvidia-cudnn-cu12==8.9.2.26 108 | - nvidia-cufft-cu11==10.9.0.58 109 | - nvidia-cufft-cu12==11.0.2.54 110 | - nvidia-curand-cu11==10.2.10.91 111 | - nvidia-curand-cu12==10.3.2.106 112 | - nvidia-cusolver-cu11==11.4.0.1 113 | - nvidia-cusolver-cu12==11.4.5.107 114 | - nvidia-cusparse-cu11==11.7.4.91 115 | - nvidia-cusparse-cu12==12.1.0.106 116 | - nvidia-nccl-cu11==2.14.3 117 | - nvidia-nccl-cu12==2.18.1 118 | - nvidia-nvjitlink-cu12==12.3.101 119 | - nvidia-nvtx-cu11==11.7.91 120 | - nvidia-nvtx-cu12==12.1.105 121 | - omegaconf==2.3.0 122 | - openai==1.9.0 123 | - opencv-python==4.8.1.78 124 | - packaging==23.2 125 | - pandas==1.3.5 126 | - parameterized==0.9.0 127 | - pathy==0.10.3 128 | - pillow==10.1.0 129 | - pims==0.6.1 130 | - portalocker==2.8.2 131 | - preshed==3.0.9 132 | - protobuf==4.21.12 133 | - psutil==5.9.7 134 | - py-cpuinfo==9.0.0 135 | - pydantic==2.5.3 136 | - pyparsing==3.1.1 137 | - python-dateutil==2.8.2 138 | - pytorchvideo==0.1.5 139 | - pytz==2023.3.post1 140 | - pyyaml==6.0.1 141 | - regex==2023.10.3 142 | - requests==2.31.0 143 | - safetensors==0.4.1 144 | - scikit-learn==1.0.2 145 | - scipy==1.11.4 146 | - seaborn==0.13.1 147 | - sentence-transformers==2.2.2 148 | - sentencepiece==0.1.99 149 | - six==1.16.0 150 | - slicerator==1.1.0 151 | - smart-open==6.4.0 152 | - sniffio==1.3.0 153 | - sqlalchemy==2.0.25 154 | - srsly==2.4.8 155 | - sympy==1.12 156 | - tabulate==0.9.0 157 | - tenacity==8.2.3 158 | - termcolor==2.4.0 159 | - theano==1.0.5 160 | - thinc==8.1.12 161 | - 
thop==0.1.1-2209072238 162 | - threadpoolctl==3.2.0 163 | - tiktoken==0.5.2 164 | - timm==0.5.4 165 | - tokenizers==0.12.1 166 | - torch==2.1.2 167 | - torchvision==0.16.2 168 | - tqdm==4.66.1 169 | - transformers==4.27.0 170 | - triton==2.1.0 171 | - types-requests==2.31.0.20240106 172 | - typing-extensions==4.9.0 173 | - typing-inspect==0.9.0 174 | - ultralytics==8.0.235 175 | - urllib3==2.1.0 176 | - wasabi==0.10.1 177 | - wcwidth==0.2.12 178 | - webdataset==0.2.86 179 | - webencodings==0.5.1 180 | - xdg==6.0.0 181 | - yacs==0.1.8 182 | - yarl==1.9.4 183 | - zipp==3.17.0 184 | - git+https://github.com/openai/CLIP.git 185 | - git+https://github.com/Maluuba/nlg-eval.git@master 186 | -------------------------------------------------------------------------------- /imgs/demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/imgs/demo.png -------------------------------------------------------------------------------- /imgs/teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/imgs/teaser.png -------------------------------------------------------------------------------- /preprocess/boats/captions.json: -------------------------------------------------------------------------------- 1 | {"0_48": "#C C looks around ", "48_96": "#C C looks around the", "96_144": "#C C looks at the", "144_192": "#C C looks around ", "192_240": "#C C looks around the lake", "240_288": "#C C looks around "} -------------------------------------------------------------------------------- /preprocess/boats/reid.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/boats/reid.mp4 -------------------------------------------------------------------------------- /preprocess/boats/reid.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/boats/reid.pkl -------------------------------------------------------------------------------- /preprocess/boats/segment2id.json: -------------------------------------------------------------------------------- 1 | {"0_48": 0, "48_96": 1, "96_144": 2, "144_192": 3, "192_240": 4, "240_288": 5} -------------------------------------------------------------------------------- /preprocess/boats/segment_textual_embedding.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/boats/segment_textual_embedding.pkl -------------------------------------------------------------------------------- /preprocess/boats/segment_visual_embedding.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/boats/segment_visual_embedding.pkl -------------------------------------------------------------------------------- /preprocess/boats/tid2clip.pkl: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/boats/tid2clip.pkl -------------------------------------------------------------------------------- /preprocess/boats/tid2dinov2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/boats/tid2dinov2.pkl -------------------------------------------------------------------------------- /preprocess/boats/tracking.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/boats/tracking.pkl -------------------------------------------------------------------------------- /preprocess/boats/uid2clip.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/boats/uid2clip.pkl -------------------------------------------------------------------------------- /preprocess/books/captions.json: -------------------------------------------------------------------------------- 1 | {"0_60": "#C C picks books from the floor", "60_120": "#C C picks the book on the floor", "120_180": "#C C holds a book in the book shelf", "180_240": "#C C places the book on the shelf", "240_300": "#C C removes the book from the", "300_360": "#C C holds the book with his left hand", "360_420": "#C C touches a book on a shelf with his right hand", "420_480": "#C C looks at the books on the floor", "480_540": "#C C picks the book from the shelf", "540_600": "#C C arranges books on the shelf", "600_660": "#C C looks around the floor", "660_720": "#C C picks a book from the floor", "720_780": "#C C places the books on the shelf", "780_840": "#C C picks a book from the shelf with his right hand", "840_900": "#C C adjusts the books on the shelf with her left hand", "900_960": "#C C arranges the books in the shelf", "960_1020": "#C C picks a book from the floor", "1020_1080": "#C C picks the books from the floor", "1080_1140": "#C C puts the book on the shelf", "1140_1200": "#C C adjusts the books on the shelf", "1200_1260": "#C C picks a book from the shelf", "1260_1320": "#C C arranges books on the book shelf", "1320_1380": "#C C picks the book from the", "1380_1440": "#C C puts book on top of the bookshe", "1440_1500": "#C C puts the books on the bookshe", "1500_1560": "#C C picks the book from the floor", "1560_1620": "#C C picks books from the floor", "1620_1680": "#C C places the book in his right hand in the bookshelf", "1680_1740": "#C C puts the books in the book shelf with his right hand", "1740_1800": "#C C picks a book from the floor with her left hand", "1800_1860": "#C C arranges books on the shelf", "1860_1920": "#C C arranges books", "1920_1980": "#C C looks around the house", "1980_2040": "#C C stares at the", "2040_2100": "#C C looks at the", "2100_2160": "#C C looks around", "2160_2220": "#C C adjusts the books on the shelf with his hands", "2220_2280": "#C C moves the books in the bookshelf with his right hand", "2280_2340": "#C C arranges books on the shelf", "2340_2400": "#C C touches the books on the shelf", "2400_2460": "#c c puts books on the shelf", "2460_2520": "#C C holds the book with her right hand", "2520_2580": "#C C places the book on the book shelf", "2580_2640": "#C C puts book on the shelf", 
"2640_2700": "#C C picks the books from the floor", "2700_2760": "#C C looks around the room.", "2760_2820": "#C C adjusts the books in the shelf with his hands", "2820_2880": "#C C picks the books from the floor", "2880_2940": "#C C picks a book from the", "2940_3000": "#C C arranges the books in the shelf", "3000_3060": "#C C arranges books on the shelf", "3060_3120": "#C C picks a book from the floor with her right hand", "3120_3180": "#C C picks the books on the floor", "3180_3240": "#C C picks a book from the shelf with his left hand", "3240_3300": "#C C picks the book from the shelf", "3300_3360": "#C C puts the book in the shelf with her right hand", "3360_3420": "#C C places the book on the shelf with her right hand", "3420_3480": "#C C places the book in his left hand in the shelf", "3480_3540": "#C C adjusts the book on the shelf.", "3540_3600": "#C C holds the books on her hands", "3600_3660": "#C C picks a", "3660_3720": "#C C picks books from the floor", "3720_3780": "#C C picks a book from the floor", "3780_3840": "#C C puts the books in the book shelf", "3840_3900": "#C C picks a book from the shelf", "3900_3960": "#C C arranges books in the shelve", "3960_4020": "#C C adjusts the books on the shelf ", "4020_4080": "#C C puts the books on the", "4080_4140": "#C C picks a book from the floor with her right hand", "4140_4200": "#C C holds the books with her hands", "4200_4260": "#C C picks up the books from the floor", "4260_4320": "#C C puts the books on the floor", "4320_4380": "#C C places the book on the shelf", "4380_4440": "#C C picks the book holder from the floor with her right hand", "4440_4500": "#C C arranges the books in the shelf with his right hand", "4500_4560": "#C C looks at the books on the", "4560_4620": "#C C puts a book on the floor", "4620_4680": "#C C places the book in his left hand on the ground", "4680_4740": "#C C picks the book from the floor with her right hand", "4740_4800": "#C C looks around the", "4800_4860": "#C C picks the book on the shelf", "4860_4920": "#C C arranges the books in the bookcase", "4920_4980": "#C C looks around the house", "4980_5040": "#C C picks the book from the", "5040_5100": "#C C picks a book from the", "5100_5160": "#C C puts the books on the floor", "5160_5220": "#C C puts books on the floor", "5220_5280": "#C C picks the books from the floor", "5280_5340": "#C C looks around the house", "5340_5400": "#C C puts the books on the shelf"} -------------------------------------------------------------------------------- /preprocess/books/reid.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/books/reid.mp4 -------------------------------------------------------------------------------- /preprocess/books/reid.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/books/reid.pkl -------------------------------------------------------------------------------- /preprocess/books/segment2id.json: -------------------------------------------------------------------------------- 1 | {"0_60": 0, "60_120": 1, "120_180": 2, "180_240": 3, "240_300": 4, "300_360": 5, "360_420": 6, "420_480": 7, "480_540": 8, "540_600": 9, "600_660": 10, "660_720": 11, "720_780": 12, "780_840": 13, "840_900": 14, "900_960": 15, "960_1020": 16, "1020_1080": 17, "1080_1140": 
18, "1140_1200": 19, "1200_1260": 20, "1260_1320": 21, "1320_1380": 22, "1380_1440": 23, "1440_1500": 24, "1500_1560": 25, "1560_1620": 26, "1620_1680": 27, "1680_1740": 28, "1740_1800": 29, "1800_1860": 30, "1860_1920": 31, "1920_1980": 32, "1980_2040": 33, "2040_2100": 34, "2100_2160": 35, "2160_2220": 36, "2220_2280": 37, "2280_2340": 38, "2340_2400": 39, "2400_2460": 40, "2460_2520": 41, "2520_2580": 42, "2580_2640": 43, "2640_2700": 44, "2700_2760": 45, "2760_2820": 46, "2820_2880": 47, "2880_2940": 48, "2940_3000": 49, "3000_3060": 50, "3060_3120": 51, "3120_3180": 52, "3180_3240": 53, "3240_3300": 54, "3300_3360": 55, "3360_3420": 56, "3420_3480": 57, "3480_3540": 58, "3540_3600": 59, "3600_3660": 60, "3660_3720": 61, "3720_3780": 62, "3780_3840": 63, "3840_3900": 64, "3900_3960": 65, "3960_4020": 66, "4020_4080": 67, "4080_4140": 68, "4140_4200": 69, "4200_4260": 70, "4260_4320": 71, "4320_4380": 72, "4380_4440": 73, "4440_4500": 74, "4500_4560": 75, "4560_4620": 76, "4620_4680": 77, "4680_4740": 78, "4740_4800": 79, "4800_4860": 80, "4860_4920": 81, "4920_4980": 82, "4980_5040": 83, "5040_5100": 84, "5100_5160": 85, "5160_5220": 86, "5220_5280": 87, "5280_5340": 88, "5340_5400": 89} -------------------------------------------------------------------------------- /preprocess/books/segment_textual_embedding.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/books/segment_textual_embedding.pkl -------------------------------------------------------------------------------- /preprocess/books/segment_visual_embedding.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/books/segment_visual_embedding.pkl -------------------------------------------------------------------------------- /preprocess/books/tid2clip.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/books/tid2clip.pkl -------------------------------------------------------------------------------- /preprocess/books/tid2dinov2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/books/tid2dinov2.pkl -------------------------------------------------------------------------------- /preprocess/books/tracking.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/books/tracking.pkl -------------------------------------------------------------------------------- /preprocess/books/uid2clip.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/books/uid2clip.pkl -------------------------------------------------------------------------------- /preprocess/kitchen/captions.json: -------------------------------------------------------------------------------- 1 | {"0_30": "#C C opens the kitchen cabinet", "30_60": "#C C opens the cabinet door", "60_90": "#C C picks a glass in the", "90_120": "#C C opens the 
cabinet.", "120_150": "#C C opens the tap", "150_180": "#C C puts water in the cup", "180_210": "#C C closes the tap.", "210_240": "#C C puts cup on the sink counter", "240_270": "#C C picks the cup from the counter", "270_300": "#C C opens a refrigerator with his left", "300_330": "#C C closes the fridge with his right", "330_360": "#C C picks a bottle of milk from the", "360_390": "#C C puts the bottle in the fridge", "390_420": "#C C closes the fridge with his left hand", "420_450": "#C C closes the refrigerator with his left hand", "450_480": "#C C opens the water bottle lid", "480_510": "#C C covers the kettle with the lid", "510_540": "#C C puts water in the coffee maker", "540_570": "#C C pours water in the sink", "570_600": "#C C pours the milk into the"} -------------------------------------------------------------------------------- /preprocess/kitchen/reid.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/kitchen/reid.mp4 -------------------------------------------------------------------------------- /preprocess/kitchen/reid.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/kitchen/reid.pkl -------------------------------------------------------------------------------- /preprocess/kitchen/segment2id.json: -------------------------------------------------------------------------------- 1 | {"0_30": 0, "30_60": 1, "60_90": 2, "90_120": 3, "120_150": 4, "150_180": 5, "180_210": 6, "210_240": 7, "240_270": 8, "270_300": 9, "300_330": 10, "330_360": 11, "360_390": 12, "390_420": 13, "420_450": 14, "450_480": 15, "480_510": 16, "510_540": 17, "540_570": 18, "570_600": 19} -------------------------------------------------------------------------------- /preprocess/kitchen/segment_0.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/kitchen/segment_0.mp4 -------------------------------------------------------------------------------- /preprocess/kitchen/segment_1.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/kitchen/segment_1.mp4 -------------------------------------------------------------------------------- /preprocess/kitchen/segment_18.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/kitchen/segment_18.mp4 -------------------------------------------------------------------------------- /preprocess/kitchen/segment_3.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/kitchen/segment_3.mp4 -------------------------------------------------------------------------------- /preprocess/kitchen/segment_8.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/kitchen/segment_8.mp4 
-------------------------------------------------------------------------------- /preprocess/kitchen/segment_textual_embedding.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/kitchen/segment_textual_embedding.pkl -------------------------------------------------------------------------------- /preprocess/kitchen/segment_visual_embedding.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/kitchen/segment_visual_embedding.pkl -------------------------------------------------------------------------------- /preprocess/kitchen/tid2clip.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/kitchen/tid2clip.pkl -------------------------------------------------------------------------------- /preprocess/kitchen/tid2dinov2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/kitchen/tid2dinov2.pkl -------------------------------------------------------------------------------- /preprocess/kitchen/tracking.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/kitchen/tracking.pkl -------------------------------------------------------------------------------- /preprocess/kitchen/uid2clip.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/kitchen/uid2clip.pkl -------------------------------------------------------------------------------- /preprocess/painting/captions.json: -------------------------------------------------------------------------------- 1 | {"0_60": "#C C draws on the paper with the paint brush in his right hand.", "60_120": "#C C draws on the paper with the painting brush in his right hand. ", "120_180": "#C C draws on the paper with the paint brush in his right hand.", "180_240": "#C C moves a paint palette on the table with his right hand.", "240_300": "#C C paints the cover of the paint palette with the paint brush in his right hand.", "300_360": "#C C draws on the paper with the paint brush in his right hand. ", "360_420": "#C C paints on the paper with the paint brush in his right hand.", "420_480": "#C C adjusts the drawing board with his left hand.", "480_540": "#C C draws on the paper with the paint brush in his right hand.", "540_600": "#C C dips the paint brush in his right hand in the cup of water on the table.", "600_660": "#C C dips the paint brush in his right hand in the paint palette on the table. 
", "660_720": "#C C smears watercolor on the watercolor set with the", "720_780": "#C C dips the paint brush in his right hand in the paint palette on the table.", "780_840": "#C C dips the paint brush in his right hand in the paint palette on the table.", "840_900": "#C C dips the paint brush in his right hand in the cup of water on the table", "900_960": "#C C paints the cover of the paint palette with the paint brush in his right hand.", "960_1020": "#C C paints the cover of the paint palette with the paint brush in his right hand.", "1020_1080": "#C C draws on the paper with the paint brush in his right hand.", "1080_1140": "#C C paints a", "1140_1200": "#C C paints a", "1200_1260": "#C C paints a", "1260_1320": "#C C paints on the paper with the paint brush in his right hand", "1320_1380": "#C C draws on the paper with the paint brush in his right hand.", "1380_1440": "#C C adjusts the painting board with his right hand.", "1440_1500": "#C C paints the cover of the paint palette with the paint brush in his right hand.", "1500_1560": "#C C dips the paint brush in his right hand in the paint palette on the table.", "1560_1620": "#C C dips the paint brush in his right hand in the cup of water on the table.", "1620_1680": "#C C stirs brush in the watercolor pan", "1680_1740": "#C C dips the paint brush in his right hand in the paint palette on the table.", "1740_1800": "#C C paints on the paper with the paint brush in his right hand. ", "1800_1860": "#C C paints a", "1860_1920": "#C C paints a", "1920_1980": "#C C paints a", "1980_2040": "#C C draws on the paper with the paint brush in his right hand.", "2040_2100": "#C C paints the", "2100_2160": "#C C paints a", "2160_2220": "#C C draws on the paper with the paint brush in his right hand. ", "2220_2280": "#C C moves the paint palette on the table with his right hand.", "2280_2340": "#C C dips the paint brush in his right hand in the paint palette on the table.", "2340_2400": "#C C paints the cover of the paint palette with the paint brush in his right hand.", "2400_2460": "#C C operates the tablet computer on the table with his right hand.", "2460_2520": "#C C paints a", "2520_2580": "#C C draws on the paper with the paint brush in his right hand.", "2580_2640": "#C C adjusts the drawing board with his left hand.", "2640_2700": "#C C draws on the paper with the paint brush in his right hand.", "2700_2760": "#C C adjusts the book on the table with his left hand", "2760_2820": "#C C lifts the paint brush from the drawing board with his right hand.", "2820_2880": "#C C paints a", "2880_2940": "#C C paints a", "2940_3000": "#C C draws on the paper with the painting brush in his right hand.", "3000_3060": "#C C paints a", "3060_3120": "#C C dips the paint brush in his right hand in the paint palette on the table.", "3120_3180": "#C C draws on the paper with the paint brush in his right hand.", "3180_3240": "#C C paints a", "3240_3300": "#C C adjusts the book on the table with her right hand.", "3300_3360": "#C C paints the cover of the paint palette with the paint brush in his right hand.", "3360_3420": "#C C paints on the paper with the painting brush in his right hand.", "3420_3480": "#C C paints a", "3480_3540": "#C C paints the cover of the paint palette with the paint brush in his right hand.", "3540_3600": "#C C draws on the paper with the paint brush in his right hand.", "3600_3660": "#C C paints a", "3660_3720": "#C C draws on the paper with the paint brush in his right hand.", "3720_3780": "#C C paints on the paper with the paint 
brush in his right hand.", "3780_3840": "#C C adjusts the drawing board with his right hand.", "3840_3900": "#C C paints the cover of the paint palette with the paint brush in his right hand. ", "3900_3960": "#C C draws on the paper with the paint brush in his right hand.", "3960_4020": "#C C draws on the paper with the painting brush in his right hand.", "4020_4080": "#C C draws on the paper with the paint brush in his right hand.", "4080_4140": "#C C draws on the paper with the paint brush in his right hand.", "4140_4200": "#C C draws on the paper with the paint brush in his right hand.", "4200_4260": "#C C draws on the paper with the painting brush in his right hand.", "4260_4320": "#C C draws on the paper with the paint brush in his right hand.", "4320_4380": "#C C draws on the paper with the paint brush in his right hand.", "4380_4440": "#C C paints a", "4440_4500": "#C C draws on the paper with the painting brush in his right hand.", "4500_4560": "#C C draws on the paper with the paint brush in his right hand.", "4560_4620": "#C C dips the paint brush in his right hand in the paint palette on the table.", "4620_4680": "#C C paints the cover of the paint palette with the paint brush in his right hand.", "4680_4740": "#C C draws on the paper with the paint brush in his right hand.", "4740_4800": "#C C paints a", "4800_4860": "#C C draws on the paper with the paint brush in his right hand.", "4860_4920": "#C C paints a", "4920_4980": "#C C adjusts the book on his lap with his left hand.", "4980_5040": "#C C dips the paint brush in his right hand into the cup of water on the table.", "5040_5100": "#C C moves the painting brush", "5100_5160": "#C C dips the paint brush in his right hand in the cup of water on the table.", "5160_5220": "#C C touches the book with his left hand", "5220_5280": "#C C dips the paint brush in his right hand in the cup of water on the table.", "5280_5340": "#C C paints the cover of the paint palette with the paint brush in his right hand.", "5340_5400": "#C C dips the paint brush in his right hand in the cup of water on the table."} -------------------------------------------------------------------------------- /preprocess/painting/reid.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/painting/reid.mp4 -------------------------------------------------------------------------------- /preprocess/painting/reid.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/painting/reid.pkl -------------------------------------------------------------------------------- /preprocess/painting/segment2id.json: -------------------------------------------------------------------------------- 1 | {"0_60": 0, "60_120": 1, "120_180": 2, "180_240": 3, "240_300": 4, "300_360": 5, "360_420": 6, "420_480": 7, "480_540": 8, "540_600": 9, "600_660": 10, "660_720": 11, "720_780": 12, "780_840": 13, "840_900": 14, "900_960": 15, "960_1020": 16, "1020_1080": 17, "1080_1140": 18, "1140_1200": 19, "1200_1260": 20, "1260_1320": 21, "1320_1380": 22, "1380_1440": 23, "1440_1500": 24, "1500_1560": 25, "1560_1620": 26, "1620_1680": 27, "1680_1740": 28, "1740_1800": 29, "1800_1860": 30, "1860_1920": 31, "1920_1980": 32, "1980_2040": 33, "2040_2100": 34, "2100_2160": 35, "2160_2220": 36, "2220_2280": 37, "2280_2340": 
38, "2340_2400": 39, "2400_2460": 40, "2460_2520": 41, "2520_2580": 42, "2580_2640": 43, "2640_2700": 44, "2700_2760": 45, "2760_2820": 46, "2820_2880": 47, "2880_2940": 48, "2940_3000": 49, "3000_3060": 50, "3060_3120": 51, "3120_3180": 52, "3180_3240": 53, "3240_3300": 54, "3300_3360": 55, "3360_3420": 56, "3420_3480": 57, "3480_3540": 58, "3540_3600": 59, "3600_3660": 60, "3660_3720": 61, "3720_3780": 62, "3780_3840": 63, "3840_3900": 64, "3900_3960": 65, "3960_4020": 66, "4020_4080": 67, "4080_4140": 68, "4140_4200": 69, "4200_4260": 70, "4260_4320": 71, "4320_4380": 72, "4380_4440": 73, "4440_4500": 74, "4500_4560": 75, "4560_4620": 76, "4620_4680": 77, "4680_4740": 78, "4740_4800": 79, "4800_4860": 80, "4860_4920": 81, "4920_4980": 82, "4980_5040": 83, "5040_5100": 84, "5100_5160": 85, "5160_5220": 86, "5220_5280": 87, "5280_5340": 88, "5340_5400": 89} -------------------------------------------------------------------------------- /preprocess/painting/segment_83.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/painting/segment_83.mp4 -------------------------------------------------------------------------------- /preprocess/painting/segment_85.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/painting/segment_85.mp4 -------------------------------------------------------------------------------- /preprocess/painting/segment_textual_embedding.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/painting/segment_textual_embedding.pkl -------------------------------------------------------------------------------- /preprocess/painting/segment_visual_embedding.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/painting/segment_visual_embedding.pkl -------------------------------------------------------------------------------- /preprocess/painting/tid2clip.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/painting/tid2clip.pkl -------------------------------------------------------------------------------- /preprocess/painting/tid2dinov2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/painting/tid2dinov2.pkl -------------------------------------------------------------------------------- /preprocess/painting/tracking.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/painting/tracking.pkl -------------------------------------------------------------------------------- /preprocess/painting/uid2clip.pkl: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/painting/uid2clip.pkl -------------------------------------------------------------------------------- /preprocess/talking/captions.json: -------------------------------------------------------------------------------- 1 | {"0_60": "#O woman X points at the ceiling", "60_120": "#O person Y converses with C", "120_180": "#O person Y touches her nose with her right", "180_240": "#O woman X lifts up her", "240_300": "#C C moves the hands", "300_360": "#C C looks around the room", "360_420": "#O woman X uses gesture with a", "420_480": "#C C interacts with the woman X", "480_540": "#O woman X points at the ceiling", "540_600": "#O woman X does a hand gesture", "600_660": "#O woman X raises a hand", "660_720": "#O The woman X adjusts her glasses with both hands", "720_780": "#O person X points towards the ceiling", "780_840": "#C C interacts with lady X", "840_900": "#O person Y adjusts the camera", "900_960": "#O A woman X interacts with C", "960_1020": "#O woman X converses with C", "1020_1080": "#C C talks to the colleagues", "1080_1140": "#O woman X talks to woman", "1140_1200": "#O woman Y converses with woman", "1200_1260": "#O person Z moves her hands", "1260_1320": "#O person Y looks at person X", "1320_1380": "#C C looks at the people in the", "1380_1440": "#C C converses with a woman V,W and X and a man Y and Z", "1440_1500": "#O The man Y holds the phone with his right hand.", "1500_1560": "#C C converses with a man X and Y and a woman Z", "1560_1620": "#C C converses with the woman Y", "1620_1680": "#C The man M interacts with C, the woman N, the man M and the woman N", "1680_1740": "#O A Woman M holds her waist with both hands", "1740_1800": "#O The Woman X taps her left fingers on her thigh", "1800_1860": "#O Woman A Holds a camera with hands", "1860_1920": "#O person Y puts the card on the table", "1920_1980": "#O A woman X looks at C", "1980_2040": "#O person X talks to person", "2040_2100": "#O Woman Y eats food with the right", "2100_2160": "#C C looks around the room", "2160_2220": "#O person X interacts with person Z", "2220_2280": "#O A woman X stands in the", "2280_2340": "#C C looks at the people in the", "2340_2400": "#C C looks at the woman", "2400_2460": "#C C stares at a woman Y", "2460_2520": "#C C looks around the house", "2520_2580": "#C C converses with the man Y, the man X and the woman Z", "2580_2640": "#O A man X talks to man Z"} -------------------------------------------------------------------------------- /preprocess/talking/reid.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/talking/reid.mp4 -------------------------------------------------------------------------------- /preprocess/talking/reid.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/talking/reid.pkl -------------------------------------------------------------------------------- /preprocess/talking/segment2id.json: -------------------------------------------------------------------------------- 1 | {"0_60": 0, "60_120": 1, "120_180": 2, "180_240": 3, "240_300": 4, "300_360": 5, "360_420": 6, "420_480": 7, "480_540": 8, "540_600": 9, "600_660": 10, "660_720": 11, "720_780": 12, "780_840": 13, "840_900": 
14, "900_960": 15, "960_1020": 16, "1020_1080": 17, "1080_1140": 18, "1140_1200": 19, "1200_1260": 20, "1260_1320": 21, "1320_1380": 22, "1380_1440": 23, "1440_1500": 24, "1500_1560": 25, "1560_1620": 26, "1620_1680": 27, "1680_1740": 28, "1740_1800": 29, "1800_1860": 30, "1860_1920": 31, "1920_1980": 32, "1980_2040": 33, "2040_2100": 34, "2100_2160": 35, "2160_2220": 36, "2220_2280": 37, "2280_2340": 38, "2340_2400": 39, "2400_2460": 40, "2460_2520": 41, "2520_2580": 42, "2580_2640": 43} -------------------------------------------------------------------------------- /preprocess/talking/segment_10.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/talking/segment_10.mp4 -------------------------------------------------------------------------------- /preprocess/talking/segment_11.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/talking/segment_11.mp4 -------------------------------------------------------------------------------- /preprocess/talking/segment_9.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/talking/segment_9.mp4 -------------------------------------------------------------------------------- /preprocess/talking/segment_textual_embedding.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/talking/segment_textual_embedding.pkl -------------------------------------------------------------------------------- /preprocess/talking/segment_visual_embedding.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/talking/segment_visual_embedding.pkl -------------------------------------------------------------------------------- /preprocess/talking/tid2clip.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/talking/tid2clip.pkl -------------------------------------------------------------------------------- /preprocess/talking/tid2dinov2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/talking/tid2dinov2.pkl -------------------------------------------------------------------------------- /preprocess/talking/tracking.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/talking/tracking.pkl -------------------------------------------------------------------------------- /preprocess/talking/uid2clip.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/talking/uid2clip.pkl 
-------------------------------------------------------------------------------- /prompts/database_query_prompt.txt: -------------------------------------------------------------------------------- 1 | You are tasked with answering a question about a video using a database. The database consists of three tables: 2 | 3 | TABLE Objects( 4 | object_id INT, 5 | category VARCHAR(255), 6 | PRIMARY KEY (object_id) 7 | ) 8 | The 'Objects' table catalogues the people or objects in the video, with each assigned a unique 'object_id' and 'category'. For example, an object entry may be (1, 'cup'). 9 | 10 | TABLE Segments( 11 | segment_id INT, 12 | PRIMARY KEY (segment_id) 13 | ) 14 | The 'Segments' table lists the 2-second segments of the video. The 'segment_id' starts from 0 and increments by 1 sequentially. 15 | 16 | TABLE Objects_Segments( 17 | object_id INT, 18 | segment_id INT, 19 | PRIMARY KEY (object_id, segment_id), 20 | FOREIGN KEY (object_id) REFERENCES Objects(object_id), 21 | FOREIGN KEY (segment_id) REFERENCES Segments(segment_id) 22 | ) 23 | The 'Objects_Segments' table links the 'Objects' and 'Segments' tables, recording the objects that appear in each segment. 24 | 25 | You have access to the following tools: 26 | 27 | {tools} 28 | 29 | ATTENTION: 30 | 1. Since you only have information about the objects and the segments in which they appear, if you think the question requires more information, just output "I cannot answer this question." 31 | 2. The categories of the objects/people are limited. To find a specific object, you can first query the database for all the object categories, and match the object to one of the categories. If you cannot find objects using the categories, you can also try the tool 'retreive_candidate_objects'. 32 | 3. Use single quotes for the strings in the MySQL program, for instance: SELECT COUNT(DISTINCT object_id) FROM Objects WHERE category = 'person' 33 | 34 | Use the following format: 35 | 36 | Question: the input question you must answer 37 | Thought: you should always think about what to do 38 | Action: the action to take, should be one of [{tool_names}] 39 | Action Input: the input to the action 40 | Observation: the result of the action... (this Thought/Action/Action Input/Observation can repeat N times) 41 | Thought: I now know the final answer 42 | Final Answer: the answer to the original input question 43 | 44 | Begin! 45 | 46 | Question: {input} 47 | Thought: {agent_scratchpad} -------------------------------------------------------------------------------- /prompts/multiple_choice_prompt.txt: -------------------------------------------------------------------------------- 1 | You are tasked with answering a multiple-choice question related to a video. The question has 5 choices, labeled as 0, 1, 2, 3, 4. The video is sliced into 2-second segments, each with a segment ID starting from zero and incrementing in chronological order. Each segment has a caption depicting the event. 2 | There is an object memory that saves the objects and the segments in which they appear. The object memory is maintained by another agent. 3 | You have access to the following tools: 4 | 5 | {tools} 6 | 7 | ATTENTION: 8 | 1. The segment captions with prefix '#C' refer to the camera wearer, while those with prefix '#O' refer to someone other than the camera wearer. 9 | 2. You can use both 'visual_question_answering' and 'object_memory_querying' to answer questions related to objects or people. 10 | 3. The 'visual_question_answering' tool may hallucinate.
You should pay more attention to the description than to the answer in 'visual_question_answering'. 11 | 4. Use double quotes for the string arguments of the tools. The input to the tools should not contain any single quotes. If the tool has two arguments, output the arguments in brackets such as ("what is the man doing", 1). 12 | 5. It's easier to answer the multiple-choice question by validating the choices. 13 | 6. If the information is too vague to provide an accurate answer, make your best guess. 14 | 15 | Use the following format: 16 | 17 | Question: the input question you must answer 18 | Thought: you should always think about what to do 19 | Action: the action to take, should be one of [{tool_names}] 20 | Action Input: the input to the action 21 | Observation: the result of the action... (this Thought/Action/Action Input/Observation can repeat N times) 22 | Thought: I now know the final answer 23 | Final Answer: the correct choice label (0, 1, 2, 3, 4) to the original input question 24 | 25 | Begin! 26 | 27 | Question: {input} 28 | Thought: {agent_scratchpad} 29 | -------------------------------------------------------------------------------- /prompts/prompt.txt: -------------------------------------------------------------------------------- 1 | You are tasked with answering a question related to a video. The video is sliced into 2-second segments, each with a segment ID starting from zero and incrementing in chronological order. Each segment has a caption depicting the event. 2 | There is an object memory that saves the objects and the segments in which they appear. The object memory is maintained by another agent. 3 | You have access to the following tools: 4 | 5 | {tools} 6 | 7 | ATTENTION: 8 | 1. The segment captions with prefix '#C' refer to the camera wearer, while those with prefix '#O' refer to someone other than the camera wearer. 9 | 2. You can use both 'visual_question_answering' and 'object_memory_querying' to answer questions related to objects or people. 10 | 3. The 'visual_question_answering' tool may hallucinate. You should pay more attention to the description than to the answer in 'visual_question_answering'. 11 | 4. Use double quotes for the string arguments of the tools. The input to the tools should not contain any single quotes. If the tool has two arguments, output the arguments in brackets such as ("what is the man doing", 1). 12 | 5. If the information is too vague to provide an accurate answer, make your best guess. 13 | 14 | Use the following format: 15 | 16 | Question: the input question you must answer 17 | Thought: you should always think about what to do 18 | Action: the action to take, should be one of [{tool_names}] 19 | Action Input: the input to the action 20 | Observation: the result of the action... (this Thought/Action/Action Input/Observation can repeat N times) 21 | Thought: I now know the final answer 22 | Final Answer: the answer to the original input question 23 | 24 | Begin!
25 | 26 | Question: {input} 27 | Thought: {agent_scratchpad} 28 | -------------------------------------------------------------------------------- /reid.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | from time import time 3 | import json 4 | import pickle 5 | import os 6 | from collections import defaultdict 7 | import clip 8 | import random as rd 9 | from PIL import Image 10 | import torch 11 | import numpy as np 12 | import imageio 13 | rd.seed(0) 14 | 15 | 16 | def hash_color(obj_id): 17 | np.random.seed(obj_id) 18 | color = np.random.randint(0, 256, 3) 19 | new_color = tuple(int(i) for i in color) 20 | return new_color 21 | 22 | 23 | class ReID: 24 | def __init__(self, video_path_list, base_dir='preprocess'): 25 | self.video_path_list = video_path_list 26 | self.base_dir = base_dir 27 | self.trackid2clip_emb = None 28 | self.trackid2dinov2_emb = None 29 | self.trackid2frame = None 30 | self.trackid2category = None 31 | self.uid2tids = None 32 | self.tid2uid = None 33 | 34 | 35 | def hard_constraint(self, obj1, obj2): 36 | # if self.trackid2category[obj1] != self.trackid2category[obj2]: # if two tracked objects have different categories, they cannot be the same object 37 | # return False 38 | frame1 = set(self.trackid2frame[obj1]) 39 | frame2 = set(self.trackid2frame[obj2]) 40 | if len(frame1.intersection(frame2)) > 0: # if two tracked objects co-exist, they cannot be the same object 41 | return False 42 | return True 43 | 44 | 45 | def clip_similarity_score(self, obj1, obj2, x0=0.925, slope=20): 46 | clip_emb1 = self.trackid2clip_emb[obj1] 47 | clip_emb2 = self.trackid2clip_emb[obj2] 48 | cosine_score = np.dot(clip_emb1, clip_emb2) / (np.linalg.norm(clip_emb1) * np.linalg.norm(clip_emb2)) 49 | clip_score = 1 / (1 + np.exp(-slope * (cosine_score - x0))) 50 | return clip_score 51 | 52 | 53 | def dinov2_similarity_score(self, obj1, obj2, x0=0.5, slope=4.1): 54 | dinov2_emb1 = self.trackid2dinov2_emb[obj1] 55 | dinov2_emb2 = self.trackid2dinov2_emb[obj2] 56 | cosine_score = np.dot(dinov2_emb1, dinov2_emb2) / (np.linalg.norm(dinov2_emb1) * np.linalg.norm(dinov2_emb2)) 57 | #dinov2_score = 1 / (1 + np.exp(-slope * (cosine_score - x0))) 58 | dinov2_score = cosine_score 59 | return dinov2_score 60 | 61 | 62 | def compute_score(self, obj1, obj2): 63 | if not self.hard_constraint(obj1, obj2): 64 | return 0 65 | clip_score = self.clip_similarity_score(obj1, obj2) 66 | dinov2_score = self.dinov2_similarity_score(obj1, obj2) 67 | return 0.15*clip_score+ 0.85*dinov2_score 68 | 69 | 70 | def check_group(self, tid, uid): 71 | """tid should has score > 0.5 for all uid objects, and at least one score > 0.62""" 72 | sgn = False 73 | for t in self.uid2tids[uid]: 74 | if self.compute_score(tid, t) < 0.5: 75 | return False 76 | if self.compute_score(tid, t) >= 0.62: 77 | sgn = True 78 | return sgn 79 | 80 | 81 | def reid_for_all_videos(self): 82 | for video_path in self.video_path_list: 83 | base_name = os.path.basename(video_path).replace(".mp4", "") 84 | video_dir = os.path.join(self.base_dir, base_name) 85 | with open(os.path.join(video_dir, 'tid2clip.pkl'), 'rb') as f: 86 | self.trackid2clip_emb = pickle.load(f) 87 | with open(os.path.join(video_dir, 'tid2dinov2.pkl'), 'rb') as f: 88 | self.trackid2dinov2_emb = pickle.load(f) 89 | with open(os.path.join(video_dir, 'tracking.pkl'), 'rb') as f: 90 | content = pickle.load(f) 91 | self.frame2trackid, self.trackid2frame, self.trackid2category = content[0], content[1], content[2] 92 | self.uid2tids = 
defaultdict(list) 93 | self.tid2uid = dict() 94 | 95 | for frame in self.frame2trackid: 96 | cur_track_ids = self.frame2trackid[frame] 97 | for tid in cur_track_ids: 98 | if tid in self.tid2uid: 99 | continue 100 | sgn = False 101 | for uid in self.uid2tids: 102 | if self.check_group(tid, uid): 103 | self.uid2tids[uid].append(tid) 104 | self.tid2uid[tid] = uid 105 | sgn = True 106 | break 107 | if sgn == False: 108 | uid = len(self.uid2tids) 109 | self.uid2tids[uid].append(tid) 110 | self.tid2uid[tid] = uid 111 | 112 | frame2uid = defaultdict(dict) 113 | uid2frame = defaultdict(list) 114 | uid2category = dict() 115 | uid2clipemb = defaultdict(list) 116 | uid2clip = dict() 117 | for frame in self.frame2trackid: 118 | for tid in self.frame2trackid[frame]: 119 | frame2uid[frame][self.tid2uid[tid]] = self.frame2trackid[frame][tid] 120 | for uid in self.uid2tids: 121 | tids = self.uid2tids[uid] 122 | for tid in tids: 123 | uid2frame[uid] += self.trackid2frame[tid] 124 | uid2clipemb[uid].append(self.trackid2clip_emb[tid]) 125 | 126 | for uid in uid2clipemb: 127 | emb = torch.stack(uid2clipemb[uid], dim=0) 128 | emb = torch.mean(emb, dim=0) 129 | uid2clip[uid] = emb 130 | save_file = os.path.join(video_dir, 'uid2clip.pkl') 131 | with open(save_file, 'wb') as f: 132 | pickle.dump(uid2clip, f) 133 | 134 | reid_file = os.path.join(video_dir, 'reid.pkl') 135 | for uid in self.uid2tids: 136 | uid2category[uid] = self.trackid2category[self.uid2tids[uid][0]] 137 | with open(reid_file, 'wb') as f: 138 | pickle.dump([frame2uid, uid2frame, uid2category], f) 139 | 140 | 141 | def replay(self): 142 | for video_path in self.video_path_list: 143 | base_name = os.path.basename(video_path).replace(".mp4", "") 144 | video_dir = os.path.join(self.base_dir, base_name) 145 | with open(os.path.join(video_dir, 'reid.pkl'), 'rb') as f: 146 | content = pickle.load(f) 147 | frame2uid, uid2frame, uid2category = content[0], content[1], content[2] 148 | cap = cv2.VideoCapture(video_path) 149 | cap.set(cv2.CAP_PROP_POS_FRAMES, 0) 150 | frame_idx = -1 151 | writer = imageio.get_writer(os.path.join(video_dir, 'reid.mp4'), fps=15) 152 | while True: 153 | success, frame = cap.read() 154 | frame_idx += 1 155 | if not success: 156 | break 157 | if frame_idx in frame2uid: 158 | for uid in frame2uid[frame_idx]: 159 | c = hash_color(uid) 160 | x, y, w, h = frame2uid[frame_idx][uid][1] 161 | left_top = (int(x-w/2), int(y-h/2)) 162 | right_bottom = (int(x+w/2), int(y+h/2)) 163 | cv2.rectangle(frame, left_top, right_bottom, c, 2) 164 | label = f'ID: {uid}' 165 | label_position = (int(x-w/2)+2, int(y-h/2)+12) 166 | cv2.putText(frame, label, label_position, cv2.FONT_HERSHEY_SIMPLEX, 0.5, c, 2) 167 | #cv2.imshow("reid", frame) 168 | frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) 169 | writer.append_data(frame) 170 | writer.close() 171 | cap.release() 172 | cv2.destroyAllWindows() 173 | 174 | 175 | def run(self): 176 | self.reid_for_all_videos() 177 | self.replay() 178 | -------------------------------------------------------------------------------- /sample_videos/boats.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/sample_videos/boats.mp4 -------------------------------------------------------------------------------- /sample_videos/books.mp4: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/sample_videos/books.mp4 -------------------------------------------------------------------------------- /sample_videos/kitchen.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/sample_videos/kitchen.mp4 -------------------------------------------------------------------------------- /sample_videos/painting.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/sample_videos/painting.mp4 -------------------------------------------------------------------------------- /sample_videos/talking.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/sample_videos/talking.mp4 -------------------------------------------------------------------------------- /segment_feature.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import torch 4 | import json 5 | import cv2 6 | import pickle 7 | from InternVid.viclip import get_viclip, frames2tensor, get_vid_feat 8 | from encoder import encode_sentences 9 | 10 | 11 | model_cfgs = { 12 | 'viclip-l-internvid-10m-flt': { 13 | 'size': 'l', 14 | 'pretrained': 'tool_models/viCLIP/ViClip-InternVid-10M-FLT.pth', 15 | } 16 | } 17 | 18 | class SegmentFeature: 19 | def __init__(self, video_path_list, base_dir='preprocess'): 20 | self.video_path_list = video_path_list 21 | self.base_dir = base_dir 22 | self.seconds_per_feat = 2 23 | self.frames_per_feat = 10 24 | 25 | 26 | 27 | def create_textual_embedding(self): 28 | """use the sentence encoder model to embed the captions of all the videos""" 29 | model='text-embedding-3-large' 30 | for video_path in self.video_path_list: 31 | start_time = time.time() 32 | base_name = os.path.basename(video_path).replace(".mp4", "") 33 | video_dir = os.path.join(self.base_dir, base_name) 34 | with open(os.path.join(video_dir, 'captions.json')) as f: 35 | captions = json.load(f) 36 | caps = list(captions.values()) 37 | caption_emb = encode_sentences(sentence_list=caps, model_name=model) 38 | print(caption_emb) 39 | with open(os.path.join(video_dir, f'segment_textual_embedding.pkl'), 'wb') as f: 40 | pickle.dump(caption_emb, f) 41 | end_time = time.time() 42 | print(f"textual encoding time for video {base_name}: {round(end_time-start_time, 3)} seconds") 43 | 44 | 45 | def create_visual_embedding(self): 46 | start_time = time.time() 47 | cfg = model_cfgs['viclip-l-internvid-10m-flt'] 48 | model = get_viclip(cfg['size'], cfg['pretrained']) 49 | assert(type(model)==dict and model['viclip'] is not None and model['tokenizer'] is not None) 50 | clip, tokenizer = model['viclip'], model['tokenizer'] 51 | clip = clip.to("cuda") 52 | end_time = time.time() 53 | print(f'time for loading viCLIP model: {round(end_time-start_time, 3)} seconds') 54 | 55 | for video_path in self.video_path_list: 56 | base_name = os.path.basename(video_path).replace(".mp4", "") 57 | video_dir = os.path.join(self.base_dir, base_name) 58 | if not os.path.exists(video_dir): 59 | os.makedirs(video_dir) 60 | 61 | cap = cv2.VideoCapture(video_path) 62 | fps = round(cap.get(cv2.CAP_PROP_FPS)) 63 | total_frames = 
int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) 64 | frame_interval = fps*self.seconds_per_feat//self.frames_per_feat 65 | total_feats = total_frames//(fps*self.seconds_per_feat) 66 | 67 | segment_feats = [] 68 | start_time = time.time() 69 | cap.set(cv2.CAP_PROP_POS_FRAMES, 0) 70 | for segment_id in range(total_feats): 71 | frames = [] 72 | for i in range(self.frames_per_feat): 73 | success, frame = cap.read() 74 | frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) 75 | frames.append(frame) 76 | for j in range(frame_interval-1): #skip other frames 77 | success, frame = cap.read() 78 | for i in range(fps*self.seconds_per_feat-frame_interval*self.frames_per_feat): 79 | success, frame = cap.read() #skip remaining frames 80 | frames_tensor = frames2tensor(frames, device='cuda') 81 | with torch.no_grad(): 82 | vid_feat = get_vid_feat(frames_tensor, clip).cpu() 83 | segment_feats.append(vid_feat) 84 | segment_feats = torch.cat(segment_feats, dim=0).numpy() 85 | end_time = time.time() 86 | cap.release() 87 | print(segment_feats) 88 | print(f"visual embedding time for video {base_name}: {round(end_time-start_time, 3)} seconds") 89 | with open(os.path.join(video_dir, 'segment_visual_embedding.pkl'), 'wb') as f: 90 | pickle.dump(segment_feats, f) 91 | 92 | 93 | def run(self): 94 | self.create_textual_embedding() 95 | self.create_visual_embedding() 96 | 97 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics.pairwise import cosine_similarity 2 | import numpy as np 3 | 4 | 5 | def compute_cosine_similarity(target_embedding, embedding_list): 6 | target_embedding_tensor = target_embedding.reshape(1, -1) 7 | # Compute cosine similarity 8 | similarity_scores = cosine_similarity(target_embedding_tensor, embedding_list) 9 | return similarity_scores.reshape(-1) 10 | 11 | 12 | def top_k_indices(scores, k): 13 | max_len = scores.shape[0] 14 | k = min(max_len, k) 15 | indices = np.argsort(scores)[-k:][::-1] 16 | return list(indices) -------------------------------------------------------------------------------- /video-llava.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from videollava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN 3 | from videollava.conversation import conv_templates, SeparatorStyle 4 | from videollava.model.builder import load_pretrained_model 5 | from videollava.utils import disable_torch_init 6 | from videollava.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria 7 | import socket 8 | import os 9 | import pickle 10 | 11 | 12 | def main(): 13 | disable_torch_init() 14 | model_path = 'LanguageBind/Video-LLaVA-7B' 15 | cache_dir = 'cache_dir' 16 | device = 'cuda' 17 | load_4bit, load_8bit = True, False 18 | model_name = get_model_name_from_path(model_path) 19 | tokenizer, model, processor, _ = load_pretrained_model(model_path, None, model_name, load_8bit, load_4bit, device=device, cache_dir=cache_dir) 20 | video_processor = processor['video'] 21 | server = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) 22 | if not os.path.exists("tmp"): 23 | os.mkdir("tmp") 24 | if os.path.exists("tmp/vqa.sock"): 25 | os.unlink("tmp/vqa.sock") 26 | server.bind("tmp/vqa.sock") 27 | server.listen(0) 28 | print('ready for connection!') 29 | # with open("tmp/ready.txt", 'w') as f: 30 | # f.write("ready!") 31 | while True: 32 | connection, address = server.accept() 33 | r = 
connection.recv(1024).decode() 34 | # if r == "stop": 35 | # break 36 | with open('tmp/content.pkl', 'rb') as f: 37 | content = pickle.load(f) 38 | video_path = content['video_path'] 39 | questions = ['what is the video about?', content['question']] 40 | answers = [] 41 | print('\n'+video_path) 42 | for i in range(2): 43 | video_tensor = video_processor(video_path, return_tensors='pt')['pixel_values'] 44 | if type(video_tensor) is list: 45 | tensor = [video.to(model.device, dtype=torch.float16) for video in video_tensor] 46 | else: 47 | tensor = video_tensor.to(model.device, dtype=torch.float16) 48 | 49 | conv_mode = "llava_v1" 50 | conv = conv_templates[conv_mode].copy() 51 | roles = conv.roles 52 | 53 | print(f"{roles[1]}: {questions[i]}") 54 | question = ' '.join([DEFAULT_IMAGE_TOKEN] * model.get_video_tower().config.num_frames) + '\n' + questions[i] 55 | conv.append_message(conv.roles[0], question) 56 | conv.append_message(conv.roles[1], None) 57 | prompt = conv.get_prompt() 58 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 59 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 60 | keywords = [stop_str] 61 | stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids) 62 | #print('video & question processing done!') 63 | with torch.inference_mode(): 64 | output_ids = model.generate( 65 | input_ids, 66 | images=tensor, 67 | do_sample=True, 68 | temperature=0.1, 69 | max_new_tokens=1024, 70 | use_cache=True, 71 | stopping_criteria=[stopping_criteria]) 72 | 73 | outputs = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip() 74 | outputs = outputs.replace("</s>", "")  # strip the '</s>' stop token from the decoded output 75 | answers.append(outputs) 76 | reply = f"Segment description: {answers[0]}\nAnswer to the question: {answers[1]}" 77 | print(reply) 78 | with open('tmp/content.pkl', 'wb') as f: 79 | pickle.dump(reply, f) 80 | connection.send(b'sent') 81 | r = connection.recv(1024) 82 | connection.close() 83 | 84 | 85 | if __name__ == '__main__': 86 | main() --------------------------------------------------------------------------------
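video-llava.py above exposes Video-LLaVA as a long-running visual-question-answering service: it listens on the Unix socket tmp/vqa.sock, reads each request (a dict with 'video_path' and 'question') from tmp/content.pkl, answers two questions per request, writes the combined reply back into the same pickle, and then signals b'sent'. The calling side of this handshake is not among the file contents shown here, so the following is only a minimal client sketch consistent with that server loop; the function name ask_video_llava and the example question are illustrative, not taken from the repository.

import pickle
import socket


def ask_video_llava(video_path, question, sock_path="tmp/vqa.sock"):
    """Sketch of a client for the Unix-socket server in video-llava.py."""
    # 1. Write the request payload where the server expects to find it.
    with open("tmp/content.pkl", "wb") as f:
        pickle.dump({"video_path": video_path, "question": question}, f)

    # 2. Connect and send a wake-up message; the server decodes it but
    #    does not act on its content.
    client = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
    client.connect(sock_path)
    client.send(b"go")

    # 3. Wait for the b'sent' signal, then read the reply the server wrote
    #    back into tmp/content.pkl.
    client.recv(1024)
    with open("tmp/content.pkl", "rb") as f:
        reply = pickle.load(f)

    # 4. Acknowledge so the server's final recv() returns and it can close
    #    this connection and accept the next request.
    client.send(b"done")
    client.close()
    return reply


if __name__ == "__main__":
    print(ask_video_llava("preprocess/kitchen/segment_0.mp4", "What is the person doing?"))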