├── InternVid
│   ├── README.md
│   ├── README_CN.md
│   ├── demo.ipynb
│   ├── div_sampling.py
│   ├── test_viCLIP.py
│   ├── utils
│   │   ├── basic_utils.py
│   │   ├── config.py
│   │   ├── config_utils.py
│   │   ├── distributed.py
│   │   ├── easydict.py
│   │   ├── logger.py
│   │   ├── optimizer.py
│   │   └── scheduler.py
│   └── viclip
│       ├── __init__.py
│       ├── __pycache__
│       │   ├── __init__.cpython-311.pyc
│       │   ├── __init__.cpython-39.pyc
│       │   ├── simple_tokenizer.cpython-311.pyc
│       │   ├── simple_tokenizer.cpython-39.pyc
│       │   ├── viclip.cpython-39.pyc
│       │   ├── viclip_text.cpython-39.pyc
│       │   └── viclip_vision.cpython-39.pyc
│       ├── bpe_simple_vocab_16e6.txt.gz
│       ├── simple_tokenizer.py
│       ├── viclip.py
│       ├── viclip_text.py
│       └── viclip_vision.py
├── LICENSE
├── LaViLa
│   ├── CODE_OF_CONDUCT.md
│   ├── CONTRIBUTING.md
│   ├── LICENSE
│   ├── README.md
│   ├── __pycache__
│   │   └── eval_narrator.cpython-39.pyc
│   ├── clip_caption.py
│   ├── datasets
│   │   └── README.md
│   ├── demo.py
│   ├── demo_narrator.py
│   ├── demo_narrator_3rd_person.py
│   ├── docs
│   │   ├── INSTALL.md
│   │   ├── MODEL_ZOO.md
│   │   └── PRETRAIN.md
│   ├── eval_narrator.py
│   ├── eval_zeroshot.py
│   ├── lavila
│   │   ├── data
│   │   │   ├── __pycache__
│   │   │   │   ├── datasets.cpython-39.pyc
│   │   │   │   └── video_transforms.cpython-39.pyc
│   │   │   ├── datasets.py
│   │   │   └── video_transforms.py
│   │   ├── models
│   │   │   ├── __pycache__
│   │   │   │   ├── coca.cpython-39.pyc
│   │   │   │   ├── distributed_utils.cpython-39.pyc
│   │   │   │   ├── gpt2_gated.cpython-39.pyc
│   │   │   │   ├── loss.cpython-39.pyc
│   │   │   │   ├── models.cpython-39.pyc
│   │   │   │   ├── narrator.cpython-39.pyc
│   │   │   │   ├── openai_clip.cpython-39.pyc
│   │   │   │   ├── openai_model.cpython-39.pyc
│   │   │   │   ├── timesformer.cpython-39.pyc
│   │   │   │   ├── tokenizer.cpython-39.pyc
│   │   │   │   └── utils.cpython-39.pyc
│   │   │   ├── bpe_simple_vocab_16e6.txt.gz
│   │   │   ├── coca.py
│   │   │   ├── distributed_utils.py
│   │   │   ├── gpt2_gated.py
│   │   │   ├── loss.py
│   │   │   ├── models.py
│   │   │   ├── narrator.py
│   │   │   ├── openai_clip.py
│   │   │   ├── openai_model.py
│   │   │   ├── timesformer.py
│   │   │   ├── tokenizer.py
│   │   │   └── utils.py
│   │   └── utils
│   │       ├── __pycache__
│   │       │   ├── distributed.cpython-39.pyc
│   │       │   └── preprocess.cpython-39.pyc
│   │       ├── distributed.py
│   │       ├── evaluation.py
│   │       ├── evaluation_charades.py
│   │       ├── evaluation_egomcq.py
│   │       ├── evaluation_ek100cls.py
│   │       ├── evaluation_ek100mir.py
│   │       ├── meter.py
│   │       ├── preprocess.py
│   │       ├── random.py
│   │       └── scheduler.py
│   ├── main_finetune_classification.py
│   ├── main_finetune_retrieval.py
│   ├── main_infer_narrator.py
│   ├── main_pretrain.py
│   ├── requirements.txt
│   ├── run_with_submitit_finetune_classification.py
│   ├── run_with_submitit_finetune_retrieval.py
│   ├── run_with_submitit_infer_narrator.py
│   ├── run_with_submitit_pretrain.py
│   └── scripts
│       ├── convert_egovlp_ckpt.py
│       └── crop_and_resize_ego4d.sh
├── README.md
├── captioning.py
├── config
│   └── default.yaml
├── database.py
├── demo.py
├── encoder.py
├── environment.yaml
├── imgs
│   ├── demo.png
│   └── teaser.png
├── inference.py
├── main.py
├── preprocess
│   ├── boats
│   │   ├── captions.json
│   │   ├── reid.mp4
│   │   ├── reid.pkl
│   │   ├── segment2id.json
│   │   ├── segment_textual_embedding.pkl
│   │   ├── segment_visual_embedding.pkl
│   │   ├── tid2clip.pkl
│   │   ├── tid2dinov2.pkl
│   │   ├── tracking.pkl
│   │   └── uid2clip.pkl
│   ├── books
│   │   ├── captions.json
│   │   ├── reid.mp4
│   │   ├── reid.pkl
│   │   ├── segment2id.json
│   │   ├── segment_textual_embedding.pkl
│   │   ├── segment_visual_embedding.pkl
│   │   ├── tid2clip.pkl
│   │   ├── tid2dinov2.pkl
│   │   ├── tracking.pkl
│   │   └── uid2clip.pkl
│   ├── kitchen
│   │   ├── captions.json
│   │   ├── reid.mp4
│   │   ├── reid.pkl
│   │   ├── segment2id.json
│   │   ├── segment_0.mp4
│   │   ├── segment_1.mp4
│   │   ├── segment_18.mp4
│   │   ├── segment_3.mp4
│   │   ├── segment_8.mp4
│   │   ├── segment_textual_embedding.pkl
│   │   ├── segment_visual_embedding.pkl
│   │   ├── tid2clip.pkl
│   │   ├── tid2dinov2.pkl
│   │   ├── tracking.pkl
│   │   └── uid2clip.pkl
│   ├── painting
│   │   ├── captions.json
│   │   ├── reid.mp4
│   │   ├── reid.pkl
│   │   ├── segment2id.json
│   │   ├── segment_83.mp4
│   │   ├── segment_85.mp4
│   │   ├── segment_textual_embedding.pkl
│   │   ├── segment_visual_embedding.pkl
│   │   ├── tid2clip.pkl
│   │   ├── tid2dinov2.pkl
│   │   ├── tracking.pkl
│   │   └── uid2clip.pkl
│   └── talking
│       ├── captions.json
│       ├── reid.mp4
│       ├── reid.pkl
│       ├── segment2id.json
│       ├── segment_10.mp4
│       ├── segment_11.mp4
│       ├── segment_9.mp4
│       ├── segment_textual_embedding.pkl
│       ├── segment_visual_embedding.pkl
│       ├── tid2clip.pkl
│       ├── tid2dinov2.pkl
│       ├── tracking.pkl
│       └── uid2clip.pkl
├── prompts
│   ├── database_query_prompt.txt
│   ├── multiple_choice_prompt.txt
│   └── prompt.txt
├── reid.py
├── sample_videos
│   ├── boats.mp4
│   ├── books.mp4
│   ├── kitchen.mp4
│   ├── painting.mp4
│   └── talking.mp4
├── segment_feature.py
├── tools.py
├── tracking.py
├── utils.py
└── video-llava.py
/InternVid/README_CN.md:
--------------------------------------------------------------------------------
1 | # InternVid \[[Paper](https://arxiv.org/pdf/2307.06942.pdf)\]
2 |
3 | [Dataset on Hugging Face](https://huggingface.co/datasets/OpenGVLab/InternVid) | [Model on Hugging Face](https://huggingface.co/OpenGVLab/ViCLIP)
4 |
5 | \[[English version](README.md)\]
6 |
7 | # :fire: News
8 | We are excited to announce the partial release of a large-scale video-text dataset aimed at facilitating multimodal understanding and generation. As part of this release, we make available a [subset](https://huggingface.co/datasets/OpenGVLab/InternVid) of the dataset comprising 10 million video clips. Additionally, we provide [ViCLIP](https://huggingface.co/OpenGVLab/ViCLIP), a model trained on this subset with the ViT-L architecture. It achieves SOTA zero-shot action recognition performance on Kinetics.
9 |
10 | We provide example code illustrating how to use ViCLIP, detailed in [demo.ipynb](https://github.com/OpenGVLab/InternVideo/blob/main/Data/InternVid/demo.ipynb); a minimal usage sketch is also given below.
11 |
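Below is a minimal sketch of that workflow, following `viclip/__init__.py` and `test_viCLIP.py` in this folder; the video and checkpoint paths are placeholders, and a CUDA device is assumed (the default in `retrieve_text`).

```python
import cv2

from viclip import get_viclip, retrieve_text, _frame_from_video

# decode all frames of an example clip (placeholder path)
video = cv2.VideoCapture('example1.mp4')
frames = [f for f in _frame_from_video(video)]

# load ViCLIP-L trained on InternVid-10M-FLT (placeholder checkpoint path)
models = get_viclip(size='l', pretrain='ViClip-InternVid-10M-FLT.pth')

text_candidates = [
    "A pet dog excitedly runs through the snowy yard, chasing a toy thrown by its owner.",
    "A person dressed in a blue jacket shovels the snow-covered pavement outside their house.",
]

# rank the candidate captions against the clip
texts, probs = retrieve_text(frames, text_candidates, models=models, topk=2)
for t, p in zip(texts, probs):
    print(f'{p:.4f}  {t}')
```
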
12 | Stay tuned for our updates!
13 |
14 | # Introduction
15 |
16 | **Data**
17 |
18 | We collected videos from 16 popular categories in varying proportions. To ensure diversity, we selected videos from countries with different languages rather than relying on a single dominant language environment. The sampled countries include the UK, the US, Australia, Japan, Korea, China, Russia, and France, among others. In terms of duration, each video lasts 351.9 seconds on average. Almost half (49%) of the videos are five minutes or shorter, while a quarter (26%) last between five and ten minutes. Only 8% of the videos are longer than 20 minutes. Of the curated videos, 85% are high-resolution (720P), while the remaining 15% range from 360P to 720P. Although lower-resolution videos may not perform as well as high-resolution ones in content generation tasks, they can still be useful in video-language representation learning as long as they are paired with appropriate captions.
19 |
20 | 
21 |
22 | InternVid exhibits diverse clip durations and caption lengths at the segmented clip level. Aesthetic scores and clip-caption similarities are evenly distributed. The majority of clips are 0-10 seconds long, accounting for 85% of all clips. Approximately half of the clip captions contain 10-20 words, while one-third contain fewer than 10 words. About 11% of the clips have long captions with more than 20 words.
23 |
24 | 
25 |
26 | **ViCLIP: a simple video CLIP for transferable video-text representation**
27 |
28 | Built upon CLIP, we construct a simple video-text pretraining baseline, ViCLIP. It consists of a video encoder (ViT) and a text encoder, as shown below. Both modules are initialized from the corresponding CLIP components. We replace the native attention in the video encoder with spatiotemporal attention while keeping the other design elements unchanged. For efficient learning, we apply masking to the input videos during pretraining.
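
To make the two-tower design concrete, here is a small sketch based on the helpers in `viclip/__init__.py`: it encodes a clip and a caption separately and compares the two embeddings with cosine similarity. The paths are placeholders and a CUDA device is assumed.

```python
import cv2
import torch

from viclip import get_viclip, _frame_from_video, frames2tensor

m = get_viclip(size='l', pretrain='ViClip-InternVid-10M-FLT.pth')  # placeholder path
clip, tokenizer = m['viclip'], m['tokenizer']
clip = clip.to('cuda')

video = cv2.VideoCapture('example1.mp4')  # placeholder path
frames = [f for f in _frame_from_video(video)]

with torch.no_grad():
    # video tower: 8 sampled frames -> one clip embedding
    vid_feat = clip.get_vid_features(frames2tensor(frames))
    # text tower: one caption -> one text embedding
    txt_feat = clip.get_text_features('a person shovels snow outside a house', tokenizer, {})

# cosine similarity between the two embeddings
print(torch.nn.functional.cosine_similarity(vid_feat, txt_feat).item())
```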
29 |
30 |
31 |
32 |
33 | # Data & Model Zoo
34 |
35 | ### Pretrained Data & Model
36 |
37 |
38 |
39 | | Model | Training Data | Description |
40 | | :-----------------: | :----------------------: | :---------------------------------------------------------------------------------------------------: |
41 | | ViCLIP-L-14 \[[HuggingFace](https://huggingface.co/OpenGVLab/ViCLIP) \| [Aliyun](https://pjlab-gvm-data.oss-cn-shanghai.aliyuncs.com/internvideo/viclip/ViClip-InternVid-10M-FLT.pth)\] | InternVid-10M-FLT \[[HuggingFace](https://huggingface.co/datasets/OpenGVLab/InternVid) \| [OpenDataLab](https://opendatalab.com/shepshep/InternVid)\] | |
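
If you prefer to fetch the checkpoint listed above programmatically, the sketch below uses `huggingface_hub`; the file name inside the Hugging Face repo is an assumption based on the default checkpoint name in `viclip/__init__.py`.

```python
from huggingface_hub import hf_hub_download

from viclip import get_viclip

# download the ViCLIP-L weights from the Hugging Face repo listed in the table above
# (the exact file name is an assumption)
ckpt_path = hf_hub_download(repo_id='OpenGVLab/ViCLIP', filename='ViClip-InternVid-10M-FLT.pth')
models = get_viclip(size='l', pretrain=ckpt_path)
```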
42 |
43 |
44 |
45 | ## Citation
46 |
47 | If you find this work useful for your research, please consider citing InternVid. Your acknowledgement would greatly help us continue contributing resources to the research community.
48 |
49 | ```
50 | @article{wang2023internvid,
51 | title={InternVid: A Large-scale Video-Text Dataset for Multimodal Understanding and Generation},
52 | author={Wang, Yi and He, Yinan and Li, Yizhuo and Li, Kunchang and Yu, Jiashuo and Ma, Xin and Chen, Xinyuan and Wang, Yaohui and Luo, Ping and Liu, Ziwei and Wang, Yali and Wang, Limin and Qiao, Yu},
53 | journal={arXiv preprint arXiv:2307.06942},
54 | year={2023}
55 | }
56 |
57 | @article{wang2022internvideo,
58 | title={InternVideo: General Video Foundation Models via Generative and Discriminative Learning},
59 | author={Wang, Yi and Li, Kunchang and Li, Yizhuo and He, Yinan and Huang, Bingkun and Zhao, Zhiyu and Zhang, Hongjie and Xu, Jilan and Liu, Yi and Wang, Zun and Xing, Sen and Chen, Guo and Pan, Junting and Yu, Jiashuo and Wang, Yali and Wang, Limin and Qiao, Yu},
60 | journal={arXiv preprint arXiv:2212.03191},
61 | year={2022}
62 | }
63 | ```
--------------------------------------------------------------------------------
/InternVid/div_sampling.py:
--------------------------------------------------------------------------------
1 | from collections import Counter
2 | import json
3 | import random
4 | import numpy as np
5 | data = json.load(open("/path/to/to_sample"))  # list of clip annotations to sample from
6 | video_id = set([x["video"].split("/")[-1][:11] for x in data])  # unique source-video ids (first 11 chars of the file name)
7 | video_id_counter = Counter([x["video"].split("/")[-1][:11] for x in data])  # number of clips per source video
8 | sampling_weights = [1.0 / video_id_counter[x["video"].split("/")[-1][:11]] for x in data]  # down-weight clips from over-represented videos
9 | np.random.seed(42)
10 | sampling_weights = np.array(sampling_weights)
11 | sampling_weights = sampling_weights / sampling_weights.sum()  # normalize to a probability distribution
12 | sampled_index = np.random.choice(len(data), 10647458, replace=False, p=sampling_weights)  # diversity-aware sampling without replacement
13 | data = [data[i] for i in sampled_index]
14 | json.dump(data, open("/path/to/sampled", "w"))
--------------------------------------------------------------------------------
/InternVid/test_viCLIP.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import os
3 | import cv2
4 |
5 | from viclip import get_viclip, retrieve_text, _frame_from_video
6 | video = cv2.VideoCapture('Data/InternVid/example1.mp4')
7 | frames = [x for x in _frame_from_video(video)]
8 | print('number of frames:', len(frames))
9 | # modify xxx to the path of the pretrained model
10 | model_cfgs = {
11 | 'viclip-l-internvid-10m-flt': {
12 | 'size': 'l',
13 | 'pretrained': '/home/yue/data/ViClip-InternVid-10M-FLT.pth',
14 | },
15 | 'viclip-l-internvid-200m': {
16 | 'size': 'l',
17 | 'pretrained': 'xxx/ViCLIP-L_InternVid-200M.pth',
18 | },
19 | 'viclip-b-internvid-10m-flt': {
20 | 'size': 'b',
21 | 'pretrained': 'xxx/ViCLIP-B_InternVid-FLT-10M.pth',
22 | },
23 | 'viclip-b-internvid-200m': {
24 | 'size': 'b',
25 | 'pretrained': 'xxx/ViCLIP-B_InternVid-200M.pth',
26 | },
27 | }
28 |
29 | text_candidates = ["A playful dog and its owner wrestle in the snowy yard, chasing each other with joyous abandon.",
30 | "A man in a gray coat walks through the snowy landscape, pulling a sleigh loaded with toys.",
31 | "A person dressed in a blue jacket shovels the snow-covered pavement outside their house.",
32 | "A pet dog excitedly runs through the snowy yard, chasing a toy thrown by its owner.",
33 | "A person stands on the snowy floor, pushing a sled loaded with blankets, preparing for a fun-filled ride.",
34 | "A man in a gray hat and coat walks through the snowy yard, carefully navigating around the trees.",
35 | "A playful dog slides down a snowy hill, wagging its tail with delight.",
36 | "A person in a blue jacket walks their pet on a leash, enjoying a peaceful winter walk among the trees.",
37 | "A man in a gray sweater plays fetch with his dog in the snowy yard, throwing a toy and watching it run.",
38 | "A person bundled up in a blanket walks through the snowy landscape, enjoying the serene winter scenery."]
39 |
40 | cfg = model_cfgs['viclip-l-internvid-10m-flt']
41 | model_l = get_viclip(cfg['size'], cfg['pretrained'])
42 | print('model loaded')
43 | texts, probs = retrieve_text(frames, text_candidates, models=model_l, topk=5)
44 |
--------------------------------------------------------------------------------
/InternVid/utils/config_utils.py:
--------------------------------------------------------------------------------
1 | import logging
2 | import os
3 | import sys
4 | from os.path import dirname, join
5 |
6 | from utils.config import Config
7 | from utils.distributed import init_distributed_mode, is_main_process
8 | from utils.logger import setup_logger
9 |
10 | logger = logging.getLogger(__name__)
11 |
12 |
13 | def setup_config():
14 |     """Combine yaml config and command line config with OmegaConf.
15 | Also converts types, e.g., `'None'` (str) --> `None` (None)
16 | """
17 | config = Config.get_config()
18 | if config.debug:
19 | config.wandb.enable = False
20 | return config
21 |
22 |
23 | def setup_evaluate_config(config):
24 | """setup evaluation default settings, e.g., disable wandb"""
25 | assert config.evaluate
26 | config.wandb.enable = False
27 | if config.output_dir is None:
28 | config.output_dir = join(dirname(config.pretrained_path), "eval")
29 | return config
30 |
31 |
32 | def setup_output_dir(output_dir, excludes=["code"]):
33 |     """ensure we are not overwriting an existing/non-empty output dir"""
34 | if not os.path.exists(output_dir):
35 | os.makedirs(output_dir, exist_ok=False)
36 | else:
37 | existing_dirs_files = os.listdir(output_dir) # list
38 | remaining = set(existing_dirs_files) - set(excludes)
39 | remaining = [e for e in remaining if "slurm" not in e]
40 | remaining = [e for e in remaining if ".out" not in e]
41 | # assert len(remaining) == 0, f"remaining dirs or files: {remaining}"
42 | logger.warn(f"remaining dirs or files: {remaining}")
43 |
44 |
45 | def setup_main():
46 | """
47 | Setup config, logger, output_dir, etc.
48 | Shared for pretrain and all downstream tasks.
49 | """
50 | config = setup_config()
51 | if hasattr(config, "evaluate") and config.evaluate:
52 | config = setup_evaluate_config(config)
53 | init_distributed_mode(config)
54 |
55 | if is_main_process():
56 | setup_output_dir(config.output_dir, excludes=["code"])
57 | setup_logger(output=config.output_dir, color=True, name="vindlu")
58 | logger.info(f"config: {Config.pretty_text(config)}")
59 | Config.dump(config, os.path.join(config.output_dir, "config.json"))
60 | return config
61 |
--------------------------------------------------------------------------------
/InternVid/utils/distributed.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/InternVid/utils/distributed.py
--------------------------------------------------------------------------------
/InternVid/utils/easydict.py:
--------------------------------------------------------------------------------
1 | class EasyDict(dict):
2 | """
3 | Get attributes
4 |
5 | >>> d = EasyDict({'foo':3})
6 | >>> d['foo']
7 | 3
8 | >>> d.foo
9 | 3
10 | >>> d.bar
11 | Traceback (most recent call last):
12 | ...
13 | AttributeError: 'EasyDict' object has no attribute 'bar'
14 |
15 | Works recursively
16 |
17 | >>> d = EasyDict({'foo':3, 'bar':{'x':1, 'y':2}})
18 | >>> isinstance(d.bar, dict)
19 | True
20 | >>> d.bar.x
21 | 1
22 |
23 | Bullet-proof
24 |
25 | >>> EasyDict({})
26 | {}
27 | >>> EasyDict(d={})
28 | {}
29 | >>> EasyDict(None)
30 | {}
31 | >>> d = {'a': 1}
32 | >>> EasyDict(**d)
33 | {'a': 1}
34 |
35 | Set attributes
36 |
37 | >>> d = EasyDict()
38 | >>> d.foo = 3
39 | >>> d.foo
40 | 3
41 | >>> d.bar = {'prop': 'value'}
42 | >>> d.bar.prop
43 | 'value'
44 | >>> d
45 | {'foo': 3, 'bar': {'prop': 'value'}}
46 | >>> d.bar.prop = 'newer'
47 | >>> d.bar.prop
48 | 'newer'
49 |
50 |
51 | Values extraction
52 |
53 | >>> d = EasyDict({'foo':0, 'bar':[{'x':1, 'y':2}, {'x':3, 'y':4}]})
54 | >>> isinstance(d.bar, list)
55 | True
56 | >>> from operator import attrgetter
57 | >>> map(attrgetter('x'), d.bar)
58 | [1, 3]
59 | >>> map(attrgetter('y'), d.bar)
60 | [2, 4]
61 | >>> d = EasyDict()
62 | >>> d.keys()
63 | []
64 | >>> d = EasyDict(foo=3, bar=dict(x=1, y=2))
65 | >>> d.foo
66 | 3
67 | >>> d.bar.x
68 | 1
69 |
70 | Still like a dict though
71 |
72 | >>> o = EasyDict({'clean':True})
73 | >>> o.items()
74 | [('clean', True)]
75 |
76 | And like a class
77 |
78 | >>> class Flower(EasyDict):
79 | ... power = 1
80 | ...
81 | >>> f = Flower()
82 | >>> f.power
83 | 1
84 | >>> f = Flower({'height': 12})
85 | >>> f.height
86 | 12
87 | >>> f['power']
88 | 1
89 | >>> sorted(f.keys())
90 | ['height', 'power']
91 |
92 | update and pop items
93 | >>> d = EasyDict(a=1, b='2')
94 | >>> e = EasyDict(c=3.0, a=9.0)
95 | >>> d.update(e)
96 | >>> d.c
97 | 3.0
98 | >>> d['c']
99 | 3.0
100 | >>> d.get('c')
101 | 3.0
102 | >>> d.update(a=4, b=4)
103 | >>> d.b
104 | 4
105 | >>> d.pop('a')
106 | 4
107 | >>> d.a
108 | Traceback (most recent call last):
109 | ...
110 | AttributeError: 'EasyDict' object has no attribute 'a'
111 | """
112 |
113 | def __init__(self, d=None, **kwargs):
114 | if d is None:
115 | d = {}
116 | if kwargs:
117 | d.update(**kwargs)
118 | for k, v in d.items():
119 | setattr(self, k, v)
120 | # Class attributes
121 | for k in self.__class__.__dict__.keys():
122 | if not (k.startswith("__") and k.endswith("__")) and not k in ("update", "pop"):
123 | setattr(self, k, getattr(self, k))
124 |
125 | def __setattr__(self, name, value):
126 | if isinstance(value, (list, tuple)):
127 | value = [self.__class__(x) if isinstance(x, dict) else x for x in value]
128 | elif isinstance(value, dict) and not isinstance(value, self.__class__):
129 | value = self.__class__(value)
130 | super(EasyDict, self).__setattr__(name, value)
131 | super(EasyDict, self).__setitem__(name, value)
132 |
133 | __setitem__ = __setattr__
134 |
135 | def update(self, e=None, **f):
136 | d = e or dict()
137 | d.update(f)
138 | for k in d:
139 | setattr(self, k, d[k])
140 |
141 | def pop(self, k, d=None):
142 | if hasattr(self, k):
143 | delattr(self, k)
144 | return super(EasyDict, self).pop(k, d)
145 |
146 |
147 | if __name__ == "__main__":
148 | import doctest
149 |     doctest.testmod()
150 |
--------------------------------------------------------------------------------
/InternVid/utils/logger.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/InternVid/utils/logger.py
--------------------------------------------------------------------------------
/InternVid/utils/optimizer.py:
--------------------------------------------------------------------------------
1 | """ Optimizer Factory w/ Custom Weight Decay
2 | Hacked together by / Copyright 2020 Ross Wightman
3 | """
4 | import re
5 | import torch
6 | from torch import optim as optim
7 | from utils.distributed import is_main_process
8 | import logging
9 | logger = logging.getLogger(__name__)
10 | try:
11 | from apex.optimizers import FusedNovoGrad, FusedAdam, FusedLAMB, FusedSGD
12 | has_apex = True
13 | except ImportError:
14 | has_apex = False
15 |
16 |
17 | def add_weight_decay(model, weight_decay, no_decay_list=(), filter_bias_and_bn=True):
18 | named_param_tuples = []
19 | for name, param in model.named_parameters():
20 | if not param.requires_grad:
21 | continue # frozen weights
22 | if filter_bias_and_bn and (len(param.shape) == 1 or name.endswith(".bias")):
23 | named_param_tuples.append([name, param, 0])
24 | elif name in no_decay_list:
25 | named_param_tuples.append([name, param, 0])
26 | else:
27 | named_param_tuples.append([name, param, weight_decay])
28 | return named_param_tuples
29 |
30 |
31 | def add_different_lr(named_param_tuples_or_model, diff_lr_names, diff_lr, default_lr):
32 | """use lr=diff_lr for modules named found in diff_lr_names,
33 | otherwise use lr=default_lr
34 |
35 | Args:
36 | named_param_tuples_or_model: List([name, param, weight_decay]), or nn.Module
37 | diff_lr_names: List(str)
38 | diff_lr: float
39 | default_lr: float
40 | Returns:
41 | named_param_tuples_with_lr: List([name, param, weight_decay, lr])
42 | """
43 | named_param_tuples_with_lr = []
44 | logger.info(f"diff_names: {diff_lr_names}, diff_lr: {diff_lr}")
45 | for name, p, wd in named_param_tuples_or_model:
46 | use_diff_lr = False
47 | for diff_name in diff_lr_names:
48 | # if diff_name in name:
49 | if re.search(diff_name, name) is not None:
50 | logger.info(f"param {name} use different_lr: {diff_lr}")
51 | use_diff_lr = True
52 | break
53 |
54 | named_param_tuples_with_lr.append(
55 | [name, p, wd, diff_lr if use_diff_lr else default_lr]
56 | )
57 |
58 | if is_main_process():
59 | for name, _, wd, diff_lr in named_param_tuples_with_lr:
60 | logger.info(f"param {name}: wd: {wd}, lr: {diff_lr}")
61 |
62 | return named_param_tuples_with_lr
63 |
64 |
65 | def create_optimizer_params_group(named_param_tuples_with_lr):
66 | """named_param_tuples_with_lr: List([name, param, weight_decay, lr])"""
67 | group = {}
68 | for name, p, wd, lr in named_param_tuples_with_lr:
69 | if wd not in group:
70 | group[wd] = {}
71 | if lr not in group[wd]:
72 | group[wd][lr] = []
73 | group[wd][lr].append(p)
74 |
75 | optimizer_params_group = []
76 | for wd, lr_groups in group.items():
77 | for lr, p in lr_groups.items():
78 | optimizer_params_group.append(dict(
79 | params=p,
80 | weight_decay=wd,
81 | lr=lr
82 | ))
83 | logger.info(f"optimizer -- lr={lr} wd={wd} len(p)={len(p)}")
84 | return optimizer_params_group
85 |
86 |
87 | def create_optimizer(args, model, filter_bias_and_bn=True):
88 | opt_lower = args.opt.lower()
89 | weight_decay = args.weight_decay
90 | # check for modules that requires different lr
91 | if hasattr(args, "different_lr") and args.different_lr.enable:
92 | diff_lr_module_names = args.different_lr.module_names
93 | diff_lr = args.different_lr.lr
94 | else:
95 | diff_lr_module_names = []
96 | diff_lr = None
97 |
98 | no_decay = {}
99 | if hasattr(model, 'no_weight_decay'):
100 | no_decay = model.no_weight_decay()
101 | named_param_tuples = add_weight_decay(
102 | model, weight_decay, no_decay, filter_bias_and_bn)
103 | named_param_tuples = add_different_lr(
104 | named_param_tuples, diff_lr_module_names, diff_lr, args.lr)
105 | parameters = create_optimizer_params_group(named_param_tuples)
106 |
107 | if 'fused' in opt_lower:
108 | assert has_apex and torch.cuda.is_available(), 'APEX and CUDA required for fused optimizers'
109 |
110 | opt_args = dict(lr=args.lr, weight_decay=weight_decay)
111 | if hasattr(args, 'opt_eps') and args.opt_eps is not None:
112 | opt_args['eps'] = args.opt_eps
113 | if hasattr(args, 'opt_betas') and args.opt_betas is not None:
114 | opt_args['betas'] = args.opt_betas
115 | if hasattr(args, 'opt_args') and args.opt_args is not None:
116 | opt_args.update(args.opt_args)
117 |
118 | opt_split = opt_lower.split('_')
119 | opt_lower = opt_split[-1]
120 | if opt_lower == 'sgd' or opt_lower == 'nesterov':
121 | opt_args.pop('eps', None)
122 | optimizer = optim.SGD(parameters, momentum=args.momentum, nesterov=True, **opt_args)
123 | elif opt_lower == 'momentum':
124 | opt_args.pop('eps', None)
125 | optimizer = optim.SGD(parameters, momentum=args.momentum, nesterov=False, **opt_args)
126 | elif opt_lower == 'adam':
127 | optimizer = optim.Adam(parameters, **opt_args)
128 | elif opt_lower == 'adamw':
129 | optimizer = optim.AdamW(parameters, **opt_args)
130 | else:
131 |         raise ValueError(f"Invalid optimizer: {opt_lower}")
132 |
133 | return optimizer
134 |
--------------------------------------------------------------------------------
/InternVid/utils/scheduler.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/InternVid/utils/scheduler.py
--------------------------------------------------------------------------------
/InternVid/viclip/__init__.py:
--------------------------------------------------------------------------------
1 | from .simple_tokenizer import SimpleTokenizer as _Tokenizer
2 | from .viclip import ViCLIP
3 | import torch
4 | import numpy as np
5 | import cv2
6 | import os
7 |
8 |
9 | def get_viclip(size='l',
10 | pretrain=os.path.join(os.path.dirname(os.path.abspath(__file__)), "ViClip-InternVid-10M-FLT.pth")):
11 |
12 | tokenizer = _Tokenizer()
13 | vclip = ViCLIP(tokenizer=tokenizer, size=size, pretrain=pretrain)
14 | m = {'viclip':vclip, 'tokenizer':tokenizer}
15 |
16 | return m
17 |
18 | def get_text_feat_dict(texts, clip, tokenizer, text_feat_d={}):
19 | for t in texts:
20 | feat = clip.get_text_features(t, tokenizer, text_feat_d)
21 | text_feat_d[t] = feat
22 | return text_feat_d
23 |
24 | def get_vid_feat(frames, clip):
25 | return clip.get_vid_features(frames)
26 |
27 |
28 | def _frame_from_video(video):
29 | while video.isOpened():
30 | success, frame = video.read()
31 | if success:
32 | yield frame
33 | else:
34 | break
35 |
36 | v_mean = np.array([0.485, 0.456, 0.406]).reshape(1,1,3)
37 | v_std = np.array([0.229, 0.224, 0.225]).reshape(1,1,3)
38 | def normalize(data):
39 | return (data/255.0-v_mean)/v_std
40 |
41 | def frames2tensor(vid_list, fnum=8, target_size=(224, 224), device=torch.device('cuda')):
42 | assert(len(vid_list) >= fnum)
43 | step = len(vid_list) // fnum
44 | vid_list = vid_list[::step][:fnum]
45 | vid_list = [cv2.resize(x[:,:,::-1], target_size) for x in vid_list]
46 | vid_tube = [np.expand_dims(normalize(x), axis=(0, 1)) for x in vid_list]
47 | vid_tube = np.concatenate(vid_tube, axis=1)
48 | vid_tube = np.transpose(vid_tube, (0, 1, 4, 2, 3))
49 | vid_tube = torch.from_numpy(vid_tube).to(device, non_blocking=True).float()
50 | return vid_tube
51 |
52 | def retrieve_text(frames,
53 | texts,
54 | models={'viclip':None,
55 | 'tokenizer':None},
56 | topk=5,
57 | device=torch.device('cuda')):
58 | # clip, tokenizer = get_clip(name, model_cfg['size'], model_cfg['pretrained'], model_cfg['reload'])
59 | assert(type(models)==dict and models['viclip'] is not None and models['tokenizer'] is not None)
60 | clip, tokenizer = models['viclip'], models['tokenizer']
61 | clip = clip.to(device)
62 | frames_tensor = frames2tensor(frames, device=device)
63 | vid_feat = get_vid_feat(frames_tensor, clip)
64 |
65 | text_feat_d = {}
66 | text_feat_d = get_text_feat_dict(texts, clip, tokenizer, text_feat_d)
67 | text_feats = [text_feat_d[t] for t in texts]
68 | text_feats_tensor = torch.cat(text_feats, 0)
69 |
70 | probs, idxs = clip.get_predict_label(vid_feat, text_feats_tensor, top=topk)
71 |
72 | ret_texts = [texts[i] for i in idxs.numpy()[0].tolist()]
73 | return ret_texts, probs.numpy()[0]
--------------------------------------------------------------------------------
/InternVid/viclip/__pycache__/__init__.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/InternVid/viclip/__pycache__/__init__.cpython-311.pyc
--------------------------------------------------------------------------------
/InternVid/viclip/__pycache__/__init__.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/InternVid/viclip/__pycache__/__init__.cpython-39.pyc
--------------------------------------------------------------------------------
/InternVid/viclip/__pycache__/simple_tokenizer.cpython-311.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/InternVid/viclip/__pycache__/simple_tokenizer.cpython-311.pyc
--------------------------------------------------------------------------------
/InternVid/viclip/__pycache__/simple_tokenizer.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/InternVid/viclip/__pycache__/simple_tokenizer.cpython-39.pyc
--------------------------------------------------------------------------------
/InternVid/viclip/__pycache__/viclip.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/InternVid/viclip/__pycache__/viclip.cpython-39.pyc
--------------------------------------------------------------------------------
/InternVid/viclip/__pycache__/viclip_text.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/InternVid/viclip/__pycache__/viclip_text.cpython-39.pyc
--------------------------------------------------------------------------------
/InternVid/viclip/__pycache__/viclip_vision.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/InternVid/viclip/__pycache__/viclip_vision.cpython-39.pyc
--------------------------------------------------------------------------------
/InternVid/viclip/bpe_simple_vocab_16e6.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/InternVid/viclip/bpe_simple_vocab_16e6.txt.gz
--------------------------------------------------------------------------------
/InternVid/viclip/simple_tokenizer.py:
--------------------------------------------------------------------------------
1 | import gzip
2 | import html
3 | import os
4 | from functools import lru_cache
5 |
6 | import ftfy
7 | import regex as re
8 |
9 |
10 | @lru_cache()
11 | def default_bpe():
12 | return os.path.join(os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz")
13 | # @lru_cache()
14 | # def default_bpe():
15 | # return "bpe_simple_vocab_16e6.txt.gz"
16 |
17 |
18 | @lru_cache()
19 | def bytes_to_unicode():
20 | """
21 | Returns list of utf-8 byte and a corresponding list of unicode strings.
22 | The reversible bpe codes work on unicode strings.
23 | This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
24 | When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
25 |     This is a significant percentage of your normal, say, 32K bpe vocab.
26 | To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
27 | And avoids mapping to whitespace/control characters the bpe code barfs on.
28 | """
29 | bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
30 | cs = bs[:]
31 | n = 0
32 | for b in range(2**8):
33 | if b not in bs:
34 | bs.append(b)
35 | cs.append(2**8+n)
36 | n += 1
37 | cs = [chr(n) for n in cs]
38 | return dict(zip(bs, cs))
39 |
40 |
41 | def get_pairs(word):
42 | """Return set of symbol pairs in a word.
43 | Word is represented as tuple of symbols (symbols being variable-length strings).
44 | """
45 | pairs = set()
46 | prev_char = word[0]
47 | for char in word[1:]:
48 | pairs.add((prev_char, char))
49 | prev_char = char
50 | return pairs
51 |
52 |
53 | def basic_clean(text):
54 | text = ftfy.fix_text(text)
55 | text = html.unescape(html.unescape(text))
56 | return text.strip()
57 |
58 |
59 | def whitespace_clean(text):
60 | text = re.sub(r'\s+', ' ', text)
61 | text = text.strip()
62 | return text
63 |
64 |
65 | class SimpleTokenizer(object):
66 | def __init__(self, bpe_path: str = default_bpe()):
67 | self.byte_encoder = bytes_to_unicode()
68 | self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
69 | merges = gzip.open(bpe_path).read().decode("utf-8").split('\n')
70 | merges = merges[1:49152-256-2+1]
71 | merges = [tuple(merge.split()) for merge in merges]
72 | vocab = list(bytes_to_unicode().values())
73 |         vocab = vocab + [v+'</w>' for v in vocab]
74 | for merge in merges:
75 | vocab.append(''.join(merge))
76 | vocab.extend(['<|startoftext|>', '<|endoftext|>'])
77 | self.encoder = dict(zip(vocab, range(len(vocab))))
78 | self.decoder = {v: k for k, v in self.encoder.items()}
79 | self.bpe_ranks = dict(zip(merges, range(len(merges))))
80 | self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'}
81 | self.pat = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE)
82 |
83 | def bpe(self, token):
84 | if token in self.cache:
85 | return self.cache[token]
86 |         word = tuple(token[:-1]) + ( token[-1] + '</w>',)
87 | pairs = get_pairs(word)
88 |
89 | if not pairs:
90 |             return token+'</w>'
91 |
92 | while True:
93 | bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
94 | if bigram not in self.bpe_ranks:
95 | break
96 | first, second = bigram
97 | new_word = []
98 | i = 0
99 | while i < len(word):
100 | try:
101 | j = word.index(first, i)
102 | new_word.extend(word[i:j])
103 | i = j
104 | except:
105 | new_word.extend(word[i:])
106 | break
107 |
108 | if word[i] == first and i < len(word)-1 and word[i+1] == second:
109 | new_word.append(first+second)
110 | i += 2
111 | else:
112 | new_word.append(word[i])
113 | i += 1
114 | new_word = tuple(new_word)
115 | word = new_word
116 | if len(word) == 1:
117 | break
118 | else:
119 | pairs = get_pairs(word)
120 | word = ' '.join(word)
121 | self.cache[token] = word
122 | return word
123 |
124 | def encode(self, text):
125 | bpe_tokens = []
126 | text = whitespace_clean(basic_clean(text)).lower()
127 | for token in re.findall(self.pat, text):
128 | token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
129 | bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' '))
130 | return bpe_tokens
131 |
132 | def decode(self, tokens):
133 | text = ''.join([self.decoder[token] for token in tokens])
134 |         text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('</w>', ' ')
135 | return text
136 |
--------------------------------------------------------------------------------
/LaViLa/CODE_OF_CONDUCT.md:
--------------------------------------------------------------------------------
1 | # Code of Conduct
2 |
3 | Facebook has adopted a Code of Conduct that we expect project participants to adhere to.
4 | Please read the [full text](https://code.fb.com/codeofconduct/)
5 | so that you can understand what actions will and will not be tolerated.
--------------------------------------------------------------------------------
/LaViLa/CONTRIBUTING.md:
--------------------------------------------------------------------------------
1 | # Contributing to LaViLa
2 | We want to make contributing to this project as easy and transparent as
3 | possible.
4 |
5 | ## Our Development Process
6 | Minor changes and improvements will be released on an ongoing basis. Larger changes (e.g., changesets implementing a new paper) will be released on a more periodic basis.
7 |
8 | ## Pull Requests
9 | We actively welcome your pull requests.
10 |
11 | 1. Fork the repo and create your branch from `main`.
12 | 2. If you've added code that should be tested, add tests.
13 | 3. If you've changed APIs, update the documentation.
14 | 4. Ensure the test suite passes.
15 | 5. Make sure your code lints.
16 | 6. If you haven't already, complete the Contributor License Agreement ("CLA").
17 |
18 | ## Contributor License Agreement ("CLA")
19 | In order to accept your pull request, we need you to submit a CLA. You only need
20 | to do this once to work on any of Facebook's open source projects.
21 |
22 | Complete your CLA here: <https://code.facebook.com/cla>
23 |
24 | ## Issues
25 | We use GitHub issues to track public bugs. Please ensure your description is
26 | clear and has sufficient instructions to be able to reproduce the issue.
27 |
28 | Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe
29 | disclosure of security bugs. In those cases, please go through the process
30 | outlined on that page and do not file a public issue.
31 |
32 | ## Coding Style
33 | * 4 spaces for indentation rather than tabs
34 | * 80 character line length
35 | * PEP8 formatting following [Black](https://black.readthedocs.io/en/stable/)
36 |
37 | ## License
38 | By contributing to LaViLa, you agree that your contributions will be licensed
39 | under the LICENSE file in the root directory of this source tree.
40 |
--------------------------------------------------------------------------------
/LaViLa/LICENSE:
--------------------------------------------------------------------------------
1 |
2 | MIT License
3 |
4 | Copyright (c) Meta Platforms, Inc. and affiliates.
5 |
6 | Permission is hereby granted, free of charge, to any person obtaining a copy
7 | of this software and associated documentation files (the "Software"), to deal
8 | in the Software without restriction, including without limitation the rights
9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 | copies of the Software, and to permit persons to whom the Software is
11 | furnished to do so, subject to the following conditions:
12 |
13 | The above copyright notice and this permission notice shall be included in all
14 | copies or substantial portions of the Software.
15 |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22 | SOFTWARE.
23 |
--------------------------------------------------------------------------------
/LaViLa/__pycache__/eval_narrator.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/LaViLa/__pycache__/eval_narrator.cpython-39.pyc
--------------------------------------------------------------------------------
/LaViLa/clip_caption.py:
--------------------------------------------------------------------------------
1 | import decord
2 | import matplotlib.pyplot as plt
3 | import numpy as np
4 | from collections import OrderedDict
5 | import time
6 | import torch
7 | import torchvision.transforms as transforms
8 | import torchvision.transforms._transforms_video as transforms_video
9 | import sys
10 | sys.path.insert(0, './')
11 | from lavila.data.video_transforms import Permute
12 | from lavila.data.datasets import get_frame_ids, video_loader_by_frames
13 | from lavila.models.models import VCLM_OPENAI_TIMESFORMER_BASE_GPT2
14 | from lavila.models.tokenizer import MyGPT2Tokenizer
15 | from base64 import b64encode
16 | import os
17 | import fnmatch
18 | import imageio
19 | import json
20 | import cv2
21 |
22 |
23 | ckpt_path = 'vclm_openai_timesformer_base_gpt2_base.pt_ego4d.jobid_319630.ep_0002.md5sum_68a71f.pth'
24 | ckpt = torch.load(ckpt_path, map_location='cpu')
25 | state_dict = OrderedDict()
26 | for k, v in ckpt['state_dict'].items():
27 | state_dict[k.replace('module.', '')] = v
28 |
29 | # instantiate the model, and load the pre-trained weights
30 | model = VCLM_OPENAI_TIMESFORMER_BASE_GPT2(
31 | text_use_cls_token=False,
32 | project_embed_dim=256,
33 | gated_xattn=True,
34 | timesformer_gated_xattn=False,
35 | freeze_lm_vclm=False,
36 | freeze_visual_vclm=False,
37 | freeze_visual_vclm_temporal=False,
38 | num_frames=4,
39 | drop_path_rate=0.
40 | )
41 |
42 | model.load_state_dict(state_dict, strict=True)
43 | model.eval()
44 | tokenizer = MyGPT2Tokenizer('gpt2', add_bos=True)
45 |
46 | candidate_num = 5
47 | crop_size = 224
48 | val_transform = transforms.Compose([
49 | Permute([3, 0, 1, 2]),
50 | transforms.Resize(crop_size),
51 | transforms.CenterCrop(crop_size),
52 | transforms_video.NormalizeVideo(mean=[108.3272985, 116.7460125, 104.09373615000001], std=[68.5005327, 66.6321579, 70.32316305])
53 | ])
54 |
55 |
56 | def decode_one(generated_ids, tokenizer):
57 |     # get the index of the first EOS token
58 | if tokenizer.eos_token_id == tokenizer.bos_token_id:
59 | if tokenizer.eos_token_id in generated_ids[1:].tolist():
60 | eos_id = generated_ids[1:].tolist().index(tokenizer.eos_token_id) + 1
61 | else:
62 | eos_id = len(generated_ids.tolist()) - 1
63 | elif tokenizer.eos_token_id in generated_ids.tolist():
64 | eos_id = generated_ids.tolist().index(tokenizer.eos_token_id)
65 | else:
66 | eos_id = len(generated_ids.tolist()) - 1
67 | generated_text_str = tokenizer.tokenizer.decode(generated_ids[1:eos_id].tolist())
68 | return generated_text_str
69 |
70 |
71 | def create_caption(frames):
72 | with torch.no_grad():
73 | image_features = model.encode_image(frames)
74 | generated_text_ids, ppls = model.generate(
75 | image_features,
76 | tokenizer,
77 | target=None, # free-form generation
78 | max_text_length=77,
79 | top_k=None,
80 | top_p=0.95, # nucleus sampling
81 |             num_return_sequences=candidate_num, # number of candidates
82 | temperature=0.9,
83 | early_stopping=True,
84 | )
85 | longest_sentence = ""
86 | for i in range(candidate_num):
87 | generated_text_str = decode_one(generated_text_ids[i], tokenizer)
88 | if len(generated_text_str) > len(longest_sentence):
89 | longest_sentence = generated_text_str
90 | return longest_sentence
91 |
92 |
93 | def captioning(frame_path, fps, caption_seconds=2, frames_per_caption=4):
94 | frame_interval = int(fps*caption_seconds/frames_per_caption)
95 | sequential_image_list = []
96 | sequential_caption_list = dict()
97 |
98 | for root, dirs, files in os.walk(frame_path):
99 | for file in files:
100 | if fnmatch.fnmatch(file, '*.jpg'):
101 | sequential_image_list.append(file)
102 |
103 | sequential_image_list.sort() # ordered frame list
104 |
105 | start_frame = int(sequential_image_list[0].split('.')[0].split('_')[-1])
106 | end_frame = int(sequential_image_list[-1].split('.')[0].split('_')[-1])
107 |
108 | print(start_frame)
109 | print(end_frame)
110 | total_frames = end_frame-start_frame+1
111 |
112 | total_captions = total_frames//(fps*caption_seconds)
113 | IMAGE_NAME_PATTERN = "video_frame_{:07d}.jpg"
114 |
115 |
116 | for i in range(total_captions):
117 | print(i)
118 | caption_start_frame = start_frame + i * fps * caption_seconds
119 | caption_end_frame = start_frame + (i+1) * fps * caption_seconds
120 | input_frames = []
121 | for j in range(frames_per_caption):
122 | frame_idx = caption_start_frame + j* frame_interval
123 | print('frame: ', frame_idx)
124 | frame_name = IMAGE_NAME_PATTERN.format(frame_idx)
125 | image_file = os.path.join(frame_path, frame_name)
126 | image = imageio.imread(image_file)
127 | input_frames.append(image)
128 | input_frames = torch.from_numpy(np.stack(input_frames, axis=0)).float() #[4, w, h, 3]
129 | #print("input_frames: ", input_frames)
130 | #print("input_frames.size: ", input_frames.size())
131 | frames = val_transform(input_frames)
132 | frames = frames.unsqueeze(0)
133 | caption = create_caption(frames)
134 | time_stamps = "{}-{}".format(str(caption_start_frame), str(caption_end_frame))
135 | sequential_caption_list[time_stamps] = caption
136 |
137 | with open(os.path.join(frame_path, 'captions.json'), 'w') as f:
138 | json.dump(sequential_caption_list, f)
139 |
--------------------------------------------------------------------------------
/LaViLa/datasets/README.md:
--------------------------------------------------------------------------------
1 | # Preparing datasets for LAVILA
2 |
3 | Please download the (selected) datasets from the official websites and place or sym-link them under `$LAVILA_ROOT/datasets/`.
4 |
5 | ```bash
6 | $LAVILA_ROOT/datasets/
7 | CharadesEgo/
8 | EGTEA/
9 | EK100/
10 | Ego4D/
11 | ```
12 |
13 | ## Ego4D
14 | 1. Download [Ego4D videos](https://ego4d-data.org/docs/start-here/#download-data) (license is required).
15 |
16 | 2. Preprocess
17 |
18 | We cut each video into 5-minute-long chunks and resize the shorter side to 288 pixels for faster IO. Please refer to [this script](scripts/crop_and_resize_ego4d.sh) for more details; a rough sketch of the idea is given below.
19 |
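For illustration only, here is a rough Python sketch of the same idea (not the repo's script): it cuts a video into 300-second chunks named `0.mp4`, `300.mp4`, ... and uses an `ffmpeg` scale expression to set the shorter side to 288 px. The helper name and paths are hypothetical.

```python
import math
import os
import subprocess


def chunk_and_resize(src, dst_dir, duration_sec, chunk_sec=300, short_side=288):
    """Cut `src` into `chunk_sec`-long chunks named 0.mp4, 300.mp4, ... with the shorter side resized."""
    os.makedirs(dst_dir, exist_ok=True)
    # keep the aspect ratio; set the smaller dimension to `short_side` (-2 = auto, rounded to even)
    vf = f"scale=w='if(gt(iw,ih),-2,{short_side})':h='if(gt(iw,ih),{short_side},-2)'"
    for start in range(0, math.ceil(duration_sec), chunk_sec):
        subprocess.run(
            ["ffmpeg", "-y", "-ss", str(start), "-t", str(chunk_sec),
             "-i", src, "-vf", vf, os.path.join(dst_dir, f"{start}.mp4")],
            check=True)
```

For a 900-second video this would produce `0.mp4`, `300.mp4`, and `600.mp4` under `dst_dir`, matching the `video_288px/<video_uid>.mp4/` layout shown below.
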
20 | 3. Download annotations
21 |
22 | a. Download [egomcq.json](https://drive.google.com/file/d/1-5iRYf4BCHmj4MYQYFRMY4bhsWJUN3rW/view) to `$LAVILA_ROOT/datasets/Ego4D` (if you want to evaluate EgoMCQ).
23 |
24 |    b. Download [metadata for train split](https://dl.fbaipublicfiles.com/lavila/metadata/ego4d/ego4d_train.pkl) and [val split](https://dl.fbaipublicfiles.com/lavila/metadata/ego4d/ego4d_val.pkl) to `$LAVILA_ROOT/datasets/Ego4D` (if you want to train LAVILA from scratch).
25 |
26 | The folder should look like this:
27 | ```bash
28 | $LAVILA_ROOT/datasets/
29 | Ego4D/
30 | ego4d_train.pkl
31 | ego4d_val.pkl
32 | egomcq.json
33 | video_288px/
34 | 000786a7-3f9d-4fe6-bfb3-045b368f7d44.mp4/
35 | 0.mp4
36 | 300.mp4
37 | 000a3525-6c98-4650-aaab-be7d2c7b9402.mp4/
38 | 0.mp4
39 | ...
40 | ```
41 |
42 |
43 | ## EPIC-Kitchens-100 (EK-100)
44 |
45 | 1. Download annotations
46 |
47 | ```bash
48 | # Assume that you are under `datasets/EK100/`
49 | git clone https://github.com/epic-kitchens/epic-kitchens-100-annotations
50 | ```
51 |
52 | 2. Download videos.
53 |
54 | a. For raw videos, please download them from [https://epic-kitchens.github.io/](https://epic-kitchens.github.io/).
55 |
56 | b. (Recommended) The raw videos are huge (~1 TB). As an alternative, please check out a [resized version](https://utexas.box.com/s/l7ij81ie5q07p9fdg0vtejihq61liln9).
57 |
58 | 3. (For EK-100 MIR)
59 |
60 | a. Generate the relevancy matrix of train/val splits using [the official code](https://github.com/mwray/Joint-Part-of-Speech-Embeddings).
61 |
62 | b. (Recommended) The generated result has some randomness. Therefore, we also provide the [replica of train split](https://dl.fbaipublicfiles.com/lavila/metadata/EK100/caption_relevancy_EPIC_100_retrieval_train.pkl) and [val split](https://dl.fbaipublicfiles.com/lavila/metadata/EK100/caption_relevancy_EPIC_100_retrieval_test.pkl). Please put them to the folder `$LAVILA_ROOT/datasets/EK100/epic-kitchens-100-annotations/retrieval_annotations/relevancy/`.
63 |
64 |
65 | The folder should look like this:
66 | ```bash
67 | $LAVILA_ROOT/datasets/
68 | EK100/
69 | epic-kitchens-100-annotations/
70 | EPIC_100_train.csv
71 | EPIC_100_validation.csv
72 | ...
73 | retrieval_annotations/relevancy/ # this appears if you do 3.
74 | caption_relevancy_EPIC_100_retrieval_train.pkl
75 | caption_relevancy_EPIC_100_retrieval_test.pkl
76 | video_ht256px/
77 | P01/
78 | P01_01.MP4
79 | P01_02.MP4
80 | ...
81 | P01_19.MP4
82 | P02/
83 | P02_01.MP4
84 | P02_02.MP4
85 | ...
86 | P02_15.MP4
87 | ...
88 | ```
89 |
90 | ## CharadesEgo
91 |
92 | 1. Download annotations at [https://prior.allenai.org/projects/charades-ego](https://prior.allenai.org/projects/charades-ego).
93 | ```bash
94 | ### Annotations
95 | # Assume that you are under `datasets/CharadesEgo/`
96 | wget https://ai2-public-datasets.s3-us-west-2.amazonaws.com/charades/CharadesEgo.zip
97 | unzip CharadesEgo.zip && rm CharadesEgo.zip
98 | ```
99 |
100 | 2. Download data (~11GB) at [https://prior.allenai.org/projects/charades-ego](https://prior.allenai.org/projects/charades-ego).
101 | ```bash
102 | ### Data
103 | wget https://ai2-public-datasets.s3-us-west-2.amazonaws.com/charades/CharadesEgo_v1_480.tar
104 | tar -xvf CharadesEgo_v1_480.tar # Or specify an external path using `-C` and sym-link it to here
105 | rm CharadesEgo_v1_480.tar
106 | ```
107 |
108 | 3. (For fine-tuning CharadesEgo) Download two additional metadata files: [clip-level metadata (train)](https://dl.fbaipublicfiles.com/lavila/metadata/CharadesEgo/metadata_filtered_train.pkl) and [clip-level metadata (val)](https://dl.fbaipublicfiles.com/lavila/metadata/CharadesEgo/metadata_filtered_val.pkl). Put them to the folder `$LAVILA_ROOT/datasets/CharadesEgo/CharadesEgo/`.
109 |
110 | The folder should look like this:
111 | ```bash
112 | $LAVILA_ROOT/datasets/
113 | CharadesEgo/
114 | CharadesEgo/
115 | CharadesEgo_v1_train_only1st.csv
116 | CharadesEgo_v1_test_only1st.csv
117 | ...
118 | metadata_filtered_train.pkl # this appears if you do 3.
119 | metadata_filtered_val.pkl # this appears if you do 3.
120 | CharadesEgo_v1_480/
121 | 005BU.mp4
122 | 005BUEGO.mp4
123 | ...
124 | ```
125 |
126 |
127 | ## EGTEA
128 |
129 | 1. Visit [https://cbs.ic.gatech.edu/fpv/](https://cbs.ic.gatech.edu/fpv/).
130 |
131 | 2. Download `TRIMMED_ACTION_CLIPS` (~20GB) and `ACTION_ANNOTATIONS` and untar to the current folder `$LAVILA_ROOT/datasets/EGTEA`.
132 |
133 | ```bash
134 | unzip action_annotation.zip -d EGTEA/ && rm action_annotation.zip
135 | ```
136 |
137 | The folder should look like this:
138 | ```bash
139 | $LAVILA_ROOT/datasets/
140 | EGTEA/
141 | train_split1.txt
142 | test_split1.txt
143 | cropped_clips/
144 | OP01-R01-PastaSalad/
145 | OP01-R01-PastaSalad-1002316-1004005-F024051-F024101.mp4
146 | OP01-R01-PastaSalad-1004110-1021110-F024057-F024548.mp4
147 | OP01-R01-PastaSalad-1022590-1024050-F024539-F024581.mp4
148 | ...
149 | OP01-R02-TurkeySandwich/
150 | OP01-R02-TurkeySandwich-102320-105110-F002449-F002529.mp4
151 | OP01-R02-TurkeySandwich-105440-106460-F002528-F002558.mp4
152 | OP01-R02-TurkeySandwich-107332-133184-F002513-F003259.mp4
153 | ...
154 | ...
155 | ```
156 |
--------------------------------------------------------------------------------
/LaViLa/demo.py:
--------------------------------------------------------------------------------
1 | import decord
2 | import matplotlib.pyplot as plt
3 | import numpy as np
4 | from collections import OrderedDict
5 | import time
6 | import torch
7 | import torchvision.transforms as transforms
8 | import torchvision.transforms._transforms_video as transforms_video
9 |
10 | import sys
11 | sys.path.insert(0, './')
12 | from lavila.data.video_transforms import Permute
13 | from lavila.data.datasets import get_frame_ids, video_loader_by_frames
14 | from lavila.models.models import VCLM_OPENAI_TIMESFORMER_BASE_GPT2
15 | from lavila.models.tokenizer import MyGPT2Tokenizer
16 |
17 |
18 | video_path = 'assets/3c0dffd0-e38e-4643-bc48-d513943dc20b_012_014.mp4'
19 |
20 |
21 | from base64 import b64encode
22 |
23 |
24 | # The video is represented by `num_seg=4` frames
25 | vr = decord.VideoReader(video_path)
26 | print("total length:", len(vr))
27 | num_seg = 4
28 | frame_ids = get_frame_ids(0, len(vr), num_segments=num_seg, jitter=False)
29 | frames = video_loader_by_frames('./', video_path, frame_ids)
30 | print(frames)
31 | print('frames_size:', frames.size()) #[num_seg, w, h, 3]
32 |
33 |
34 | # display the subsampled frames
35 | # plt.figure(figsize=(16, 40))
36 | # for i in range(num_seg):
37 | # plt.subplot(1, num_seg, i + 1)
38 | # plt.imshow(frames[i].cpu().numpy().astype(int))
39 | # plt.axis('off')
40 | # plt.show()
41 |
42 |
43 | ckpt_path = 'vclm_openai_timesformer_base_gpt2_base.pt_ego4d.jobid_319630.ep_0002.md5sum_68a71f.pth'
44 | ckpt = torch.load(ckpt_path, map_location='cpu')
45 | state_dict = OrderedDict()
46 | for k, v in ckpt['state_dict'].items():
47 | state_dict[k.replace('module.', '')] = v
48 |
49 | # instantiate the model, and load the pre-trained weights
50 | model = VCLM_OPENAI_TIMESFORMER_BASE_GPT2(
51 | text_use_cls_token=False,
52 | project_embed_dim=256,
53 | gated_xattn=True,
54 | timesformer_gated_xattn=False,
55 | freeze_lm_vclm=False,
56 | freeze_visual_vclm=False,
57 | freeze_visual_vclm_temporal=False,
58 | num_frames=4,
59 | drop_path_rate=0.
60 | )
61 |
62 | model.load_state_dict(state_dict, strict=True)
63 |
64 | num_params = sum(p.numel() for p in model.parameters())
65 | print(f'model params: {num_params}')
66 | model.eval()
67 | #model.cuda()
68 | print('model loaded')
69 | # transforms on input frames
70 | crop_size = 224
71 | val_transform = transforms.Compose([
72 | Permute([3, 0, 1, 2]),
73 | transforms.Resize(crop_size),
74 | transforms.CenterCrop(crop_size),
75 | transforms_video.NormalizeVideo(mean=[108.3272985, 116.7460125, 104.09373615000001], std=[68.5005327, 66.6321579, 70.32316305])
76 | ])
77 | frames = val_transform(frames)
78 | print("frames shape before squeeze: ", frames.size()) #[3, 4, 224, 224]
79 | frames = frames.unsqueeze(0) # fake a batch dimension
80 | print("frames shape: ", frames.size()) #[1, 3, 4, 224, 224]
81 |
82 | tokenizer = MyGPT2Tokenizer('gpt2', add_bos=True)
83 |
84 | candidate_num = 5
85 |
86 | def decode_one(generated_ids, tokenizer):
87 |     # get the index of the first EOS token
88 | if tokenizer.eos_token_id == tokenizer.bos_token_id:
89 | if tokenizer.eos_token_id in generated_ids[1:].tolist():
90 | eos_id = generated_ids[1:].tolist().index(tokenizer.eos_token_id) + 1
91 | else:
92 | eos_id = len(generated_ids.tolist()) - 1
93 | elif tokenizer.eos_token_id in generated_ids.tolist():
94 | eos_id = generated_ids.tolist().index(tokenizer.eos_token_id)
95 | else:
96 | eos_id = len(generated_ids.tolist()) - 1
97 | generated_text_str = tokenizer.tokenizer.decode(generated_ids[1:eos_id].tolist())
98 | return generated_text_str
99 |
100 |
101 |
102 | start_time = time.time()
103 | for i in range(100):
104 | with torch.no_grad():
105 | image_features = model.encode_image(frames)
106 | generated_text_ids, ppls = model.generate(
107 | image_features,
108 | tokenizer,
109 | target=None, # free-form generation
110 | max_text_length=77,
111 | top_k=None,
112 | top_p=0.95, # nucleus sampling
113 |             num_return_sequences=candidate_num, # number of candidates
114 | temperature=0.7,
115 | early_stopping=True,
116 | )
117 | for i in range(candidate_num):
118 | generated_text_str = decode_one(generated_text_ids[i], tokenizer)
119 | print('{}: {}'.format(i, generated_text_str))
120 | end_time = time.time()
121 | print(end_time-start_time)
--------------------------------------------------------------------------------
/LaViLa/demo_narrator.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 |
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 |
7 |
8 | import argparse
9 | import os
10 | import urllib.request
11 | from collections import OrderedDict
12 |
13 | import torch
14 | import torchvision.transforms as transforms
15 | import torchvision.transforms._transforms_video as transforms_video
16 | import decord
17 |
18 | from lavila.data.video_transforms import Permute
19 | from lavila.data.datasets import get_frame_ids, video_loader_by_frames
20 | from lavila.models.models import VCLM_OPENAI_TIMESFORMER_LARGE_336PX_GPT2_XL
21 | from lavila.models.tokenizer import MyGPT2Tokenizer
22 | from eval_narrator import decode_one
23 | import cv2
24 |
25 | def main(args):
26 |
27 | vr = decord.VideoReader(args.video_path)
28 | num_seg = 4
29 | frame_ids = get_frame_ids(0, len(vr), num_segments=num_seg, jitter=False)
30 | print('frame_ids: ', frame_ids)
31 | frames = video_loader_by_frames('./', args.video_path, frame_ids)
32 | test_frame = frames[0].numpy()
33 | print(test_frame.shape)
34 | cv2.imwrite("test_frame.jpg", cv2.cvtColor(test_frame, cv2.COLOR_BGR2RGB))
35 | ckpt_name = 'vclm_openai_timesformer_large_336px_gpt2_xl.pt_ego4d.jobid_246897.ep_0003.md5sum_443263.pth'
36 | ckpt_path = os.path.join('modelzoo/', ckpt_name)
37 | os.makedirs('modelzoo/', exist_ok=True)
38 | if not os.path.exists(ckpt_path):
39 | print('downloading model to {}'.format(ckpt_path))
40 | urllib.request.urlretrieve('https://dl.fbaipublicfiles.com/lavila/checkpoints/narrator/{}'.format(ckpt_name), ckpt_path)
41 | ckpt = torch.load(ckpt_path, map_location='cpu')
42 | state_dict = OrderedDict()
43 | for k, v in ckpt['state_dict'].items():
44 | state_dict[k.replace('module.', '')] = v
45 |
46 | # instantiate the model, and load the pre-trained weights
47 | model = VCLM_OPENAI_TIMESFORMER_LARGE_336PX_GPT2_XL(
48 | text_use_cls_token=False,
49 | project_embed_dim=256,
50 | gated_xattn=True,
51 | timesformer_gated_xattn=False,
52 | freeze_lm_vclm=False, # we use model.eval() anyway
53 | freeze_visual_vclm=False, # we use model.eval() anyway
54 | num_frames=4,
55 | drop_path_rate=0.
56 | )
57 | model.load_state_dict(state_dict, strict=True)
58 | if args.cuda:
59 | model.cuda()
60 | model.eval()
61 |
62 | # transforms on input frames
63 | crop_size = 336
64 | val_transform = transforms.Compose([
65 | Permute([3, 0, 1, 2]),
66 | transforms.Resize(crop_size),
67 | transforms.CenterCrop(crop_size),
68 | transforms_video.NormalizeVideo(mean=[108.3272985, 116.7460125, 104.09373615000001], std=[68.5005327, 66.6321579, 70.32316305])
69 | ])
70 | frames = val_transform(frames)
71 | frames = frames.unsqueeze(0) # fake a batch dimension
72 |
73 | tokenizer = MyGPT2Tokenizer('gpt2-xl', add_bos=True)
74 | with torch.no_grad():
75 | if args.cuda:
76 | frames = frames.cuda(non_blocking=True)
77 | image_features = model.encode_image(frames)
78 | generated_text_ids, ppls = model.generate(
79 | image_features,
80 | tokenizer,
81 | target=None, # free-form generation
82 | max_text_length=77,
83 | top_k=None,
84 | top_p=0.95, # nucleus sampling
85 | num_return_sequences=10, # number of candidates: 10
86 | temperature=0.7,
87 | early_stopping=True,
88 | )
89 |
90 | for i in range(10):
91 | generated_text_str = decode_one(generated_text_ids[i], tokenizer)
92 | print('{}: {}'.format(i, generated_text_str))
93 |
94 |
95 | if __name__ == '__main__':
96 | parser = argparse.ArgumentParser('lavila narrator demo')
97 | parser.add_argument('--cuda', default=True, action='store_true', help='use cuda')
98 | parser.add_argument('--video-path', default='assets/3c0dffd0-e38e-4643-bc48-d513943dc20b_012_014.mp4', type=str, help='video path')
99 | #parser.add_argument('--video-path', default='/home/yue/data/mount/fillipo/Datasets/Ego4d/v1/full_scale/0a3dc289-557f-4121-9bc7-521a2b5d3bb8.mp4', type=str, help='video path')
100 | args = parser.parse_args()
101 | main(args)
102 |
--------------------------------------------------------------------------------
/LaViLa/demo_narrator_3rd_person.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 |
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 |
7 |
8 | import argparse
9 | import os
10 | import urllib.request
11 | from collections import OrderedDict
12 |
13 | import torch
14 | import torchvision.transforms as transforms
15 | import torchvision.transforms._transforms_video as transforms_video
16 | import decord
17 |
18 | from lavila.data.video_transforms import Permute
19 | from lavila.data.datasets import get_frame_ids, video_loader_by_frames
20 | from lavila.models.models import VCLM_OPENAI_TIMESFORMER_LARGE_GPT2_XL
21 | from lavila.models.tokenizer import MyGPT2Tokenizer
22 | from eval_narrator import decode_one
23 |
24 |
25 | def main(args):
26 |
27 | vr = decord.VideoReader(args.video_path)
28 | num_seg = 4
29 | frame_ids = get_frame_ids(0, len(vr), num_segments=num_seg, jitter=False)
30 | frames = video_loader_by_frames('./', args.video_path, frame_ids)
31 |
32 | ckpt_name = 'vclm_openai_timesformer_large_gpt2_xl.pt_htm.jobid_341080.ep_0001.pth'
33 | ckpt_path = os.path.join('modelzoo/', ckpt_name)
34 | os.makedirs('modelzoo/', exist_ok=True)
35 | if not os.path.exists(ckpt_path):
36 | print('downloading model to {}'.format(ckpt_path))
37 | urllib.request.urlretrieve('https://dl.fbaipublicfiles.com/lavila/checkpoints/narrator/htm_aa/{}'.format(ckpt_name), ckpt_path)
38 | ckpt = torch.load(ckpt_path, map_location='cpu')
39 | state_dict = OrderedDict()
40 | for k, v in ckpt['state_dict'].items():
41 | state_dict[k.replace('module.', '')] = v
42 |
43 | # instantiate the model, and load the pre-trained weights
44 | model = VCLM_OPENAI_TIMESFORMER_LARGE_GPT2_XL(
45 | text_use_cls_token=False,
46 | project_embed_dim=256,
47 | gated_xattn=True,
48 | timesformer_gated_xattn=False,
49 | freeze_lm_vclm=False, # we use model.eval() anyway
50 | freeze_visual_vclm=False, # we use model.eval() anyway
51 | freeze_visual_vclm_temporal=False,
52 | num_frames=4,
53 | drop_path_rate=0.
54 | )
55 | model.load_state_dict(state_dict, strict=True)
56 | if args.cuda:
57 | model.cuda()
58 | model.eval()
59 |
60 | # transforms on input frames
61 | crop_size = 224
62 | val_transform = transforms.Compose([
63 | Permute([3, 0, 1, 2]),
64 | transforms.Resize(crop_size),
65 | transforms.CenterCrop(crop_size),
66 | transforms_video.NormalizeVideo(mean=[108.3272985, 116.7460125, 104.09373615000001], std=[68.5005327, 66.6321579, 70.32316305])
67 | ])
68 | frames = val_transform(frames)
69 | frames = frames.unsqueeze(0) # fake a batch dimension
70 |
71 | tokenizer = MyGPT2Tokenizer('gpt2-xl', add_bos=True)
72 | with torch.no_grad():
73 | if args.cuda:
74 | frames = frames.cuda(non_blocking=True)
75 | image_features = model.encode_image(frames)
76 | generated_text_ids, ppls = model.generate(
77 | image_features,
78 | tokenizer,
79 | target=None, # free-form generation
80 | max_text_length=77,
81 | top_k=None,
82 | top_p=0.95, # nucleus sampling
83 | num_return_sequences=10, # number of candidates: 10
84 | temperature=0.7,
85 | early_stopping=True,
86 | )
87 |
88 | for i in range(10):
89 | generated_text_str = decode_one(generated_text_ids[i], tokenizer)
90 | print('{}: {}'.format(i, generated_text_str))
91 |
92 |
93 | if __name__ == '__main__':
94 | parser = argparse.ArgumentParser('lavila narrator demo')
95 | parser.add_argument('--cuda', action='store_true', help='use cuda')
96 | parser.add_argument('--video-path', type=str,
97 | default='assets/mixkit-pastry-chef-cutting-a-loaf-into-slices-43015-medium.mp4')
98 | args = parser.parse_args()
99 | main(args)
100 |
--------------------------------------------------------------------------------
/LaViLa/docs/INSTALL.md:
--------------------------------------------------------------------------------
1 | # Installation
2 |
3 | ## Requirements
4 |
5 |
6 | ## Example conda environment setup
7 |
8 | ```bash
9 | conda create --name lavila python=3.8 -y
10 | conda activate lavila
11 | pip install -r requirements.txt
12 | ```
13 |
14 | ## Datasets
15 | If you want to train or evaluate on the datasets, please see [datasets/README.md](../datasets/README.md) for how we prepare the datasets for this project.
16 |
--------------------------------------------------------------------------------
/LaViLa/docs/PRETRAIN.md:
--------------------------------------------------------------------------------
1 | # LAVILA Pretraining
2 |
3 | In this doc, we provide a step-by-step guide (with commands) to train LaViLa.
4 | Note that we recommend running the following jobs on four 8x V100 (32GB) nodes (or eight nodes for the larger backbone) using [submitit](https://github.com/facebookincubator/submitit).
5 | See [MODEL_ZOO.md](./MODEL_ZOO.md#multi-node-training) for how to install submitit.
6 |
7 |
8 | ## Pre-training Dual-Encoder Baseline
9 |
10 | We first pre-train a dual-encoder baseline with human annotations on Ego4d clips.
11 | The goal is (1) to establish a comparable baseline for LAVILA, and (2) to provide a video encoder for the narrator (see below).
12 | We use a default batch size of 32 per GPU, so the total batch size for the InfoNCE loss is `32*8*4=1024`.
13 |
14 | Train a baseline dual-encoder (with TSF-B)
15 |
16 | ```bash
17 | python run_with_submitit_pretrain.py --model CLIP_OPENAI_TIMESFORMER_BASE \
18 | --norm-embed --freeze-temperature \
19 | --fix-lr --contrastive-use-vissl \
20 | --nodes 4 --use_volta32
21 | ```
22 |
23 |
24 | To fit a high-resolution TimeSformer-Large with a sufficient batch size, we use [DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert), a memory-efficient text encoder, instead of the original text encoder in CLIP. Additionally, we apply [gradient checkpointing](https://pytorch.org/docs/stable/checkpoint.html) and the [Zero Redundancy Optimizer (ZeRO)](https://arxiv.org/abs/1910.02054).
25 |
26 | Train a baseline dual-encoder (with TSF-L@HR)
27 |
28 | ```bash
29 | python run_with_submitit_pretrain.py --model CLIP_OPENAI_TIMESFORMER_LARGE_336PX_DISTILBERT_BASE \
30 | --batch-size 8 \
31 | --use-checkpoint --use-zero \
32 | --norm-embed --freeze-temperature \
33 | --fix-lr --contrastive-use-vissl \
34 | --nodes 8 --use_volta32
35 | ```
36 |
37 |
38 | ## Training and Evaluating Narrator
39 |
40 | The narrator is a *visually conditioned* large language model (VCLM), which comprises a pre-trained video encoder (obtained above), a text decoder (from the GPT-2 family), and a few gated cross-attention modules that attend to visual information while captioning. Both the video encoder and the text decoder are kept frozen, while the cross-attention modules are learnable.
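For intuition only, the gated cross-attention idea can be sketched as a residual update whose contribution starts at zero, so the frozen GPT-2 behaves exactly as before when training begins. This is a schematic sketch, not the implementation in `lavila/models/gpt2_gated.py`:

```python
import torch
import torch.nn as nn

class GatedCrossAttentionSketch(nn.Module):
    """Schematic gated cross-attention: text tokens attend to visual tokens,
    and a learnable tanh gate (initialized to zero) scales the residual."""

    def __init__(self, dim: int, num_heads: int = 8):
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.xattn = nn.MultiheadAttention(dim, num_heads, batch_first=True)
        self.attn_gate = nn.Parameter(torch.zeros(1))  # tanh(0) = 0 -> identity at init

    def forward(self, text_tokens, visual_tokens):
        # text_tokens: (B, N_text, dim), visual_tokens: (B, N_visual, dim)
        q = self.norm(text_tokens)
        attended, _ = self.xattn(q, visual_tokens, visual_tokens, need_weights=False)
        return text_tokens + torch.tanh(self.attn_gate) * attended
```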
41 |
42 | Note that we turn off PyTorch's automatic mixed precision (AMP) when training the narrator; we observe that training is unstable if AMP is on.
43 |
44 | Also note that `$PATH` can be found in the `Vis. Encoder` column of [MODEL_ZOO.md#Narrator](./MODEL_ZOO.md#narrator). If you are using your own checkpoint (e.g., one pre-trained in the previous step), please make sure the following keys have been dropped from the checkpoint: `epoch`, `optimizer`, and `scaler`.
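A minimal way to drop those keys from a checkpoint before resuming (the file names below are placeholders):

```python
import torch

# Placeholder paths: point these at your own pre-trained dual-encoder checkpoint.
ckpt = torch.load('modelzoo/my_dual_encoder.pth', map_location='cpu')
for key in ('epoch', 'optimizer', 'scaler'):
    ckpt.pop(key, None)  # remove training-state entries; the model weights stay untouched
torch.save(ckpt, 'modelzoo/my_dual_encoder.resume_ready.pth')
```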
45 |
46 | Train a baseline narrator (TSF-B as visual encoder and GPT-2 base as textual decoder)
47 |
48 | ```bash
49 | python run_with_submitit_pretrain.py \
50 | --model VCLM_OPENAI_TIMESFORMER_BASE_GPT2 \
51 | --gated-xattn --freeze-lm-vclm --freeze-visual-vclm --freeze-visual-vclm-temporal \
52 | --fix-lr --batch-size 8 --clip-grad-value 1.0 --eval-freq 1 --disable-amp \
53 |     --nodes 4 --use_volta32 --resume $PATH # e.g. $PATH can be "modelzoo/clip_openai_timesformer_base.baseline.ep_0003.pth"
54 | ```
55 |
56 |
57 |
58 | Train a strong narrator (TSF-L@HR as visual encoder and GPT-2 XL as textual decoder)
59 |
60 | ```bash
61 | python run_with_submitit_pretrain.py \
62 | --model VCLM_OPENAI_TIMESFORMER_LARGE_336PX_GPT2_XL \
63 | --gated-xattn --freeze-lm-vclm --freeze-visual-vclm --freeze-visual-vclm-temporal --use-checkpoint \
64 | --fix-lr --batch-size 8 --clip-grad-value 1.0 --eval-freq 1 --disable-amp \
65 |     --nodes 4 --use_volta32 --resume $PATH # e.g. $PATH can be "modelzoo/clip_openai_timesformer_large_336px_distilbert_base.baseline.ep_0003.pth"
66 | ```
67 |
68 |
69 | Evaluate the narrator on Ego4D val split
70 |
71 | ```bash
72 | torchrun --nproc_per_node=1 eval_narrator.py \
73 | --caption-top-p 0.95 --caption-temperature 0.7 \
74 |     --eval-freq 10000 \
75 |     --resume $VCLM_CHECKPOINT # --eval-freq 10000 evaluates on a 1/10000 subset of the Ego4D val split for fast evaluation
76 | ```
77 | This will output some common NLG metrics, such as BLEU-x, METEOR, ROUGE_L, and CIDEr (using the human narrations as ground-truth).
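If you want to compute similar metrics on your own generations, the `nlg-eval` package pinned in `requirements.txt` exposes an `NLGEval` helper; a rough usage sketch (the caption strings below are made up):

```python
from nlgeval import NLGEval

# Skip the heavier embedding-based metrics; keep BLEU, METEOR, ROUGE_L, and CIDEr.
nlg = NLGEval(no_glove=True, no_skipthoughts=True)

references = [['#C C cuts the onion on the chopping board.']]  # one list per reference set
hypotheses = ['a person cuts an onion on a cutting board']
print(nlg.compute_metrics(references, hypotheses))  # (ref_list, hyp_list)
```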
78 |
79 |
80 | ## Narrating video clips using LAVILA-Narrator
81 |
82 |
83 | Run inference with the narrator
84 |
85 | ```bash
86 | python run_with_submitit_infer_narrator.py \
87 | --metadata datasets/Ego4D/ego4d_train.pkl \
88 | --batch-size 64 \
89 | --resume $PATH --use-half \
90 | --nodes 4 --use_volta32
91 | ```
92 |
93 |
94 | It will generate a pickle file (`$output_dir/total.pkl`), which contains a list of quintuples: `(video_uid: str, start_time: float, end_time: float, narration_list: List[str], NLL_list: List[float])`.
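The output can then be consumed with a few lines of Python (replace `output_dir` with your actual job output directory):

```python
import pickle

with open('output_dir/total.pkl', 'rb') as f:  # i.e. $output_dir/total.pkl from the job above
    narrations = pickle.load(f)

# Each entry: (video_uid, start_time, end_time, narration_list, NLL_list)
video_uid, start_time, end_time, narration_list, nll_list = narrations[0]
print(video_uid, start_time, end_time)
for text, nll in zip(narration_list, nll_list):
    print(f'{nll:.3f}  {text}')  # lower NLL means the narrator is more confident
```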
95 |
96 | For narrator-generated narrations on Ego4D ground-truth clips, we also provide a [replica](https://dl.fbaipublicfiles.com/lavila/metadata/ego4d/ego4d_train.narrator_63690737.return_10.pkl). Note that the narrator used here is our best performing one.
97 |
98 | In addition, we can apply this narrator over the entire video for temporally dense auto-narration. We provide a [replica](https://dl.fbaipublicfiles.com/lavila/metadata/ego4d/ego4d_train.uncovered_all.narrator_63690737.return_5.pkl) (excluding the annotated clips).
99 |
100 | ## Rephrasing human narrations using LAVILA-Rephraser
101 |
102 | Rephraser is a standard LLM that can paraphrase narrations in existing clips.
103 | Specifically, we use an off-the-shelf T5-based paraphraser which is publicly available at [Hugging Face's model hub](https://huggingface.co/ramsrigouthamg/t5-large-paraphraser-diverse-high-quality).
104 | For more details, please refer to the [model card](https://huggingface.co/ramsrigouthamg/t5-large-paraphraser-diverse-high-quality).
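A sketch of how this off-the-shelf paraphraser can be called (the prompt prefix and generation settings are illustrative; follow the model card for the recommended usage):

```python
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

name = 'ramsrigouthamg/t5-large-paraphraser-diverse-high-quality'
tokenizer = AutoTokenizer.from_pretrained(name)
model = AutoModelForSeq2SeqLM.from_pretrained(name)

narration = '#C C opens the fridge.'
inputs = tokenizer('paraphrase: ' + narration, return_tensors='pt')  # prefix as suggested by the model card
outputs = model.generate(**inputs, num_beams=10, num_return_sequences=3, max_length=64)
for out in outputs:
    print(tokenizer.decode(out, skip_special_tokens=True))
```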
105 |
106 | For rephrased human narrations on Ego4D ground-truth clips, we provide a [replica](https://dl.fbaipublicfiles.com/lavila/metadata/ego4d/ego4d_train.rephraser.no_punkt_top3.pkl).
107 |
108 |
109 | ## Pre-training LAVILA Dual-Encoder
110 | Now we are ready to pre-train LAVILA's dual-encoder by combining the human annotations (augmented by the Rephraser) with the Narrator-generated narrations.
111 |
112 | Training a LaViLa dual-encoder (you can optionally also pass `datasets/Ego4D/ego4d_train.uncovered_all.narrator_63690737.return_5.pkl` to `--metadata-aux`)
113 |
114 | ```bash
115 | python run_with_submitit_pretrain.py --model CLIP_OPENAI_TIMESFORMER_BASE \
116 | --metadata datasets/Ego4D/ego4d_train.rephraser.no_punkt_top3.pkl \
117 |     --metadata-aux datasets/Ego4D/ego4d_train.narrator_63690737.return_10.pkl \
118 | --norm-embed --freeze-temperature \
119 | --freeze-pseudo-temperature \
120 | --fix-lr --contrastive-use-vissl \
121 | --nodes 4 --use_volta32
122 | ```
123 |
124 |
125 | ## Down-stream Evaluation
126 | With the pre-trained dual-encoder at hand, we can now run zero-shot or fine-tuned evaluations on downstream benchmarks.
127 | Please refer to [MODEL_ZOO.md](./MODEL_ZOO.md#zero-shot) for more details.
128 |
--------------------------------------------------------------------------------
/LaViLa/lavila/data/__pycache__/datasets.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/LaViLa/lavila/data/__pycache__/datasets.cpython-39.pyc
--------------------------------------------------------------------------------
/LaViLa/lavila/data/__pycache__/video_transforms.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/LaViLa/lavila/data/__pycache__/video_transforms.cpython-39.pyc
--------------------------------------------------------------------------------
/LaViLa/lavila/data/video_transforms.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 |
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 |
7 | import math
8 | from typing import Sequence
9 | import torch
10 | import torch.nn as nn
11 | from torchvision import transforms
12 |
13 |
14 | class Permute(nn.Module):
15 | """
16 | Permutation as an op
17 | """
18 |
19 | def __init__(self, ordering):
20 | super().__init__()
21 | self.ordering = ordering
22 |
23 | def forward(self, frames):
24 | """
25 | Args:
26 | frames in some ordering, by default (C, T, H, W)
27 | Returns:
28 | frames in the ordering that was specified
29 | """
30 | return frames.permute(self.ordering)
31 |
32 |
33 | class TemporalCrop(nn.Module):
34 | """
35 | Convert the video into smaller clips temporally.
36 | """
37 |
38 | def __init__(
39 | self, frames_per_clip: int = 8, stride: int = 8, frame_stride: int = 1
40 | ):
41 | super().__init__()
42 | self.frames = frames_per_clip
43 | self.stride = stride
44 | self.frame_stride = frame_stride
45 |
46 | def forward(self, video):
47 | assert video.ndim == 4, "Must be (C, T, H, W)"
48 | res = []
49 | for start in range(
50 | 0, video.size(1) - (self.frames * self.frame_stride) + 1, self.stride
51 | ):
52 | end = start + (self.frames) * self.frame_stride
53 | res.append(video[:, start: end: self.frame_stride, ...])
54 | return res
55 |
56 |
57 | def crop_boxes(boxes, x_offset, y_offset):
58 | """
59 |     Perform crop on the bounding boxes given the offsets.
60 |     Args:
61 |         boxes (ndarray or None): bounding boxes to perform crop. The dimension
62 | is `num boxes` x 4.
63 | x_offset (int): cropping offset in the x axis.
64 | y_offset (int): cropping offset in the y axis.
65 | Returns:
66 | cropped_boxes (ndarray or None): the cropped boxes with dimension of
67 | `num boxes` x 4.
68 | """
69 | cropped_boxes = boxes.copy()
70 | cropped_boxes[:, [0, 2]] = boxes[:, [0, 2]] - x_offset
71 | cropped_boxes[:, [1, 3]] = boxes[:, [1, 3]] - y_offset
72 |
73 | return cropped_boxes
74 |
75 |
76 | def uniform_crop(images, size, spatial_idx, boxes=None, scale_size=None):
77 | """
78 | Perform uniform spatial sampling on the images and corresponding boxes.
79 | Args:
80 | images (tensor): images to perform uniform crop. The dimension is
81 | `num frames` x `channel` x `height` x `width`.
82 |         size (int): size of height and width to crop the images.
83 | spatial_idx (int): 0, 1, or 2 for left, center, and right crop if width
84 | is larger than height. Or 0, 1, or 2 for top, center, and bottom
85 | crop if height is larger than width.
86 | boxes (ndarray or None): optional. Corresponding boxes to images.
87 | Dimension is `num boxes` x 4.
88 |         scale_size (int): optional. If not None, resize the images to scale_size before
89 | performing any crop.
90 | Returns:
91 | cropped (tensor): images with dimension of
92 | `num frames` x `channel` x `size` x `size`.
93 | cropped_boxes (ndarray or None): the cropped boxes with dimension of
94 | `num boxes` x 4.
95 | """
96 | assert spatial_idx in [0, 1, 2]
97 | ndim = len(images.shape)
98 | if ndim == 3:
99 | images = images.unsqueeze(0)
100 | height = images.shape[2]
101 | width = images.shape[3]
102 |
103 | if scale_size is not None:
104 | if width <= height:
105 | width, height = scale_size, int(height / width * scale_size)
106 | else:
107 | width, height = int(width / height * scale_size), scale_size
108 | images = torch.nn.functional.interpolate(
109 | images,
110 | size=(height, width),
111 | mode="bilinear",
112 | align_corners=False,
113 | )
114 |
115 | y_offset = int(math.ceil((height - size) / 2))
116 | x_offset = int(math.ceil((width - size) / 2))
117 |
118 | if height > width:
119 | if spatial_idx == 0:
120 | y_offset = 0
121 | elif spatial_idx == 2:
122 | y_offset = height - size
123 | else:
124 | if spatial_idx == 0:
125 | x_offset = 0
126 | elif spatial_idx == 2:
127 | x_offset = width - size
128 | cropped = images[:, :, y_offset: y_offset + size, x_offset: x_offset + size]
129 | cropped_boxes = crop_boxes(boxes, x_offset, y_offset) if boxes is not None else None
130 | if ndim == 3:
131 | cropped = cropped.squeeze(0)
132 | return cropped, cropped_boxes
133 |
134 |
135 | class SpatialCrop(nn.Module):
136 | """
137 | Convert the video into 3 smaller clips spatially. Must be used after the
138 | temporal crops to get spatial crops, and should be used with
139 | -2 in the spatial crop at the slowfast augmentation stage (so full
140 | frames are passed in here). Will return a larger list with the
141 | 3x spatial crops as well. It's useful for 3x4 testing (eg in SwinT)
142 | or 3x10 testing in SlowFast etc.
143 | """
144 |
145 | def __init__(self, crop_size: int = 224, num_crops: int = 3):
146 | super().__init__()
147 | self.crop_size = crop_size
148 | if num_crops == 6:
149 | self.crops_to_ext = [0, 1, 2]
150 | # I guess Swin uses 5 crops without flipping, but that doesn't
151 | # make sense given they first resize to 224 and take 224 crops.
152 | # (pg 6 of https://arxiv.org/pdf/2106.13230.pdf)
153 | # So I'm assuming we can use flipped crops and that will add sth..
154 | self.flipped_crops_to_ext = [0, 1, 2]
155 | elif num_crops == 3:
156 | self.crops_to_ext = [0, 1, 2]
157 | self.flipped_crops_to_ext = []
158 | elif num_crops == 1:
159 | self.crops_to_ext = [1]
160 | self.flipped_crops_to_ext = []
161 | else:
162 | raise NotImplementedError(
163 | "Nothing else supported yet, "
164 | "slowfast only takes 0, 1, 2 as arguments"
165 | )
166 |
167 | def forward(self, videos: Sequence[torch.Tensor]):
168 | """
169 | Args:
170 | videos: A list of C, T, H, W videos.
171 | Returns:
172 | videos: A list with 3x the number of elements. Each video converted
173 | to C, T, H', W' by spatial cropping.
174 | """
175 | assert isinstance(videos, list), "Must be a list of videos after temporal crops"
176 | assert all([video.ndim == 4 for video in videos]), "Must be (C,T,H,W)"
177 | res = []
178 | for video in videos:
179 | for spatial_idx in self.crops_to_ext:
180 | res.append(uniform_crop(video, self.crop_size, spatial_idx)[0])
181 | if not self.flipped_crops_to_ext:
182 | continue
183 | flipped_video = transforms.functional.hflip(video)
184 | for spatial_idx in self.flipped_crops_to_ext:
185 | res.append(uniform_crop(flipped_video, self.crop_size, spatial_idx)[0])
186 | return res
187 |
--------------------------------------------------------------------------------
/LaViLa/lavila/models/__pycache__/coca.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/LaViLa/lavila/models/__pycache__/coca.cpython-39.pyc
--------------------------------------------------------------------------------
/LaViLa/lavila/models/__pycache__/distributed_utils.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/LaViLa/lavila/models/__pycache__/distributed_utils.cpython-39.pyc
--------------------------------------------------------------------------------
/LaViLa/lavila/models/__pycache__/gpt2_gated.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/LaViLa/lavila/models/__pycache__/gpt2_gated.cpython-39.pyc
--------------------------------------------------------------------------------
/LaViLa/lavila/models/__pycache__/loss.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/LaViLa/lavila/models/__pycache__/loss.cpython-39.pyc
--------------------------------------------------------------------------------
/LaViLa/lavila/models/__pycache__/models.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/LaViLa/lavila/models/__pycache__/models.cpython-39.pyc
--------------------------------------------------------------------------------
/LaViLa/lavila/models/__pycache__/narrator.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/LaViLa/lavila/models/__pycache__/narrator.cpython-39.pyc
--------------------------------------------------------------------------------
/LaViLa/lavila/models/__pycache__/openai_clip.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/LaViLa/lavila/models/__pycache__/openai_clip.cpython-39.pyc
--------------------------------------------------------------------------------
/LaViLa/lavila/models/__pycache__/openai_model.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/LaViLa/lavila/models/__pycache__/openai_model.cpython-39.pyc
--------------------------------------------------------------------------------
/LaViLa/lavila/models/__pycache__/timesformer.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/LaViLa/lavila/models/__pycache__/timesformer.cpython-39.pyc
--------------------------------------------------------------------------------
/LaViLa/lavila/models/__pycache__/tokenizer.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/LaViLa/lavila/models/__pycache__/tokenizer.cpython-39.pyc
--------------------------------------------------------------------------------
/LaViLa/lavila/models/__pycache__/utils.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/LaViLa/lavila/models/__pycache__/utils.cpython-39.pyc
--------------------------------------------------------------------------------
/LaViLa/lavila/models/bpe_simple_vocab_16e6.txt.gz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/LaViLa/lavila/models/bpe_simple_vocab_16e6.txt.gz
--------------------------------------------------------------------------------
/LaViLa/lavila/models/coca.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 |
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 |
7 | # Part of the code is from https://github.com/lucidrains/CoCa-pytorch/blob/main/coca_pytorch/coca_pytorch.py
8 | # Modified by Yue Zhao
9 | # The original code is under MIT License
10 |
11 | import torch
12 | import torch.nn as nn
13 | import torch.nn.functional as F
14 | from torch import einsum
15 | from einops import rearrange
16 |
17 |
18 | def exists(val):
19 | return val is not None
20 |
21 |
22 | def default(val, d):
23 | return val if exists(val) else d
24 |
25 |
26 | # normalization
27 | # they use layernorm without bias, something that pytorch does not offer
28 | class LayerNorm(nn.Module):
29 | def __init__(self, dim):
30 | super().__init__()
31 | self.gamma = nn.Parameter(torch.ones(dim))
32 | self.register_buffer("beta", torch.zeros(dim))
33 |
34 | def forward(self, x):
35 | return F.layer_norm(x, x.shape[-1:], self.gamma, self.beta)
36 |
37 |
38 | class Residual(nn.Module):
39 | def __init__(self, fn):
40 | super().__init__()
41 | self.fn = fn
42 |
43 | def forward(self, x, *args, **kwargs):
44 | return self.fn(x, *args, **kwargs) + x
45 |
46 |
47 | # classic Noam Shazeer paper, except here they use SwiGLU instead of the more popular GEGLU for gating the feedforward
48 | # https://arxiv.org/abs/2002.05202
49 | class SwiGLU(nn.Module):
50 | def forward(self, x):
51 | x, gate = x.chunk(2, dim=-1)
52 | return F.silu(gate) * x
53 |
54 |
55 | class CrossAttention(nn.Module):
56 | def __init__(
57 | self,
58 | dim,
59 | *,
60 | context_dim=None,
61 | dim_head=64,
62 | heads=8,
63 | parallel_ff=False,
64 | ff_mult=4,
65 | norm_context=False
66 | ):
67 | super().__init__()
68 | self.heads = heads
69 | self.scale = dim_head ** -0.5
70 | inner_dim = heads * dim_head
71 | context_dim = default(context_dim, dim)
72 |
73 | self.norm = LayerNorm(dim)
74 | self.context_norm = LayerNorm(context_dim) if norm_context else nn.Identity()
75 |
76 | self.to_q = nn.Linear(dim, inner_dim, bias=False)
77 | self.to_kv = nn.Linear(context_dim, dim_head * 2, bias=False)
78 | self.to_out = nn.Linear(inner_dim, dim, bias=False)
79 |
80 | # whether to have parallel feedforward
81 |
82 | ff_inner_dim = ff_mult * dim
83 |
84 | self.ff = nn.Sequential(
85 | nn.Linear(dim, ff_inner_dim * 2, bias=False),
86 | SwiGLU(),
87 | nn.Linear(ff_inner_dim, dim, bias=False)
88 | ) if parallel_ff else None
89 |
90 | def forward(self, x, context):
91 | """
92 | einstein notation
93 | b - batch
94 | h - heads
95 | n, i, j - sequence length (base sequence length, source, target)
96 | d - feature dimension
97 | """
98 |
99 | # pre-layernorm, for queries and context
100 | x = self.norm(x)
101 | context = self.context_norm(context)
102 |
103 | # get queries
104 | q = self.to_q(x)
105 | q = rearrange(q, 'b n (h d) -> b h n d', h=self.heads)
106 |
107 | # scale
108 | q = q * self.scale
109 |
110 | # get key / values
111 | k, v = self.to_kv(context).chunk(2, dim=-1)
112 |
113 | # query / key similarity
114 | sim = einsum('b h i d, b j d -> b h i j', q, k)
115 |
116 | # attention
117 | sim = sim - sim.amax(dim=-1, keepdim=True)
118 | attn = sim.softmax(dim=-1)
119 |
120 | # aggregate
121 | out = einsum('b h i j, b j d -> b h i d', attn, v)
122 |
123 | # merge and combine heads
124 | out = rearrange(out, 'b h n d -> b n (h d)')
125 | out = self.to_out(out)
126 |
127 | # add parallel feedforward (for multimodal layers)
128 | if exists(self.ff):
129 | out = out + self.ff(x)
130 |
131 | return out
132 |
--------------------------------------------------------------------------------
/LaViLa/lavila/models/distributed_utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Facebook, Inc. and its affiliates.
2 |
3 | # This source code is licensed under the MIT license found in the
4 | # LICENSE file in the root directory of this source tree.
5 |
6 | # Part of the code is from
7 | # `https://github.com/facebookresearch/vissl/blob/main/vissl/utils/distributed_utils.py` and
8 | # `https://github.com/facebookresearch/ClassyVision/blob/main/classy_vision/generic/distributed_util.py`
9 | # Modified by Yue Zhao
10 | # The original code is under MIT License
11 |
12 | import torch
13 | import torch.distributed as dist
14 | from typing import Tuple
15 |
16 |
17 | def convert_to_distributed_tensor(tensor: torch.Tensor) -> Tuple[torch.Tensor, str]:
18 | """
19 | For some backends, such as NCCL, communication only works if the
20 | tensor is on the GPU. This helper function converts to the correct
21 | device and returns the tensor + original device.
22 | """
23 | orig_device = "cpu" if not tensor.is_cuda else "gpu"
24 | if (
25 | torch.distributed.is_available()
26 | and torch.distributed.get_backend() == torch.distributed.Backend.NCCL
27 | and not tensor.is_cuda
28 | ):
29 | tensor = tensor.cuda()
30 | return (tensor, orig_device)
31 |
32 |
33 | def convert_to_normal_tensor(tensor: torch.Tensor, orig_device: str) -> torch.Tensor:
34 | """
35 | For some backends, such as NCCL, communication only works if the
36 | tensor is on the GPU. This converts the tensor back to original device.
37 | """
38 | if tensor.is_cuda and orig_device == "cpu":
39 | tensor = tensor.cpu()
40 | return tensor
41 |
42 |
43 | def is_distributed_training_run() -> bool:
44 | return (
45 | torch.distributed.is_available()
46 | and torch.distributed.is_initialized()
47 | and (torch.distributed.get_world_size() > 1)
48 | )
49 |
50 |
51 | class GatherLayer(torch.autograd.Function):
52 | """
53 | Gather tensors from all workers with support for backward propagation:
54 | This implementation does not cut the gradients as torch.distributed.all_gather does.
55 | """
56 |
57 | @staticmethod
58 | def forward(ctx, x):
59 | output = [torch.zeros_like(x) for _ in range(dist.get_world_size())]
60 | dist.all_gather(output, x)
61 | return tuple(output)
62 |
63 | @staticmethod
64 | def backward(ctx, *grads):
65 | all_gradients = torch.stack(grads)
66 | dist.all_reduce(all_gradients)
67 | return all_gradients[dist.get_rank()]
68 |
69 |
70 | def gather_from_all(tensor: torch.Tensor) -> torch.Tensor:
71 | """
72 | Similar to classy_vision.generic.distributed_util.gather_from_all
73 | except that it does not cut the gradients
74 | """
75 | if tensor.ndim == 0:
76 | # 0 dim tensors cannot be gathered. so unsqueeze
77 | tensor = tensor.unsqueeze(0)
78 |
79 | if is_distributed_training_run():
80 | tensor, orig_device = convert_to_distributed_tensor(tensor)
81 | gathered_tensors = GatherLayer.apply(tensor)
82 | gathered_tensors = [
83 | convert_to_normal_tensor(_tensor, orig_device)
84 | for _tensor in gathered_tensors
85 | ]
86 | else:
87 | gathered_tensors = [tensor]
88 | gathered_tensor = torch.cat(gathered_tensors, 0)
89 | return gathered_tensor
90 |
--------------------------------------------------------------------------------
/LaViLa/lavila/models/utils.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 |
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 |
7 | from collections import OrderedDict
8 | import functools
9 | import torch
10 | import torch.nn.functional as F
11 |
12 |
13 | def inflate_positional_embeds(
14 | current_model_state_dict, new_state_dict,
15 | num_frames=4,
16 | load_temporal_fix='bilinear',
17 | ):
18 | # allow loading of timesformer with fewer num_frames
19 | curr_keys = list(current_model_state_dict.keys())
20 | if 'visual.temporal_embed' in new_state_dict and 'visual.temporal_embed' in curr_keys:
21 | load_temporal_embed = new_state_dict['visual.temporal_embed']
22 | load_num_frames = load_temporal_embed.shape[1]
23 | curr_num_frames = num_frames
24 | embed_dim = load_temporal_embed.shape[2]
25 |
26 | if load_num_frames != curr_num_frames:
27 | if load_num_frames > curr_num_frames:
28 | print(f'### loaded SpaceTimeTransformer model has MORE frames than current...'
29 | f'### loading weights, filling in the extras via {load_temporal_fix}')
30 | new_temporal_embed = load_temporal_embed[:, :curr_num_frames, :]
31 | else:
32 | print(f'### loaded SpaceTimeTransformer model has FEWER frames than current...'
33 | f'### loading weights, filling in the extras via {load_temporal_fix}')
34 | if load_temporal_fix == 'zeros':
35 | new_temporal_embed = torch.zeros([load_temporal_embed.shape[0], curr_num_frames, embed_dim])
36 | new_temporal_embed[:, :load_num_frames] = load_temporal_embed
37 | elif load_temporal_fix in ['interp', 'bilinear']:
38 | # interpolate
39 | # unsqueeze so pytorch thinks its an image
40 | mode = 'nearest'
41 | if load_temporal_fix == 'bilinear':
42 | mode = 'bilinear'
43 | load_temporal_embed = load_temporal_embed.unsqueeze(0)
44 | new_temporal_embed = F.interpolate(load_temporal_embed,
45 | (curr_num_frames, embed_dim), mode=mode).squeeze(0)
46 | else:
47 | raise NotImplementedError
48 | new_state_dict['visual.temporal_embed'] = new_temporal_embed
49 | # allow loading with smaller spatial patches. assumes custom border crop, to append the
50 | # border patches to the input sequence
51 | if 'visual.pos_embed' in new_state_dict and 'visual.pos_embed' in curr_keys:
52 | load_pos_embed = new_state_dict['visual.pos_embed']
53 | load_num_patches = load_pos_embed.shape[1]
54 | curr_pos_embed = current_model_state_dict['visual.pos_embed']
55 | if load_num_patches != curr_pos_embed.shape[1]:
56 | raise NotImplementedError(
57 | 'Loading models with different spatial resolution / patch number not yet implemented, sorry.')
58 |
59 | return new_state_dict
60 |
61 |
62 | def rsetattr(obj, attr, val):
63 | pre, _, post = attr.rpartition('.')
64 | return setattr(rgetattr(obj, pre) if pre else obj, post, val)
65 |
66 |
67 | def rgetattr(obj, attr, *args):
68 | def _getattr(obj, attr):
69 | return getattr(obj, attr, *args)
70 | return functools.reduce(_getattr, [obj] + attr.split('.'))
71 |
72 |
73 | # util functions to convert CLIP-style model keys to TimeSformer-style
74 | def remap_keys(clip_state_dict, transformer_layers=12):
75 | remapped_state_dict = OrderedDict()
76 | key_mapping = {
77 | "class_embedding": "cls_token",
78 | "positional_embedding": "pos_embed",
79 | "conv1.weight": "patch_embed.proj.weight",
80 | "ln_pre.weight": "ln_pre.weight",
81 | "ln_pre.bias": "ln_pre.bias",
82 | "ln_post.weight": "norm.weight",
83 | "ln_post.bias": "norm.bias",
84 | }
85 | for layer in range(transformer_layers):
86 | key_mapping[f"transformer.resblocks.{layer}.attn.in_proj_weight"] = f"blocks.{layer}.attn.qkv.weight"
87 | key_mapping[f"transformer.resblocks.{layer}.attn.in_proj_bias"] = f"blocks.{layer}.attn.qkv.bias"
88 | key_mapping[f"transformer.resblocks.{layer}.attn.out_proj.weight"] = f"blocks.{layer}.attn.proj.weight"
89 | key_mapping[f"transformer.resblocks.{layer}.attn.out_proj.bias"] = f"blocks.{layer}.attn.proj.bias"
90 | key_mapping[f"transformer.resblocks.{layer}.ln_1.weight"] = f"blocks.{layer}.norm1.weight"
91 | key_mapping[f"transformer.resblocks.{layer}.ln_1.bias"] = f"blocks.{layer}.norm1.bias"
92 | key_mapping[f"transformer.resblocks.{layer}.mlp.c_fc.weight"] = f"blocks.{layer}.mlp.fc1.weight"
93 | key_mapping[f"transformer.resblocks.{layer}.mlp.c_fc.bias"] = f"blocks.{layer}.mlp.fc1.bias"
94 | key_mapping[f"transformer.resblocks.{layer}.mlp.c_proj.weight"] = f"blocks.{layer}.mlp.fc2.weight"
95 | key_mapping[f"transformer.resblocks.{layer}.mlp.c_proj.bias"] = f"blocks.{layer}.mlp.fc2.bias"
96 | key_mapping[f"transformer.resblocks.{layer}.ln_2.weight"] = f"blocks.{layer}.norm2.weight"
97 | key_mapping[f"transformer.resblocks.{layer}.ln_2.bias"] = f"blocks.{layer}.norm2.bias"
98 |
99 | for key in clip_state_dict:
100 | if key == 'proj':
101 | continue # due to possible dim mismatch, we load this later
102 | if key == "class_embedding":
103 | clip_state_dict[key] = clip_state_dict[key].unsqueeze(0).unsqueeze(0)
104 | if key == "positional_embedding":
105 | clip_state_dict[key] = clip_state_dict[key].unsqueeze(0)
106 | remapped_state_dict[key_mapping[key]] = clip_state_dict[key]
107 |
108 | return remapped_state_dict
109 |
--------------------------------------------------------------------------------
/LaViLa/lavila/utils/__pycache__/distributed.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/LaViLa/lavila/utils/__pycache__/distributed.cpython-39.pyc
--------------------------------------------------------------------------------
/LaViLa/lavila/utils/__pycache__/preprocess.cpython-39.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/LaViLa/lavila/utils/__pycache__/preprocess.cpython-39.pyc
--------------------------------------------------------------------------------
/LaViLa/lavila/utils/distributed.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 |
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 |
7 | import os
8 | import shutil
9 | import torch
10 | import torch.distributed as dist
11 |
12 |
13 | def get_model(model):
14 | if isinstance(model, torch.nn.DataParallel) \
15 | or isinstance(model, torch.nn.parallel.DistributedDataParallel):
16 | return model.module
17 | else:
18 | return model
19 |
20 |
21 | def setup_for_distributed(is_master):
22 | """
23 | This function disables printing when not in master process
24 | """
25 | import builtins as __builtin__
26 | builtin_print = __builtin__.print
27 |
28 | def print(*args, **kwargs):
29 | force = kwargs.pop('force', False)
30 | if is_master or force:
31 | builtin_print(*args, **kwargs)
32 |
33 | __builtin__.print = print
34 |
35 |
36 | def is_dist_avail_and_initialized():
37 | if not dist.is_available():
38 | return False
39 | if not dist.is_initialized():
40 | return False
41 | return True
42 |
43 |
44 | def get_world_size():
45 | if not is_dist_avail_and_initialized():
46 | return 1
47 | else:
48 | return dist.get_world_size()
49 |
50 |
51 | def get_rank():
52 | if not is_dist_avail_and_initialized():
53 | return 0
54 | return dist.get_rank()
55 |
56 |
57 | def is_main_process():
58 | return get_rank() == 0
59 |
60 |
61 | def save_on_master(state, is_best, output_dir, is_epoch=True):
62 | if is_main_process():
63 | ckpt_path = f'{output_dir}/checkpoint.pt'
64 | best_path = f'{output_dir}/checkpoint_best.pt'
65 | if is_best:
66 | torch.save(state, best_path)
67 | if is_epoch:
68 | if isinstance(state['epoch'], int):
69 | ckpt2_path = '{}/checkpoint_{:04d}.pt'.format(output_dir, state['epoch'])
70 | else:
71 | ckpt2_path = '{}/checkpoint_{:.4f}.pt'.format(output_dir, state['epoch'])
72 | torch.save(state, ckpt_path)
73 | shutil.copy(ckpt_path, ckpt2_path)
74 |
75 |
76 | def init_distributed_mode(args):
77 | if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
78 | args.rank = int(os.environ["RANK"])
79 | args.world_size = int(os.environ['WORLD_SIZE'])
80 | args.gpu = int(os.environ['LOCAL_RANK'])
81 | elif 'SLURM_PROCID' in os.environ:
82 | args.rank = int(os.environ['SLURM_PROCID'])
83 | args.gpu = args.rank % torch.cuda.device_count()
84 | else:
85 | print('Not using distributed mode')
86 | args.distributed = False
87 | return
88 |
89 | args.distributed = True
90 |
91 | torch.cuda.set_device(args.gpu)
92 | args.dist_backend = 'nccl'
93 | print('| distributed init (rank {}): {}'.format(
94 | args.rank, args.dist_url), flush=True)
95 | torch.distributed.init_process_group(
96 | backend=args.dist_backend,
97 | init_method=args.dist_url,
98 | world_size=args.world_size,
99 | rank=args.rank
100 | )
101 | torch.distributed.barrier()
102 | setup_for_distributed(args.rank == 0)
103 |
--------------------------------------------------------------------------------
/LaViLa/lavila/utils/evaluation.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 |
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 |
7 | import numpy as np
8 | import torch
9 |
10 |
11 | def accuracy(output, target, topk=(1,)):
12 | """Computes the accuracy over the k top predictions for the specified values of k"""
13 | with torch.no_grad():
14 | maxk = max(topk)
15 | batch_size = target.size(0)
16 |
17 | _, pred = output.topk(maxk, 1, True, True)
18 | pred = pred.t()
19 | correct = pred.eq(target.reshape(1, -1).expand_as(pred))
20 |
21 | res = []
22 | for k in topk:
23 | correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
24 | res.append(correct_k.mul_(100.0 / batch_size))
25 | return res
26 |
27 |
28 | def get_mean_accuracy(cm):
29 | list_acc = []
30 | for i in range(len(cm)):
31 | acc = 0
32 | if cm[i, :].sum() > 0:
33 | acc = cm[i, i] / cm[i, :].sum()
34 | list_acc.append(acc)
35 |
36 | return 100 * np.mean(list_acc), 100 * np.trace(cm) / np.sum(cm)
37 |
--------------------------------------------------------------------------------
/LaViLa/lavila/utils/evaluation_charades.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 |
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 |
7 | import numpy as np
8 |
9 |
10 | def compute_map(submission_array, gt_array):
11 | """ Returns mAP, weighted mAP, and AP array """
12 | m_aps = []
13 | n_classes = submission_array.shape[1]
14 | for oc_i in range(n_classes):
15 | sorted_idxs = np.argsort(-submission_array[:, oc_i])
16 | tp = gt_array[:, oc_i][sorted_idxs] == 1
17 | fp = np.invert(tp)
18 | n_pos = tp.sum()
19 | if n_pos < 0.1:
20 | m_aps.append(float('nan'))
21 | continue
22 | fp.sum()
23 | f_pcs = np.cumsum(fp)
24 | t_pcs = np.cumsum(tp)
25 | prec = t_pcs / (f_pcs+t_pcs).astype(float)
26 | avg_prec = 0
27 | for i in range(submission_array.shape[0]):
28 | if tp[i]:
29 | avg_prec += prec[i]
30 | m_aps.append(avg_prec / n_pos.astype(float))
31 | m_aps = np.array(m_aps)
32 | m_ap = np.mean(m_aps)
33 | w_ap = (m_aps * gt_array.sum(axis=0) / gt_array.sum().sum().astype(float))
34 | return m_ap, w_ap, m_aps
35 |
36 |
37 | def charades_map(submission_array, gt_array):
38 | """
39 | Approximate version of the charades evaluation function
40 | For precise numbers, use the submission file with the official matlab script
41 | """
42 | fix = submission_array.copy()
43 | empty = np.sum(gt_array, axis=1) == 0
44 | fix[empty, :] = np.NINF
45 | return compute_map(fix, gt_array)
46 |
47 |
48 | def create_submission(video_list, predictions, out_file):
49 | assert len(video_list) == predictions.shape[0]
50 | with open(out_file, 'w') as f:
51 | for i, video_id in enumerate(video_list):
52 | pred_str = ' '.join(map(lambda x: str(x), predictions[i].tolist()))
53 | f.write('{} {}\n\n'.format(video_id, pred_str))
54 |
--------------------------------------------------------------------------------
/LaViLa/lavila/utils/evaluation_egomcq.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 |
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 |
7 | import torch
8 |
9 |
10 | def egomcq_accuracy_metrics(preds, labels, types):
11 | metrics = {}
12 | type_list = torch.unique(types)
13 | group_list = ["Intra-video", "Inter-video"]
14 | for type_i, group_i in zip(type_list, group_list):
15 | correct = 0
16 | total = 0
17 | for pred, label, type in zip(preds, labels, types):
18 | if type == type_i:
19 | pred_ = torch.argmax(pred)
20 | if pred_.item() == label.item():
21 | correct += 1
22 | total += 1
23 | accuracy = correct/total
24 | metrics[group_i] = accuracy * 100
25 | return metrics
26 |
--------------------------------------------------------------------------------
/LaViLa/lavila/utils/evaluation_ek100cls.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 |
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 |
7 | # Part of the code is from https://github.com/fpv-iplab/rulstm/blob/master/RULSTM/utils.py
8 | # Modified by Yue Zhao
9 |
10 | import numpy as np
11 |
12 |
13 | def get_marginal_indexes(actions, mode):
14 | """For each verb/noun retrieve the list of actions containing that verb/name
15 | Input:
16 | mode: "verb" or "noun"
17 | Output:
18 | a list of numpy array of indexes. If verb/noun 3 is contained in actions 2,8,19,
19 | then output[3] will be np.array([2,8,19])
20 | """
21 | vi = []
22 | for v in range(actions[mode].max()+1):
23 | vals = actions[actions[mode] == v].index.values
24 | if len(vals) > 0:
25 | vi.append(vals)
26 | else:
27 | vi.append(np.array([0]))
28 | return vi
29 |
30 |
31 | def marginalize(probs, indexes):
32 | mprobs = []
33 | for ilist in indexes:
34 | mprobs.append(probs[:, ilist].sum(1))
35 | return np.array(mprobs).T
36 |
--------------------------------------------------------------------------------
/LaViLa/lavila/utils/meter.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 |
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 |
7 | import torch
8 | import torch.distributed as dist
9 | from lavila.utils import distributed as dist_utils
10 |
11 |
12 | class AverageMeter(object):
13 | """Computes and stores the average and current value"""
14 | def __init__(self, name, fmt=':f'):
15 | self.name = name
16 | self.fmt = fmt
17 | self.reset()
18 |
19 | def reset(self):
20 | self.val = 0
21 | self.avg = 0
22 | self.sum = 0
23 | self.count = 0
24 |
25 | def update(self, val, n=1):
26 | self.val = val
27 | self.sum += val * n
28 | self.count += n
29 | self.avg = self.sum / self.count
30 |
31 | def synchronize(self):
32 | if not dist_utils.is_dist_avail_and_initialized():
33 | return
34 | t = torch.tensor([self.sum, self.count], dtype=torch.float64, device='cuda')
35 | dist.barrier()
36 | dist.all_reduce(t)
37 | t = t.tolist()
38 | self.sum = int(t[0])
39 | self.count = t[1]
40 | self.avg = self.sum / self.count
41 |
42 | def __str__(self):
43 | fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})'
44 | return fmtstr.format(**self.__dict__)
45 |
46 |
47 | class ProgressMeter(object):
48 | def __init__(self, num_batches, meters, prefix=""):
49 | self.batch_fmtstr = self._get_batch_fmtstr(num_batches)
50 | self.meters = meters
51 | self.prefix = prefix
52 |
53 | def display(self, batch):
54 | entries = [self.prefix + self.batch_fmtstr.format(batch)]
55 | entries += [str(meter) for meter in self.meters]
56 | print('\t'.join(entries))
57 |
58 | def synchronize(self):
59 | for meter in self.meters:
60 | meter.synchronize()
61 |
62 | def _get_batch_fmtstr(self, num_batches):
63 | num_digits = len(str(num_batches // 1))
64 | fmt = '{:' + str(num_digits) + 'd}'
65 | return '[' + fmt + '/' + fmt.format(num_batches) + ']'
66 |
--------------------------------------------------------------------------------
/LaViLa/lavila/utils/preprocess.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 |
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 |
7 | import csv
8 |
9 | from lavila.models.tokenizer import MyBertTokenizer, MyDistilBertTokenizer, MyGPT2Tokenizer, SimpleTokenizer
10 |
11 |
12 | def generate_label_map(dataset):
13 | if dataset == 'ek100_cls':
14 | print("Preprocess ek100 action label space")
15 | vn_list = []
16 | mapping_vn2narration = {}
17 | for f in [
18 | 'datasets/EK100/epic-kitchens-100-annotations/EPIC_100_train.csv',
19 | 'datasets/EK100/epic-kitchens-100-annotations/EPIC_100_validation.csv',
20 | ]:
21 | csv_reader = csv.reader(open(f))
22 | _ = next(csv_reader) # skip the header
23 | for row in csv_reader:
24 | vn = '{}:{}'.format(int(row[10]), int(row[12]))
25 | narration = row[8]
26 | if vn not in vn_list:
27 | vn_list.append(vn)
28 | if vn not in mapping_vn2narration:
29 | mapping_vn2narration[vn] = [narration]
30 | else:
31 | mapping_vn2narration[vn].append(narration)
32 | # mapping_vn2narration[vn] = [narration]
33 | vn_list = sorted(vn_list)
34 | print('# of action= {}'.format(len(vn_list)))
35 | mapping_vn2act = {vn: i for i, vn in enumerate(vn_list)}
36 | labels = [list(set(mapping_vn2narration[vn_list[i]])) for i in range(len(mapping_vn2act))]
37 | print(labels[:5])
38 | elif dataset == 'charades_ego':
39 | print("=> preprocessing charades_ego action label space")
40 | vn_list = []
41 | labels = []
42 | with open('datasets/CharadesEgo/CharadesEgo/Charades_v1_classes.txt') as f:
43 | csv_reader = csv.reader(f)
44 | for row in csv_reader:
45 | vn = row[0][:4]
46 | vn_list.append(vn)
47 | narration = row[0][5:]
48 | labels.append(narration)
49 | mapping_vn2act = {vn: i for i, vn in enumerate(vn_list)}
50 | print(labels[:5])
51 | elif dataset == 'egtea':
52 | print("=> preprocessing egtea action label space")
53 | labels = []
54 | with open('datasets/EGTEA/action_idx.txt') as f:
55 | for row in f:
56 | row = row.strip()
57 | narration = ' '.join(row.split(' ')[:-1])
58 | labels.append(narration.replace('_', ' ').lower())
59 | # labels.append(narration)
60 | mapping_vn2act = {label: i for i, label in enumerate(labels)}
61 | print(len(labels), labels[:5])
62 | else:
63 | raise NotImplementedError
64 | return labels, mapping_vn2act
65 |
66 |
67 | def generate_tokenizer(model):
68 | if model.endswith('DISTILBERT_BASE'):
69 | tokenizer = MyDistilBertTokenizer('distilbert-base-uncased')
70 | elif model.endswith('BERT_BASE'):
71 | tokenizer = MyBertTokenizer('bert-base-uncased')
72 | elif model.endswith('BERT_LARGE'):
73 | tokenizer = MyBertTokenizer('bert-large-uncased')
74 | elif model.endswith('GPT2'):
75 | tokenizer = MyGPT2Tokenizer('gpt2', add_bos=True)
76 | elif model.endswith('GPT2_MEDIUM'):
77 | tokenizer = MyGPT2Tokenizer('gpt2-medium', add_bos=True)
78 | elif model.endswith('GPT2_LARGE'):
79 | tokenizer = MyGPT2Tokenizer('gpt2-large', add_bos=True)
80 | elif model.endswith('GPT2_XL'):
81 | tokenizer = MyGPT2Tokenizer('gpt2-xl', add_bos=True)
82 | else:
83 | print("Using SimpleTokenizer because of model '{}'. "
84 | "Please check if this is what you want".format(model))
85 | tokenizer = SimpleTokenizer()
86 | return tokenizer
87 |
--------------------------------------------------------------------------------
/LaViLa/lavila/utils/random.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 |
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 |
7 | import random
8 | import numpy as np
9 | import torch
10 |
11 |
12 | def random_seed(seed=42, rank=0):
13 | torch.manual_seed(seed + rank)
14 | np.random.seed(seed + rank)
15 | random.seed(seed + rank)
16 |
--------------------------------------------------------------------------------
/LaViLa/lavila/utils/scheduler.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 |
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 |
7 | import numpy as np
8 |
9 |
10 | def cosine_scheduler(base_value, final_value, epochs, niter_per_ep, warmup_epochs=0, start_warmup_value=0):
11 | warmup_schedule = np.array([])
12 | warmup_iters = warmup_epochs * niter_per_ep
13 | if warmup_epochs > 0:
14 | warmup_schedule = np.linspace(start_warmup_value, base_value, warmup_iters)
15 |
16 | iters = np.arange(epochs * niter_per_ep - warmup_iters)
17 | schedule = final_value + 0.5 * (base_value - final_value) * (1 + np.cos(np.pi * iters / len(iters)))
18 |
19 | schedule = np.concatenate((warmup_schedule, schedule))
20 | assert len(schedule) == epochs * niter_per_ep
21 | return schedule
22 |
--------------------------------------------------------------------------------
/LaViLa/requirements.txt:
--------------------------------------------------------------------------------
1 | timm==0.5.4
2 | torch==1.10.1
3 | torchvision==0.11.2
4 | decord==0.6.0
5 | einops==0.4.1
6 | pandas==1.4.2
7 | pytorchvideo==0.1.5
8 | transformers==4.27
9 | ftfy==4.4.3
10 | spacy==3.4.1
11 | scikit-learn==1.1.1
12 | git+https://github.com/Maluuba/nlg-eval.git@master
13 |
--------------------------------------------------------------------------------
/LaViLa/run_with_submitit_finetune_classification.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 |
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | """
7 | A script to run multinode training with submitit.
8 | """
9 | import argparse
10 | import os
11 | import uuid
12 | from pathlib import Path
13 |
14 | import main_finetune_classification as main_finetune
15 | import submitit
16 |
17 |
18 | def parse_args():
19 | parser = main_finetune.get_args_parser()
20 | parser = argparse.ArgumentParser("Submitit for lavila fine-tuning", parents=[parser])
21 | parser.add_argument("--ngpus", default=8, type=int, help="Number of gpus to request on each node")
22 | parser.add_argument("--nodes", default=8, type=int, help="Number of nodes to request")
23 | parser.add_argument("--timeout", default=2880, type=int, help="Duration of the job")
24 | parser.add_argument("--job_dir", default="", type=str, help="Job dir. Leave empty for automatic.")
25 |
26 | parser.add_argument("--partition", default="learnlab", type=str, help="Partition where to submit")
27 | parser.add_argument("--use_volta32", action='store_true', help="Big models? Use this")
28 | parser.add_argument('--comment', default="", type=str,
29 | help='Comment to pass to scheduler, e.g. priority message')
30 | return parser.parse_args()
31 |
32 |
33 | def get_shared_folder() -> Path:
34 | user = os.getenv("USER")
35 | if Path("/checkpoint/").is_dir():
36 | p = Path(f"/checkpoint/{user}/experiments/lavila_ft")
37 | p.mkdir(exist_ok=True)
38 | return p
39 | raise RuntimeError("No shared folder available")
40 |
41 |
42 | def get_init_file():
43 | # Init file must not exist, but its parent dir must exist.
44 | os.makedirs(str(get_shared_folder()), exist_ok=True)
45 | init_file = get_shared_folder() / f"{uuid.uuid4().hex}_init"
46 | if init_file.exists():
47 | os.remove(str(init_file))
48 | return init_file
49 |
50 |
51 | class Trainer(object):
52 | def __init__(self, args):
53 | self.args = args
54 |
55 | def __call__(self):
56 | import main_finetune_classification as main_finetune
57 |
58 | self._setup_gpu_args()
59 | main_finetune.main(self.args)
60 |
61 | def checkpoint(self):
62 | import submitit
63 |
64 | self.args.dist_url = get_init_file().as_uri()
65 | print("Requeuing ", self.args)
66 | empty_trainer = type(self)(self.args)
67 | return submitit.helpers.DelayedSubmission(empty_trainer)
68 |
69 | def _setup_gpu_args(self):
70 | import submitit
71 | from pathlib import Path
72 |
73 | job_env = submitit.JobEnvironment()
74 | self.args.output_dir = Path(str(self.args.output_dir).replace("%j", str(job_env.job_id)))
75 | self.args.gpu = job_env.local_rank
76 | self.args.rank = job_env.global_rank
77 | self.args.world_size = job_env.num_tasks
78 | print(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}")
79 |
80 |
81 | def main():
82 | args = parse_args()
83 | if args.job_dir == "":
84 | args.job_dir = get_shared_folder() / "%j"
85 |
86 | # Note that the folder will depend on the job_id, to easily track experiments
87 | executor = submitit.AutoExecutor(folder=args.job_dir, slurm_max_num_timeout=30)
88 |
89 | num_gpus_per_node = args.ngpus
90 | nodes = args.nodes
91 | timeout_min = args.timeout
92 |
93 | partition = args.partition
94 | kwargs = {}
95 | if args.use_volta32:
96 | kwargs['slurm_constraint'] = 'volta32gb'
97 | if args.comment:
98 | kwargs['slurm_comment'] = args.comment
99 |
100 | executor.update_parameters(
101 | mem_gb=40 * num_gpus_per_node,
102 | gpus_per_node=num_gpus_per_node,
103 | tasks_per_node=num_gpus_per_node, # one task per GPU
104 | cpus_per_task=10,
105 | nodes=nodes,
106 | timeout_min=timeout_min, # max is 60 * 72
107 | # Below are cluster dependent parameters
108 | slurm_partition=partition,
109 | slurm_signal_delay_s=120,
110 | **kwargs
111 | )
112 |
113 | executor.update_parameters(name="lavila_ft")
114 |
115 | args.dist_url = get_init_file().as_uri()
116 | args.output_dir = args.job_dir
117 |
118 | trainer = Trainer(args)
119 | job = executor.submit(trainer)
120 |
121 | print("Submitted job_id:", job.job_id)
122 |
123 |
124 | if __name__ == "__main__":
125 | main()
126 |
--------------------------------------------------------------------------------
/LaViLa/run_with_submitit_finetune_retrieval.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 |
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | """
7 | A script to run multinode training with submitit.
8 | """
9 | import argparse
10 | import os
11 | import uuid
12 | from pathlib import Path
13 |
14 | import main_finetune_retrieval as main_finetune
15 | import submitit
16 |
17 |
18 | def parse_args():
19 | parser = main_finetune.get_args_parser()
20 | parser = argparse.ArgumentParser("Submitit for lavila fine-tuning", parents=[parser])
21 | parser.add_argument("--ngpus", default=8, type=int, help="Number of gpus to request on each node")
22 | parser.add_argument("--nodes", default=8, type=int, help="Number of nodes to request")
23 | parser.add_argument("--timeout", default=2880, type=int, help="Duration of the job")
24 | parser.add_argument("--job_dir", default="", type=str, help="Job dir. Leave empty for automatic.")
25 |
26 | parser.add_argument("--partition", default="learnlab", type=str, help="Partition where to submit")
27 | parser.add_argument("--use_volta32", action='store_true', help="Big models? Use this")
28 | parser.add_argument('--comment', default="", type=str,
29 | help='Comment to pass to scheduler, e.g. priority message')
30 | return parser.parse_args()
31 |
32 |
33 | def get_shared_folder() -> Path:
34 | user = os.getenv("USER")
35 | if Path("/checkpoint/").is_dir():
36 | p = Path(f"/checkpoint/{user}/experiments/lavila_ft")
37 | p.mkdir(exist_ok=True)
38 | return p
39 | raise RuntimeError("No shared folder available")
40 |
41 |
42 | def get_init_file():
43 | # Init file must not exist, but its parent dir must exist.
44 | os.makedirs(str(get_shared_folder()), exist_ok=True)
45 | init_file = get_shared_folder() / f"{uuid.uuid4().hex}_init"
46 | if init_file.exists():
47 | os.remove(str(init_file))
48 | return init_file
49 |
50 |
51 | class Trainer(object):
52 | def __init__(self, args):
53 | self.args = args
54 |
55 | def __call__(self):
56 | import main_finetune_retrieval as main_finetune
57 |
58 | self._setup_gpu_args()
59 | main_finetune.main(self.args)
60 |
61 | def checkpoint(self):
62 | import submitit
63 |
64 | self.args.dist_url = get_init_file().as_uri()
65 | print("Requeuing ", self.args)
66 | empty_trainer = type(self)(self.args)
67 | return submitit.helpers.DelayedSubmission(empty_trainer)
68 |
69 | def _setup_gpu_args(self):
70 | import submitit
71 | from pathlib import Path
72 |
73 | job_env = submitit.JobEnvironment()
74 | self.args.output_dir = Path(str(self.args.output_dir).replace("%j", str(job_env.job_id)))
75 | self.args.gpu = job_env.local_rank
76 | self.args.rank = job_env.global_rank
77 | self.args.world_size = job_env.num_tasks
78 | print(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}")
79 |
80 |
81 | def main():
82 | args = parse_args()
83 | if args.job_dir == "":
84 | args.job_dir = get_shared_folder() / "%j"
85 |
86 | # Note that the folder will depend on the job_id, to easily track experiments
87 | executor = submitit.AutoExecutor(folder=args.job_dir, slurm_max_num_timeout=30)
88 |
89 | num_gpus_per_node = args.ngpus
90 | nodes = args.nodes
91 | timeout_min = args.timeout
92 |
93 | partition = args.partition
94 | kwargs = {}
95 | if args.use_volta32:
96 | kwargs['slurm_constraint'] = 'volta32gb'
97 | if args.comment:
98 | kwargs['slurm_comment'] = args.comment
99 |
100 | executor.update_parameters(
101 | mem_gb=40 * num_gpus_per_node,
102 | gpus_per_node=num_gpus_per_node,
103 | tasks_per_node=num_gpus_per_node, # one task per GPU
104 | cpus_per_task=10,
105 | nodes=nodes,
106 | timeout_min=timeout_min, # max is 60 * 72
107 | # Below are cluster dependent parameters
108 | slurm_partition=partition,
109 | slurm_signal_delay_s=120,
110 | **kwargs
111 | )
112 |
113 | executor.update_parameters(name="lavila_ft")
114 |
115 | args.dist_url = get_init_file().as_uri()
116 | args.output_dir = args.job_dir
117 |
118 | trainer = Trainer(args)
119 | job = executor.submit(trainer)
120 |
121 | print("Submitted job_id:", job.job_id)
122 |
123 |
124 | if __name__ == "__main__":
125 | main()
126 |
--------------------------------------------------------------------------------
/LaViLa/run_with_submitit_infer_narrator.py:
--------------------------------------------------------------------------------
1 |
2 |
3 | # Copyright (c) Meta Platforms, Inc. and affiliates.
4 | # All rights reserved.
5 |
6 | # This source code is licensed under the license found in the
7 | # LICENSE file in the root directory of this source tree.
8 | """
9 | A script to run multinode training with submitit.
10 | """
11 | import argparse
12 | import os
13 | import uuid
14 | from pathlib import Path
15 |
16 | import main_infer_narrator
17 | import submitit
18 |
19 |
20 | def parse_args():
21 | parser = main_infer_narrator.get_args_parser()
22 | parser = argparse.ArgumentParser("Submitit for inferring narrator", parents=[parser])
23 | parser.add_argument("--ngpus", default=8, type=int, help="Number of gpus to request on each node")
24 | parser.add_argument("--nodes", default=4, type=int, help="Number of nodes to request")
25 | parser.add_argument("--timeout", default=2880, type=int, help="Duration of the job")
26 | parser.add_argument("--job_dir", default="", type=str, help="Job dir. Leave empty for automatic.")
27 |
28 | parser.add_argument("--partition", default="learnlab", type=str, help="Partition where to submit")
29 | parser.add_argument("--use_volta32", action='store_true', help="Big models? Use this")
30 | parser.add_argument('--comment', default="", type=str,
31 | help='Comment to pass to scheduler, e.g. priority message')
32 | return parser.parse_args()
33 |
34 |
35 | def get_shared_folder() -> Path:
36 | user = os.getenv("USER")
37 | if Path("/checkpoint/").is_dir():
38 | p = Path(f"/checkpoint/{user}/experiments/extract_caption")
39 | p.mkdir(exist_ok=True)
40 | return p
41 | raise RuntimeError("No shared folder available")
42 |
43 |
44 | def get_init_file():
45 | # Init file must not exist, but its parent dir must exist.
46 | os.makedirs(str(get_shared_folder()), exist_ok=True)
47 | init_file = get_shared_folder() / f"{uuid.uuid4().hex}_init"
48 | if init_file.exists():
49 | os.remove(str(init_file))
50 | return init_file
51 |
52 |
53 | class Trainer(object):
54 | def __init__(self, args):
55 | self.args = args
56 |
57 | def __call__(self):
58 | import main_infer_narrator
59 |
60 | self._setup_gpu_args()
61 | main_infer_narrator.main(self.args)
62 |
63 | def checkpoint(self):
64 | import submitit
65 |
66 | self.args.dist_url = get_init_file().as_uri()
67 | print("Requeuing ", self.args)
68 | empty_trainer = type(self)(self.args)
69 | return submitit.helpers.DelayedSubmission(empty_trainer)
70 |
71 | def _setup_gpu_args(self):
72 | import submitit
73 | from pathlib import Path
74 |
75 | job_env = submitit.JobEnvironment()
76 | self.args.output_dir = Path(str(self.args.output_dir).replace("%j", str(job_env.job_id)))
77 | self.args.gpu = job_env.local_rank
78 | self.args.rank = job_env.global_rank
79 | self.args.world_size = job_env.num_tasks
80 | print(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}")
81 |
82 |
83 | def main():
84 | args = parse_args()
85 | if args.job_dir == "":
86 | args.job_dir = get_shared_folder() / "%j"
87 |
88 | # Note that the folder will depend on the job_id, to easily track experiments
89 | executor = submitit.AutoExecutor(folder=args.job_dir, slurm_max_num_timeout=30)
90 |
91 | num_gpus_per_node = args.ngpus
92 | nodes = args.nodes
93 | timeout_min = args.timeout
94 |
95 | partition = args.partition
96 | kwargs = {}
97 | if args.use_volta32:
98 | kwargs['slurm_constraint'] = 'volta32gb'
99 | if args.comment:
100 | kwargs['slurm_comment'] = args.comment
101 |
102 | executor.update_parameters(
103 | mem_gb=55 * num_gpus_per_node,
104 | gpus_per_node=num_gpus_per_node,
105 | tasks_per_node=num_gpus_per_node, # one task per GPU
106 | cpus_per_task=10,
107 | nodes=nodes,
108 | timeout_min=timeout_min, # max is 60 * 72
109 | # Below are cluster dependent parameters
110 | slurm_partition=partition,
111 | slurm_signal_delay_s=120,
112 | **kwargs
113 | )
114 |
115 | executor.update_parameters(name="infer_narrator")
116 |
117 | args.dist_url = get_init_file().as_uri()
118 | args.output_dir = args.job_dir
119 |
120 | trainer = Trainer(args)
121 | job = executor.submit(trainer)
122 |
123 | print("Submitted job_id:", job.job_id)
124 |
125 |
126 | if __name__ == "__main__":
127 | main()
128 |
--------------------------------------------------------------------------------
/LaViLa/run_with_submitit_pretrain.py:
--------------------------------------------------------------------------------
1 | # Copyright (c) Meta Platforms, Inc. and affiliates.
2 | # All rights reserved.
3 |
4 | # This source code is licensed under the license found in the
5 | # LICENSE file in the root directory of this source tree.
6 | """
7 | A script to run multinode training with submitit.
8 | """
9 | import argparse
10 | import os
11 | import uuid
12 | from pathlib import Path
13 |
14 | import main_pretrain
15 | import submitit
16 |
17 |
18 | def parse_args():
19 | parser = main_pretrain.get_args_parser()
20 | parser = argparse.ArgumentParser("Submitit for lavila pre-training", parents=[parser])
21 | parser.add_argument("--ngpus", default=8, type=int, help="Number of gpus to request on each node")
22 | parser.add_argument("--nodes", default=8, type=int, help="Number of nodes to request")
23 | parser.add_argument("--timeout", default=2880, type=int, help="Duration of the job")
24 | parser.add_argument("--job_dir", default="", type=str, help="Job dir. Leave empty for automatic.")
25 |
26 | parser.add_argument("--partition", default="learnlab", type=str, help="Partition where to submit")
27 | parser.add_argument("--use_volta32", action='store_true', help="Big models? Use this")
28 | parser.add_argument('--comment', default="", type=str,
29 | help='Comment to pass to scheduler, e.g. priority message')
30 | return parser.parse_args()
31 |
32 |
33 | def get_shared_folder() -> Path:
34 | user = os.getenv("USER")
35 | if Path("/checkpoint/").is_dir():
36 | p = Path(f"/checkpoint/{user}/experiments/lavila_pretrain")
37 | p.mkdir(exist_ok=True)
38 | return p
39 | raise RuntimeError("No shared folder available")
40 |
41 |
42 | def get_init_file():
43 | # Init file must not exist, but its parent dir must exist.
44 | os.makedirs(str(get_shared_folder()), exist_ok=True)
45 | init_file = get_shared_folder() / f"{uuid.uuid4().hex}_init"
46 | if init_file.exists():
47 | os.remove(str(init_file))
48 | return init_file
49 |
50 |
51 | class Trainer(object):
52 | def __init__(self, args):
53 | self.args = args
54 |
55 | def __call__(self):
56 | import main_pretrain
57 |
58 | self._setup_gpu_args()
59 | main_pretrain.main(self.args)
60 |
61 | def checkpoint(self):
62 | import submitit
63 |
64 | self.args.dist_url = get_init_file().as_uri()
65 | print("Requeuing ", self.args)
66 | empty_trainer = type(self)(self.args)
67 | return submitit.helpers.DelayedSubmission(empty_trainer)
68 |
69 | def _setup_gpu_args(self):
70 | import submitit
71 | from pathlib import Path
72 |
73 | job_env = submitit.JobEnvironment()
74 | self.args.output_dir = Path(str(self.args.output_dir).replace("%j", str(job_env.job_id)))
75 | self.args.gpu = job_env.local_rank
76 | self.args.rank = job_env.global_rank
77 | self.args.world_size = job_env.num_tasks
78 | print(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}")
79 |
80 |
81 | def main():
82 | args = parse_args()
83 | if args.job_dir == "":
84 | args.job_dir = get_shared_folder() / "%j"
85 |
86 | # Note that the folder will depend on the job_id, to easily track experiments
87 | executor = submitit.AutoExecutor(folder=args.job_dir, slurm_max_num_timeout=30)
88 |
89 | num_gpus_per_node = args.ngpus
90 | nodes = args.nodes
91 | timeout_min = args.timeout
92 |
93 | partition = args.partition
94 | kwargs = {}
95 | if args.use_volta32:
96 | kwargs['slurm_constraint'] = 'volta32gb'
97 | if args.comment:
98 | kwargs['slurm_comment'] = args.comment
99 |
100 | executor.update_parameters(
101 | mem_gb=40 * num_gpus_per_node,
102 | gpus_per_node=num_gpus_per_node,
103 | tasks_per_node=num_gpus_per_node, # one task per GPU
104 | cpus_per_task=10,
105 | nodes=nodes,
106 | timeout_min=timeout_min, # max is 60 * 72
107 | # Below are cluster dependent parameters
108 | slurm_partition=partition,
109 | slurm_signal_delay_s=120,
110 | **kwargs
111 | )
112 |
113 | executor.update_parameters(name="lavila_pretrain")
114 |
115 | args.dist_url = get_init_file().as_uri()
116 | args.output_dir = args.job_dir
117 |
118 | trainer = Trainer(args)
119 | job = executor.submit(trainer)
120 |
121 | print("Submitted job_id:", job.job_id)
122 |
123 |
124 | if __name__ == "__main__":
125 | main()
126 |
--------------------------------------------------------------------------------
/LaViLa/scripts/convert_egovlp_ckpt.py:
--------------------------------------------------------------------------------
1 | # This source code is licensed under the license found in the
2 | # LICENSE file in the root directory of this source tree.
3 |
4 | '''
5 | Usage:
6 | ```bash
7 | PYTHONPATH= python scripts/convert_egovlp_ckpt.py \
8 | --input-ckpt \
9 | --output-ckpt egovlp_converted.pth
10 | ```
11 | '''
12 |
13 | import argparse
14 | from collections import OrderedDict
15 | import torch
16 |
17 |
18 | def get_args_parser():
19 | parser = argparse.ArgumentParser(description='Convert EgoVLP checkpoint', add_help=False)
20 | parser.add_argument('--input-ckpt', type=str)
21 | parser.add_argument('--output-ckpt', type=str)
22 | return parser
23 |
24 |
25 | def main(args):
26 | input_ckpt = torch.load(args.input_ckpt, map_location='cpu')
27 | input_ckpt = input_ckpt['state_dict']
28 | output_ckpt = OrderedDict()
29 | for k in input_ckpt:
30 | if k.startswith('module.video_model'):
31 | output_ckpt[k.replace('module.video_model', 'module.visual')] = input_ckpt[k]
32 | elif k.startswith('module.text_model'):
33 | output_ckpt[k.replace('module.text_model', 'module.textual')] = input_ckpt[k]
34 | elif k.startswith('module.txt_proj'):
35 | output_ckpt[k.replace('module.txt_proj', 'module.text_projection')] = input_ckpt[k]
36 | elif k.startswith('module.vid_proj'):
37 | output_ckpt[k.replace('module.vid_proj', 'module.image_projection')] = input_ckpt[k]
38 | else:
39 | print(k)
40 | raise ValueError
41 | torch.save({
42 | 'epoch': 0,
43 | 'state_dict': output_ckpt,
44 | 'best_acc1': 0,
45 | }, args.output_ckpt)
46 |
47 |
48 | if __name__ == '__main__':
49 | parser = argparse.ArgumentParser('Convert EgoVLP checkpoint', parents=[get_args_parser()])
50 | args = parser.parse_args()
51 | main(args)
52 |
--------------------------------------------------------------------------------
/LaViLa/scripts/crop_and_resize_ego4d.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | small_side=288
4 | cliplen_sec=300
5 | max_tries=5
6 | indir="/path/to/full-scale/videos/"
7 | outdir="/path/to/downscaled/videos/"
8 |
9 | cd $indir || exit
10 | all_videos=$(find . -iname "*.mp4")
11 | all_videos=( $all_videos ) # to array
12 | cd -
13 |
14 | for video in "${all_videos[@]}"; do
15 | W=$( ffprobe -v quiet -show_format -show_streams -show_entries stream=width "${indir}/${video}" | grep width )
16 | W=${W#width=}
17 | H=$( ffprobe -v quiet -show_format -show_streams -show_entries stream=height "${indir}/${video}" | grep height )
18 | H=${H#height=}
19 | # Set the smaller side to small_side
20 | # from https://superuser.com/a/624564
21 | if [ $W -gt $H ] && [ $H -gt ${small_side} ]; then
22 | scale_str="-filter:v scale=-1:${small_side}"
23 | elif [ $H -gt $W ] && [ $W -gt ${small_side} ]; then
24 | scale_str="-filter:v scale=${small_side}:-1"
25 | else
26 | # The small side is smaller than required size, so don't resize/distort the video
27 | scale_str=""
28 | fi
29 | vidlen_sec=$( ffprobe -v error -show_entries format=duration -of default=noprint_wrappers=1:nokey=1 "${indir}/${video}" )
30 | mkdir -p "${outdir}/${video}"
31 | for st_sec in $(seq 0 ${cliplen_sec} ${vidlen_sec}); do
32 | outfpath=${outdir}/${video}/${st_sec}.mp4
33 | try=0
34 | while [ $try -le $max_tries ]; do
35 | ffmpeg -y -ss ${st_sec} -i "${indir}/${video}" ${scale_str} -t ${cliplen_sec} "${outfpath}"
36 | try=$(( $try + 1 ))
37 | write_errors=$( ffprobe -v error -i "${outfpath}" )
38 | # If no errors detected by ffprobe, we are done
39 | if [ -z "$write_errors" ]; then
40 | echo $outfpath written successfully in $try tries!
41 | break
42 | fi
43 | done
44 | done
45 | echo "Converted ${video}"
46 | done
47 |
48 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | VideoAgent: A Memory-augmented Multimodal Agent for Video Understanding (ECCV 2024)
2 |
3 | # Introduction
4 | This is the official code repository of [VideoAgent: A Memory-augmented Multimodal Agent for Video Understanding
5 | ](https://videoagent.github.io/). VideoAgent is a multi-modal agent that can understand an input video and answer questions about it.
6 |
7 | Given a video and a question, VideoAgent operates in two phases: a memory construction phase and an inference phase. During the memory construction phase, structured information is extracted from the video and stored in the memory. During the inference phase, an LLM is prompted to use a set of tools that interact with the memory to answer the question.
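For reference, here is a minimal programmatic sketch of the two phases, mirroring the calls made in ```demo.py``` (the video path, question, and argument values are illustrative placeholders; as described in the Usage section, the Video-LLaVA server must be running when ```vqa_tool``` is ```videollava```):
```python
# Minimal sketch of the two phases, following the calls in demo.py.
# The video path, question, and argument values below are placeholders.
from main import preprocess, ReActAgent

video_file = "sample_videos/kitchen.mp4"

# Phase 1: memory construction -- extract structured information from the video
# and store it under preprocess/<video_name>/.
preprocess(video_path_list=[video_file], base_dir="preprocess", show_tracking=False)

# Phase 2: inference -- an LLM uses tools over the constructed memory to answer.
answer, log = ReActAgent(
    video_path=video_file,
    question="Is there a microwave in the kitchen?",
    base_dir="preprocess",
    vqa_tool="videollava",   # or "gpt-4v", see config/default.yaml
    use_reid=True,
    openai_api_key="your-openai-api-key",
)
print(answer)
```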
8 |
9 |
10 |
11 |
12 | # Prerequisites
13 | This project is tested on Ubuntu 20.04 with an NVIDIA RTX 4090 (24 GB).
14 |
15 |
16 | # Installation Guide
17 | Use the following command to create the conda environment named videoagent:
18 | ```sh
19 | conda env create -f environment.yaml
20 | ```
21 |
22 | Create the [Video-LLaVA](https://github.com/PKU-YuanGroup/Video-LLaVA) environment by running the following commands:
23 | ```sh
24 | git clone https://github.com/PKU-YuanGroup/Video-LLaVA
25 | cd Video-LLaVA
26 | conda create -n videollava python=3.10 -y
27 | conda activate videollava
28 | pip install --upgrade pip # enable PEP 660 support
29 | pip install -e .
30 | pip install -e ".[train]"
31 | pip install flash-attn --no-build-isolation
32 | pip install decord opencv-python git+https://github.com/facebookresearch/pytorchvideo.git@28fe037d212663c6a24f373b94cc5d478c8c1a1d
33 | ```
34 | Note: only the conda environment named videollava is required for this project; the Video-LLaVA repository itself is not. You can clone the Video-LLaVA repository anywhere you like and build the videollava conda environment from there.
35 |
36 | Download ```cache_dir.zip``` and ```tool_models.zip``` from [here](https://zenodo.org/records/11031717) and unzip them into the ```VideoAgent``` directory. This will create two folders under ```VideoAgent```: ```cache_dir``` (the model weights of Video-LLaVA) and ```tool_models``` (the weights of all other models).
37 |
38 | # Usage
39 | Make sure you are under the ```VideoAgent``` directory.
40 | Enter your OpenAI API key in ```config/default.yaml```.
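The demo reads this file with OmegaConf. Below is a small, optional sanity check of your configuration (a minimal sketch, assuming you run it from the ```VideoAgent``` directory inside the videoagent environment):
```python
# Optional sanity check for config/default.yaml, which demo.py loads via OmegaConf.
# Expected keys: openai_api_key, use_reid, vqa_tool (videollava or gpt-4v), base_dir.
from omegaconf import OmegaConf

config = OmegaConf.load("config/default.yaml")
assert config["vqa_tool"] in ("videollava", "gpt-4v"), "unsupported vqa_tool"
assert config["openai_api_key"], "openai_api_key must be set"
print(OmegaConf.to_yaml(config))
```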
41 |
42 | First, open a terminal and run:
43 | ```sh
44 | conda activate videollava
45 | python video-llava.py
46 | ```
47 | This will start a Video-LLaVA server process that handles the Visual Question Answering requests raised by VideoAgent.
48 |
49 | Once you see ```ready for connection!``` in the first process, open another terminal and run:
50 | ```sh
51 | conda activate videoagent
52 | python demo.py
53 | ```
54 | This will launch a Gradio demo like the one shown below.
55 |
56 |
57 |
58 | You can choose one of the example videos for inference, or upload your own video and question. Once submitted, VideoAgent will process your video and store the resulting files under ```preprocess/your_video_name```. After processing the input video, it will answer your question.
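For reference, a minimal sketch of inspecting two of the generated memory files, ```captions.json``` (per-segment captions keyed by ```startframe_endframe```) and ```segment2id.json``` (the segment-to-id mapping used by the database); the ```kitchen``` folder name is just an example:
```python
# Peek at the memory files written during preprocessing (paths are illustrative).
import json
import os

video_dir = os.path.join("preprocess", "kitchen")
with open(os.path.join(video_dir, "captions.json")) as f:
    captions = json.load(f)      # {"startframe_endframe": caption, ...}
with open(os.path.join(video_dir, "segment2id.json")) as f:
    segment2id = json.load(f)    # {"startframe_endframe": segment_id, ...}

for segment, caption in list(captions.items())[:3]:
    print(segment2id[segment], segment, caption)
```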
59 |
60 | The results will provide:
61 | 1. the answer to the question
62 | 2. the replay with object re-ID of the input video
63 | 3. the inference log (chain-of-thought) of VideoAgent
64 |
65 | For batch inference, you can run:
66 | ```sh
67 | conda activate videoagent
68 | python main.py
69 | ```
70 |
71 | # Citation
72 | If you find our paper and code useful in your research, please consider giving a star ⭐ and citation 📝.
73 | ```
74 | @inproceedings{fan2025videoagent,
75 | title={Videoagent: A memory-augmented multimodal agent for video understanding},
76 | author={Fan, Yue and Ma, Xiaojian and Wu, Rujie and Du, Yuntao and Li, Jiaqi and Gao, Zhi and Li, Qing},
77 | booktitle={European Conference on Computer Vision},
78 | pages={75--92},
79 | year={2025},
80 | organization={Springer}
81 | }
82 | ```
83 |
84 |
--------------------------------------------------------------------------------
/captioning.py:
--------------------------------------------------------------------------------
1 | import sys
2 | sys.path.insert(0, 'LaViLa/')
3 | import os
4 | import urllib.request
5 | from collections import OrderedDict
6 | import numpy as np
7 | import time
8 | import torch
9 | import torchvision.transforms as transforms
10 | import torchvision.transforms._transforms_video as transforms_video
11 | from LaViLa.lavila.data.video_transforms import Permute
12 | from LaViLa.lavila.models.models import VCLM_OPENAI_TIMESFORMER_LARGE_336PX_GPT2_XL
13 | from LaViLa.lavila.models.tokenizer import MyGPT2Tokenizer
14 | from LaViLa.eval_narrator import decode_one
15 | import json
16 | import cv2
17 | import pickle
18 |
19 |
20 |
21 | class Captioning:
22 | def __init__(self, video_path_list, base_dir='preprocess'):
23 | self.video_path_list = video_path_list
24 | self.seconds_per_caption = 2 # a caption covers 2 seconds
25 | self.frames_per_caption = 4 # a caption is generated from 4 frames in the 2-second segments
26 | self.base_dir = base_dir
27 |
28 |
29 | def generate_captions_for_all_videos(self):
30 | """create the captions for all videos"""
31 | start_time = time.time()
32 | crop_size = 336
33 | val_transform = transforms.Compose([
34 | Permute([3, 0, 1, 2]),
35 | transforms.Resize(crop_size),
36 | transforms.CenterCrop(crop_size),
37 | transforms_video.NormalizeVideo(mean=[108.3272985, 116.7460125, 104.09373615000001], std=[68.5005327, 66.6321579, 70.32316305])
38 | ])
39 | ckpt_name = 'vclm_openai_timesformer_large_336px_gpt2_xl.pt_ego4d.jobid_246897.ep_0003.md5sum_443263.pth'
40 | ckpt_path = os.path.join('tool_models/LaViLa/', ckpt_name)
41 | if not os.path.exists(ckpt_path):
42 | print('downloading model to {}'.format(ckpt_path))
43 | urllib.request.urlretrieve('https://dl.fbaipublicfiles.com/lavila/checkpoints/narrator/{}'.format(ckpt_name), ckpt_path)
44 | ckpt = torch.load(ckpt_path, map_location='cpu')
45 | state_dict = OrderedDict()
46 | for k, v in ckpt['state_dict'].items():
47 | state_dict[k.replace('module.', '')] = v
48 | # instantiate the model, and load the pre-trained weights
49 | model = VCLM_OPENAI_TIMESFORMER_LARGE_336PX_GPT2_XL(
50 | text_use_cls_token=False,
51 | project_embed_dim=256,
52 | gated_xattn=True,
53 | timesformer_gated_xattn=False,
54 | freeze_lm_vclm=False, # we use model.eval() anyway
55 | freeze_visual_vclm=False, # we use model.eval() anyway
56 | num_frames=4,
57 | drop_path_rate=0.
58 | )
59 | model.load_state_dict(state_dict, strict=True)
60 | model.cuda()
61 | model.eval()
62 | tokenizer = MyGPT2Tokenizer('gpt2-xl', add_bos=True)
63 | end_time = time.time()
64 | print(f'time for loading captioning model: {round(end_time-start_time, 3)} seconds')
65 |
66 |
67 | for video_path in self.video_path_list:
68 | cap = cv2.VideoCapture(video_path)
69 | if not cap.isOpened():
70 | print("Error: Unable to open video file.")
71 | continue
72 | fps = round(cap.get(cv2.CAP_PROP_FPS))
73 | total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
74 | total_captions = total_frames//(fps*self.seconds_per_caption)
75 | frame_interval = fps*self.seconds_per_caption//self.frames_per_caption # the interval between two selected frames
76 |
77 | base_name = os.path.basename(video_path).replace(".mp4", "")
78 | video_dir = os.path.join(self.base_dir, base_name)
79 | if not os.path.exists(video_dir):
80 | os.makedirs(video_dir)
81 |
82 | captions = dict()
83 | start_time = time.time()
84 | cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
85 | for caption_id in range(total_captions):
86 | frames = []
87 | for i in range(self.frames_per_caption): # 4 frames are selected for generating the caption
88 | success, frame = cap.read()
89 | frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
90 | frames.append(frame)
91 | for j in range(frame_interval-1): #skip other frames
92 | success, frame = cap.read()
93 | for i in range(fps*self.seconds_per_caption-frame_interval*self.frames_per_caption):
94 | success, frame = cap.read() #skip remaining frames
95 | frames = [torch.tensor(frame, dtype=torch.float32) for frame in frames]
96 | frames = torch.stack(frames, dim=0)
97 | frames = val_transform(frames)
98 | frames = frames.unsqueeze(0)
99 |
100 | with torch.no_grad():
101 | input_frames = frames.cuda(non_blocking=True)
102 | image_features = model.encode_image(input_frames)
103 | generated_text_ids, ppls = model.generate(
104 | image_features,
105 | tokenizer,
106 | target=None, # free-form generation
107 | max_text_length=77,
108 | top_k=None,
109 | top_p=0.95, # nucleus sampling
110 | num_return_sequences=5, # number of candidates: 5
111 | temperature=0.7,
112 | early_stopping=True,
113 | )
114 | text = ""
115 | length = -1
116 | for i in range(5):
117 | # select the longest candidate as the caption
118 | generated_text_str = decode_one(generated_text_ids[i], tokenizer)
119 | if len(generated_text_str) > length:
120 | length = len(generated_text_str)
121 | text = generated_text_str
122 | caption_start_frame = caption_id*fps*self.seconds_per_caption
123 | caption_end_frame = (caption_id+1)*fps*self.seconds_per_caption
124 | segment = "{}_{}".format(str(caption_start_frame), str(caption_end_frame))
125 | captions[segment] = text
126 | print(f"id: {caption_id}, frame_interval: {segment}, caption: {text}")
127 | end_time = time.time()
128 | cap.release()
129 | print(f"captioning time for video {base_name}: {round(end_time-start_time, 3)} seconds")
130 | with open(os.path.join(video_dir, "captions.json"), 'w') as f:
131 | json.dump(captions, f)
132 | segments = list(captions)
133 | segment2id = dict()
134 | for segment in segments:
135 | segment2id[segment] = len(segment2id)
136 | with open(os.path.join(video_dir, "segment2id.json"), 'w') as f:
137 | json.dump(segment2id, f)
138 |
139 | def run(self):
140 | self.generate_captions_for_all_videos()
--------------------------------------------------------------------------------
/config/default.yaml:
--------------------------------------------------------------------------------
1 | openai_api_key: your_openai_api_key
2 | use_reid: true
3 | vqa_tool: videollava #videollava or gpt-4v
4 | base_dir: preprocess
--------------------------------------------------------------------------------
/database.py:
--------------------------------------------------------------------------------
1 | import os
2 | import json
3 | import pickle
4 | from collections import defaultdict
5 | from encoder import encode_sentences
6 | from utils import compute_cosine_similarity, top_k_indices
7 | import numpy as np
8 | import sqlite3
9 |
10 |
11 | class DataBase:
12 | def __init__(self, video_path, base_dir='preprocess', use_reid=True):
13 | base_name = os.path.basename(video_path).replace(".mp4", "")
14 | self.video_dir = os.path.join(base_dir, base_name)
15 | self.use_reid = use_reid
16 | if self.use_reid:
17 | with open(os.path.join(self.video_dir, 'reid.pkl'), 'rb') as f:
18 | content = pickle.load(f)
19 | self.frame2uid, self.uid2frame, self.uid2category = content[0], content[1], content[2]
20 | with open(os.path.join(self.video_dir, 'uid2clip.pkl'), 'rb') as f:
21 | self.uid2emb = pickle.load(f)
22 | else:
23 | with open(os.path.join(self.video_dir, 'tracking.pkl'), 'rb') as f:
24 | content = pickle.load(f)
25 | self.frame2uid, self.uid2frame, self.uid2category = content[0], content[1], content[2]
26 | with open(os.path.join(self.video_dir, 'tid2clip.pkl'), 'rb') as f:
27 | self.uid2emb = pickle.load(f)
28 |
29 | with open(os.path.join(self.video_dir, 'segment2id.json')) as f:
30 | self.segment2id = json.load(f)
31 | self.segment_id2uids = defaultdict(set)
32 | for frame in self.frame2uid:
33 | segment_id = 0
34 | for segment in self.segment2id:
35 | start, end = segment.split('_')
36 | start, end = int(start), int(end)
37 | if start <= frame <= end:
38 | segment_id = self.segment2id[segment]
39 | break
40 | uids = list(self.frame2uid[frame])
41 | self.segment_id2uids[segment_id].update(uids)
42 |
43 | if os.path.exists('database.db'):
44 | os.remove('database.db')
45 | connection = sqlite3.connect('database.db')
46 | cursor = connection.cursor()
47 | create_object = """
48 | CREATE TABLE Objects(
49 | object_id INT,
50 | category VARCHAR(255),
51 | PRIMARY KEY (object_id)
52 | );
53 | """
54 | cursor.execute(create_object)
55 | create_segment = """
56 | CREATE TABLE Segments(
57 | segment_id INT,
58 | PRIMARY KEY (segment_id)
59 | );
60 | """
61 | cursor.execute(create_segment)
62 | create_object_segment = """
63 | CREATE TABLE Objects_Segments(
64 | object_id INT,
65 | segment_id INT,
66 | PRIMARY KEY (object_id, segment_id),
67 | FOREIGN KEY (object_id) REFERENCES Objects(object_id),
68 | FOREIGN KEY (segment_id) REFERENCES Segments(segment_id)
69 | );
70 | """
71 | cursor.execute(create_object_segment)
72 | connection.commit()
73 |
74 | insert_objects = []
75 | for uid in self.uid2category:
76 | line = "INSERT INTO Objects (object_id, category) VALUES ({}, '{}')".format(str(uid), self.uid2category[uid])
77 | #print(line)
78 | insert_objects.append(line)
79 | for s in insert_objects:
80 | cursor.execute(s)
81 |
82 | insert_segments = []
83 | for segment in self.segment2id:
84 | segment_id = self.segment2id[segment]
85 | line = "INSERT INTO Segments (segment_id) VALUES ({})".format(str(segment_id))
86 | #print(line)
87 | insert_segments.append(line)
88 | for s in insert_segments:
89 | cursor.execute(s)
90 |
91 |
92 | insert_object_segments = []
93 | for segment_id in self.segment_id2uids:
94 | for uid in self.segment_id2uids[segment_id]:
95 | line = "INSERT INTO Objects_Segments (object_id, segment_id) VALUES ({}, {})".format(str(uid), str(segment_id))
96 | #print(line)
97 | insert_object_segments.append(line)
98 | for s in insert_object_segments:
99 | cursor.execute(s)
100 |
101 | connection.commit()
102 | cursor.close()
103 | connection.close()
104 |
105 |
106 | def retrieve_candidate_objects(self, description):
107 | des_emb = encode_sentences([f"a photo of a {description}."], model_name='clip')
108 | scores = compute_cosine_similarity(des_emb, list(self.uid2emb.values()))
109 | indices = np.where(scores >= 0.26)[0]
110 | candidate_uids = []
111 | for i in indices:
112 | candidate_uids.append(list(self.uid2emb)[i])
113 | return candidate_uids
114 |
115 |
116 | def query_database(self, program):
117 | connection = sqlite3.connect('database.db')
118 | cursor = connection.cursor()
119 | try:
120 | cursor.execute(program)
121 | results = cursor.fetchall()
122 | return results
123 | except sqlite3.Error as e:
124 | return e
--------------------------------------------------------------------------------
/demo.py:
--------------------------------------------------------------------------------
1 | import gradio as gr
2 | import openai
3 | from main import preprocess, ReActAgent
4 | from multiprocessing import Process
5 | import os
6 | import socket
7 | from omegaconf import OmegaConf
8 |
9 | config = OmegaConf.load('config/default.yaml')
10 | openai_api_key = config['openai_api_key']
11 | use_reid = config['use_reid']
12 | vqa_tool = config['vqa_tool']
13 | base_dir = config['base_dir']
14 |
15 |
16 | def ask_question(video_file, question):
17 | preprocess(video_path_list=[video_file],
18 | base_dir=base_dir,
19 | show_tracking=False)
20 | answer, log = ReActAgent(video_path=video_file, question=question, base_dir=base_dir, vqa_tool=vqa_tool, use_reid=use_reid, openai_api_key=openai_api_key)
21 | base_name = os.path.basename(video_file).replace(".mp4", "")
22 | reid_file = os.path.join("preprocess", base_name, "reid.mp4")
23 | return answer, reid_file, log
24 |
25 |
26 | with gr.Row():
27 | # Define inputs
28 | with gr.Column(scale=6):
29 | video_input = gr.Video(label="Upload a video")
30 | question_input = gr.Textbox(label="Ask a question")
31 |
32 |
33 | # Define output
34 | with gr.Column(scale=6):
35 | output_text = gr.Textbox(label="Answer")
36 | output_reid = gr.Video(label="Video replay with object re-identification")
37 | output_log = gr.Textbox(label="Inference log")
38 |
39 |
40 | # Create Gradio interface
41 | gr.Interface(
42 | fn=ask_question,
43 | inputs=[video_input, question_input],
44 | outputs=[output_text, output_reid, output_log],
45 | title="VideoAgent",
46 | examples = [
47 | [f"sample_videos/boats.mp4", "How many boats are there in the video?"],
48 | [f"sample_videos/talking.mp4",
49 | "From what clue do you know that the woman with black spectacles at the start of the video is married?"],
50 | [f"sample_videos/books.mp4",
51 | "Based on the actions observed, what could be a possible motivation or goal for what c is doing in the video?"],
52 | [f"sample_videos/painting.mp4",
53 | "What was the primary purpose of the cup of water in this video, and how did it contribute to the overall painting process?"],
54 | [f"sample_videos/kitchen.mp4",
55 | "Is there a microwave in the kitchen?"],
56 | ],
57 | description="""### This is the demo of [VideoAgent](https://videoagent.github.io/).
58 |
59 | Upload a video and ask a question to get an answer from the VideoAgent."""
60 |
61 | ).launch(share=True)
62 |
--------------------------------------------------------------------------------
/encoder.py:
--------------------------------------------------------------------------------
1 | import json
2 | import openai
3 | import numpy as np
4 | import pickle
5 | from sentence_transformers import SentenceTransformer
6 | import os
7 | from PIL import Image
8 | import clip
9 | import torch
10 | from openai import OpenAI
11 | import torchvision.transforms as T
12 | from PIL import Image
13 | from time import time
14 |
15 |
16 | sentence_models = ['text-embedding-ada-002', 'text-embedding-3-large', 'all-MiniLM-L6-v2', 'all-mpnet-base-v2', 'clip']
17 |
18 |
19 | def encode_sentences(sentence_list, model_name):
20 | '''given a list of sentences, return the embeddings for them using the sentence encoder model'''
21 | assert model_name in sentence_models
22 | emb_list = []
23 | if model_name in['text-embedding-ada-002', 'text-embedding-3-large']: #openai embedding requires api-key
24 | client = OpenAI()
25 | emb = client.embeddings.create(input=sentence_list, model=model_name)
26 | for i in range(len(sentence_list)):
27 | emb_list.append(np.array(emb.data[i].embedding).reshape(1, -1))
28 | emb_list = np.concatenate(emb_list, axis=0)
29 | return emb_list
30 | elif model_name == 'clip': # clip embedding
31 | device = "cuda" if torch.cuda.is_available() else "cpu"
32 | model, transform = clip.load("ViT-B/32", device=device)
33 | with torch.no_grad():
34 | for sentence in sentence_list:
35 | emb_list.append(model.encode_text(clip.tokenize([sentence]).to(device)).cpu().numpy())
36 | emb_list = np.concatenate(emb_list, axis=0)
37 | return emb_list
38 | else: #sentence transformer encoder
39 | model = SentenceTransformer('sentence-transformers/'+model_name)
40 | num = len(sentence_list)
41 | batch_size = 10
42 | batch_num = num // batch_size
43 | with torch.no_grad():
44 | for batch_id in range(batch_num):
45 | batch_sentences = sentence_list[batch_id*10: (batch_id+1)*10]
46 | emb_list.append(model.encode(batch_sentences))
47 | if batch_num * 10 < num: #remaining <10 sentences
48 | remaining_sentences = sentence_list[batch_num*10: num]
49 | emb_list.append(model.encode(remaining_sentences))
50 | return np.concatenate(emb_list, axis=0)  # stack batch outputs so all branches return a single (N, dim) array
51 |
52 |
53 | if __name__ == '__main__':
54 | encode_sentences(['hello!', 'what'], model_name='text-embedding-ada-002')
55 |
56 |
--------------------------------------------------------------------------------
/environment.yaml:
--------------------------------------------------------------------------------
1 | name: videoagent
2 | channels:
3 | - conda-forge
4 | - defaults
5 | dependencies:
6 | - _libgcc_mutex=0.1=conda_forge
7 | - _openmp_mutex=4.5=2_gnu
8 | - bzip2=1.0.8=hd590300_5
9 | - ca-certificates=2023.11.17=hbcca054_0
10 | - ld_impl_linux-64=2.40=h41732ed_0
11 | - libffi=3.4.2=h7f98852_5
12 | - libgcc-ng=13.2.0=h807b86a_3
13 | - libgomp=13.2.0=h807b86a_3
14 | - libnsl=2.0.1=hd590300_0
15 | - libsqlite=3.44.2=h2797004_0
16 | - libuuid=2.38.1=h0b41bf4_0
17 | - libzlib=1.2.13=hd590300_5
18 | - ncurses=6.4=h59595ed_2
19 | - openssl=3.2.0=hd590300_1
20 | - pip=23.3.2=pyhd8ed1ab_0
21 | - python=3.9.18=h0755675_0_cpython
22 | - readline=8.2=h8228510_1
23 | - setuptools=68.2.2=pyhd8ed1ab_0
24 | - tk=8.6.13=noxft_h4845f30_101
25 | - tzdata=2023c=h71feb2d_0
26 | - wheel=0.42.0=pyhd8ed1ab_0
27 | - xz=5.2.6=h166bdaf_0
28 | - pip:
29 | - aiohttp==3.9.1
30 | - aiosignal==1.3.1
31 | - annotated-types==0.6.0
32 | - anyio==3.7.1
33 | - async-timeout==4.0.3
34 | - attrs==23.1.0
35 | - av==11.0.0
36 | - blis==0.7.11
37 | - catalogue==2.0.10
38 | - certifi==2023.11.17
39 | - charset-normalizer==3.3.2
40 | - click==8.1.7
41 | - cmake==3.28.1
42 | - confection==0.1.4
43 | - contourpy==1.2.0
44 | - cycler==0.12.1
45 | - cymem==2.0.8
46 | - cython==3.0.7
47 | - dataclasses-json==0.6.3
48 | - decord==0.6.0
49 | - distro==1.8.0
50 | - einops==0.4.1
51 | - exceptiongroup==1.2.0
52 | - fairscale==0.4.4
53 | - filelock==3.13.1
54 | - fonttools==4.47.0
55 | - frozenlist==1.4.1
56 | - fsspec==2023.12.2
57 | - ftfy==4.4.3
58 | - fvcore==0.1.5.post20221221
59 | - gradio==4.22.0
60 | - gradio-client==0.13.0
61 | - gensim==3.8.3
62 | - greenlet==3.0.3
63 | - h11==0.14.0
64 | - html5lib==1.1
65 | - httpcore==1.0.2
66 | - httpx==0.26.0
67 | - huggingface-hub==0.19.4
68 | - idna==3.6
69 | - imageio==2.33.1
70 | - importlib-resources==6.1.1
71 | - pip-install==1.3.5
72 | - iopath==0.1.10
73 | - jinja2==3.1.2
74 | - joblib==1.3.2
75 | - jsonpatch==1.33
76 | - jsonpointer==2.4
77 | - kiwisolver==1.4.5
78 | - langchain==0.1.2
79 | - langchain-community==0.0.14
80 | - langchain-core==0.1.14
81 | - langchain-openai==0.0.3
82 | - langchainhub==0.1.14
83 | - langcodes==3.3.0
84 | - langsmith==0.0.83
85 | - lapx==0.5.5
86 | - lit==17.0.6
87 | - markupsafe==2.1.3
88 | - marshmallow==3.20.2
89 | - matplotlib==3.8.2
90 | - mpmath==1.3.0
91 | - multidict==6.0.4
92 | - murmurhash==1.0.10
93 | - mypy-extensions==1.0.0
94 | - moviepy==1.0.3
95 | - networkx==3.2.1
96 | - nltk==3.8.1
97 | - numpy==1.26.2
98 | - nvidia-cublas-cu11==11.10.3.66
99 | - nvidia-cublas-cu12==12.1.3.1
100 | - nvidia-cuda-cupti-cu11==11.7.101
101 | - nvidia-cuda-cupti-cu12==12.1.105
102 | - nvidia-cuda-nvrtc-cu11==11.7.99
103 | - nvidia-cuda-nvrtc-cu12==12.1.105
104 | - nvidia-cuda-runtime-cu11==11.7.99
105 | - nvidia-cuda-runtime-cu12==12.1.105
106 | - nvidia-cudnn-cu11==8.5.0.96
107 | - nvidia-cudnn-cu12==8.9.2.26
108 | - nvidia-cufft-cu11==10.9.0.58
109 | - nvidia-cufft-cu12==11.0.2.54
110 | - nvidia-curand-cu11==10.2.10.91
111 | - nvidia-curand-cu12==10.3.2.106
112 | - nvidia-cusolver-cu11==11.4.0.1
113 | - nvidia-cusolver-cu12==11.4.5.107
114 | - nvidia-cusparse-cu11==11.7.4.91
115 | - nvidia-cusparse-cu12==12.1.0.106
116 | - nvidia-nccl-cu11==2.14.3
117 | - nvidia-nccl-cu12==2.18.1
118 | - nvidia-nvjitlink-cu12==12.3.101
119 | - nvidia-nvtx-cu11==11.7.91
120 | - nvidia-nvtx-cu12==12.1.105
121 | - omegaconf==2.3.0
122 | - openai==1.9.0
123 | - opencv-python==4.8.1.78
124 | - packaging==23.2
125 | - pandas==1.3.5
126 | - parameterized==0.9.0
127 | - pathy==0.10.3
128 | - pillow==10.1.0
129 | - pims==0.6.1
130 | - portalocker==2.8.2
131 | - preshed==3.0.9
132 | - protobuf==4.21.12
133 | - psutil==5.9.7
134 | - py-cpuinfo==9.0.0
135 | - pydantic==2.5.3
136 | - pyparsing==3.1.1
137 | - python-dateutil==2.8.2
138 | - pytorchvideo==0.1.5
139 | - pytz==2023.3.post1
140 | - pyyaml==6.0.1
141 | - regex==2023.10.3
142 | - requests==2.31.0
143 | - safetensors==0.4.1
144 | - scikit-learn==1.0.2
145 | - scipy==1.11.4
146 | - seaborn==0.13.1
147 | - sentence-transformers==2.2.2
148 | - sentencepiece==0.1.99
149 | - six==1.16.0
150 | - slicerator==1.1.0
151 | - smart-open==6.4.0
152 | - sniffio==1.3.0
153 | - sqlalchemy==2.0.25
154 | - srsly==2.4.8
155 | - sympy==1.12
156 | - tabulate==0.9.0
157 | - tenacity==8.2.3
158 | - termcolor==2.4.0
159 | - theano==1.0.5
160 | - thinc==8.1.12
161 | - thop==0.1.1-2209072238
162 | - threadpoolctl==3.2.0
163 | - tiktoken==0.5.2
164 | - timm==0.5.4
165 | - tokenizers==0.12.1
166 | - torch==2.1.2
167 | - torchvision==0.16.2
168 | - tqdm==4.66.1
169 | - transformers==4.27.0
170 | - triton==2.1.0
171 | - types-requests==2.31.0.20240106
172 | - typing-extensions==4.9.0
173 | - typing-inspect==0.9.0
174 | - ultralytics==8.0.235
175 | - urllib3==2.1.0
176 | - wasabi==0.10.1
177 | - wcwidth==0.2.12
178 | - webdataset==0.2.86
179 | - webencodings==0.5.1
180 | - xdg==6.0.0
181 | - yacs==0.1.8
182 | - yarl==1.9.4
183 | - zipp==3.17.0
184 | - git+https://github.com/openai/CLIP.git
185 | - git+https://github.com/Maluuba/nlg-eval.git@master
186 |
--------------------------------------------------------------------------------
/imgs/demo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/imgs/demo.png
--------------------------------------------------------------------------------
/imgs/teaser.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/imgs/teaser.png
--------------------------------------------------------------------------------
/preprocess/boats/captions.json:
--------------------------------------------------------------------------------
1 | {"0_48": "#C C looks around ", "48_96": "#C C looks around the", "96_144": "#C C looks at the", "144_192": "#C C looks around ", "192_240": "#C C looks around the lake", "240_288": "#C C looks around "}
--------------------------------------------------------------------------------
/preprocess/boats/reid.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/boats/reid.mp4
--------------------------------------------------------------------------------
/preprocess/boats/reid.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/boats/reid.pkl
--------------------------------------------------------------------------------
/preprocess/boats/segment2id.json:
--------------------------------------------------------------------------------
1 | {"0_48": 0, "48_96": 1, "96_144": 2, "144_192": 3, "192_240": 4, "240_288": 5}
--------------------------------------------------------------------------------
/preprocess/boats/segment_textual_embedding.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/boats/segment_textual_embedding.pkl
--------------------------------------------------------------------------------
/preprocess/boats/segment_visual_embedding.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/boats/segment_visual_embedding.pkl
--------------------------------------------------------------------------------
/preprocess/boats/tid2clip.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/boats/tid2clip.pkl
--------------------------------------------------------------------------------
/preprocess/boats/tid2dinov2.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/boats/tid2dinov2.pkl
--------------------------------------------------------------------------------
/preprocess/boats/tracking.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/boats/tracking.pkl
--------------------------------------------------------------------------------
/preprocess/boats/uid2clip.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/boats/uid2clip.pkl
--------------------------------------------------------------------------------
/preprocess/books/captions.json:
--------------------------------------------------------------------------------
1 | {"0_60": "#C C picks books from the floor", "60_120": "#C C picks the book on the floor", "120_180": "#C C holds a book in the book shelf", "180_240": "#C C places the book on the shelf", "240_300": "#C C removes the book from the", "300_360": "#C C holds the book with his left hand", "360_420": "#C C touches a book on a shelf with his right hand", "420_480": "#C C looks at the books on the floor", "480_540": "#C C picks the book from the shelf", "540_600": "#C C arranges books on the shelf", "600_660": "#C C looks around the floor", "660_720": "#C C picks a book from the floor", "720_780": "#C C places the books on the shelf", "780_840": "#C C picks a book from the shelf with his right hand", "840_900": "#C C adjusts the books on the shelf with her left hand", "900_960": "#C C arranges the books in the shelf", "960_1020": "#C C picks a book from the floor", "1020_1080": "#C C picks the books from the floor", "1080_1140": "#C C puts the book on the shelf", "1140_1200": "#C C adjusts the books on the shelf", "1200_1260": "#C C picks a book from the shelf", "1260_1320": "#C C arranges books on the book shelf", "1320_1380": "#C C picks the book from the", "1380_1440": "#C C puts book on top of the bookshe", "1440_1500": "#C C puts the books on the bookshe", "1500_1560": "#C C picks the book from the floor", "1560_1620": "#C C picks books from the floor", "1620_1680": "#C C places the book in his right hand in the bookshelf", "1680_1740": "#C C puts the books in the book shelf with his right hand", "1740_1800": "#C C picks a book from the floor with her left hand", "1800_1860": "#C C arranges books on the shelf", "1860_1920": "#C C arranges books", "1920_1980": "#C C looks around the house", "1980_2040": "#C C stares at the", "2040_2100": "#C C looks at the", "2100_2160": "#C C looks around", "2160_2220": "#C C adjusts the books on the shelf with his hands", "2220_2280": "#C C moves the books in the bookshelf with his right hand", "2280_2340": "#C C arranges books on the shelf", "2340_2400": "#C C touches the books on the shelf", "2400_2460": "#c c puts books on the shelf", "2460_2520": "#C C holds the book with her right hand", "2520_2580": "#C C places the book on the book shelf", "2580_2640": "#C C puts book on the shelf", "2640_2700": "#C C picks the books from the floor", "2700_2760": "#C C looks around the room.", "2760_2820": "#C C adjusts the books in the shelf with his hands", "2820_2880": "#C C picks the books from the floor", "2880_2940": "#C C picks a book from the", "2940_3000": "#C C arranges the books in the shelf", "3000_3060": "#C C arranges books on the shelf", "3060_3120": "#C C picks a book from the floor with her right hand", "3120_3180": "#C C picks the books on the floor", "3180_3240": "#C C picks a book from the shelf with his left hand", "3240_3300": "#C C picks the book from the shelf", "3300_3360": "#C C puts the book in the shelf with her right hand", "3360_3420": "#C C places the book on the shelf with her right hand", "3420_3480": "#C C places the book in his left hand in the shelf", "3480_3540": "#C C adjusts the book on the shelf.", "3540_3600": "#C C holds the books on her hands", "3600_3660": "#C C picks a", "3660_3720": "#C C picks books from the floor", "3720_3780": "#C C picks a book from the floor", "3780_3840": "#C C puts the books in the book shelf", "3840_3900": "#C C picks a book from the shelf", "3900_3960": "#C C arranges books in the shelve", "3960_4020": "#C C adjusts the books on the shelf ", "4020_4080": "#C C puts the books on the", 
"4080_4140": "#C C picks a book from the floor with her right hand", "4140_4200": "#C C holds the books with her hands", "4200_4260": "#C C picks up the books from the floor", "4260_4320": "#C C puts the books on the floor", "4320_4380": "#C C places the book on the shelf", "4380_4440": "#C C picks the book holder from the floor with her right hand", "4440_4500": "#C C arranges the books in the shelf with his right hand", "4500_4560": "#C C looks at the books on the", "4560_4620": "#C C puts a book on the floor", "4620_4680": "#C C places the book in his left hand on the ground", "4680_4740": "#C C picks the book from the floor with her right hand", "4740_4800": "#C C looks around the", "4800_4860": "#C C picks the book on the shelf", "4860_4920": "#C C arranges the books in the bookcase", "4920_4980": "#C C looks around the house", "4980_5040": "#C C picks the book from the", "5040_5100": "#C C picks a book from the", "5100_5160": "#C C puts the books on the floor", "5160_5220": "#C C puts books on the floor", "5220_5280": "#C C picks the books from the floor", "5280_5340": "#C C looks around the house", "5340_5400": "#C C puts the books on the shelf"}
--------------------------------------------------------------------------------
/preprocess/books/reid.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/books/reid.mp4
--------------------------------------------------------------------------------
/preprocess/books/reid.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/books/reid.pkl
--------------------------------------------------------------------------------
/preprocess/books/segment2id.json:
--------------------------------------------------------------------------------
1 | {"0_60": 0, "60_120": 1, "120_180": 2, "180_240": 3, "240_300": 4, "300_360": 5, "360_420": 6, "420_480": 7, "480_540": 8, "540_600": 9, "600_660": 10, "660_720": 11, "720_780": 12, "780_840": 13, "840_900": 14, "900_960": 15, "960_1020": 16, "1020_1080": 17, "1080_1140": 18, "1140_1200": 19, "1200_1260": 20, "1260_1320": 21, "1320_1380": 22, "1380_1440": 23, "1440_1500": 24, "1500_1560": 25, "1560_1620": 26, "1620_1680": 27, "1680_1740": 28, "1740_1800": 29, "1800_1860": 30, "1860_1920": 31, "1920_1980": 32, "1980_2040": 33, "2040_2100": 34, "2100_2160": 35, "2160_2220": 36, "2220_2280": 37, "2280_2340": 38, "2340_2400": 39, "2400_2460": 40, "2460_2520": 41, "2520_2580": 42, "2580_2640": 43, "2640_2700": 44, "2700_2760": 45, "2760_2820": 46, "2820_2880": 47, "2880_2940": 48, "2940_3000": 49, "3000_3060": 50, "3060_3120": 51, "3120_3180": 52, "3180_3240": 53, "3240_3300": 54, "3300_3360": 55, "3360_3420": 56, "3420_3480": 57, "3480_3540": 58, "3540_3600": 59, "3600_3660": 60, "3660_3720": 61, "3720_3780": 62, "3780_3840": 63, "3840_3900": 64, "3900_3960": 65, "3960_4020": 66, "4020_4080": 67, "4080_4140": 68, "4140_4200": 69, "4200_4260": 70, "4260_4320": 71, "4320_4380": 72, "4380_4440": 73, "4440_4500": 74, "4500_4560": 75, "4560_4620": 76, "4620_4680": 77, "4680_4740": 78, "4740_4800": 79, "4800_4860": 80, "4860_4920": 81, "4920_4980": 82, "4980_5040": 83, "5040_5100": 84, "5100_5160": 85, "5160_5220": 86, "5220_5280": 87, "5280_5340": 88, "5340_5400": 89}
--------------------------------------------------------------------------------
/preprocess/books/segment_textual_embedding.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/books/segment_textual_embedding.pkl
--------------------------------------------------------------------------------
/preprocess/books/segment_visual_embedding.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/books/segment_visual_embedding.pkl
--------------------------------------------------------------------------------
/preprocess/books/tid2clip.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/books/tid2clip.pkl
--------------------------------------------------------------------------------
/preprocess/books/tid2dinov2.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/books/tid2dinov2.pkl
--------------------------------------------------------------------------------
/preprocess/books/tracking.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/books/tracking.pkl
--------------------------------------------------------------------------------
/preprocess/books/uid2clip.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/books/uid2clip.pkl
--------------------------------------------------------------------------------
/preprocess/kitchen/captions.json:
--------------------------------------------------------------------------------
1 | {"0_30": "#C C opens the kitchen cabinet", "30_60": "#C C opens the cabinet door", "60_90": "#C C picks a glass in the", "90_120": "#C C opens the cabinet.", "120_150": "#C C opens the tap", "150_180": "#C C puts water in the cup", "180_210": "#C C closes the tap.", "210_240": "#C C puts cup on the sink counter", "240_270": "#C C picks the cup from the counter", "270_300": "#C C opens a refrigerator with his left", "300_330": "#C C closes the fridge with his right", "330_360": "#C C picks a bottle of milk from the", "360_390": "#C C puts the bottle in the fridge", "390_420": "#C C closes the fridge with his left hand", "420_450": "#C C closes the refrigerator with his left hand", "450_480": "#C C opens the water bottle lid", "480_510": "#C C covers the kettle with the lid", "510_540": "#C C puts water in the coffee maker", "540_570": "#C C pours water in the sink", "570_600": "#C C pours the milk into the"}
--------------------------------------------------------------------------------
/preprocess/kitchen/reid.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/kitchen/reid.mp4
--------------------------------------------------------------------------------
/preprocess/kitchen/reid.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/kitchen/reid.pkl
--------------------------------------------------------------------------------
/preprocess/kitchen/segment2id.json:
--------------------------------------------------------------------------------
1 | {"0_30": 0, "30_60": 1, "60_90": 2, "90_120": 3, "120_150": 4, "150_180": 5, "180_210": 6, "210_240": 7, "240_270": 8, "270_300": 9, "300_330": 10, "330_360": 11, "360_390": 12, "390_420": 13, "420_450": 14, "450_480": 15, "480_510": 16, "510_540": 17, "540_570": 18, "570_600": 19}
--------------------------------------------------------------------------------
/preprocess/kitchen/segment_0.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/kitchen/segment_0.mp4
--------------------------------------------------------------------------------
/preprocess/kitchen/segment_1.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/kitchen/segment_1.mp4
--------------------------------------------------------------------------------
/preprocess/kitchen/segment_18.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/kitchen/segment_18.mp4
--------------------------------------------------------------------------------
/preprocess/kitchen/segment_3.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/kitchen/segment_3.mp4
--------------------------------------------------------------------------------
/preprocess/kitchen/segment_8.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/kitchen/segment_8.mp4
--------------------------------------------------------------------------------
/preprocess/kitchen/segment_textual_embedding.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/kitchen/segment_textual_embedding.pkl
--------------------------------------------------------------------------------
/preprocess/kitchen/segment_visual_embedding.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/kitchen/segment_visual_embedding.pkl
--------------------------------------------------------------------------------
/preprocess/kitchen/tid2clip.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/kitchen/tid2clip.pkl
--------------------------------------------------------------------------------
/preprocess/kitchen/tid2dinov2.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/kitchen/tid2dinov2.pkl
--------------------------------------------------------------------------------
/preprocess/kitchen/tracking.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/kitchen/tracking.pkl
--------------------------------------------------------------------------------
/preprocess/kitchen/uid2clip.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/kitchen/uid2clip.pkl
--------------------------------------------------------------------------------
/preprocess/painting/captions.json:
--------------------------------------------------------------------------------
1 | {"0_60": "#C C draws on the paper with the paint brush in his right hand.", "60_120": "#C C draws on the paper with the painting brush in his right hand. ", "120_180": "#C C draws on the paper with the paint brush in his right hand.", "180_240": "#C C moves a paint palette on the table with his right hand.", "240_300": "#C C paints the cover of the paint palette with the paint brush in his right hand.", "300_360": "#C C draws on the paper with the paint brush in his right hand. ", "360_420": "#C C paints on the paper with the paint brush in his right hand.", "420_480": "#C C adjusts the drawing board with his left hand.", "480_540": "#C C draws on the paper with the paint brush in his right hand.", "540_600": "#C C dips the paint brush in his right hand in the cup of water on the table.", "600_660": "#C C dips the paint brush in his right hand in the paint palette on the table. ", "660_720": "#C C smears watercolor on the watercolor set with the", "720_780": "#C C dips the paint brush in his right hand in the paint palette on the table.", "780_840": "#C C dips the paint brush in his right hand in the paint palette on the table.", "840_900": "#C C dips the paint brush in his right hand in the cup of water on the table", "900_960": "#C C paints the cover of the paint palette with the paint brush in his right hand.", "960_1020": "#C C paints the cover of the paint palette with the paint brush in his right hand.", "1020_1080": "#C C draws on the paper with the paint brush in his right hand.", "1080_1140": "#C C paints a", "1140_1200": "#C C paints a", "1200_1260": "#C C paints a", "1260_1320": "#C C paints on the paper with the paint brush in his right hand", "1320_1380": "#C C draws on the paper with the paint brush in his right hand.", "1380_1440": "#C C adjusts the painting board with his right hand.", "1440_1500": "#C C paints the cover of the paint palette with the paint brush in his right hand.", "1500_1560": "#C C dips the paint brush in his right hand in the paint palette on the table.", "1560_1620": "#C C dips the paint brush in his right hand in the cup of water on the table.", "1620_1680": "#C C stirs brush in the watercolor pan", "1680_1740": "#C C dips the paint brush in his right hand in the paint palette on the table.", "1740_1800": "#C C paints on the paper with the paint brush in his right hand. ", "1800_1860": "#C C paints a", "1860_1920": "#C C paints a", "1920_1980": "#C C paints a", "1980_2040": "#C C draws on the paper with the paint brush in his right hand.", "2040_2100": "#C C paints the", "2100_2160": "#C C paints a", "2160_2220": "#C C draws on the paper with the paint brush in his right hand. 
", "2220_2280": "#C C moves the paint palette on the table with his right hand.", "2280_2340": "#C C dips the paint brush in his right hand in the paint palette on the table.", "2340_2400": "#C C paints the cover of the paint palette with the paint brush in his right hand.", "2400_2460": "#C C operates the tablet computer on the table with his right hand.", "2460_2520": "#C C paints a", "2520_2580": "#C C draws on the paper with the paint brush in his right hand.", "2580_2640": "#C C adjusts the drawing board with his left hand.", "2640_2700": "#C C draws on the paper with the paint brush in his right hand.", "2700_2760": "#C C adjusts the book on the table with his left hand", "2760_2820": "#C C lifts the paint brush from the drawing board with his right hand.", "2820_2880": "#C C paints a", "2880_2940": "#C C paints a", "2940_3000": "#C C draws on the paper with the painting brush in his right hand.", "3000_3060": "#C C paints a", "3060_3120": "#C C dips the paint brush in his right hand in the paint palette on the table.", "3120_3180": "#C C draws on the paper with the paint brush in his right hand.", "3180_3240": "#C C paints a", "3240_3300": "#C C adjusts the book on the table with her right hand.", "3300_3360": "#C C paints the cover of the paint palette with the paint brush in his right hand.", "3360_3420": "#C C paints on the paper with the painting brush in his right hand.", "3420_3480": "#C C paints a", "3480_3540": "#C C paints the cover of the paint palette with the paint brush in his right hand.", "3540_3600": "#C C draws on the paper with the paint brush in his right hand.", "3600_3660": "#C C paints a", "3660_3720": "#C C draws on the paper with the paint brush in his right hand.", "3720_3780": "#C C paints on the paper with the paint brush in his right hand.", "3780_3840": "#C C adjusts the drawing board with his right hand.", "3840_3900": "#C C paints the cover of the paint palette with the paint brush in his right hand. 
", "3900_3960": "#C C draws on the paper with the paint brush in his right hand.", "3960_4020": "#C C draws on the paper with the painting brush in his right hand.", "4020_4080": "#C C draws on the paper with the paint brush in his right hand.", "4080_4140": "#C C draws on the paper with the paint brush in his right hand.", "4140_4200": "#C C draws on the paper with the paint brush in his right hand.", "4200_4260": "#C C draws on the paper with the painting brush in his right hand.", "4260_4320": "#C C draws on the paper with the paint brush in his right hand.", "4320_4380": "#C C draws on the paper with the paint brush in his right hand.", "4380_4440": "#C C paints a", "4440_4500": "#C C draws on the paper with the painting brush in his right hand.", "4500_4560": "#C C draws on the paper with the paint brush in his right hand.", "4560_4620": "#C C dips the paint brush in his right hand in the paint palette on the table.", "4620_4680": "#C C paints the cover of the paint palette with the paint brush in his right hand.", "4680_4740": "#C C draws on the paper with the paint brush in his right hand.", "4740_4800": "#C C paints a", "4800_4860": "#C C draws on the paper with the paint brush in his right hand.", "4860_4920": "#C C paints a", "4920_4980": "#C C adjusts the book on his lap with his left hand.", "4980_5040": "#C C dips the paint brush in his right hand into the cup of water on the table.", "5040_5100": "#C C moves the painting brush", "5100_5160": "#C C dips the paint brush in his right hand in the cup of water on the table.", "5160_5220": "#C C touches the book with his left hand", "5220_5280": "#C C dips the paint brush in his right hand in the cup of water on the table.", "5280_5340": "#C C paints the cover of the paint palette with the paint brush in his right hand.", "5340_5400": "#C C dips the paint brush in his right hand in the cup of water on the table."}
--------------------------------------------------------------------------------
/preprocess/painting/reid.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/painting/reid.mp4
--------------------------------------------------------------------------------
/preprocess/painting/reid.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/painting/reid.pkl
--------------------------------------------------------------------------------
/preprocess/painting/segment2id.json:
--------------------------------------------------------------------------------
1 | {"0_60": 0, "60_120": 1, "120_180": 2, "180_240": 3, "240_300": 4, "300_360": 5, "360_420": 6, "420_480": 7, "480_540": 8, "540_600": 9, "600_660": 10, "660_720": 11, "720_780": 12, "780_840": 13, "840_900": 14, "900_960": 15, "960_1020": 16, "1020_1080": 17, "1080_1140": 18, "1140_1200": 19, "1200_1260": 20, "1260_1320": 21, "1320_1380": 22, "1380_1440": 23, "1440_1500": 24, "1500_1560": 25, "1560_1620": 26, "1620_1680": 27, "1680_1740": 28, "1740_1800": 29, "1800_1860": 30, "1860_1920": 31, "1920_1980": 32, "1980_2040": 33, "2040_2100": 34, "2100_2160": 35, "2160_2220": 36, "2220_2280": 37, "2280_2340": 38, "2340_2400": 39, "2400_2460": 40, "2460_2520": 41, "2520_2580": 42, "2580_2640": 43, "2640_2700": 44, "2700_2760": 45, "2760_2820": 46, "2820_2880": 47, "2880_2940": 48, "2940_3000": 49, "3000_3060": 50, "3060_3120": 51, "3120_3180": 52, "3180_3240": 53, "3240_3300": 54, "3300_3360": 55, "3360_3420": 56, "3420_3480": 57, "3480_3540": 58, "3540_3600": 59, "3600_3660": 60, "3660_3720": 61, "3720_3780": 62, "3780_3840": 63, "3840_3900": 64, "3900_3960": 65, "3960_4020": 66, "4020_4080": 67, "4080_4140": 68, "4140_4200": 69, "4200_4260": 70, "4260_4320": 71, "4320_4380": 72, "4380_4440": 73, "4440_4500": 74, "4500_4560": 75, "4560_4620": 76, "4620_4680": 77, "4680_4740": 78, "4740_4800": 79, "4800_4860": 80, "4860_4920": 81, "4920_4980": 82, "4980_5040": 83, "5040_5100": 84, "5100_5160": 85, "5160_5220": 86, "5220_5280": 87, "5280_5340": 88, "5340_5400": 89}
--------------------------------------------------------------------------------
/preprocess/painting/segment_83.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/painting/segment_83.mp4
--------------------------------------------------------------------------------
/preprocess/painting/segment_85.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/painting/segment_85.mp4
--------------------------------------------------------------------------------
/preprocess/painting/segment_textual_embedding.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/painting/segment_textual_embedding.pkl
--------------------------------------------------------------------------------
/preprocess/painting/segment_visual_embedding.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/painting/segment_visual_embedding.pkl
--------------------------------------------------------------------------------
/preprocess/painting/tid2clip.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/painting/tid2clip.pkl
--------------------------------------------------------------------------------
/preprocess/painting/tid2dinov2.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/painting/tid2dinov2.pkl
--------------------------------------------------------------------------------
/preprocess/painting/tracking.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/painting/tracking.pkl
--------------------------------------------------------------------------------
/preprocess/painting/uid2clip.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/painting/uid2clip.pkl
--------------------------------------------------------------------------------
/preprocess/talking/captions.json:
--------------------------------------------------------------------------------
1 | {"0_60": "#O woman X points at the ceiling", "60_120": "#O person Y converses with C", "120_180": "#O person Y touches her nose with her right", "180_240": "#O woman X lifts up her", "240_300": "#C C moves the hands", "300_360": "#C C looks around the room", "360_420": "#O woman X uses gesture with a", "420_480": "#C C interacts with the woman X", "480_540": "#O woman X points at the ceiling", "540_600": "#O woman X does a hand gesture", "600_660": "#O woman X raises a hand", "660_720": "#O The woman X adjusts her glasses with both hands", "720_780": "#O person X points towards the ceiling", "780_840": "#C C interacts with lady X", "840_900": "#O person Y adjusts the camera", "900_960": "#O A woman X interacts with C", "960_1020": "#O woman X converses with C", "1020_1080": "#C C talks to the colleagues", "1080_1140": "#O woman X talks to woman", "1140_1200": "#O woman Y converses with woman", "1200_1260": "#O person Z moves her hands", "1260_1320": "#O person Y looks at person X", "1320_1380": "#C C looks at the people in the", "1380_1440": "#C C converses with a woman V,W and X and a man Y and Z", "1440_1500": "#O The man Y holds the phone with his right hand.", "1500_1560": "#C C converses with a man X and Y and a woman Z", "1560_1620": "#C C converses with the woman Y", "1620_1680": "#C The man M interacts with C, the woman N, the man M and the woman N", "1680_1740": "#O A Woman M holds her waist with both hands", "1740_1800": "#O The Woman X taps her left fingers on her thigh", "1800_1860": "#O Woman A Holds a camera with hands", "1860_1920": "#O person Y puts the card on the table", "1920_1980": "#O A woman X looks at C", "1980_2040": "#O person X talks to person", "2040_2100": "#O Woman Y eats food with the right", "2100_2160": "#C C looks around the room", "2160_2220": "#O person X interacts with person Z", "2220_2280": "#O A woman X stands in the", "2280_2340": "#C C looks at the people in the", "2340_2400": "#C C looks at the woman", "2400_2460": "#C C stares at a woman Y", "2460_2520": "#C C looks around the house", "2520_2580": "#C C converses with the man Y, the man X and the woman Z", "2580_2640": "#O A man X talks to man Z"}
--------------------------------------------------------------------------------
/preprocess/talking/reid.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/talking/reid.mp4
--------------------------------------------------------------------------------
/preprocess/talking/reid.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/talking/reid.pkl
--------------------------------------------------------------------------------
/preprocess/talking/segment2id.json:
--------------------------------------------------------------------------------
1 | {"0_60": 0, "60_120": 1, "120_180": 2, "180_240": 3, "240_300": 4, "300_360": 5, "360_420": 6, "420_480": 7, "480_540": 8, "540_600": 9, "600_660": 10, "660_720": 11, "720_780": 12, "780_840": 13, "840_900": 14, "900_960": 15, "960_1020": 16, "1020_1080": 17, "1080_1140": 18, "1140_1200": 19, "1200_1260": 20, "1260_1320": 21, "1320_1380": 22, "1380_1440": 23, "1440_1500": 24, "1500_1560": 25, "1560_1620": 26, "1620_1680": 27, "1680_1740": 28, "1740_1800": 29, "1800_1860": 30, "1860_1920": 31, "1920_1980": 32, "1980_2040": 33, "2040_2100": 34, "2100_2160": 35, "2160_2220": 36, "2220_2280": 37, "2280_2340": 38, "2340_2400": 39, "2400_2460": 40, "2460_2520": 41, "2520_2580": 42, "2580_2640": 43}
--------------------------------------------------------------------------------
/preprocess/talking/segment_10.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/talking/segment_10.mp4
--------------------------------------------------------------------------------
/preprocess/talking/segment_11.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/talking/segment_11.mp4
--------------------------------------------------------------------------------
/preprocess/talking/segment_9.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/talking/segment_9.mp4
--------------------------------------------------------------------------------
/preprocess/talking/segment_textual_embedding.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/talking/segment_textual_embedding.pkl
--------------------------------------------------------------------------------
/preprocess/talking/segment_visual_embedding.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/talking/segment_visual_embedding.pkl
--------------------------------------------------------------------------------
/preprocess/talking/tid2clip.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/talking/tid2clip.pkl
--------------------------------------------------------------------------------
/preprocess/talking/tid2dinov2.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/talking/tid2dinov2.pkl
--------------------------------------------------------------------------------
/preprocess/talking/tracking.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/talking/tracking.pkl
--------------------------------------------------------------------------------
/preprocess/talking/uid2clip.pkl:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/talking/uid2clip.pkl
--------------------------------------------------------------------------------
/prompts/database_query_prompt.txt:
--------------------------------------------------------------------------------
1 | You are tasked with answering a question about a video using a database. The database consists of three tables:
2 |
3 | TABLE Objects(
4 | object_id INT,
5 | category VARCHAR(255),
6 | PRIMARY KEY (object_id)
7 | )
8 | The 'Objects' table catalogues the people or objects in the video, with each assigned a unique 'object_id' and 'category'. For example, an object entry may be (1, 'cup').
9 |
10 | TABLE Segments(
11 | segment_id INT,
12 | PRIMARY KEY (segment_id)
13 | )
14 | The 'Segments' are 2-second segments of the video. The 'segment_id' starts from 0 and increments by 1 sequentially.
15 |
16 | TABLE Objects_Segments(
17 | object_id INT,
18 | segment_id INT,
19 | PRIMARY KEY (object_id, segment_id),
20 | FOREIGN KEY (object_id) REFERENCES Objects(object_id),
21 | FOREIGN KEY (segment_id) REFERENCES Segments(segment_id)
22 | )
23 | The 'Objects_Segments' table links the 'Objects' and 'Segments' tables, recording which objects appear in each segment.
24 |
25 | You have access to the following tools:
26 |
27 | {tools}
28 |
29 | ATTENTION:
30 | 1. Since you only have information about the objects and their appearing segments, if you think the question requires more information, just output "I cannot answer this question."
31 | 2. The categories of the objects/people are limited. To find a specific object, you can first query the database for all the object categories, and match the object to one of the categories. If you cannot find objects using the categories, you can also try the tool 'retreive_candidate_objects'.
32 | 3. Use single quotes for strings in the MySQL query, for instance: SELECT COUNT(DISTINCT object_id) FROM Objects WHERE category = 'person'
33 |
34 | Use the following format:
35 |
36 | Question: the input question you must answer
37 | Thought: you should always think about what to do
38 | Action: the action to take, should be one of [{tool_names}]
39 | Action Input: the input to the action
40 | Observation: the result of the action... (this Thought/Action/Action Input/Observation can repeat N times)
41 | Thought: I now know the final answer
42 | Final Answer: the answer to the original input question
43 |
44 | Begin!
45 |
46 | Question: {input}
47 | Thought: {agent_scratchpad}
--------------------------------------------------------------------------------
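The three tables above are small enough to prototype locally. The following is a minimal, illustrative sketch (using Python's built-in sqlite3 as a stand-in for the MySQL database the prompt describes, with invented rows) showing the schema and the single-quoted query style the prompt asks for.

import sqlite3

conn = sqlite3.connect(":memory:")
cur = conn.cursor()
cur.executescript("""
CREATE TABLE Objects(object_id INT, category VARCHAR(255), PRIMARY KEY (object_id));
CREATE TABLE Segments(segment_id INT, PRIMARY KEY (segment_id));
CREATE TABLE Objects_Segments(
    object_id INT,
    segment_id INT,
    PRIMARY KEY (object_id, segment_id),
    FOREIGN KEY (object_id) REFERENCES Objects(object_id),
    FOREIGN KEY (segment_id) REFERENCES Segments(segment_id)
);
""")

# Invented example rows: one person and one cup seen across the first three segments.
cur.executemany("INSERT INTO Objects VALUES (?, ?)", [(1, 'person'), (2, 'cup')])
cur.executemany("INSERT INTO Segments VALUES (?)", [(0,), (1,), (2,)])
cur.executemany("INSERT INTO Objects_Segments VALUES (?, ?)", [(1, 0), (1, 1), (2, 2)])

# Same single-quoted string style as the example query in the prompt.
row = cur.execute("SELECT COUNT(DISTINCT object_id) FROM Objects WHERE category = 'person'").fetchone()
print(row[0])  # -> 1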
/prompts/multiple_choice_prompt.txt:
--------------------------------------------------------------------------------
1 | You are tasked with answering a multiple-choice question related to a video. The question has 5 choices, labeled 0, 1, 2, 3, 4. The video is sliced into 2-second segments, each with a segment ID starting from zero and incrementing in chronological order. Each segment has a caption depicting the event.
2 | There is an object memory that saves the objects and their appearing segments. The object memory is maintained by another agent.
3 | You have access to the following tools:
4 |
5 | {tools}
6 |
7 | ATTENTION:
8 | 1. The segment captions with prefix '#C' refer to the camera wearer, while those with prefix '#O' refer to someone other than the camera wearer.
9 | 2. You can use both 'visual_question_answering' and 'object_memory_querying' to answer questions related to objects or people.
10 | 3. The 'visual_question_answering' tool may hallucinate. Pay more attention to the description it gives than to its answer.
11 | 4. Use double quotes for the string arguments of the tools. The input to the tools should not contain any single quotes. If a tool takes two arguments, output them in brackets, such as ("what is the man doing", 1).
12 | 5. It is easier to answer the multiple-choice question by validating the choices.
13 | 6. If the information is too vague to provide an accurate answer, make your best guess.
14 |
15 | Use the following format:
16 |
17 | Question: the input question you must answer
18 | Thought: you should always think about what to do
19 | Action: the action to take, should be one of [{tool_names}]
20 | Action Input: the input to the action
21 | Observation: the result of the action... (this Thought/Action/Action Input/Observation can repeat N times)
22 | Thought: I now know the final answer
23 | Final Answer: the correct choice label (0, 1, 2, 3, 4) to the original input question
24 |
25 | Begin!
26 |
27 | Question: {input}
28 | Thought: {agent_scratchpad}
29 |
--------------------------------------------------------------------------------
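At run time, the {tools}, {tool_names}, {input} and {agent_scratchpad} placeholders are filled in by the agent framework before the text is sent to the LLM. A rough sketch of that substitution is shown below; apart from 'visual_question_answering' and 'object_memory_querying', which the prompt itself mentions, the tool name and descriptions are invented for illustration.

# Illustrative placeholder substitution for prompts/multiple_choice_prompt.txt.
with open('prompts/multiple_choice_prompt.txt') as f:
    template = f.read()

# 'segment_caption_retrieval' is a hypothetical tool name used only in this example.
tool_descriptions = "\n".join([
    "visual_question_answering: ask a question about one video segment",
    "object_memory_querying: query the object memory for objects and their segments",
    "segment_caption_retrieval: look up the caption of a segment by its ID",
])
prompt = template.format(
    tools=tool_descriptions,
    tool_names="visual_question_answering, object_memory_querying, segment_caption_retrieval",
    input="What did the person pick up first? 0. a cup 1. a book 2. a brush 3. a phone 4. a bottle",
    agent_scratchpad="",  # grows with each Thought/Action/Observation round
)
print(prompt)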
/prompts/prompt.txt:
--------------------------------------------------------------------------------
1 | You are tasked with answering a question related to a video. The video is sliced into 2-second segments, each with a segment ID starting from zero and incrementing in chronological order. Each segment has a caption depicting the event.
2 | There is an object memory that saves the objects and their appearing segments. The object memory is maintained by another agent.
3 | You have access to the following tools:
4 |
5 | {tools}
6 |
7 | ATTENTION:
8 | 1. The segment captions with prefix '#C' refer to the camera wearer, while those with prefix '#O' refer to someone other than the camera wearer.
9 | 2. You can use both 'visual_question_answering' and 'object_memory_querying' to answer questions related to objects or people.
10 | 3. The 'visual_question_answering' tool may hallucinate. Pay more attention to the description it gives than to its answer.
11 | 4. Use double quotes for the string arguments of the tools. The input to the tools should not contain any single quotes. If a tool takes two arguments, output them in brackets, such as ("what is the man doing", 1).
12 | 5. If the information is too vague to provide an accurate answer, make your best guess.
13 |
14 | Use the following format:
15 |
16 | Question: the input question you must answer
17 | Thought: you should always think about what to do
18 | Action: the action to take, should be one of [{tool_names}]
19 | Action Input: the input to the action
20 | Observation: the result of the action... (this Thought/Action/Action Input/Observation can repeat N times)
21 | Thought: I now know the final answer
22 | Final Answer: the answer to the original input question
23 |
24 | Begin!
25 |
26 | Question: {input}
27 | Thought: {agent_scratchpad}
28 |
--------------------------------------------------------------------------------
/reid.py:
--------------------------------------------------------------------------------
1 | import cv2
2 | from time import time
3 | import json
4 | import pickle
5 | import os
6 | from collections import defaultdict
7 | import clip
8 | import random as rd
9 | from PIL import Image
10 | import torch
11 | import numpy as np
12 | import imageio
13 | rd.seed(0)
14 |
15 |
16 | def hash_color(obj_id):
17 | np.random.seed(obj_id)
18 | color = np.random.randint(0, 256, 3)
19 | new_color = tuple(int(i) for i in color)
20 | return new_color
21 |
22 |
23 | class ReID:
24 | def __init__(self, video_path_list, base_dir='preprocess'):
25 | self.video_path_list = video_path_list
26 | self.base_dir = base_dir
27 | self.trackid2clip_emb = None
28 | self.trackid2dinov2_emb = None
29 | self.trackid2frame = None
30 | self.trackid2category = None
31 | self.uid2tids = None
32 | self.tid2uid = None
33 |
34 |
35 | def hard_constraint(self, obj1, obj2):
36 | # if self.trackid2category[obj1] != self.trackid2category[obj2]: # if two tracked objects have different categories, they cannot be the same object
37 | # return False
38 | frame1 = set(self.trackid2frame[obj1])
39 | frame2 = set(self.trackid2frame[obj2])
40 | if len(frame1.intersection(frame2)) > 0: # if two tracked objects co-exist, they cannot be the same object
41 | return False
42 | return True
43 |
44 |
45 | def clip_similarity_score(self, obj1, obj2, x0=0.925, slope=20):
46 | clip_emb1 = self.trackid2clip_emb[obj1]
47 | clip_emb2 = self.trackid2clip_emb[obj2]
48 | cosine_score = np.dot(clip_emb1, clip_emb2) / (np.linalg.norm(clip_emb1) * np.linalg.norm(clip_emb2))
49 | clip_score = 1 / (1 + np.exp(-slope * (cosine_score - x0)))
50 | return clip_score
51 |
52 |
53 | def dinov2_similarity_score(self, obj1, obj2, x0=0.5, slope=4.1):
54 | dinov2_emb1 = self.trackid2dinov2_emb[obj1]
55 | dinov2_emb2 = self.trackid2dinov2_emb[obj2]
56 | cosine_score = np.dot(dinov2_emb1, dinov2_emb2) / (np.linalg.norm(dinov2_emb1) * np.linalg.norm(dinov2_emb2))
57 | #dinov2_score = 1 / (1 + np.exp(-slope * (cosine_score - x0)))
58 | dinov2_score = cosine_score
59 | return dinov2_score
60 |
61 |
62 | def compute_score(self, obj1, obj2):
63 | if not self.hard_constraint(obj1, obj2):
64 | return 0
65 | clip_score = self.clip_similarity_score(obj1, obj2)
66 | dinov2_score = self.dinov2_similarity_score(obj1, obj2)
67 | return 0.15 * clip_score + 0.85 * dinov2_score
68 |
69 |
70 | def check_group(self, tid, uid):
71 | """tid should has score > 0.5 for all uid objects, and at least one score > 0.62"""
72 | sgn = False
73 | for t in self.uid2tids[uid]:
74 | if self.compute_score(tid, t) < 0.5:
75 | return False
76 | if self.compute_score(tid, t) >= 0.62:
77 | sgn = True
78 | return sgn
79 |
80 |
81 | def reid_for_all_videos(self):
82 | for video_path in self.video_path_list:
83 | base_name = os.path.basename(video_path).replace(".mp4", "")
84 | video_dir = os.path.join(self.base_dir, base_name)
85 | with open(os.path.join(video_dir, 'tid2clip.pkl'), 'rb') as f:
86 | self.trackid2clip_emb = pickle.load(f)
87 | with open(os.path.join(video_dir, 'tid2dinov2.pkl'), 'rb') as f:
88 | self.trackid2dinov2_emb = pickle.load(f)
89 | with open(os.path.join(video_dir, 'tracking.pkl'), 'rb') as f:
90 | content = pickle.load(f)
91 | self.frame2trackid, self.trackid2frame, self.trackid2category = content[0], content[1], content[2]
92 | self.uid2tids = defaultdict(list)
93 | self.tid2uid = dict()
94 |
95 | for frame in self.frame2trackid:
96 | cur_track_ids = self.frame2trackid[frame]
97 | for tid in cur_track_ids:
98 | if tid in self.tid2uid:
99 | continue
100 | sgn = False
101 | for uid in self.uid2tids:
102 | if self.check_group(tid, uid):
103 | self.uid2tids[uid].append(tid)
104 | self.tid2uid[tid] = uid
105 | sgn = True
106 | break
107 | if not sgn:
108 | uid = len(self.uid2tids)
109 | self.uid2tids[uid].append(tid)
110 | self.tid2uid[tid] = uid
111 |
112 | frame2uid = defaultdict(dict)
113 | uid2frame = defaultdict(list)
114 | uid2category = dict()
115 | uid2clipemb = defaultdict(list)
116 | uid2clip = dict()
117 | for frame in self.frame2trackid:
118 | for tid in self.frame2trackid[frame]:
119 | frame2uid[frame][self.tid2uid[tid]] = self.frame2trackid[frame][tid]
120 | for uid in self.uid2tids:
121 | tids = self.uid2tids[uid]
122 | for tid in tids:
123 | uid2frame[uid] += self.trackid2frame[tid]
124 | uid2clipemb[uid].append(self.trackid2clip_emb[tid])
125 |
126 | for uid in uid2clipemb:
127 | emb = torch.stack(uid2clipemb[uid], dim=0)
128 | emb = torch.mean(emb, dim=0)
129 | uid2clip[uid] = emb
130 | save_file = os.path.join(video_dir, 'uid2clip.pkl')
131 | with open(save_file, 'wb') as f:
132 | pickle.dump(uid2clip, f)
133 |
134 | reid_file = os.path.join(video_dir, 'reid.pkl')
135 | for uid in self.uid2tids:
136 | uid2category[uid] = self.trackid2category[self.uid2tids[uid][0]]
137 | with open(reid_file, 'wb') as f:
138 | pickle.dump([frame2uid, uid2frame, uid2category], f)
139 |
140 |
141 | def replay(self):
142 | for video_path in self.video_path_list:
143 | base_name = os.path.basename(video_path).replace(".mp4", "")
144 | video_dir = os.path.join(self.base_dir, base_name)
145 | with open(os.path.join(video_dir, 'reid.pkl'), 'rb') as f:
146 | content = pickle.load(f)
147 | frame2uid, uid2frame, uid2category = content[0], content[1], content[2]
148 | cap = cv2.VideoCapture(video_path)
149 | cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
150 | frame_idx = -1
151 | writer = imageio.get_writer(os.path.join(video_dir, 'reid.mp4'), fps=15)
152 | while True:
153 | success, frame = cap.read()
154 | frame_idx += 1
155 | if not success:
156 | break
157 | if frame_idx in frame2uid:
158 | for uid in frame2uid[frame_idx]:
159 | c = hash_color(uid)
160 | x, y, w, h = frame2uid[frame_idx][uid][1]
161 | left_top = (int(x-w/2), int(y-h/2))
162 | right_bottom = (int(x+w/2), int(y+h/2))
163 | cv2.rectangle(frame, left_top, right_bottom, c, 2)
164 | label = f'ID: {uid}'
165 | label_position = (int(x-w/2)+2, int(y-h/2)+12)
166 | cv2.putText(frame, label, label_position, cv2.FONT_HERSHEY_SIMPLEX, 0.5, c, 2)
167 | #cv2.imshow("reid", frame)
168 | frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
169 | writer.append_data(frame)
170 | writer.close()
171 | cap.release()
172 | cv2.destroyAllWindows()
173 |
174 |
175 | def run(self):
176 | self.reid_for_all_videos()
177 | self.replay()
178 |
--------------------------------------------------------------------------------
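A usage sketch for the class above, with paths following the repository layout; it assumes the tracking stage has already produced tid2clip.pkl, tid2dinov2.pkl and tracking.pkl under preprocess/kitchen/.

from reid import ReID

# run() performs re-identification for every listed video and then renders the
# annotated replay: it writes uid2clip.pkl, reid.pkl and reid.mp4 into preprocess/kitchen/.
reid = ReID(video_path_list=['sample_videos/kitchen.mp4'], base_dir='preprocess')
reid.run()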
/sample_videos/boats.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/sample_videos/boats.mp4
--------------------------------------------------------------------------------
/sample_videos/books.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/sample_videos/books.mp4
--------------------------------------------------------------------------------
/sample_videos/kitchen.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/sample_videos/kitchen.mp4
--------------------------------------------------------------------------------
/sample_videos/painting.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/sample_videos/painting.mp4
--------------------------------------------------------------------------------
/sample_videos/talking.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/sample_videos/talking.mp4
--------------------------------------------------------------------------------
/segment_feature.py:
--------------------------------------------------------------------------------
1 | import os
2 | import time
3 | import torch
4 | import json
5 | import cv2
6 | import pickle
7 | from InternVid.viclip import get_viclip, frames2tensor, get_vid_feat
8 | from encoder import encode_sentences
9 |
10 |
11 | model_cfgs = {
12 | 'viclip-l-internvid-10m-flt': {
13 | 'size': 'l',
14 | 'pretrained': 'tool_models/viCLIP/ViClip-InternVid-10M-FLT.pth',
15 | }
16 | }
17 |
18 | class SegmentFeature:
19 | def __init__(self, video_path_list, base_dir='preprocess'):
20 | self.video_path_list = video_path_list
21 | self.base_dir = base_dir
22 | self.seconds_per_feat = 2
23 | self.frames_per_feat = 10
24 |
25 |
26 |
27 | def create_textual_embedding(self):
28 | """use the sentence encoder model to embed the captions of all the videos"""
29 | model='text-embedding-3-large'
30 | for video_path in self.video_path_list:
31 | start_time = time.time()
32 | base_name = os.path.basename(video_path).replace(".mp4", "")
33 | video_dir = os.path.join(self.base_dir, base_name)
34 | with open(os.path.join(video_dir, 'captions.json')) as f:
35 | captions = json.load(f)
36 | caps = list(captions.values())
37 | caption_emb = encode_sentences(sentence_list=caps, model_name=model)
38 | print(caption_emb)
39 | with open(os.path.join(video_dir, f'segment_textual_embedding.pkl'), 'wb') as f:
40 | pickle.dump(caption_emb, f)
41 | end_time = time.time()
42 | print(f"textual encoding time for video {base_name}: {round(end_time-start_time, 3)} seconds")
43 |
44 |
45 | def create_visual_embedding(self):
46 | start_time = time.time()
47 | cfg = model_cfgs['viclip-l-internvid-10m-flt']
48 | model = get_viclip(cfg['size'], cfg['pretrained'])
49 | assert(type(model)==dict and model['viclip'] is not None and model['tokenizer'] is not None)
50 | clip, tokenizer = model['viclip'], model['tokenizer']
51 | clip = clip.to("cuda")
52 | end_time = time.time()
53 | print(f'time for loading viCLIP model: {round(end_time-start_time, 3)} seconds')
54 |
55 | for video_path in self.video_path_list:
56 | base_name = os.path.basename(video_path).replace(".mp4", "")
57 | video_dir = os.path.join(self.base_dir, base_name)
58 | if not os.path.exists(video_dir):
59 | os.makedirs(video_dir)
60 |
61 | cap = cv2.VideoCapture(video_path)
62 | fps = round(cap.get(cv2.CAP_PROP_FPS))
63 | total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
64 | frame_interval = fps*self.seconds_per_feat//self.frames_per_feat
65 | total_feats = total_frames//(fps*self.seconds_per_feat)
66 |
67 | segment_feats = []
68 | start_time = time.time()
69 | cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
70 | for segment_id in range(total_feats):
71 | frames = []
72 | for i in range(self.frames_per_feat):
73 | success, frame = cap.read()
74 | frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
75 | frames.append(frame)
76 | for j in range(frame_interval-1): #skip other frames
77 | success, frame = cap.read()
78 | for i in range(fps*self.seconds_per_feat-frame_interval*self.frames_per_feat):
79 | success, frame = cap.read() #skip remaining frames
80 | frames_tensor = frames2tensor(frames, device='cuda')
81 | with torch.no_grad():
82 | vid_feat = get_vid_feat(frames_tensor, clip).cpu()
83 | segment_feats.append(vid_feat)
84 | segment_feats = torch.cat(segment_feats, dim=0).numpy()
85 | end_time = time.time()
86 | cap.release()
87 | print(segment_feats)
88 | print(f"visual embedding time for video {base_name}: {round(end_time-start_time, 3)} seconds")
89 | with open(os.path.join(video_dir, 'segment_visual_embedding.pkl'), 'wb') as f:
90 | pickle.dump(segment_feats, f)
91 |
92 |
93 | def run(self):
94 | self.create_textual_embedding()
95 | self.create_visual_embedding()
96 |
97 |
--------------------------------------------------------------------------------
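A usage sketch for the class above; it assumes preprocess/kitchen/captions.json and the ViCLIP checkpoint referenced in model_cfgs are already in place, and that a CUDA device is available.

from segment_feature import SegmentFeature

sf = SegmentFeature(video_path_list=['sample_videos/kitchen.mp4'], base_dir='preprocess')
sf.run()  # writes segment_textual_embedding.pkl and segment_visual_embedding.pkl per video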
/utils.py:
--------------------------------------------------------------------------------
1 | from sklearn.metrics.pairwise import cosine_similarity
2 | import numpy as np
3 |
4 |
5 | def compute_cosine_similarity(target_embedding, embedding_list):
6 | target_embedding_tensor = target_embedding.reshape(1, -1)
7 | # Compute cosine similarity
8 | similarity_scores = cosine_similarity(target_embedding_tensor, embedding_list)
9 | return similarity_scores.reshape(-1)
10 |
11 |
12 | def top_k_indices(scores, k):
13 | max_len = scores.shape[0]
14 | k = min(max_len, k)
15 | indices = np.argsort(scores)[-k:][::-1]
16 | return list(indices)
--------------------------------------------------------------------------------
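A toy example of the two helpers above, ranking a small bank of embeddings against a query vector.

import numpy as np
from utils import compute_cosine_similarity, top_k_indices

query = np.array([1.0, 0.0, 0.0])
bank = np.array([[0.9, 0.1, 0.0],   # nearly parallel to the query
                 [0.0, 1.0, 0.0],   # orthogonal to the query
                 [0.7, 0.7, 0.0]])  # 45 degrees away
scores = compute_cosine_similarity(query, bank)  # shape (3,)
print(top_k_indices(scores, k=2))  # indices of the two most similar rows: 0, then 2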
/video-llava.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from videollava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
3 | from videollava.conversation import conv_templates, SeparatorStyle
4 | from videollava.model.builder import load_pretrained_model
5 | from videollava.utils import disable_torch_init
6 | from videollava.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria
7 | import socket
8 | import os
9 | import pickle
10 |
11 |
12 | def main():
13 | disable_torch_init()
14 | model_path = 'LanguageBind/Video-LLaVA-7B'
15 | cache_dir = 'cache_dir'
16 | device = 'cuda'
17 | load_4bit, load_8bit = True, False
18 | model_name = get_model_name_from_path(model_path)
19 | tokenizer, model, processor, _ = load_pretrained_model(model_path, None, model_name, load_8bit, load_4bit, device=device, cache_dir=cache_dir)
20 | video_processor = processor['video']
21 | server = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
22 | if not os.path.exists("tmp"):
23 | os.mkdir("tmp")
24 | if os.path.exists("tmp/vqa.sock"):
25 | os.unlink("tmp/vqa.sock")
26 | server.bind("tmp/vqa.sock")
27 | server.listen(0)
28 | print('ready for connection!')
29 | # with open("tmp/ready.txt", 'w') as f:
30 | # f.write("ready!")
31 | while True:
32 | connection, address = server.accept()
33 | r = connection.recv(1024).decode()
34 | # if r == "stop":
35 | # break
36 | with open('tmp/content.pkl', 'rb') as f:
37 | content = pickle.load(f)
38 | video_path = content['video_path']
39 | questions = ['what is the video about?', content['question']]
40 | answers = []
41 | print('\n'+video_path)
42 | for i in range(2):
43 | video_tensor = video_processor(video_path, return_tensors='pt')['pixel_values']
44 | if type(video_tensor) is list:
45 | tensor = [video.to(model.device, dtype=torch.float16) for video in video_tensor]
46 | else:
47 | tensor = video_tensor.to(model.device, dtype=torch.float16)
48 |
49 | conv_mode = "llava_v1"
50 | conv = conv_templates[conv_mode].copy()
51 | roles = conv.roles
52 |
53 | print(f"{roles[1]}: {questions[i]}")
54 | question = ' '.join([DEFAULT_IMAGE_TOKEN] * model.get_video_tower().config.num_frames) + '\n' + questions[i]
55 | conv.append_message(conv.roles[0], question)
56 | conv.append_message(conv.roles[1], None)
57 | prompt = conv.get_prompt()
58 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()
59 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
60 | keywords = [stop_str]
61 | stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
62 | #print('video & question processing done!')
63 | with torch.inference_mode():
64 | output_ids = model.generate(
65 | input_ids,
66 | images=tensor,
67 | do_sample=True,
68 | temperature=0.1,
69 | max_new_tokens=1024,
70 | use_cache=True,
71 | stopping_criteria=[stopping_criteria])
72 |
73 | outputs = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip()
74 | outputs = outputs.replace("</s>", "")  # strip the end-of-sequence token from the decoded output
75 | answers.append(outputs)
76 | reply = f"Segment description: {answers[0]}\nAnswer to the question: {answers[1]}"
77 | print(reply)
78 | with open('tmp/content.pkl', 'wb') as f:
79 | pickle.dump(reply, f)
80 | connection.send(b'sent')
81 | r = connection.recv(1024)
82 | connection.close()
83 |
84 |
85 | if __name__ == '__main__':
86 | main()
--------------------------------------------------------------------------------
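The script above acts as a VQA server: the actual request and reply travel through tmp/content.pkl, while the Unix socket only carries small synchronisation messages. Below is a client-side sketch of that protocol; the segment path and question are examples only.

import pickle
import socket

request = {'video_path': 'preprocess/kitchen/segment_0.mp4',
           'question': 'What is the person doing?'}
with open('tmp/content.pkl', 'wb') as f:
    pickle.dump(request, f)

client = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
client.connect('tmp/vqa.sock')
client.send(b'ask')        # wake the server; the message content itself is not inspected
client.recv(1024)          # server answers b'sent' once the reply has been written back
with open('tmp/content.pkl', 'rb') as f:
    reply = pickle.load(f)
print(reply)
client.send(b'done')       # final ack so the server closes this connection
client.close()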