├── InternVid ├── README.md ├── README_CN.md ├── demo.ipynb ├── div_sampling.py ├── test_viCLIP.py ├── utils │ ├── basic_utils.py │ ├── config.py │ ├── config_utils.py │ ├── distributed.py │ ├── easydict.py │ ├── logger.py │ ├── optimizer.py │ └── scheduler.py └── viclip │ ├── __init__.py │ ├── __pycache__ │ ├── __init__.cpython-311.pyc │ ├── __init__.cpython-39.pyc │ ├── simple_tokenizer.cpython-311.pyc │ ├── simple_tokenizer.cpython-39.pyc │ ├── viclip.cpython-39.pyc │ ├── viclip_text.cpython-39.pyc │ └── viclip_vision.cpython-39.pyc │ ├── bpe_simple_vocab_16e6.txt.gz │ ├── simple_tokenizer.py │ ├── viclip.py │ ├── viclip_text.py │ └── viclip_vision.py ├── LICENSE ├── LaViLa ├── CODE_OF_CONDUCT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── __pycache__ │ └── eval_narrator.cpython-39.pyc ├── clip_caption.py ├── datasets │ └── README.md ├── demo.py ├── demo_narrator.py ├── demo_narrator_3rd_person.py ├── docs │ ├── INSTALL.md │ ├── MODEL_ZOO.md │ └── PRETRAIN.md ├── eval_narrator.py ├── eval_zeroshot.py ├── lavila │ ├── data │ │ ├── __pycache__ │ │ │ ├── datasets.cpython-39.pyc │ │ │ └── video_transforms.cpython-39.pyc │ │ ├── datasets.py │ │ └── video_transforms.py │ ├── models │ │ ├── __pycache__ │ │ │ ├── coca.cpython-39.pyc │ │ │ ├── distributed_utils.cpython-39.pyc │ │ │ ├── gpt2_gated.cpython-39.pyc │ │ │ ├── loss.cpython-39.pyc │ │ │ ├── models.cpython-39.pyc │ │ │ ├── narrator.cpython-39.pyc │ │ │ ├── openai_clip.cpython-39.pyc │ │ │ ├── openai_model.cpython-39.pyc │ │ │ ├── timesformer.cpython-39.pyc │ │ │ ├── tokenizer.cpython-39.pyc │ │ │ └── utils.cpython-39.pyc │ │ ├── bpe_simple_vocab_16e6.txt.gz │ │ ├── coca.py │ │ ├── distributed_utils.py │ │ ├── gpt2_gated.py │ │ ├── loss.py │ │ ├── models.py │ │ ├── narrator.py │ │ ├── openai_clip.py │ │ ├── openai_model.py │ │ ├── timesformer.py │ │ ├── tokenizer.py │ │ └── utils.py │ └── utils │ │ ├── __pycache__ │ │ ├── distributed.cpython-39.pyc │ │ └── preprocess.cpython-39.pyc │ │ ├── distributed.py │ │ ├── evaluation.py │ │ ├── evaluation_charades.py │ │ ├── evaluation_egomcq.py │ │ ├── evaluation_ek100cls.py │ │ ├── evaluation_ek100mir.py │ │ ├── meter.py │ │ ├── preprocess.py │ │ ├── random.py │ │ └── scheduler.py ├── main_finetune_classification.py ├── main_finetune_retrieval.py ├── main_infer_narrator.py ├── main_pretrain.py ├── requirements.txt ├── run_with_submitit_finetune_classification.py ├── run_with_submitit_finetune_retrieval.py ├── run_with_submitit_infer_narrator.py ├── run_with_submitit_pretrain.py └── scripts │ ├── convert_egovlp_ckpt.py │ └── crop_and_resize_ego4d.sh ├── README.md ├── captioning.py ├── config └── default.yaml ├── database.py ├── demo.py ├── encoder.py ├── environment.yaml ├── imgs ├── demo.png └── teaser.png ├── inference.py ├── main.py ├── preprocess ├── boats │ ├── captions.json │ ├── reid.mp4 │ ├── reid.pkl │ ├── segment2id.json │ ├── segment_textual_embedding.pkl │ ├── segment_visual_embedding.pkl │ ├── tid2clip.pkl │ ├── tid2dinov2.pkl │ ├── tracking.pkl │ └── uid2clip.pkl ├── books │ ├── captions.json │ ├── reid.mp4 │ ├── reid.pkl │ ├── segment2id.json │ ├── segment_textual_embedding.pkl │ ├── segment_visual_embedding.pkl │ ├── tid2clip.pkl │ ├── tid2dinov2.pkl │ ├── tracking.pkl │ └── uid2clip.pkl ├── kitchen │ ├── captions.json │ ├── reid.mp4 │ ├── reid.pkl │ ├── segment2id.json │ ├── segment_0.mp4 │ ├── segment_1.mp4 │ ├── segment_18.mp4 │ ├── segment_3.mp4 │ ├── segment_8.mp4 │ ├── segment_textual_embedding.pkl │ ├── segment_visual_embedding.pkl │ ├── tid2clip.pkl │ ├── 
tid2dinov2.pkl │ ├── tracking.pkl │ └── uid2clip.pkl ├── painting │ ├── captions.json │ ├── reid.mp4 │ ├── reid.pkl │ ├── segment2id.json │ ├── segment_83.mp4 │ ├── segment_85.mp4 │ ├── segment_textual_embedding.pkl │ ├── segment_visual_embedding.pkl │ ├── tid2clip.pkl │ ├── tid2dinov2.pkl │ ├── tracking.pkl │ └── uid2clip.pkl └── talking │ ├── captions.json │ ├── reid.mp4 │ ├── reid.pkl │ ├── segment2id.json │ ├── segment_10.mp4 │ ├── segment_11.mp4 │ ├── segment_9.mp4 │ ├── segment_textual_embedding.pkl │ ├── segment_visual_embedding.pkl │ ├── tid2clip.pkl │ ├── tid2dinov2.pkl │ ├── tracking.pkl │ └── uid2clip.pkl ├── prompts ├── database_query_prompt.txt ├── multiple_choice_prompt.txt └── prompt.txt ├── reid.py ├── sample_videos ├── boats.mp4 ├── books.mp4 ├── kitchen.mp4 ├── painting.mp4 └── talking.mp4 ├── segment_feature.py ├── tools.py ├── tracking.py ├── utils.py └── video-llava.py /InternVid/README_CN.md: -------------------------------------------------------------------------------- 1 | # InternVid \[[论文](https://arxiv.org/pdf/2307.06942.pdf)\] 2 | 3 | [![数据集](https://img.shields.io/badge/%F0%9F%A4%97%20InternVid-Dataset-blue)](https://huggingface.co/datasets/OpenGVLab/InternVid) | [![模型](https://img.shields.io/badge/%F0%9F%A4%97%20ViCLIP-Model-purple)](https://huggingface.co/OpenGVLab/ViCLIP) 4 | 5 | \[[English verision](README.md)\] 6 | 7 | # :fire: 新闻 8 | 我们很高兴宣布部分发布一个大规模的视频文本数据集,旨在促进多模态理解和生成。作为此次发布的一部分,我们提供了该数据集的[子集](https://huggingface.co/datasets/OpenGVLab/InternVid)包含1000万个视频剪辑。此外,我们还提供了一个使用ViT-L架构在这个子集上训练的[ViCLIP](https://huggingface.co/OpenGVLab/ViCLIP)。该模型在Kinetics上实现了SOTA的零样本动作识别性能。 9 | 10 | 我们提供了示例代码,阐明如何使用ViClip的过程,在[demo.ipynb](https://github.com/OpenGVLab/InternVideo/blob/main/Data/InternVid/demo.ipynb)中有详述。 11 | 12 | 请关注我们的更新! 13 | 14 | # 简介 15 | 16 | **数据** 17 | 18 | 我们从16个流行类别中收集了各种百分比的视频。为了确保多样性,我们选择了来自不同语言的国家的视频,而非依赖于一个主导语言环境。我们采样的国家包括英国、美国、澳大利亚、日本、韩国、中国、俄罗斯和法国等。在时长方面,每个视频平均持续351.9秒。几乎一半(49%)的视频时长不超过五分钟,而四分之一(26%)的视频时长在五到十分钟之间。只有8%的视频超过20分钟。在策划的视频中,85%是高分辨率(720P),其余15%的分辨率从360P至720P不等。虽然低分辨率的视频在内容生成任务中可能表现不如高分辨率的视频,但只要配有适当的字幕,它们仍可用于视频-语言表示学习。 19 | 20 | ![b469e00b43d46a6b3f89899483abcf6](https://github.com/OpenGVLab/InternVideo/assets/43169235/7d6aca7d-362a-425d-9ef2-ec0189491b52) 21 | 22 | InternVid展示了在分割剪辑级别上具有不同剪辑时长和字幕长度的多样性。美学分数和剪辑-字幕相似度均匀分布。大部分剪辑的长度在0-10秒之间,占所有剪辑的85%。大约一半的剪辑字幕含有10-20个单词,而三分之一的剪辑字幕含有少于10个单词。大约11%的剪辑具有超过20个单词的长字幕。 23 | 24 | ![429af4993adb77478c000c865ae5a1b](https://github.com/OpenGVLab/InternVideo/assets/43169235/f64588c3-81e8-43de-b771-46500474d2ff) 25 | 26 | **ViCLIP: 一个简单的用于转移视频-文本表示的视频CLIP** 27 | 28 | 基于CLIP, 我们构建了一个简单的视频-文本预训练基线ViCLIP。它由视频编码器(ViT)和文本编码器组成,如下所示。这两个模块都是从相应的CLIP组件初始化的。我们将视频编码器中的原生注意力更新为时空注意力,同时保持其他设计元素不变。为了高效学习,我们在预训练中对视频进行了掩蔽处理。 29 | 30 | 87c6263cc4aceee72cc8e37085a8109 31 | 32 | 33 | # 数据 & 模型库 34 | 35 | ### 预训练数据 & 模型 36 | 37 |
38 | 39 | | 模型 | 训练数据 | 描述 | 40 | | :-----------------: | :----------------------: | :---------------------------------------------------------------------------------------------------: | 41 | | ViCLIP-L-14 \[[HuggingFace](https://huggingface.co/OpenGVLab/ViCLIP) \| [Aliyun](https://pjlab-gvm-data.oss-cn-shanghai.aliyuncs.com/internvideo/viclip/ViClip-InternVid-10M-FLT.pth)\] | InternVid-10M-FLT \[[HuggingFace](https://huggingface.co/datasets/OpenGVLab/InternVid) \| [OpenDataLab](https://opendatalab.com/shepshep/InternVid)\] | | 42 |
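下面是一个最小的检索示例(a minimal sketch),基于本仓库 `viclip/__init__.py` 提供的辅助函数;完整示例请参考 `demo.ipynb` 和 `test_viCLIP.py`。其中视频与权重路径仅为占位符,且 `retrieve_text` 默认假定使用 CUDA 设备(也可显式传入 `device`):

```python
import cv2
from viclip import get_viclip, retrieve_text, _frame_from_video

# Decode all frames of a short clip (path is illustrative).
video = cv2.VideoCapture('example1.mp4')
frames = [f for f in _frame_from_video(video)]

# Load ViCLIP-L and its tokenizer from a local checkpoint (path is illustrative).
m = get_viclip(size='l', pretrain='ViClip-InternVid-10M-FLT.pth')

text_candidates = [
    "A playful dog runs through the snowy yard.",
    "A person cooks dinner in a kitchen.",
]
ret_texts, probs = retrieve_text(frames, text_candidates, models=m, topk=2)
for t, p in zip(ret_texts, probs):
    print(f'{p:.4f} {t}')
```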
43 | 44 | 45 | ## Citation 46 | 47 | 如果您发现这项工作对您的研究有所帮助,请考虑引用InternVid。您的肯定将极大地帮助我们继续为研究社区贡献资源。 48 | 49 | ``` 50 | @article{wang2023internvid, 51 | title={InternVid: A Large-scale Video-Text Dataset for Multimodal Understanding and Generation}, 52 | author={Wang, Yi and He, Yinan and Li, Yizhuo and Li, Kunchang and Yu, Jiashuo and Ma, Xin and Chen, Xinyuan and Wang, Yaohui and Luo, Ping and Liu, Ziwei and Wang, Yali and Wang, Limin and Qiao, Yu}, 53 | journal={arXiv preprint arXiv:2307.06942}, 54 | year={2023} 55 | } 56 | 57 | @article{wang2022internvideo, 58 | title={InternVideo: General Video Foundation Models via Generative and Discriminative Learning}, 59 | author={Wang, Yi and Li, Kunchang and Li, Yizhuo and He, Yinan and Huang, Bingkun and Zhao, Zhiyu and Zhang, Hongjie and Xu, Jilan and Liu, Yi and Wang, Zun and Xing, Sen and Chen, Guo and Pan, Junting and Yu, Jiashuo and Wang, Yali and Wang, Limin and Qiao, Yu}, 60 | journal={arXiv preprint arXiv:2212.03191}, 61 | year={2022} 62 | } 63 | ``` -------------------------------------------------------------------------------- /InternVid/div_sampling.py: -------------------------------------------------------------------------------- 1 | from collections import Counter 2 | import json 3 | import random 4 | import numpy as np 5 | data = json.load(open("/path/to/to_sample")) 6 | video_id = set([x["video"].split("/")[-1][:11] for x in data]) 7 | video_id_counter = Counter([x["video"].split("/")[-1][:11] for x in data]) 8 | sampling_weights = [1.0 / video_id_counter[x["video"].split("/")[-1][:11]] for x in data] 9 | np.random.seed(42) 10 | sampling_weights = np.array(sampling_weights) 11 | sampling_weights = sampling_weights / sampling_weights.sum() 12 | sampled_index = np.random.choice(len(data), 10647458, replace=False, p=sampling_weights) 13 | data = [data[i] for i in sampled_index] 14 | json.dump(data, open("/path/to/sampled", "w")) -------------------------------------------------------------------------------- /InternVid/test_viCLIP.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | import os 3 | import cv2 4 | 5 | from viclip import get_viclip, retrieve_text, _frame_from_video 6 | video = cv2.VideoCapture('Data/InternVid/example1.mp4') 7 | frames = [x for x in _frame_from_video(video)] 8 | print('frames', frames) 9 | # modify xxx to the path of the pretrained model 10 | model_cfgs = { 11 | 'viclip-l-internvid-10m-flt': { 12 | 'size': 'l', 13 | 'pretrained': '/home/yue/data/ViClip-InternVid-10M-FLT.pth', 14 | }, 15 | 'viclip-l-internvid-200m': { 16 | 'size': 'l', 17 | 'pretrained': 'xxx/ViCLIP-L_InternVid-200M.pth', 18 | }, 19 | 'viclip-b-internvid-10m-flt': { 20 | 'size': 'b', 21 | 'pretrained': 'xxx/ViCLIP-B_InternVid-FLT-10M.pth', 22 | }, 23 | 'viclip-b-internvid-200m': { 24 | 'size': 'b', 25 | 'pretrained': 'xxx/ViCLIP-B_InternVid-200M.pth', 26 | }, 27 | } 28 | 29 | text_candidates = ["A playful dog and its owner wrestle in the snowy yard, chasing each other with joyous abandon.", 30 | "A man in a gray coat walks through the snowy landscape, pulling a sleigh loaded with toys.", 31 | "A person dressed in a blue jacket shovels the snow-covered pavement outside their house.", 32 | "A pet dog excitedly runs through the snowy yard, chasing a toy thrown by its owner.", 33 | "A person stands on the snowy floor, pushing a sled loaded with blankets, preparing for a fun-filled ride.", 34 | "A man in a gray hat and coat walks through the snowy yard, carefully navigating around the 
trees.", 35 | "A playful dog slides down a snowy hill, wagging its tail with delight.", 36 | "A person in a blue jacket walks their pet on a leash, enjoying a peaceful winter walk among the trees.", 37 | "A man in a gray sweater plays fetch with his dog in the snowy yard, throwing a toy and watching it run.", 38 | "A person bundled up in a blanket walks through the snowy landscape, enjoying the serene winter scenery."] 39 | 40 | cfg = model_cfgs['viclip-l-internvid-10m-flt'] 41 | model_l = get_viclip(cfg['size'], cfg['pretrained']) 42 | print('a') 43 | texts, probs = retrieve_text(frames, text_candidates, models=model_l, topk=5) 44 | -------------------------------------------------------------------------------- /InternVid/utils/config_utils.py: -------------------------------------------------------------------------------- 1 | import logging 2 | import os 3 | import sys 4 | from os.path import dirname, join 5 | 6 | from utils.config import Config 7 | from utils.distributed import init_distributed_mode, is_main_process 8 | from utils.logger import setup_logger 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | 13 | def setup_config(): 14 | """Conbine yaml config and command line config with OmegaConf. 15 | Also converts types, e.g., `'None'` (str) --> `None` (None) 16 | """ 17 | config = Config.get_config() 18 | if config.debug: 19 | config.wandb.enable = False 20 | return config 21 | 22 | 23 | def setup_evaluate_config(config): 24 | """setup evaluation default settings, e.g., disable wandb""" 25 | assert config.evaluate 26 | config.wandb.enable = False 27 | if config.output_dir is None: 28 | config.output_dir = join(dirname(config.pretrained_path), "eval") 29 | return config 30 | 31 | 32 | def setup_output_dir(output_dir, excludes=["code"]): 33 | """ensure not overwritting an exisiting/non-empty output dir""" 34 | if not os.path.exists(output_dir): 35 | os.makedirs(output_dir, exist_ok=False) 36 | else: 37 | existing_dirs_files = os.listdir(output_dir) # list 38 | remaining = set(existing_dirs_files) - set(excludes) 39 | remaining = [e for e in remaining if "slurm" not in e] 40 | remaining = [e for e in remaining if ".out" not in e] 41 | # assert len(remaining) == 0, f"remaining dirs or files: {remaining}" 42 | logger.warn(f"remaining dirs or files: {remaining}") 43 | 44 | 45 | def setup_main(): 46 | """ 47 | Setup config, logger, output_dir, etc. 48 | Shared for pretrain and all downstream tasks. 
49 | """ 50 | config = setup_config() 51 | if hasattr(config, "evaluate") and config.evaluate: 52 | config = setup_evaluate_config(config) 53 | init_distributed_mode(config) 54 | 55 | if is_main_process(): 56 | setup_output_dir(config.output_dir, excludes=["code"]) 57 | setup_logger(output=config.output_dir, color=True, name="vindlu") 58 | logger.info(f"config: {Config.pretty_text(config)}") 59 | Config.dump(config, os.path.join(config.output_dir, "config.json")) 60 | return config 61 | -------------------------------------------------------------------------------- /InternVid/utils/distributed.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/InternVid/utils/distributed.py -------------------------------------------------------------------------------- /InternVid/utils/easydict.py: -------------------------------------------------------------------------------- 1 | class EasyDict(dict): 2 | """ 3 | Get attributes 4 | 5 | >>> d = EasyDict({'foo':3}) 6 | >>> d['foo'] 7 | 3 8 | >>> d.foo 9 | 3 10 | >>> d.bar 11 | Traceback (most recent call last): 12 | ... 13 | AttributeError: 'EasyDict' object has no attribute 'bar' 14 | 15 | Works recursively 16 | 17 | >>> d = EasyDict({'foo':3, 'bar':{'x':1, 'y':2}}) 18 | >>> isinstance(d.bar, dict) 19 | True 20 | >>> d.bar.x 21 | 1 22 | 23 | Bullet-proof 24 | 25 | >>> EasyDict({}) 26 | {} 27 | >>> EasyDict(d={}) 28 | {} 29 | >>> EasyDict(None) 30 | {} 31 | >>> d = {'a': 1} 32 | >>> EasyDict(**d) 33 | {'a': 1} 34 | 35 | Set attributes 36 | 37 | >>> d = EasyDict() 38 | >>> d.foo = 3 39 | >>> d.foo 40 | 3 41 | >>> d.bar = {'prop': 'value'} 42 | >>> d.bar.prop 43 | 'value' 44 | >>> d 45 | {'foo': 3, 'bar': {'prop': 'value'}} 46 | >>> d.bar.prop = 'newer' 47 | >>> d.bar.prop 48 | 'newer' 49 | 50 | 51 | Values extraction 52 | 53 | >>> d = EasyDict({'foo':0, 'bar':[{'x':1, 'y':2}, {'x':3, 'y':4}]}) 54 | >>> isinstance(d.bar, list) 55 | True 56 | >>> from operator import attrgetter 57 | >>> map(attrgetter('x'), d.bar) 58 | [1, 3] 59 | >>> map(attrgetter('y'), d.bar) 60 | [2, 4] 61 | >>> d = EasyDict() 62 | >>> d.keys() 63 | [] 64 | >>> d = EasyDict(foo=3, bar=dict(x=1, y=2)) 65 | >>> d.foo 66 | 3 67 | >>> d.bar.x 68 | 1 69 | 70 | Still like a dict though 71 | 72 | >>> o = EasyDict({'clean':True}) 73 | >>> o.items() 74 | [('clean', True)] 75 | 76 | And like a class 77 | 78 | >>> class Flower(EasyDict): 79 | ... power = 1 80 | ... 81 | >>> f = Flower() 82 | >>> f.power 83 | 1 84 | >>> f = Flower({'height': 12}) 85 | >>> f.height 86 | 12 87 | >>> f['power'] 88 | 1 89 | >>> sorted(f.keys()) 90 | ['height', 'power'] 91 | 92 | update and pop items 93 | >>> d = EasyDict(a=1, b='2') 94 | >>> e = EasyDict(c=3.0, a=9.0) 95 | >>> d.update(e) 96 | >>> d.c 97 | 3.0 98 | >>> d['c'] 99 | 3.0 100 | >>> d.get('c') 101 | 3.0 102 | >>> d.update(a=4, b=4) 103 | >>> d.b 104 | 4 105 | >>> d.pop('a') 106 | 4 107 | >>> d.a 108 | Traceback (most recent call last): 109 | ... 
110 | AttributeError: 'EasyDict' object has no attribute 'a' 111 | """ 112 | 113 | def __init__(self, d=None, **kwargs): 114 | if d is None: 115 | d = {} 116 | if kwargs: 117 | d.update(**kwargs) 118 | for k, v in d.items(): 119 | setattr(self, k, v) 120 | # Class attributes 121 | for k in self.__class__.__dict__.keys(): 122 | if not (k.startswith("__") and k.endswith("__")) and not k in ("update", "pop"): 123 | setattr(self, k, getattr(self, k)) 124 | 125 | def __setattr__(self, name, value): 126 | if isinstance(value, (list, tuple)): 127 | value = [self.__class__(x) if isinstance(x, dict) else x for x in value] 128 | elif isinstance(value, dict) and not isinstance(value, self.__class__): 129 | value = self.__class__(value) 130 | super(EasyDict, self).__setattr__(name, value) 131 | super(EasyDict, self).__setitem__(name, value) 132 | 133 | __setitem__ = __setattr__ 134 | 135 | def update(self, e=None, **f): 136 | d = e or dict() 137 | d.update(f) 138 | for k in d: 139 | setattr(self, k, d[k]) 140 | 141 | def pop(self, k, d=None): 142 | if hasattr(self, k): 143 | delattr(self, k) 144 | return super(EasyDict, self).pop(k, d) 145 | 146 | 147 | if __name__ == "__main__": 148 | import doctest 149 | 150 | -------------------------------------------------------------------------------- /InternVid/utils/logger.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/InternVid/utils/logger.py -------------------------------------------------------------------------------- /InternVid/utils/optimizer.py: -------------------------------------------------------------------------------- 1 | """ Optimizer Factory w/ Custom Weight Decay 2 | Hacked together by / Copyright 2020 Ross Wightman 3 | """ 4 | import re 5 | import torch 6 | from torch import optim as optim 7 | from utils.distributed import is_main_process 8 | import logging 9 | logger = logging.getLogger(__name__) 10 | try: 11 | from apex.optimizers import FusedNovoGrad, FusedAdam, FusedLAMB, FusedSGD 12 | has_apex = True 13 | except ImportError: 14 | has_apex = False 15 | 16 | 17 | def add_weight_decay(model, weight_decay, no_decay_list=(), filter_bias_and_bn=True): 18 | named_param_tuples = [] 19 | for name, param in model.named_parameters(): 20 | if not param.requires_grad: 21 | continue # frozen weights 22 | if filter_bias_and_bn and (len(param.shape) == 1 or name.endswith(".bias")): 23 | named_param_tuples.append([name, param, 0]) 24 | elif name in no_decay_list: 25 | named_param_tuples.append([name, param, 0]) 26 | else: 27 | named_param_tuples.append([name, param, weight_decay]) 28 | return named_param_tuples 29 | 30 | 31 | def add_different_lr(named_param_tuples_or_model, diff_lr_names, diff_lr, default_lr): 32 | """use lr=diff_lr for modules named found in diff_lr_names, 33 | otherwise use lr=default_lr 34 | 35 | Args: 36 | named_param_tuples_or_model: List([name, param, weight_decay]), or nn.Module 37 | diff_lr_names: List(str) 38 | diff_lr: float 39 | default_lr: float 40 | Returns: 41 | named_param_tuples_with_lr: List([name, param, weight_decay, lr]) 42 | """ 43 | named_param_tuples_with_lr = [] 44 | logger.info(f"diff_names: {diff_lr_names}, diff_lr: {diff_lr}") 45 | for name, p, wd in named_param_tuples_or_model: 46 | use_diff_lr = False 47 | for diff_name in diff_lr_names: 48 | # if diff_name in name: 49 | if re.search(diff_name, name) is not None: 50 | logger.info(f"param {name} use different_lr: 
{diff_lr}") 51 | use_diff_lr = True 52 | break 53 | 54 | named_param_tuples_with_lr.append( 55 | [name, p, wd, diff_lr if use_diff_lr else default_lr] 56 | ) 57 | 58 | if is_main_process(): 59 | for name, _, wd, diff_lr in named_param_tuples_with_lr: 60 | logger.info(f"param {name}: wd: {wd}, lr: {diff_lr}") 61 | 62 | return named_param_tuples_with_lr 63 | 64 | 65 | def create_optimizer_params_group(named_param_tuples_with_lr): 66 | """named_param_tuples_with_lr: List([name, param, weight_decay, lr])""" 67 | group = {} 68 | for name, p, wd, lr in named_param_tuples_with_lr: 69 | if wd not in group: 70 | group[wd] = {} 71 | if lr not in group[wd]: 72 | group[wd][lr] = [] 73 | group[wd][lr].append(p) 74 | 75 | optimizer_params_group = [] 76 | for wd, lr_groups in group.items(): 77 | for lr, p in lr_groups.items(): 78 | optimizer_params_group.append(dict( 79 | params=p, 80 | weight_decay=wd, 81 | lr=lr 82 | )) 83 | logger.info(f"optimizer -- lr={lr} wd={wd} len(p)={len(p)}") 84 | return optimizer_params_group 85 | 86 | 87 | def create_optimizer(args, model, filter_bias_and_bn=True): 88 | opt_lower = args.opt.lower() 89 | weight_decay = args.weight_decay 90 | # check for modules that requires different lr 91 | if hasattr(args, "different_lr") and args.different_lr.enable: 92 | diff_lr_module_names = args.different_lr.module_names 93 | diff_lr = args.different_lr.lr 94 | else: 95 | diff_lr_module_names = [] 96 | diff_lr = None 97 | 98 | no_decay = {} 99 | if hasattr(model, 'no_weight_decay'): 100 | no_decay = model.no_weight_decay() 101 | named_param_tuples = add_weight_decay( 102 | model, weight_decay, no_decay, filter_bias_and_bn) 103 | named_param_tuples = add_different_lr( 104 | named_param_tuples, diff_lr_module_names, diff_lr, args.lr) 105 | parameters = create_optimizer_params_group(named_param_tuples) 106 | 107 | if 'fused' in opt_lower: 108 | assert has_apex and torch.cuda.is_available(), 'APEX and CUDA required for fused optimizers' 109 | 110 | opt_args = dict(lr=args.lr, weight_decay=weight_decay) 111 | if hasattr(args, 'opt_eps') and args.opt_eps is not None: 112 | opt_args['eps'] = args.opt_eps 113 | if hasattr(args, 'opt_betas') and args.opt_betas is not None: 114 | opt_args['betas'] = args.opt_betas 115 | if hasattr(args, 'opt_args') and args.opt_args is not None: 116 | opt_args.update(args.opt_args) 117 | 118 | opt_split = opt_lower.split('_') 119 | opt_lower = opt_split[-1] 120 | if opt_lower == 'sgd' or opt_lower == 'nesterov': 121 | opt_args.pop('eps', None) 122 | optimizer = optim.SGD(parameters, momentum=args.momentum, nesterov=True, **opt_args) 123 | elif opt_lower == 'momentum': 124 | opt_args.pop('eps', None) 125 | optimizer = optim.SGD(parameters, momentum=args.momentum, nesterov=False, **opt_args) 126 | elif opt_lower == 'adam': 127 | optimizer = optim.Adam(parameters, **opt_args) 128 | elif opt_lower == 'adamw': 129 | optimizer = optim.AdamW(parameters, **opt_args) 130 | else: 131 | assert False and "Invalid optimizer" 132 | raise ValueError 133 | return optimizer 134 | -------------------------------------------------------------------------------- /InternVid/utils/scheduler.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/InternVid/utils/scheduler.py -------------------------------------------------------------------------------- /InternVid/viclip/__init__.py: 
-------------------------------------------------------------------------------- 1 | from .simple_tokenizer import SimpleTokenizer as _Tokenizer 2 | from .viclip import ViCLIP 3 | import torch 4 | import numpy as np 5 | import cv2 6 | import os 7 | 8 | 9 | def get_viclip(size='l', 10 | pretrain=os.path.join(os.path.dirname(os.path.abspath(__file__)), "ViClip-InternVid-10M-FLT.pth")): 11 | 12 | tokenizer = _Tokenizer() 13 | vclip = ViCLIP(tokenizer=tokenizer, size=size, pretrain=pretrain) 14 | m = {'viclip':vclip, 'tokenizer':tokenizer} 15 | 16 | return m 17 | 18 | def get_text_feat_dict(texts, clip, tokenizer, text_feat_d={}): 19 | for t in texts: 20 | feat = clip.get_text_features(t, tokenizer, text_feat_d) 21 | text_feat_d[t] = feat 22 | return text_feat_d 23 | 24 | def get_vid_feat(frames, clip): 25 | return clip.get_vid_features(frames) 26 | 27 | 28 | def _frame_from_video(video): 29 | while video.isOpened(): 30 | success, frame = video.read() 31 | if success: 32 | yield frame 33 | else: 34 | break 35 | 36 | v_mean = np.array([0.485, 0.456, 0.406]).reshape(1,1,3) 37 | v_std = np.array([0.229, 0.224, 0.225]).reshape(1,1,3) 38 | def normalize(data): 39 | return (data/255.0-v_mean)/v_std 40 | 41 | def frames2tensor(vid_list, fnum=8, target_size=(224, 224), device=torch.device('cuda')): 42 | assert(len(vid_list) >= fnum) 43 | step = len(vid_list) // fnum 44 | vid_list = vid_list[::step][:fnum] 45 | vid_list = [cv2.resize(x[:,:,::-1], target_size) for x in vid_list] 46 | vid_tube = [np.expand_dims(normalize(x), axis=(0, 1)) for x in vid_list] 47 | vid_tube = np.concatenate(vid_tube, axis=1) 48 | vid_tube = np.transpose(vid_tube, (0, 1, 4, 2, 3)) 49 | vid_tube = torch.from_numpy(vid_tube).to(device, non_blocking=True).float() 50 | return vid_tube 51 | 52 | def retrieve_text(frames, 53 | texts, 54 | models={'viclip':None, 55 | 'tokenizer':None}, 56 | topk=5, 57 | device=torch.device('cuda')): 58 | # clip, tokenizer = get_clip(name, model_cfg['size'], model_cfg['pretrained'], model_cfg['reload']) 59 | assert(type(models)==dict and models['viclip'] is not None and models['tokenizer'] is not None) 60 | clip, tokenizer = models['viclip'], models['tokenizer'] 61 | clip = clip.to(device) 62 | frames_tensor = frames2tensor(frames, device=device) 63 | vid_feat = get_vid_feat(frames_tensor, clip) 64 | 65 | text_feat_d = {} 66 | text_feat_d = get_text_feat_dict(texts, clip, tokenizer, text_feat_d) 67 | text_feats = [text_feat_d[t] for t in texts] 68 | text_feats_tensor = torch.cat(text_feats, 0) 69 | 70 | probs, idxs = clip.get_predict_label(vid_feat, text_feats_tensor, top=topk) 71 | 72 | ret_texts = [texts[i] for i in idxs.numpy()[0].tolist()] 73 | return ret_texts, probs.numpy()[0] -------------------------------------------------------------------------------- /InternVid/viclip/__pycache__/__init__.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/InternVid/viclip/__pycache__/__init__.cpython-311.pyc -------------------------------------------------------------------------------- /InternVid/viclip/__pycache__/__init__.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/InternVid/viclip/__pycache__/__init__.cpython-39.pyc -------------------------------------------------------------------------------- 
/InternVid/viclip/__pycache__/simple_tokenizer.cpython-311.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/InternVid/viclip/__pycache__/simple_tokenizer.cpython-311.pyc -------------------------------------------------------------------------------- /InternVid/viclip/__pycache__/simple_tokenizer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/InternVid/viclip/__pycache__/simple_tokenizer.cpython-39.pyc -------------------------------------------------------------------------------- /InternVid/viclip/__pycache__/viclip.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/InternVid/viclip/__pycache__/viclip.cpython-39.pyc -------------------------------------------------------------------------------- /InternVid/viclip/__pycache__/viclip_text.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/InternVid/viclip/__pycache__/viclip_text.cpython-39.pyc -------------------------------------------------------------------------------- /InternVid/viclip/__pycache__/viclip_vision.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/InternVid/viclip/__pycache__/viclip_vision.cpython-39.pyc -------------------------------------------------------------------------------- /InternVid/viclip/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/InternVid/viclip/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /InternVid/viclip/simple_tokenizer.py: -------------------------------------------------------------------------------- 1 | import gzip 2 | import html 3 | import os 4 | from functools import lru_cache 5 | 6 | import ftfy 7 | import regex as re 8 | 9 | 10 | @lru_cache() 11 | def default_bpe(): 12 | return os.path.join(os.path.dirname(os.path.abspath(__file__)), "bpe_simple_vocab_16e6.txt.gz") 13 | # @lru_cache() 14 | # def default_bpe(): 15 | # return "bpe_simple_vocab_16e6.txt.gz" 16 | 17 | 18 | @lru_cache() 19 | def bytes_to_unicode(): 20 | """ 21 | Returns list of utf-8 byte and a corresponding list of unicode strings. 22 | The reversible bpe codes work on unicode strings. 23 | This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. 24 | When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. 25 | This is a signficant percentage of your normal, say, 32K bpe vocab. 26 | To avoid that, we want lookup tables between utf-8 bytes and unicode strings. 27 | And avoids mapping to whitespace/control characters the bpe code barfs on. 
28 | """ 29 | bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) 30 | cs = bs[:] 31 | n = 0 32 | for b in range(2**8): 33 | if b not in bs: 34 | bs.append(b) 35 | cs.append(2**8+n) 36 | n += 1 37 | cs = [chr(n) for n in cs] 38 | return dict(zip(bs, cs)) 39 | 40 | 41 | def get_pairs(word): 42 | """Return set of symbol pairs in a word. 43 | Word is represented as tuple of symbols (symbols being variable-length strings). 44 | """ 45 | pairs = set() 46 | prev_char = word[0] 47 | for char in word[1:]: 48 | pairs.add((prev_char, char)) 49 | prev_char = char 50 | return pairs 51 | 52 | 53 | def basic_clean(text): 54 | text = ftfy.fix_text(text) 55 | text = html.unescape(html.unescape(text)) 56 | return text.strip() 57 | 58 | 59 | def whitespace_clean(text): 60 | text = re.sub(r'\s+', ' ', text) 61 | text = text.strip() 62 | return text 63 | 64 | 65 | class SimpleTokenizer(object): 66 | def __init__(self, bpe_path: str = default_bpe()): 67 | self.byte_encoder = bytes_to_unicode() 68 | self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} 69 | merges = gzip.open(bpe_path).read().decode("utf-8").split('\n') 70 | merges = merges[1:49152-256-2+1] 71 | merges = [tuple(merge.split()) for merge in merges] 72 | vocab = list(bytes_to_unicode().values()) 73 | vocab = vocab + [v+'' for v in vocab] 74 | for merge in merges: 75 | vocab.append(''.join(merge)) 76 | vocab.extend(['<|startoftext|>', '<|endoftext|>']) 77 | self.encoder = dict(zip(vocab, range(len(vocab)))) 78 | self.decoder = {v: k for k, v in self.encoder.items()} 79 | self.bpe_ranks = dict(zip(merges, range(len(merges)))) 80 | self.cache = {'<|startoftext|>': '<|startoftext|>', '<|endoftext|>': '<|endoftext|>'} 81 | self.pat = re.compile(r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE) 82 | 83 | def bpe(self, token): 84 | if token in self.cache: 85 | return self.cache[token] 86 | word = tuple(token[:-1]) + ( token[-1] + '',) 87 | pairs = get_pairs(word) 88 | 89 | if not pairs: 90 | return token+'' 91 | 92 | while True: 93 | bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf'))) 94 | if bigram not in self.bpe_ranks: 95 | break 96 | first, second = bigram 97 | new_word = [] 98 | i = 0 99 | while i < len(word): 100 | try: 101 | j = word.index(first, i) 102 | new_word.extend(word[i:j]) 103 | i = j 104 | except: 105 | new_word.extend(word[i:]) 106 | break 107 | 108 | if word[i] == first and i < len(word)-1 and word[i+1] == second: 109 | new_word.append(first+second) 110 | i += 2 111 | else: 112 | new_word.append(word[i]) 113 | i += 1 114 | new_word = tuple(new_word) 115 | word = new_word 116 | if len(word) == 1: 117 | break 118 | else: 119 | pairs = get_pairs(word) 120 | word = ' '.join(word) 121 | self.cache[token] = word 122 | return word 123 | 124 | def encode(self, text): 125 | bpe_tokens = [] 126 | text = whitespace_clean(basic_clean(text)).lower() 127 | for token in re.findall(self.pat, text): 128 | token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8')) 129 | bpe_tokens.extend(self.encoder[bpe_token] for bpe_token in self.bpe(token).split(' ')) 130 | return bpe_tokens 131 | 132 | def decode(self, tokens): 133 | text = ''.join([self.decoder[token] for token in tokens]) 134 | text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors="replace").replace('', ' ') 135 | return text 136 | 
-------------------------------------------------------------------------------- /LaViLa/CODE_OF_CONDUCT.md: -------------------------------------------------------------------------------- 1 | # Code of Conduct 2 | 3 | Facebook has adopted a Code of Conduct that we expect project participants to adhere to. 4 | Please read the [full text](https://code.fb.com/codeofconduct/) 5 | so that you can understand what actions will and will not be tolerated. -------------------------------------------------------------------------------- /LaViLa/CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # Contributing to LaViLa 2 | We want to make contributing to this project as easy and transparent as 3 | possible. 4 | 5 | ## Our Development Process 6 | Minor changes and improvements will be released on an ongoing basis. Larger changes (e.g., changesets implementing a new paper) will be released on a more periodic basis. 7 | 8 | ## Pull Requests 9 | We actively welcome your pull requests. 10 | 11 | 1. Fork the repo and create your branch from `main`. 12 | 2. If you've added code that should be tested, add tests. 13 | 3. If you've changed APIs, update the documentation. 14 | 4. Ensure the test suite passes. 15 | 5. Make sure your code lints. 16 | 6. If you haven't already, complete the Contributor License Agreement ("CLA"). 17 | 18 | ## Contributor License Agreement ("CLA") 19 | In order to accept your pull request, we need you to submit a CLA. You only need 20 | to do this once to work on any of Facebook's open source projects. 21 | 22 | Complete your CLA here: 23 | 24 | ## Issues 25 | We use GitHub issues to track public bugs. Please ensure your description is 26 | clear and has sufficient instructions to be able to reproduce the issue. 27 | 28 | Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe 29 | disclosure of security bugs. In those cases, please go through the process 30 | outlined on that page and do not file a public issue. 31 | 32 | ## Coding Style 33 | * 4 spaces for indentation rather than tabs 34 | * 80 character line length 35 | * PEP8 formatting following [Black](https://black.readthedocs.io/en/stable/) 36 | 37 | ## License 38 | By contributing to LaViLa, you agree that your contributions will be licensed 39 | under the LICENSE file in the root directory of this source tree. 40 | -------------------------------------------------------------------------------- /LaViLa/LICENSE: -------------------------------------------------------------------------------- 1 | 2 | MIT License 3 | 4 | Copyright (c) Meta Platforms, Inc. and affiliates. 5 | 6 | Permission is hereby granted, free of charge, to any person obtaining a copy 7 | of this software and associated documentation files (the "Software"), to deal 8 | in the Software without restriction, including without limitation the rights 9 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 10 | copies of the Software, and to permit persons to whom the Software is 11 | furnished to do so, subject to the following conditions: 12 | 13 | The above copyright notice and this permission notice shall be included in all 14 | copies or substantial portions of the Software. 15 | 16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 17 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 18 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE 19 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 20 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 21 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 | SOFTWARE. 23 | -------------------------------------------------------------------------------- /LaViLa/__pycache__/eval_narrator.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/LaViLa/__pycache__/eval_narrator.cpython-39.pyc -------------------------------------------------------------------------------- /LaViLa/clip_caption.py: -------------------------------------------------------------------------------- 1 | import decord 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | from collections import OrderedDict 5 | import time 6 | import torch 7 | import torchvision.transforms as transforms 8 | import torchvision.transforms._transforms_video as transforms_video 9 | import sys 10 | sys.path.insert(0, './') 11 | from lavila.data.video_transforms import Permute 12 | from lavila.data.datasets import get_frame_ids, video_loader_by_frames 13 | from lavila.models.models import VCLM_OPENAI_TIMESFORMER_BASE_GPT2 14 | from lavila.models.tokenizer import MyGPT2Tokenizer 15 | from base64 import b64encode 16 | import os 17 | import fnmatch 18 | import imageio 19 | import json 20 | import cv2 21 | 22 | 23 | ckpt_path = 'vclm_openai_timesformer_base_gpt2_base.pt_ego4d.jobid_319630.ep_0002.md5sum_68a71f.pth' 24 | ckpt = torch.load(ckpt_path, map_location='cpu') 25 | state_dict = OrderedDict() 26 | for k, v in ckpt['state_dict'].items(): 27 | state_dict[k.replace('module.', '')] = v 28 | 29 | # instantiate the model, and load the pre-trained weights 30 | model = VCLM_OPENAI_TIMESFORMER_BASE_GPT2( 31 | text_use_cls_token=False, 32 | project_embed_dim=256, 33 | gated_xattn=True, 34 | timesformer_gated_xattn=False, 35 | freeze_lm_vclm=False, 36 | freeze_visual_vclm=False, 37 | freeze_visual_vclm_temporal=False, 38 | num_frames=4, 39 | drop_path_rate=0. 
40 | ) 41 | 42 | model.load_state_dict(state_dict, strict=True) 43 | model.eval() 44 | tokenizer = MyGPT2Tokenizer('gpt2', add_bos=True) 45 | 46 | candidate_num = 5 47 | crop_size = 224 48 | val_transform = transforms.Compose([ 49 | Permute([3, 0, 1, 2]), 50 | transforms.Resize(crop_size), 51 | transforms.CenterCrop(crop_size), 52 | transforms_video.NormalizeVideo(mean=[108.3272985, 116.7460125, 104.09373615000001], std=[68.5005327, 66.6321579, 70.32316305]) 53 | ]) 54 | 55 | 56 | def decode_one(generated_ids, tokenizer): 57 | # get the index of 58 | if tokenizer.eos_token_id == tokenizer.bos_token_id: 59 | if tokenizer.eos_token_id in generated_ids[1:].tolist(): 60 | eos_id = generated_ids[1:].tolist().index(tokenizer.eos_token_id) + 1 61 | else: 62 | eos_id = len(generated_ids.tolist()) - 1 63 | elif tokenizer.eos_token_id in generated_ids.tolist(): 64 | eos_id = generated_ids.tolist().index(tokenizer.eos_token_id) 65 | else: 66 | eos_id = len(generated_ids.tolist()) - 1 67 | generated_text_str = tokenizer.tokenizer.decode(generated_ids[1:eos_id].tolist()) 68 | return generated_text_str 69 | 70 | 71 | def create_caption(frames): 72 | with torch.no_grad(): 73 | image_features = model.encode_image(frames) 74 | generated_text_ids, ppls = model.generate( 75 | image_features, 76 | tokenizer, 77 | target=None, # free-form generation 78 | max_text_length=77, 79 | top_k=None, 80 | top_p=0.95, # nucleus sampling 81 | num_return_sequences=candidate_num, # number of candidates: 10 82 | temperature=0.9, 83 | early_stopping=True, 84 | ) 85 | longest_sentence = "" 86 | for i in range(candidate_num): 87 | generated_text_str = decode_one(generated_text_ids[i], tokenizer) 88 | if len(generated_text_str) > len(longest_sentence): 89 | longest_sentence = generated_text_str 90 | return longest_sentence 91 | 92 | 93 | def captioning(frame_path, fps, caption_seconds=2, frames_per_caption=4): 94 | frame_interval = int(fps*caption_seconds/frames_per_caption) 95 | sequential_image_list = [] 96 | sequential_caption_list = dict() 97 | 98 | for root, dirs, files in os.walk(frame_path): 99 | for file in files: 100 | if fnmatch.fnmatch(file, '*.jpg'): 101 | sequential_image_list.append(file) 102 | 103 | sequential_image_list.sort() # ordered frame list 104 | 105 | start_frame = int(sequential_image_list[0].split('.')[0].split('_')[-1]) 106 | end_frame = int(sequential_image_list[-1].split('.')[0].split('_')[-1]) 107 | 108 | print(start_frame) 109 | print(end_frame) 110 | total_frames = end_frame-start_frame+1 111 | 112 | total_captions = total_frames//(fps*caption_seconds) 113 | IMAGE_NAME_PATTERN = "video_frame_{:07d}.jpg" 114 | 115 | 116 | for i in range(total_captions): 117 | print(i) 118 | caption_start_frame = start_frame + i * fps * caption_seconds 119 | caption_end_frame = start_frame + (i+1) * fps * caption_seconds 120 | input_frames = [] 121 | for j in range(frames_per_caption): 122 | frame_idx = caption_start_frame + j* frame_interval 123 | print('frame: ', frame_idx) 124 | frame_name = IMAGE_NAME_PATTERN.format(frame_idx) 125 | image_file = os.path.join(frame_path, frame_name) 126 | image = imageio.imread(image_file) 127 | input_frames.append(image) 128 | input_frames = torch.from_numpy(np.stack(input_frames, axis=0)).float() #[4, w, h, 3] 129 | #print("input_frames: ", input_frames) 130 | #print("input_frames.size: ", input_frames.size()) 131 | frames = val_transform(input_frames) 132 | frames = frames.unsqueeze(0) 133 | caption = create_caption(frames) 134 | time_stamps = 
"{}-{}".format(str(caption_start_frame), str(caption_end_frame)) 135 | sequential_caption_list[time_stamps] = caption 136 | 137 | with open(os.path.join(frame_path, 'captions.json'), 'w') as f: 138 | json.dump(sequential_caption_list, f) 139 | 140 | 141 | 142 | def captioning(frame_path, fps, caption_seconds=2, frames_per_caption=4): 143 | frame_interval = int(fps*caption_seconds/frames_per_caption) 144 | sequential_image_list = [] 145 | sequential_caption_list = dict() 146 | 147 | for root, dirs, files in os.walk(frame_path): 148 | for file in files: 149 | if fnmatch.fnmatch(file, '*.jpg'): 150 | sequential_image_list.append(file) 151 | 152 | sequential_image_list.sort() # ordered frame list 153 | 154 | start_frame = int(sequential_image_list[0].split('.')[0].split('_')[-1]) 155 | end_frame = int(sequential_image_list[-1].split('.')[0].split('_')[-1]) 156 | 157 | print(start_frame) 158 | print(end_frame) 159 | total_frames = end_frame-start_frame+1 160 | 161 | total_captions = total_frames//(fps*caption_seconds) 162 | IMAGE_NAME_PATTERN = "video_frame_{:07d}.jpg" 163 | 164 | 165 | for i in range(total_captions): 166 | print(i) 167 | caption_start_frame = start_frame + i * fps * caption_seconds 168 | caption_end_frame = start_frame + (i+1) * fps * caption_seconds 169 | input_frames = [] 170 | for j in range(frames_per_caption): 171 | frame_idx = caption_start_frame + j* frame_interval 172 | print('frame: ', frame_idx) 173 | frame_name = IMAGE_NAME_PATTERN.format(frame_idx) 174 | image_file = os.path.join(frame_path, frame_name) 175 | image = imageio.imread(image_file) 176 | input_frames.append(image) 177 | input_frames = torch.from_numpy(np.stack(input_frames, axis=0)).float() #[4, w, h, 3] 178 | #print("input_frames: ", input_frames) 179 | #print("input_frames.size: ", input_frames.size()) 180 | frames = val_transform(input_frames) 181 | frames = frames.unsqueeze(0) 182 | caption = create_caption(frames) 183 | time_stamps = "{}-{}".format(str(caption_start_frame), str(caption_end_frame)) 184 | sequential_caption_list[time_stamps] = caption 185 | 186 | with open(os.path.join(frame_path, 'captions.json'), 'w') as f: 187 | json.dump(sequential_caption_list, f) -------------------------------------------------------------------------------- /LaViLa/datasets/README.md: -------------------------------------------------------------------------------- 1 | # Preparing datasets for LAVILA 2 | 3 | Please download the (selected) datasets from the official websites and place or sim-link them under `$LAVILA_ROOT/datasets/`. 4 | 5 | ```bash 6 | $LAVILA_ROOT/datasets/ 7 | CharadesEgo/ 8 | EGTEA/ 9 | EK100/ 10 | Ego4D/ 11 | ``` 12 | 13 | ## Ego4D 14 | 1. Download [Ego4D videos](https://ego4d-data.org/docs/start-here/#download-data) (license is required). 15 | 16 | 2. Preprocess 17 | 18 | We cut each video into 5-minute-long chunks and resize the smaller size to be 288 pixels for faster IO. Please refer to [this script](scripts/crop_and_resize_ego4d.sh) for more details. 19 | 20 | 3. Download annotations 21 | 22 | a. Download [egomcq.json](https://drive.google.com/file/d/1-5iRYf4BCHmj4MYQYFRMY4bhsWJUN3rW/view) to `$LAVILA_ROOT/datasets/Ego4D` (if you want to evaluate EgoMCQ). 23 | 24 | b. Download [metadata for train split](https://dl.fbaipublicfiles.com/lavila/metadata/ego4d/ego4d_train.pkl) and [val split](https://dl.fbaipublicfiles.com/lavila/metadata/ego4d/ego4d_val.pkl) to `$LAVILA_ROOT/datasets/Ego4D` ((if you want to train LAVILA from scratch). 
25 | 26 | The fold should look like this: 27 | ```bash 28 | $LAVILA_ROOT/datasets/ 29 | Ego4D/ 30 | ego4d_train.pkl 31 | ego4d_val.pkl 32 | egomcq.json 33 | video_288px/ 34 | 000786a7-3f9d-4fe6-bfb3-045b368f7d44.mp4/ 35 | 0.mp4 36 | 300.mp4 37 | 000a3525-6c98-4650-aaab-be7d2c7b9402.mp4/ 38 | 0.mp4 39 | ... 40 | ``` 41 | 42 | 43 | ## EPIC-Kitchens-100 (EK-100) 44 | 45 | 1. Download annotations 46 | 47 | ```bash 48 | # Assume that you are under `datasets/EK100/` 49 | git clone https://github.com/epic-kitchens/epic-kitchens-100-annotations 50 | ``` 51 | 52 | 2. Download videos. 53 | 54 | a. For raw videos, please download them from [https://epic-kitchens.github.io/](https://epic-kitchens.github.io/). 55 | 56 | b. (Recommended) The raw videos are huge (~1 TB). As an alternative, please check out a [resized version](https://utexas.box.com/s/l7ij81ie5q07p9fdg0vtejihq61liln9). 57 | 58 | 3. (For EK-100 MIR) 59 | 60 | a. Generate the relevancy matrix of train/val splits using [the official code](https://github.com/mwray/Joint-Part-of-Speech-Embeddings). 61 | 62 | b. (Recommended) The generated result has some randomness. Therefore, we also provide the [replica of train split](https://dl.fbaipublicfiles.com/lavila/metadata/EK100/caption_relevancy_EPIC_100_retrieval_train.pkl) and [val split](https://dl.fbaipublicfiles.com/lavila/metadata/EK100/caption_relevancy_EPIC_100_retrieval_test.pkl). Please put them to the folder `$LAVILA_ROOT/datasets/EK100/epic-kitchens-100-annotations/retrieval_annotations/relevancy/`. 63 | 64 | 65 | The folder should look like this: 66 | ```bash 67 | $LAVILA_ROOT/datasets/ 68 | EK100/ 69 | epic-kitchens-100-annotations/ 70 | EPIC_100_train.csv 71 | EPIC_100_validation.csv 72 | ... 73 | retrieval_annotations/relevancy/ # this appears if you do 3. 74 | caption_relevancy_EPIC_100_retrieval_train.pkl 75 | caption_relevancy_EPIC_100_retrieval_test.pkl 76 | video_ht256px/ 77 | P01/ 78 | P01_01.MP4 79 | P01_02.MP4 80 | ... 81 | P01_19.MP4 82 | P02/ 83 | P02_01.MP4 84 | P02_02.MP4 85 | ... 86 | P02_15.MP4 87 | ... 88 | ``` 89 | 90 | ## CharadesEgo 91 | 92 | 1. Download annotations at [https://prior.allenai.org/projects/charades-ego](https://prior.allenai.org/projects/charades-ego). 93 | ```bash 94 | ### Annotations 95 | # Assume that you are under `datasets/CharadesEgo/` 96 | wget https://ai2-public-datasets.s3-us-west-2.amazonaws.com/charades/CharadesEgo.zip 97 | unzip CharadesEgo.zip && rm CharadesEgo.zip 98 | ``` 99 | 100 | 2. Download data (~11GB) at [https://prior.allenai.org/projects/charades-ego](https://prior.allenai.org/projects/charades-ego). 101 | ```bash 102 | ### Data 103 | wget https://ai2-public-datasets.s3-us-west-2.amazonaws.com/charades/CharadesEgo_v1_480.tar 104 | tar -xvf CharadesEgo_v1_480.tar # Or specify an external path using `-C` and sim-link it to here 105 | rm CharadesEgo_v1_480.tar 106 | ``` 107 | 108 | 3. (For fine-tuning CharadesEgo) Download two additional metadata files: [clip-level metadata (train)](https://dl.fbaipublicfiles.com/lavila/metadata/CharadesEgo/metadata_filtered_train.pkl) and [clip-level metadata (val)](https://dl.fbaipublicfiles.com/lavila/metadata/CharadesEgo/metadata_filtered_val.pkl). Put them to the folder `$LAVILA_ROOT/datasets/CharadesEgo/CharadesEgo/`. 109 | 110 | The folder should look like this: 111 | ```bash 112 | $LAVILA_ROOT/datasets/ 113 | CharadesEgo/ 114 | CharadesEgo/ 115 | CharadesEgo_v1_train_only1st.csv 116 | CharadesEgo_v1_test_only1st.csv 117 | ... 118 | metadata_filtered_train.pkl # this appears if you do 3. 
119 | metadata_filtered_val.pkl # this appears if you do 3. 120 | CharadesEgo_v1_480/ 121 | 005BU.mp4 122 | 005BUEGO.mp4 123 | ... 124 | ``` 125 | 126 | 127 | ## EGTEA 128 | 129 | 1. Visit [https://cbs.ic.gatech.edu/fpv/](https://cbs.ic.gatech.edu/fpv/). 130 | 131 | 2. Download `TRIMMED_ACTION_CLIPS` (~20GB) and `ACTION_ANNOTATIONS` and untar to the current folder `$LAVILA_ROOT/datasets/EGTEA`. 132 | 133 | ```bash 134 | unzip action_annotation.zip -d EGTEA/ && rm action_annotation.zip 135 | ``` 136 | 137 | The folder should look like this: 138 | ```bash 139 | $LAVILA_ROOT/datasets/ 140 | EGTEA/ 141 | train_split1.txt 142 | test_split1.txt 143 | cropped_clips/ 144 | OP01-R01-PastaSalad/ 145 | OP01-R01-PastaSalad-1002316-1004005-F024051-F024101.mp4 146 | OP01-R01-PastaSalad-1004110-1021110-F024057-F024548.mp4 147 | OP01-R01-PastaSalad-1022590-1024050-F024539-F024581.mp4 148 | ... 149 | OP01-R02-TurkeySandwich/ 150 | OP01-R02-TurkeySandwich-102320-105110-F002449-F002529.mp4 151 | OP01-R02-TurkeySandwich-105440-106460-F002528-F002558.mp4 152 | OP01-R02-TurkeySandwich-107332-133184-F002513-F003259.mp4 153 | ... 154 | ... 155 | ``` 156 | -------------------------------------------------------------------------------- /LaViLa/demo.py: -------------------------------------------------------------------------------- 1 | import decord 2 | import matplotlib.pyplot as plt 3 | import numpy as np 4 | from collections import OrderedDict 5 | import time 6 | import torch 7 | import torchvision.transforms as transforms 8 | import torchvision.transforms._transforms_video as transforms_video 9 | 10 | import sys 11 | sys.path.insert(0, './') 12 | from lavila.data.video_transforms import Permute 13 | from lavila.data.datasets import get_frame_ids, video_loader_by_frames 14 | from lavila.models.models import VCLM_OPENAI_TIMESFORMER_BASE_GPT2 15 | from lavila.models.tokenizer import MyGPT2Tokenizer 16 | 17 | 18 | video_path = 'assets/3c0dffd0-e38e-4643-bc48-d513943dc20b_012_014.mp4' 19 | 20 | 21 | from base64 import b64encode 22 | 23 | 24 | # The video is represented by `num_seg=4` frames 25 | vr = decord.VideoReader(video_path) 26 | print("total length:", len(vr)) 27 | num_seg = 4 28 | frame_ids = get_frame_ids(0, len(vr), num_segments=num_seg, jitter=False) 29 | frames = video_loader_by_frames('./', video_path, frame_ids) 30 | print(frames) 31 | print('frames_size:', frames.size()) #[num_seg, w, h, 3] 32 | 33 | 34 | # display the subsampled frames 35 | # plt.figure(figsize=(16, 40)) 36 | # for i in range(num_seg): 37 | # plt.subplot(1, num_seg, i + 1) 38 | # plt.imshow(frames[i].cpu().numpy().astype(int)) 39 | # plt.axis('off') 40 | # plt.show() 41 | 42 | 43 | ckpt_path = 'vclm_openai_timesformer_base_gpt2_base.pt_ego4d.jobid_319630.ep_0002.md5sum_68a71f.pth' 44 | ckpt = torch.load(ckpt_path, map_location='cpu') 45 | state_dict = OrderedDict() 46 | for k, v in ckpt['state_dict'].items(): 47 | state_dict[k.replace('module.', '')] = v 48 | 49 | # instantiate the model, and load the pre-trained weights 50 | model = VCLM_OPENAI_TIMESFORMER_BASE_GPT2( 51 | text_use_cls_token=False, 52 | project_embed_dim=256, 53 | gated_xattn=True, 54 | timesformer_gated_xattn=False, 55 | freeze_lm_vclm=False, 56 | freeze_visual_vclm=False, 57 | freeze_visual_vclm_temporal=False, 58 | num_frames=4, 59 | drop_path_rate=0. 
60 | ) 61 | 62 | model.load_state_dict(state_dict, strict=True) 63 | 64 | num_params = sum(p.numel() for p in model.parameters()) 65 | print(f'model params: {num_params}') 66 | model.eval() 67 | #model.cuda() 68 | print('loaded into GPU') 69 | # transforms on input frames 70 | crop_size = 224 71 | val_transform = transforms.Compose([ 72 | Permute([3, 0, 1, 2]), 73 | transforms.Resize(crop_size), 74 | transforms.CenterCrop(crop_size), 75 | transforms_video.NormalizeVideo(mean=[108.3272985, 116.7460125, 104.09373615000001], std=[68.5005327, 66.6321579, 70.32316305]) 76 | ]) 77 | frames = val_transform(frames) 78 | print("frames shape before squeeze: ", frames.size()) #[3, 4, 224, 224] 79 | frames = frames.unsqueeze(0) # fake a batch dimension 80 | print("frames shape: ", frames.size()) #[1, 3, 4, 224, 224] 81 | 82 | tokenizer = MyGPT2Tokenizer('gpt2', add_bos=True) 83 | 84 | candidate_num = 5 85 | 86 | def decode_one(generated_ids, tokenizer): 87 | # get the index of 88 | if tokenizer.eos_token_id == tokenizer.bos_token_id: 89 | if tokenizer.eos_token_id in generated_ids[1:].tolist(): 90 | eos_id = generated_ids[1:].tolist().index(tokenizer.eos_token_id) + 1 91 | else: 92 | eos_id = len(generated_ids.tolist()) - 1 93 | elif tokenizer.eos_token_id in generated_ids.tolist(): 94 | eos_id = generated_ids.tolist().index(tokenizer.eos_token_id) 95 | else: 96 | eos_id = len(generated_ids.tolist()) - 1 97 | generated_text_str = tokenizer.tokenizer.decode(generated_ids[1:eos_id].tolist()) 98 | return generated_text_str 99 | 100 | 101 | 102 | start_time = time.time() 103 | for i in range(100): 104 | with torch.no_grad(): 105 | image_features = model.encode_image(frames) 106 | generated_text_ids, ppls = model.generate( 107 | image_features, 108 | tokenizer, 109 | target=None, # free-form generation 110 | max_text_length=77, 111 | top_k=None, 112 | top_p=0.95, # nucleus sampling 113 | num_return_sequences=candidate_num, # number of candidates: 10 114 | temperature=0.7, 115 | early_stopping=True, 116 | ) 117 | for i in range(candidate_num): 118 | generated_text_str = decode_one(generated_text_ids[i], tokenizer) 119 | print('{}: {}'.format(i, generated_text_str)) 120 | end_time = time.time() 121 | print(end_time-start_time) -------------------------------------------------------------------------------- /LaViLa/demo_narrator.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
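# Overview: this demo samples num_seg=4 frames from the input clip, downloads the
# TimeSformer-Large/336px + GPT-2 XL narrator checkpoint into modelzoo/ on first
# run, and prints 10 candidate narrations generated with nucleus sampling
# (top_p=0.95, temperature=0.7).
# Illustrative invocation (the clip path is a placeholder):
#   python demo_narrator.py --video-path assets/<your_clip>.mp4 --cuda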
6 | 7 | 8 | import argparse 9 | import os 10 | import urllib.request 11 | from collections import OrderedDict 12 | 13 | import torch 14 | import torchvision.transforms as transforms 15 | import torchvision.transforms._transforms_video as transforms_video 16 | import decord 17 | 18 | from lavila.data.video_transforms import Permute 19 | from lavila.data.datasets import get_frame_ids, video_loader_by_frames 20 | from lavila.models.models import VCLM_OPENAI_TIMESFORMER_LARGE_336PX_GPT2_XL 21 | from lavila.models.tokenizer import MyGPT2Tokenizer 22 | from eval_narrator import decode_one 23 | import cv2 24 | 25 | def main(args): 26 | 27 | vr = decord.VideoReader(args.video_path) 28 | num_seg = 4 29 | frame_ids = get_frame_ids(0, len(vr), num_segments=num_seg, jitter=False) 30 | print('frame_ids: ', frame_ids) 31 | frames = video_loader_by_frames('./', args.video_path, frame_ids) 32 | test_frame = frames[0].numpy() 33 | print(test_frame.shape) 34 | cv2.imwrite("test_frame.jpg", cv2.cvtColor(test_frame, cv2.COLOR_BGR2RGB)) 35 | ckpt_name = 'vclm_openai_timesformer_large_336px_gpt2_xl.pt_ego4d.jobid_246897.ep_0003.md5sum_443263.pth' 36 | ckpt_path = os.path.join('modelzoo/', ckpt_name) 37 | os.makedirs('modelzoo/', exist_ok=True) 38 | if not os.path.exists(ckpt_path): 39 | print('downloading model to {}'.format(ckpt_path)) 40 | urllib.request.urlretrieve('https://dl.fbaipublicfiles.com/lavila/checkpoints/narrator/{}'.format(ckpt_name), ckpt_path) 41 | ckpt = torch.load(ckpt_path, map_location='cpu') 42 | state_dict = OrderedDict() 43 | for k, v in ckpt['state_dict'].items(): 44 | state_dict[k.replace('module.', '')] = v 45 | 46 | # instantiate the model, and load the pre-trained weights 47 | model = VCLM_OPENAI_TIMESFORMER_LARGE_336PX_GPT2_XL( 48 | text_use_cls_token=False, 49 | project_embed_dim=256, 50 | gated_xattn=True, 51 | timesformer_gated_xattn=False, 52 | freeze_lm_vclm=False, # we use model.eval() anyway 53 | freeze_visual_vclm=False, # we use model.eval() anyway 54 | num_frames=4, 55 | drop_path_rate=0. 
56 | ) 57 | model.load_state_dict(state_dict, strict=True) 58 | if args.cuda: 59 | model.cuda() 60 | model.eval() 61 | 62 | # transforms on input frames 63 | crop_size = 336 64 | val_transform = transforms.Compose([ 65 | Permute([3, 0, 1, 2]), 66 | transforms.Resize(crop_size), 67 | transforms.CenterCrop(crop_size), 68 | transforms_video.NormalizeVideo(mean=[108.3272985, 116.7460125, 104.09373615000001], std=[68.5005327, 66.6321579, 70.32316305]) 69 | ]) 70 | frames = val_transform(frames) 71 | frames = frames.unsqueeze(0) # fake a batch dimension 72 | 73 | tokenizer = MyGPT2Tokenizer('gpt2-xl', add_bos=True) 74 | with torch.no_grad(): 75 | if args.cuda: 76 | frames = frames.cuda(non_blocking=True) 77 | image_features = model.encode_image(frames) 78 | generated_text_ids, ppls = model.generate( 79 | image_features, 80 | tokenizer, 81 | target=None, # free-form generation 82 | max_text_length=77, 83 | top_k=None, 84 | top_p=0.95, # nucleus sampling 85 | num_return_sequences=10, # number of candidates: 10 86 | temperature=0.7, 87 | early_stopping=True, 88 | ) 89 | 90 | for i in range(10): 91 | generated_text_str = decode_one(generated_text_ids[i], tokenizer) 92 | print('{}: {}'.format(i, generated_text_str)) 93 | 94 | 95 | if __name__ == '__main__': 96 | parser = argparse.ArgumentParser('lavila narrator demo') 97 | parser.add_argument('--cuda', default=True, action='store_true', help='use cuda') 98 | parser.add_argument('--video-path', default='assets/3c0dffd0-e38e-4643-bc48-d513943dc20b_012_014.mp4', type=str, help='video path') 99 | #parser.add_argument('--video-path', default='/home/yue/data/mount/fillipo/Datasets/Ego4d/v1/full_scale/0a3dc289-557f-4121-9bc7-521a2b5d3bb8.mp4', type=str, help='video path') 100 | args = parser.parse_args() 101 | main(args) 102 | -------------------------------------------------------------------------------- /LaViLa/demo_narrator_3rd_person.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | 8 | import argparse 9 | import os 10 | import urllib.request 11 | from collections import OrderedDict 12 | 13 | import torch 14 | import torchvision.transforms as transforms 15 | import torchvision.transforms._transforms_video as transforms_video 16 | import decord 17 | 18 | from lavila.data.video_transforms import Permute 19 | from lavila.data.datasets import get_frame_ids, video_loader_by_frames 20 | from lavila.models.models import VCLM_OPENAI_TIMESFORMER_LARGE_GPT2_XL 21 | from lavila.models.tokenizer import MyGPT2Tokenizer 22 | from eval_narrator import decode_one 23 | 24 | 25 | def main(args): 26 | 27 | vr = decord.VideoReader(args.video_path) 28 | num_seg = 4 29 | frame_ids = get_frame_ids(0, len(vr), num_segments=num_seg, jitter=False) 30 | frames = video_loader_by_frames('./', args.video_path, frame_ids) 31 | 32 | ckpt_name = 'vclm_openai_timesformer_large_gpt2_xl.pt_htm.jobid_341080.ep_0001.pth' 33 | ckpt_path = os.path.join('modelzoo/', ckpt_name) 34 | os.makedirs('modelzoo/', exist_ok=True) 35 | if not os.path.exists(ckpt_path): 36 | print('downloading model to {}'.format(ckpt_path)) 37 | urllib.request.urlretrieve('https://dl.fbaipublicfiles.com/lavila/checkpoints/narrator/htm_aa/{}'.format(ckpt_name), ckpt_path) 38 | ckpt = torch.load(ckpt_path, map_location='cpu') 39 | state_dict = OrderedDict() 40 | for k, v in ckpt['state_dict'].items(): 41 | state_dict[k.replace('module.', '')] = v 42 | 43 | # instantiate the model, and load the pre-trained weights 44 | model = VCLM_OPENAI_TIMESFORMER_LARGE_GPT2_XL( 45 | text_use_cls_token=False, 46 | project_embed_dim=256, 47 | gated_xattn=True, 48 | timesformer_gated_xattn=False, 49 | freeze_lm_vclm=False, # we use model.eval() anyway 50 | freeze_visual_vclm=False, # we use model.eval() anyway 51 | freeze_visual_vclm_temporal=False, 52 | num_frames=4, 53 | drop_path_rate=0. 
54 | ) 55 | model.load_state_dict(state_dict, strict=True) 56 | if args.cuda: 57 | model.cuda() 58 | model.eval() 59 | 60 | # transforms on input frames 61 | crop_size = 224 62 | val_transform = transforms.Compose([ 63 | Permute([3, 0, 1, 2]), 64 | transforms.Resize(crop_size), 65 | transforms.CenterCrop(crop_size), 66 | transforms_video.NormalizeVideo(mean=[108.3272985, 116.7460125, 104.09373615000001], std=[68.5005327, 66.6321579, 70.32316305]) 67 | ]) 68 | frames = val_transform(frames) 69 | frames = frames.unsqueeze(0) # fake a batch dimension 70 | 71 | tokenizer = MyGPT2Tokenizer('gpt2-xl', add_bos=True) 72 | with torch.no_grad(): 73 | if args.cuda: 74 | frames = frames.cuda(non_blocking=True) 75 | image_features = model.encode_image(frames) 76 | generated_text_ids, ppls = model.generate( 77 | image_features, 78 | tokenizer, 79 | target=None, # free-form generation 80 | max_text_length=77, 81 | top_k=None, 82 | top_p=0.95, # nucleus sampling 83 | num_return_sequences=10, # number of candidates: 10 84 | temperature=0.7, 85 | early_stopping=True, 86 | ) 87 | 88 | for i in range(10): 89 | generated_text_str = decode_one(generated_text_ids[i], tokenizer) 90 | print('{}: {}'.format(i, generated_text_str)) 91 | 92 | 93 | if __name__ == '__main__': 94 | parser = argparse.ArgumentParser('lavila narrator demo') 95 | parser.add_argument('--cuda', action='store_true', help='use cuda') 96 | parser.add_argument('--video-path', type=str, 97 | default='assets/mixkit-pastry-chef-cutting-a-loaf-into-slices-43015-medium.mp4') 98 | args = parser.parse_args() 99 | main(args) 100 | -------------------------------------------------------------------------------- /LaViLa/docs/INSTALL.md: -------------------------------------------------------------------------------- 1 | # Installation 2 | 3 | ## Requirements 4 | 5 | 6 | ## Example conda environment setup 7 | 8 | ```bash 9 | conda create --name lavila python=3.8 -y 10 | conda activate lavila 11 | pip install -r requirements.txt 12 | ``` 13 | 14 | ## datasets 15 | If you want to train/evaluate on the datasets, please see [datasets/README.md](../datasets/README.md) to see how we prepare datasets for this project. 16 | -------------------------------------------------------------------------------- /LaViLa/docs/PRETRAIN.md: -------------------------------------------------------------------------------- 1 | # LAVILA Pretraining 2 | 3 | In this doc, we provide a step-by-step guide (with commands) to train LaViLa. 4 | Note that we recommend running the following job with four 8x V100 (32GB) nodes (or eight nodes for the larger backbone) using [submitit](https://github.com/facebookincubator/submitit). 5 | See how to install submitit at [here](./MODEL_ZOO.md#multi-node-training). 6 | 7 | 8 | ## Pre-training Dual-Encoder Baseline 9 | 10 | We first pre-train a dual-encoder baseline with human annotations on Ego4d clips. 11 | The goal is (1) to establish a comparable baseline for LAVILA, and (2) provide a video encoder for narrator (see below). 12 | We use a default batch size of 32 per gpu so that the total batch size for InfoNCE loss is `32*8*4=1024`. 13 | 14 |
Train a baseline dual-encoder (with TSF-B) 15 | 16 | ```bash 17 | python run_with_submitit_pretrain.py --model CLIP_OPENAI_TIMESFORMER_BASE \ 18 | --norm-embed --freeze-temperature \ 19 | --fix-lr --contrastive-use-vissl \ 20 | --nodes 4 --use_volta32 21 | ``` 22 |
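For reference, the InfoNCE objective mentioned above is the standard CLIP-style symmetric contrastive loss over video-text pairs (see `lavila/models/loss.py` for the implementation used here). The following is only a minimal sketch of that objective; the 1024 x 256 shapes are illustrative, not the exact training configuration:

```python
import torch
import torch.nn.functional as F


def clip_style_infonce(video_emb, text_emb, temperature=0.07):
    """Symmetric InfoNCE: matching video/text pairs sit on the diagonal."""
    video_emb = F.normalize(video_emb, dim=-1)
    text_emb = F.normalize(text_emb, dim=-1)
    logits = video_emb @ text_emb.t() / temperature  # (B, B) similarity matrix
    targets = torch.arange(video_emb.size(0))        # positive index = own row/column
    return 0.5 * (F.cross_entropy(logits, targets) + F.cross_entropy(logits.t(), targets))


# e.g. a global batch of 1024 clip-caption pairs embedded in a 256-d joint space
loss = clip_style_infonce(torch.randn(1024, 256), torch.randn(1024, 256))
```

Every other clip/caption in the batch acts as a negative, which is why the global batch size matters.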
23 | 24 | To fit a High-Resolution TimeSformer-Large with a sufficient batch size, we use [DistilBERT](https://huggingface.co/docs/transformers/model_doc/distilbert), a memory-efficient text encoder, instead of the original CLIP text encoder. Additionally, we apply [gradient checkpointing](https://pytorch.org/docs/stable/checkpoint.html) and the [Zero Redundancy Optimizer (ZeRO)](https://arxiv.org/abs/1910.02054). 25 | 26 |
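Neither technique is specific to LaViLa. The sketch below only illustrates what the `--use-checkpoint` and `--use-zero` flags correspond to in plain PyTorch (a toy module and a single-process `gloo` group so the snippet runs standalone), not how the training scripts wire them up:

```python
import os
import torch
import torch.distributed as dist
from torch.distributed.optim import ZeroRedundancyOptimizer
from torch.utils.checkpoint import checkpoint

# a single-process group, only so that ZeroRedundancyOptimizer can be constructed
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
dist.init_process_group("gloo", rank=0, world_size=1)

block = torch.nn.Sequential(torch.nn.Linear(512, 2048), torch.nn.GELU(), torch.nn.Linear(2048, 512))
x = torch.randn(8, 512, requires_grad=True)

# gradient checkpointing: activations inside `block` are recomputed during backward
y = checkpoint(block, x)

# ZeRO: optimizer state is sharded across ranks instead of being replicated on every GPU
optimizer = ZeroRedundancyOptimizer(block.parameters(), optimizer_class=torch.optim.AdamW, lr=1e-4)
y.sum().backward()
optimizer.step()
dist.destroy_process_group()
```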
Train a baseline dual-encoder (with TSF-L@HR) 27 | 28 | ```bash 29 | python run_with_submitit_pretrain.py --model CLIP_OPENAI_TIMESFORMER_LARGE_336PX_DISTILBERT_BASE \ 30 | --batch-size 8 \ 31 | --use-checkpoint --use-zero \ 32 | --norm-embed --freeze-temperature \ 33 | --fix-lr --contrastive-use-vissl \ 34 | --nodes 8 --use_volta32 35 | ``` 36 |
37 | 38 | ## Training and Evaluating Narrator 39 | 40 | The narrator is a *visually conditioned* large language model (VCLM), which comprises a pre-trained video encoder (obtained above), a text decoder (GPT-2 family), and a few gated cross-attention modules that attend to visual information while captioning. Both the video encoder and the text decoder are kept frozen; only the cross-attention modules are learnable. 41 | 42 | Note that we turn off PyTorch's automatic mixed precision (AMP) when training the narrator, since we observe that training is unstable with AMP on. 43 | 44 | Also note that `$PATH` can be found in the `Vis. Encoder` column of [MODEL_ZOO.md#Narrator](./MODEL_ZOO.md#narrator). If you are using your own checkpoint (e.g. one pre-trained in the previous step), please make sure that the following keys have been dropped from the checkpoint: `epoch`, `optimizer`, and `scaler`. 45 | 46 |
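The gated cross-attention modules follow the familiar tanh-gating pattern (the repo's own versions live in `lavila/models/gpt2_gated.py` and `lavila/models/coca.py`): the cross-attended visual signal enters through a learnable gate that can start at zero, so the decoder initially behaves like the frozen GPT-2. A minimal sketch of the general idea, with illustrative dimensions rather than the actual model configuration:

```python
import torch
import torch.nn as nn


class GatedCrossAttentionBlock(nn.Module):
    """LM hidden states attend to visual tokens; a tanh gate (init 0) scales the result."""

    def __init__(self, dim, num_heads=8):
        super().__init__()
        self.attn = nn.MultiheadAttention(dim, num_heads, batch_first=True)
        self.gate = nn.Parameter(torch.zeros(1))  # tanh(0) = 0 -> identity mapping at init

    def forward(self, text_tokens, visual_tokens):
        attended, _ = self.attn(text_tokens, visual_tokens, visual_tokens)
        return text_tokens + torch.tanh(self.gate) * attended


block = GatedCrossAttentionBlock(dim=768)
txt = torch.randn(2, 16, 768)    # (batch, text_len, dim)
vis = torch.randn(2, 196, 768)   # (batch, num_visual_tokens, dim)
out = block(txt, vis)            # same shape as txt
```

During narrator training only blocks like this receive gradients; the encoder and decoder weights stay frozen.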
Train a baseline narrator (TSF-B as visual encoder and GPT-2 base as textual decoder) 47 | 48 | ```bash 49 | python run_with_submitit_pretrain.py \ 50 | --model VCLM_OPENAI_TIMESFORMER_BASE_GPT2 \ 51 | --gated-xattn --freeze-lm-vclm --freeze-visual-vclm --freeze-visual-vclm-temporal \ 52 | --fix-lr --batch-size 8 --clip-grad-value 1.0 --eval-freq 1 --disable-amp \ 53 | --nodes 4 --use_volta32 --resume $PATH # Eg. $PATH can be "modelzoo/clip_openai_timesformer_base.baseline.ep_0003.pth" 54 | ``` 55 | 56 |
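If you resume from a dual-encoder checkpoint you trained yourself, the trainer-state keys mentioned above can be stripped with a few lines; the path below is just the example from the command above, so substitute your own:

```python
import torch

ckpt_path = 'modelzoo/clip_openai_timesformer_base.baseline.ep_0003.pth'
ckpt = torch.load(ckpt_path, map_location='cpu')
for key in ('epoch', 'optimizer', 'scaler'):
    ckpt.pop(key, None)  # drop trainer state; keep only the model weights
torch.save(ckpt, ckpt_path.replace('.pth', '.weights_only.pth'))
```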
57 | 58 |
Train a strong narrator (TSF-L@HR as visual encoder and GPT-2 XL as textual decoder) 59 | 60 | ```bash 61 | python run_with_submitit_pretrain.py \ 62 | --model VCLM_OPENAI_TIMESFORMER_LARGE_336PX_GPT2_XL \ 63 | --gated-xattn --freeze-lm-vclm --freeze-visual-vclm --freeze-visual-vclm-temporal --use-checkpoint \ 64 | --fix-lr --batch-size 8 --clip-grad-value 1.0 --eval-freq 1 --disable-amp \ 65 | --nodes 4 --use_volta32 --resume $PATH # Eg. $PATH can be "modelzoo/clip_openai_timesformer_large_336px_distilbert_base.baseline.ep_0003.pth" 66 | ``` 67 |
68 | 69 |
Evaluate the narrator on Ego4D val split 70 | 71 | ```bash 72 | torchrun --nproc_per_node=1 eval_narrator.py \ 73 | --caption-top-p 0.95 --caption-temperature 0.7 \ 74 | --resume $VCLM_CHECKPOINT \ 75 | --eval-freq 10000 # evaluate on a 1/10000 subset of the Ego4D val split for fast evaluation 76 | ``` 77 | This will output some common NLG metrics, such as BLEU-x, METEOR, ROUGE_L, and CIDEr (using the human narrations as ground truth). 78 |
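The same metrics can also be computed outside of `eval_narrator.py` with the `nlg-eval` package pinned in `requirements.txt`. A minimal sketch with made-up narrations (the heavier embedding-based metrics are disabled):

```python
from nlgeval import NLGEval

hypotheses = ['C opens the fridge.', 'C cuts a tomato.']            # generated narrations
references = [['C opens the refrigerator.', 'C cuts the tomato.']]  # one set of ground-truth narrations

scorer = NLGEval(no_skipthoughts=True, no_glove=True)
print(scorer.compute_metrics(ref_list=references, hyp_list=hypotheses))  # Bleu_1..4, METEOR, ROUGE_L, CIDEr
```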
79 | 80 | ## Narrating video clips using LAVILA-Narrator 81 | 82 | 83 |
Infer the narrator 84 | 85 | ```bash 86 | python run_with_submitit_infer_narrator.py \ 87 | --metadata datasets/Ego4D/ego4d_train.pkl \ 88 | --batch-size 64 \ 89 | --resume $PATH --use-half \ 90 | --nodes 4 --use_volta32 91 | ``` 92 |
93 | 94 | It will generate a pickle file (`$output_dir/total.pkl`) which is a list of quintuples: `(video_uid: str, start_time: float, end_time: float, narration_list: List[str], NLL_list: List[float])`. 95 | 96 | For narrator-generated narrations on Ego4D ground-truth clips, we also provide a [replica](https://dl.fbaipublicfiles.com/lavila/metadata/ego4d/ego4d_train.narrator_63690737.return_10.pkl). Note that the narrator used here is our best-performing one. 97 | 98 | In addition, we can apply this narrator over the entire video for temporally dense auto-narration. We provide a [replica](https://dl.fbaipublicfiles.com/lavila/metadata/ego4d/ego4d_train.uncovered_all.narrator_63690737.return_5.pkl) (excluding the annotated clips). 99 | 100 | ## Rephrasing human narrations using LAVILA-Rephraser 101 | 102 | Rephraser is a standard LLM that can paraphrase narrations in existing clips. 103 | Specifically, we use an off-the-shelf T5-based paraphraser which is publicly available at [Hugging Face's model hub](https://huggingface.co/ramsrigouthamg/t5-large-paraphraser-diverse-high-quality). 104 | For more details, please refer to the [model card](https://huggingface.co/ramsrigouthamg/t5-large-paraphraser-diverse-high-quality). 105 | 106 | For rephrased human narrations on Ego4D ground-truth clips, we provide a [replica](https://dl.fbaipublicfiles.com/lavila/metadata/ego4d/ego4d_train.rephraser.no_punkt_top3.pkl). 107 | 108 | 109 | ## Pre-training LAVILA Dual-Encoder 110 | Now we are ready to pre-train the LAVILA dual-encoder by combining human annotations (augmented by Rephraser) and the Narrator-generated narrations. 111 | 112 |
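Before launching, it can be useful to sanity-check the narrator metadata. A minimal sketch, assuming the quintuple layout described earlier (the path is illustrative):

```python
import pickle

with open('output/total.pkl', 'rb') as f:  # or one of the replica .pkl files above
    entries = pickle.load(f)

video_uid, start_time, end_time, narrations, nlls = entries[0]
print(video_uid, start_time, end_time)
for text, nll in zip(narrations, nlls):
    print(f'{nll:.3f}  {text}')
```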
Training a LaViLa dual-encoder 113 | 114 | ```bash 115 | python run_with_submitit_pretrain.py --model CLIP_OPENAI_TIMESFORMER_BASE \ 116 | --metadata datasets/Ego4D/ego4d_train.rephraser.no_punkt_top3.pkl \ 117 | --metadata-aux datasets/Ego4D/ego4d_train.narrator_63690737.return_10.pkl \ 118 | --norm-embed --freeze-temperature \ 119 | --freeze-pseudo-temperature \ 120 | --fix-lr --contrastive-use-vissl \ 121 | --nodes 4 --use_volta32 # optionally also pass `datasets/Ego4D/ego4d_train.uncovered_all.narrator_63690737.return_5.pkl` to --metadata-aux 122 | ``` 123 |
124 | 125 | ## Down-stream Evaluation 126 | With the pre-trained dual-encoder at hand, we can now do zero-shot or fine-tuning evaluations on down-stream benchmarks. 127 | Please refer to [MODEL_ZOO.md](./MODEL_ZOO.md#zero-shot) for more details. 128 | -------------------------------------------------------------------------------- /LaViLa/lavila/data/__pycache__/datasets.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/LaViLa/lavila/data/__pycache__/datasets.cpython-39.pyc -------------------------------------------------------------------------------- /LaViLa/lavila/data/__pycache__/video_transforms.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/LaViLa/lavila/data/__pycache__/video_transforms.cpython-39.pyc -------------------------------------------------------------------------------- /LaViLa/lavila/data/video_transforms.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import math 8 | from typing import Sequence 9 | import torch 10 | import torch.nn as nn 11 | from torchvision import transforms 12 | 13 | 14 | class Permute(nn.Module): 15 | """ 16 | Permutation as an op 17 | """ 18 | 19 | def __init__(self, ordering): 20 | super().__init__() 21 | self.ordering = ordering 22 | 23 | def forward(self, frames): 24 | """ 25 | Args: 26 | frames in some ordering, by default (C, T, H, W) 27 | Returns: 28 | frames in the ordering that was specified 29 | """ 30 | return frames.permute(self.ordering) 31 | 32 | 33 | class TemporalCrop(nn.Module): 34 | """ 35 | Convert the video into smaller clips temporally. 36 | """ 37 | 38 | def __init__( 39 | self, frames_per_clip: int = 8, stride: int = 8, frame_stride: int = 1 40 | ): 41 | super().__init__() 42 | self.frames = frames_per_clip 43 | self.stride = stride 44 | self.frame_stride = frame_stride 45 | 46 | def forward(self, video): 47 | assert video.ndim == 4, "Must be (C, T, H, W)" 48 | res = [] 49 | for start in range( 50 | 0, video.size(1) - (self.frames * self.frame_stride) + 1, self.stride 51 | ): 52 | end = start + (self.frames) * self.frame_stride 53 | res.append(video[:, start: end: self.frame_stride, ...]) 54 | return res 55 | 56 | 57 | def crop_boxes(boxes, x_offset, y_offset): 58 | """ 59 | Perform crop on the bounding boxes given the offsets. 60 | Args: 61 | boxes (ndarray or None): bounding boxes to perform crop. The dimension 62 | is `num boxes` x 4. 63 | x_offset (int): cropping offset in the x axis. 64 | y_offset (int): cropping offset in the y axis. 65 | Returns: 66 | cropped_boxes (ndarray or None): the cropped boxes with dimension of 67 | `num boxes` x 4. 68 | """ 69 | cropped_boxes = boxes.copy() 70 | cropped_boxes[:, [0, 2]] = boxes[:, [0, 2]] - x_offset 71 | cropped_boxes[:, [1, 3]] = boxes[:, [1, 3]] - y_offset 72 | 73 | return cropped_boxes 74 | 75 | 76 | def uniform_crop(images, size, spatial_idx, boxes=None, scale_size=None): 77 | """ 78 | Perform uniform spatial sampling on the images and corresponding boxes. 79 | Args: 80 | images (tensor): images to perform uniform crop.
The dimension is 81 | `num frames` x `channel` x `height` x `width`. 82 | size (int): size of height and weight to crop the images. 83 | spatial_idx (int): 0, 1, or 2 for left, center, and right crop if width 84 | is larger than height. Or 0, 1, or 2 for top, center, and bottom 85 | crop if height is larger than width. 86 | boxes (ndarray or None): optional. Corresponding boxes to images. 87 | Dimension is `num boxes` x 4. 88 | scale_size (int): optinal. If not None, resize the images to scale_size before 89 | performing any crop. 90 | Returns: 91 | cropped (tensor): images with dimension of 92 | `num frames` x `channel` x `size` x `size`. 93 | cropped_boxes (ndarray or None): the cropped boxes with dimension of 94 | `num boxes` x 4. 95 | """ 96 | assert spatial_idx in [0, 1, 2] 97 | ndim = len(images.shape) 98 | if ndim == 3: 99 | images = images.unsqueeze(0) 100 | height = images.shape[2] 101 | width = images.shape[3] 102 | 103 | if scale_size is not None: 104 | if width <= height: 105 | width, height = scale_size, int(height / width * scale_size) 106 | else: 107 | width, height = int(width / height * scale_size), scale_size 108 | images = torch.nn.functional.interpolate( 109 | images, 110 | size=(height, width), 111 | mode="bilinear", 112 | align_corners=False, 113 | ) 114 | 115 | y_offset = int(math.ceil((height - size) / 2)) 116 | x_offset = int(math.ceil((width - size) / 2)) 117 | 118 | if height > width: 119 | if spatial_idx == 0: 120 | y_offset = 0 121 | elif spatial_idx == 2: 122 | y_offset = height - size 123 | else: 124 | if spatial_idx == 0: 125 | x_offset = 0 126 | elif spatial_idx == 2: 127 | x_offset = width - size 128 | cropped = images[:, :, y_offset: y_offset + size, x_offset: x_offset + size] 129 | cropped_boxes = crop_boxes(boxes, x_offset, y_offset) if boxes is not None else None 130 | if ndim == 3: 131 | cropped = cropped.squeeze(0) 132 | return cropped, cropped_boxes 133 | 134 | 135 | class SpatialCrop(nn.Module): 136 | """ 137 | Convert the video into 3 smaller clips spatially. Must be used after the 138 | temporal crops to get spatial crops, and should be used with 139 | -2 in the spatial crop at the slowfast augmentation stage (so full 140 | frames are passed in here). Will return a larger list with the 141 | 3x spatial crops as well. It's useful for 3x4 testing (eg in SwinT) 142 | or 3x10 testing in SlowFast etc. 143 | """ 144 | 145 | def __init__(self, crop_size: int = 224, num_crops: int = 3): 146 | super().__init__() 147 | self.crop_size = crop_size 148 | if num_crops == 6: 149 | self.crops_to_ext = [0, 1, 2] 150 | # I guess Swin uses 5 crops without flipping, but that doesn't 151 | # make sense given they first resize to 224 and take 224 crops. 152 | # (pg 6 of https://arxiv.org/pdf/2106.13230.pdf) 153 | # So I'm assuming we can use flipped crops and that will add sth.. 154 | self.flipped_crops_to_ext = [0, 1, 2] 155 | elif num_crops == 3: 156 | self.crops_to_ext = [0, 1, 2] 157 | self.flipped_crops_to_ext = [] 158 | elif num_crops == 1: 159 | self.crops_to_ext = [1] 160 | self.flipped_crops_to_ext = [] 161 | else: 162 | raise NotImplementedError( 163 | "Nothing else supported yet, " 164 | "slowfast only takes 0, 1, 2 as arguments" 165 | ) 166 | 167 | def forward(self, videos: Sequence[torch.Tensor]): 168 | """ 169 | Args: 170 | videos: A list of C, T, H, W videos. 171 | Returns: 172 | videos: A list with 3x the number of elements. Each video converted 173 | to C, T, H', W' by spatial cropping. 
174 | """ 175 | assert isinstance(videos, list), "Must be a list of videos after temporal crops" 176 | assert all([video.ndim == 4 for video in videos]), "Must be (C,T,H,W)" 177 | res = [] 178 | for video in videos: 179 | for spatial_idx in self.crops_to_ext: 180 | res.append(uniform_crop(video, self.crop_size, spatial_idx)[0]) 181 | if not self.flipped_crops_to_ext: 182 | continue 183 | flipped_video = transforms.functional.hflip(video) 184 | for spatial_idx in self.flipped_crops_to_ext: 185 | res.append(uniform_crop(flipped_video, self.crop_size, spatial_idx)[0]) 186 | return res 187 | -------------------------------------------------------------------------------- /LaViLa/lavila/models/__pycache__/coca.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/LaViLa/lavila/models/__pycache__/coca.cpython-39.pyc -------------------------------------------------------------------------------- /LaViLa/lavila/models/__pycache__/distributed_utils.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/LaViLa/lavila/models/__pycache__/distributed_utils.cpython-39.pyc -------------------------------------------------------------------------------- /LaViLa/lavila/models/__pycache__/gpt2_gated.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/LaViLa/lavila/models/__pycache__/gpt2_gated.cpython-39.pyc -------------------------------------------------------------------------------- /LaViLa/lavila/models/__pycache__/loss.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/LaViLa/lavila/models/__pycache__/loss.cpython-39.pyc -------------------------------------------------------------------------------- /LaViLa/lavila/models/__pycache__/models.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/LaViLa/lavila/models/__pycache__/models.cpython-39.pyc -------------------------------------------------------------------------------- /LaViLa/lavila/models/__pycache__/narrator.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/LaViLa/lavila/models/__pycache__/narrator.cpython-39.pyc -------------------------------------------------------------------------------- /LaViLa/lavila/models/__pycache__/openai_clip.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/LaViLa/lavila/models/__pycache__/openai_clip.cpython-39.pyc -------------------------------------------------------------------------------- /LaViLa/lavila/models/__pycache__/openai_model.cpython-39.pyc: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/LaViLa/lavila/models/__pycache__/openai_model.cpython-39.pyc -------------------------------------------------------------------------------- /LaViLa/lavila/models/__pycache__/timesformer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/LaViLa/lavila/models/__pycache__/timesformer.cpython-39.pyc -------------------------------------------------------------------------------- /LaViLa/lavila/models/__pycache__/tokenizer.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/LaViLa/lavila/models/__pycache__/tokenizer.cpython-39.pyc -------------------------------------------------------------------------------- /LaViLa/lavila/models/__pycache__/utils.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/LaViLa/lavila/models/__pycache__/utils.cpython-39.pyc -------------------------------------------------------------------------------- /LaViLa/lavila/models/bpe_simple_vocab_16e6.txt.gz: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/LaViLa/lavila/models/bpe_simple_vocab_16e6.txt.gz -------------------------------------------------------------------------------- /LaViLa/lavila/models/coca.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | # Part of the code is from https://github.com/lucidrains/CoCa-pytorch/blob/main/coca_pytorch/coca_pytorch.py 8 | # Modified by Yue Zhao 9 | # The original code is under MIT License 10 | 11 | import torch 12 | import torch.nn as nn 13 | import torch.nn.functional as F 14 | from torch import einsum 15 | from einops import rearrange 16 | 17 | 18 | def exists(val): 19 | return val is not None 20 | 21 | 22 | def default(val, d): 23 | return val if exists(val) else d 24 | 25 | 26 | # normalization 27 | # they use layernorm without bias, something that pytorch does not offer 28 | class LayerNorm(nn.Module): 29 | def __init__(self, dim): 30 | super().__init__() 31 | self.gamma = nn.Parameter(torch.ones(dim)) 32 | self.register_buffer("beta", torch.zeros(dim)) 33 | 34 | def forward(self, x): 35 | return F.layer_norm(x, x.shape[-1:], self.gamma, self.beta) 36 | 37 | 38 | class Residual(nn.Module): 39 | def __init__(self, fn): 40 | super().__init__() 41 | self.fn = fn 42 | 43 | def forward(self, x, *args, **kwargs): 44 | return self.fn(x, *args, **kwargs) + x 45 | 46 | 47 | # classic Noam Shazeer paper, except here they use SwiGLU instead of the more popular GEGLU for gating the feedforward 48 | # https://arxiv.org/abs/2002.05202 49 | class SwiGLU(nn.Module): 50 | def forward(self, x): 51 | x, gate = x.chunk(2, dim=-1) 52 | return F.silu(gate) * x 53 | 54 | 55 | class CrossAttention(nn.Module): 56 | def __init__( 57 | self, 58 | dim, 59 | *, 60 | context_dim=None, 61 | dim_head=64, 62 | heads=8, 63 | parallel_ff=False, 64 | ff_mult=4, 65 | norm_context=False 66 | ): 67 | super().__init__() 68 | self.heads = heads 69 | self.scale = dim_head ** -0.5 70 | inner_dim = heads * dim_head 71 | context_dim = default(context_dim, dim) 72 | 73 | self.norm = LayerNorm(dim) 74 | self.context_norm = LayerNorm(context_dim) if norm_context else nn.Identity() 75 | 76 | self.to_q = nn.Linear(dim, inner_dim, bias=False) 77 | self.to_kv = nn.Linear(context_dim, dim_head * 2, bias=False) 78 | self.to_out = nn.Linear(inner_dim, dim, bias=False) 79 | 80 | # whether to have parallel feedforward 81 | 82 | ff_inner_dim = ff_mult * dim 83 | 84 | self.ff = nn.Sequential( 85 | nn.Linear(dim, ff_inner_dim * 2, bias=False), 86 | SwiGLU(), 87 | nn.Linear(ff_inner_dim, dim, bias=False) 88 | ) if parallel_ff else None 89 | 90 | def forward(self, x, context): 91 | """ 92 | einstein notation 93 | b - batch 94 | h - heads 95 | n, i, j - sequence length (base sequence length, source, target) 96 | d - feature dimension 97 | """ 98 | 99 | # pre-layernorm, for queries and context 100 | x = self.norm(x) 101 | context = self.context_norm(context) 102 | 103 | # get queries 104 | q = self.to_q(x) 105 | q = rearrange(q, 'b n (h d) -> b h n d', h=self.heads) 106 | 107 | # scale 108 | q = q * self.scale 109 | 110 | # get key / values 111 | k, v = self.to_kv(context).chunk(2, dim=-1) 112 | 113 | # query / key similarity 114 | sim = einsum('b h i d, b j d -> b h i j', q, k) 115 | 116 | # attention 117 | sim = sim - sim.amax(dim=-1, keepdim=True) 118 | attn = sim.softmax(dim=-1) 119 | 120 | # aggregate 121 | out = einsum('b h i j, b j d -> b h i d', attn, v) 122 | 123 | # merge and combine heads 124 | out = rearrange(out, 'b h n d -> b n (h d)') 125 | out = self.to_out(out) 126 | 127 | # add parallel feedforward (for multimodal layers) 128 | if exists(self.ff): 129 | out = out + self.ff(x) 130 | 131 | return out 132 | -------------------------------------------------------------------------------- /LaViLa/lavila/models/distributed_utils.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) Facebook, Inc. and its affiliates. 2 | 3 | # This source code is licensed under the MIT license found in the 4 | # LICENSE file in the root directory of this source tree. 5 | 6 | # Part of the code is from 7 | # `https://github.com/facebookresearch/vissl/blob/main/vissl/utils/distributed_utils.py` and 8 | # `https://github.com/facebookresearch/ClassyVision/blob/main/classy_vision/generic/distributed_util.py` 9 | # Modified by Yue Zhao 10 | # The original code is under MIT License 11 | 12 | import torch 13 | import torch.distributed as dist 14 | from typing import Tuple 15 | 16 | 17 | def convert_to_distributed_tensor(tensor: torch.Tensor) -> Tuple[torch.Tensor, str]: 18 | """ 19 | For some backends, such as NCCL, communication only works if the 20 | tensor is on the GPU. This helper function converts to the correct 21 | device and returns the tensor + original device. 22 | """ 23 | orig_device = "cpu" if not tensor.is_cuda else "gpu" 24 | if ( 25 | torch.distributed.is_available() 26 | and torch.distributed.get_backend() == torch.distributed.Backend.NCCL 27 | and not tensor.is_cuda 28 | ): 29 | tensor = tensor.cuda() 30 | return (tensor, orig_device) 31 | 32 | 33 | def convert_to_normal_tensor(tensor: torch.Tensor, orig_device: str) -> torch.Tensor: 34 | """ 35 | For some backends, such as NCCL, communication only works if the 36 | tensor is on the GPU. This converts the tensor back to original device. 37 | """ 38 | if tensor.is_cuda and orig_device == "cpu": 39 | tensor = tensor.cpu() 40 | return tensor 41 | 42 | 43 | def is_distributed_training_run() -> bool: 44 | return ( 45 | torch.distributed.is_available() 46 | and torch.distributed.is_initialized() 47 | and (torch.distributed.get_world_size() > 1) 48 | ) 49 | 50 | 51 | class GatherLayer(torch.autograd.Function): 52 | """ 53 | Gather tensors from all workers with support for backward propagation: 54 | This implementation does not cut the gradients as torch.distributed.all_gather does. 55 | """ 56 | 57 | @staticmethod 58 | def forward(ctx, x): 59 | output = [torch.zeros_like(x) for _ in range(dist.get_world_size())] 60 | dist.all_gather(output, x) 61 | return tuple(output) 62 | 63 | @staticmethod 64 | def backward(ctx, *grads): 65 | all_gradients = torch.stack(grads) 66 | dist.all_reduce(all_gradients) 67 | return all_gradients[dist.get_rank()] 68 | 69 | 70 | def gather_from_all(tensor: torch.Tensor) -> torch.Tensor: 71 | """ 72 | Similar to classy_vision.generic.distributed_util.gather_from_all 73 | except that it does not cut the gradients 74 | """ 75 | if tensor.ndim == 0: 76 | # 0 dim tensors cannot be gathered. so unsqueeze 77 | tensor = tensor.unsqueeze(0) 78 | 79 | if is_distributed_training_run(): 80 | tensor, orig_device = convert_to_distributed_tensor(tensor) 81 | gathered_tensors = GatherLayer.apply(tensor) 82 | gathered_tensors = [ 83 | convert_to_normal_tensor(_tensor, orig_device) 84 | for _tensor in gathered_tensors 85 | ] 86 | else: 87 | gathered_tensors = [tensor] 88 | gathered_tensor = torch.cat(gathered_tensors, 0) 89 | return gathered_tensor 90 | -------------------------------------------------------------------------------- /LaViLa/lavila/models/utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 
3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | from collections import OrderedDict 8 | import functools 9 | import torch 10 | import torch.nn.functional as F 11 | 12 | 13 | def inflate_positional_embeds( 14 | current_model_state_dict, new_state_dict, 15 | num_frames=4, 16 | load_temporal_fix='bilinear', 17 | ): 18 | # allow loading of timesformer with fewer num_frames 19 | curr_keys = list(current_model_state_dict.keys()) 20 | if 'visual.temporal_embed' in new_state_dict and 'visual.temporal_embed' in curr_keys: 21 | load_temporal_embed = new_state_dict['visual.temporal_embed'] 22 | load_num_frames = load_temporal_embed.shape[1] 23 | curr_num_frames = num_frames 24 | embed_dim = load_temporal_embed.shape[2] 25 | 26 | if load_num_frames != curr_num_frames: 27 | if load_num_frames > curr_num_frames: 28 | print(f'### loaded SpaceTimeTransformer model has MORE frames than current...' 29 | f'### loading weights, filling in the extras via {load_temporal_fix}') 30 | new_temporal_embed = load_temporal_embed[:, :curr_num_frames, :] 31 | else: 32 | print(f'### loaded SpaceTimeTransformer model has FEWER frames than current...' 33 | f'### loading weights, filling in the extras via {load_temporal_fix}') 34 | if load_temporal_fix == 'zeros': 35 | new_temporal_embed = torch.zeros([load_temporal_embed.shape[0], curr_num_frames, embed_dim]) 36 | new_temporal_embed[:, :load_num_frames] = load_temporal_embed 37 | elif load_temporal_fix in ['interp', 'bilinear']: 38 | # interpolate 39 | # unsqueeze so pytorch thinks its an image 40 | mode = 'nearest' 41 | if load_temporal_fix == 'bilinear': 42 | mode = 'bilinear' 43 | load_temporal_embed = load_temporal_embed.unsqueeze(0) 44 | new_temporal_embed = F.interpolate(load_temporal_embed, 45 | (curr_num_frames, embed_dim), mode=mode).squeeze(0) 46 | else: 47 | raise NotImplementedError 48 | new_state_dict['visual.temporal_embed'] = new_temporal_embed 49 | # allow loading with smaller spatial patches. 
assumes custom border crop, to append the 50 | # border patches to the input sequence 51 | if 'visual.pos_embed' in new_state_dict and 'visual.pos_embed' in curr_keys: 52 | load_pos_embed = new_state_dict['visual.pos_embed'] 53 | load_num_patches = load_pos_embed.shape[1] 54 | curr_pos_embed = current_model_state_dict['visual.pos_embed'] 55 | if load_num_patches != curr_pos_embed.shape[1]: 56 | raise NotImplementedError( 57 | 'Loading models with different spatial resolution / patch number not yet implemented, sorry.') 58 | 59 | return new_state_dict 60 | 61 | 62 | def rsetattr(obj, attr, val): 63 | pre, _, post = attr.rpartition('.') 64 | return setattr(rgetattr(obj, pre) if pre else obj, post, val) 65 | 66 | 67 | def rgetattr(obj, attr, *args): 68 | def _getattr(obj, attr): 69 | return getattr(obj, attr, *args) 70 | return functools.reduce(_getattr, [obj] + attr.split('.')) 71 | 72 | 73 | # util functions to convert CLIP-style model keys to TimeSformer-style 74 | def remap_keys(clip_state_dict, transformer_layers=12): 75 | remapped_state_dict = OrderedDict() 76 | key_mapping = { 77 | "class_embedding": "cls_token", 78 | "positional_embedding": "pos_embed", 79 | "conv1.weight": "patch_embed.proj.weight", 80 | "ln_pre.weight": "ln_pre.weight", 81 | "ln_pre.bias": "ln_pre.bias", 82 | "ln_post.weight": "norm.weight", 83 | "ln_post.bias": "norm.bias", 84 | } 85 | for layer in range(transformer_layers): 86 | key_mapping[f"transformer.resblocks.{layer}.attn.in_proj_weight"] = f"blocks.{layer}.attn.qkv.weight" 87 | key_mapping[f"transformer.resblocks.{layer}.attn.in_proj_bias"] = f"blocks.{layer}.attn.qkv.bias" 88 | key_mapping[f"transformer.resblocks.{layer}.attn.out_proj.weight"] = f"blocks.{layer}.attn.proj.weight" 89 | key_mapping[f"transformer.resblocks.{layer}.attn.out_proj.bias"] = f"blocks.{layer}.attn.proj.bias" 90 | key_mapping[f"transformer.resblocks.{layer}.ln_1.weight"] = f"blocks.{layer}.norm1.weight" 91 | key_mapping[f"transformer.resblocks.{layer}.ln_1.bias"] = f"blocks.{layer}.norm1.bias" 92 | key_mapping[f"transformer.resblocks.{layer}.mlp.c_fc.weight"] = f"blocks.{layer}.mlp.fc1.weight" 93 | key_mapping[f"transformer.resblocks.{layer}.mlp.c_fc.bias"] = f"blocks.{layer}.mlp.fc1.bias" 94 | key_mapping[f"transformer.resblocks.{layer}.mlp.c_proj.weight"] = f"blocks.{layer}.mlp.fc2.weight" 95 | key_mapping[f"transformer.resblocks.{layer}.mlp.c_proj.bias"] = f"blocks.{layer}.mlp.fc2.bias" 96 | key_mapping[f"transformer.resblocks.{layer}.ln_2.weight"] = f"blocks.{layer}.norm2.weight" 97 | key_mapping[f"transformer.resblocks.{layer}.ln_2.bias"] = f"blocks.{layer}.norm2.bias" 98 | 99 | for key in clip_state_dict: 100 | if key == 'proj': 101 | continue # due to possible dim mismatch, we load this later 102 | if key == "class_embedding": 103 | clip_state_dict[key] = clip_state_dict[key].unsqueeze(0).unsqueeze(0) 104 | if key == "positional_embedding": 105 | clip_state_dict[key] = clip_state_dict[key].unsqueeze(0) 106 | remapped_state_dict[key_mapping[key]] = clip_state_dict[key] 107 | 108 | return remapped_state_dict 109 | -------------------------------------------------------------------------------- /LaViLa/lavila/utils/__pycache__/distributed.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/LaViLa/lavila/utils/__pycache__/distributed.cpython-39.pyc -------------------------------------------------------------------------------- 
/LaViLa/lavila/utils/__pycache__/preprocess.cpython-39.pyc: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/LaViLa/lavila/utils/__pycache__/preprocess.cpython-39.pyc -------------------------------------------------------------------------------- /LaViLa/lavila/utils/distributed.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import os 8 | import shutil 9 | import torch 10 | import torch.distributed as dist 11 | 12 | 13 | def get_model(model): 14 | if isinstance(model, torch.nn.DataParallel) \ 15 | or isinstance(model, torch.nn.parallel.DistributedDataParallel): 16 | return model.module 17 | else: 18 | return model 19 | 20 | 21 | def setup_for_distributed(is_master): 22 | """ 23 | This function disables printing when not in master process 24 | """ 25 | import builtins as __builtin__ 26 | builtin_print = __builtin__.print 27 | 28 | def print(*args, **kwargs): 29 | force = kwargs.pop('force', False) 30 | if is_master or force: 31 | builtin_print(*args, **kwargs) 32 | 33 | __builtin__.print = print 34 | 35 | 36 | def is_dist_avail_and_initialized(): 37 | if not dist.is_available(): 38 | return False 39 | if not dist.is_initialized(): 40 | return False 41 | return True 42 | 43 | 44 | def get_world_size(): 45 | if not is_dist_avail_and_initialized(): 46 | return 1 47 | else: 48 | return dist.get_world_size() 49 | 50 | 51 | def get_rank(): 52 | if not is_dist_avail_and_initialized(): 53 | return 0 54 | return dist.get_rank() 55 | 56 | 57 | def is_main_process(): 58 | return get_rank() == 0 59 | 60 | 61 | def save_on_master(state, is_best, output_dir, is_epoch=True): 62 | if is_main_process(): 63 | ckpt_path = f'{output_dir}/checkpoint.pt' 64 | best_path = f'{output_dir}/checkpoint_best.pt' 65 | if is_best: 66 | torch.save(state, best_path) 67 | if is_epoch: 68 | if isinstance(state['epoch'], int): 69 | ckpt2_path = '{}/checkpoint_{:04d}.pt'.format(output_dir, state['epoch']) 70 | else: 71 | ckpt2_path = '{}/checkpoint_{:.4f}.pt'.format(output_dir, state['epoch']) 72 | torch.save(state, ckpt_path) 73 | shutil.copy(ckpt_path, ckpt2_path) 74 | 75 | 76 | def init_distributed_mode(args): 77 | if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ: 78 | args.rank = int(os.environ["RANK"]) 79 | args.world_size = int(os.environ['WORLD_SIZE']) 80 | args.gpu = int(os.environ['LOCAL_RANK']) 81 | elif 'SLURM_PROCID' in os.environ: 82 | args.rank = int(os.environ['SLURM_PROCID']) 83 | args.gpu = args.rank % torch.cuda.device_count() 84 | else: 85 | print('Not using distributed mode') 86 | args.distributed = False 87 | return 88 | 89 | args.distributed = True 90 | 91 | torch.cuda.set_device(args.gpu) 92 | args.dist_backend = 'nccl' 93 | print('| distributed init (rank {}): {}'.format( 94 | args.rank, args.dist_url), flush=True) 95 | torch.distributed.init_process_group( 96 | backend=args.dist_backend, 97 | init_method=args.dist_url, 98 | world_size=args.world_size, 99 | rank=args.rank 100 | ) 101 | torch.distributed.barrier() 102 | setup_for_distributed(args.rank == 0) 103 | -------------------------------------------------------------------------------- /LaViLa/lavila/utils/evaluation.py: 
-------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import numpy as np 8 | import torch 9 | 10 | 11 | def accuracy(output, target, topk=(1,)): 12 | """Computes the accuracy over the k top predictions for the specified values of k""" 13 | with torch.no_grad(): 14 | maxk = max(topk) 15 | batch_size = target.size(0) 16 | 17 | _, pred = output.topk(maxk, 1, True, True) 18 | pred = pred.t() 19 | correct = pred.eq(target.reshape(1, -1).expand_as(pred)) 20 | 21 | res = [] 22 | for k in topk: 23 | correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) 24 | res.append(correct_k.mul_(100.0 / batch_size)) 25 | return res 26 | 27 | 28 | def get_mean_accuracy(cm): 29 | list_acc = [] 30 | for i in range(len(cm)): 31 | acc = 0 32 | if cm[i, :].sum() > 0: 33 | acc = cm[i, i] / cm[i, :].sum() 34 | list_acc.append(acc) 35 | 36 | return 100 * np.mean(list_acc), 100 * np.trace(cm) / np.sum(cm) 37 | -------------------------------------------------------------------------------- /LaViLa/lavila/utils/evaluation_charades.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import numpy as np 8 | 9 | 10 | def compute_map(submission_array, gt_array): 11 | """ Returns mAP, weighted mAP, and AP array """ 12 | m_aps = [] 13 | n_classes = submission_array.shape[1] 14 | for oc_i in range(n_classes): 15 | sorted_idxs = np.argsort(-submission_array[:, oc_i]) 16 | tp = gt_array[:, oc_i][sorted_idxs] == 1 17 | fp = np.invert(tp) 18 | n_pos = tp.sum() 19 | if n_pos < 0.1: 20 | m_aps.append(float('nan')) 21 | continue 22 | fp.sum() 23 | f_pcs = np.cumsum(fp) 24 | t_pcs = np.cumsum(tp) 25 | prec = t_pcs / (f_pcs+t_pcs).astype(float) 26 | avg_prec = 0 27 | for i in range(submission_array.shape[0]): 28 | if tp[i]: 29 | avg_prec += prec[i] 30 | m_aps.append(avg_prec / n_pos.astype(float)) 31 | m_aps = np.array(m_aps) 32 | m_ap = np.mean(m_aps) 33 | w_ap = (m_aps * gt_array.sum(axis=0) / gt_array.sum().sum().astype(float)) 34 | return m_ap, w_ap, m_aps 35 | 36 | 37 | def charades_map(submission_array, gt_array): 38 | """ 39 | Approximate version of the charades evaluation function 40 | For precise numbers, use the submission file with the official matlab script 41 | """ 42 | fix = submission_array.copy() 43 | empty = np.sum(gt_array, axis=1) == 0 44 | fix[empty, :] = np.NINF 45 | return compute_map(fix, gt_array) 46 | 47 | 48 | def create_submission(video_list, predictions, out_file): 49 | assert len(video_list) == predictions.shape[0] 50 | with open(out_file, 'w') as f: 51 | for i, video_id in enumerate(video_list): 52 | pred_str = ' '.join(map(lambda x: str(x), predictions[i].tolist())) 53 | f.write('{} {}\n\n'.format(video_id, pred_str)) 54 | -------------------------------------------------------------------------------- /LaViLa/lavila/utils/evaluation_egomcq.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 
3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import torch 8 | 9 | 10 | def egomcq_accuracy_metrics(preds, labels, types): 11 | metrics = {} 12 | type_list = torch.unique(types) 13 | group_list = ["Intra-video", "Inter-video"] 14 | for type_i, group_i in zip(type_list, group_list): 15 | correct = 0 16 | total = 0 17 | for pred, label, type in zip(preds, labels, types): 18 | if type == type_i: 19 | pred_ = torch.argmax(pred) 20 | if pred_.item() == label.item(): 21 | correct += 1 22 | total += 1 23 | accuracy = correct/total 24 | metrics[group_i] = accuracy * 100 25 | return metrics 26 | -------------------------------------------------------------------------------- /LaViLa/lavila/utils/evaluation_ek100cls.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | # Part of the code is from https://github.com/fpv-iplab/rulstm/blob/master/RULSTM/utils.py 8 | # Modified by Yue Zhao 9 | 10 | import numpy as np 11 | 12 | 13 | def get_marginal_indexes(actions, mode): 14 | """For each verb/noun retrieve the list of actions containing that verb/name 15 | Input: 16 | mode: "verb" or "noun" 17 | Output: 18 | a list of numpy array of indexes. If verb/noun 3 is contained in actions 2,8,19, 19 | then output[3] will be np.array([2,8,19]) 20 | """ 21 | vi = [] 22 | for v in range(actions[mode].max()+1): 23 | vals = actions[actions[mode] == v].index.values 24 | if len(vals) > 0: 25 | vi.append(vals) 26 | else: 27 | vi.append(np.array([0])) 28 | return vi 29 | 30 | 31 | def marginalize(probs, indexes): 32 | mprobs = [] 33 | for ilist in indexes: 34 | mprobs.append(probs[:, ilist].sum(1)) 35 | return np.array(mprobs).T 36 | -------------------------------------------------------------------------------- /LaViLa/lavila/utils/meter.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | import torch 8 | import torch.distributed as dist 9 | from lavila.utils import distributed as dist_utils 10 | 11 | 12 | class AverageMeter(object): 13 | """Computes and stores the average and current value""" 14 | def __init__(self, name, fmt=':f'): 15 | self.name = name 16 | self.fmt = fmt 17 | self.reset() 18 | 19 | def reset(self): 20 | self.val = 0 21 | self.avg = 0 22 | self.sum = 0 23 | self.count = 0 24 | 25 | def update(self, val, n=1): 26 | self.val = val 27 | self.sum += val * n 28 | self.count += n 29 | self.avg = self.sum / self.count 30 | 31 | def synchronize(self): 32 | if not dist_utils.is_dist_avail_and_initialized(): 33 | return 34 | t = torch.tensor([self.sum, self.count], dtype=torch.float64, device='cuda') 35 | dist.barrier() 36 | dist.all_reduce(t) 37 | t = t.tolist() 38 | self.sum = int(t[0]) 39 | self.count = t[1] 40 | self.avg = self.sum / self.count 41 | 42 | def __str__(self): 43 | fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' 44 | return fmtstr.format(**self.__dict__) 45 | 46 | 47 | class ProgressMeter(object): 48 | def __init__(self, num_batches, meters, prefix=""): 49 | self.batch_fmtstr = self._get_batch_fmtstr(num_batches) 50 | self.meters = meters 51 | self.prefix = prefix 52 | 53 | def display(self, batch): 54 | entries = [self.prefix + self.batch_fmtstr.format(batch)] 55 | entries += [str(meter) for meter in self.meters] 56 | print('\t'.join(entries)) 57 | 58 | def synchronize(self): 59 | for meter in self.meters: 60 | meter.synchronize() 61 | 62 | def _get_batch_fmtstr(self, num_batches): 63 | num_digits = len(str(num_batches // 1)) 64 | fmt = '{:' + str(num_digits) + 'd}' 65 | return '[' + fmt + '/' + fmt.format(num_batches) + ']' 66 | -------------------------------------------------------------------------------- /LaViLa/lavila/utils/preprocess.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | import csv 8 | 9 | from lavila.models.tokenizer import MyBertTokenizer, MyDistilBertTokenizer, MyGPT2Tokenizer, SimpleTokenizer 10 | 11 | 12 | def generate_label_map(dataset): 13 | if dataset == 'ek100_cls': 14 | print("Preprocess ek100 action label space") 15 | vn_list = [] 16 | mapping_vn2narration = {} 17 | for f in [ 18 | 'datasets/EK100/epic-kitchens-100-annotations/EPIC_100_train.csv', 19 | 'datasets/EK100/epic-kitchens-100-annotations/EPIC_100_validation.csv', 20 | ]: 21 | csv_reader = csv.reader(open(f)) 22 | _ = next(csv_reader) # skip the header 23 | for row in csv_reader: 24 | vn = '{}:{}'.format(int(row[10]), int(row[12])) 25 | narration = row[8] 26 | if vn not in vn_list: 27 | vn_list.append(vn) 28 | if vn not in mapping_vn2narration: 29 | mapping_vn2narration[vn] = [narration] 30 | else: 31 | mapping_vn2narration[vn].append(narration) 32 | # mapping_vn2narration[vn] = [narration] 33 | vn_list = sorted(vn_list) 34 | print('# of action= {}'.format(len(vn_list))) 35 | mapping_vn2act = {vn: i for i, vn in enumerate(vn_list)} 36 | labels = [list(set(mapping_vn2narration[vn_list[i]])) for i in range(len(mapping_vn2act))] 37 | print(labels[:5]) 38 | elif dataset == 'charades_ego': 39 | print("=> preprocessing charades_ego action label space") 40 | vn_list = [] 41 | labels = [] 42 | with open('datasets/CharadesEgo/CharadesEgo/Charades_v1_classes.txt') as f: 43 | csv_reader = csv.reader(f) 44 | for row in csv_reader: 45 | vn = row[0][:4] 46 | vn_list.append(vn) 47 | narration = row[0][5:] 48 | labels.append(narration) 49 | mapping_vn2act = {vn: i for i, vn in enumerate(vn_list)} 50 | print(labels[:5]) 51 | elif dataset == 'egtea': 52 | print("=> preprocessing egtea action label space") 53 | labels = [] 54 | with open('datasets/EGTEA/action_idx.txt') as f: 55 | for row in f: 56 | row = row.strip() 57 | narration = ' '.join(row.split(' ')[:-1]) 58 | labels.append(narration.replace('_', ' ').lower()) 59 | # labels.append(narration) 60 | mapping_vn2act = {label: i for i, label in enumerate(labels)} 61 | print(len(labels), labels[:5]) 62 | else: 63 | raise NotImplementedError 64 | return labels, mapping_vn2act 65 | 66 | 67 | def generate_tokenizer(model): 68 | if model.endswith('DISTILBERT_BASE'): 69 | tokenizer = MyDistilBertTokenizer('distilbert-base-uncased') 70 | elif model.endswith('BERT_BASE'): 71 | tokenizer = MyBertTokenizer('bert-base-uncased') 72 | elif model.endswith('BERT_LARGE'): 73 | tokenizer = MyBertTokenizer('bert-large-uncased') 74 | elif model.endswith('GPT2'): 75 | tokenizer = MyGPT2Tokenizer('gpt2', add_bos=True) 76 | elif model.endswith('GPT2_MEDIUM'): 77 | tokenizer = MyGPT2Tokenizer('gpt2-medium', add_bos=True) 78 | elif model.endswith('GPT2_LARGE'): 79 | tokenizer = MyGPT2Tokenizer('gpt2-large', add_bos=True) 80 | elif model.endswith('GPT2_XL'): 81 | tokenizer = MyGPT2Tokenizer('gpt2-xl', add_bos=True) 82 | else: 83 | print("Using SimpleTokenizer because of model '{}'. " 84 | "Please check if this is what you want".format(model)) 85 | tokenizer = SimpleTokenizer() 86 | return tokenizer 87 | -------------------------------------------------------------------------------- /LaViLa/lavila/utils/random.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 
6 | 7 | import random 8 | import numpy as np 9 | import torch 10 | 11 | 12 | def random_seed(seed=42, rank=0): 13 | torch.manual_seed(seed + rank) 14 | np.random.seed(seed + rank) 15 | random.seed(seed + rank) 16 | -------------------------------------------------------------------------------- /LaViLa/lavila/utils/scheduler.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | 7 | import numpy as np 8 | 9 | 10 | def cosine_scheduler(base_value, final_value, epochs, niter_per_ep, warmup_epochs=0, start_warmup_value=0): 11 | warmup_schedule = np.array([]) 12 | warmup_iters = warmup_epochs * niter_per_ep 13 | if warmup_epochs > 0: 14 | warmup_schedule = np.linspace(start_warmup_value, base_value, warmup_iters) 15 | 16 | iters = np.arange(epochs * niter_per_ep - warmup_iters) 17 | schedule = final_value + 0.5 * (base_value - final_value) * (1 + np.cos(np.pi * iters / len(iters))) 18 | 19 | schedule = np.concatenate((warmup_schedule, schedule)) 20 | assert len(schedule) == epochs * niter_per_ep 21 | return schedule 22 | -------------------------------------------------------------------------------- /LaViLa/requirements.txt: -------------------------------------------------------------------------------- 1 | timm==0.5.4 2 | torch==1.10.1 3 | torchvision==0.11.2 4 | decord==0.6.0 5 | einops==0.4.1 6 | pandas==1.4.2 7 | pytorchvideo==0.1.5 8 | transformers==4.27 9 | ftfy==4.4.3 10 | spacy==3.4.1 11 | scikit-learn==1.1.1 12 | git+https://github.com/Maluuba/nlg-eval.git@master 13 | -------------------------------------------------------------------------------- /LaViLa/run_with_submitit_finetune_classification.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | """ 7 | A script to run multinode training with submitit. 8 | """ 9 | import argparse 10 | import os 11 | import uuid 12 | from pathlib import Path 13 | 14 | import main_finetune_classification as main_finetune 15 | import submitit 16 | 17 | 18 | def parse_args(): 19 | parser = main_finetune.get_args_parser() 20 | parser = argparse.ArgumentParser("Submitit for lavila fine-tuning", parents=[parser]) 21 | parser.add_argument("--ngpus", default=8, type=int, help="Number of gpus to request on each node") 22 | parser.add_argument("--nodes", default=8, type=int, help="Number of nodes to request") 23 | parser.add_argument("--timeout", default=2880, type=int, help="Duration of the job") 24 | parser.add_argument("--job_dir", default="", type=str, help="Job dir. Leave empty for automatic.") 25 | 26 | parser.add_argument("--partition", default="learnlab", type=str, help="Partition where to submit") 27 | parser.add_argument("--use_volta32", action='store_true', help="Big models? Use this") 28 | parser.add_argument('--comment', default="", type=str, 29 | help='Comment to pass to scheduler, e.g. 
priority message') 30 | return parser.parse_args() 31 | 32 | 33 | def get_shared_folder() -> Path: 34 | user = os.getenv("USER") 35 | if Path("/checkpoint/").is_dir(): 36 | p = Path(f"/checkpoint/{user}/experiments/lavila_ft") 37 | p.mkdir(exist_ok=True) 38 | return p 39 | raise RuntimeError("No shared folder available") 40 | 41 | 42 | def get_init_file(): 43 | # Init file must not exist, but it's parent dir must exist. 44 | os.makedirs(str(get_shared_folder()), exist_ok=True) 45 | init_file = get_shared_folder() / f"{uuid.uuid4().hex}_init" 46 | if init_file.exists(): 47 | os.remove(str(init_file)) 48 | return init_file 49 | 50 | 51 | class Trainer(object): 52 | def __init__(self, args): 53 | self.args = args 54 | 55 | def __call__(self): 56 | import main_finetune_classification as main_finetune 57 | 58 | self._setup_gpu_args() 59 | main_finetune.main(self.args) 60 | 61 | def checkpoint(self): 62 | import submitit 63 | 64 | self.args.dist_url = get_init_file().as_uri() 65 | print("Requeuing ", self.args) 66 | empty_trainer = type(self)(self.args) 67 | return submitit.helpers.DelayedSubmission(empty_trainer) 68 | 69 | def _setup_gpu_args(self): 70 | import submitit 71 | from pathlib import Path 72 | 73 | job_env = submitit.JobEnvironment() 74 | self.args.output_dir = Path(str(self.args.output_dir).replace("%j", str(job_env.job_id))) 75 | self.args.gpu = job_env.local_rank 76 | self.args.rank = job_env.global_rank 77 | self.args.world_size = job_env.num_tasks 78 | print(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}") 79 | 80 | 81 | def main(): 82 | args = parse_args() 83 | if args.job_dir == "": 84 | args.job_dir = get_shared_folder() / "%j" 85 | 86 | # Note that the folder will depend on the job_id, to easily track experiments 87 | executor = submitit.AutoExecutor(folder=args.job_dir, slurm_max_num_timeout=30) 88 | 89 | num_gpus_per_node = args.ngpus 90 | nodes = args.nodes 91 | timeout_min = args.timeout 92 | 93 | partition = args.partition 94 | kwargs = {} 95 | if args.use_volta32: 96 | kwargs['slurm_constraint'] = 'volta32gb' 97 | if args.comment: 98 | kwargs['slurm_comment'] = args.comment 99 | 100 | executor.update_parameters( 101 | mem_gb=40 * num_gpus_per_node, 102 | gpus_per_node=num_gpus_per_node, 103 | tasks_per_node=num_gpus_per_node, # one task per GPU 104 | cpus_per_task=10, 105 | nodes=nodes, 106 | timeout_min=timeout_min, # max is 60 * 72 107 | # Below are cluster dependent parameters 108 | slurm_partition=partition, 109 | slurm_signal_delay_s=120, 110 | **kwargs 111 | ) 112 | 113 | executor.update_parameters(name="lavila_ft") 114 | 115 | args.dist_url = get_init_file().as_uri() 116 | args.output_dir = args.job_dir 117 | 118 | trainer = Trainer(args) 119 | job = executor.submit(trainer) 120 | 121 | print("Submitted job_id:", job.job_id) 122 | 123 | 124 | if __name__ == "__main__": 125 | main() 126 | -------------------------------------------------------------------------------- /LaViLa/run_with_submitit_finetune_retrieval.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | """ 7 | A script to run multinode training with submitit. 
8 | """ 9 | import argparse 10 | import os 11 | import uuid 12 | from pathlib import Path 13 | 14 | import main_finetune_retrieval as main_finetune 15 | import submitit 16 | 17 | 18 | def parse_args(): 19 | parser = main_finetune.get_args_parser() 20 | parser = argparse.ArgumentParser("Submitit for lavila fine-tuning", parents=[parser]) 21 | parser.add_argument("--ngpus", default=8, type=int, help="Number of gpus to request on each node") 22 | parser.add_argument("--nodes", default=8, type=int, help="Number of nodes to request") 23 | parser.add_argument("--timeout", default=2880, type=int, help="Duration of the job") 24 | parser.add_argument("--job_dir", default="", type=str, help="Job dir. Leave empty for automatic.") 25 | 26 | parser.add_argument("--partition", default="learnlab", type=str, help="Partition where to submit") 27 | parser.add_argument("--use_volta32", action='store_true', help="Big models? Use this") 28 | parser.add_argument('--comment', default="", type=str, 29 | help='Comment to pass to scheduler, e.g. priority message') 30 | return parser.parse_args() 31 | 32 | 33 | def get_shared_folder() -> Path: 34 | user = os.getenv("USER") 35 | if Path("/checkpoint/").is_dir(): 36 | p = Path(f"/checkpoint/{user}/experiments/lavila_ft") 37 | p.mkdir(exist_ok=True) 38 | return p 39 | raise RuntimeError("No shared folder available") 40 | 41 | 42 | def get_init_file(): 43 | # Init file must not exist, but it's parent dir must exist. 44 | os.makedirs(str(get_shared_folder()), exist_ok=True) 45 | init_file = get_shared_folder() / f"{uuid.uuid4().hex}_init" 46 | if init_file.exists(): 47 | os.remove(str(init_file)) 48 | return init_file 49 | 50 | 51 | class Trainer(object): 52 | def __init__(self, args): 53 | self.args = args 54 | 55 | def __call__(self): 56 | import main_finetune_retrieval as main_finetune 57 | 58 | self._setup_gpu_args() 59 | main_finetune.main(self.args) 60 | 61 | def checkpoint(self): 62 | import submitit 63 | 64 | self.args.dist_url = get_init_file().as_uri() 65 | print("Requeuing ", self.args) 66 | empty_trainer = type(self)(self.args) 67 | return submitit.helpers.DelayedSubmission(empty_trainer) 68 | 69 | def _setup_gpu_args(self): 70 | import submitit 71 | from pathlib import Path 72 | 73 | job_env = submitit.JobEnvironment() 74 | self.args.output_dir = Path(str(self.args.output_dir).replace("%j", str(job_env.job_id))) 75 | self.args.gpu = job_env.local_rank 76 | self.args.rank = job_env.global_rank 77 | self.args.world_size = job_env.num_tasks 78 | print(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}") 79 | 80 | 81 | def main(): 82 | args = parse_args() 83 | if args.job_dir == "": 84 | args.job_dir = get_shared_folder() / "%j" 85 | 86 | # Note that the folder will depend on the job_id, to easily track experiments 87 | executor = submitit.AutoExecutor(folder=args.job_dir, slurm_max_num_timeout=30) 88 | 89 | num_gpus_per_node = args.ngpus 90 | nodes = args.nodes 91 | timeout_min = args.timeout 92 | 93 | partition = args.partition 94 | kwargs = {} 95 | if args.use_volta32: 96 | kwargs['slurm_constraint'] = 'volta32gb' 97 | if args.comment: 98 | kwargs['slurm_comment'] = args.comment 99 | 100 | executor.update_parameters( 101 | mem_gb=40 * num_gpus_per_node, 102 | gpus_per_node=num_gpus_per_node, 103 | tasks_per_node=num_gpus_per_node, # one task per GPU 104 | cpus_per_task=10, 105 | nodes=nodes, 106 | timeout_min=timeout_min, # max is 60 * 72 107 | # Below are cluster dependent parameters 108 | slurm_partition=partition, 109 | 
slurm_signal_delay_s=120, 110 | **kwargs 111 | ) 112 | 113 | executor.update_parameters(name="lavila_ft") 114 | 115 | args.dist_url = get_init_file().as_uri() 116 | args.output_dir = args.job_dir 117 | 118 | trainer = Trainer(args) 119 | job = executor.submit(trainer) 120 | 121 | print("Submitted job_id:", job.job_id) 122 | 123 | 124 | if __name__ == "__main__": 125 | main() 126 | -------------------------------------------------------------------------------- /LaViLa/run_with_submitit_infer_narrator.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | # Copyright (c) Meta Platforms, Inc. and affiliates. 4 | # All rights reserved. 5 | 6 | # This source code is licensed under the license found in the 7 | # LICENSE file in the root directory of this source tree. 8 | """ 9 | A script to run multinode training with submitit. 10 | """ 11 | import argparse 12 | import os 13 | import uuid 14 | from pathlib import Path 15 | 16 | import main_infer_narrator 17 | import submitit 18 | 19 | 20 | def parse_args(): 21 | parser = main_infer_narrator.get_args_parser() 22 | parser = argparse.ArgumentParser("Submitit for inferring narrator", parents=[parser]) 23 | parser.add_argument("--ngpus", default=8, type=int, help="Number of gpus to request on each node") 24 | parser.add_argument("--nodes", default=4, type=int, help="Number of nodes to request") 25 | parser.add_argument("--timeout", default=2880, type=int, help="Duration of the job") 26 | parser.add_argument("--job_dir", default="", type=str, help="Job dir. Leave empty for automatic.") 27 | 28 | parser.add_argument("--partition", default="learnlab", type=str, help="Partition where to submit") 29 | parser.add_argument("--use_volta32", action='store_true', help="Big models? Use this") 30 | parser.add_argument('--comment', default="", type=str, 31 | help='Comment to pass to scheduler, e.g. priority message') 32 | return parser.parse_args() 33 | 34 | 35 | def get_shared_folder() -> Path: 36 | user = os.getenv("USER") 37 | if Path("/checkpoint/").is_dir(): 38 | p = Path(f"/checkpoint/{user}/experiments/extract_caption") 39 | p.mkdir(exist_ok=True) 40 | return p 41 | raise RuntimeError("No shared folder available") 42 | 43 | 44 | def get_init_file(): 45 | # Init file must not exist, but it's parent dir must exist. 
46 | os.makedirs(str(get_shared_folder()), exist_ok=True) 47 | init_file = get_shared_folder() / f"{uuid.uuid4().hex}_init" 48 | if init_file.exists(): 49 | os.remove(str(init_file)) 50 | return init_file 51 | 52 | 53 | class Trainer(object): 54 | def __init__(self, args): 55 | self.args = args 56 | 57 | def __call__(self): 58 | import main_infer_narrator 59 | 60 | self._setup_gpu_args() 61 | main_infer_narrator.main(self.args) 62 | 63 | def checkpoint(self): 64 | import submitit 65 | 66 | self.args.dist_url = get_init_file().as_uri() 67 | print("Requeuing ", self.args) 68 | empty_trainer = type(self)(self.args) 69 | return submitit.helpers.DelayedSubmission(empty_trainer) 70 | 71 | def _setup_gpu_args(self): 72 | import submitit 73 | from pathlib import Path 74 | 75 | job_env = submitit.JobEnvironment() 76 | self.args.output_dir = Path(str(self.args.output_dir).replace("%j", str(job_env.job_id))) 77 | self.args.gpu = job_env.local_rank 78 | self.args.rank = job_env.global_rank 79 | self.args.world_size = job_env.num_tasks 80 | print(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}") 81 | 82 | 83 | def main(): 84 | args = parse_args() 85 | if args.job_dir == "": 86 | args.job_dir = get_shared_folder() / "%j" 87 | 88 | # Note that the folder will depend on the job_id, to easily track experiments 89 | executor = submitit.AutoExecutor(folder=args.job_dir, slurm_max_num_timeout=30) 90 | 91 | num_gpus_per_node = args.ngpus 92 | nodes = args.nodes 93 | timeout_min = args.timeout 94 | 95 | partition = args.partition 96 | kwargs = {} 97 | if args.use_volta32: 98 | kwargs['slurm_constraint'] = 'volta32gb' 99 | if args.comment: 100 | kwargs['slurm_comment'] = args.comment 101 | 102 | executor.update_parameters( 103 | mem_gb=55 * num_gpus_per_node, 104 | gpus_per_node=num_gpus_per_node, 105 | tasks_per_node=num_gpus_per_node, # one task per GPU 106 | cpus_per_task=10, 107 | nodes=nodes, 108 | timeout_min=timeout_min, # max is 60 * 72 109 | # Below are cluster dependent parameters 110 | slurm_partition=partition, 111 | slurm_signal_delay_s=120, 112 | **kwargs 113 | ) 114 | 115 | executor.update_parameters(name="infer_narrator") 116 | 117 | args.dist_url = get_init_file().as_uri() 118 | args.output_dir = args.job_dir 119 | 120 | trainer = Trainer(args) 121 | job = executor.submit(trainer) 122 | 123 | print("Submitted job_id:", job.job_id) 124 | 125 | 126 | if __name__ == "__main__": 127 | main() 128 | -------------------------------------------------------------------------------- /LaViLa/run_with_submitit_pretrain.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) Meta Platforms, Inc. and affiliates. 2 | # All rights reserved. 3 | 4 | # This source code is licensed under the license found in the 5 | # LICENSE file in the root directory of this source tree. 6 | """ 7 | A script to run multinode training with submitit. 
8 | """ 9 | import argparse 10 | import os 11 | import uuid 12 | from pathlib import Path 13 | 14 | import main_pretrain 15 | import submitit 16 | 17 | 18 | def parse_args(): 19 | parser = main_pretrain.get_args_parser() 20 | parser = argparse.ArgumentParser("Submitit for lavila pre-training", parents=[parser]) 21 | parser.add_argument("--ngpus", default=8, type=int, help="Number of gpus to request on each node") 22 | parser.add_argument("--nodes", default=8, type=int, help="Number of nodes to request") 23 | parser.add_argument("--timeout", default=2880, type=int, help="Duration of the job") 24 | parser.add_argument("--job_dir", default="", type=str, help="Job dir. Leave empty for automatic.") 25 | 26 | parser.add_argument("--partition", default="learnlab", type=str, help="Partition where to submit") 27 | parser.add_argument("--use_volta32", action='store_true', help="Big models? Use this") 28 | parser.add_argument('--comment', default="", type=str, 29 | help='Comment to pass to scheduler, e.g. priority message') 30 | return parser.parse_args() 31 | 32 | 33 | def get_shared_folder() -> Path: 34 | user = os.getenv("USER") 35 | if Path("/checkpoint/").is_dir(): 36 | p = Path(f"/checkpoint/{user}/experiments/lavila_pretrain") 37 | p.mkdir(exist_ok=True) 38 | return p 39 | raise RuntimeError("No shared folder available") 40 | 41 | 42 | def get_init_file(): 43 | # Init file must not exist, but it's parent dir must exist. 44 | os.makedirs(str(get_shared_folder()), exist_ok=True) 45 | init_file = get_shared_folder() / f"{uuid.uuid4().hex}_init" 46 | if init_file.exists(): 47 | os.remove(str(init_file)) 48 | return init_file 49 | 50 | 51 | class Trainer(object): 52 | def __init__(self, args): 53 | self.args = args 54 | 55 | def __call__(self): 56 | import main_pretrain 57 | 58 | self._setup_gpu_args() 59 | main_pretrain.main(self.args) 60 | 61 | def checkpoint(self): 62 | import submitit 63 | 64 | self.args.dist_url = get_init_file().as_uri() 65 | print("Requeuing ", self.args) 66 | empty_trainer = type(self)(self.args) 67 | return submitit.helpers.DelayedSubmission(empty_trainer) 68 | 69 | def _setup_gpu_args(self): 70 | import submitit 71 | from pathlib import Path 72 | 73 | job_env = submitit.JobEnvironment() 74 | self.args.output_dir = Path(str(self.args.output_dir).replace("%j", str(job_env.job_id))) 75 | self.args.gpu = job_env.local_rank 76 | self.args.rank = job_env.global_rank 77 | self.args.world_size = job_env.num_tasks 78 | print(f"Process group: {job_env.num_tasks} tasks, rank: {job_env.global_rank}") 79 | 80 | 81 | def main(): 82 | args = parse_args() 83 | if args.job_dir == "": 84 | args.job_dir = get_shared_folder() / "%j" 85 | 86 | # Note that the folder will depend on the job_id, to easily track experiments 87 | executor = submitit.AutoExecutor(folder=args.job_dir, slurm_max_num_timeout=30) 88 | 89 | num_gpus_per_node = args.ngpus 90 | nodes = args.nodes 91 | timeout_min = args.timeout 92 | 93 | partition = args.partition 94 | kwargs = {} 95 | if args.use_volta32: 96 | kwargs['slurm_constraint'] = 'volta32gb' 97 | if args.comment: 98 | kwargs['slurm_comment'] = args.comment 99 | 100 | executor.update_parameters( 101 | mem_gb=40 * num_gpus_per_node, 102 | gpus_per_node=num_gpus_per_node, 103 | tasks_per_node=num_gpus_per_node, # one task per GPU 104 | cpus_per_task=10, 105 | nodes=nodes, 106 | timeout_min=timeout_min, # max is 60 * 72 107 | # Below are cluster dependent parameters 108 | slurm_partition=partition, 109 | slurm_signal_delay_s=120, 110 | **kwargs 111 | ) 112 | 113 | 
executor.update_parameters(name="lavila_pretrain") 114 | 115 | args.dist_url = get_init_file().as_uri() 116 | args.output_dir = args.job_dir 117 | 118 | trainer = Trainer(args) 119 | job = executor.submit(trainer) 120 | 121 | print("Submitted job_id:", job.job_id) 122 | 123 | 124 | if __name__ == "__main__": 125 | main() 126 | -------------------------------------------------------------------------------- /LaViLa/scripts/convert_egovlp_ckpt.py: -------------------------------------------------------------------------------- 1 | # This source code is licensed under the license found in the 2 | # LICENSE file in the root directory of this source tree. 3 | 4 | ''' 5 | Usage: 6 | ```bash 7 | PYTHONPATH= python scripts/convert_egovlp_ckpt.py \ 8 | --input-ckpt \ 9 | --output-ckpt egovlp_converted.pth 10 | ``` 11 | ''' 12 | 13 | import argparse 14 | from collections import OrderedDict 15 | import torch 16 | 17 | 18 | def get_args_parser(): 19 | parser = argparse.ArgumentParser(description='Convert EgoVLP checkpoint', add_help=False) 20 | parser.add_argument('--input-ckpt', type=str) 21 | parser.add_argument('--output-ckpt', type=str) 22 | return parser 23 | 24 | 25 | def main(args): 26 | input_ckpt = torch.load(args.input_ckpt, map_location='cpu') 27 | input_ckpt = input_ckpt['state_dict'] 28 | output_ckpt = OrderedDict() 29 | for k in input_ckpt: 30 | if k.startswith('module.video_model'): 31 | output_ckpt[k.replace('module.video_model', 'module.visual')] = input_ckpt[k] 32 | elif k.startswith('module.text_model'): 33 | output_ckpt[k.replace('module.text_model', 'module.textual')] = input_ckpt[k] 34 | elif k.startswith('module.txt_proj'): 35 | output_ckpt[k.replace('module.txt_proj', 'module.text_projection')] = input_ckpt[k] 36 | elif k.startswith('module.vid_proj'): 37 | output_ckpt[k.replace('module.vid_proj', 'module.image_projection')] = input_ckpt[k] 38 | else: 39 | print(k) 40 | raise ValueError 41 | torch.save({ 42 | 'epoch': 0, 43 | 'state_dict': output_ckpt, 44 | 'best_acc1': 0, 45 | }, args.output_ckpt) 46 | 47 | 48 | if __name__ == '__main__': 49 | parser = argparse.ArgumentParser('Convert EgoVLP checkpoint', parents=[get_args_parser()]) 50 | args = parser.parse_args() 51 | main(args) 52 | -------------------------------------------------------------------------------- /LaViLa/scripts/crop_and_resize_ego4d.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | small_side=288 4 | cliplen_sec=300 5 | max_tries=5 6 | indir="/path/to/full-scale/videos/" 7 | outdir="/path/to/downscaled/videos/" 8 | 9 | cd $indir || exit 10 | all_videos=$(find . 
-iname "*.mp4") 11 | all_videos=( $all_videos ) # to array 12 | cd - 13 | 14 | for video in "${all_videos[@]}"; do 15 | W=$( ffprobe -v quiet -show_format -show_streams -show_entries stream=width "${indir}/${video}" | grep width ) 16 | W=${W#width=} 17 | H=$( ffprobe -v quiet -show_format -show_streams -show_entries stream=height "${indir}/${video}" | grep height ) 18 | H=${H#height=} 19 | # Set the smaller side to small_side 20 | # from https://superuser.com/a/624564 21 | if [ $W -gt $H ] && [ $H -gt ${small_side} ]; then 22 | scale_str="-filter:v scale=-1:${small_side}" 23 | elif [ $H -gt $W ] && [ $W -gt ${small_side} ]; then 24 | scale_str="-filter:v scale=${small_side}:-1" 25 | else 26 | # The small side is smaller than required size, so don't resize/distort the video 27 | scale_str="" 28 | fi 29 | vidlen_sec=$( ffprobe -v error -show_entries format=duration -of default=noprint_wrappers=1:nokey=1 "${indir}/${video}" ) 30 | mkdir -p "${outdir}/${video}" 31 | for st_sec in $(seq 0 ${cliplen_sec} ${vidlen_sec}); do 32 | outfpath=${outdir}/${video}/${st_sec}.mp4 33 | try=0 34 | while [ $try -le $max_tries ]; do 35 | ffmpeg -y -ss ${st_sec} -i "${indir}/${video}" ${scale_str} -t ${cliplen_sec} "${outfpath}" 36 | try=$(( $try + 1 )) 37 | write_errors=$( ffprobe -v error -i "${outfpath}" ) 38 | # If no errors detected by ffprobe, we are done 39 | if [ -z "$write_errors" ]; then 40 | echo $outfpath written successfully in $try tries! 41 | break 42 | fi 43 | done 44 | done 45 | echo "Converted ${video}" 46 | done 47 | 48 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 |

# VideoAgent: A Memory-augmented Multimodal Agent for Video Understanding (ECCV 2024)

2 | 3 | # Introduction 4 | This is the official code repository of [VideoAgent: A Memory-augmented Multimodal Agent for Video Understanding 5 | ](https://videoagent.github.io/). VideoAgent is a multi-modal agent that can understand an input video and answer the questions you ask about it. 6 | 7 | Given a video and a question, VideoAgent works in two phases: a memory construction phase and an inference phase. During the memory construction phase, structured information is extracted from the video and stored in the memory. During the inference phase, an LLM is prompted to use a set of tools that interact with the memory to answer the question. 8 |

9 | ![teaser](imgs/teaser.png) 10 |
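Programmatically, the two phases correspond to the `preprocess` and `ReActAgent` entry points in `main.py`; the sketch below simply mirrors the calls made in `demo.py`, and assumes the installation steps below are complete and the Video-LLaVA server described in the Usage section is already running.

```python
# Minimal sketch of the two-phase pipeline, mirroring the calls made in demo.py.
from main import preprocess, ReActAgent

video = "sample_videos/kitchen.mp4"

# Phase 1: memory construction -- extract structured information from the video
# (captions, tracking/re-ID, embeddings) into preprocess/kitchen/.
preprocess(video_path_list=[video], base_dir="preprocess", show_tracking=False)

# Phase 2: inference -- an LLM uses a set of tools over the constructed memory
# to answer the question.
answer, log = ReActAgent(
    video_path=video,
    question="Is there a microwave in the kitchen?",
    base_dir="preprocess",
    vqa_tool="videollava",                 # or "gpt-4v", as in config/default.yaml
    use_reid=True,
    openai_api_key="your-openai-api-key",  # set the real key in config/default.yaml
)
print(answer)
```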

11 | 12 | # Prerequisites 13 | This project is tested on Ubuntu 20.04 with an NVIDIA RTX 4090 (24GB). 14 | 15 | 16 | # Installation Guide 17 | Use the following command to create the environment named videoagent: 18 | ```sh 19 | conda env create -f environment.yaml 20 | ``` 21 | 22 | Create the environment of [Video-LLaVA](https://github.com/PKU-YuanGroup/Video-LLaVA) by running the following commands: 23 | ```sh 24 | git clone https://github.com/PKU-YuanGroup/Video-LLaVA 25 | cd Video-LLaVA 26 | conda create -n videollava python=3.10 -y 27 | conda activate videollava 28 | pip install --upgrade pip # enable PEP 660 support 29 | pip install -e . 30 | pip install -e ".[train]" 31 | pip install flash-attn --no-build-isolation 32 | pip install decord opencv-python git+https://github.com/facebookresearch/pytorchvideo.git@28fe037d212663c6a24f373b94cc5d478c8c1a1d 33 | ``` 34 | Note: only the conda environment named videollava is required for this project; the Video-LLaVA repository itself is not. You can clone the Video-LLaVA repository anywhere you like and build the videollava conda environment from there. 35 | 36 | Download ```cache_dir.zip``` and ```tool_models.zip``` from [here](https://zenodo.org/records/11031717) and unzip them into the ```VideoAgent``` directory. This will create two folders under ```VideoAgent```: ```cache_dir``` (the model weights of Video-LLaVA) and ```tool_models``` (the weights of all other models). 37 | 38 | # Usage 39 | Make sure you are under the VideoAgent directory. 40 | Enter your OpenAI API key in ```config/default.yaml```. 41 | 42 | First, open a terminal and run: 43 | ```sh 44 | conda activate videollava 45 | python video-llava.py 46 | ``` 47 | This will start a Video-LLaVA server process that handles the Visual Question Answering requests raised by VideoAgent. 48 | 49 | Once you see ```ready for connection!``` in the first process, open another terminal and run: 50 | ```sh 51 | conda activate videoagent 52 | python demo.py 53 | ``` 54 | This will create a Gradio demo shown as follows. 55 |

56 | ![demo](imgs/demo.png) 57 |

58 | You can choose the example videos for inference, or you can also upload your own videos and questions. Once submitted, VideoAgent will start processing your video and store the files under ```preprocess/your_video_name```. After processing the input video, it will answer your question. 59 | 60 | The results will provide: 61 | 1. the answer to the question 62 | 2. the replay with object re-ID of the input video 63 | 3. the inference log (chain-of-thought) of VideoAgent 64 | 65 | For batch inference, you can run 66 | ```sh 67 | conda activate videoagent 68 | python main.py 69 | ``` 70 | 71 | # Citation 72 | If you find our paper and code useful in your research, please consider giving a star ⭐ and citation 📝. 73 | ``` 74 | @inproceedings{fan2025videoagent, 75 | title={Videoagent: A memory-augmented multimodal agent for video understanding}, 76 | author={Fan, Yue and Ma, Xiaojian and Wu, Rujie and Du, Yuntao and Li, Jiaqi and Gao, Zhi and Li, Qing}, 77 | booktitle={European Conference on Computer Vision}, 78 | pages={75--92}, 79 | year={2025}, 80 | organization={Springer} 81 | } 82 | ``` 83 | 84 | -------------------------------------------------------------------------------- /captioning.py: -------------------------------------------------------------------------------- 1 | import sys 2 | sys.path.insert(0, 'LaViLa/') 3 | import os 4 | import urllib.request 5 | from collections import OrderedDict 6 | import numpy as np 7 | import time 8 | import torch 9 | import torchvision.transforms as transforms 10 | import torchvision.transforms._transforms_video as transforms_video 11 | from LaViLa.lavila.data.video_transforms import Permute 12 | from LaViLa.lavila.models.models import VCLM_OPENAI_TIMESFORMER_LARGE_336PX_GPT2_XL 13 | from LaViLa.lavila.models.tokenizer import MyGPT2Tokenizer 14 | from LaViLa.eval_narrator import decode_one 15 | import json 16 | import cv2 17 | import pickle 18 | 19 | 20 | 21 | class Captioning: 22 | def __init__(self, video_path_list, base_dir='preprocess'): 23 | self.video_path_list = video_path_list 24 | self.seconds_per_caption = 2 # a caption covers 2 seconds 25 | self.frames_per_caption = 4 # a caption is generated from 4 frames in the 2-second segments 26 | self.base_dir = base_dir 27 | 28 | 29 | def generate_captions_for_all_videos(self): 30 | """create the captions for all videos""" 31 | start_time = time.time() 32 | crop_size = 336 33 | val_transform = transforms.Compose([ 34 | Permute([3, 0, 1, 2]), 35 | transforms.Resize(crop_size), 36 | transforms.CenterCrop(crop_size), 37 | transforms_video.NormalizeVideo(mean=[108.3272985, 116.7460125, 104.09373615000001], std=[68.5005327, 66.6321579, 70.32316305]) 38 | ]) 39 | ckpt_name = 'vclm_openai_timesformer_large_336px_gpt2_xl.pt_ego4d.jobid_246897.ep_0003.md5sum_443263.pth' 40 | ckpt_path = os.path.join('tool_models/LaViLa/', ckpt_name) 41 | if not os.path.exists(ckpt_path): 42 | print('downloading model to {}'.format(ckpt_path)) 43 | urllib.request.urlretrieve('https://dl.fbaipublicfiles.com/lavila/checkpoints/narrator/{}'.format(ckpt_name), ckpt_path) 44 | ckpt = torch.load(ckpt_path, map_location='cpu') 45 | state_dict = OrderedDict() 46 | for k, v in ckpt['state_dict'].items(): 47 | state_dict[k.replace('module.', '')] = v 48 | # instantiate the model, and load the pre-trained weights 49 | model = VCLM_OPENAI_TIMESFORMER_LARGE_336PX_GPT2_XL( 50 | text_use_cls_token=False, 51 | project_embed_dim=256, 52 | gated_xattn=True, 53 | timesformer_gated_xattn=False, 54 | freeze_lm_vclm=False, # we use model.eval() 
anyway 55 | freeze_visual_vclm=False, # we use model.eval() anyway 56 | num_frames=4, 57 | drop_path_rate=0. 58 | ) 59 | model.load_state_dict(state_dict, strict=True) 60 | model.cuda() 61 | model.eval() 62 | tokenizer = MyGPT2Tokenizer('gpt2-xl', add_bos=True) 63 | end_time = time.time() 64 | print(f'time for loading captioning model: {round(end_time-start_time, 3)} seconds') 65 | 66 | 67 | for video_path in self.video_path_list: 68 | cap = cv2.VideoCapture(video_path) 69 | if not cap.isOpened(): 70 | print("Error: Unable to open video file.") 71 | continue 72 | fps = round(cap.get(cv2.CAP_PROP_FPS)) 73 | total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) 74 | total_captions = total_frames//(fps*self.seconds_per_caption) 75 | frame_interval = fps*self.seconds_per_caption//self.frames_per_caption # the interval between two selected frames 76 | 77 | base_name = os.path.basename(video_path).replace(".mp4", "") 78 | video_dir = os.path.join(self.base_dir, base_name) 79 | if not os.path.exists(video_dir): 80 | os.makedirs(video_dir) 81 | 82 | captions = dict() 83 | start_time = time.time() 84 | cap.set(cv2.CAP_PROP_POS_FRAMES, 0) 85 | for caption_id in range(total_captions): 86 | frames = [] 87 | for i in range(self.frames_per_caption): # 4 frames are selected for generating the caption 88 | success, frame = cap.read() 89 | frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) 90 | frames.append(frame) 91 | for j in range(frame_interval-1): #skip other frames 92 | success, frame = cap.read() 93 | for i in range(fps*self.seconds_per_caption-frame_interval*self.frames_per_caption): 94 | success, frame = cap.read() #skip remaining frames 95 | frames = [torch.tensor(frame, dtype=torch.float32) for frame in frames] 96 | frames = torch.stack(frames, dim=0) 97 | frames = val_transform(frames) 98 | frames = frames.unsqueeze(0) 99 | 100 | with torch.no_grad(): 101 | input_frames = frames.cuda(non_blocking=True) 102 | image_features = model.encode_image(input_frames) 103 | generated_text_ids, ppls = model.generate( 104 | image_features, 105 | tokenizer, 106 | target=None, # free-form generation 107 | max_text_length=77, 108 | top_k=None, 109 | top_p=0.95, # nucleus sampling 110 | num_return_sequences=5, # number of candidates: 5 111 | temperature=0.7, 112 | early_stopping=True, 113 | ) 114 | text = "" 115 | length = -1 116 | for i in range(5): 117 | # select the longest candidate as the caption 118 | generated_text_str = decode_one(generated_text_ids[i], tokenizer) 119 | if len(generated_text_str) > length: 120 | length = len(generated_text_str) 121 | text = generated_text_str 122 | caption_start_frame = caption_id*fps*self.seconds_per_caption 123 | caption_end_frame = (caption_id+1)*fps*self.seconds_per_caption 124 | segment = "{}_{}".format(str(caption_start_frame), str(caption_end_frame)) 125 | captions[segment] = text 126 | print(f"id: {caption_id}, frame_interval: {segment}, caption: {text}") 127 | end_time = time.time() 128 | cap.release() 129 | print(f"captioning time for video {base_name}: {round(end_time-start_time, 3)} seconds") 130 | with open(os.path.join(video_dir, "captions.json"), 'w') as f: 131 | json.dump(captions, f) 132 | segments = list(captions) 133 | segment2id = dict() 134 | for segment in segments: 135 | segment2id[segment] = len(segment2id) 136 | with open(os.path.join(video_dir, "segment2id.json"), 'w') as f: 137 | json.dump(segment2id, f) 138 | 139 | def run(self): 140 | self.generate_captions_for_all_videos() 
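For reference, the captioning stage above can also be exercised on its own — a minimal sketch, assuming a CUDA GPU and that the LaViLa narrator checkpoint can be downloaded into `tool_models/LaViLa/`; in the full system this stage is normally driven by the `preprocess` pipeline rather than called directly:

```python
# Hypothetical standalone run of the Captioning stage defined above.
from captioning import Captioning

captioner = Captioning(
    video_path_list=["sample_videos/boats.mp4", "sample_videos/kitchen.mp4"],
    base_dir="preprocess",
)
# Writes captions.json and segment2id.json under preprocess/<video_name>/,
# with one caption per 2-second segment generated from 4 sampled frames.
captioner.run()
```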
-------------------------------------------------------------------------------- /config/default.yaml: -------------------------------------------------------------------------------- 1 | openai_api_key: your_openai-api_key 2 | use_reid: true 3 | vqa_tool: videollava #videollava or gpt-4v 4 | base_dir: preprocess -------------------------------------------------------------------------------- /database.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import pickle 4 | from collections import defaultdict 5 | from encoder import encode_sentences 6 | from utils import compute_cosine_similarity, top_k_indices 7 | import numpy as np 8 | import sqlite3 9 | 10 | 11 | class DataBase: 12 | def __init__(self, video_path, base_dir='preprocess', use_reid=True): 13 | base_name = os.path.basename(video_path).replace(".mp4", "") 14 | self.video_dir = os.path.join(base_dir, base_name) 15 | self.use_reid = use_reid 16 | if self.use_reid: 17 | with open(os.path.join(self.video_dir, 'reid.pkl'), 'rb') as f: 18 | content = pickle.load(f) 19 | self.frame2uid, self.uid2frame, self.uid2category = content[0], content[1], content[2] 20 | with open(os.path.join(self.video_dir, 'uid2clip.pkl'), 'rb') as f: 21 | self.uid2emb = pickle.load(f) 22 | else: 23 | with open(os.path.join(self.video_dir, 'tracking.pkl'), 'rb') as f: 24 | content = pickle.load(f) 25 | self.frame2uid, self.uid2frame, self.uid2category = content[0], content[1], content[2] 26 | with open(os.path.join(self.video_dir, 'tid2clip.pkl'), 'rb') as f: 27 | self.uid2emb = pickle.load(f) 28 | 29 | with open(os.path.join(self.video_dir, 'segment2id.json')) as f: 30 | self.segment2id = json.load(f) 31 | self.segment_id2uids = defaultdict(set) 32 | for frame in self.frame2uid: 33 | segment_id = 0 34 | for segment in self.segment2id: 35 | start, end = segment.split('_') 36 | start, end = int(start), int(end) 37 | if start <= frame <= end: 38 | segment_id = self.segment2id[segment] 39 | break 40 | uids = list(self.frame2uid[frame]) 41 | self.segment_id2uids[segment_id].update(uids) 42 | 43 | if os.path.exists('database.db'): 44 | os.remove('database.db') 45 | connection = sqlite3.connect('database.db') 46 | cursor = connection.cursor() 47 | create_object = """ 48 | CREATE TABLE Objects( 49 | object_id INT, 50 | category VARCHAR(255), 51 | PRIMARY KEY (object_id) 52 | ); 53 | """ 54 | cursor.execute(create_object) 55 | create_segment = """ 56 | CREATE TABLE Segments( 57 | segment_id INT, 58 | PRIMARY KEY (segment_id) 59 | ); 60 | """ 61 | cursor.execute(create_segment) 62 | create_object_segment = """ 63 | CREATE TABLE Objects_Segments( 64 | object_id INT, 65 | segment_id INT, 66 | PRIMARY KEY (object_id, segment_id), 67 | FOREIGN KEY (object_id) REFERENCES Objects(object_id), 68 | FOREIGN KEY (segment_id) REFERENCES Segments(segment_id) 69 | ); 70 | """ 71 | cursor.execute(create_object_segment) 72 | connection.commit() 73 | 74 | insert_objects = [] 75 | for uid in self.uid2category: 76 | line = "INSERT INTO Objects (object_id, category) VALUES ({}, '{}')".format(str(uid), self.uid2category[uid]) 77 | #print(line) 78 | insert_objects.append(line) 79 | for s in insert_objects: 80 | cursor.execute(s) 81 | 82 | insert_segments = [] 83 | for segment in self.segment2id: 84 | segment_id = self.segment2id[segment] 85 | line = "INSERT INTO Segments (segment_id) VALUES ({})".format(str(segment_id)) 86 | #print(line) 87 | insert_segments.append(line) 88 | for s in insert_segments: 89 | 
cursor.execute(s) 90 | 91 | 92 | insert_object_segments = [] 93 | for segment_id in self.segment_id2uids: 94 | for uid in self.segment_id2uids[segment_id]: 95 | line = "INSERT INTO Objects_Segments (object_id, segment_id) VALUES ({}, {})".format(str(uid), str(segment_id)) 96 | #print(line) 97 | insert_object_segments.append(line) 98 | for s in insert_object_segments: 99 | cursor.execute(s) 100 | 101 | connection.commit() 102 | cursor.close() 103 | connection.close() 104 | 105 | 106 | def retrieve_candidate_objects(self, description): 107 | des_emb = encode_sentences([f"a photo of a {description}."], model_name='clip') 108 | scores = compute_cosine_similarity(des_emb, list(self.uid2emb.values())) 109 | indices = np.where(scores >= 0.26)[0] 110 | candidate_uids = [] 111 | for i in indices: 112 | candidate_uids.append(list(self.uid2emb)[i]) 113 | return candidate_uids 114 | 115 | 116 | def query_database(self, program): 117 | connection = sqlite3.connect('database.db') 118 | cursor = connection.cursor() 119 | try: 120 | cursor.execute(program) 121 | results = cursor.fetchall() 122 | return results 123 | except sqlite3.Error as e: 124 | return e -------------------------------------------------------------------------------- /demo.py: -------------------------------------------------------------------------------- 1 | import gradio as gr 2 | import openai 3 | from main import preprocess, ReActAgent 4 | from multiprocessing import Process 5 | import os 6 | import socket 7 | from omegaconf import OmegaConf 8 | 9 | config = OmegaConf.load('config/default.yaml') 10 | openai_api_key = config['openai_api_key'] 11 | use_reid = config['use_reid'] 12 | vqa_tool = config['vqa_tool'] 13 | base_dir = config['base_dir'] 14 | 15 | 16 | def ask_question(video_file, question): 17 | preprocess(video_path_list=[video_file], 18 | base_dir=base_dir, 19 | show_tracking=False) 20 | answer, log = ReActAgent(video_path=video_file, question=question, base_dir=base_dir, vqa_tool=vqa_tool, use_reid=use_reid, openai_api_key=openai_api_key) 21 | base_name = os.path.basename(video_file).replace(".mp4", "") 22 | reid_file = os.path.join("preprocess", base_name, "reid.mp4") 23 | return answer, reid_file, log 24 | 25 | 26 | with gr.Row(): 27 | # Define inputs 28 | with gr.Column(scale=6): 29 | video_input = gr.Video(label="Upload a video") 30 | question_input = gr.Textbox(label="Ask a question") 31 | 32 | 33 | # Define output 34 | with gr.Column(scale=6): 35 | output_text = gr.Textbox(label="Answer") 36 | output_reid = gr.Video(label="Video replay with object re-identifcation") 37 | output_log = gr.Textbox(label="Inference log") 38 | 39 | 40 | # Create Gradio interface 41 | gr.Interface( 42 | fn=ask_question, 43 | inputs=[video_input, question_input], 44 | outputs=[output_text, output_reid, output_log], 45 | title="VideoAgent", 46 | examples = [ 47 | [f"sample_videos/boats.mp4", "How many boats are there in the video?"], 48 | [f"sample_videos/talking.mp4", 49 | "From what clue do you know that the woman with black spectacles at the start of the video is married?"], 50 | [f"sample_videos/books.mp4", 51 | "Based on the actions observed, what could be a possible motivation or goal for what c is doing in the video?"], 52 | [f"sample_videos/painting.mp4", 53 | "What was the primary purpose of the cup of water in this video, and how did it contribute to the overall painting process?"], 54 | [f"sample_videos/kitchen.mp4", 55 | "Is there a microwave in the kitchen?"], 56 | ], 57 | description="""### This is the demo of 
[VideoAgent](https://videoagent.github.io/). 58 | 59 | Upload a video and ask a question to get an answer from the VideoAgent.""" 60 | 61 | ).launch(share=True) 62 | -------------------------------------------------------------------------------- /encoder.py: -------------------------------------------------------------------------------- 1 | import json 2 | import openai 3 | import numpy as np 4 | import pickle 5 | from sentence_transformers import SentenceTransformer 6 | import os 7 | from PIL import Image 8 | import clip 9 | import torch 10 | from openai import OpenAI 11 | import torchvision.transforms as T 12 | from PIL import Image 13 | from time import time 14 | 15 | 16 | sentence_models = ['text-embedding-ada-002', 'text-embedding-3-large', 'all-MiniLM-L6-v2', 'all-mpnet-base-v2', 'clip'] 17 | 18 | 19 | def encode_sentences(sentence_list, model_name): 20 | '''given a list of sentences, return the embeddings for them using the sentence encoder model''' 21 | assert model_name in sentence_models 22 | emb_list = [] 23 | if model_name in['text-embedding-ada-002', 'text-embedding-3-large']: #openai embedding requires api-key 24 | client = OpenAI() 25 | emb = client.embeddings.create(input=sentence_list, model=model_name) 26 | for i in range(len(sentence_list)): 27 | emb_list.append(np.array(emb.data[i].embedding).reshape(1, -1)) 28 | emb_list = np.concatenate(emb_list, axis=0) 29 | return emb_list 30 | elif model_name == 'clip': # clip embedding 31 | device = "cuda" if torch.cuda.is_available() else "cpu" 32 | model, transform = clip.load("ViT-B/32", device=device) 33 | with torch.no_grad(): 34 | for sentence in sentence_list: 35 | emb_list.append(model.encode_text(clip.tokenize([sentence]).to(device)).cpu().numpy()) 36 | emb_list = np.concatenate(emb_list, axis=0) 37 | return emb_list 38 | else: #sentence transformer encoder 39 | model = SentenceTransformer('sentence-transformers/'+model_name) 40 | num = len(sentence_list) 41 | batch_size = 10 42 | batch_num = num // batch_size 43 | with torch.no_grad(): 44 | for batch_id in range(batch_num): 45 | batch_sentences = sentence_list[batch_id*10: (batch_id+1)*10] 46 | emb_list.append(model.encode(batch_sentences)) 47 | if batch_num * 10 < num: #remaining <10 sentences 48 | remaining_sentences = sentence_list[batch_num*10: num] 49 | emb_list.append(model.encode(remaining_sentences)) 50 | return emb_list 51 | 52 | 53 | if __name__ == '__main__': 54 | encode_sentences(['hello!', 'what'], model_name='text-embedding-ada-002') 55 | 56 | -------------------------------------------------------------------------------- /environment.yaml: -------------------------------------------------------------------------------- 1 | name: videoagent 2 | channels: 3 | - conda-forge 4 | - defaults 5 | dependencies: 6 | - _libgcc_mutex=0.1=conda_forge 7 | - _openmp_mutex=4.5=2_gnu 8 | - bzip2=1.0.8=hd590300_5 9 | - ca-certificates=2023.11.17=hbcca054_0 10 | - ld_impl_linux-64=2.40=h41732ed_0 11 | - libffi=3.4.2=h7f98852_5 12 | - libgcc-ng=13.2.0=h807b86a_3 13 | - libgomp=13.2.0=h807b86a_3 14 | - libnsl=2.0.1=hd590300_0 15 | - libsqlite=3.44.2=h2797004_0 16 | - libuuid=2.38.1=h0b41bf4_0 17 | - libzlib=1.2.13=hd590300_5 18 | - ncurses=6.4=h59595ed_2 19 | - openssl=3.2.0=hd590300_1 20 | - pip=23.3.2=pyhd8ed1ab_0 21 | - python=3.9.18=h0755675_0_cpython 22 | - readline=8.2=h8228510_1 23 | - setuptools=68.2.2=pyhd8ed1ab_0 24 | - tk=8.6.13=noxft_h4845f30_101 25 | - tzdata=2023c=h71feb2d_0 26 | - wheel=0.42.0=pyhd8ed1ab_0 27 | - xz=5.2.6=h166bdaf_0 28 | - pip: 29 | - 
aiohttp==3.9.1 30 | - aiosignal==1.3.1 31 | - annotated-types==0.6.0 32 | - anyio==3.7.1 33 | - async-timeout==4.0.3 34 | - attrs==23.1.0 35 | - av==11.0.0 36 | - blis==0.7.11 37 | - catalogue==2.0.10 38 | - certifi==2023.11.17 39 | - charset-normalizer==3.3.2 40 | - click==8.1.7 41 | - cmake==3.28.1 42 | - confection==0.1.4 43 | - contourpy==1.2.0 44 | - cycler==0.12.1 45 | - cymem==2.0.8 46 | - cython==3.0.7 47 | - dataclasses-json==0.6.3 48 | - decord==0.6.0 49 | - distro==1.8.0 50 | - einops==0.4.1 51 | - exceptiongroup==1.2.0 52 | - fairscale==0.4.4 53 | - filelock==3.13.1 54 | - fonttools==4.47.0 55 | - frozenlist==1.4.1 56 | - fsspec==2023.12.2 57 | - ftfy==4.4.3 58 | - fvcore==0.1.5.post20221221 59 | - gradio==4.22.0 60 | - gradio-client==0.13.0 61 | - gensim==3.8.3 62 | - greenlet==3.0.3 63 | - h11==0.14.0 64 | - html5lib==1.1 65 | - httpcore==1.0.2 66 | - httpx==0.26.0 67 | - huggingface-hub==0.19.4 68 | - idna==3.6 69 | - imageio==2.33.1 70 | - importlib-resources==6.1.1 71 | - pip-install==1.3.5 72 | - iopath==0.1.10 73 | - jinja2==3.1.2 74 | - joblib==1.3.2 75 | - jsonpatch==1.33 76 | - jsonpointer==2.4 77 | - kiwisolver==1.4.5 78 | - langchain==0.1.2 79 | - langchain-community==0.0.14 80 | - langchain-core==0.1.14 81 | - langchain-openai==0.0.3 82 | - langchainhub==0.1.14 83 | - langcodes==3.3.0 84 | - langsmith==0.0.83 85 | - lapx==0.5.5 86 | - lit==17.0.6 87 | - markupsafe==2.1.3 88 | - marshmallow==3.20.2 89 | - matplotlib==3.8.2 90 | - mpmath==1.3.0 91 | - multidict==6.0.4 92 | - murmurhash==1.0.10 93 | - mypy-extensions==1.0.0 94 | - moviepy==1.0.3 95 | - networkx==3.2.1 96 | - nltk==3.8.1 97 | - numpy==1.26.2 98 | - nvidia-cublas-cu11==11.10.3.66 99 | - nvidia-cublas-cu12==12.1.3.1 100 | - nvidia-cuda-cupti-cu11==11.7.101 101 | - nvidia-cuda-cupti-cu12==12.1.105 102 | - nvidia-cuda-nvrtc-cu11==11.7.99 103 | - nvidia-cuda-nvrtc-cu12==12.1.105 104 | - nvidia-cuda-runtime-cu11==11.7.99 105 | - nvidia-cuda-runtime-cu12==12.1.105 106 | - nvidia-cudnn-cu11==8.5.0.96 107 | - nvidia-cudnn-cu12==8.9.2.26 108 | - nvidia-cufft-cu11==10.9.0.58 109 | - nvidia-cufft-cu12==11.0.2.54 110 | - nvidia-curand-cu11==10.2.10.91 111 | - nvidia-curand-cu12==10.3.2.106 112 | - nvidia-cusolver-cu11==11.4.0.1 113 | - nvidia-cusolver-cu12==11.4.5.107 114 | - nvidia-cusparse-cu11==11.7.4.91 115 | - nvidia-cusparse-cu12==12.1.0.106 116 | - nvidia-nccl-cu11==2.14.3 117 | - nvidia-nccl-cu12==2.18.1 118 | - nvidia-nvjitlink-cu12==12.3.101 119 | - nvidia-nvtx-cu11==11.7.91 120 | - nvidia-nvtx-cu12==12.1.105 121 | - omegaconf==2.3.0 122 | - openai==1.9.0 123 | - opencv-python==4.8.1.78 124 | - packaging==23.2 125 | - pandas==1.3.5 126 | - parameterized==0.9.0 127 | - pathy==0.10.3 128 | - pillow==10.1.0 129 | - pims==0.6.1 130 | - portalocker==2.8.2 131 | - preshed==3.0.9 132 | - protobuf==4.21.12 133 | - psutil==5.9.7 134 | - py-cpuinfo==9.0.0 135 | - pydantic==2.5.3 136 | - pyparsing==3.1.1 137 | - python-dateutil==2.8.2 138 | - pytorchvideo==0.1.5 139 | - pytz==2023.3.post1 140 | - pyyaml==6.0.1 141 | - regex==2023.10.3 142 | - requests==2.31.0 143 | - safetensors==0.4.1 144 | - scikit-learn==1.0.2 145 | - scipy==1.11.4 146 | - seaborn==0.13.1 147 | - sentence-transformers==2.2.2 148 | - sentencepiece==0.1.99 149 | - six==1.16.0 150 | - slicerator==1.1.0 151 | - smart-open==6.4.0 152 | - sniffio==1.3.0 153 | - sqlalchemy==2.0.25 154 | - srsly==2.4.8 155 | - sympy==1.12 156 | - tabulate==0.9.0 157 | - tenacity==8.2.3 158 | - termcolor==2.4.0 159 | - theano==1.0.5 160 | - thinc==8.1.12 161 | - 
thop==0.1.1-2209072238 162 | - threadpoolctl==3.2.0 163 | - tiktoken==0.5.2 164 | - timm==0.5.4 165 | - tokenizers==0.12.1 166 | - torch==2.1.2 167 | - torchvision==0.16.2 168 | - tqdm==4.66.1 169 | - transformers==4.27.0 170 | - triton==2.1.0 171 | - types-requests==2.31.0.20240106 172 | - typing-extensions==4.9.0 173 | - typing-inspect==0.9.0 174 | - ultralytics==8.0.235 175 | - urllib3==2.1.0 176 | - wasabi==0.10.1 177 | - wcwidth==0.2.12 178 | - webdataset==0.2.86 179 | - webencodings==0.5.1 180 | - xdg==6.0.0 181 | - yacs==0.1.8 182 | - yarl==1.9.4 183 | - zipp==3.17.0 184 | - git+https://github.com/openai/CLIP.git 185 | - git+https://github.com/Maluuba/nlg-eval.git@master 186 | -------------------------------------------------------------------------------- /imgs/demo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/imgs/demo.png -------------------------------------------------------------------------------- /imgs/teaser.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/imgs/teaser.png -------------------------------------------------------------------------------- /preprocess/boats/captions.json: -------------------------------------------------------------------------------- 1 | {"0_48": "#C C looks around ", "48_96": "#C C looks around the", "96_144": "#C C looks at the", "144_192": "#C C looks around ", "192_240": "#C C looks around the lake", "240_288": "#C C looks around "} -------------------------------------------------------------------------------- /preprocess/boats/reid.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/boats/reid.mp4 -------------------------------------------------------------------------------- /preprocess/boats/reid.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/boats/reid.pkl -------------------------------------------------------------------------------- /preprocess/boats/segment2id.json: -------------------------------------------------------------------------------- 1 | {"0_48": 0, "48_96": 1, "96_144": 2, "144_192": 3, "192_240": 4, "240_288": 5} -------------------------------------------------------------------------------- /preprocess/boats/segment_textual_embedding.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/boats/segment_textual_embedding.pkl -------------------------------------------------------------------------------- /preprocess/boats/segment_visual_embedding.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/boats/segment_visual_embedding.pkl -------------------------------------------------------------------------------- /preprocess/boats/tid2clip.pkl: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/boats/tid2clip.pkl -------------------------------------------------------------------------------- /preprocess/boats/tid2dinov2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/boats/tid2dinov2.pkl -------------------------------------------------------------------------------- /preprocess/boats/tracking.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/boats/tracking.pkl -------------------------------------------------------------------------------- /preprocess/boats/uid2clip.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/boats/uid2clip.pkl -------------------------------------------------------------------------------- /preprocess/books/captions.json: -------------------------------------------------------------------------------- 1 | {"0_60": "#C C picks books from the floor", "60_120": "#C C picks the book on the floor", "120_180": "#C C holds a book in the book shelf", "180_240": "#C C places the book on the shelf", "240_300": "#C C removes the book from the", "300_360": "#C C holds the book with his left hand", "360_420": "#C C touches a book on a shelf with his right hand", "420_480": "#C C looks at the books on the floor", "480_540": "#C C picks the book from the shelf", "540_600": "#C C arranges books on the shelf", "600_660": "#C C looks around the floor", "660_720": "#C C picks a book from the floor", "720_780": "#C C places the books on the shelf", "780_840": "#C C picks a book from the shelf with his right hand", "840_900": "#C C adjusts the books on the shelf with her left hand", "900_960": "#C C arranges the books in the shelf", "960_1020": "#C C picks a book from the floor", "1020_1080": "#C C picks the books from the floor", "1080_1140": "#C C puts the book on the shelf", "1140_1200": "#C C adjusts the books on the shelf", "1200_1260": "#C C picks a book from the shelf", "1260_1320": "#C C arranges books on the book shelf", "1320_1380": "#C C picks the book from the", "1380_1440": "#C C puts book on top of the bookshe", "1440_1500": "#C C puts the books on the bookshe", "1500_1560": "#C C picks the book from the floor", "1560_1620": "#C C picks books from the floor", "1620_1680": "#C C places the book in his right hand in the bookshelf", "1680_1740": "#C C puts the books in the book shelf with his right hand", "1740_1800": "#C C picks a book from the floor with her left hand", "1800_1860": "#C C arranges books on the shelf", "1860_1920": "#C C arranges books", "1920_1980": "#C C looks around the house", "1980_2040": "#C C stares at the", "2040_2100": "#C C looks at the", "2100_2160": "#C C looks around", "2160_2220": "#C C adjusts the books on the shelf with his hands", "2220_2280": "#C C moves the books in the bookshelf with his right hand", "2280_2340": "#C C arranges books on the shelf", "2340_2400": "#C C touches the books on the shelf", "2400_2460": "#c c puts books on the shelf", "2460_2520": "#C C holds the book with her right hand", "2520_2580": "#C C places the book on the book shelf", "2580_2640": "#C C puts book on the shelf", 
"2640_2700": "#C C picks the books from the floor", "2700_2760": "#C C looks around the room.", "2760_2820": "#C C adjusts the books in the shelf with his hands", "2820_2880": "#C C picks the books from the floor", "2880_2940": "#C C picks a book from the", "2940_3000": "#C C arranges the books in the shelf", "3000_3060": "#C C arranges books on the shelf", "3060_3120": "#C C picks a book from the floor with her right hand", "3120_3180": "#C C picks the books on the floor", "3180_3240": "#C C picks a book from the shelf with his left hand", "3240_3300": "#C C picks the book from the shelf", "3300_3360": "#C C puts the book in the shelf with her right hand", "3360_3420": "#C C places the book on the shelf with her right hand", "3420_3480": "#C C places the book in his left hand in the shelf", "3480_3540": "#C C adjusts the book on the shelf.", "3540_3600": "#C C holds the books on her hands", "3600_3660": "#C C picks a", "3660_3720": "#C C picks books from the floor", "3720_3780": "#C C picks a book from the floor", "3780_3840": "#C C puts the books in the book shelf", "3840_3900": "#C C picks a book from the shelf", "3900_3960": "#C C arranges books in the shelve", "3960_4020": "#C C adjusts the books on the shelf ", "4020_4080": "#C C puts the books on the", "4080_4140": "#C C picks a book from the floor with her right hand", "4140_4200": "#C C holds the books with her hands", "4200_4260": "#C C picks up the books from the floor", "4260_4320": "#C C puts the books on the floor", "4320_4380": "#C C places the book on the shelf", "4380_4440": "#C C picks the book holder from the floor with her right hand", "4440_4500": "#C C arranges the books in the shelf with his right hand", "4500_4560": "#C C looks at the books on the", "4560_4620": "#C C puts a book on the floor", "4620_4680": "#C C places the book in his left hand on the ground", "4680_4740": "#C C picks the book from the floor with her right hand", "4740_4800": "#C C looks around the", "4800_4860": "#C C picks the book on the shelf", "4860_4920": "#C C arranges the books in the bookcase", "4920_4980": "#C C looks around the house", "4980_5040": "#C C picks the book from the", "5040_5100": "#C C picks a book from the", "5100_5160": "#C C puts the books on the floor", "5160_5220": "#C C puts books on the floor", "5220_5280": "#C C picks the books from the floor", "5280_5340": "#C C looks around the house", "5340_5400": "#C C puts the books on the shelf"} -------------------------------------------------------------------------------- /preprocess/books/reid.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/books/reid.mp4 -------------------------------------------------------------------------------- /preprocess/books/reid.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/books/reid.pkl -------------------------------------------------------------------------------- /preprocess/books/segment2id.json: -------------------------------------------------------------------------------- 1 | {"0_60": 0, "60_120": 1, "120_180": 2, "180_240": 3, "240_300": 4, "300_360": 5, "360_420": 6, "420_480": 7, "480_540": 8, "540_600": 9, "600_660": 10, "660_720": 11, "720_780": 12, "780_840": 13, "840_900": 14, "900_960": 15, "960_1020": 16, "1020_1080": 17, "1080_1140": 
18, "1140_1200": 19, "1200_1260": 20, "1260_1320": 21, "1320_1380": 22, "1380_1440": 23, "1440_1500": 24, "1500_1560": 25, "1560_1620": 26, "1620_1680": 27, "1680_1740": 28, "1740_1800": 29, "1800_1860": 30, "1860_1920": 31, "1920_1980": 32, "1980_2040": 33, "2040_2100": 34, "2100_2160": 35, "2160_2220": 36, "2220_2280": 37, "2280_2340": 38, "2340_2400": 39, "2400_2460": 40, "2460_2520": 41, "2520_2580": 42, "2580_2640": 43, "2640_2700": 44, "2700_2760": 45, "2760_2820": 46, "2820_2880": 47, "2880_2940": 48, "2940_3000": 49, "3000_3060": 50, "3060_3120": 51, "3120_3180": 52, "3180_3240": 53, "3240_3300": 54, "3300_3360": 55, "3360_3420": 56, "3420_3480": 57, "3480_3540": 58, "3540_3600": 59, "3600_3660": 60, "3660_3720": 61, "3720_3780": 62, "3780_3840": 63, "3840_3900": 64, "3900_3960": 65, "3960_4020": 66, "4020_4080": 67, "4080_4140": 68, "4140_4200": 69, "4200_4260": 70, "4260_4320": 71, "4320_4380": 72, "4380_4440": 73, "4440_4500": 74, "4500_4560": 75, "4560_4620": 76, "4620_4680": 77, "4680_4740": 78, "4740_4800": 79, "4800_4860": 80, "4860_4920": 81, "4920_4980": 82, "4980_5040": 83, "5040_5100": 84, "5100_5160": 85, "5160_5220": 86, "5220_5280": 87, "5280_5340": 88, "5340_5400": 89} -------------------------------------------------------------------------------- /preprocess/books/segment_textual_embedding.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/books/segment_textual_embedding.pkl -------------------------------------------------------------------------------- /preprocess/books/segment_visual_embedding.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/books/segment_visual_embedding.pkl -------------------------------------------------------------------------------- /preprocess/books/tid2clip.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/books/tid2clip.pkl -------------------------------------------------------------------------------- /preprocess/books/tid2dinov2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/books/tid2dinov2.pkl -------------------------------------------------------------------------------- /preprocess/books/tracking.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/books/tracking.pkl -------------------------------------------------------------------------------- /preprocess/books/uid2clip.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/books/uid2clip.pkl -------------------------------------------------------------------------------- /preprocess/kitchen/captions.json: -------------------------------------------------------------------------------- 1 | {"0_30": "#C C opens the kitchen cabinet", "30_60": "#C C opens the cabinet door", "60_90": "#C C picks a glass in the", "90_120": "#C C opens the 
cabinet.", "120_150": "#C C opens the tap", "150_180": "#C C puts water in the cup", "180_210": "#C C closes the tap.", "210_240": "#C C puts cup on the sink counter", "240_270": "#C C picks the cup from the counter", "270_300": "#C C opens a refrigerator with his left", "300_330": "#C C closes the fridge with his right", "330_360": "#C C picks a bottle of milk from the", "360_390": "#C C puts the bottle in the fridge", "390_420": "#C C closes the fridge with his left hand", "420_450": "#C C closes the refrigerator with his left hand", "450_480": "#C C opens the water bottle lid", "480_510": "#C C covers the kettle with the lid", "510_540": "#C C puts water in the coffee maker", "540_570": "#C C pours water in the sink", "570_600": "#C C pours the milk into the"} -------------------------------------------------------------------------------- /preprocess/kitchen/reid.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/kitchen/reid.mp4 -------------------------------------------------------------------------------- /preprocess/kitchen/reid.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/kitchen/reid.pkl -------------------------------------------------------------------------------- /preprocess/kitchen/segment2id.json: -------------------------------------------------------------------------------- 1 | {"0_30": 0, "30_60": 1, "60_90": 2, "90_120": 3, "120_150": 4, "150_180": 5, "180_210": 6, "210_240": 7, "240_270": 8, "270_300": 9, "300_330": 10, "330_360": 11, "360_390": 12, "390_420": 13, "420_450": 14, "450_480": 15, "480_510": 16, "510_540": 17, "540_570": 18, "570_600": 19} -------------------------------------------------------------------------------- /preprocess/kitchen/segment_0.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/kitchen/segment_0.mp4 -------------------------------------------------------------------------------- /preprocess/kitchen/segment_1.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/kitchen/segment_1.mp4 -------------------------------------------------------------------------------- /preprocess/kitchen/segment_18.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/kitchen/segment_18.mp4 -------------------------------------------------------------------------------- /preprocess/kitchen/segment_3.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/kitchen/segment_3.mp4 -------------------------------------------------------------------------------- /preprocess/kitchen/segment_8.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/kitchen/segment_8.mp4 
-------------------------------------------------------------------------------- /preprocess/kitchen/segment_textual_embedding.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/kitchen/segment_textual_embedding.pkl -------------------------------------------------------------------------------- /preprocess/kitchen/segment_visual_embedding.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/kitchen/segment_visual_embedding.pkl -------------------------------------------------------------------------------- /preprocess/kitchen/tid2clip.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/kitchen/tid2clip.pkl -------------------------------------------------------------------------------- /preprocess/kitchen/tid2dinov2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/kitchen/tid2dinov2.pkl -------------------------------------------------------------------------------- /preprocess/kitchen/tracking.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/kitchen/tracking.pkl -------------------------------------------------------------------------------- /preprocess/kitchen/uid2clip.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/kitchen/uid2clip.pkl -------------------------------------------------------------------------------- /preprocess/painting/captions.json: -------------------------------------------------------------------------------- 1 | {"0_60": "#C C draws on the paper with the paint brush in his right hand.", "60_120": "#C C draws on the paper with the painting brush in his right hand. ", "120_180": "#C C draws on the paper with the paint brush in his right hand.", "180_240": "#C C moves a paint palette on the table with his right hand.", "240_300": "#C C paints the cover of the paint palette with the paint brush in his right hand.", "300_360": "#C C draws on the paper with the paint brush in his right hand. ", "360_420": "#C C paints on the paper with the paint brush in his right hand.", "420_480": "#C C adjusts the drawing board with his left hand.", "480_540": "#C C draws on the paper with the paint brush in his right hand.", "540_600": "#C C dips the paint brush in his right hand in the cup of water on the table.", "600_660": "#C C dips the paint brush in his right hand in the paint palette on the table. 
", "660_720": "#C C smears watercolor on the watercolor set with the", "720_780": "#C C dips the paint brush in his right hand in the paint palette on the table.", "780_840": "#C C dips the paint brush in his right hand in the paint palette on the table.", "840_900": "#C C dips the paint brush in his right hand in the cup of water on the table", "900_960": "#C C paints the cover of the paint palette with the paint brush in his right hand.", "960_1020": "#C C paints the cover of the paint palette with the paint brush in his right hand.", "1020_1080": "#C C draws on the paper with the paint brush in his right hand.", "1080_1140": "#C C paints a", "1140_1200": "#C C paints a", "1200_1260": "#C C paints a", "1260_1320": "#C C paints on the paper with the paint brush in his right hand", "1320_1380": "#C C draws on the paper with the paint brush in his right hand.", "1380_1440": "#C C adjusts the painting board with his right hand.", "1440_1500": "#C C paints the cover of the paint palette with the paint brush in his right hand.", "1500_1560": "#C C dips the paint brush in his right hand in the paint palette on the table.", "1560_1620": "#C C dips the paint brush in his right hand in the cup of water on the table.", "1620_1680": "#C C stirs brush in the watercolor pan", "1680_1740": "#C C dips the paint brush in his right hand in the paint palette on the table.", "1740_1800": "#C C paints on the paper with the paint brush in his right hand. ", "1800_1860": "#C C paints a", "1860_1920": "#C C paints a", "1920_1980": "#C C paints a", "1980_2040": "#C C draws on the paper with the paint brush in his right hand.", "2040_2100": "#C C paints the", "2100_2160": "#C C paints a", "2160_2220": "#C C draws on the paper with the paint brush in his right hand. ", "2220_2280": "#C C moves the paint palette on the table with his right hand.", "2280_2340": "#C C dips the paint brush in his right hand in the paint palette on the table.", "2340_2400": "#C C paints the cover of the paint palette with the paint brush in his right hand.", "2400_2460": "#C C operates the tablet computer on the table with his right hand.", "2460_2520": "#C C paints a", "2520_2580": "#C C draws on the paper with the paint brush in his right hand.", "2580_2640": "#C C adjusts the drawing board with his left hand.", "2640_2700": "#C C draws on the paper with the paint brush in his right hand.", "2700_2760": "#C C adjusts the book on the table with his left hand", "2760_2820": "#C C lifts the paint brush from the drawing board with his right hand.", "2820_2880": "#C C paints a", "2880_2940": "#C C paints a", "2940_3000": "#C C draws on the paper with the painting brush in his right hand.", "3000_3060": "#C C paints a", "3060_3120": "#C C dips the paint brush in his right hand in the paint palette on the table.", "3120_3180": "#C C draws on the paper with the paint brush in his right hand.", "3180_3240": "#C C paints a", "3240_3300": "#C C adjusts the book on the table with her right hand.", "3300_3360": "#C C paints the cover of the paint palette with the paint brush in his right hand.", "3360_3420": "#C C paints on the paper with the painting brush in his right hand.", "3420_3480": "#C C paints a", "3480_3540": "#C C paints the cover of the paint palette with the paint brush in his right hand.", "3540_3600": "#C C draws on the paper with the paint brush in his right hand.", "3600_3660": "#C C paints a", "3660_3720": "#C C draws on the paper with the paint brush in his right hand.", "3720_3780": "#C C paints on the paper with the paint 
brush in his right hand.", "3780_3840": "#C C adjusts the drawing board with his right hand.", "3840_3900": "#C C paints the cover of the paint palette with the paint brush in his right hand. ", "3900_3960": "#C C draws on the paper with the paint brush in his right hand.", "3960_4020": "#C C draws on the paper with the painting brush in his right hand.", "4020_4080": "#C C draws on the paper with the paint brush in his right hand.", "4080_4140": "#C C draws on the paper with the paint brush in his right hand.", "4140_4200": "#C C draws on the paper with the paint brush in his right hand.", "4200_4260": "#C C draws on the paper with the painting brush in his right hand.", "4260_4320": "#C C draws on the paper with the paint brush in his right hand.", "4320_4380": "#C C draws on the paper with the paint brush in his right hand.", "4380_4440": "#C C paints a", "4440_4500": "#C C draws on the paper with the painting brush in his right hand.", "4500_4560": "#C C draws on the paper with the paint brush in his right hand.", "4560_4620": "#C C dips the paint brush in his right hand in the paint palette on the table.", "4620_4680": "#C C paints the cover of the paint palette with the paint brush in his right hand.", "4680_4740": "#C C draws on the paper with the paint brush in his right hand.", "4740_4800": "#C C paints a", "4800_4860": "#C C draws on the paper with the paint brush in his right hand.", "4860_4920": "#C C paints a", "4920_4980": "#C C adjusts the book on his lap with his left hand.", "4980_5040": "#C C dips the paint brush in his right hand into the cup of water on the table.", "5040_5100": "#C C moves the painting brush", "5100_5160": "#C C dips the paint brush in his right hand in the cup of water on the table.", "5160_5220": "#C C touches the book with his left hand", "5220_5280": "#C C dips the paint brush in his right hand in the cup of water on the table.", "5280_5340": "#C C paints the cover of the paint palette with the paint brush in his right hand.", "5340_5400": "#C C dips the paint brush in his right hand in the cup of water on the table."} -------------------------------------------------------------------------------- /preprocess/painting/reid.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/painting/reid.mp4 -------------------------------------------------------------------------------- /preprocess/painting/reid.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/painting/reid.pkl -------------------------------------------------------------------------------- /preprocess/painting/segment2id.json: -------------------------------------------------------------------------------- 1 | {"0_60": 0, "60_120": 1, "120_180": 2, "180_240": 3, "240_300": 4, "300_360": 5, "360_420": 6, "420_480": 7, "480_540": 8, "540_600": 9, "600_660": 10, "660_720": 11, "720_780": 12, "780_840": 13, "840_900": 14, "900_960": 15, "960_1020": 16, "1020_1080": 17, "1080_1140": 18, "1140_1200": 19, "1200_1260": 20, "1260_1320": 21, "1320_1380": 22, "1380_1440": 23, "1440_1500": 24, "1500_1560": 25, "1560_1620": 26, "1620_1680": 27, "1680_1740": 28, "1740_1800": 29, "1800_1860": 30, "1860_1920": 31, "1920_1980": 32, "1980_2040": 33, "2040_2100": 34, "2100_2160": 35, "2160_2220": 36, "2220_2280": 37, "2280_2340": 
38, "2340_2400": 39, "2400_2460": 40, "2460_2520": 41, "2520_2580": 42, "2580_2640": 43, "2640_2700": 44, "2700_2760": 45, "2760_2820": 46, "2820_2880": 47, "2880_2940": 48, "2940_3000": 49, "3000_3060": 50, "3060_3120": 51, "3120_3180": 52, "3180_3240": 53, "3240_3300": 54, "3300_3360": 55, "3360_3420": 56, "3420_3480": 57, "3480_3540": 58, "3540_3600": 59, "3600_3660": 60, "3660_3720": 61, "3720_3780": 62, "3780_3840": 63, "3840_3900": 64, "3900_3960": 65, "3960_4020": 66, "4020_4080": 67, "4080_4140": 68, "4140_4200": 69, "4200_4260": 70, "4260_4320": 71, "4320_4380": 72, "4380_4440": 73, "4440_4500": 74, "4500_4560": 75, "4560_4620": 76, "4620_4680": 77, "4680_4740": 78, "4740_4800": 79, "4800_4860": 80, "4860_4920": 81, "4920_4980": 82, "4980_5040": 83, "5040_5100": 84, "5100_5160": 85, "5160_5220": 86, "5220_5280": 87, "5280_5340": 88, "5340_5400": 89} -------------------------------------------------------------------------------- /preprocess/painting/segment_83.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/painting/segment_83.mp4 -------------------------------------------------------------------------------- /preprocess/painting/segment_85.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/painting/segment_85.mp4 -------------------------------------------------------------------------------- /preprocess/painting/segment_textual_embedding.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/painting/segment_textual_embedding.pkl -------------------------------------------------------------------------------- /preprocess/painting/segment_visual_embedding.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/painting/segment_visual_embedding.pkl -------------------------------------------------------------------------------- /preprocess/painting/tid2clip.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/painting/tid2clip.pkl -------------------------------------------------------------------------------- /preprocess/painting/tid2dinov2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/painting/tid2dinov2.pkl -------------------------------------------------------------------------------- /preprocess/painting/tracking.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/painting/tracking.pkl -------------------------------------------------------------------------------- /preprocess/painting/uid2clip.pkl: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/painting/uid2clip.pkl -------------------------------------------------------------------------------- /preprocess/talking/captions.json: -------------------------------------------------------------------------------- 1 | {"0_60": "#O woman X points at the ceiling", "60_120": "#O person Y converses with C", "120_180": "#O person Y touches her nose with her right", "180_240": "#O woman X lifts up her", "240_300": "#C C moves the hands", "300_360": "#C C looks around the room", "360_420": "#O woman X uses gesture with a", "420_480": "#C C interacts with the woman X", "480_540": "#O woman X points at the ceiling", "540_600": "#O woman X does a hand gesture", "600_660": "#O woman X raises a hand", "660_720": "#O The woman X adjusts her glasses with both hands", "720_780": "#O person X points towards the ceiling", "780_840": "#C C interacts with lady X", "840_900": "#O person Y adjusts the camera", "900_960": "#O A woman X interacts with C", "960_1020": "#O woman X converses with C", "1020_1080": "#C C talks to the colleagues", "1080_1140": "#O woman X talks to woman", "1140_1200": "#O woman Y converses with woman", "1200_1260": "#O person Z moves her hands", "1260_1320": "#O person Y looks at person X", "1320_1380": "#C C looks at the people in the", "1380_1440": "#C C converses with a woman V,W and X and a man Y and Z", "1440_1500": "#O The man Y holds the phone with his right hand.", "1500_1560": "#C C converses with a man X and Y and a woman Z", "1560_1620": "#C C converses with the woman Y", "1620_1680": "#C The man M interacts with C, the woman N, the man M and the woman N", "1680_1740": "#O A Woman M holds her waist with both hands", "1740_1800": "#O The Woman X taps her left fingers on her thigh", "1800_1860": "#O Woman A Holds a camera with hands", "1860_1920": "#O person Y puts the card on the table", "1920_1980": "#O A woman X looks at C", "1980_2040": "#O person X talks to person", "2040_2100": "#O Woman Y eats food with the right", "2100_2160": "#C C looks around the room", "2160_2220": "#O person X interacts with person Z", "2220_2280": "#O A woman X stands in the", "2280_2340": "#C C looks at the people in the", "2340_2400": "#C C looks at the woman", "2400_2460": "#C C stares at a woman Y", "2460_2520": "#C C looks around the house", "2520_2580": "#C C converses with the man Y, the man X and the woman Z", "2580_2640": "#O A man X talks to man Z"} -------------------------------------------------------------------------------- /preprocess/talking/reid.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/talking/reid.mp4 -------------------------------------------------------------------------------- /preprocess/talking/reid.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/talking/reid.pkl -------------------------------------------------------------------------------- /preprocess/talking/segment2id.json: -------------------------------------------------------------------------------- 1 | {"0_60": 0, "60_120": 1, "120_180": 2, "180_240": 3, "240_300": 4, "300_360": 5, "360_420": 6, "420_480": 7, "480_540": 8, "540_600": 9, "600_660": 10, "660_720": 11, "720_780": 12, "780_840": 13, "840_900": 
14, "900_960": 15, "960_1020": 16, "1020_1080": 17, "1080_1140": 18, "1140_1200": 19, "1200_1260": 20, "1260_1320": 21, "1320_1380": 22, "1380_1440": 23, "1440_1500": 24, "1500_1560": 25, "1560_1620": 26, "1620_1680": 27, "1680_1740": 28, "1740_1800": 29, "1800_1860": 30, "1860_1920": 31, "1920_1980": 32, "1980_2040": 33, "2040_2100": 34, "2100_2160": 35, "2160_2220": 36, "2220_2280": 37, "2280_2340": 38, "2340_2400": 39, "2400_2460": 40, "2460_2520": 41, "2520_2580": 42, "2580_2640": 43} -------------------------------------------------------------------------------- /preprocess/talking/segment_10.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/talking/segment_10.mp4 -------------------------------------------------------------------------------- /preprocess/talking/segment_11.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/talking/segment_11.mp4 -------------------------------------------------------------------------------- /preprocess/talking/segment_9.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/talking/segment_9.mp4 -------------------------------------------------------------------------------- /preprocess/talking/segment_textual_embedding.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/talking/segment_textual_embedding.pkl -------------------------------------------------------------------------------- /preprocess/talking/segment_visual_embedding.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/talking/segment_visual_embedding.pkl -------------------------------------------------------------------------------- /preprocess/talking/tid2clip.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/talking/tid2clip.pkl -------------------------------------------------------------------------------- /preprocess/talking/tid2dinov2.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/talking/tid2dinov2.pkl -------------------------------------------------------------------------------- /preprocess/talking/tracking.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/talking/tracking.pkl -------------------------------------------------------------------------------- /preprocess/talking/uid2clip.pkl: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/preprocess/talking/uid2clip.pkl 
-------------------------------------------------------------------------------- /prompts/database_query_prompt.txt: -------------------------------------------------------------------------------- 1 | You are tasked with answering a question about a video using a database. The database consists of three tables: 2 | 3 | TABLE Objects( 4 | object_id INT, 5 | category VARCHAR(255), 6 | PRIMARY KEY (object_id) 7 | ) 8 | The 'Objects' table catalogues the people or objects in the video, with each assigned a unique 'object_id' and 'category'. For example, an object entry may be (1, 'cup'). 9 | 10 | TABLE Segments( 11 | segment_id INT, 12 | PRIMARY KEY (segment_id) 13 | ) 14 | The 'Segments' table lists the 2-second segments of the video. The 'segment_id' starts from 0 and increments by 1 sequentially. 15 | 16 | TABLE Objects_Segments( 17 | object_id INT, 18 | segment_id INT, 19 | PRIMARY KEY (object_id, segment_id), 20 | FOREIGN KEY (object_id) REFERENCES Objects(object_id), 21 | FOREIGN KEY (segment_id) REFERENCES Segments(segment_id) 22 | ) 23 | The 'Objects_Segments' table links the 'Objects' and 'Segments' tables, recording the objects that appear in each segment. 24 | 25 | You have access to the following tools: 26 | 27 | {tools} 28 | 29 | ATTENTION: 30 | 1. Since you only have information about the objects and the segments in which they appear, if you think the question requires more information, just output "I cannot answer this question." 31 | 2. The categories of the objects/people are limited. To find a specific object, you can first query the database for all the object categories, and match the object to one of the categories. If you cannot find objects using the categories, you can also try the tool 'retreive_candidate_objects'. 32 | 3. Use single quotes for the strings in the MySQL program, for instance: SELECT COUNT(DISTINCT object_id) FROM Objects WHERE category = 'person' 33 | 34 | Use the following format: 35 | 36 | Question: the input question you must answer 37 | Thought: you should always think about what to do 38 | Action: the action to take, should be one of [{tool_names}] 39 | Action Input: the input to the action 40 | Observation: the result of the action... (this Thought/Action/Action Input/Observation can repeat N times) 41 | Thought: I now know the final answer 42 | Final Answer: the answer to the original input question 43 | 44 | Begin! 45 | 46 | Question: {input} 47 | Thought: {agent_scratchpad} -------------------------------------------------------------------------------- /prompts/multiple_choice_prompt.txt: -------------------------------------------------------------------------------- 1 | You are tasked with answering a multiple-choice question related to a video. The question has 5 choices, labeled as 0, 1, 2, 3, 4. The video is sliced into 2-second segments, each with a segment ID starting from zero and incrementing in chronological order. Each segment has a caption depicting the event. 2 | There is an object memory that saves the objects and the segments in which they appear. The object memory is maintained by another agent. 3 | You have access to the following tools: 4 | 5 | {tools} 6 | 7 | ATTENTION: 8 | 1. The segment captions with prefix '#C' refer to the camera wearer, while those with prefix '#O' refer to someone other than the camera wearer. 9 | 2. You can use both 'visual_question_answering' and 'object_memory_querying' to answer questions related to objects or people. 10 | 3. The 'visual_question_answering' tool may hallucinate.
You should pay more attention to the description than to the answer in 'visual_question_answering'. 11 | 4. Use double quotes for the string arguments of the tools. The input to the tools should not contain any single quotes. If the tool has two arguments, output the arguments in brackets such as ("what is the man doing", 1). 12 | 5. It's easier to answer the multiple-choice question by validating the choices. 13 | 6. If the information is too vague to provide an accurate answer, make your best guess. 14 | 15 | Use the following format: 16 | 17 | Question: the input question you must answer 18 | Thought: you should always think about what to do 19 | Action: the action to take, should be one of [{tool_names}] 20 | Action Input: the input to the action 21 | Observation: the result of the action... (this Thought/Action/Action Input/Observation can repeat N times) 22 | Thought: I now know the final answer 23 | Final Answer: the correct choice label (0, 1, 2, 3, 4) to the original input question 24 | 25 | Begin! 26 | 27 | Question: {input} 28 | Thought: {agent_scratchpad} 29 | -------------------------------------------------------------------------------- /prompts/prompt.txt: -------------------------------------------------------------------------------- 1 | You are tasked with answering a question related to a video. The video is sliced into 2-second segments, each with a segment ID starting from zero and incrementing in chronological order. Each segment has a caption depicting the event. 2 | There is an object memory that saves the objects and the segments in which they appear. The object memory is maintained by another agent. 3 | You have access to the following tools: 4 | 5 | {tools} 6 | 7 | ATTENTION: 8 | 1. The segment captions with prefix '#C' refer to the camera wearer, while those with prefix '#O' refer to someone other than the camera wearer. 9 | 2. You can use both 'visual_question_answering' and 'object_memory_querying' to answer questions related to objects or people. 10 | 3. The 'visual_question_answering' tool may hallucinate. You should pay more attention to the description than to the answer in 'visual_question_answering'. 11 | 4. Use double quotes for the string arguments of the tools. The input to the tools should not contain any single quotes. If the tool has two arguments, output the arguments in brackets such as ("what is the man doing", 1). 12 | 5. If the information is too vague to provide an accurate answer, make your best guess. 13 | 14 | Use the following format: 15 | 16 | Question: the input question you must answer 17 | Thought: you should always think about what to do 18 | Action: the action to take, should be one of [{tool_names}] 19 | Action Input: the input to the action 20 | Observation: the result of the action... (this Thought/Action/Action Input/Observation can repeat N times) 21 | Thought: I now know the final answer 22 | Final Answer: the answer to the original input question 23 | 24 | Begin!
25 | 26 | Question: {input} 27 | Thought: {agent_scratchpad} 28 | -------------------------------------------------------------------------------- /reid.py: -------------------------------------------------------------------------------- 1 | import cv2 2 | from time import time 3 | import json 4 | import pickle 5 | import os 6 | from collections import defaultdict 7 | import clip 8 | import random as rd 9 | from PIL import Image 10 | import torch 11 | import numpy as np 12 | import imageio 13 | rd.seed(0) 14 | 15 | 16 | def hash_color(obj_id): 17 | np.random.seed(obj_id) 18 | color = np.random.randint(0, 256, 3) 19 | new_color = tuple(int(i) for i in color) 20 | return new_color 21 | 22 | 23 | class ReID: 24 | def __init__(self, video_path_list, base_dir='preprocess'): 25 | self.video_path_list = video_path_list 26 | self.base_dir = base_dir 27 | self.trackid2clip_emb = None 28 | self.trackid2dinov2_emb = None 29 | self.trackid2frame = None 30 | self.trackid2category = None 31 | self.uid2tids = None 32 | self.tid2uid = None 33 | 34 | 35 | def hard_constraint(self, obj1, obj2): 36 | # if self.trackid2category[obj1] != self.trackid2category[obj2]: # if two tracked objects have different categories, they cannot be the same object 37 | # return False 38 | frame1 = set(self.trackid2frame[obj1]) 39 | frame2 = set(self.trackid2frame[obj2]) 40 | if len(frame1.intersection(frame2)) > 0: # if two tracked objects co-exist, they cannot be the same object 41 | return False 42 | return True 43 | 44 | 45 | def clip_similarity_score(self, obj1, obj2, x0=0.925, slope=20): 46 | clip_emb1 = self.trackid2clip_emb[obj1] 47 | clip_emb2 = self.trackid2clip_emb[obj2] 48 | cosine_score = np.dot(clip_emb1, clip_emb2) / (np.linalg.norm(clip_emb1) * np.linalg.norm(clip_emb2)) 49 | clip_score = 1 / (1 + np.exp(-slope * (cosine_score - x0))) 50 | return clip_score 51 | 52 | 53 | def dinov2_similarity_score(self, obj1, obj2, x0=0.5, slope=4.1): 54 | dinov2_emb1 = self.trackid2dinov2_emb[obj1] 55 | dinov2_emb2 = self.trackid2dinov2_emb[obj2] 56 | cosine_score = np.dot(dinov2_emb1, dinov2_emb2) / (np.linalg.norm(dinov2_emb1) * np.linalg.norm(dinov2_emb2)) 57 | #dinov2_score = 1 / (1 + np.exp(-slope * (cosine_score - x0))) 58 | dinov2_score = cosine_score 59 | return dinov2_score 60 | 61 | 62 | def compute_score(self, obj1, obj2): 63 | if not self.hard_constraint(obj1, obj2): 64 | return 0 65 | clip_score = self.clip_similarity_score(obj1, obj2) 66 | dinov2_score = self.dinov2_similarity_score(obj1, obj2) 67 | return 0.15*clip_score+ 0.85*dinov2_score 68 | 69 | 70 | def check_group(self, tid, uid): 71 | """tid should has score > 0.5 for all uid objects, and at least one score > 0.62""" 72 | sgn = False 73 | for t in self.uid2tids[uid]: 74 | if self.compute_score(tid, t) < 0.5: 75 | return False 76 | if self.compute_score(tid, t) >= 0.62: 77 | sgn = True 78 | return sgn 79 | 80 | 81 | def reid_for_all_videos(self): 82 | for video_path in self.video_path_list: 83 | base_name = os.path.basename(video_path).replace(".mp4", "") 84 | video_dir = os.path.join(self.base_dir, base_name) 85 | with open(os.path.join(video_dir, 'tid2clip.pkl'), 'rb') as f: 86 | self.trackid2clip_emb = pickle.load(f) 87 | with open(os.path.join(video_dir, 'tid2dinov2.pkl'), 'rb') as f: 88 | self.trackid2dinov2_emb = pickle.load(f) 89 | with open(os.path.join(video_dir, 'tracking.pkl'), 'rb') as f: 90 | content = pickle.load(f) 91 | self.frame2trackid, self.trackid2frame, self.trackid2category = content[0], content[1], content[2] 92 | self.uid2tids = 
defaultdict(list) 93 | self.tid2uid = dict() 94 | 95 | for frame in self.frame2trackid: 96 | cur_track_ids = self.frame2trackid[frame] 97 | for tid in cur_track_ids: 98 | if tid in self.tid2uid: 99 | continue 100 | sgn = False 101 | for uid in self.uid2tids: 102 | if self.check_group(tid, uid): 103 | self.uid2tids[uid].append(tid) 104 | self.tid2uid[tid] = uid 105 | sgn = True 106 | break 107 | if sgn == False: 108 | uid = len(self.uid2tids) 109 | self.uid2tids[uid].append(tid) 110 | self.tid2uid[tid] = uid 111 | 112 | frame2uid = defaultdict(dict) 113 | uid2frame = defaultdict(list) 114 | uid2category = dict() 115 | uid2clipemb = defaultdict(list) 116 | uid2clip = dict() 117 | for frame in self.frame2trackid: 118 | for tid in self.frame2trackid[frame]: 119 | frame2uid[frame][self.tid2uid[tid]] = self.frame2trackid[frame][tid] 120 | for uid in self.uid2tids: 121 | tids = self.uid2tids[uid] 122 | for tid in tids: 123 | uid2frame[uid] += self.trackid2frame[tid] 124 | uid2clipemb[uid].append(self.trackid2clip_emb[tid]) 125 | 126 | for uid in uid2clipemb: 127 | emb = torch.stack(uid2clipemb[uid], dim=0) 128 | emb = torch.mean(emb, dim=0) 129 | uid2clip[uid] = emb 130 | save_file = os.path.join(video_dir, 'uid2clip.pkl') 131 | with open(save_file, 'wb') as f: 132 | pickle.dump(uid2clip, f) 133 | 134 | reid_file = os.path.join(video_dir, 'reid.pkl') 135 | for uid in self.uid2tids: 136 | uid2category[uid] = self.trackid2category[self.uid2tids[uid][0]] 137 | with open(reid_file, 'wb') as f: 138 | pickle.dump([frame2uid, uid2frame, uid2category], f) 139 | 140 | 141 | def replay(self): 142 | for video_path in self.video_path_list: 143 | base_name = os.path.basename(video_path).replace(".mp4", "") 144 | video_dir = os.path.join(self.base_dir, base_name) 145 | with open(os.path.join(video_dir, 'reid.pkl'), 'rb') as f: 146 | content = pickle.load(f) 147 | frame2uid, uid2frame, uid2category = content[0], content[1], content[2] 148 | cap = cv2.VideoCapture(video_path) 149 | cap.set(cv2.CAP_PROP_POS_FRAMES, 0) 150 | frame_idx = -1 151 | writer = imageio.get_writer(os.path.join(video_dir, 'reid.mp4'), fps=15) 152 | while True: 153 | success, frame = cap.read() 154 | frame_idx += 1 155 | if not success: 156 | break 157 | if frame_idx in frame2uid: 158 | for uid in frame2uid[frame_idx]: 159 | c = hash_color(uid) 160 | x, y, w, h = frame2uid[frame_idx][uid][1] 161 | left_top = (int(x-w/2), int(y-h/2)) 162 | right_bottom = (int(x+w/2), int(y+h/2)) 163 | cv2.rectangle(frame, left_top, right_bottom, c, 2) 164 | label = f'ID: {uid}' 165 | label_position = (int(x-w/2)+2, int(y-h/2)+12) 166 | cv2.putText(frame, label, label_position, cv2.FONT_HERSHEY_SIMPLEX, 0.5, c, 2) 167 | #cv2.imshow("reid", frame) 168 | frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) 169 | writer.append_data(frame) 170 | writer.close() 171 | cap.release() 172 | cv2.destroyAllWindows() 173 | 174 | 175 | def run(self): 176 | self.reid_for_all_videos() 177 | self.replay() 178 | -------------------------------------------------------------------------------- /sample_videos/boats.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/sample_videos/boats.mp4 -------------------------------------------------------------------------------- /sample_videos/books.mp4: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/sample_videos/books.mp4 -------------------------------------------------------------------------------- /sample_videos/kitchen.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/sample_videos/kitchen.mp4 -------------------------------------------------------------------------------- /sample_videos/painting.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/sample_videos/painting.mp4 -------------------------------------------------------------------------------- /sample_videos/talking.mp4: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/YueFan1014/VideoAgent/42e1b70d947d4ae6ee627a3e9dda4898844e97b1/sample_videos/talking.mp4 -------------------------------------------------------------------------------- /segment_feature.py: -------------------------------------------------------------------------------- 1 | import os 2 | import time 3 | import torch 4 | import json 5 | import cv2 6 | import pickle 7 | from InternVid.viclip import get_viclip, frames2tensor, get_vid_feat 8 | from encoder import encode_sentences 9 | 10 | 11 | model_cfgs = { 12 | 'viclip-l-internvid-10m-flt': { 13 | 'size': 'l', 14 | 'pretrained': 'tool_models/viCLIP/ViClip-InternVid-10M-FLT.pth', 15 | } 16 | } 17 | 18 | class SegmentFeature: 19 | def __init__(self, video_path_list, base_dir='preprocess'): 20 | self.video_path_list = video_path_list 21 | self.base_dir = base_dir 22 | self.seconds_per_feat = 2 23 | self.frames_per_feat = 10 24 | 25 | 26 | 27 | def create_textual_embedding(self): 28 | """use the sentence encoder model to embed the captions of all the videos""" 29 | model='text-embedding-3-large' 30 | for video_path in self.video_path_list: 31 | start_time = time.time() 32 | base_name = os.path.basename(video_path).replace(".mp4", "") 33 | video_dir = os.path.join(self.base_dir, base_name) 34 | with open(os.path.join(video_dir, 'captions.json')) as f: 35 | captions = json.load(f) 36 | caps = list(captions.values()) 37 | caption_emb = encode_sentences(sentence_list=caps, model_name=model) 38 | print(caption_emb) 39 | with open(os.path.join(video_dir, f'segment_textual_embedding.pkl'), 'wb') as f: 40 | pickle.dump(caption_emb, f) 41 | end_time = time.time() 42 | print(f"textual encoding time for video {base_name}: {round(end_time-start_time, 3)} seconds") 43 | 44 | 45 | def create_visual_embedding(self): 46 | start_time = time.time() 47 | cfg = model_cfgs['viclip-l-internvid-10m-flt'] 48 | model = get_viclip(cfg['size'], cfg['pretrained']) 49 | assert(type(model)==dict and model['viclip'] is not None and model['tokenizer'] is not None) 50 | clip, tokenizer = model['viclip'], model['tokenizer'] 51 | clip = clip.to("cuda") 52 | end_time = time.time() 53 | print(f'time for loading viCLIP model: {round(end_time-start_time, 3)} seconds') 54 | 55 | for video_path in self.video_path_list: 56 | base_name = os.path.basename(video_path).replace(".mp4", "") 57 | video_dir = os.path.join(self.base_dir, base_name) 58 | if not os.path.exists(video_dir): 59 | os.makedirs(video_dir) 60 | 61 | cap = cv2.VideoCapture(video_path) 62 | fps = round(cap.get(cv2.CAP_PROP_FPS)) 63 | total_frames = 
int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) 64 | frame_interval = fps*self.seconds_per_feat//self.frames_per_feat 65 | total_feats = total_frames//(fps*self.seconds_per_feat) 66 | 67 | segment_feats = [] 68 | start_time = time.time() 69 | cap.set(cv2.CAP_PROP_POS_FRAMES, 0) 70 | for segment_id in range(total_feats): 71 | frames = [] 72 | for i in range(self.frames_per_feat): 73 | success, frame = cap.read() 74 | frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) 75 | frames.append(frame) 76 | for j in range(frame_interval-1): #skip other frames 77 | success, frame = cap.read() 78 | for i in range(fps*self.seconds_per_feat-frame_interval*self.frames_per_feat): 79 | success, frame = cap.read() #skip remaining frames 80 | frames_tensor = frames2tensor(frames, device='cuda') 81 | with torch.no_grad(): 82 | vid_feat = get_vid_feat(frames_tensor, clip).cpu() 83 | segment_feats.append(vid_feat) 84 | segment_feats = torch.cat(segment_feats, dim=0).numpy() 85 | end_time = time.time() 86 | cap.release() 87 | print(segment_feats) 88 | print(f"visual embedding time for video {base_name}: {round(end_time-start_time, 3)} seconds") 89 | with open(os.path.join(video_dir, 'segment_visual_embedding.pkl'), 'wb') as f: 90 | pickle.dump(segment_feats, f) 91 | 92 | 93 | def run(self): 94 | self.create_textual_embedding() 95 | self.create_visual_embedding() 96 | 97 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | from sklearn.metrics.pairwise import cosine_similarity 2 | import numpy as np 3 | 4 | 5 | def compute_cosine_similarity(target_embedding, embedding_list): 6 | target_embedding_tensor = target_embedding.reshape(1, -1) 7 | # Compute cosine similarity 8 | similarity_scores = cosine_similarity(target_embedding_tensor, embedding_list) 9 | return similarity_scores.reshape(-1) 10 | 11 | 12 | def top_k_indices(scores, k): 13 | max_len = scores.shape[0] 14 | k = min(max_len, k) 15 | indices = np.argsort(scores)[-k:][::-1] 16 | return list(indices) -------------------------------------------------------------------------------- /video-llava.py: -------------------------------------------------------------------------------- 1 | import torch 2 | from videollava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN 3 | from videollava.conversation import conv_templates, SeparatorStyle 4 | from videollava.model.builder import load_pretrained_model 5 | from videollava.utils import disable_torch_init 6 | from videollava.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria 7 | import socket 8 | import os 9 | import pickle 10 | 11 | 12 | def main(): 13 | disable_torch_init() 14 | model_path = 'LanguageBind/Video-LLaVA-7B' 15 | cache_dir = 'cache_dir' 16 | device = 'cuda' 17 | load_4bit, load_8bit = True, False 18 | model_name = get_model_name_from_path(model_path) 19 | tokenizer, model, processor, _ = load_pretrained_model(model_path, None, model_name, load_8bit, load_4bit, device=device, cache_dir=cache_dir) 20 | video_processor = processor['video'] 21 | server = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) 22 | if not os.path.exists("tmp"): 23 | os.mkdir("tmp") 24 | if os.path.exists("tmp/vqa.sock"): 25 | os.unlink("tmp/vqa.sock") 26 | server.bind("tmp/vqa.sock") 27 | server.listen(0) 28 | print('ready for connection!') 29 | # with open("tmp/ready.txt", 'w') as f: 30 | # f.write("ready!") 31 | while True: 32 | connection, address = server.accept() 33 | r = 
connection.recv(1024).decode() 34 | # if r == "stop": 35 | # break 36 | with open('tmp/content.pkl', 'rb') as f: 37 | content = pickle.load(f) 38 | video_path = content['video_path'] 39 | questions = ['what is the video about?', content['question']] 40 | answers = [] 41 | print('\n'+video_path) 42 | for i in range(2): 43 | video_tensor = video_processor(video_path, return_tensors='pt')['pixel_values'] 44 | if type(video_tensor) is list: 45 | tensor = [video.to(model.device, dtype=torch.float16) for video in video_tensor] 46 | else: 47 | tensor = video_tensor.to(model.device, dtype=torch.float16) 48 | 49 | conv_mode = "llava_v1" 50 | conv = conv_templates[conv_mode].copy() 51 | roles = conv.roles 52 | 53 | print(f"{roles[1]}: {questions[i]}") 54 | question = ' '.join([DEFAULT_IMAGE_TOKEN] * model.get_video_tower().config.num_frames) + '\n' + questions[i] 55 | conv.append_message(conv.roles[0], question) 56 | conv.append_message(conv.roles[1], None) 57 | prompt = conv.get_prompt() 58 | input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 59 | stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 60 | keywords = [stop_str] 61 | stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids) 62 | #print('video & question processing done!') 63 | with torch.inference_mode(): 64 | output_ids = model.generate( 65 | input_ids, 66 | images=tensor, 67 | do_sample=True, 68 | temperature=0.1, 69 | max_new_tokens=1024, 70 | use_cache=True, 71 | stopping_criteria=[stopping_criteria]) 72 | 73 | outputs = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip() 74 | outputs = outputs.replace("</s>", "")  # strip the '</s>' stop token from the decoded output 75 | answers.append(outputs) 76 | reply = f"Segment description: {answers[0]}\nAnswer to the question: {answers[1]}" 77 | print(reply) 78 | with open('tmp/content.pkl', 'wb') as f: 79 | pickle.dump(reply, f) 80 | connection.send(b'sent') 81 | r = connection.recv(1024) 82 | connection.close() 83 | 84 | 85 | if __name__ == '__main__': 86 | main() --------------------------------------------------------------------------------
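video-llava.py above exposes Video-LLaVA as a long-running visual-question-answering service: it listens on the Unix socket tmp/vqa.sock, reads each request (a dict with 'video_path' and 'question') from tmp/content.pkl, answers two questions per request, writes the combined reply back into the same pickle, and then signals b'sent'. The calling side of this handshake is not among the file contents shown here, so the following is only a minimal client sketch consistent with that server loop; the function name ask_video_llava and the example question are illustrative, not taken from the repository.

import pickle
import socket


def ask_video_llava(video_path, question, sock_path="tmp/vqa.sock"):
    """Sketch of a client for the Unix-socket server in video-llava.py."""
    # 1. Write the request payload where the server expects to find it.
    with open("tmp/content.pkl", "wb") as f:
        pickle.dump({"video_path": video_path, "question": question}, f)

    # 2. Connect and send a wake-up message; the server decodes it but
    #    does not act on its content.
    client = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
    client.connect(sock_path)
    client.send(b"go")

    # 3. Wait for the b'sent' signal, then read the reply the server wrote
    #    back into tmp/content.pkl.
    client.recv(1024)
    with open("tmp/content.pkl", "rb") as f:
        reply = pickle.load(f)

    # 4. Acknowledge so the server's final recv() returns and it can close
    #    this connection and accept the next request.
    client.send(b"done")
    client.close()
    return reply


if __name__ == "__main__":
    print(ask_video_llava("preprocess/kitchen/segment_0.mp4", "What is the person doing?"))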