├── .gitignore
├── Crawler
│   ├── README.md
│   ├── download_cmd_generation.py
│   ├── download_jukin_video_id.py
│   ├── parallel_download_VATEX.py
│   └── parallel_download_jukin.py
├── README.md
├── pyproject.toml
└── valley
    ├── configs
    │   ├── deepspeed
    │   │   ├── config_zero2.json
    │   │   ├── config_zero3.json
    │   │   └── config_zero3_offload.json
    │   └── experiment
    │       ├── valley_stage1.yaml
    │       ├── valley_stage2.yaml
    │       ├── valley_stage2_lora.yaml
    │       └── valley_stage2_zero3.yaml
    ├── constants.py
    ├── conversation.py
    ├── data
    │   ├── dataset.py
    │   └── video_transform.py
    ├── inference
    │   ├── run_valley.py
    │   ├── run_valley_conv.py
    │   └── run_valley_llamma_v2.py
    ├── logo
    │   ├── demo.GIF
    │   ├── demo_screenshot.jpg
    │   └── lama_with_valley.jpeg
    ├── model
    │   ├── apply_delta.py
    │   ├── make_delta.py
    │   └── valley_model.py
    ├── serve
    │   ├── __init__.py
    │   ├── cli.py
    │   ├── controller.py
    │   ├── examples
    │   │   ├── images
    │   │   │   ├── c790e7358b6f9de50ccfc78d2fba1b97.jpg
    │   │   │   └── f4cefeeb3f10a2afb4bb077a415f9fb8.jpg
    │   │   └── videos
    │   │       ├── aa5dbc3a110f410bb02572408b0fb778.mp4
    │   │       └── dc52388394cc9f692d16a95d9833ca07.mp4
    │   ├── gateway
    │   │   ├── README.md
    │   │   └── nginx.conf
    │   ├── gradio_css.py
    │   ├── gradio_patch.py
    │   ├── gradio_web_server_video.py
    │   ├── model_worker.py
    │   ├── register_worker.py
    │   └── test_message.py
    ├── train
    │   ├── train.py
    │   ├── train.sh
    │   └── trainner.py
    ├── util
    │   ├── config.py
    │   ├── data_util.py
    │   └── decode_img.py
    └── utils.py
/.gitignore:
--------------------------------------------------------------------------------
1 | *-conv.json
2 | *.log
3 | **/__pycache__
4 | **.egg-info
5 | weight
6 | serve_images
7 | wandb
--------------------------------------------------------------------------------
/Crawler/README.md:
--------------------------------------------------------------------------------
1 | # Download Videos from VATEX and Jukin Media
2 | First, cd into the Crawler folder:
3 | ```shell
4 | cd Crawler
5 | ```
6 | ## VATEX
7 | Run the following script to generate the download commands and write them to a file:
8 | ```shell
9 | python download_cmd_generation.py --video_path [VIDEO SAVE FOLDER] --ann_file [VATEX ANNOTATION FILE PATH] --output_filename [CMD FILE OUTPUT PATH]
10 | ```
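
Each line of the generated file is a self-contained `yt-dlp` command (see `download_cmd_generation.py`). For a hypothetical video id `VIDEO_ID` and save folder `videos/`, a generated line looks roughly like this:

```shell
yt-dlp --merge-output-format mp4 "https://www.youtube.com/watch?v=VIDEO_ID" -o "videos/v_VIDEO_ID.mp4" || mv "videos/v_VIDEO_ID.mp4.part" "videos/v_VIDEO_ID.mp4"
```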
11 |
12 | Then run the following script to download the videos from YouTube. (**PS: this script calls the YouTube video download tool [yt-dlp](https://github.com/yt-dlp/yt-dlp), so you need to install it first.**)
13 |
14 | ```shell
15 | python parallel_download_VATEX.py --num_process 32 --cmd_file [THE FILE YOU GENERATE FROM LAST STEP]
16 | ```
17 |
18 | ## Jukin Media
19 | Run the following script to fetch the video IDs and video information from Jukin Media and save them to a file:
20 | ```shell
21 | python download_jukin_video_id.py --savefolder [THE VIDEO INFO SAVEFOLDER]
22 | ```
23 |
24 | Run the following script to download the videos:
25 | ```shell
26 | python parallel_download_jukin.py --save_dir [VIDEO SAVE FOLDER] --input_file [THE FILE YOU GENERATE FROM LAST STEP] --num_process 5
27 | ```
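
The `--input_file` is the JSON file produced in the previous step. As a rough sketch of its layout (the downloader only relies on the `jmId` field of each record; the ids below are made up and the other metadata fields returned by the site are omitted):

```json
{
  "Pets":  [ {"jmId": "1234567"}, {"jmId": "2345678"} ],
  "Fails": [ {"jmId": "3456789"} ]
}
```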
28 |
--------------------------------------------------------------------------------
/Crawler/download_cmd_generation.py:
--------------------------------------------------------------------------------
1 | ''' usage: generate a command script to download YouTube videos
2 | '''
3 | from argparse import ArgumentParser
4 | import glob
5 | import json
6 | import os
7 |
8 | def crosscheck_videos(video_path, ann_file):
9 | # Get existing videos
10 | existing_vids = glob.glob("%s/*.mp4" % video_path)
11 | for idx, vid in enumerate(existing_vids):
12 | basename = os.path.basename(vid).split(".mp4")[0]
13 | if len(basename) == 13:
14 | existing_vids[idx] = basename[2:]
15 | elif len(basename) == 11:
16 | existing_vids[idx] = basename
17 | else:
18 | raise RuntimeError("Unknown filename format: %s" % vid)
19 | # Read and get video IDs from the annotation file
20 | with open(ann_file, "r") as fobj:
21 | anet_v_1_0 = json.load(fobj)
22 | if 'VATEX' in video_path:
23 | all_vids = list(set(['_'.join(item['videoID'].split('_')[:-2]) for item in anet_v_1_0]))
24 | else:
25 | raise ValueError('Not VATEX-format data, you need to customize the code.')
26 | non_existing_videos = []
27 | for vid in all_vids:
28 | if vid in existing_vids:
29 | continue
30 | else:
31 | non_existing_videos.append(vid)
32 | return non_existing_videos
33 |
34 | def main(args):
35 | non_existing_videos = crosscheck_videos(args.video_path, args.ann_file)
36 | filename = os.path.join(args.video_path, "v_%s.mp4")
37 | cmd_base = "yt-dlp --merge-output-format mp4 "
38 | cmd_base += '"https://www.youtube.com/watch?v=%s" '
39 | cmd_base += '-o "%s" ' % filename
40 | cmd_base += '|| mv "%s.part" "%s"' % (filename,filename)
41 | with open(args.output_filename, "w") as fobj:
42 | for vid in non_existing_videos:
43 | cmd = cmd_base % (vid, vid, vid, vid)
44 | fobj.write("%s\n" % cmd)
45 |
46 | if __name__ == "__main__":
47 | parser = ArgumentParser(description="Script to double check video content.")
48 | parser.add_argument("--video_path", required=True, help="Where are the videos located? (Full path)")
49 | parser.add_argument("--ann_file", required=True, help="Where is the annotation file?")
50 | parser.add_argument("--output_filename", default='./VATEX/cmd_list.txt',required=True, help="Output script location.")
51 | args = parser.parse_args()
52 | main(args)
53 |
--------------------------------------------------------------------------------
/Crawler/download_jukin_video_id.py:
--------------------------------------------------------------------------------
1 | import requests
2 | import json as js
3 | import math
4 | from argparse import ArgumentParser
5 |
6 | def main(args):
7 | headers = {
8 | "X-Algolia-Api-Key": "a6099f9d3771d6ceb142321ac5273d16",
9 | "X-Algolia-Application-Id": "XSWHBQ6C6E",
10 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
11 | }
12 | category_number = {
13 | "Fails": 10000, "Pets": 10000, "Awesome": 9507, "Wildlife": 8896, "Humor": 6493, "Talent": 5471,
14 | "DIY": 2569, "Uplifting": 2431, "Newsworthy": 1957, "Cute": 1952, "Parenting": 1880, "Weather": 1630,
15 | "Fitness": 1385, "Family": 1296, "Art": 1154, "Food": 1116, "Crashes": 980, "Sports": 947, "Vehicles": 439,
16 | "Lifestyle": 370, "Nature": 330, "Travel": 294, "Crime": 161, "Paranormal": 115, "RecordSetter": 3, "Nitro Circus": 1
17 | }
18 |
19 | sum_data = 0
20 | for key in category_number:
21 | sum_data+=category_number[key]
22 | print('total number of videos: ', sum_data)
23 |
24 | result_number = dict()
25 | for category in category_number:
26 | page_number = math.ceil(category_number[category]/1000)
27 | data = []
28 | for i in range(page_number):
29 | json_data = {"query":"","userToken":"guest","hitsPerPage":1000,"page":i,"facets":["category"],"facetFilters":[["category:"+category]]}
30 | a = requests.post("https://xswhbq6c6e-2.algolianet.com/1/indexes/public_lp/query", headers=headers, json=json_data
31 | )
32 | data+=js.loads(a.content)['hits']
33 | result_number[category] = data
34 |
35 |
36 | js.dump(result_number,open(args.savefolder+'/'+'jukin-100k.json','w'))
37 |
38 | if __name__ == "__main__":
39 | parser = ArgumentParser(description="Script to fetch Jukin Media video IDs and metadata")
40 | parser.add_argument("--savefolder", default='./jukinmedia',)
41 | args = parser.parse_args()
42 | main(args)
--------------------------------------------------------------------------------
/Crawler/parallel_download_VATEX.py:
--------------------------------------------------------------------------------
1 | from argparse import ArgumentParser
2 | from multiprocessing import Pool
3 | import subprocess
4 | from tqdm import tqdm
5 | def download(cmd):
6 | try:
7 | subprocess.run(cmd, shell=True, capture_output=True)
8 | except:
9 | pass
10 | def main(args):
11 | cmd_list = open(args.cmd_file,'r').readlines()
12 | pbar = tqdm(total=len(cmd_list))
13 | pbar.set_description('download')
14 | update = lambda *args: pbar.update()
15 | p = Pool(int(args.num_process)) # number of processes in the pool
16 | for i, cmd in enumerate(cmd_list):
17 | p.apply_async(download, args = (cmd.strip(),), callback=update)
18 |
19 | print('Waiting for all subprocesses done...')
20 | p.close()
21 | p.join()
22 | print('All subprocesses done.')
23 |
24 | if __name__ == "__main__":
25 | parser = ArgumentParser(description="Script to download videos in parallel")
26 | parser.add_argument("--num_process", default=32,)
27 | parser.add_argument("--cmd_file", default='./VATEX/cmd_list.txt',)
28 | args = parser.parse_args()
29 | main(args)
--------------------------------------------------------------------------------
/Crawler/parallel_download_jukin.py:
--------------------------------------------------------------------------------
1 | from argparse import ArgumentParser
2 | import json as js
3 | from concurrent.futures import ThreadPoolExecutor
4 | import time
5 | import math
6 | import sys
7 | import time
8 | from concurrent.futures import ThreadPoolExecutor
9 | import threading
10 | from pathlib import Path
11 | import requests
12 | class ThreadPool:
13 | def __init__(self, max_thread_num=5):
14 | # Flag recording whether all threads have finished
15 | self.over = False
16 | # Return values collected from all finished worker threads
17 | self.results = []
18 | 
19 | # Function body executed by the worker threads
20 | self.func = None
21 | # Arguments passed to the worker threads; each element of the list is a tuple
22 | # e.g. for a function add(a, b) that returns the sum of a and b,
23 | # the list looks like [(1, 2), (3, 10), ...]
24 | # one thread can be created for each tuple in the list
25 | self.args_list = None
26 | # Number of tasks to complete, taken from the length of the argument list
27 | self.task_num = 0
28 | # Maximum number of threads the pool holds at once, default 5
29 | self.max_thread_num = max_thread_num
30 | # Initialize the thread pool
31 | self.pool = ThreadPoolExecutor(max_workers=max_thread_num)
32 | self.cond = threading.Condition()
33 | 
34 | # Set the parameters of the tasks executed in the thread pool
35 | def set_tasks(self, func, args_list):
36 | # Number of tasks to complete, taken from the length of the argument list
37 | self.task_num = len(args_list)
38 | # Argument list
39 | self.args_list = args_list
40 | # Function body executed in the threads
41 | self.func = func
42 |
43 | # Show a progress bar to track the completion progress of all tasks
44 | @staticmethod
45 | def show_process(desc_text, curr, total):
46 | proc = math.ceil(curr / total * 100)
47 | show_line = '\r' + desc_text + ':' + '>' * proc \
48 | + ' ' * (100 - proc) + '[%s%%]' % proc \
49 | + '[%s/%s]' % (curr, total)
50 | sys.stdout.write(show_line)
51 | sys.stdout.flush()
52 | time.sleep(0.1)
53 |
54 | # Callback invoked when a thread finishes; it does three things:
55 | # 1: monitor the completion progress of all tasks
56 | # 2: collect the results of finished tasks
57 | # 3: submit new tasks to the thread pool
58 | def get_result(self, future):
59 | # Monitor completion progress
60 | self.show_process('Task progress', self.task_num - len(self.args_list), self.task_num)
61 | # Append the return value to the results; if the function returns nothing, future.result() is None
62 | self.results.append(future.result())
63 | # If the argument list still has elements, there are more tasks to run
64 | if len(self.args_list):
65 | # Pop the arguments of the next task to execute
66 | args = self.args_list.pop()
67 | # Submit a new task to the pool: the first argument is the function, the rest are its arguments
68 | task = self.pool.submit(self.func, *args)
69 | # Attach the completion callback
70 | task.add_done_callback(self.get_result)
71 | else:
72 | # If the number of results equals the number of tasks, all tasks are finished
73 | if self.task_num == len(self.results):
74 | print('\n', 'All tasks finished')
75 | # Acquire the lock
76 | self.cond.acquire()
77 | # Notify the waiting thread
78 | self.cond.notify()
79 | # Release the lock
80 | self.cond.release()
81 | return
82 |
83 | def _start_tasks(self):
84 | # Fill the thread pool up to its maximum number of threads
85 | for i in range(self.max_thread_num):
86 | # Check whether all tasks have already been submitted, for the following reasons:
87 | # submitting a huge number of tasks at once makes the pool build a task queue, which uses a lot of memory
88 | # to reduce that overhead, every worker thread in this class submits a new task from its completion callback
89 | # and so on, until all tasks are done, so the task queue stays almost empty
90 | # 1: when the number of submitted tasks is smaller than the pool size, this loop will submit them all
91 | # 2: when the function runs very fast, all tasks may also already be submitted
92 | 
93 | # If the argument list still has elements, the pool limit has not been reached
94 | if len(self.args_list):
95 | # Pop one set of arguments, removing that task from the list
96 | args = self.args_list.pop()
97 | # Submit a new task to the pool
98 | task = self.pool.submit(self.func, *args)
99 | # Attach the completion callback
100 | task.add_done_callback(self.get_result)
101 | # All tasks have been submitted, exit the loop
102 | else:
103 | break
104 |
105 | # Get the final results after all threads have finished
106 | def final_results(self):
107 | # Start executing all tasks
108 | self._start_tasks()
109 | # Two cases when fetching the results:
110 | # all tasks are already done, so just return the results
111 | if self.task_num == len(self.results):
112 | return self.results
113 | # there are still unfinished tasks in the pool; the final results are only available once they all finish
114 | # this only happens when the pool is very large or the tasks are extremely slow
115 | else:
116 | # Acquire the lock
117 | self.cond.acquire()
118 | # Block the current thread and wait for the notification
119 | self.cond.wait()
120 | # Notification received, release the lock
121 | self.cond.release()
122 | # Return the collected results
123 | return self.results
124 | # the times parameter was used to simulate the duration of a network request
125 |
126 | def download(save_dir,jmId):
127 | headers = {
128 | "X-Algolia-Api-Key": "a6099f9d3771d6ceb142321ac5273d16",
129 | "X-Algolia-Application-Id": "XSWHBQ6C6E",
130 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
131 | }
132 | response = requests.post('https://www.jukinmedia.com/api/public/video/downloadVideo/'+jmId,headers=headers)
133 | video_response = requests.get(js.loads(response.content)['url'])
134 | f = open(save_dir/('v_'+str(jmId)+'.mp4'),'wb')
135 | f.write(video_response.content)
136 | f.close()
137 | print('{} succeed!'.format(jmId))
138 |
139 | def check_already(save_dir, args_list):
140 | already_file = list(map(lambda x:x.name.split('.')[0].split('_')[1],Path(save_dir).rglob('*.mp4')))
141 | result = []
142 | for _, arg in args_list:
143 | if arg not in already_file:
144 | result.append((_,arg))
145 | print('already {}, left {}'.format(len(already_file), len(result)))
146 | return result
147 |
148 |
149 | def main(args):
150 | input_file_path = Path(args.input_file)
151 | all_data = js.load(open(input_file_path,'r'))
152 |
153 | Path(args.save_dir).mkdir(exist_ok=True, parents=True)
154 | tp = ThreadPool(args.num_process)
155 | args_list = []
156 | for cat in all_data:
157 | args_list+=all_data[cat]
158 | args_list = [(args.save_dir,data['jmId']) for data in args_list]
159 | args_list = check_already(args.save_dir,args_list)
160 | tp.set_tasks(download, args_list)
161 | # Get the result of each thread
162 | res = tp.final_results()
163 |
164 | if __name__ == "__main__":
165 | parser = ArgumentParser(description="Script to download videos in parallel")
166 | parser.add_argument("--save_dir", required=True,)
167 | parser.add_argument("--input_file", default='./jukinmedia/jukin-100k.json',)
168 | parser.add_argument("--num_process", type = int, default=5)
169 | args = parser.parse_args()
170 | main(args)
171 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ⛰️Valley: Video Assistant with Large Language model Enhanced abilitY
2 | Understanding Complex Videos Relying on Large Language and Vision Models
3 | [[Project Page](https://valley-vl.github.io/)] [[Paper](https://arxiv.org/pdf/2306.07207.pdf)]
4 | The online demo is no longer available because we have released the code for offline demo deployment.
5 |
6 |
7 |
8 | **Video Assistant with Large Language model Enhanced abilitY**
9 | [Ruipu Luo*](https://github.com/RupertLuo), [Ziwang Zhao*](), [Min Yang*](https://github.com/feymanpriv) (*Equal Contribution)
10 |
11 |
12 | 
13 | Generated by stablecog via "A cute llama with valley"
14 |
15 |
16 | [](https://github.com/tatsu-lab/stanford_alpaca/blob/main/LICENSE)
17 | [](https://github.com/tatsu-lab/stanford_alpaca/blob/main/DATA_LICENSE)
18 | **Usage and License Notices**: The data, code and checkpoints are intended and licensed for research use only. They are also restricted to uses that follow the license agreements of LLaMA, Vicuna and GPT-4. The dataset is CC BY NC 4.0 (allowing only non-commercial use), and models trained using the dataset should not be used outside of research purposes.
19 |
20 | ## Release
21 | - [24/12/24] 🔥🔥🔥🔥 [**Valley 2.0**](https://github.com/bytedance/Valley), based on Qwen2.5, is now available!!! We strongly recommend using Valley 2.0, which has better code usability and better performance in both image and video understanding. **DOWNLOAD and TRY!** \[[code](https://github.com/bytedance/Valley)\] | \[[model](https://huggingface.co/bytedance-research/Valley-Eagle-7B)\]
22 | - [24/02/21] 🔥 **Important!!** Due to a previous preprocessing error, the valley_instruct dataset did not actually contain 73K samples; the actual number is 65K. This data error has now been fixed. The link is at [Valley-Instruct-65k](https://huggingface.co/datasets/luoruipu1/Valley-Instruct-65k). We also provide a script ```get_jukinmedia_videourl.py``` to get the URLs of the Jukin Media videos. Please see the Hugging Face repo for details.
23 | - [8/14] 🫧 We released the Chinese version of Valley! Its 7B and 13B weights are now available at [Chinese-Valley7B-V1](https://huggingface.co/Zhaoziwang/chinese_valley7b_v1) and [Chinese-Valley13B-V1](https://huggingface.co/Zhaoziwang/chinese_valley13b_v1).
24 | - [8/10] 🫧 Released the pretrain-stage weights of 13B and 7B: [Valley2-7b-pretrain](https://huggingface.co/luoruipu1/Valley2-7b-pretrain/), [valley-13b-pretrain](https://huggingface.co/luoruipu1/valley-13b-pretrain).
25 | - [8/8] 🫧 We released the self-collected and expanded instruction fine-tuning dataset ([Valley-Instruct-65K](https://huggingface.co/datasets/luoruipu1/Valley-Instruct-65K)).
26 | - [8/7] 🫧 We released [Valley2-7b](https://huggingface.co/luoruipu1/Valley2-7b), which replaces Vicuna with Llama 2.
27 | - [7/23] 🫧 We modified our training code to make it easier to train Valley and to support LoRA training.
28 | - [7/5] 🫧 Released the training code for Valley and uploaded our pretraining data.
29 | - [6/21] 🫧 Uploaded the offline demo code.
30 | - [6/14] 🫧 Built a share link ~[[demo]()]~.
31 | - [6/13] 🫧 We uploaded model weight of [Valley-13b-v1-delta](https://huggingface.co/luoruipu1/valley-13b-v1-delta).
32 | - [6/12] 🫧 We released Valley: Video Assistant with Large Language model Enhanced abilitY. Checkout the [paper](https://arxiv.org/pdf/2306.07207.pdf).
33 |
34 | ## Install
35 | 1. Clone this repository and navigate to the Valley folder
36 | ```
37 | git clone https://github.com/RupertLuo/Valley.git
38 | cd Valley
39 | ```
40 | 2. Install Package
41 | ```
42 | conda create -n valley python=3.10 -y
43 | conda activate valley
44 | pip install --upgrade pip
45 | pip install -e .
46 | ```
47 | ## Data
48 | In the pretrain stage, we use the data from [LLaVA-CC3M-Pretrain-595K](https://huggingface.co/datasets/liuhaotian/LLaVA-Pretrain) and [Valley-webvid2M-Pretrain-703K](https://huggingface.co/datasets/luoruipu1/Valley-webvid2M-Pretrain-703K), which we collected and filtered ourselves. For acquiring the image and video data, refer to [LLAVA](https://llava-vl.github.io/) and [Webvid](https://github.com/m-bain/webvid).
49 | 
50 | In the finetune stage, we use the data from [LLaVA-instruct-150K](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K), [VideoChat-instruct-11K](https://github.com/OpenGVLab/InternVideo/tree/main/Data/instruction_data) and our self-collected [Valley-Instruct-65K](https://huggingface.co/datasets/luoruipu1/Valley-Instruct-65K). For the images and videos of the first two parts, please refer to their official websites. Below we describe how we obtained the data we collected ourselves ([Valley-Instruct-65K](https://huggingface.co/datasets/luoruipu1/Valley-Instruct-65K)).
51 |
52 | 1. Part of Valley-Instruct-65K is collected from the open-source dataset [VATEX](https://eric-xw.github.io/vatex-website/explore.html), which contains about 20k downloadable videos. You can download the original annotation file ("ava_vatex_training_v1.0.json") from its official website. Its videos come from YouTube, and there are many open-source tools that can download YouTube videos by video ID. We provide a tool to download its videos; it is located in the [Crawler](./Crawler/) folder, please read the tool's [Readme.md](./Crawler/README.md) to use it.
53 | 2. Another part of Valley-Instruct-65K is collected from a video site named [JukinMedia](https://www.jukinmedia.com/), which contains a wide variety of videos. We also provide a tool to download JukinMedia videos and their high-quality descriptions; it is located in the [Crawler](./Crawler/) folder, please read the tool's [Readme.md](./Crawler/README.md) to use it.
54 |
55 |
56 | ## ValleyWeight
57 | ### Valley 13b v1
58 | We release [Valley-13b-v1](https://huggingface.co/luoruipu1/valley-13b-v1-delta) delta weights to comply with the LLaMA model license. You can apply these delta weights to the original LLaMA model weights through the instructions below:
59 |
60 | 1. Get the original LLaMA weights in the Hugging Face format by following the instructions [here](https://huggingface.co/docs/transformers/main/model_doc/llama).
61 | 2. Use the following script to get Valley weights by applying our delta ([13b-v1](https://huggingface.co/luoruipu1/valley-13b-v1-delta)).
62 | ```bash
63 | python3 valley/model/apply_delta.py \
64 | --base /path/to/llama-13b \
65 | --target /output/path/to/Valley-13B-v1 \
66 | --delta /path/to/valley-13b-v1-delta
67 | ```
68 | ### Valley2 7b
69 | For the Valley2-7b model, we provide the full weights directly; they are available [here](https://huggingface.co/luoruipu1/Valley2-7b).
70 |
71 | ### Chinese Valley 13b
72 | We now support **Chinese Valley**. We use "BelleGroup/BELLE-LLaMA-EXT-13B" as the LLM backbone and "OFA-Sys/chinese-clip-vit-large-patch14" as the visual backbone; the weights are available [here](https://huggingface.co/Zhaoziwang/chinese_valley_v1).
73 |
74 | ### Pretrain Weight
75 | We provide [13b](https://huggingface.co/luoruipu1/valley-13b-pretrain) and [7b](https://huggingface.co/luoruipu1/Valley2-7b-pretrain/) pre-trained weights so that people can fine-tune directly on our pre-trained weights with their own fine-tuning data.
76 |
77 | ## Web UI
78 |
79 | 
80 |
81 |
82 | The framework of this web UI comes from [LLaVA](https://github.com/haotian-liu/LLaVA) and [FastChat](https://github.com/lm-sys/FastChat); we modified part of the code so that the demo supports video and image input.
83 | #### launch a controller
84 | ```bash
85 | python valley/serve/controller.py
86 | ```
87 | #### launch a model worker
88 | ```bash
89 | python valley/serve/model_worker.py --model-path /path/to/valley-13b-v1
90 | ```
91 | PS: at present, only single-card mode is supported for loading the model, and at least 30 GB of GPU memory is required, so the graphics card needs to be at least a Tesla V100.
92 | #### launch a gradio demo
93 | ```bash
94 | python valley/serve/gradio_web_server_video.py --share
95 | ```
96 |
97 |
98 | ## Inference Valley in Command Line
99 | We have updated the inference code to be more convenient; it supports input in the OpenAI API format (see the sketch at the end of this section).
100 |
101 | Inference CLI
102 | ```
103 | python3 inference/run_valley.py --model-name [PATH TO VALLEY WEIGHT] --video_file [PATH TO VIDEO] --query [YOUR QUERY ON THE VIDEO]
104 | ```
105 |
106 | Inference Chinese Valley
107 | ```
108 | python3 inference/run_valley.py --model-name [PATH TO CHINESE VALLEY WEIGHT] --video_file [PATH TO VIDEO] --query [YOUR QUERY ON THE VIDEO] --system-prompt "你是大型语言视觉助手 Chinese-Valley。你能够理解用户提供的视觉内容或视频,并使用自然语言协助用户完成各种任务。请仔细按照人类的指令进行回答,并详细解释你的答案。"
109 | ```
110 |
111 | Inference in code
112 |
113 | - You can utilize the code located at [valley/inference/run_valley_llamma_v2.py](valley/inference/run_valley_llamma_v2.py) to run inference on a video. All that's required is a video path.
114 |
115 | ```bash
116 | python valley/inference/run_valley_llamma_v2.py --video_file [PATH TO VIDEO]
117 | ```
118 |
119 | - luoruipu1/Valley2-7b is used in the provided code.
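
As a minimal sketch of what OpenAI-API-style input looks like (illustrative only; the exact fields expected by `valley/inference/run_valley.py` may differ):

```python
# Illustrative only: standard OpenAI-style chat messages.
messages = [
    {"role": "system", "content": "You are Valley, a video assistant."},
    {"role": "user", "content": "Describe what happens in this video."},
]
```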
120 |
121 | ## Train Valley Step By Step
122 |
123 | Inspired by LLaVA, we adopt a two-stage training method. The pre-training stage uses [Valley-webvid2M-Pretrain-703K](https://huggingface.co/datasets/luoruipu1/Valley-webvid2M-Pretrain-703K) and [LLaVA-CC3M-Pretrain-595K](https://huggingface.co/datasets/liuhaotian/LLaVA-Pretrain), and the fine-tuning stage uses [LLaVA-instruct-150K](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K), [VideoChat-instruct-11K](https://github.com/OpenGVLab/InternVideo/tree/main/Data/instruction_data) and [Valley-Instruct-65K](https://huggingface.co/datasets/luoruipu1/Valley-Instruct-65K).
124 |
125 | We modified our code for training Valley and manage the model hyperparameters with YAML files. Run the following two scripts to perform Valley training.
126 |
127 | ### Pretrain
128 | The LLM backbones currently supported for pre-training are LLaMA (7B, 13B), Vicuna (7B, 13B), stable-vicuna (13B) and Llama 2 (chat-7B, chat-13B). You need to download these open-source language model weights yourself and convert them to the Hugging Face format; a conversion example is shown below.
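
For example, the original LLaMA checkpoints can be converted with the conversion script that ships with `transformers` (the paths below are placeholders):

```shell
python -m transformers.models.llama.convert_llama_weights_to_hf \
    --input_dir /path/to/downloaded/llama/weights --model_size 7B --output_dir /path/to/llama-7b-hf
```

Then launch pre-training: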
129 | ```shell
130 | bash valley/train/train.sh valley/configs/experiment/valley_stage1.yaml
131 | ```
132 |
133 | ### Finetune
134 |
135 | ```shell
136 | bash valley/train/train.sh valley/configs/experiment/valley_stage2.yaml
137 | ```
138 |
139 |
140 |
141 | ## Acknowledgement
142 |
143 | - [LLaVA](https://github.com/haotian-liu/LLaVA) & [MOSS](https://github.com/OpenLMLab/MOSS): Thanks to these two repositories for providing high-quality code; our code is based on them.
144 | ## Citation
145 | If the project is helpful to your research, please consider citing our paper as follows
146 |
147 | ```bibtex
148 | @misc{luo2023valley,
149 | title={Valley: Video Assistant with Large Language model Enhanced abilitY},
150 | author={Ruipu Luo and Ziwang Zhao and Min Yang and Junwei Dong and Minghui Qiu and Pengcheng Lu and Tao Wang and Zhongyu Wei},
151 | year={2023},
152 | eprint={2306.07207},
153 | archivePrefix={arXiv},
154 | primaryClass={cs.CV}
155 | }
156 | ```
157 |
--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
1 | [build-system]
2 | requires = ["setuptools>=61.0"]
3 | build-backend = "setuptools.build_meta"
4 |
5 | [project]
6 | name = "valley"
7 | version = "0.1.0"
8 | description = "A video assistant towards instruction tuning"
9 | readme = "README.md"
10 | requires-python = ">=3.8"
11 | classifiers = [
12 | "Programming Language :: Python :: 3",
13 | "License :: OSI Approved :: Apache Software License",
14 | ]
15 | dependencies = [
16 | "accelerate", "einops", "fastapi", "gradio==3.23", "markdown2[all]", "numpy",
17 | "requests", "sentencepiece", "tokenizers==0.12.1",
18 | "torch", "torchvision", "uvicorn",
19 | "transformers@git+https://github.com/huggingface/transformers.git@cae78c46",
20 | "ninja", "decord", "scikit-image", "opencv-python-headless", "peft",
21 | "deepspeed", "prettytable","wandb"
22 | ]
23 |
24 | [project.urls]
25 | "Bug Tracker" = "https://github.com/RupertLuo/Valley/issues"
26 |
27 | [tool.setuptools.packages.find]
28 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"]
29 |
30 | [tool.wheel]
31 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"]
--------------------------------------------------------------------------------
/valley/configs/deepspeed/config_zero2.json:
--------------------------------------------------------------------------------
1 | {
2 | "fp16": {
3 | "enabled": "auto",
4 | "loss_scale": 0,
5 | "loss_scale_window": 1000,
6 | "initial_scale_power": 16,
7 | "hysteresis": 2,
8 | "min_loss_scale": 1
9 | },
10 | "bf16": {
11 | "enabled": "auto"
12 | },
13 |
14 | "zero_optimization": {
15 | "stage": 2,
16 | "allgather_partitions": true,
17 | "allgather_bucket_size": 5e8,
18 | "overlap_comm": true,
19 | "reduce_scatter": true,
20 | "reduce_bucket_size": 5e8,
21 | "contiguous_gradients": true
22 | },
23 |
24 | "gradient_accumulation_steps": "auto",
25 | "gradient_clipping": "auto",
26 | "steps_per_print": 2000,
27 | "train_batch_size": "auto",
28 | "train_micro_batch_size_per_gpu": "auto",
29 | "wall_clock_breakdown": false
30 | }
--------------------------------------------------------------------------------
/valley/configs/deepspeed/config_zero3.json:
--------------------------------------------------------------------------------
1 | {
2 | "fp16": {
3 | "enabled": "auto",
4 | "loss_scale": 0,
5 | "loss_scale_window": 1000,
6 | "initial_scale_power": 16,
7 | "hysteresis": 2,
8 | "min_loss_scale": 1
9 | },
10 | "bf16": {
11 | "enabled": "auto"
12 | },
13 | "train_micro_batch_size_per_gpu": "auto",
14 | "train_batch_size": "auto",
15 | "gradient_accumulation_steps": "auto",
16 | "zero_optimization": {
17 | "stage": 3,
18 | "overlap_comm": true,
19 | "contiguous_gradients": true,
20 | "sub_group_size": 1e9,
21 | "reduce_bucket_size": "auto",
22 | "stage3_prefetch_bucket_size": "auto",
23 | "stage3_param_persistence_threshold": "auto",
24 | "stage3_max_live_parameters": 1e9,
25 | "stage3_max_reuse_distance": 1e9,
26 | "stage3_gather_16bit_weights_on_model_save": true
27 | }
28 | }
--------------------------------------------------------------------------------
/valley/configs/deepspeed/config_zero3_offload.json:
--------------------------------------------------------------------------------
1 | {
2 | "fp16": {
3 | "enabled": "auto",
4 | "loss_scale": 0,
5 | "loss_scale_window": 1000,
6 | "initial_scale_power": 16,
7 | "hysteresis": 2,
8 | "min_loss_scale": 1
9 | },
10 | "bf16": {
11 | "enabled": "auto"
12 | },
13 | "optimizer": {
14 | "type": "AdamW",
15 | "params": {
16 | "lr": "auto",
17 | "betas": "auto",
18 | "eps": "auto",
19 | "weight_decay": "auto"
20 | }
21 | },
22 | "scheduler": {
23 | "type": "WarmupLR",
24 | "params": {
25 | "warmup_min_lr": "auto",
26 | "warmup_max_lr": "auto",
27 | "warmup_num_steps": "auto"
28 | }
29 | },
30 | "zero_optimization": {
31 | "stage": 3,
32 | "offload_optimizer": {
33 | "device": "cpu",
34 | "pin_memory": true
35 | },
36 | "offload_param": {
37 | "device": "cpu",
38 | "pin_memory": true
39 | },
40 | "overlap_comm": true,
41 | "contiguous_gradients": true,
42 | "sub_group_size": 1e9,
43 | "reduce_bucket_size": "auto",
44 | "stage3_prefetch_bucket_size": "auto",
45 | "stage3_param_persistence_threshold": "auto",
46 | "stage3_max_live_parameters": 1e9,
47 | "stage3_max_reuse_distance": 1e9,
48 | "gather_16bit_weights_on_model_save": true
49 | },
50 | "gradient_accumulation_steps": "auto",
51 | "gradient_clipping": "auto",
52 | "train_batch_size": "auto",
53 | "train_micro_batch_size_per_gpu": "auto",
54 | "steps_per_print": 1e5,
55 | "wall_clock_breakdown": false
56 | }
--------------------------------------------------------------------------------
/valley/configs/experiment/valley_stage1.yaml:
--------------------------------------------------------------------------------
1 | model_name_or_path: Path/to/opensource/LLM
2 | data_path: Path/to/LLaVA-CC3M-Pretrain-595K/chat.json
3 | image_folder: Path/to/LLaVA-CC3M-Pretrain-595K/image_new
4 | video_data_path: Path/to/webvid_703K/chat.json
5 | video_folder: Path/to/webvid_703K/videos
6 | output_dir: Path/to/model/out/dir
7 | # experiment name
8 | project_name: valley
9 | run_name: valley_stage1
10 |
11 | # Whether to mask only the system prompt in the labels (everything else is left unmasked)
12 | only_mask_system: False
13 | # system prompt style
14 | conv_mode: v1
15 | # whether to freeze the backbone
16 | freeze_backbone: True
17 | # whether to tune the multimodal projection layer
18 | tune_mm_mlp_adapter: True
19 | # whether to use LoRA
20 | lora: False
21 | # whether multimodal
22 | is_multimodal: True
23 |
24 | num_train_epochs: 1
25 | per_device_train_batch_size: 16
26 | save_strategy: steps
27 | save_steps: 2400
28 | learning_rate: 2e-3
29 | gradient_checkpointing: True
30 |
31 | # whether to do a fast epoch
32 | fast_epoch: False
33 |
34 | vision_tower: openai/clip-vit-large-patch14
35 | mm_vision_select_layer: -2
36 | mm_use_im_start_end: True
37 | lazy_preprocess: True
38 | bf16: False
39 | fp16: True
40 | tf32: False
41 | per_device_eval_batch_size: 1
42 | gradient_accumulation_steps: 1
43 | evaluation_strategy: "no"
44 | save_total_limit: 1
45 | weight_decay: 0.
46 | warmup_ratio: 0.03
47 | lr_scheduler_type: cosine
48 | logging_steps: 1
49 | model_max_length: 2048
50 | adam_beta1: 0.9
51 | adam_beta2: 0.95
52 | deepspeed: valley/configs/deepspeed/config_zero2.json
53 | report_to: wandb
--------------------------------------------------------------------------------
/valley/configs/experiment/valley_stage2.yaml:
--------------------------------------------------------------------------------
1 | model_name_or_path: Path/to/pretrain/valley/from/stage1
2 | data_path: Path/to/LLaVA-Instruct-150K/llava_instruct_150k.json
3 | image_folder: Path/to/COCO/train2014
4 | video_data_path: Path/to/Valley-Instruct/valley_instruct_73k.json
5 | video_folder: Path/to/Valley-Instruct/videos
6 | output_dir: Model/Output/path
7 | prediction_file_name: Model/Output/path/eval_result.jsonl # evaluation file output path
8 | # experiment name
9 | project_name: valley
10 | run_name: valley_stage2
11 | # Whether to mask only the system prompt in the labels (everything else is left unmasked)
12 | only_mask_system: False
13 | # system prompt style
14 | conv_mode: v1
15 | # whether to freeze the backbone
16 | freeze_backbone: False
17 | # whether to tune the multimodal projection layer
18 | tune_mm_mlp_adapter: True
19 | # whether to use LoRA
20 | lora: False
21 | # whether multimodal
22 | is_multimodal: True
23 |
24 | num_train_epochs: 3
25 | per_device_train_batch_size: 1
26 | per_device_eval_batch_size: 1 # must be 1
27 | save_strategy: steps
28 | save_steps: 3000
29 | evaluation_strategy: 'no'
30 | eval_steps: 3000
31 | eval_num: 600
32 | use_legacy_prediction_loop: True
33 | predict_with_generate: True
34 | prediction_loss_only: False
35 | generation_max_length: 1536
36 | learning_rate: 2e-5
37 | gradient_checkpointing: True
38 |
39 | # whether to do a fast epoch
40 | fast_epoch: False
41 |
42 | vision_tower: openai/clip-vit-large-patch14
43 | mm_vision_select_layer: -2
44 | mm_use_im_start_end: True
45 | lazy_preprocess: True
46 | bf16: True
47 | fp16: False
48 | tf32: False
49 | gradient_accumulation_steps: 1
50 | weight_decay: 0.
51 | warmup_ratio: 0.03
52 | lr_scheduler_type: cosine
53 | logging_steps: 1
54 | model_max_length: 2048
55 | deepspeed: valley/configs/deepspeed/config_zero2.json
56 | report_to: wandb
--------------------------------------------------------------------------------
/valley/configs/experiment/valley_stage2_lora.yaml:
--------------------------------------------------------------------------------
1 | model_name_or_path: Path/to/pretrain/valley/from/stage1
2 | data_path: Path/to/LLaVA-Instruct-150K/llava_instruct_150k.json
3 | image_folder: Path/to/COCO/train2014
4 | video_data_path: Path/to/Valley-Instruct/valley_instruct_73k.json
5 | video_folder: Path/to/Valley-Instruct/videos
6 | output_dir: Model/Output/path
7 | prediction_file_name: Model/Output/path/eval_result.jsonl # evaluation file output path
8 | # experiment name
9 | project_name: valley
10 | run_name: valley_stage2_lora
11 | # Whether to mask only the system prompt in the labels (everything else is left unmasked)
12 | only_mask_system: False
13 | # system prompt style
14 | conv_mode: v1
15 | # whether to freeze the backbone
16 | freeze_backbone: False
17 | # whether to tune the multimodal projection layer
18 | tune_mm_mlp_adapter: True
19 | # whether to use LoRA
20 | lora: True
21 | # whether multimodal
22 | is_multimodal: True
23 |
24 | num_train_epochs: 3
25 | per_device_train_batch_size: 4
26 | save_strategy: 'no'
27 | lora_save_strategy: steps # if doing LoRA training, turn this on to save only the LoRA weights. Supports ['steps','epochs','no']
28 | save_steps: 5000
29 | learning_rate: 5e-4
30 | gradient_checkpointing: True
31 |
32 | # whether to do a fast epoch
33 | fast_epoch: False
34 |
35 | vision_tower: openai/clip-vit-large-patch14
36 | mm_vision_select_layer: -2
37 | mm_use_im_start_end: True
38 | lazy_preprocess: True
39 | bf16: False
40 | fp16: True
41 | tf32: False
42 | per_device_eval_batch_size: 1
43 | gradient_accumulation_steps: 1
44 | evaluation_strategy: "no"
45 | save_total_limit: 3
46 | weight_decay: 0.
47 | warmup_ratio: 0.03
48 | lr_scheduler_type: cosine
49 | logging_steps: 1
50 | model_max_length: 2048
51 | adam_beta1: 0.9
52 | adam_beta2: 0.95
53 | deepspeed: valley/configs/deepspeed/config_zero2.json
54 | report_to: wandb
--------------------------------------------------------------------------------
/valley/configs/experiment/valley_stage2_zero3.yaml:
--------------------------------------------------------------------------------
1 | model_name_or_path: Path/to/pretrain/valley/from/stage1
2 | data_path: Path/to/LLaVA-Instruct-150K/llava_instruct_150k.json
3 | image_folder: Path/to/COCO/train2014
4 | video_data_path: Path/to/Valley-Instruct/valley_instruct_73k.json
5 | video_folder: Path/to/Valley-Instruct/videos
6 | output_dir: Model/Output/path
7 | prediction_file_name: Model/Output/path/eval_result.jsonl # evaluation file output path
8 | # experiment name
9 | project_name: valley2
10 | run_name: valley_stage2_zero3
11 | # Whether to mask only the system prompt in the labels (everything else is left unmasked)
12 | only_mask_system: False
13 | # system prompt style
14 | conv_mode: v1
15 | # whether to freeze the backbone
16 | freeze_backbone: False
17 | # whether to tune the multimodal projection layer
18 | tune_mm_mlp_adapter: True
19 | # whether to freeze the multimodal projection layer
20 | freeze_mm_mlp_adapter: False
21 | # whether to use LoRA
22 | lora: False
23 | # whether multimodal
24 | is_multimodal: True
25 |
26 | num_train_epochs: 3
27 | per_device_train_batch_size: 1 # must be 1 for ZeRO-3
28 | per_device_eval_batch_size: 1 # must be 1
29 | save_strategy: steps
30 | save_steps: 3000
31 | evaluation_strategy: "no"
32 | eval_steps: 3000
33 | eval_num: 600
34 | use_legacy_prediction_loop: True
35 | predict_with_generate: True
36 | prediction_loss_only: False
37 | generation_max_length: 1536
38 | learning_rate: 2e-5
39 | gradient_checkpointing: True
40 |
42 | # whether to do a fast epoch
42 | fast_epoch: False
43 |
44 | vision_tower: openai/clip-vit-large-patch14
45 | mm_vision_select_layer: -2
46 | mm_use_im_start_end: True
47 | lazy_preprocess: True
48 | bf16: False
49 | fp16: True
50 | tf32: False
51 | gradient_accumulation_steps: 1
52 | weight_decay: 0.
53 | warmup_ratio: 0.03
54 | lr_scheduler_type: cosine
55 | logging_steps: 1
56 | model_max_length: 2048
57 | deepspeed: valley/configs/deepspeed/config_zero3.json
58 | report_to: wandb
--------------------------------------------------------------------------------
/valley/constants.py:
--------------------------------------------------------------------------------
1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30
2 | WORKER_HEART_BEAT_INTERVAL = 15
3 |
4 | LOGDIR = "."
5 |
--------------------------------------------------------------------------------
/valley/conversation.py:
--------------------------------------------------------------------------------
1 | import dataclasses
2 | from enum import auto, Enum
3 | from typing import List, Tuple
4 | import base64
5 |
6 | class SeparatorStyle(Enum):
7 | """Different separator style."""
8 | SINGLE = auto()
9 | TWO = auto()
10 |
11 |
12 | @dataclasses.dataclass
13 | class Conversation:
14 | """A class that keeps all conversation history."""
15 | system: str
16 | roles: List[str]
17 | messages: List[List[str]]
18 | offset: int
19 | sep_style: SeparatorStyle = SeparatorStyle.SINGLE
20 | sep: str = "###"
21 | sep2: str = None
22 | mode: str = None
23 | skip_next: bool = False
24 |
25 | def get_prompt(self):
26 | if self.sep_style == SeparatorStyle.SINGLE:
27 | ret = self.system + self.sep
28 | for role, message in self.messages:
29 | if message:
30 | if type(message) is tuple:
31 | message, _, _ = message
32 | ret += role + ": " + message + self.sep
33 | else:
34 | ret += role + ":"
35 | return ret
36 | elif self.sep_style == SeparatorStyle.TWO:
37 | seps = [self.sep, self.sep2]
38 | ret = self.system + seps[0]
39 | for i, (role, message) in enumerate(self.messages):
40 | if message:
41 | if type(message) is tuple:
42 | message, _, _ = message
43 | ret += role + ": " + message + seps[i % 2]
44 | else:
45 | ret += role + ":"
46 | return ret
47 | else:
48 | raise ValueError(f"Invalid style: {self.sep_style}")
49 |
50 | def append_message(self, role, message):
51 | self.messages.append([role, message])
52 |
53 |
54 | def get_video(self,):
55 | videos = []
56 | path_list = []
57 | for i, (role, msg) in enumerate(self.messages[self.offset:]):
58 | if i % 2 == 0:
59 | if type(msg) is tuple:
60 | msg, video_path, image_process_mode = msg
61 | path_list.append(video_path)
62 | with open(video_path, "rb") as videoFile:
63 | video_b64_str = base64.b64encode(videoFile.read())
64 | videos.append(video_b64_str)
65 | return videos, path_list
66 | def get_images(self, return_pil=False):
67 | images = []
68 | for i, (role, msg) in enumerate(self.messages[self.offset:]):
69 | if i % 2 == 0:
70 | if type(msg) is tuple:
71 | import base64
72 | from io import BytesIO
73 | from PIL import Image
74 | msg, image_list, image_process_mode = msg
75 | if type(image_list) is not list:
76 | image_list = [image_list]
77 | for image in image_list:
78 | if image_process_mode == "Pad":
79 | def expand2square(pil_img, background_color=(122, 116, 104)):
80 | width, height = pil_img.size
81 | if width == height:
82 | return pil_img
83 | elif width > height:
84 | result = Image.new(pil_img.mode, (width, width), background_color)
85 | result.paste(pil_img, (0, (width - height) // 2))
86 | return result
87 | else:
88 | result = Image.new(pil_img.mode, (height, height), background_color)
89 | result.paste(pil_img, ((height - width) // 2, 0))
90 | return result
91 | image = expand2square(image)
92 | elif image_process_mode == "Crop":
93 | pass
94 | elif image_process_mode == "Resize":
95 | image = image.resize((224, 224))
96 | else:
97 | raise ValueError(f"Invalid image_process_mode: {image_process_mode}")
98 | max_hw, min_hw = max(image.size), min(image.size)
99 | aspect_ratio = max_hw / min_hw
100 | max_len, min_len = 800, 400
101 | shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
102 | longest_edge = int(shortest_edge * aspect_ratio)
103 | W, H = image.size
104 | if H > W:
105 | H, W = longest_edge, shortest_edge
106 | else:
107 | H, W = shortest_edge, longest_edge
108 | image = image.resize((W, H))
109 | if return_pil:
110 | images.append(image)
111 | else:
112 | buffered = BytesIO()
113 | image.save(buffered, format="JPEG")
114 | img_b64_str = base64.b64encode(buffered.getvalue()).decode()
115 | images.append(img_b64_str)
116 | return images
117 |
118 | def to_gradio_chatbot(self):
119 | ret = []
120 | for i, (role, msg) in enumerate(self.messages[self.offset:]):
121 | if i % 2 == 0:
122 | if type(msg) is tuple:
123 | import base64
124 | from io import BytesIO
125 | msg, image, image_process_mode = msg
126 | img_str = ''
127 | max_hw, min_hw = max(image.size), min(image.size)
128 | aspect_ratio = max_hw / min_hw
129 | max_len, min_len = 800, 400
130 | shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
131 | longest_edge = int(shortest_edge * aspect_ratio)
132 | W, H = image.size
133 | if H > W:
134 | H, W = longest_edge, shortest_edge
135 | else:
136 | H, W = shortest_edge, longest_edge
137 | image = image.resize((W, H))
138 | # image = image.resize((224, 224))
139 | buffered = BytesIO()
140 | image.save(buffered, format="JPEG")
141 | img_b64_str = base64.b64encode(buffered.getvalue()).decode()
142 | img_str = img_str+f'<img src="data:image/jpeg;base64,{img_b64_str}" />'
143 | msg = msg.replace('<image>', '')+img_str
144 | ret.append([msg, None])
145 | else:
146 | ret.append([msg, None])
147 | else:
148 | ret[-1][-1] = msg
149 | return ret
150 |
151 | def video_to_gradio_chatbot(self):
152 | ret = []
153 | for i, (role, msg) in enumerate(self.messages[self.offset:]):
154 | if i % 2 == 0:
155 | if type(msg) is tuple:
156 |
157 | msg, video, image_process_mode = msg
158 | with open(video, "rb") as videoFile:
159 | video_b64_str = base64.b64encode(videoFile.read()).decode("utf-8")
160 | img_str = ''
161 | img_str = img_str+f'''<video controls><source src="data:video/mp4;base64,{video_b64_str}" type="video/mp4"></video>'''
164 | msg = msg.replace('