├── .gitignore ├── Crawler ├── README.md ├── download_cmd_generation.py ├── download_jukin_video_id.py ├── parallel_download_VATEX.py └── parallel_download_jukin.py ├── README.md ├── pyproject.toml └── valley ├── configs ├── deepspeed │ ├── config_zero2.json │ ├── config_zero3.json │ └── config_zero3_offload.json └── experiment │ ├── valley_stage1.yaml │ ├── valley_stage2.yaml │ ├── valley_stage2_lora.yaml │ └── valley_stage2_zero3.yaml ├── constants.py ├── conversation.py ├── data ├── dataset.py └── video_transform.py ├── inference ├── run_valley.py ├── run_valley_conv.py └── run_valley_llamma_v2.py ├── logo ├── demo.GIF ├── demo_screenshot.jpg └── lama_with_valley.jpeg ├── model ├── apply_delta.py ├── make_delta.py └── valley_model.py ├── serve ├── __init__.py ├── cli.py ├── controller.py ├── examples │ ├── images │ │ ├── c790e7358b6f9de50ccfc78d2fba1b97.jpg │ │ └── f4cefeeb3f10a2afb4bb077a415f9fb8.jpg │ └── videos │ │ ├── aa5dbc3a110f410bb02572408b0fb778.mp4 │ │ └── dc52388394cc9f692d16a95d9833ca07.mp4 ├── gateway │ ├── README.md │ └── nginx.conf ├── gradio_css.py ├── gradio_patch.py ├── gradio_web_server_video.py ├── model_worker.py ├── register_worker.py └── test_message.py ├── train ├── train.py ├── train.sh └── trainner.py ├── util ├── config.py ├── data_util.py └── decode_img.py └── utils.py /.gitignore: -------------------------------------------------------------------------------- 1 | *-conv.json 2 | *.log 3 | **/__pycache__ 4 | **.egg-info 5 | weight 6 | serve_images 7 | wandb -------------------------------------------------------------------------------- /Crawler/README.md: -------------------------------------------------------------------------------- 1 | # Download Videos from VATEX and Jukinmedia 2 | First, cd into the Crawler folder: 3 | ```shell 4 | cd Crawler 5 | ``` 6 | ## VATEX 7 | Run the following script to write the download command strings to a file: 8 | ```shell 9 | python download_cmd_generation.py --video_path [VIDEO SAVE FOLDER] --ann_file [VATEX ANNOTATION FILE PATH] --output_filename [CMD FILE OUTPUT PATH] 10 | ``` 11 | 12 | Then run the following script to download the videos from YouTube. (**PS: This script calls the YouTube video download tool [yt-dlp](https://github.com/yt-dlp/yt-dlp)**; you need to install it first.) 13 | 14 | ```shell 15 | python parallel_download_VATEX.py --num_process 32 --cmd_file [THE FILE YOU GENERATE FROM LAST STEP] 16 | ``` 17 | 18 | ## Jukin Media 19 | Run the following script to save the video_id and video information from JukinMedia to a file.
20 | ```shell 21 | python download_jukin_video_id.py --savefolder [THE VIDEO INFO SAVEFOLDER] 22 | ``` 23 | 24 | Run the following script to download the videos: 25 | ```shell 26 | python parallel_download_jukin.py --save_dir [VIDEO SAVE FOLDER] --input_file [THE FILE YOU GENERATE FROM LAST STEP] --num_process 5 27 | ``` 28 | -------------------------------------------------------------------------------- /Crawler/download_cmd_generation.py: -------------------------------------------------------------------------------- 1 | ''' usage: generate a command script to download YouTube videos 2 | ''' 3 | from argparse import ArgumentParser 4 | import glob 5 | import json 6 | import os 7 | 8 | def crosscheck_videos(video_path, ann_file): 9 | # Get existing videos 10 | existing_vids = glob.glob("%s/*.mp4" % video_path) 11 | for idx, vid in enumerate(existing_vids): 12 | basename = os.path.basename(vid).split(".mp4")[0] 13 | if len(basename) == 13: 14 | existing_vids[idx] = basename[2:] 15 | elif len(basename) == 11: 16 | existing_vids[idx] = basename 17 | else: 18 | raise RuntimeError("Unknown filename format: %s" % vid) 19 | # Read and get video IDs from the annotation file 20 | with open(ann_file, "r") as fobj: 21 | anet_v_1_0 = json.load(fobj) 22 | if 'VATEX' in video_path: 23 | all_vids = list(set(['_'.join(item['videoID'].split('_')[:-2]) for item in anet_v_1_0])) 24 | else: 25 | raise ValueError('Not VATEX format data, you need to customize the code.') 26 | non_existing_videos = [] 27 | for vid in all_vids: 28 | if vid in existing_vids: 29 | continue 30 | else: 31 | non_existing_videos.append(vid) 32 | return non_existing_videos 33 | 34 | def main(args): 35 | non_existing_videos = crosscheck_videos(args.video_path, args.ann_file) 36 | filename = os.path.join(args.video_path, "v_%s.mp4") 37 | cmd_base = "yt-dlp --merge-output-format mp4 " 38 | cmd_base += '"https://www.youtube.com/watch?v=%s" ' 39 | cmd_base += '-o "%s" ' % filename 40 | cmd_base += '|| mv "%s.part" "%s"' % (filename,filename) 41 | with open(args.output_filename, "w") as fobj: 42 | for vid in non_existing_videos: 43 | cmd = cmd_base % (vid, vid, vid, vid) 44 | fobj.write("%s\n" % cmd) 45 | 46 | if __name__ == "__main__": 47 | parser = ArgumentParser(description="Script to generate yt-dlp download commands for missing VATEX videos.") 48 | parser.add_argument("--video_path", required=True, help="Where are the videos located? 
(Full path)") 49 | parser.add_argument("--ann_file", required=True, help="Where is the annotation file?") 50 | parser.add_argument("--output_filename", default='./VATEX/cmd_list.txt',required=True, help="Output script location.") 51 | args = parser.parse_args() 52 | main(args) 53 | -------------------------------------------------------------------------------- /Crawler/download_jukin_video_id.py: -------------------------------------------------------------------------------- 1 | import requests 2 | import json as js 3 | import math 4 | from argparse import ArgumentParser 5 | 6 | def main(args): 7 | headers = { 8 | "X-Algolia-Api-Key": "a6099f9d3771d6ceb142321ac5273d16", 9 | "X-Algolia-Application-Id": "XSWHBQ6C6E", 10 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36" 11 | } 12 | category_number = { 13 | "Fails": 10000, "Pets": 10000, "Awesome": 9507, "Wildlife": 8896, "Humor": 6493, "Talent": 5471, 14 | "DIY": 2569, "Uplifting": 2431, "Newsworthy": 1957, "Cute": 1952, "Parenting": 1880, "Weather": 1630, 15 | "Fitness": 1385, "Family": 1296, "Art": 1154, "Food": 1116, "Crashes": 980, "Sports": 947, "Vehicles": 439, 16 | "Lifestyle": 370, "Nature": 330, "Travel": 294, "Crime": 161, "Paranormal": 115, "RecordSetter": 3, "Nitro Circus": 1 17 | } 18 | 19 | sum_data = 0 20 | for key in category_number: 21 | sum_data+=category_number[key] 22 | print('number of all vid: ',sum_data) 23 | 24 | result_number = dict() 25 | for category in category_number: 26 | page_number = math.ceil(category_number[category]/1000) 27 | data = [] 28 | for i in range(page_number): 29 | json_data = {"query":"","userToken":"guest","hitsPerPage":1000,"page":i,"facets":["category"],"facetFilters":[["category:"+category]]} 30 | a = requests.post("https://xswhbq6c6e-2.algolianet.com/1/indexes/public_lp/query", headers=headers, json=json_data 31 | ) 32 | data+=js.loads(a.content)['hits'] 33 | result_number[category] = data 34 | 35 | 36 | js.dump(result_number,open(args.savefolder+'/'+'jukin-100k.json','w')) 37 | 38 | if __name__ == "__main__": 39 | parser = ArgumentParser(description="Script to download jukinmedia video ids and metadata") 40 | parser.add_argument("--savefolder", default='./jukinmedia',) 41 | args = parser.parse_args() 42 | main(args) -------------------------------------------------------------------------------- /Crawler/parallel_download_VATEX.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | from multiprocessing import Pool 3 | import subprocess 4 | from tqdm import tqdm 5 | def download(cmd): 6 | try: 7 | subprocess.run(cmd, shell=True, capture_output=True) 8 | except: 9 | pass 10 | def main(args): 11 | cmd_list = open(args.cmd_file,'r').readlines() 12 | pbar = tqdm(total=len(cmd_list)) 13 | pbar.set_description('download') 14 | update = lambda *args: pbar.update() 15 | p = Pool(int(args.num_process)) # set the number of processes in the pool 16 | for i, cmd in enumerate(cmd_list): 17 | p.apply_async(download, args = (cmd.strip(),), callback=update) 18 | 19 | print('Waiting for all subprocesses done...') 20 | p.close() 21 | p.join() 22 | print('All subprocesses done.') 23 | 24 | if __name__ == "__main__": 25 | parser = ArgumentParser(description="Script to download videos in parallel") 26 | parser.add_argument("--num_process", default=32,) 27 | parser.add_argument("--cmd_file", default='./VATEX/cmd_list.txt',) 28 | args = parser.parse_args() 29 | main(args) 
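Note on the command template built by `download_cmd_generation.py` above: after the string concatenation, `cmd_base` contains four `%s` placeholders (the YouTube video id, the output path, the `.part` path and the final path — the last three come from `filename`, which itself keeps a `%s` slot), which is why `cmd = cmd_base % (vid, vid, vid, vid)` passes the same id four times. A minimal sketch of the expansion (the video id and save folder below are made-up examples):

```python
import os

# Hypothetical inputs, for illustration only.
vid = "dQw4w9WgXcQ"
filename = os.path.join("./VATEX", "v_%s.mp4")

cmd_base = "yt-dlp --merge-output-format mp4 "
cmd_base += '"https://www.youtube.com/watch?v=%s" '
cmd_base += '-o "%s" ' % filename                        # filename still contains a %s for the id
cmd_base += '|| mv "%s.part" "%s"' % (filename, filename)

# All four %s slots are filled with the same video id:
print(cmd_base % (vid, vid, vid, vid))
# -> yt-dlp --merge-output-format mp4 "https://www.youtube.com/watch?v=dQw4w9WgXcQ" -o "./VATEX/v_dQw4w9WgXcQ.mp4" || mv "./VATEX/v_dQw4w9WgXcQ.mp4.part" "./VATEX/v_dQw4w9WgXcQ.mp4"
```

Each generated line therefore downloads one missing video and, if yt-dlp leaves a partially merged `.part` file behind, renames it to the final path.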
-------------------------------------------------------------------------------- /Crawler/parallel_download_jukin.py: -------------------------------------------------------------------------------- 1 | from argparse import ArgumentParser 2 | import json as js 3 | from concurrent.futures import ThreadPoolExecutor 4 | import time 5 | import math 6 | import sys 7 | import time 8 | from concurrent.futures import ThreadPoolExecutor 9 | import threading 10 | from pathlib import Path 11 | import requests 12 | class ThreadPool: 13 | def __init__(self, max_thread_num=5): 14 | # whether all threads have finished 15 | self.over = False 16 | # return values collected from all finished worker threads 17 | self.results = [] 18 | 19 | # function body executed by the worker threads 20 | self.func = None 21 | # arguments passed to the worker threads; every element of the list is a tuple 22 | # e.g. for a function add(a, b) that returns the sum of a and b 23 | # the list looks like [(1,2),(3,10),...] 24 | # one thread can be created from each tuple in the list 25 | self.args_list = None 26 | # number of tasks to complete, taken from the length of the argument list 27 | self.task_num = 0 28 | # maximum number of threads the pool holds at the same time, default 5 29 | self.max_thread_num = max_thread_num 30 | # initialize the thread pool 31 | self.pool = ThreadPoolExecutor(max_workers=max_thread_num) 32 | self.cond = threading.Condition() 33 | 34 | # set the parameters of the tasks executed in the thread pool 35 | def set_tasks(self, func, args_list): 36 | # number of tasks to complete, taken from the length of the argument list 37 | self.task_num = len(args_list) 38 | # argument list 39 | self.args_list = args_list 40 | # function body executed in each thread 41 | self.func = func 42 | 43 | # show a progress bar to track the completion of all tasks 44 | @staticmethod 45 | def show_process(desc_text, curr, total): 46 | proc = math.ceil(curr / total * 100) 47 | show_line = '\r' + desc_text + ':' + '>' * proc \ 48 | + ' ' * (100 - proc) + '[%s%%]' % proc \ 49 | + '[%s/%s]' % (curr, total) 50 | sys.stdout.write(show_line) 51 | sys.stdout.flush() 52 | time.sleep(0.1) 53 | 54 | # callback invoked when a thread finishes; it does three things: 55 | # 1: monitor the completion progress of all tasks 56 | # 2: collect the result of the finished task 57 | # 3: submit a new task to the thread pool 58 | def get_result(self, future): 59 | # monitor completion progress 60 | self.show_process('task progress', self.task_num - len(self.args_list), self.task_num) 61 | # append the return value to the result list; if the function returns nothing, future.result() is None 62 | self.results.append(future.result()) 63 | # if the argument list still has elements, there are more tasks to run 64 | if len(self.args_list): 65 | # take out the arguments of the next task to execute 66 | args = self.args_list.pop() 67 | # submit a new task to the pool: the first argument is the function, the rest are its arguments 68 | task = self.pool.submit(self.func, *args) 69 | # attach the completion callback 70 | task.add_done_callback(self.get_result) 71 | else: 72 | # if the number of results equals the number of tasks, all tasks have finished 73 | if self.task_num == len(self.results): 74 | print('\n', 'all tasks finished') 75 | # acquire the lock 76 | self.cond.acquire() 77 | # notify 78 | self.cond.notify() 79 | # release the lock 80 | self.cond.release() 81 | return 82 | 83 | def _start_tasks(self): 84 | # fill the thread pool up to its maximum number of threads 85 | for i in range(self.max_thread_num): 86 | # check whether all tasks have already been submitted, for the following reason: 87 | # submitting a huge number of tasks at once makes the pool build a long task queue, which uses a lot of memory 88 | # to reduce that overhead, every worker thread submits a new task from its completion callback 89 | # this repeats until all tasks are done, so the task queue stays almost empty 90 | # 1: if the number of tasks is smaller than the pool size, all tasks will be submitted inside this loop 91 | # 2: if the function runs very fast, all tasks may also be submitted inside this loop 92 | 93 | # if the argument list still has elements, the pool limit has not been reached yet 94 | if len(self.args_list): 95 | # pop one set of arguments, removing that task from the list 96 | args = self.args_list.pop() 97 | # submit a new task to the pool 98 | task = self.pool.submit(self.func, *args) 99 | # attach the completion callback 100 | task.add_done_callback(self.get_result) 101 | # all tasks have been submitted, exit the loop 102 | else: 103 | break 104 | 105 | # get the final results after all threads have finished 106 | def final_results(self): 107 | # start executing all tasks 108 | self._start_tasks() 109 | # two cases when fetching the results: 110 | # all tasks have already finished, so simply return the results 111 | if self.task_num == len(self.results): 112 | return self.results 113 | # some threads are still running; the final results are only available once every task has finished 114 | # this only happens when the pool is very large or the tasks are very slow 115 | else: 116 | # acquire the lock 117 | self.cond.acquire() 118 | # block the current thread and wait for the notification 119 | self.cond.wait() 120 | # 
notification received; release the lock 121 | self.cond.release() 122 | # return the result set 123 | return self.results 124 | # the parameter `times` was used to simulate the duration of a network request 125 | 126 | def download(save_dir,jmId): 127 | headers = { 128 | "X-Algolia-Api-Key": "a6099f9d3771d6ceb142321ac5273d16", 129 | "X-Algolia-Application-Id": "XSWHBQ6C6E", 130 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36" 131 | } 132 | response = requests.post('https://www.jukinmedia.com/api/public/video/downloadVideo/'+jmId,headers=headers) 133 | video_response = requests.get(js.loads(response.content)['url']) 134 | f = open(save_dir/('v_'+str(jmId)+'.mp4'),'wb') 135 | f.write(video_response.content) 136 | f.close() 137 | print('{} succeeded!'.format(jmId)) 138 | 139 | def check_already(save_dir, args_list): 140 | already_file = list(map(lambda x:x.name.split('.')[0].split('_')[1],Path(save_dir).rglob('*.mp4'))) 141 | result = [] 142 | for _, arg in args_list: 143 | if arg not in already_file: 144 | result.append((_,arg)) 145 | print('already {}, left {}'.format(len(already_file), len(result))) 146 | return result 147 | 148 | 149 | def main(args): 150 | input_file_path = Path(args.input_file) 151 | all_data = js.load(open(input_file_path,'r')) 152 | 153 | Path(args.save_dir).mkdir(exist_ok=True, parents=True) 154 | tp = ThreadPool(args.num_process) 155 | args_list = [] 156 | for cat in all_data: 157 | args_list+=all_data[cat] 158 | args_list = [(Path(args.save_dir),data['jmId']) for data in args_list] # use a Path here so the "/" join in download() works 159 | args_list = check_already(args.save_dir,args_list) 160 | tp.set_tasks(download, args_list) 161 | # collect the result of every thread 162 | res = tp.final_results() 163 | 164 | if __name__ == "__main__": 165 | parser = ArgumentParser(description="Script to download jukinmedia videos in parallel") 166 | parser.add_argument("--save_dir", required=True,) 167 | parser.add_argument("--input_file", default='./jukinmedia/jukin-100k.json',) 168 | parser.add_argument("--num_process", type = int, default=5) 169 | args = parser.parse_args() 170 | main(args) 171 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ⛰️Valley: Video Assistant with Large Language model Enhanced abilitY 2 | Understanding Complex Videos Relying on Large Language and Vision Models 3 | [[Project Page](https://valley-vl.github.io/)] [[Paper](https://arxiv.org/pdf/2306.07207.pdf)] 4 | The online demo is no longer available because we have released the code for offline demo deployment. 5 | 6 | 7 | 8 | **Video Assistant with Large Language model Enhanced abilitY**
9 | [Ruipu Luo*](https://github.com/RupertLuo), [Ziwang Zhao*](), [Min Yang*](https://github.com/feymanpriv) (*Equal Contribution) 10 | 11 |
12 | 13 | Generated by stablecog via "A cute llama with valley" 14 |
15 | 16 | [![Code License](https://img.shields.io/badge/Code%20License-Apache_2.0-green.svg)](https://github.com/tatsu-lab/stanford_alpaca/blob/main/LICENSE) 17 | [![Data License](https://img.shields.io/badge/Data%20License-CC%20By%20NC%204.0-red.svg)](https://github.com/tatsu-lab/stanford_alpaca/blob/main/DATA_LICENSE) 18 | **Usage and License Notices**: The data, code and checkpoints are intended and licensed for research use only. They are also restricted to uses that follow the license agreements of LLaMA, Vicuna and GPT-4. The dataset is CC BY NC 4.0 (allowing only non-commercial use), and models trained using the dataset should not be used outside of research purposes. 19 | 20 | ## Release 21 | - [24/12/24] 🔥🔥🔥🔥 [**Valley 2.0**](https://github.com/bytedance/Valley), based on Qwen2.5, is now available!!! We strongly recommend using Valley 2.0, which has better code usability and better performance in both image and video understanding. **DOWNLOAD and TRY!** \[[code](https://github.com/bytedance/Valley)\] | \[[model](https://huggingface.co/bytedance-research/Valley-Eagle-7B)\] 22 | - [24/02/21] 🔥 **Important!!** Due to a previous preprocessing error, the valley_instruct dataset did not actually contain as many samples as originally reported; the actual number is 65K. This data error has now been fixed. The link is at [Valley-Instruct-65k](https://huggingface.co/datasets/luoruipu1/Valley-Instruct-65k). We also provide a script ```get_jukinmedia_videourl.py``` to get the URLs of the JukinMedia videos. Please see the Hugging Face repo for details. 23 | - [8/14] 🫧 We released the Chinese version of Valley! Its 7B and 13B weights are now available at [Chinese-Valley7B-V1](https://huggingface.co/Zhaoziwang/chinese_valley7b_v1) and [Chinese-Valley13B-V1](https://huggingface.co/Zhaoziwang/chinese_valley13b_v1). 24 | - [8/10] 🫧 Released the pretrain-stage weights of 13B and 7B: [Valley2-7b-pretrain](https://huggingface.co/luoruipu1/Valley2-7b-pretrain/), [valley-13b-pretrain](https://huggingface.co/luoruipu1/valley-13b-pretrain). 25 | - [8/8] 🫧 We released the self-collected and expanded instruction fine-tuning dataset ([Valley-Instruct-65K](https://huggingface.co/datasets/luoruipu1/Valley-Instruct-65K)). 26 | - [8/7] 🫧 We released [Valley2-7b](https://huggingface.co/luoruipu1/Valley2-7b), which replaces Vicuna with Llama 2. 27 | - [7/23] 🫧 We modified our training code to make it easier to train Valley and to support LoRA training. 28 | - [7/5] 🫧 Released the training code for Valley and uploaded our pretraining data. 29 | - [6/21] 🫧 Uploaded the offline demo code. 30 | - [6/14] 🫧 Built a share link ~[[demo]()]~. 31 | - [6/13] 🫧 We uploaded the model weights of [Valley-13b-v1-delta](https://huggingface.co/luoruipu1/valley-13b-v1-delta). 32 | - [6/12] 🫧 We released Valley: Video Assistant with Large Language model Enhanced abilitY. Check out the [paper](https://arxiv.org/pdf/2306.07207.pdf). 33 | 34 | ## Install 35 | 1. Clone this repository and navigate to the Valley folder 36 | ``` 37 | git clone https://github.com/RupertLuo/Valley.git 38 | cd Valley 39 | ``` 40 | 2. Install the package 41 | ``` 42 | conda create -n valley python=3.10 -y 43 | conda activate valley 44 | pip install --upgrade pip 45 | pip install -e . 46 | ``` 47 | ## Data 48 | In the pretrain stage, we use the data from [LLaVA-CC3M-Pretrain-595K](https://huggingface.co/datasets/liuhaotian/LLaVA-Pretrain) and the [Valley-webvid2M-Pretrain-703K](https://huggingface.co/datasets/luoruipu1/Valley-webvid2M-Pretrain-703K) collected and filtered by ourselves.
For the acquisition of the image and video data, refer to [LLaVA](https://llava-vl.github.io/) and [Webvid](https://github.com/m-bain/webvid). 49 | 50 | In the finetune stage, we use the data from [LLaVA-instruct-150K](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K), [VideoChat-instruct-11K](https://github.com/OpenGVLab/InternVideo/tree/main/Data/instruction_data) and our self-collected [Valley-Instruct-65K](https://huggingface.co/datasets/luoruipu1/Valley-Instruct-65K). For the images and videos of the first two parts, please refer to their official websites. Here we describe how we obtained the data we collected ourselves ([Valley-Instruct-65K](https://huggingface.co/datasets/luoruipu1/Valley-Instruct-65K)). 51 | 52 | 1. Part of Valley-Instruct-65K is collected from the open-source dataset [VATEX](https://eric-xw.github.io/vatex-website/explore.html), which contains about 20k downloadable videos. You can download the original annotation file ("ava_vatex_training_v1.0.json") from its official website. Its videos come from YouTube, and there are many open-source tools that can download YouTube videos by video id. We provide a tool to download its videos; the tool is located in the [Crawler](./Crawler/) folder. Please read the tool's [Readme.md](./Crawler/README.md) to use it. 53 | 2. Another part of Valley-Instruct-65K is collected from a video site named [JukinMedia](https://www.jukinmedia.com/), which contains a wide variety of videos. We also provide a tool to download JukinMedia videos and their high-quality descriptions; the tool is located in the [Crawler](./Crawler/) folder. Please read the tool's [Readme.md](./Crawler/README.md) to use it. 54 | 55 | 56 | ## ValleyWeight 57 | ### Valley 13b v1 58 | We release [Valley-13b-v1](https://huggingface.co/luoruipu1/valley-13b-v1-delta) delta weights to comply with the LLaMA model license. You can apply these delta weights to the original LLaMA model weights through the instructions below: 59 | 60 | 1. Get the original LLaMA weights in the Hugging Face format by following the instructions [here](https://huggingface.co/docs/transformers/main/model_doc/llama). 61 | 2. Use the following script to get the Valley weights by applying our delta ([13b-v1](https://huggingface.co/luoruipu1/valley-13b-v1-delta)). 62 | ```bash 63 | python3 valley/model/apply_delta.py \ 64 | --base /path/to/llama-13b \ 65 | --target /output/path/to/Valley-13B-v1 \ 66 | --delta /path/to/valley-13b-v1-delta 67 | ``` 68 | ### Valley2 7b 69 | For the Valley2-7b model, we provide the weights directly; the address is [here](https://huggingface.co/luoruipu1/Valley2-7b). 70 | 71 | ### Chinese Valley 13b 72 | We now support **Chinese Valley**. We use "BelleGroup/BELLE-LLaMA-EXT-13B" as the LLM backbone and "OFA-Sys/chinese-clip-vit-large-patch14" as the visual backbone; the address is [here](https://huggingface.co/Zhaoziwang/chinese_valley_v1). 73 | 74 | ### Pretrain Weight 75 | We provide [13b](https://huggingface.co/luoruipu1/valley-13b-pretrain) and [7b](https://huggingface.co/luoruipu1/Valley2-7b-pretrain/) pre-trained weights so that people can fine-tune directly on our pre-trained weights with their own fine-tuning data. 76 | 77 | ## Web UI 78 |
79 | [demo screenshot omitted] 80 |
81 | 82 | The framework of this web UI comes from [LLaVA](https://github.com/haotian-liu/LLaVA) and [FastChat](https://github.com/lm-sys/FastChat); we modified part of the code so that the demo supports video and image input. 83 | #### Launch a controller 84 | ```bash 85 | python valley/serve/controller.py 86 | ``` 87 | #### Launch a model worker 88 | ```bash 89 | python valley/serve/model_worker.py --model-path /path/to/valley-13b-v1 90 | ``` 91 | PS: At present, only single-GPU mode is supported for loading the model, and at least 30 GB of GPU memory is required, so the graphics card needs to be at least a Tesla V100. 92 | #### Launch a gradio demo 93 | ```bash 94 | python valley/serve/gradio_web_server_video.py --share 95 | ``` 96 | 97 | 98 | ## Inference Valley in Command Line 99 | We have updated the inference code to be more convenient, and it supports input in the form of the OpenAI API. 100 | 101 | Inference CLI 102 | ``` 103 | python3 inference/run_valley.py --model-name [PATH TO VALLEY WEIGHT] --video_file [PATH TO VIDEO] --query [YOUR QUERY ON THE VIDEO] 104 | ``` 105 | 106 | Inference Chinese Valley 107 | ``` 108 | python3 inference/run_valley.py --model-name [PATH TO CHINESE VALLEY WEIGHT] --video_file [PATH TO VIDEO] --query [YOUR QUERY ON THE VIDEO] --system-prompt "你是大型语言视觉助手 Chinese-Valley。你能够理解用户提供的视觉内容或视频,并使用自然语言协助用户完成各种任务。请仔细按照人类的指令进行回答,并详细解释你的答案。" 109 | ``` 110 | 111 | Inference in code 112 | 113 | - You can utilize the code located at [valley/inference/run_valley_llamma_v2.py](valley/inference/run_valley_llamma_v2.py) to run inference on a video. All that's required is a video path. 114 | 115 | ```bash 116 | python valley/inference/run_valley_llamma_v2.py --video_file 117 | ``` 118 | 119 | - luoruipu1/Valley2-7b is used in the provided code. 120 | 121 | ## Train Valley Step By Step 122 | 123 | Inspired by LLaVA, we adopt a two-stage training method. The pre-training stage uses [Valley-webvid2M-Pretrain-703K](https://huggingface.co/datasets/luoruipu1/Valley-webvid2M-Pretrain-703K) and [LLaVA-CC3M-Pretrain-595K](https://huggingface.co/datasets/liuhaotian/LLaVA-Pretrain), and the fine-tuning stage uses [LLaVA-instruct-150K](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K), [VideoChat-instruct-11K](https://github.com/OpenGVLab/InternVideo/tree/main/Data/instruction_data) and [Valley-Instruct-65K](https://huggingface.co/datasets/luoruipu1/Valley-Instruct-65K). 124 | 125 | We modified our training code for Valley and manage the model hyperparameters with yaml files (a short sketch of how these yaml fields map onto standard Hugging Face training arguments is given after the Acknowledgement section). Run the following two scripts to perform Valley training. 126 | 127 | ### Pretrain 128 | The LLM backbones currently supported for pre-training are LLaMA (7B, 13B), Vicuna (7B, 13B), stable-vicuna (13B) and Llama 2 (chat-7B, chat-13B). You need to download these open-source language model weights yourself and convert them to the Hugging Face format. 129 | ```shell 130 | bash valley/train/train.sh valley/configs/experiment/valley_stage1.yaml 131 | ``` 132 | 133 | ### Finetune 134 | 135 | ```shell 136 | bash valley/train/train.sh valley/configs/experiment/valley_stage2.yaml 137 | ``` 138 | 139 | 140 | 141 | ## Acknowledgement 142 | 143 | - [LLaVA](https://github.com/haotian-liu/LLaVA) & [MOSS](https://github.com/OpenLMLab/MOSS): Thanks to these two repositories for providing high-quality code; our code is based on them.
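For reference, the experiment files under `valley/configs/experiment/` (shown further below) mix standard Hugging Face trainer fields with Valley-specific switches such as `vision_tower`, `conv_mode` or `tune_mm_mlp_adapter`. The following is a minimal sketch of how the standard fields could be mapped onto `transformers.TrainingArguments`; it is an illustration only, under the assumption that the real wiring in `valley/train/train.py` and `train.sh` (not shown here) does something similar.

```python
# Illustrative sketch, not the project's actual training entry point.
import yaml
from transformers import TrainingArguments

with open("valley/configs/experiment/valley_stage1.yaml") as f:
    cfg = yaml.safe_load(f)

# Keys that TrainingArguments understands directly. The Valley-specific keys
# (vision_tower, conv_mode, lora, tune_mm_mlp_adapter, ...) and the mixed-precision
# / deepspeed fields (fp16, bf16, deepspeed) are assumed to be handled by the
# project's own argument classes and a GPU training environment.
hf_keys = [
    "output_dir", "num_train_epochs", "per_device_train_batch_size",
    "per_device_eval_batch_size", "gradient_accumulation_steps",
    "learning_rate", "weight_decay", "warmup_ratio", "lr_scheduler_type",
    "logging_steps", "save_strategy", "save_steps", "save_total_limit",
    "gradient_checkpointing", "run_name", "report_to",
]
kwargs = {k: cfg[k] for k in hf_keys if k in cfg}
kwargs["learning_rate"] = float(kwargs["learning_rate"])  # PyYAML parses "2e-3" as a string
training_args = TrainingArguments(**kwargs)
print(training_args.learning_rate, training_args.lr_scheduler_type)
```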
144 | ## Citation 145 | If the project is helpful to your research, please consider citing our paper as follows 146 | 147 | ```bibtex 148 | @misc{luo2023valley, 149 | title={Valley: Video Assistant with Large Language model Enhanced abilitY}, 150 | author={Ruipu Luo and Ziwang Zhao and Min Yang and Junwei Dong and Minghui Qiu and Pengcheng Lu and Tao Wang and Zhongyu Wei}, 151 | year={2023}, 152 | eprint={2306.07207}, 153 | archivePrefix={arXiv}, 154 | primaryClass={cs.CV} 155 | } 156 | ``` 157 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools>=61.0"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | name = "valley" 7 | version = "0.1.0" 8 | description = "A video assistant towards instruction tuning" 9 | readme = "README.md" 10 | requires-python = ">=3.8" 11 | classifiers = [ 12 | "Programming Language :: Python :: 3", 13 | "License :: OSI Approved :: Apache Software License", 14 | ] 15 | dependencies = [ 16 | "accelerate", "einops", "fastapi", "gradio==3.23", "markdown2[all]", "numpy", 17 | "requests", "sentencepiece", "tokenizers==0.12.1", 18 | "torch", "torchvision", "uvicorn", 19 | "transformers@git+https://github.com/huggingface/transformers.git@cae78c46", 20 | "ninja", "decord","einops","scikit-image","opencv-python-headless","peft", 21 | "deepspeed", "prettytable","wandb" 22 | ] 23 | 24 | [project.urls] 25 | "Bug Tracker" = "https://github.com/RupertLuo/Valley/issues" 26 | 27 | [tool.setuptools.packages.find] 28 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"] 29 | 30 | [tool.wheel] 31 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"] -------------------------------------------------------------------------------- /valley/configs/deepspeed/config_zero2.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | 14 | "zero_optimization": { 15 | "stage": 2, 16 | "allgather_partitions": true, 17 | "allgather_bucket_size": 5e8, 18 | "overlap_comm": true, 19 | "reduce_scatter": true, 20 | "reduce_bucket_size": 5e8, 21 | "contiguous_gradients": true 22 | }, 23 | 24 | "gradient_accumulation_steps": "auto", 25 | "gradient_clipping": "auto", 26 | "steps_per_print": 2000, 27 | "train_batch_size": "auto", 28 | "train_micro_batch_size_per_gpu": "auto", 29 | "wall_clock_breakdown": false 30 | } -------------------------------------------------------------------------------- /valley/configs/deepspeed/config_zero3.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "train_micro_batch_size_per_gpu": "auto", 14 | "train_batch_size": "auto", 15 | "gradient_accumulation_steps": "auto", 16 | "zero_optimization": { 17 | "stage": 3, 18 | "overlap_comm": true, 19 | "contiguous_gradients": true, 20 | "sub_group_size": 1e9, 21 | "reduce_bucket_size": "auto", 22 | "stage3_prefetch_bucket_size": "auto", 23 | 
"stage3_param_persistence_threshold": "auto", 24 | "stage3_max_live_parameters": 1e9, 25 | "stage3_max_reuse_distance": 1e9, 26 | "stage3_gather_16bit_weights_on_model_save": true 27 | } 28 | } -------------------------------------------------------------------------------- /valley/configs/deepspeed/config_zero3_offload.json: -------------------------------------------------------------------------------- 1 | { 2 | "fp16": { 3 | "enabled": "auto", 4 | "loss_scale": 0, 5 | "loss_scale_window": 1000, 6 | "initial_scale_power": 16, 7 | "hysteresis": 2, 8 | "min_loss_scale": 1 9 | }, 10 | "bf16": { 11 | "enabled": "auto" 12 | }, 13 | "optimizer": { 14 | "type": "AdamW", 15 | "params": { 16 | "lr": "auto", 17 | "betas": "auto", 18 | "eps": "auto", 19 | "weight_decay": "auto" 20 | } 21 | }, 22 | "scheduler": { 23 | "type": "WarmupLR", 24 | "params": { 25 | "warmup_min_lr": "auto", 26 | "warmup_max_lr": "auto", 27 | "warmup_num_steps": "auto" 28 | } 29 | }, 30 | "zero_optimization": { 31 | "stage": 3, 32 | "offload_optimizer": { 33 | "device": "cpu", 34 | "pin_memory": true 35 | }, 36 | "offload_param": { 37 | "device": "cpu", 38 | "pin_memory": true 39 | }, 40 | "overlap_comm": true, 41 | "contiguous_gradients": true, 42 | "sub_group_size": 1e9, 43 | "reduce_bucket_size": "auto", 44 | "stage3_prefetch_bucket_size": "auto", 45 | "stage3_param_persistence_threshold": "auto", 46 | "stage3_max_live_parameters": 1e9, 47 | "stage3_max_reuse_distance": 1e9, 48 | "gather_16bit_weights_on_model_save": true 49 | }, 50 | "gradient_accumulation_steps": "auto", 51 | "gradient_clipping": "auto", 52 | "train_batch_size": "auto", 53 | "train_micro_batch_size_per_gpu": "auto", 54 | "steps_per_print": 1e5, 55 | "wall_clock_breakdown": false 56 | } -------------------------------------------------------------------------------- /valley/configs/experiment/valley_stage1.yaml: -------------------------------------------------------------------------------- 1 | model_name_or_path: Path/to/opensource/LLM 2 | data_path: Path/to/LLaVA-CC3M-Pretrain-595K/chat.json 3 | image_folder: Path/to/LLaVA-CC3M-Pretrain-595K/image_new 4 | video_data_path: Path/to/webvid_703K/chat.json 5 | video_folder: Path/to/webvid_703K/videos 6 | output_dir: Path/to/model/out/dir 7 | # experiment name 8 | project_name: valley 9 | run_name: valley_stage1 10 | 11 | # Whether to make the system prompt a mask in the label, and others do not mask 12 | only_mask_system: False 13 | # system prompt style 14 | conv_mode: v1 15 | # wether freeze backbone 16 | freeze_backbone: True 17 | # wether tune multimodal projection layer 18 | tune_mm_mlp_adapter: True 19 | # wether lora 20 | lora: False 21 | # wether multimodal 22 | is_multimodal: True 23 | 24 | num_train_epochs: 1 25 | per_device_train_batch_size: 16 26 | save_strategy: steps 27 | save_steps: 2400 28 | learning_rate: 2e-3 29 | gradient_checkpointing: True 30 | 31 | # wether do fast epoch 32 | fast_epoch: False 33 | 34 | vision_tower: openai/clip-vit-large-patch14 35 | mm_vision_select_layer: -2 36 | mm_use_im_start_end: True 37 | lazy_preprocess: True 38 | bf16: False 39 | fp16: True 40 | tf32: False 41 | per_device_eval_batch_size: 1 42 | gradient_accumulation_steps: 1 43 | evaluation_strategy: "no" 44 | save_total_limit: 1 45 | weight_decay: 0. 
46 | warmup_ratio: 0.03 47 | lr_scheduler_type: cosine 48 | logging_steps: 1 49 | model_max_length: 2048 50 | adam_beta1: 0.9 51 | adam_beta2: 0.95 52 | deepspeed: valley/configs/deepspeed/config_zero2.json 53 | report_to: wandb -------------------------------------------------------------------------------- /valley/configs/experiment/valley_stage2.yaml: -------------------------------------------------------------------------------- 1 | model_name_or_path: Path/ to/ pretrain/ valley/ from/ stage1 2 | data_path: Path/ to/ LLaVA-Instruct-150K/ llava_instruct_150k.json 3 | image_folder: Path/ to/ COCO/ train2014 4 | video_data_path: /Path/ to/ Valley-Instruct/ valley_instruct_73k.json 5 | video_folder: Path/ to/ Valley-Instruct/ videos 6 | output_dir: Model/ Output/ path 7 | prediction_file_name: Model/ Output/ path/ eval_result.jsonl # evaluation file output path 8 | # experiment name 9 | project_name: valley 10 | run_name: valley_stage2 11 | # Whether to make the system prompt a mask in the label, and others do not mask 12 | only_mask_system: False 13 | # system prompt style 14 | conv_mode: v1 15 | # wether freeze backbone 16 | freeze_backbone: False 17 | # wether tune multimodal projection layer 18 | tune_mm_mlp_adapter: True 19 | # wether lora 20 | lora: False 21 | # wether multimodal 22 | is_multimodal: True 23 | 24 | num_train_epochs: 3 25 | per_device_train_batch_size: 1 26 | per_device_eval_batch_size: 1 # must 1 27 | save_strategy: steps 28 | save_steps: 3000 29 | evaluation_strategy: 'no' 30 | eval_steps: 3000 31 | eval_num: 600 32 | use_legacy_prediction_loop: True 33 | predict_with_generate: True 34 | prediction_loss_only: False 35 | generation_max_length: 1536 36 | learning_rate: 2e-5 37 | gradient_checkpointing: True 38 | 39 | # wether do fast epoch 40 | fast_epoch: False 41 | 42 | vision_tower: openai/clip-vit-large-patch14 43 | mm_vision_select_layer: -2 44 | mm_use_im_start_end: True 45 | lazy_preprocess: True 46 | bf16: True 47 | fp16: False 48 | tf32: False 49 | gradient_accumulation_steps: 1 50 | weight_decay: 0. 
51 | warmup_ratio: 0.03 52 | lr_scheduler_type: cosine 53 | logging_steps: 1 54 | model_max_length: 2048 55 | deepspeed: valley/configs/deepspeed/config_zero2.json 56 | report_to: wandb -------------------------------------------------------------------------------- /valley/configs/experiment/valley_stage2_lora.yaml: -------------------------------------------------------------------------------- 1 | model_name_or_path: Path/ to/ pretrain/ valley/ from/ stage1 2 | data_path: Path/ to/ LLaVA-Instruct-150K/ llava_instruct_150k.json 3 | image_folder: Path/ to/ COCO/ train2014 4 | video_data_path: /Path/ to/ Valley-Instruct/ valley_instruct_73k.json 5 | video_folder: Path/ to/ Valley-Instruct/ videos 6 | output_dir: Model/ Output/ path 7 | prediction_file_name: Model/ Output/ path/ eval_result.jsonl # evaluation file output path 8 | # experiment name 9 | project_name: valley 10 | run_name: valley_stage2_lora 11 | # Whether to make the system prompt a mask in the label, and others do not mask 12 | only_mask_system: False 13 | # system prompt style 14 | conv_mode: v1 15 | # wether freeze backbone 16 | freeze_backbone: False 17 | # wether tune multimodal projection layer 18 | tune_mm_mlp_adapter: True 19 | # wether lora 20 | lora: True 21 | # wether multimodal 22 | is_multimodal: True 23 | 24 | num_train_epochs: 3 25 | per_device_train_batch_size: 4 26 | save_strategy: 'no' 27 | lora_save_strategy: steps # if do lora training, turn on this button, to only save lora weight. support ['steps','epochs','no'] 28 | save_steps: 5000 29 | learning_rate: 5e-4 30 | gradient_checkpointing: True 31 | 32 | # wether do fast epoch 33 | fast_epoch: False 34 | 35 | vision_tower: openai/clip-vit-large-patch14 36 | mm_vision_select_layer: -2 37 | mm_use_im_start_end: True 38 | lazy_preprocess: True 39 | bf16: False 40 | fp16: True 41 | tf32: False 42 | per_device_eval_batch_size: 1 43 | gradient_accumulation_steps: 1 44 | evaluation_strategy: "no" 45 | save_total_limit: 3 46 | weight_decay: 0. 
47 | warmup_ratio: 0.03 48 | lr_scheduler_type: cosine 49 | logging_steps: 1 50 | model_max_length: 2048 51 | adam_beta1: 0.9 52 | adam_beta2: 0.95 53 | deepspeed: valley/configs/deepspeed/config_zero2.json 54 | report_to: wandb -------------------------------------------------------------------------------- /valley/configs/experiment/valley_stage2_zero3.yaml: -------------------------------------------------------------------------------- 1 | model_name_or_path: Path/ to/ pretrain/ valley/ from/ stage1 2 | data_path: Path/ to/ LLaVA-Instruct-150K/ llava_instruct_150k.json 3 | image_folder: Path/ to/ COCO/ train2014 4 | video_data_path: /Path/ to/ Valley-Instruct/ valley_instruct_73k.json 5 | video_folder: Path/ to/ Valley-Instruct/ videos 6 | output_dir: Model/ Output/ path 7 | prediction_file_name: Model/ Output/ path/ eval_result.jsonl # evaluation file output path 8 | # experiment name 9 | project_name: valley2 10 | run_name: valley_stage2_zero3 11 | # Whether to make the system prompt a mask in the label, and others do not mask 12 | only_mask_system: False 13 | # system prompt style 14 | conv_mode: v1 15 | # wether freeze backbone 16 | freeze_backbone: False 17 | # wether tune multimodal projection layer 18 | tune_mm_mlp_adapter: True 19 | # wether freeze multimodal projection layer 20 | freeze_mm_mlp_adapter: False 21 | # wether lora 22 | lora: False 23 | # wether multimodal 24 | is_multimodal: True 25 | 26 | num_train_epochs: 3 27 | per_device_train_batch_size: 1 # zero3 must 1 28 | per_device_eval_batch_size: 1 # must 1 29 | save_strategy: steps 30 | save_steps: 3000 31 | evaluation_strategy: "no" 32 | eval_steps: 3000 33 | eval_num: 600 34 | use_legacy_prediction_loop: True 35 | predict_with_generate: True 36 | prediction_loss_only: False 37 | generation_max_length: 1536 38 | learning_rate: 2e-5 39 | gradient_checkpointing: True 40 | 41 | # wether do fast epoch 42 | fast_epoch: False 43 | 44 | vision_tower: openai/clip-vit-large-patch14 45 | mm_vision_select_layer: -2 46 | mm_use_im_start_end: True 47 | lazy_preprocess: True 48 | bf16: False 49 | fp16: True 50 | tf32: False 51 | gradient_accumulation_steps: 1 52 | weight_decay: 0. 53 | warmup_ratio: 0.03 54 | lr_scheduler_type: cosine 55 | logging_steps: 1 56 | model_max_length: 2048 57 | deepspeed: valley/configs/deepspeed/config_zero3.json 58 | report_to: wandb -------------------------------------------------------------------------------- /valley/constants.py: -------------------------------------------------------------------------------- 1 | CONTROLLER_HEART_BEAT_EXPIRATION = 30 2 | WORKER_HEART_BEAT_INTERVAL = 15 3 | 4 | LOGDIR = "." 
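The two heartbeat constants above are used by the serving stack (controller and model worker). Below is an assumed illustration of how such constants typically interact; the actual logic lives in `valley/serve/controller.py` and `valley/serve/model_worker.py`, which are not reproduced here.

```python
# Assumed illustration only -- not code from valley/serve/controller.py.
import time

CONTROLLER_HEART_BEAT_EXPIRATION = 30  # seconds of silence before a worker is considered dead
WORKER_HEART_BEAT_INTERVAL = 15        # seconds between heartbeats sent by each worker

def worker_is_alive(last_heart_beat: float) -> bool:
    # A worker pings every 15 s, so missing two consecutive heartbeats (30 s)
    # is treated as the worker having gone away.
    return time.time() - last_heart_beat < CONTROLLER_HEART_BEAT_EXPIRATION
```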
5 | -------------------------------------------------------------------------------- /valley/conversation.py: -------------------------------------------------------------------------------- 1 | import dataclasses 2 | from enum import auto, Enum 3 | from typing import List, Tuple 4 | import base64 5 | 6 | class SeparatorStyle(Enum): 7 | """Different separator style.""" 8 | SINGLE = auto() 9 | TWO = auto() 10 | 11 | 12 | @dataclasses.dataclass 13 | class Conversation: 14 | """A class that keeps all conversation history.""" 15 | system: str 16 | roles: List[str] 17 | messages: List[List[str]] 18 | offset: int 19 | sep_style: SeparatorStyle = SeparatorStyle.SINGLE 20 | sep: str = "###" 21 | sep2: str = None 22 | mode: str = None 23 | skip_next: bool = False 24 | 25 | def get_prompt(self): 26 | if self.sep_style == SeparatorStyle.SINGLE: 27 | ret = self.system + self.sep 28 | for role, message in self.messages: 29 | if message: 30 | if type(message) is tuple: 31 | message, _, _ = message 32 | ret += role + ": " + message + self.sep 33 | else: 34 | ret += role + ":" 35 | return ret 36 | elif self.sep_style == SeparatorStyle.TWO: 37 | seps = [self.sep, self.sep2] 38 | ret = self.system + seps[0] 39 | for i, (role, message) in enumerate(self.messages): 40 | if message: 41 | if type(message) is tuple: 42 | message, _, _ = message 43 | ret += role + ": " + message + seps[i % 2] 44 | else: 45 | ret += role + ":" 46 | return ret 47 | else: 48 | raise ValueError(f"Invalid style: {self.sep_style}") 49 | 50 | def append_message(self, role, message): 51 | self.messages.append([role, message]) 52 | 53 | 54 | def get_video(self,): 55 | videos = [] 56 | path_list = [] 57 | for i, (role, msg) in enumerate(self.messages[self.offset:]): 58 | if i % 2 == 0: 59 | if type(msg) is tuple: 60 | msg, video_path, image_process_mode = msg 61 | path_list.append(video_path) 62 | with open(video_path, "rb") as videoFile: 63 | video_b64_str = base64.b64encode(videoFile.read()) 64 | videos.append(video_b64_str) 65 | return videos, path_list 66 | def get_images(self, return_pil=False): 67 | images = [] 68 | for i, (role, msg) in enumerate(self.messages[self.offset:]): 69 | if i % 2 == 0: 70 | if type(msg) is tuple: 71 | import base64 72 | from io import BytesIO 73 | from PIL import Image 74 | msg, image_list, image_process_mode = msg 75 | if type(image_list) is not list: 76 | image_list = [image_list] 77 | for image in image_list: 78 | if image_process_mode == "Pad": 79 | def expand2square(pil_img, background_color=(122, 116, 104)): 80 | width, height = pil_img.size 81 | if width == height: 82 | return pil_img 83 | elif width > height: 84 | result = Image.new(pil_img.mode, (width, width), background_color) 85 | result.paste(pil_img, (0, (width - height) // 2)) 86 | return result 87 | else: 88 | result = Image.new(pil_img.mode, (height, height), background_color) 89 | result.paste(pil_img, ((height - width) // 2, 0)) 90 | return result 91 | image = expand2square(image) 92 | elif image_process_mode == "Crop": 93 | pass 94 | elif image_process_mode == "Resize": 95 | image = image.resize((224, 224)) 96 | else: 97 | raise ValueError(f"Invalid image_process_mode: {image_process_mode}") 98 | max_hw, min_hw = max(image.size), min(image.size) 99 | aspect_ratio = max_hw / min_hw 100 | max_len, min_len = 800, 400 101 | shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw)) 102 | longest_edge = int(shortest_edge * aspect_ratio) 103 | W, H = image.size 104 | if H > W: 105 | H, W = longest_edge, shortest_edge 106 | else: 107 | 
H, W = shortest_edge, longest_edge 108 | image = image.resize((W, H)) 109 | if return_pil: 110 | images.append(image) 111 | else: 112 | buffered = BytesIO() 113 | image.save(buffered, format="JPEG") 114 | img_b64_str = base64.b64encode(buffered.getvalue()).decode() 115 | images.append(img_b64_str) 116 | return images 117 | 118 | def to_gradio_chatbot(self): 119 | ret = [] 120 | for i, (role, msg) in enumerate(self.messages[self.offset:]): 121 | if i % 2 == 0: 122 | if type(msg) is tuple: 123 | import base64 124 | from io import BytesIO 125 | msg, image, image_process_mode = msg 126 | img_str = '' 127 | max_hw, min_hw = max(image.size), min(image.size) 128 | aspect_ratio = max_hw / min_hw 129 | max_len, min_len = 800, 400 130 | shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw)) 131 | longest_edge = int(shortest_edge * aspect_ratio) 132 | W, H = image.size 133 | if H > W: 134 | H, W = longest_edge, shortest_edge 135 | else: 136 | H, W = shortest_edge, longest_edge 137 | image = image.resize((W, H)) 138 | # image = image.resize((224, 224)) 139 | buffered = BytesIO() 140 | image.save(buffered, format="JPEG") 141 | img_b64_str = base64.b64encode(buffered.getvalue()).decode() 142 | img_str = img_str+f'user upload image' 143 | msg = msg.replace('', '')+img_str 144 | ret.append([msg, None]) 145 | else: 146 | ret.append([msg, None]) 147 | else: 148 | ret[-1][-1] = msg 149 | return ret 150 | 151 | def video_to_gradio_chatbot(self): 152 | ret = [] 153 | for i, (role, msg) in enumerate(self.messages[self.offset:]): 154 | if i % 2 == 0: 155 | if type(msg) is tuple: 156 | 157 | msg, video, image_process_mode = msg 158 | with open(video, "rb") as videoFile: 159 | video_b64_str = base64.b64encode(videoFile.read()).decode("utf-8") 160 | img_str = '' 161 | img_str = img_str+f'''''' 164 | msg = msg.replace('