├── INSTALLATION.md
├── README.md
├── align.sh
├── assest
    ├── imgs
    ├── main_features_embodiedgpt.png
    └── overall_frame_embodiedgpt.png
├── datasets
    └── datasets_share.zip
├── demo
    ├── .DS_Store
    ├── inference.py
    ├── script.py
    └── test.py
├── pyproject.toml
├── requirements.txt
├── robohusky
    ├── .DS_Store
    ├── base_dataset.py
    ├── base_dataset_uni.py
    ├── compression.py
    ├── configuration_husky.py
    ├── constants.py
    ├── conversation.py
    ├── convert_fp16.py
    ├── convert_husky_fp16.py
    ├── convert_reward_fp16.py
    ├── dist_utils.py
    ├── llama2_flash_attn_monkey_patch.py
    ├── model
    │   ├── __init__.py
    │   ├── __pycache__
    │   │   ├── __init__.cpython-38.pyc
    │   │   ├── configuration_husky.cpython-38.pyc
    │   │   └── modeling_husky_embody2.cpython-38.pyc
    │   ├── compression.py
    │   ├── configuration_husky.py
    │   ├── configuration_husky_ori.py
    │   ├── modeling_husky.py
    │   ├── modeling_husky_embody2.py
    │   ├── modeling_husky_embody2_ori.py
    │   └── processing_husky.py
    ├── train
    │   ├── .DS_Store
    │   ├── llama_flash_attn_monkey_patch.py
    │   ├── llama_rmsnorm_monkey_patch.py
    │   ├── train.py
    │   └── train_uni.py
    ├── utils.py
    └── video_transformers.py
├── train_files
    └── example_train_file.json
├── zero_stage0_config.json
├── zero_stage1_config.json
├── zero_stage2_config.json
└── zero_stage3_config.json


/INSTALLATION.md:
--------------------------------------------------------------------------------
 1 | ## 🛠️ Installation
 2 | 
 3 | - Clone this repository:
 4 | 
 5 |   ```bash
 6 |   git clone https://github.com/EmbodiedGPT/EmbodiedGPT_Pytorch
 7 |   ```
 8 | 
 9 | - Create a conda virtual environment and activate it:
10 | 
11 |   ```bash
12 |   conda create -n robohusky python=3.9.16 -y
13 |   conda activate robohusky
14 |   ```
15 | 
16 | - Install `PyTorch>=2.0` and `torchvision>=0.15.2` with `CUDA>=11.7`:
17 | 
18 |   For examples, to install `torch==2.0.1` with `CUDA==11.8`:
19 | 
20 |   ```bash
21 |   conda install pytorch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 pytorch-cuda=11.8 -c pytorch -c nvidia
22 |   # or
23 |   pip install torch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2 --index-url https://download.pytorch.org/whl/cu118
24 |   ```
25 | 
26 | - Install `flash-attn`:
27 | 
28 |   ```bash
29 |   git clone https://github.com/Dao-AILab/flash-attention.git
30 |   cd flash-attention
31 |   pip install flash-attn --no-build-isolation
32 |   ```
33 | 
34 | - Install `transformers==4.34.1`:
35 | 
36 |   ```bash
37 |   pip install transformers==4.34.1
38 |   ```
39 | 
40 | - Install `apex` (optional):
41 | 
42 |   ```bash
43 |   git clone https://github.com/NVIDIA/apex.git
44 |   git checkout 2386a912164b0c5cfcd8be7a2b890fbac5607c82  # https://github.com/NVIDIA/apex/issues/1735
45 |   pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" ./
46 |   ```
47 | 
48 | - Install other requirements:
49 | 
50 |   ```bash
51 |   cd ..
52 |   pip install -e EmbodiedGPT_Pytorch
53 |   ```


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Embodied Family Code Base
  2 | 
  3 | We will update the instructions for this codebase as soon as possible.
  4 | 
  5 | ## Installation
  6 | 
  7 | See [INSTALLATION.md](https://github.com/EmbodiedGPT/EmbodiedGPT_Pytorch/blob/main/INSTALLATION.md)
  8 | 
  9 | ## Data Preparation
 10 | 
 11 | 1. Download the [EgoCOT dataset](https://github.com/EmbodiedGPT/EgoCOT_Dataset).
 12 | 2. Download the [COCO-2017 dataset](https://www.kaggle.com/datasets/awsaf49/coco-2017-dataset).
 13 | 
 14 | ## Download the Pretrained Model
 15 | 
 16 | Download the testing
 17 | model [Embodied_family_7btiny](https://huggingface.co/Liang-ZX/Embodied_family_7b/).
 18 | 
 19 | ## Prepare the Text Data Paired with Video and Image
 20 | 
 21 | - Unzip `datasets_share.zip`, which contains the text part of the multi-modal dataset, to the `./datasets/` directory.
 22 | 
 23 | ## 🏠 Overview
 24 | 
 25 | <img width="800" alt="image" src="https://github.com/EmbodiedGPT/EmbodiedGPT_Pytorch/blob/main/assest/overall_frame_embodiedgpt.png">
 26 | 
 27 | ## 🎁 Major Features
 28 | 
 29 | <img width="800" alt="image" src="https://github.com/EmbodiedGPT/EmbodiedGPT_Pytorch/blob/main/assest/main_features_embodiedgpt.png">
 30 | 
 31 | ## Usage
 32 | 
 33 | This repo can be used in conjunction with PyTorch's `Dataset` and `DataLoader` for training models on heterogeneous
 34 | data. Here's a brief overview of the classes and their functionalities:
 35 | 
 36 | ### BaseDataset
 37 | 
 38 | The `BaseDataset` class extends PyTorch's `Dataset` and is designed to handle different media types (images, videos, and
 39 | text). It includes a transformation process to standardize the input data and a processor to handle the data specific to
 40 | the task.
 41 | 
 42 | #### Example
 43 | 
 44 | ```python
 45 | from robohusky.base_dataset_uni import BaseDataset
 46 | 
 47 | # Initialize the dataset with the required parameters
 48 | dataset = BaseDataset(
 49 |     dataset,  # Your dataset here
 50 |     processor,  # Your processor here
 51 |     image_path="path/to/images",
 52 |     input_size=224,
 53 |     num_segments=8,
 54 |     norm_type="openai",
 55 |     media_type="image"
 56 | )
 57 | 
 58 | # Use the dataset with a PyTorch DataLoader
 59 | from torch.utils.data import DataLoader
 60 | 
 61 | data_loader = DataLoader(dataset, batch_size=32, shuffle=True)
 62 | ```
 63 | 
 64 | ### WeightedConcatDataset
 65 | 
 66 | The `WeightedConcatDataset` class extends PyTorch's `ConcatDataset` and allows for the creation of a unified dataset by
 67 | concatenating multiple datasets with specified weights.
 68 | 
 69 | #### Example
 70 | 
 71 | ```python
 72 | from robohusky.base_dataset_uni import WeightedConcatDataset
 73 | 
 74 | # Assume we have multiple datasets for different tasks
 75 | dataset1 = BaseDataset(...)
 76 | dataset2 = BaseDataset(...)
 77 | dataset3 = BaseDataset(...)
 78 | 
 79 | # Define the weights for each dataset
 80 | weights = [0.5, 0.3, 0.2]
 81 | 
 82 | # Create a weighted concatenated dataset
 83 | weighted_dataset = WeightedConcatDataset([dataset1, dataset2, dataset3], weights=weights)
 84 | 
 85 | # Use the weighted dataset with a PyTorch DataLoader
 86 | data_loader = DataLoader(weighted_dataset, batch_size=32, shuffle=True)
 87 | ```
 88 | 
 89 | ## Customization
 90 | 
 91 | The package is designed to be flexible and customizable. You can implement your own transformation and processing logic
 92 | by subclassing `BaseDataset` and overriding the necessary methods.
 93 | 
 94 | ## 🎫 License
 95 | 
 96 | This project is released under the [Apache 2.0 license](LICENSE).
 97 | 
 98 | ## 🖊️ Citation
 99 | 
100 | If you find this project useful in your research, please consider cite:
101 | ```bibtex
102 | @article{mu2024embodiedgpt,
103 |   title={Embodiedgpt: Vision-language pre-training via embodied chain of thought},
104 |   author={Mu, Yao and Zhang, Qinglong and Hu, Mengkang and Wang, Wenhai and Ding, Mingyu and Jin, Jun and Wang, Bin and Dai, Jifeng and Qiao, Yu and Luo, Ping},
105 |   journal={Advances in Neural Information Processing Systems},
106 |   volume={36},
107 |   year={2024}
108 | }
109 | ```
110 | 


--------------------------------------------------------------------------------
/align.sh:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env bash
 2 | 
 3 | set -x
 4 | 
 5 | PARTITION= "your partition"
 6 | 
 7 | GPUS=${GPUS:-your number}
 8 | GPUS_PER_NODE=${GPUS_PER_NODE:-your number}
 9 | QUOTA_TYPE="reserved"
10 | 
11 | CPUS_PER_TASK=${CPUS_PER_TASK:-10}
12 | SRUN_ARGS=${SRUN_ARGS:-""}
13 | 
14 | srun -p ${PARTITION} \
15 |   --job-name='embodied_family' \
16 |   --gres=gpu:${GPUS_PER_NODE} \
17 |   --nodes= your number \
18 |   --ntasks=${GPUS} \
19 |   --ntasks-per-node=${GPUS_PER_NODE} \
20 |   --cpus-per-task=${CPUS_PER_TASK} \
21 |   --kill-on-bad-exit=1 \
22 |   --quotatype=${QUOTA_TYPE} \
23 |   ${SRUN_ARGS} \
24 |   python -u ./embodied_family/robohusky/train/train.py\
25 |   --model_name_or_path "your path" \
26 |   --cache_dir "/your path to cache"\
27 |   --conv_style "husky" \
28 |   --train_file "your path to train file" \
29 |   --output_dir "your output dir" \
30 |   --overwrite_output_dir True \
31 |   --run_name "embodied_family" \
32 |   --freeze_vision_model False \
33 |   --freeze_vision_adapter False \
34 |   --freeze_qformer False \
35 |   --freeze_text_model False \
36 |   --preprocessing_num_workers 1 \
37 |   --pad_to_max_length True \
38 |   --fp16 True \
39 |   --num_train_epochs 3 \
40 |   --per_device_train_batch_size 1 \
41 |   --per_device_eval_batch_size 1 \
42 |   --gradient_accumulation_steps 4 \
43 |   --evaluation_strategy "no" \
44 |   --save_strategy "steps" \
45 |   --save_steps 1000 \
46 |   --save_total_limit 1 \
47 |   --learning_rate 2e-6 \
48 |   --weight_decay 0. \
49 |   --warmup_ratio 0.05 \
50 |   --lr_scheduler_type "cosine" \
51 |   --logging_steps 1 \
52 |   --max_seq_length 2048 \
53 |   --do_train True \
54 |   --deepspeed "zero_stage2_config.json"
55 | 


--------------------------------------------------------------------------------
/assest/imgs:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/assest/main_features_embodiedgpt.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EmbodiedGPT/EmbodiedGPT_Pytorch/cda80524bf6b7d276ba3b532887bacd4b133f234/assest/main_features_embodiedgpt.png


--------------------------------------------------------------------------------
/assest/overall_frame_embodiedgpt.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EmbodiedGPT/EmbodiedGPT_Pytorch/cda80524bf6b7d276ba3b532887bacd4b133f234/assest/overall_frame_embodiedgpt.png


--------------------------------------------------------------------------------
/datasets/datasets_share.zip:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EmbodiedGPT/EmbodiedGPT_Pytorch/cda80524bf6b7d276ba3b532887bacd4b133f234/datasets/datasets_share.zip


--------------------------------------------------------------------------------
/demo/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EmbodiedGPT/EmbodiedGPT_Pytorch/cda80524bf6b7d276ba3b532887bacd4b133f234/demo/.DS_Store


--------------------------------------------------------------------------------
/demo/inference.py:
--------------------------------------------------------------------------------
  1 | """
  2 | srun -p INTERN2 --job-name='husky_multi_test' --gres=gpu:1 --cpus-per-task=8 --quotatype="auto" python -u demo/inference_new.py
  3 | """
  4 | 
  5 | import abc
  6 | from typing import Optional
  7 | 
  8 | import os
  9 | import requests
 10 | from PIL import Image
 11 | from io import BytesIO
 12 | 
 13 | import torch
 14 | import torchvision.transforms as T
 15 | from peft import PeftModel
 16 | from torchvision.transforms.functional import InterpolationMode
 17 | 
 18 | from transformers import (
 19 |     LlamaTokenizer,
 20 |     GenerationConfig,
 21 |     StoppingCriteria,
 22 |     StoppingCriteriaList,
 23 | )
 24 | 
 25 | from robohusky.model.modeling_husky_embody2 import HuskyForConditionalGeneration
 26 | 
 27 | from robohusky.conversation import (
 28 |     conv_templates,
 29 |     get_conv_template,
 30 | )
 31 | 
 32 | from robohusky.video_transformers import (
 33 |     GroupNormalize,
 34 |     GroupScale,
 35 |     GroupCenterCrop,
 36 |     Stack,
 37 |     ToTorchFormatTensor,
 38 |     get_index,
 39 | )
 40 | 
 41 | from robohusky.compression import compress_module
 42 | from decord import VideoReader, cpu
 43 | 
 44 | # import deepspeed
 45 | 
 46 | IGNORE_INDEX = -100
 47 | DEFAULT_UNK_TOKEN = "<unk>"
 48 | DEFAULT_IMG_START_TOKEN = "<img>"
 49 | DEFAULT_IMG_END_TOKEN = "</img>"
 50 | 
 51 | DEFAULT_VIDEO_START_TOKEN = "<vid>"
 52 | DEFAULT_VIDEO_END_TOKEN = "</vid>"
 53 | 
 54 | def get_gpu_memory(max_gpus=None):
 55 |     gpu_memory = []
 56 |     num_gpus = (
 57 |         torch.cuda.device_count()
 58 |         if max_gpus is None
 59 |         else min(max_gpus, torch.cuda.device_count())
 60 |     )
 61 | 
 62 |     for gpu_id in range(num_gpus):
 63 |         with torch.cuda.device(gpu_id):
 64 |             device = torch.cuda.current_device()
 65 |             gpu_properties = torch.cuda.get_device_properties(device)
 66 |             total_memory = gpu_properties.total_memory / (1024 ** 3)
 67 |             allocated_memory = torch.cuda.memory_allocated() / (1024 ** 3)
 68 |             available_memory = total_memory - allocated_memory
 69 |             gpu_memory.append(available_memory)
 70 |     return gpu_memory
 71 | 
 72 | def load_model(
 73 |         model_path, device, num_gpus, max_gpu_memory=None, load_8bit=False, lora_weights=None
 74 | ):
 75 |     if device == "cpu":
 76 |         kwargs = {}
 77 |     elif device == "cuda":
 78 |         kwargs = {"torch_dtype": torch.float16}
 79 |         if num_gpus == "auto":
 80 |             kwargs["device_map"] = "auto"
 81 |         else:
 82 |             num_gpus = int(num_gpus)
 83 |             if num_gpus != 1:
 84 |                 kwargs["device_map"] = "auto"
 85 |                 if max_gpu_memory is None:
 86 |                     kwargs[
 87 |                         "device_map"
 88 |                     ] = "sequential"  # This is important for not the same VRAM sizes
 89 |                     available_gpu_memory = get_gpu_memory(num_gpus)
 90 |                     kwargs["max_memory"] = {
 91 |                         i: str(int(available_gpu_memory[i] * 0.85)) + "GiB"
 92 |                         for i in range(num_gpus)
 93 |                     }
 94 |                 else:
 95 |                     kwargs["max_memory"] = {i: max_gpu_memory for i in range(num_gpus)}
 96 |     else:
 97 |         raise ValueError(f"Invalid device: {device}")
 98 | 
 99 |     tokenizer = LlamaTokenizer.from_pretrained(
100 |         model_path, use_fast=False)
101 | 
102 |     if lora_weights is None:
103 |         model = HuskyForConditionalGeneration.from_pretrained(
104 |             model_path, low_cpu_mem_usage=True, **kwargs
105 |         )
106 |     else:
107 |         kwargs["device_map"] = "auto"
108 |         model = HuskyForConditionalGeneration.from_pretrained(
109 |             model_path, low_cpu_mem_usage=True, **kwargs
110 |         )
111 |         model.language_model = PeftModel.from_pretrained(
112 |             model.language_model,
113 |             lora_weights,
114 |             **kwargs
115 |         )
116 | 
117 |     if load_8bit:
118 |         compress_module(model, device)
119 | 
120 |     if (device == "cuda" and num_gpus == 1) or device == "mps":
121 |         model.to(device)
122 | 
123 |     model = model.eval()
124 |     return model, tokenizer
125 | 
126 | def load_image(image_file, input_size=224):
127 |     if image_file.startswith('http') or image_file.startswith('https'):
128 |         response = requests.get(image_file)
129 |         image = Image.open(BytesIO(response.content)).convert('RGB')
130 |     else:
131 |         image = Image.open(image_file).convert('RGB')
132 | 
133 |     crop_pct = 224 / 256
134 |     size = int(input_size / crop_pct)
135 |     transform = T.Compose([
136 |         T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
137 |         T.Resize(size, interpolation=InterpolationMode.BICUBIC),
138 |         T.CenterCrop(input_size),
139 |         T.ToTensor(),
140 |         T.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
141 |     ])
142 |     image = transform(image)
143 |     return image
144 | 
145 | def load_video(video_path, num_segments=8):
146 |     vr = VideoReader(video_path, ctx=cpu(0))
147 |     num_frames = len(vr)
148 |     frame_indices = get_index(num_frames, num_segments)
149 | 
150 |     # transform
151 |     crop_size = 224
152 |     scale_size = 224
153 |     input_mean = [0.48145466, 0.4578275, 0.40821073]
154 |     input_std = [0.26862954, 0.26130258, 0.27577711]
155 | 
156 |     transform = T.Compose([
157 |         GroupScale(int(scale_size), interpolation=InterpolationMode.BICUBIC),
158 |         GroupCenterCrop(crop_size),
159 |         Stack(),
160 |         ToTorchFormatTensor(),
161 |         GroupNormalize(input_mean, input_std)
162 |     ])
163 | 
164 |     images_group = list()
165 |     for frame_index in frame_indices:
166 |         img = Image.fromarray(vr[frame_index].asnumpy())
167 |         images_group.append(img)
168 |     video = transform(images_group)
169 |     return video
170 | 
171 | class StoppingCriteriaSub(StoppingCriteria):
172 | 
173 |     def __init__(self, stops, encounters=1):
174 |         super().__init__()
175 |         self.stops = stops
176 | 
177 |     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs):
178 |         for stop in self.stops:
179 |             if torch.all((stop == input_ids[0][-len(stop):])).item():
180 |                 return True
181 | 
182 |         return False
183 | 
184 | @torch.inference_mode()
185 | def generate_stream(
186 |         model, tokenizer, image_processor, params, device
187 | ):
188 |     prompt = params["prompt"]
189 |     images = params.get("images", None)
190 |     videos = params.get("videos", None)
191 |     temperature = float(params.get("temperature", 0.7))
192 |     max_new_tokens = int(params.get("max_new_tokens", 1024))
193 | 
194 |     num_queries = model.config.num_query_tokens
195 | 
196 |     stop_words = ["Human: ", "Assistant: ", "###", "\n\n"]
197 |     stop_words_ids = [tokenizer(stop_word, return_tensors='pt')['input_ids'].squeeze() for stop_word in stop_words]
198 |     stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])
199 | 
200 |     generation_config = GenerationConfig(
201 |         bos_token_id=1,
202 |         do_sample=True,
203 |         temperature=temperature,
204 |         max_new_tokens=max_new_tokens,
205 |         stopping_criteria=stopping_criteria
206 |     )
207 | 
208 |     pixel_values = None
209 |     if images is not None:
210 |         pixel_values = load_image(images).to(device)  # only support one image
211 |         image_query = DEFAULT_IMG_START_TOKEN + DEFAULT_IMG_END_TOKEN
212 |         prompt = prompt.replace("<image>", image_query)
213 | 
214 |     elif videos is not None:
215 |         pixel_values = load_video(videos).to(device)
216 |         video_query = DEFAULT_VIDEO_START_TOKEN + DEFAULT_VIDEO_END_TOKEN
217 |         prompt = prompt.replace("<video>", video_query)
218 | 
219 |     model_inputs = tokenizer([prompt], return_tensors="pt")
220 |     model_inputs.pop("token_type_ids", None)
221 | 
222 |     if pixel_values is not None:
223 |         model_inputs["pixel_values"] = pixel_values
224 | 
225 |         generation_output = model.generate(
226 |             **model_inputs,
227 |             generation_config=generation_config,
228 |             return_dict_in_generate=True,
229 |             output_scores=True
230 |         )
231 |     else:
232 |         generation_output = model.language_model.generate(
233 |             **model_inputs,
234 |             generation_config=generation_config,
235 |             return_dict_in_generate=True,
236 |             output_scores=True
237 |         )
238 | 
239 |     preds = generation_output.sequences
240 |     outputs = tokenizer.batch_decode(preds, skip_special_tokens=True)
241 |     return outputs
242 | 
243 | class Chat:
244 |     def __init__(
245 |             self,
246 |             model_path,
247 |             device,
248 |             num_gpus=1,
249 |             load_8bit=False,
250 |             temperature=0.7,
251 |             max_new_tokens=512,
252 |             lora_path=None,
253 |     ):
254 |         model, tokenizer = load_model(
255 |             model_path, device, num_gpus, load_8bit=load_8bit, lora_weights=lora_path
256 |         )
257 | 
258 |         self.model = model
259 |         # self.model.language_model = deepspeed.init_inference(
260 |         #     self.model.language_model, mp_size=1, dtype=torch.float16, checkpoint=None, replace_with_kernel_inject=True)
261 |         self.tokenizer = tokenizer
262 |         num_queries = model.config.num_query_tokens
263 | 
264 |         self.device = device
265 |         self.dtype = model.dtype
266 | 
267 |         stop_words = ["Human: ", "Assistant: ", "###", "\n\n"]
268 |         stop_words_ids = [tokenizer(stop_word, return_tensors='pt')['input_ids'].squeeze() for stop_word in stop_words]
269 |         stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])
270 | 
271 |         self.conv = get_conv_template("husky")
272 | 
273 |         self.image_query = DEFAULT_IMG_START_TOKEN + DEFAULT_IMG_END_TOKEN
274 |         self.video_query = DEFAULT_VIDEO_START_TOKEN + DEFAULT_VIDEO_END_TOKEN
275 | 
276 |         self.generation_config = GenerationConfig(
277 |             bos_token_id=1,
278 |             do_sample=True,
279 |             top_k=20,
280 |             top_p=0.9,
281 |             temperature=temperature,
282 |             max_new_tokens=max_new_tokens,
283 |             stopping_criteria=stopping_criteria
284 |         )
285 | 
286 |     def ask(self, text, conv, modal_type="image"):
287 |         assert modal_type in ["text", "image", "video"]
288 |         conversations = []
289 | 
290 |         if len(conv.messages) > 0 or modal_type == "text":
291 |             conv.append_message(conv.roles[0], text)
292 |         elif modal_type == "image":
293 |             conv.append_message(conv.roles[0], self.image_query + "\n" + text)
294 |         else:
295 |             conv.append_message(conv.roles[0], self.video_query + "\n" + text)
296 | 
297 |         conv.append_message(conv.roles[1], None)
298 |         conversations.append(conv.get_prompt())
299 |         return conversations
300 | 
301 |     @torch.no_grad()
302 |     def get_image_embedding(self, image_file):
303 |         pixel_values = load_image(image_file)
304 |         pixel_values = pixel_values.unsqueeze(0).to(self.device, dtype=self.dtype)
305 |         language_model_inputs = self.model.extract_feature(pixel_values)
306 |         return language_model_inputs
307 | 
308 |     @torch.no_grad()
309 |     def get_video_embedding(self, video_file):
310 |         pixel_values = load_video(video_file)
311 |         TC, H, W = pixel_values.shape
312 |         pixel_values = pixel_values.reshape(TC // 3, 3, H, W).transpose(0, 1)  # [C, T, H, W]
313 |         pixel_values = pixel_values.unsqueeze(0).to(self.device, dtype=self.dtype)
314 |         assert len(pixel_values.shape) == 5
315 |         language_model_inputs = self.model.extract_feature(pixel_values)
316 |         return language_model_inputs
317 | 
318 |     @torch.no_grad()
319 |     def answer(self, conversations, language_model_inputs, modal_type="image"):
320 |         model_inputs = self.tokenizer(
321 |             conversations,
322 |             return_tensors="pt",
323 |         )
324 |         model_inputs.pop("token_type_ids", None)
325 | 
326 |         input_ids = model_inputs["input_ids"].to(self.device)
327 |         attention_mask = model_inputs["attention_mask"].to(self.device)
328 | 
329 |         if modal_type == "text":
330 |             generation_output = self.model.language_model.generate(
331 |                 input_ids=input_ids,
332 |                 attention_mask=attention_mask,
333 |                 generation_config=self.generation_config,
334 |                 return_dict_in_generate=True,
335 |                 output_scores=True
336 |             )
337 |         else:
338 |             pixel_values = model_inputs.pop("pixel_values", None)
339 |             if pixel_values is not None:
340 |                 pixel_values = pixel_values.to(self.device)
341 | 
342 |             generation_output = self.model.generate(
343 |                 pixel_values=pixel_values,
344 |                 input_ids=input_ids,
345 |                 attention_mask=attention_mask,
346 |                 language_model_inputs=language_model_inputs,
347 |                 generation_config=self.generation_config,
348 |                 return_dict_in_generate=True,
349 |                 output_scores=True
350 |             )
351 | 
352 |         preds = generation_output.sequences
353 |         outputs = self.tokenizer.batch_decode(preds, skip_special_tokens=True)[0]
354 | 
355 |         if modal_type == "text":
356 |             skip_echo_len = len(conversations[0]) - conversations[0].count("</s>") * 3
357 |             outputs = outputs[skip_echo_len:].strip()
358 | 
359 |         return outputs 
360 | 
361 | if __name__ == '__main__':
362 |     # model_path = "/mnt/petrelfs/zhangqinglong/Documents/Husky/work_dirs/husky_v3/EmbodiedGPT/pretrain_0727"
363 |     model_path = "/mnt/petrelfs/share_data/gvembodied/workdirs/align_new_myyf"
364 |     device = "cuda" if torch.cuda.is_available() else "cpu"
365 |     chat = Chat(model_path, device=device, num_gpus=1, max_new_tokens=1024, load_8bit=False)
366 | 
367 |     vision_feature = None
368 |     image_state = False
369 |     video_state = False
370 | 
371 |     while True:
372 |         query = input("\n")
373 |         if query.lower().endswith(('.bmp', '.dib', '.png', '.jpg', '.jpeg', '.pbm', '.pgm', '.ppm', '.tif', '.tiff')):
374 |             if os.path.exists(query):
375 |                 print("received.")
376 |                 vision_feature = chat.get_image_embedding(query)
377 |                 chat.conv = get_conv_template("husky").copy()
378 |                 image_state = True
379 |                 continue
380 |         if query.lower().endswith(('.mp4', '.mkv', '.avi', '.wmv', '.iso', ".webm")):
381 |             if os.path.exists(query):
382 |                 print("received.")
383 |                 vision_feature = chat.get_video_embedding(query)
384 |                 chat.conv = get_conv_template("husky").copy()
385 |                 video_state = True
386 |                 continue
387 | 
388 |         if query == "stop":
389 |             break
390 |         if query == "clear" or query == "" or query == "\n":
391 |             chat.conv = get_conv_template("husky").copy()
392 |             image_state = False
393 |             video_state = False
394 |             os.system("clear")
395 |             print("欢迎使用 husky-13b-zh 模型，输入内容即可进行对话，clear 清空对话历史，stop 终止程序")
396 |             continue
397 | 
398 |         if image_state:
399 |             modal_type = "image"
400 |         elif video_state:
401 |             modal_type = "video"
402 |         else:
403 |             modal_type = "text"
404 | 
405 |         # image_test = "assets/husky.jpg"
406 |         # image_test = "assets/yoga.mp4"
407 |         # video_test = "assets/pretty_girl.mp4"
408 |         # video_test = "assets/stock-footage-billiards-concentrated-young-woman-playing-in-club.webm"
409 |         # video_test = "assets/stock-footage-kherson-ukraine-may-open-free-rock-music-festival-crowd-partying-at-a-rock-concert.webm"
410 |         conversations = chat.ask(text=query, conv=chat.conv, modal_type=modal_type)
411 |         outputs = chat.answer(conversations, vision_feature, modal_type=modal_type)
412 |         # NOTE: strip is important to align with the training data.
413 |         chat.conv.messages[-1][1] = outputs.strip()
414 | 
415 |         print(f"Husky: \n{outputs}")
416 | 


--------------------------------------------------------------------------------
/demo/test.py:
--------------------------------------------------------------------------------
  1 | """
  2 | srun -p your partition --job-name='test' --gres=gpu:1 --cpus-per-task=8  python -u demo/test.py
  3 | """
  4 | 
  5 | import abc
  6 | from typing import Optional
  7 | 
  8 | import os
  9 | import requests
 10 | from PIL import Image
 11 | from io import BytesIO
 12 | 
 13 | import torch
 14 | import torchvision.transforms as T
 15 | from peft import PeftModel
 16 | from torchvision.transforms.functional import InterpolationMode
 17 | 
 18 | from transformers import (
 19 |     LlamaTokenizer,
 20 |     GenerationConfig,
 21 |     StoppingCriteria,
 22 |     StoppingCriteriaList,
 23 | )
 24 | 
 25 | from robohusky.model.modeling_husky_embody2 import HuskyForConditionalGeneration
 26 | 
 27 | from robohusky.conversation import (
 28 |     conv_templates,
 29 |     get_conv_template,
 30 | )
 31 | 
 32 | from robohusky.video_transformers import (
 33 |     GroupNormalize,
 34 |     GroupScale,
 35 |     GroupCenterCrop,
 36 |     Stack,
 37 |     ToTorchFormatTensor,
 38 |     get_index,
 39 | )
 40 | 
 41 | from robohusky.compression import compress_module
 42 | from decord import VideoReader, cpu
 43 | 
 44 | # import deepspeed
 45 | 
 46 | IGNORE_INDEX = -100
 47 | DEFAULT_UNK_TOKEN = "<unk>"
 48 | DEFAULT_IMG_START_TOKEN = "<img>"
 49 | DEFAULT_IMG_END_TOKEN = "</img>"
 50 | 
 51 | DEFAULT_VIDEO_START_TOKEN = "<vid>"
 52 | DEFAULT_VIDEO_END_TOKEN = "</vid>"
 53 | 
 54 | def get_gpu_memory(max_gpus=None):
 55 |     gpu_memory = []
 56 |     num_gpus = (
 57 |         torch.cuda.device_count()
 58 |         if max_gpus is None
 59 |         else min(max_gpus, torch.cuda.device_count())
 60 |     )
 61 | 
 62 |     for gpu_id in range(num_gpus):
 63 |         with torch.cuda.device(gpu_id):
 64 |             device = torch.cuda.current_device()
 65 |             gpu_properties = torch.cuda.get_device_properties(device)
 66 |             total_memory = gpu_properties.total_memory / (1024 ** 3)
 67 |             allocated_memory = torch.cuda.memory_allocated() / (1024 ** 3)
 68 |             available_memory = total_memory - allocated_memory
 69 |             gpu_memory.append(available_memory)
 70 |     return gpu_memory
 71 | 
 72 | def load_model(
 73 |         model_path, device, num_gpus, max_gpu_memory=None, load_8bit=False, lora_weights=None
 74 | ):
 75 |     if device == "cpu":
 76 |         kwargs = {}
 77 |     elif device == "cuda":
 78 |         kwargs = {"torch_dtype": torch.float16}
 79 |         if num_gpus == "auto":
 80 |             kwargs["device_map"] = "auto"
 81 |         else:
 82 |             num_gpus = int(num_gpus)
 83 |             if num_gpus != 1:
 84 |                 kwargs["device_map"] = "auto"
 85 |                 if max_gpu_memory is None:
 86 |                     kwargs[
 87 |                         "device_map"
 88 |                     ] = "sequential"  # This is important for not the same VRAM sizes
 89 |                     available_gpu_memory = get_gpu_memory(num_gpus)
 90 |                     kwargs["max_memory"] = {
 91 |                         i: str(int(available_gpu_memory[i] * 0.85)) + "GiB"
 92 |                         for i in range(num_gpus)
 93 |                     }
 94 |                 else:
 95 |                     kwargs["max_memory"] = {i: max_gpu_memory for i in range(num_gpus)}
 96 |     else:
 97 |         raise ValueError(f"Invalid device: {device}")
 98 | 
 99 |     tokenizer = LlamaTokenizer.from_pretrained(
100 |         model_path, use_fast=False)
101 | 
102 |     if lora_weights is None:
103 |         model = HuskyForConditionalGeneration.from_pretrained(
104 |             model_path, low_cpu_mem_usage=True, **kwargs
105 |         )
106 |     else:
107 |         kwargs["device_map"] = "auto"
108 |         model = HuskyForConditionalGeneration.from_pretrained(
109 |             model_path, low_cpu_mem_usage=True, **kwargs
110 |         )
111 |         model.language_model = PeftModel.from_pretrained(
112 |             model.language_model,
113 |             lora_weights,
114 |             **kwargs
115 |         )
116 | 
117 |     if load_8bit:
118 |         compress_module(model, device)
119 | 
120 |     if (device == "cuda" and num_gpus == 1) or device == "mps":
121 |         model.to(device)
122 | 
123 |     model = model.eval()
124 |     return model, tokenizer
125 | 
126 | def load_image(image_file, input_size=224):
127 |     if image_file.startswith('http') or image_file.startswith('https'):
128 |         response = requests.get(image_file)
129 |         image = Image.open(BytesIO(response.content)).convert('RGB')
130 |     else:
131 |         image = Image.open(image_file).convert('RGB')
132 | 
133 |     crop_pct = 224 / 256
134 |     size = int(input_size / crop_pct)
135 |     transform = T.Compose([
136 |         T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
137 |         T.Resize(size, interpolation=InterpolationMode.BICUBIC),
138 |         T.CenterCrop(input_size),
139 |         T.ToTensor(),
140 |         T.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
141 |     ])
142 |     image = transform(image)
143 |     return image
144 | 
145 | def load_video(video_path, num_segments=8):
146 |     vr = VideoReader(video_path, ctx=cpu(0))
147 |     num_frames = len(vr)
148 |     frame_indices = get_index(num_frames, num_segments)
149 | 
150 |     # transform
151 |     crop_size = 224
152 |     scale_size = 224
153 |     input_mean = [0.48145466, 0.4578275, 0.40821073]
154 |     input_std = [0.26862954, 0.26130258, 0.27577711]
155 | 
156 |     transform = T.Compose([
157 |         GroupScale(int(scale_size), interpolation=InterpolationMode.BICUBIC),
158 |         GroupCenterCrop(crop_size),
159 |         Stack(),
160 |         ToTorchFormatTensor(),
161 |         GroupNormalize(input_mean, input_std)
162 |     ])
163 | 
164 |     images_group = list()
165 |     for frame_index in frame_indices:
166 |         img = Image.fromarray(vr[frame_index].asnumpy())
167 |         images_group.append(img)
168 |     video = transform(images_group)
169 |     return video
170 | 
171 | class StoppingCriteriaSub(StoppingCriteria):
172 | 
173 |     def __init__(self, stops, encounters=1):
174 |         super().__init__()
175 |         self.stops = stops
176 | 
177 |     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs):
178 |         for stop in self.stops:
179 |             if torch.all((stop == input_ids[0][-len(stop):])).item():
180 |                 return True
181 | 
182 |         return False
183 | 
184 | @torch.inference_mode()
185 | def generate_stream(
186 |         model, tokenizer, image_processor, params, device
187 | ):
188 |     prompt = params["prompt"]
189 |     images = params.get("images", None)
190 |     videos = params.get("videos", None)
191 |     temperature = float(params.get("temperature", 0.7))
192 |     max_new_tokens = int(params.get("max_new_tokens", 1024))
193 | 
194 |     num_queries = model.config.num_query_tokens
195 | 
196 |     stop_words = ["Human: ", "Assistant: ", "###", "\n\n"]
197 |     stop_words_ids = [tokenizer(stop_word, return_tensors='pt')['input_ids'].squeeze() for stop_word in stop_words]
198 |     stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])
199 | 
200 |     generation_config = GenerationConfig(
201 |         bos_token_id=1,
202 |         do_sample=True,
203 |         temperature=temperature,
204 |         max_new_tokens=max_new_tokens,
205 |         stopping_criteria=stopping_criteria
206 |     )
207 | 
208 |     pixel_values = None
209 |     if images is not None:
210 |         pixel_values = load_image(images).to(device)  # only support one image
211 |         image_query = DEFAULT_IMG_START_TOKEN + DEFAULT_IMG_END_TOKEN
212 |         prompt = prompt.replace("<image>", image_query)
213 | 
214 |     elif videos is not None:
215 |         pixel_values = load_video(videos).to(device)
216 |         video_query = DEFAULT_VIDEO_START_TOKEN + DEFAULT_VIDEO_END_TOKEN
217 |         prompt = prompt.replace("<video>", video_query)
218 | 
219 |     model_inputs = tokenizer([prompt], return_tensors="pt")
220 |     model_inputs.pop("token_type_ids", None)
221 | 
222 |     if pixel_values is not None:
223 |         model_inputs["pixel_values"] = pixel_values
224 | 
225 |         generation_output = model.generate(
226 |             **model_inputs,
227 |             generation_config=generation_config,
228 |             return_dict_in_generate=True,
229 |             output_scores=True
230 |         )
231 |     else:
232 |         generation_output = model.language_model.generate(
233 |             **model_inputs,
234 |             generation_config=generation_config,
235 |             return_dict_in_generate=True,
236 |             output_scores=True
237 |         )
238 | 
239 |     preds = generation_output.sequences
240 |     outputs = tokenizer.batch_decode(preds, skip_special_tokens=True)
241 |     return outputs
242 | 
243 | class Chat:
244 |     def __init__(
245 |             self,
246 |             model_path,
247 |             device,
248 |             num_gpus=1,
249 |             load_8bit=False,
250 |             temperature=0.7,
251 |             max_new_tokens=512,
252 |             lora_path=None,
253 |     ):
254 |         model, tokenizer = load_model(
255 |             model_path, device, num_gpus, load_8bit=load_8bit, lora_weights=lora_path
256 |         )
257 | 
258 |         self.model = model
259 |         # self.model.language_model = deepspeed.init_inference(
260 |         #     self.model.language_model, mp_size=1, dtype=torch.float16, checkpoint=None, replace_with_kernel_inject=True)
261 |         self.tokenizer = tokenizer
262 |         num_queries = model.config.num_query_tokens
263 | 
264 |         self.device = device
265 |         self.dtype = model.dtype
266 | 
267 |         stop_words = ["Human: ", "Assistant: ", "###", "\n\n"]
268 |         stop_words_ids = [tokenizer(stop_word, return_tensors='pt')['input_ids'].squeeze() for stop_word in stop_words]
269 |         stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])
270 | 
271 |         self.conv = get_conv_template("husky")
272 | 
273 |         self.image_query = DEFAULT_IMG_START_TOKEN + DEFAULT_IMG_END_TOKEN
274 |         self.video_query = DEFAULT_VIDEO_START_TOKEN + DEFAULT_VIDEO_END_TOKEN
275 | 
276 |         self.generation_config = GenerationConfig(
277 |             bos_token_id=1,
278 |             pad_token_id=0,
279 |             do_sample=True,
280 |             top_k=20,
281 |             top_p=0.9,
282 |             temperature=temperature,
283 |             max_new_tokens=max_new_tokens,
284 |             stopping_criteria=stopping_criteria
285 |         )
286 | 
287 |     def ask(self, text, conv, modal_type="image"):
288 |         assert modal_type in ["text", "image", "video"]
289 |         conversations = []
290 | 
291 |         if len(conv.messages) > 0 or modal_type == "text":
292 |             conv.append_message(conv.roles[0], text)
293 |         elif modal_type == "image":
294 |             conv.append_message(conv.roles[0], self.image_query + "\n" + text)
295 |         else:
296 |             conv.append_message(conv.roles[0], self.video_query + "\n" + text)
297 | 
298 |         conv.append_message(conv.roles[1], None)
299 |         conversations.append(conv.get_prompt())
300 |         return conversations
301 | 
302 |     @torch.no_grad()
303 |     def get_image_embedding(self, image_file):
304 |         pixel_values = load_image(image_file)
305 |         pixel_values = pixel_values.unsqueeze(0).to(self.device, dtype=self.dtype)
306 |         language_model_inputs = self.model.extract_feature(pixel_values)
307 |         return language_model_inputs
308 | 
309 |     @torch.no_grad()
310 |     def get_video_embedding(self, video_file):
311 |         pixel_values = load_video(video_file)
312 |         TC, H, W = pixel_values.shape
313 |         pixel_values = pixel_values.reshape(TC // 3, 3, H, W).transpose(0, 1)  # [C, T, H, W]
314 |         pixel_values = pixel_values.unsqueeze(0).to(self.device, dtype=self.dtype)
315 |         assert len(pixel_values.shape) == 5
316 |         language_model_inputs = self.model.extract_feature(pixel_values)
317 |         return language_model_inputs
318 | 
319 |     @torch.no_grad()
320 |     def answer(self, conversations, language_model_inputs, modal_type="image"):
321 |         model_inputs = self.tokenizer(
322 |             conversations,
323 |             return_tensors="pt",
324 |         )
325 |         model_inputs.pop("token_type_ids", None)
326 | 
327 |         input_ids = model_inputs["input_ids"].to(self.device)
328 |         attention_mask = model_inputs["attention_mask"].to(self.device)
329 | 
330 |         if modal_type == "text":
331 |             generation_output = self.model.language_model.generate(
332 |                 input_ids=input_ids,
333 |                 attention_mask=attention_mask,
334 |                 generation_config=self.generation_config,
335 |                 return_dict_in_generate=True,
336 |                 output_scores=True
337 |             )
338 |         else:
339 |             pixel_values = model_inputs.pop("pixel_values", None)
340 |             if pixel_values is not None:
341 |                 pixel_values = pixel_values.to(self.device)
342 | 
343 |             generation_output = self.model.generate(
344 |                 pixel_values=pixel_values,
345 |                 input_ids=input_ids,
346 |                 attention_mask=attention_mask,
347 |                 language_model_inputs=language_model_inputs,
348 |                 generation_config=self.generation_config,
349 |                 return_dict_in_generate=True,
350 |                 output_scores=True
351 |             )
352 | 
353 |         preds = generation_output.sequences
354 |         outputs = self.tokenizer.batch_decode(preds, skip_special_tokens=True)[0]
355 | 
356 |         if modal_type == "text":
357 |             skip_echo_len = len(conversations[0]) - conversations[0].count("</s>") * 3
358 |             outputs = outputs[skip_echo_len:].strip()
359 | 
360 |         return outputs
361 | 
362 | if __name__ == '__main__':
363 |     model_path = “your path”
364 |     device = "cuda" if torch.cuda.is_available() else "cpu"
365 |     chat = Chat(model_path, device=device, num_gpus=1, max_new_tokens=2048, load_8bit=False)
366 | 
367 |     vision_feature = None
368 |     image_state = False
369 |     video_state = False
370 | 
371 |     while True:
372 |         query = input("\n")
373 |         if query.lower().endswith(('.bmp', '.dib', '.png', '.jpg', '.jpeg', '.pbm', '.pgm', '.ppm', '.tif', '.tiff')):
374 |             if os.path.exists(query):
375 |                 print("received.")
376 |                 vision_feature = chat.get_image_embedding(query)
377 |                 chat.conv = get_conv_template("husky").copy()
378 |                 image_state = True
379 |                 continue
380 |         if query.lower().endswith(('.mp4', '.mkv', '.avi', '.wmv', '.iso', ".webm")):
381 |             if os.path.exists(query):
382 |                 print("received.")
383 |                 vision_feature = chat.get_video_embedding(query)
384 |                 chat.conv = get_conv_template("husky").copy()
385 |                 video_state = True
386 |                 continue
387 | 
388 |         if query == "stop":
389 |             break
390 |         if query == "clear" or query == "" or query == "\n":
391 |             chat.conv = get_conv_template("husky").copy()
392 |             image_state = False
393 |             video_state = False
394 |             os.system("clear")
395 |             print("欢迎使用 Embodied Family 系列模型，输入内容即可进行对话，clear 清空对话历史，stop 终止程序")
396 |             continue
397 | 
398 |         if image_state:
399 |             modal_type = "image"
400 |         elif video_state:
401 |             modal_type = "video"
402 |         else:
403 |             modal_type = "text"
404 | 
405 |         conversations = chat.ask(text=query, conv=chat.conv, modal_type=modal_type)
406 |         outputs = chat.answer(conversations, vision_feature, modal_type=modal_type)
407 |         # NOTE: strip is important to align with the training data.
408 |         chat.conv.messages[-1][1] = outputs.strip()
409 | 
410 |         print(f"Agent: \n{outputs}")
411 | 


--------------------------------------------------------------------------------
/pyproject.toml:
--------------------------------------------------------------------------------
 1 | [build-system]
 2 | requires = ["setuptools>=61.0"]
 3 | build-backend = "setuptools.build_meta"
 4 | 
 5 | [project]
 6 | name = "robohusky"
 7 | version = "0.1.0"
 8 | description = "An open platform for training, serving, and evaluating large language model based chatbots."
 9 | readme = "README.md"
10 | requires-python = ">=3.8"
11 | classifiers = [
12 |     "Programming Language :: Python :: 3",
13 |     "License :: OSI Approved :: Apache Software License",
14 | ]
15 | dependencies = [
16 |     "accelerate", "fastapi", "gradio>=3.23", "markdown2[all]", "numpy",
17 |     "prompt_toolkit>=3.0.0", "requests", "rich>=10.0.0", "sentencepiece",
18 |     "torchvision>=0.15.2", "shortuuid", "transformers>=4.34.1", "tokenizers>=0.14.1",
19 |     "torch>=2.0", "uvicorn", "wandb", "httpx", "shortuuid", "pydantic", "nh3",
20 | ]
21 | 
22 | [project.optional-dependencies]
23 | dev = ["black>=23.3.0", "pylint>=2.8.2"]
24 | 
25 | [project.urls]
26 | "Homepage" = "https://github.com/EmbodiedGPT/EmbodiedGPT_Pytorch"
27 | "Bug Tracker" = "https://github.com/EmbodiedGPT/EmbodiedGPT_Pytorch/issues"
28 | 
29 | [tool.setuptools.packages.find]
30 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"]
31 | 
32 | [tool.wheel]
33 | exclude = ["assets*", "benchmark*", "docs", "dist*", "playground*", "scripts*", "tests*"]
34 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | datasets
2 | torch
3 | torchvision 
4 | decord 
5 | peft 
6 | deepspeed
7 | transformers
8 | 


--------------------------------------------------------------------------------
/robohusky/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EmbodiedGPT/EmbodiedGPT_Pytorch/cda80524bf6b7d276ba3b532887bacd4b133f234/robohusky/.DS_Store


--------------------------------------------------------------------------------
/robohusky/base_dataset.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import random
  3 | 
  4 | from typing import Dict, Optional, Sequence
  5 | from PIL import PngImagePlugin, Image, ImageFile
  6 | 
  7 | import torch
  8 | from torch.utils.data import Dataset
  9 | import torchvision.transforms as T
 10 | from torchvision.transforms.functional import InterpolationMode
 11 | 
 12 | from robohusky.train.tcsloader import TCSLoader
 13 | from robohusky.conversation import get_conv_template
 14 | 
 15 | IGNORE_INDEX = -100
 16 | 
 17 | Image.MAX_IMAGE_PIXELS = None
 18 | ImageFile.LOAD_TRUNCATED_IMAGES = True
 19 | MaximumDecompressedSize = 1024
 20 | MegaByte = 2 ** 20
 21 | PngImagePlugin.MAX_TEXT_CHUNK = MaximumDecompressedSize * MegaByte
 22 | 
 23 | DEFAULT_IMG_START_TOKEN = "<img>"
 24 | DEFAULT_IMG_END_TOKEN = "</img>"
 25 | 
 26 | DEFAULT_VIDEO_START_TOKEN = "<vid>"
 27 | DEFAULT_VIDEO_END_TOKEN = "</vid>"
 28 | 
 29 | def is_image(image_file):
 30 |     if image_file.lower().endswith(('.bmp', '.dib', '.png', '.jpg', '.jpeg', '.pbm', '.pgm', '.ppm', '.tif', '.tiff')):
 31 |         return True
 32 |     else:
 33 |         return False
 34 | 
 35 | def is_video(image_file):
 36 |     if image_file.lower().endswith(('.mp4', '.mkv', '.avi', '.wmv', '.iso', ".webm")):
 37 |         return True
 38 |     else:
 39 |         return False
 40 | 
 41 | def build_transform(input_size):
 42 |     transform = T.Compose([
 43 |         T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
 44 |         T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
 45 |         T.ToTensor(),
 46 |         T.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225))
 47 |     ])
 48 |     return transform
 49 | 
 50 | def format_inputs(sources):
 51 |     # Apply prompt templates
 52 |     conv = get_conv_template("husky").copy()
 53 |     roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
 54 |     conversations = []
 55 | 
 56 |     for i, source in enumerate(sources):
 57 |         if roles[source[0]["from"]] != conv.roles[0]:
 58 |             # Skip the first one if it is not from human
 59 |             source = source[1:]
 60 | 
 61 |         conv.messages = []
 62 |         for j, sentence in enumerate(source):
 63 |             role = roles[sentence["from"]]
 64 |             assert role == conv.roles[j % 2], f"{i}"
 65 |             # vision is only supported for the human input
 66 |             if role == conv.roles[0]:
 67 |                 value = sentence["value"]
 68 |                 if "<image>" in value:
 69 |                     if value.endswith("\n<image>"):
 70 |                         value = "<image>\n" + value.replace("\n<image>", "")
 71 |                     image_query = DEFAULT_IMG_START_TOKEN + DEFAULT_IMG_END_TOKEN
 72 |                     sentence["value"] = value.replace("<image>", image_query)
 73 | 
 74 |                 elif "<video>" in value:
 75 |                     if value.endswith("\n<video>"):
 76 |                         value = "<video>\n" + value.replace("\n<video>", "")
 77 |                     video_query = DEFAULT_VIDEO_START_TOKEN + DEFAULT_VIDEO_END_TOKEN
 78 |                     sentence["value"] = value.replace("<video>", video_query)
 79 | 
 80 |             conv.append_message(role, sentence["value"])
 81 |         conversations.append(conv.get_prompt())
 82 | 
 83 |     return conversations, conv
 84 | 
 85 | def process_func(examples, tokenizer, max_seq_length):
 86 |     conversations, conv = format_inputs(examples['conversations'])
 87 |     model_inputs = tokenizer(
 88 |         conversations,
 89 |         max_length=max_seq_length,
 90 |         padding="max_length",
 91 |         truncation=True,
 92 |         return_tensors="pt",
 93 |     )
 94 | 
 95 |     model_inputs.pop("token_type_ids", None)
 96 |     # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
 97 |     # padding in the loss.
 98 |     targets = model_inputs["input_ids"].clone()
 99 | 
100 |     # Mask targets
101 |     sep = conv.sep + conv.roles[1] + ": "
102 |     for conversation, target in zip(conversations, targets):
103 |         total_len = int(target.ne(tokenizer.pad_token_id).sum())
104 | 
105 |         turns = conversation.split(conv.sep2)
106 |         cur_len = 1
107 |         target[:cur_len] = IGNORE_INDEX
108 |         for i, turn in enumerate(turns):
109 |             if turn == "":
110 |                 break
111 |             turn_len = len(tokenizer(turn).input_ids)
112 | 
113 |             parts = turn.split(sep)
114 |             if len(parts) != 2:
115 |                 break
116 |             parts[0] += sep
117 | 
118 |             # "-2" is hardcoded for the Llama tokenizer to make the offset correct.
119 |             instruction_len = len(tokenizer(parts[0]).input_ids) - 2
120 | 
121 |             if i != 0 and not tokenizer.legacy:
122 |                 # The legacy and non-legacy modes handle special tokens differently
123 |                 instruction_len -= 1
124 | 
125 |             # Ignore the user instructions
126 |             target[cur_len: cur_len + instruction_len] = IGNORE_INDEX
127 |             cur_len += turn_len
128 | 
129 |             if i != 0 and not tokenizer.legacy:
130 |                 # The legacy and non-legacy modes handle special tokens differently
131 |                 cur_len -= 1
132 | 
133 |         target[cur_len:] = IGNORE_INDEX
134 | 
135 |         if cur_len < tokenizer.model_max_length:
136 |             if cur_len != total_len:
137 |                 target[:] = IGNORE_INDEX
138 | 
139 |     model_inputs["labels"] = targets
140 |     return model_inputs
141 | 
142 | class BaseDataset(Dataset):
143 |     def __init__(self, dataset, processor, image_path="", input_size=224):
144 |         super(BaseDataset, self).__init__()
145 |         self.dataset = dataset
146 |         self.image_path = image_path
147 | 
148 |         self.transform = build_transform(input_size)
149 |         self.husky_processor = processor
150 | 
151 |         self.cached_data_dict = {}
152 | 
153 |     def __len__(self):
154 |         return len(self.dataset)
155 | 
156 |     def __getitem__(self, i) -> Dict[str, torch.Tensor]:
157 |         if i in self.cached_data_dict:
158 |             return self.cached_data_dict[i]
159 | 
160 |         data = self.dataset[i]
161 |         image_file = data.pop("image", None)
162 | 
163 |         if self.image_path != "":
164 |             image_file = os.path.join(self.image_path, image_file)
165 |             if not os.path.exists(image_file):
166 |                 return self.__getitem__((i + 1) % len(self.dataset))
167 |             image = Image.open(image_file)
168 |         else:
169 |             image = Image.open(image_file)
170 | 
171 |         for k, v in data.items():
172 |             data[k] = [v]
173 |         ret = self.husky_processor(data)
174 |         for k, v in ret.items():
175 |             ret[k] = v[0]
176 | 
177 |         pixel_values = self.transform(image)
178 |         ret["pixel_values"] = pixel_values
179 | 
180 |         self.cached_data_dict[i] = ret
181 |         return ret
182 | 
183 | class CephDataset(Dataset):
184 |     def __init__(self, dataset, processor, input_size=224):
185 |         super(CephDataset, self).__init__()
186 |         self.dataset = dataset
187 | 
188 |         self.transform = build_transform(input_size)
189 |         self.husky_processor = processor
190 | 
191 |         conf_path = "./petrelf.conf"
192 |         self.conf_path = os.path.abspath(conf_path)
193 | 
194 |         self.initialized = False
195 |         self._init_memcached()
196 | 
197 |     def _init_memcached(self):
198 |         if not self.initialized:
199 |             assert self.conf_path is not None
200 |             self.mt_loader = TCSLoader(self.conf_path)
201 |             self.initialized = True
202 | 
203 |     def __len__(self):
204 |         return len(self.dataset)
205 | 
206 |     def __getitem__(self, i) -> Dict[str, torch.Tensor]:
207 |         data = self.dataset[i]
208 |         image_file = data.pop("image", None)
209 | 
210 |         try:
211 |             image = self.mt_loader(image_file).convert('RGB')
212 |         except (AttributeError, OSError):
213 |             with open("error.txt", 'a') as f:
214 |                 f.write(image_file + '\n')
215 |             i = random.randint(0, len(self.dataset))
216 |             return self.__getitem__(i % len(self.dataset))
217 | 
218 |         for k, v in data.items():
219 |             data[k] = [v]
220 | 
221 |         ret = self.husky_processor(data)
222 |         for k, v in ret.items():
223 |             ret[k] = v[0]
224 |         pixel_values = self.transform(image)
225 |         ret["pixel_values"] = pixel_values
226 |         return ret
227 | 


--------------------------------------------------------------------------------
/robohusky/base_dataset_uni.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import random
  3 | 
  4 | from typing import Dict, Optional, Sequence, Iterator, List, Iterable, Union
  5 | from PIL import PngImagePlugin, Image, ImageFile, ImageOps
  6 | 
  7 | import numpy as np
  8 | 
  9 | import torch
 10 | from torch.utils.data import (
 11 |     Dataset,
 12 |     ConcatDataset,
 13 |     Sampler,
 14 |     WeightedRandomSampler
 15 | )
 16 | import torchvision.transforms as T
 17 | from torchvision.transforms.functional import InterpolationMode
 18 | 
 19 | from robohusky.train.tcsloader import TCSLoader
 20 | 
 21 | from decord import VideoReader, cpu
 22 | from robohusky.video_transformers import (
 23 |     GroupNormalize,
 24 |     GroupScale,
 25 |     GroupCenterCrop,
 26 |     Stack,
 27 |     ToTorchFormatTensor,
 28 |     get_index,
 29 | )
 30 | 
 31 | from robohusky.conversation import get_conv_template
 32 | 
 33 | IMAGENET_DEFAULT_MEAN = [0.485, 0.456, 0.406]
 34 | IMAGENET_DEFAULT_STD = [0.229, 0.224, 0.225]
 35 | IMAGENET_STANDARD_MEAN = [0.5, 0.5, 0.5]
 36 | IMAGENET_STANDARD_STD = [0.5, 0.5, 0.5]
 37 | OPENAI_CLIP_MEAN = [0.48145466, 0.4578275, 0.40821073]
 38 | OPENAI_CLIP_STD = [0.26862954, 0.26130258, 0.27577711]
 39 | 
 40 | IGNORE_INDEX = -100
 41 | 
 42 | Image.MAX_IMAGE_PIXELS = None
 43 | ImageFile.LOAD_TRUNCATED_IMAGES = True
 44 | MaximumDecompressedSize = 1024
 45 | MegaByte = 2 ** 20
 46 | PngImagePlugin.MAX_TEXT_CHUNK = MaximumDecompressedSize * MegaByte
 47 | 
 48 | DEFAULT_IMG_START_TOKEN = "<img>"
 49 | DEFAULT_IMG_END_TOKEN = "</img>"
 50 | 
 51 | DEFAULT_VIDEO_START_TOKEN = "<vid>"
 52 | DEFAULT_VIDEO_END_TOKEN = "</vid>"
 53 | 
 54 | DEFAULT_EMBED_TOKEN = "<quad>"
 55 | 
 56 | conf_path = "/your path to/petrelf.conf"
 57 | 
 58 | def is_image(image_file):
 59 |     if image_file.lower().endswith(('.bmp', '.dib', '.png', '.jpg', '.jpeg', '.pbm', '.pgm', '.ppm', '.tif', '.tiff')):
 60 |         return True
 61 |     else:
 62 |         return False
 63 | 
 64 | def is_video(image_file):
 65 |     if image_file.lower().endswith(('.mp4', '.mkv', '.avi', '.wmv', '.iso', ".webm")):
 66 |         return True
 67 |     else:
 68 |         return False
 69 | 
 70 | def is_numpy(image_file):
 71 |     if image_file.endswith(".npy"):
 72 |         return True
 73 |     else:
 74 |         return False
 75 | 
 76 | def get_media_type(image_file):
 77 |     if is_image(image_file):
 78 |         return "image"
 79 |     elif is_video(image_file):
 80 |         return "video"
 81 |     elif is_numpy(image_file):
 82 |         return "numpy"
 83 |     else:
 84 |         return "text"
 85 | 
 86 | def build_transform(input_size, norm_type="openai", media_type="image"):
 87 |     if norm_type == "openai":
 88 |         mean = OPENAI_CLIP_MEAN
 89 |         std = OPENAI_CLIP_STD
 90 |     elif norm_type == "imagenet":
 91 |         mean = IMAGENET_DEFAULT_MEAN
 92 |         std = IMAGENET_DEFAULT_STD
 93 |     else:
 94 |         mean = IMAGENET_DEFAULT_MEAN
 95 |         std = IMAGENET_DEFAULT_STD
 96 | 
 97 |     if media_type == "image":
 98 |         transform = T.Compose([
 99 |             T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
100 |             T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
101 |             T.ToTensor(),
102 |             T.Normalize(mean=mean, std=std)
103 |         ])
104 |     elif media_type == "video":
105 |         transform = T.Compose([
106 |             GroupScale(int(input_size), interpolation=InterpolationMode.BICUBIC),
107 |             GroupCenterCrop(input_size),
108 |             Stack(),
109 |             ToTorchFormatTensor(),
110 |             GroupNormalize(mean=mean, std=std)
111 |         ])
112 |     else:
113 |         transform = None
114 |     return transform
115 | 
116 | def check_format(data):
117 |     if not ('id' in data and 'image' in data and 'conversations' in data and len(data['conversations']) % 2 == 0):
118 |         print(f"Lake field: {data}")
119 |         return False
120 |     for i, message in enumerate(data['conversations']):
121 |         if i == 0:
122 |             if not (message['value'].startswith("<image>\n") or message['value'].endswith("\n<image>")):
123 |                 print(f"No <image>: {data}")
124 |                 return False
125 |         if i % 2 == 0:
126 |             if not (message['from'] == 'human'):
127 |                 print(f"Not from human: {data}")
128 |                 return False
129 |         else:
130 |             if not (message['from'] == 'gpt'):
131 |                 print(f"Not from gpt: {data}")
132 |                 return False
133 |         if message['value'] is None or (len(message['value']) == 0):
134 |             print(f"No Message: {data}")
135 |             return False
136 |     return True
137 | 
138 | def format_inputs(sources, conv_tempt="husky", num_query_tokens=256):
139 |     # Apply prompt templates
140 |     conv = get_conv_template(conv_tempt).copy()
141 |     roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
142 |     conversations = []
143 | 
144 |     for i, source in enumerate(sources):
145 |         if roles[source[0]["from"]] != conv.roles[0]:
146 |             # Skip the first one if it is not from human
147 |             source = source[1:]
148 | 
149 |         conv.messages = []
150 |         for j, sentence in enumerate(source):
151 |             role = roles[sentence["from"]]
152 |             assert role == conv.roles[j % 2], f"{i}"
153 |             # vision is only supported for the human input
154 |             if role == conv.roles[0]:
155 |                 value = sentence["value"]
156 |                 if "<image>" in value:
157 |                     if value.endswith("\n<image>"):
158 |                         value = "<image>\n" + value.replace("\n<image>", "")
159 | 
160 |                     image_query = DEFAULT_IMG_START_TOKEN + num_query_tokens * DEFAULT_EMBED_TOKEN + DEFAULT_IMG_END_TOKEN
161 |                     sentence["value"] = value.replace("<image>", image_query)
162 | 
163 |                 elif "<video>" in value:
164 |                     if value.endswith("\n<video>"):
165 |                         value = "<video>\n" + value.replace("\n<video>", "")
166 | 
167 |                     video_query = DEFAULT_VIDEO_START_TOKEN + num_query_tokens * DEFAULT_EMBED_TOKEN + DEFAULT_VIDEO_END_TOKEN
168 |                     sentence["value"] = value.replace("<video>", video_query)
169 | 
170 |             conv.append_message(role, sentence["value"])
171 |         conversations.append(conv.get_prompt())
172 | 
173 |     return conversations, conv
174 | 
175 | def process_func(examples, tokenizer, max_seq_length=-1, conv_tempt="husky", num_query_tokens=256):
176 |     conversations, conv = format_inputs(examples['conversations'], conv_tempt, num_query_tokens)
177 |     if max_seq_length < 0:
178 |         model_inputs = tokenizer(
179 |             conversations,
180 |             return_tensors="pt",
181 |             max_length=tokenizer.model_max_length,
182 |             truncation=True,
183 |         )
184 |     else:
185 |         model_inputs = tokenizer(
186 |             conversations,
187 |             max_length=max_seq_length,
188 |             padding="max_length",
189 |             truncation=True,
190 |             return_tensors="pt",
191 |         )
192 | 
193 |     model_inputs.pop("token_type_ids", None)
194 |     # If we are padding here, replace all tokenizer.pad_token_id in the labels by -100 when we want to ignore
195 |     # padding in the loss.
196 |     targets = model_inputs["input_ids"].clone()
197 | 
198 |     # Mask targets
199 |     sep = conv.sep + conv.roles[1] + ": "
200 |     for conversation, target in zip(conversations, targets):
201 |         total_len = int(target.ne(tokenizer.pad_token_id).sum())
202 | 
203 |         turns = conversation.split(conv.sep2)
204 |         cur_len = 1
205 |         target[:cur_len] = IGNORE_INDEX
206 |         for i, turn in enumerate(turns):
207 |             if turn == "":
208 |                 break
209 |             turn_len = len(tokenizer(turn).input_ids)
210 | 
211 |             parts = turn.split(sep)
212 |             if len(parts) != 2:
213 |                 break
214 |             parts[0] += sep
215 | 
216 |             # "-2" is hardcoded for the Llama tokenizer to make the offset correct.
217 |             instruction_len = len(tokenizer(parts[0]).input_ids) - 2
218 | 
219 |             if i != 0 and not tokenizer.legacy:
220 |                 # The legacy and non-legacy modes handle special tokens differently
221 |                 instruction_len -= 1
222 | 
223 |             # Ignore the user instructions
224 |             target[cur_len: cur_len + instruction_len] = IGNORE_INDEX
225 |             cur_len += turn_len
226 | 
227 |             if i != 0 and not tokenizer.legacy:
228 |                 # The legacy and non-legacy modes handle special tokens differently
229 |                 cur_len -= 1
230 | 
231 |         target[cur_len:] = IGNORE_INDEX
232 | 
233 |         if cur_len < tokenizer.model_max_length:
234 |             if cur_len != total_len:
235 |                 target[:] = IGNORE_INDEX
236 | 
237 |     model_inputs["labels"] = targets
238 |     return model_inputs
239 | 
240 | class BaseDataset(Dataset):
241 |     def __init__(
242 |             self,
243 |             dataset,
244 |             processor,
245 |             image_path="",
246 |             input_size=224,
247 |             num_segments=8,
248 |             norm_type="openai",
249 |             media_type="image"
250 |     ):
251 |         super(BaseDataset, self).__init__()
252 |         self.dataset = dataset
253 |         self.image_path = image_path
254 |         self.input_size = input_size
255 |         self.num_segments = num_segments
256 | 
257 |         self.media_type = media_type
258 |         self.transform = build_transform(input_size, norm_type, media_type)
259 |         self.husky_processor = processor
260 |         self.tcs_loader = TCSLoader(os.path.abspath(conf_path), media_type=media_type)
261 | 
262 |         self.cached_data_dict = {}
263 | 
264 |     def __len__(self):
265 |         return len(self.dataset)
266 | 
267 |     def __getitem__(self, i) -> Dict[str, torch.Tensor]:
268 |         if i in self.cached_data_dict:
269 |             return self.cached_data_dict[i]
270 | 
271 |         data = self.dataset[i]
272 |         image_file = data["image"] if "image" in data else data["video"]
273 | 
274 |         if self.media_type == "llm" or image_file == "":
275 |             # Pseudo pixel_values
276 |             # pixel_values = torch.zeros(size=(3, self.input_size, self.input_size))
277 |             pixel_values = None
278 |         else:
279 |             if self.image_path != "":
280 |                 image_file = os.path.join(self.image_path, image_file)
281 |             if "s3://" not in image_file and not os.path.exists(image_file):
282 |                 i = random.randint(0, len(self.dataset))
283 |                 return self.__getitem__(i % len(self.dataset))
284 | 
285 |             try:
286 |                 if self.media_type == "image":
287 |                     # load from ceph
288 |                     if "s3://" in image_file:
289 |                         image = self.tcs_loader(image_file)
290 |                     else:
291 |                         image = Image.open(image_file).convert('RGB')
292 | 
293 |                     # process image with extreme aspect ratios
294 |                     height, width = image.size
295 |                     if height / width >= 1.8:
296 |                         delta = height - width
297 |                         padding = (0, delta // 2, 0, delta - delta // 2)
298 |                         image = ImageOps.expand(image, padding)
299 |                     elif height / width <= 0.56:
300 |                         delta = width - height
301 |                         padding = (delta // 2, 0, delta - delta // 2, 0)
302 |                         image = ImageOps.expand(image, padding)
303 |                     pixel_values = self.transform(image)
304 |                 elif self.media_type == "video":
305 |                     if "s3://" in image_file:
306 |                         vr = self.tcs_loader(image_file)
307 |                     else:
308 |                         vr = VideoReader(image_file, ctx=cpu(0))
309 | 
310 |                     num_frames = len(vr)
311 |                     frame_indices = get_index(num_frames, self.num_segments)
312 |                     images_group = list()
313 |                     for frame_index in frame_indices:
314 |                         img = Image.fromarray(vr[frame_index].asnumpy())
315 |                         images_group.append(img)
316 |                     pixel_values = self.transform(images_group)
317 |                     TC, H, W = pixel_values.shape
318 |                     pixel_values = pixel_values.reshape(TC // 3, 3, H, W).transpose(0, 1)  # [C, T, H, W]
319 |                 else:
320 |                     # load numpy
321 |                     if "s3://" in image_file:
322 |                         pixel_values = self.tcs_loader(image_file)
323 |                     else:
324 |                         pixel_values = np.load(image_file)
325 |                     pixel_values = torch.tensor(pixel_values).transpose(0, 1)
326 |             except (AttributeError, OSError):
327 |                 with open("error.txt", 'a') as f:
328 |                     f.write(image_file + '\n')
329 |                 i = random.randint(0, len(self.dataset))
330 |                 return self.__getitem__(i % len(self.dataset))
331 | 
332 |         for k, v in data.items():
333 |             data[k] = [v]
334 |         ret = self.husky_processor(data)
335 |         for k, v in ret.items():
336 |             ret[k] = v[0]
337 | 
338 |         if pixel_values is not None:
339 |             ret["pixel_values"] = pixel_values
340 | 
341 |         self.cached_data_dict[i] = ret
342 |         return ret
343 | 
344 | class WeightedConcatDataset(ConcatDataset):
345 |     def __init__(
346 |             self,
347 |             datasets: List[Dataset],
348 |             weights: Sequence[float] = None,
349 |             replacement: bool = True,
350 |             batch_size: int = -1,
351 |             generator=None
352 |     ) -> None:
353 |         super().__init__(datasets)
354 |         if weights is None:
355 |             weights = [1.0] * len(self.datasets)
356 |         weights_tensor = torch.as_tensor(weights, dtype=torch.double)
357 |         if len(weights_tensor.shape) != 1:
358 |             raise ValueError("weights should be a 1d sequence but given "
359 |                              "weights have shape {}".format(tuple(weights_tensor.shape)))
360 |         self.weights = weights_tensor
361 |         self.batch_size = batch_size
362 | 
363 |         self.replacement = replacement
364 |         self.generator = generator
365 | 
366 |         if self.batch_size <= 0:
367 |             self.num_samples = sum([len(d) for d in datasets])
368 |             self.sampler = WeightedRandomSampler(
369 |                 weights=self.weights,
370 |                 num_samples=self.num_samples,
371 |                 replacement=self.replacement
372 |             )
373 |         else:
374 |             self.task_batches = [len(d) // batch_size for d in datasets]
375 |             self.num_samples = sum(self.task_batches) * batch_size
376 |             self.sampler = WeightedBatchSampler(
377 |                 weights=self.weights,
378 |                 num_samples=self.num_samples,
379 |                 batch_size=self.batch_size,
380 |                 replacement=self.replacement
381 |             )
382 | 
383 |     def __iter__(self) -> Iterator[int]:
384 |         return iter(self.sampler)
385 | 
386 |     def __len__(self) -> int:
387 |         return self.num_samples
388 | 
389 | class WeightedBatchSampler(Sampler[int]):
390 |     weights: torch.Tensor
391 |     num_samples: int
392 |     batch_size: int
393 |     replacement: bool
394 | 
395 |     def __init__(
396 |             self,
397 |             weights: Sequence[float],
398 |             num_samples: int,
399 |             batch_size: int,
400 |             replacement: bool = True,
401 |             generator=None
402 |     ) -> None:
403 |         if not isinstance(batch_size, int) or isinstance(batch_size, bool) or \
404 |                 batch_size <= 0:
405 |             raise ValueError("batch_size should be a positive integer value, "
406 |                              "but got batch_size={}".format(batch_size))
407 |         if not isinstance(num_samples, int) or isinstance(num_samples, bool) or \
408 |                 num_samples <= 0:
409 |             raise ValueError("num_samples should be a positive integer "
410 |                              "value, but got num_samples={}".format(num_samples))
411 |         if not isinstance(replacement, bool):
412 |             raise ValueError("replacement should be a boolean value, but got "
413 |                              "replacement={}".format(replacement))
414 | 
415 |         weights_tensor = torch.as_tensor(weights, dtype=torch.double)
416 |         if len(weights_tensor.shape) != 1:
417 |             raise ValueError("weights should be a 1d sequence but given "
418 |                              "weights have shape {}".format(tuple(weights_tensor.shape)))
419 | 
420 |         self.weights = weights_tensor
421 |         self.num_samples = num_samples
422 |         self.batch_size = batch_size
423 |         self.num_batches = num_samples // batch_size
424 |         self.replacement = replacement
425 |         self.generator = generator
426 | 
427 |     def __iter__(self) -> Iterator[int]:
428 |         rand_tensor = torch.multinomial(self.weights, self.num_batches, self.replacement, generator=self.generator)
429 |         rand_tensor = rand_tensor.repeat_interleave(self.batch_size)
430 | 
431 |         yield from iter(rand_tensor.tolist())
432 | 
433 |     def __len__(self) -> int:
434 |         return self.num_samples
435 | 


--------------------------------------------------------------------------------
/robohusky/compression.py:
--------------------------------------------------------------------------------
  1 | import dataclasses
  2 | import gc
  3 | import glob
  4 | import os
  5 | 
  6 | from accelerate import init_empty_weights
  7 | from accelerate.utils import set_module_tensor_to_device
  8 | import torch
  9 | from torch import Tensor
 10 | from torch.nn import functional as F
 11 | import torch.nn as nn
 12 | from tqdm import tqdm
 13 | from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
 14 | 
 15 | 
 16 | @dataclasses.dataclass
 17 | class CompressionConfig:
 18 |     """Group-wise quantization."""
 19 | 
 20 |     num_bits: int
 21 |     group_size: int
 22 |     group_dim: int
 23 |     symmetric: bool
 24 |     enabled: bool = True
 25 | 
 26 | 
 27 | default_compression_config = CompressionConfig(
 28 |     num_bits=8, group_size=256, group_dim=1, symmetric=True, enabled=True
 29 | )
 30 | 
 31 | 
 32 | class CLinear(nn.Module):
 33 |     """Compressed Linear Layer."""
 34 | 
 35 |     def __init__(self, weight=None, bias=None, device=None):
 36 |         super().__init__()
 37 |         if weight is None:
 38 |             self.weight = None
 39 |         elif isinstance(weight, Tensor):
 40 |             self.weight = compress(weight.data.to(device), default_compression_config)
 41 |         else:
 42 |             self.weight = weight
 43 |         self.bias = bias
 44 | 
 45 |     def forward(self, input: Tensor) -> Tensor:
 46 |         weight = decompress(self.weight, default_compression_config)
 47 |         if self.bias is None:
 48 |             return F.linear(input.to(weight.dtype), weight)
 49 |         return F.linear(input.to(weight.dtype), weight, self.bias.to(weight.dtype))
 50 | 
 51 | 
 52 | def compress_module(module, target_device):
 53 |     for attr_str in dir(module):
 54 |         target_attr = getattr(module, attr_str)
 55 |         if type(target_attr) == torch.nn.Linear:
 56 |             setattr(
 57 |                 module,
 58 |                 attr_str,
 59 |                 CLinear(target_attr.weight, target_attr.bias, target_device),
 60 |             )
 61 |     for name, child in module.named_children():
 62 |         compress_module(child, target_device)
 63 | 
 64 | 
 65 | def get_compressed_list(module, prefix=""):
 66 |     compressed_list = []
 67 |     for attr_str in dir(module):
 68 |         target_attr = getattr(module, attr_str)
 69 |         if type(target_attr) == torch.nn.Linear:
 70 |             full_name = (
 71 |                 f"{prefix}.{attr_str}.weight" if prefix else f"{attr_str}.weight"
 72 |             )
 73 |             compressed_list.append(full_name)
 74 |     for name, child in module.named_children():
 75 |         child_prefix = f"{prefix}.{name}" if prefix else name
 76 |         for each in get_compressed_list(child, child_prefix):
 77 |             compressed_list.append(each)
 78 |     return compressed_list
 79 | 
 80 | 
 81 | def apply_compressed_weight(module, compressed_state_dict, target_device, prefix=""):
 82 |     for attr_str in dir(module):
 83 |         target_attr = getattr(module, attr_str)
 84 |         if type(target_attr) == torch.nn.Linear:
 85 |             full_name = (
 86 |                 f"{prefix}.{attr_str}.weight" if prefix else f"{attr_str}.weight"
 87 |             )
 88 |             setattr(
 89 |                 module,
 90 |                 attr_str,
 91 |                 CLinear(
 92 |                     compressed_state_dict[full_name], target_attr.bias, target_device
 93 |                 ),
 94 |             )
 95 |     for name, child in module.named_children():
 96 |         child_prefix = f"{prefix}.{name}" if prefix else name
 97 |         apply_compressed_weight(
 98 |             child, compressed_state_dict, target_device, child_prefix
 99 |         )
100 | 
101 | 
102 | def load_compress_model(model_path, device, torch_dtype, use_fast=False):
103 |     # partially load model
104 |     tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=use_fast)
105 |     base_pattern = os.path.join(model_path, "pytorch_model*.bin")
106 |     files = glob.glob(base_pattern)
107 | 
108 |     with init_empty_weights():
109 |         config = AutoConfig.from_pretrained(
110 |             model_path, low_cpu_mem_usage=True, torch_dtype=torch_dtype
111 |         )
112 |         model = AutoModelForCausalLM.from_config(config)
113 |         linear_weights = get_compressed_list(model)
114 | 
115 |     compressed_state_dict = {}
116 | 
117 |     for filename in tqdm(files):
118 |         tmp_state_dict = torch.load(filename)
119 |         for name in tmp_state_dict:
120 |             if name in linear_weights:
121 |                 tensor = tmp_state_dict[name].to(device).data.to(torch_dtype)
122 |                 compressed_state_dict[name] = compress(
123 |                     tensor, default_compression_config
124 |                 )
125 |             else:
126 |                 compressed_state_dict[name] = tmp_state_dict[name].to(device)
127 |             tmp_state_dict[name] = None
128 |             tensor = None
129 |             gc.collect()
130 |             torch.cuda.empty_cache()
131 | 
132 |     for name in model.state_dict():
133 |         if name not in linear_weights:
134 |             set_module_tensor_to_device(
135 |                 model, name, device, value=compressed_state_dict[name]
136 |             )
137 |     apply_compressed_weight(model, compressed_state_dict, device)
138 | 
139 |     model.to(device)
140 | 
141 |     return model, tokenizer
142 | 
143 | 
144 | def compress(tensor, config):
145 |     """Simulate group-wise quantization."""
146 |     if not config.enabled:
147 |         return tensor
148 | 
149 |     group_size, num_bits, group_dim, symmetric = (
150 |         config.group_size,
151 |         config.num_bits,
152 |         config.group_dim,
153 |         config.symmetric,
154 |     )
155 |     assert num_bits <= 8
156 | 
157 |     original_shape = tensor.shape
158 |     num_groups = (original_shape[group_dim] + group_size - 1) // group_size
159 |     new_shape = (
160 |         original_shape[:group_dim]
161 |         + (num_groups, group_size)
162 |         + original_shape[group_dim + 1 :]
163 |     )
164 | 
165 |     # Pad
166 |     pad_len = (group_size - original_shape[group_dim] % group_size) % group_size
167 |     if pad_len != 0:
168 |         pad_shape = (
169 |             original_shape[:group_dim] + (pad_len,) + original_shape[group_dim + 1 :]
170 |         )
171 |         tensor = torch.cat(
172 |             [tensor, torch.zeros(pad_shape, dtype=tensor.dtype, device=tensor.device)],
173 |             dim=group_dim,
174 |         )
175 |     data = tensor.view(new_shape)
176 | 
177 |     # Quantize
178 |     if symmetric:
179 |         B = 2 ** (num_bits - 1) - 1
180 |         scale = B / torch.max(data.abs(), dim=group_dim + 1, keepdim=True)[0]
181 |         data = data * scale
182 |         data = data.clamp_(-B, B).round_().to(torch.int8)
183 |         return data, scale, original_shape
184 |     else:
185 |         B = 2**num_bits - 1
186 |         mn = torch.min(data, dim=group_dim + 1, keepdim=True)[0]
187 |         mx = torch.max(data, dim=group_dim + 1, keepdim=True)[0]
188 | 
189 |         scale = B / (mx - mn)
190 |         data = data - mn
191 |         data.mul_(scale)
192 | 
193 |         data = data.clamp_(0, B).round_().to(torch.uint8)
194 |         return data, mn, scale, original_shape
195 | 
196 | 
197 | def decompress(packed_data, config):
198 |     """Simulate group-wise dequantization."""
199 |     if not config.enabled:
200 |         return packed_data
201 | 
202 |     group_size, num_bits, group_dim, symmetric = (
203 |         config.group_size,
204 |         config.num_bits,
205 |         config.group_dim,
206 |         config.symmetric,
207 |     )
208 | 
209 |     # Dequantize
210 |     if symmetric:
211 |         data, scale, original_shape = packed_data
212 |         data = data / scale
213 |     else:
214 |         data, mn, scale, original_shape = packed_data
215 |         data = data / scale
216 |         data.add_(mn)
217 | 
218 |     # Unpad
219 |     pad_len = (group_size - original_shape[group_dim] % group_size) % group_size
220 |     if pad_len:
221 |         padded_original_shape = (
222 |             original_shape[:group_dim]
223 |             + (original_shape[group_dim] + pad_len,)
224 |             + original_shape[group_dim + 1 :]
225 |         )
226 |         data = data.reshape(padded_original_shape)
227 |         indices = [slice(0, x) for x in original_shape]
228 |         return data[indices].contiguous()
229 |     else:
230 |         return data.view(original_shape)


--------------------------------------------------------------------------------
/robohusky/configuration_husky.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | """ Husky model configuration"""
 16 | 
 17 | import copy
 18 | import os
 19 | from typing import Union
 20 | 
 21 | from transformers.configuration_utils import PretrainedConfig
 22 | from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
 23 | from transformers.utils import logging
 24 | from transformers.models.auto import CONFIG_MAPPING
 25 | 
 26 | logger = logging.get_logger(__name__)
 27 | 
 28 | HUSKY_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 29 |     "wofmanaf/husky-7b": "https://huggingface.co/wofmanaf/husky-7b/resolve/main/config.json",
 30 | }
 31 | 
 32 | class HuskyVisionConfig(PretrainedConfig):
 33 |     r"""
 34 |     This is the configuration class to store the configuration of a [`HuskyVisionModel`]. It is used to
 35 |     instantiate a Husky vision encoder according to the specified arguments, defining the model architecture.
 36 |     Instantiating a configuration defaults will yield a similar configuration to that of the Husky architecture.
 37 | 
 38 |     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
 39 |     documentation from [`PretrainedConfig`] for more information.
 40 | 
 41 |     Args:
 42 |         hidden_size (`int`, *optional*, defaults to 1408):
 43 |             Dimensionality of the encoder layers and the pooler layer.
 44 |         intermediate_size (`int`, *optional*, defaults to 6144):
 45 |             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
 46 |         num_hidden_layers (`int`, *optional*, defaults to 39):
 47 |             Number of hidden layers in the Transformer encoder.
 48 |         num_attention_heads (`int`, *optional*, defaults to 16):
 49 |             Number of attention heads for each attention layer in the Transformer encoder.
 50 |         image_size (`int`, *optional*, defaults to 224):
 51 |             The size (resolution) of each image.
 52 |         patch_size (`int`, *optional*, defaults to 14):
 53 |             The size (resolution) of each patch.
 54 |         hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
 55 |             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
 56 |             `"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported. layer_norm_eps (`float`, *optional*, defaults
 57 |             to 1e-5): The epsilon used by the layer normalization layers.
 58 |         dropout (`float`, *optional*, defaults to 0.0):
 59 |             The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
 60 |         attention_dropout (`float`, *optional*, defaults to 0.0):
 61 |             The dropout ratio for the attention probabilities.
 62 |         initializer_range (`float`, *optional*, defaults to 0.02):
 63 |             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
 64 |         initializer_factor (`float``, *optional*, defaults to 1):
 65 |             A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
 66 |             testing).
 67 |         qkv_bias (`bool`, *optional*, defaults to `True`):
 68 |             Whether to add a bias to the queries and values in the self-attention layers.
 69 |     """
 70 | 
 71 |     model_type = "husky_vision_model"
 72 | 
 73 |     def __init__(
 74 |             self,
 75 |             hidden_size=1408,
 76 |             intermediate_size=6144,
 77 |             projection_dim=512,
 78 |             num_hidden_layers=39,
 79 |             num_attention_heads=16,
 80 |             num_channels=3,
 81 |             image_size=224,
 82 |             patch_size=14,
 83 |             hidden_act="gelu",
 84 |             layer_norm_eps=0.00001,
 85 |             dropout=0.0,
 86 |             attention_dropout=0.0,
 87 |             initializer_range=1e-10,
 88 |             initializer_factor=1.0,
 89 |             qkv_bias=True,
 90 |             **kwargs,
 91 |     ):
 92 |         super().__init__(**kwargs)
 93 | 
 94 |         self.hidden_size = hidden_size
 95 |         self.intermediate_size = intermediate_size
 96 |         self.projection_dim = projection_dim
 97 |         self.dropout = dropout
 98 |         self.num_hidden_layers = num_hidden_layers
 99 |         self.num_attention_heads = num_attention_heads
100 |         self.num_channels = num_channels
101 |         self.patch_size = patch_size
102 |         self.image_size = image_size
103 |         self.initializer_range = initializer_range
104 |         self.initializer_factor = initializer_factor
105 |         self.attention_dropout = attention_dropout
106 |         self.layer_norm_eps = layer_norm_eps
107 |         self.hidden_act = hidden_act
108 |         self.qkv_bias = qkv_bias
109 | 
110 |     @classmethod
111 |     def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
112 |         config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
113 | 
114 |         # get the vision config dict if we are loading from HuskyConfig
115 |         if config_dict.get("model_type") == "husky":
116 |             config_dict = config_dict["vision_config"]
117 | 
118 |         if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
119 |             logger.warning(
120 |                 f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
121 |                 f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
122 |             )
123 | 
124 |         return cls.from_dict(config_dict, **kwargs)
125 | 
126 | class HuskyQFormerConfig(PretrainedConfig):
127 |     r"""
128 |     This is the configuration class to store the configuration of a [`HuskyQFormerModel`]. It is used to
129 |     instantiate a Husky Querying Transformer (Q-Former) model according to the specified arguments, defining the
130 |     model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
131 |     the Husky [Salesforce/instruct-blip-flan-t5](https://huggingface.co/Salesforce/instruct-blip-flan-t5)
132 |     architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs.
133 |     Read the documentation from [`PretrainedConfig`] for more information.
134 | 
135 |     Note that [`HuskyQFormerModel`] is very similar to [`BertLMHeadModel`] with interleaved cross-attention.
136 | 
137 |     Args:
138 |         vocab_size (`int`, *optional*, defaults to 30522):
139 |             Vocabulary size of the Q-Former model. Defines the number of different tokens that can be represented by
140 |             the `inputs_ids` passed when calling the model.
141 |         hidden_size (`int`, *optional*, defaults to 768):
142 |             Dimensionality of the encoder layers and the pooler layer.
143 |         num_hidden_layers (`int`, *optional*, defaults to 12):
144 |             Number of hidden layers in the Transformer encoder.
145 |         num_attention_heads (`int`, *optional*, defaults to 12):
146 |             Number of attention heads for each attention layer in the Transformer encoder.
147 |         intermediate_size (`int`, *optional*, defaults to 3072):
148 |             Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
149 |         hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
150 |             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
151 |             `"relu"`, `"silu"` and `"gelu_new"` are supported.
152 |         hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
153 |             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
154 |         attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
155 |             The dropout ratio for the attention probabilities.
156 |         max_position_embeddings (`int`, *optional*, defaults to 512):
157 |             The maximum sequence length that this model might ever be used with. Typically set this to something large
158 |             just in case (e.g., 512 or 1024 or 2048).
159 |         initializer_range (`float`, *optional*, defaults to 0.02):
160 |             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
161 |         layer_norm_eps (`float`, *optional*, defaults to 1e-12):
162 |             The epsilon used by the layer normalization layers.
163 |         position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
164 |             Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For
165 |             positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to
166 |             [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
167 |             For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
168 |             with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
169 |         classifier_dropout (`float`, *optional*):
170 |             The dropout ratio for the classification head.
171 |         cross_attention_frequency (`int`, *optional*, defaults to 2):
172 |             The frequency of adding cross-attention to the Transformer layers.
173 |         encoder_hidden_size (`int`, *optional*, defaults to 1408):
174 |             The hidden size of the hidden states for cross-attention.
175 |     """
176 |     model_type = "husky_qformer"
177 | 
178 |     def __init__(
179 |             self,
180 |             vocab_size=30522,
181 |             hidden_size=768,
182 |             num_hidden_layers=12,
183 |             num_attention_heads=12,
184 |             intermediate_size=3072,
185 |             hidden_act="gelu",
186 |             hidden_dropout_prob=0.1,
187 |             attention_probs_dropout_prob=0.1,
188 |             max_position_embeddings=512,
189 |             initializer_range=0.02,
190 |             layer_norm_eps=1e-12,
191 |             pad_token_id=0,
192 |             position_embedding_type="absolute",
193 |             classifier_dropout=None,
194 |             cross_attention_frequency=2,
195 |             encoder_hidden_size=1408,
196 |             **kwargs,
197 |     ):
198 |         super().__init__(pad_token_id=pad_token_id, **kwargs)
199 | 
200 |         self.vocab_size = vocab_size
201 |         self.hidden_size = hidden_size
202 |         self.num_hidden_layers = num_hidden_layers
203 |         self.num_attention_heads = num_attention_heads
204 |         self.hidden_act = hidden_act
205 |         self.intermediate_size = intermediate_size
206 |         self.hidden_dropout_prob = hidden_dropout_prob
207 |         self.attention_probs_dropout_prob = attention_probs_dropout_prob
208 |         self.max_position_embeddings = max_position_embeddings
209 |         self.initializer_range = initializer_range
210 |         self.layer_norm_eps = layer_norm_eps
211 |         self.position_embedding_type = position_embedding_type
212 |         self.classifier_dropout = classifier_dropout
213 |         self.cross_attention_frequency = cross_attention_frequency
214 |         self.encoder_hidden_size = encoder_hidden_size
215 | 
216 |     @classmethod
217 |     def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
218 |         config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
219 |         # get the qformer config dict if we are loading from HuskyConfig
220 |         if config_dict.get("model_type") == "husky":
221 |             config_dict = config_dict["qformer_config"]
222 | 
223 |         if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
224 |             logger.warning(
225 |                 f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
226 |                 f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
227 |             )
228 | 
229 |         return cls.from_dict(config_dict, **kwargs)
230 | 
231 | class HuskyConfig(PretrainedConfig):
232 |     r"""
233 |     [`HuskyConfig`] is the configuration class to store the configuration of a
234 |     [`HuskyForConditionalGeneration`]. It is used to instantiate a Husky model according to the specified
235 |     arguments, defining the vision model, Q-Former model and language model configs. Instantiating a configuration with
236 |     the defaults will yield a similar configuration to that of the Husky
237 |     [Salesforce/instruct-blip-flan-t5](https://huggingface.co/Salesforce/instruct-blip-flan-t5) architecture.
238 | 
239 |     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
240 |     documentation from [`PretrainedConfig`] for more information.
241 | 
242 |     Args:
243 |         vision_config (`dict`, *optional*):
244 |             Dictionary of configuration options used to initialize [`HuskyVisionConfig`].
245 |         qformer_config (`dict`, *optional*):
246 |             Dictionary of configuration options used to initialize [`HuskyQFormerConfig`].
247 |         text_config (`dict`, *optional*):
248 |             Dictionary of configuration options used to initialize any [`PretrainedConfig`].
249 |         num_query_tokens (`int`, *optional*, defaults to 32):
250 |             The number of query tokens passed through the Transformer.
251 | 
252 |         kwargs (*optional*):
253 |             Dictionary of keyword arguments.
254 |     """
255 | 
256 |     model_type = "husky"
257 |     is_composition = True
258 | 
259 |     def __init__(self, vision_config=None, qformer_config=None, text_config=None, num_query_tokens=32, **kwargs):
260 |         super().__init__(**kwargs)
261 | 
262 |         if vision_config is None:
263 |             vision_config = {}
264 |             logger.info("vision_config is None. initializing the HuskyVisionConfig with default values.")
265 | 
266 |         if qformer_config is None:
267 |             qformer_config = {}
268 |             logger.info("qformer_config is None. Initializing the HuskyQFormerConfig with default values.")
269 | 
270 |         if text_config is None:
271 |             text_config = {}
272 |             logger.info("text_config is None. Initializing the text config with default values (`OPTConfig`).")
273 | 
274 |         self.vision_config = HuskyVisionConfig(**vision_config)
275 |         self.qformer_config = HuskyQFormerConfig(**qformer_config)
276 |         text_model_type = text_config["model_type"] if "model_type" in text_config else "opt"
277 |         self.text_config = CONFIG_MAPPING[text_model_type](**text_config)
278 | 
279 |         self.tie_word_embeddings = self.text_config.tie_word_embeddings
280 |         self.is_encoder_decoder = self.text_config.is_encoder_decoder
281 | 
282 |         self.num_query_tokens = num_query_tokens
283 |         self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size
284 |         self.use_decoder_only_language_model = self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
285 |         self.initializer_factor = 1.0
286 |         self.initializer_range = 0.02
287 | 
288 |     @classmethod
289 |     def from_vision_qformer_text_configs(
290 |             cls,
291 |             vision_config: HuskyVisionConfig,
292 |             qformer_config: HuskyQFormerConfig,
293 |             text_config: PretrainedConfig,
294 |             **kwargs,
295 |     ):
296 |         r"""
297 |         Instantiate a [`HuskyConfig`] (or a derived class) from a Husky vision model, Q-Former and
298 |         language model configurations.
299 | 
300 |         Returns:
301 |             [`HuskyConfig`]: An instance of a configuration object
302 |         """
303 | 
304 |         return cls(
305 |             vision_config=vision_config.to_dict(),
306 |             qformer_config=qformer_config.to_dict(),
307 |             text_config=text_config.to_dict(),
308 |             **kwargs,
309 |         )
310 | 
311 |     def to_dict(self):
312 |         """
313 |         Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
314 | 
315 |         Returns:
316 |             `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
317 |         """
318 |         output = copy.deepcopy(self.__dict__)
319 |         output["vision_config"] = self.vision_config.to_dict()
320 |         output["qformer_config"] = self.qformer_config.to_dict()
321 |         output["text_config"] = self.text_config.to_dict()
322 |         output["model_type"] = self.__class__.model_type
323 |         return output
324 | 
325 | if __name__ == '__main__':
326 |     config = HuskyConfig.from_pretrain
327 | 


--------------------------------------------------------------------------------
/robohusky/constants.py:
--------------------------------------------------------------------------------
 1 | from enum import IntEnum
 2 | import os
 3 | 
 4 | # For the gradio web server
 5 | SERVER_ERROR_MSG = (
 6 |     "**NETWORK ERROR DUE TO HIGH TRAFFIC. PLEASE REGENERATE OR REFRESH THIS PAGE.**"
 7 | )
 8 | MODERATION_MSG = "YOUR INPUT VIOLATES OUR CONTENT MODERATION GUIDELINES. PLEASE FIX YOUR INPUT AND TRY AGAIN."
 9 | CONVERSATION_LIMIT_MSG = "YOU HAVE REACHED THE CONVERSATION LENGTH LIMIT. PLEASE CLEAR HISTORY AND START A NEW CONVERSATION."
10 | INPUT_CHAR_LEN_LIMIT = 2560
11 | CONVERSATION_LEN_LIMIT = 50
12 | LOGDIR = "."
13 | 
14 | # For the controller and workers(could be overwritten through ENV variables.)
15 | CONTROLLER_HEART_BEAT_EXPIRATION = int(
16 |     os.getenv("FASTCHAT_CONTROLLER_HEART_BEAT_EXPIRATION", 90)
17 | )
18 | WORKER_HEART_BEAT_INTERVAL = int(os.getenv("FASTCHAT_WORKER_HEART_BEAT_INTERVAL", 30))
19 | WORKER_API_TIMEOUT = int(os.getenv("FASTCHAT_WORKER_API_TIMEOUT", 100))
20 | WORKER_API_EMBEDDING_BATCH_SIZE = int(os.getenv("WORKER_API_EMBEDDING_BATCH_SIZE", 4))
21 | 
22 | 
23 | class ErrorCode(IntEnum):
24 |     """
25 |     https://platform.openai.com/docs/guides/error-codes/api-errors
26 |     """
27 | 
28 |     VALIDATION_TYPE_ERROR = 40001
29 | 
30 |     INVALID_AUTH_KEY = 40101
31 |     INCORRECT_AUTH_KEY = 40102
32 |     NO_PERMISSION = 40103
33 | 
34 |     INVALID_MODEL = 40301
35 |     PARAM_OUT_OF_RANGE = 40302
36 |     CONTEXT_OVERFLOW = 40303
37 | 
38 |     RATE_LIMIT = 42901
39 |     QUOTA_EXCEEDED = 42902
40 |     ENGINE_OVERLOADED = 42903
41 | 
42 |     INTERNAL_ERROR = 50001
43 |     CUDA_OUT_OF_MEMORY = 50002
44 |     GRADIO_REQUEST_ERROR = 50003
45 |     GRADIO_STREAM_UNKNOWN_ERROR = 50004
46 |     CONTROLLER_NO_WORKER = 50005
47 |     CONTROLLER_WORKER_TIMEOUT = 50006
48 | 


--------------------------------------------------------------------------------
/robohusky/conversation.py:
--------------------------------------------------------------------------------
  1 | """
  2 | Conversation prompt templates.
  3 | """
  4 | 
  5 | import dataclasses
  6 | from enum import auto, Enum
  7 | from typing import List, Any, Dict
  8 | 
  9 | 
 10 | class SeparatorStyle(Enum):
 11 |     """Separator styles."""
 12 | 
 13 |     ADD_COLON_SINGLE = auto()
 14 |     ADD_COLON_TWO = auto()
 15 |     ADD_COLON_SPACE_SINGLE = auto()
 16 |     NO_COLON_SINGLE = auto()
 17 |     ADD_NEW_LINE_SINGLE = auto()
 18 |     DOLLY = auto()
 19 |     RWKV = auto()
 20 |     PHOENIX = auto()
 21 | 
 22 | 
 23 | @dataclasses.dataclass
 24 | class Conversation:
 25 |     """A class that keeps all conversation history."""
 26 | 
 27 |     # The name of this template
 28 |     name: str
 29 |     # The system prompt
 30 |     system: str
 31 |     # Two roles
 32 |     roles: List[str]
 33 |     # All messages. Each item is (role, message).
 34 |     messages: List[List[str]]
 35 |     # The number of few shot examples
 36 |     offset: int
 37 |     # Separators
 38 |     sep_style: SeparatorStyle
 39 |     sep: str
 40 |     sep2: str = None
 41 |     # Stop criteria (the default one is EOS token)
 42 |     stop_str: str = None
 43 |     # Stops generation if meeting any token in this list
 44 |     stop_token_ids: List[int] = None
 45 | 
 46 |     def get_prompt(self) -> str:
 47 |         """Get the prompt for generation."""
 48 |         if self.sep_style == SeparatorStyle.ADD_COLON_SINGLE:
 49 |             ret = self.system + self.sep
 50 |             for role, message in self.messages:
 51 |                 if message:
 52 |                     ret += role + ": " + message + self.sep
 53 |                 else:
 54 |                     ret += role + ":"
 55 |             return ret
 56 |         elif self.sep_style == SeparatorStyle.ADD_COLON_TWO:
 57 |             seps = [self.sep, self.sep2]
 58 |             ret = self.system + seps[0]
 59 |             for i, (role, message) in enumerate(self.messages):
 60 |                 if message:
 61 |                     ret += role + ": " + message + seps[i % 2]
 62 |                 else:
 63 |                     ret += role + ":"
 64 |             return ret
 65 |         elif self.sep_style == SeparatorStyle.ADD_COLON_SPACE_SINGLE:
 66 |             ret = self.system + self.sep
 67 |             for role, message in self.messages:
 68 |                 if message:
 69 |                     ret += role + ": " + message + self.sep
 70 |                 else:
 71 |                     ret += role + ": "  # must be end with a space
 72 |             return ret
 73 |         elif self.sep_style == SeparatorStyle.NO_COLON_SINGLE:
 74 |             ret = self.system
 75 |             for role, message in self.messages:
 76 |                 if message:
 77 |                     ret += role + message + self.sep
 78 |                 else:
 79 |                     ret += role
 80 |             return ret
 81 |         elif self.sep_style == SeparatorStyle.ADD_NEW_LINE_SINGLE:
 82 |             ret = self.system + self.sep
 83 |             for role, message in self.messages:
 84 |                 if message:
 85 |                     ret += role + "\n" + message + self.sep
 86 |                 else:
 87 |                     ret += role + "\n"
 88 |             return ret
 89 |         elif self.sep_style == SeparatorStyle.DOLLY:
 90 |             seps = [self.sep, self.sep2]
 91 |             ret = self.system
 92 |             for i, (role, message) in enumerate(self.messages):
 93 |                 if message:
 94 |                     ret += role + ":\n" + message + seps[i % 2]
 95 |                     if i % 2 == 1:
 96 |                         ret += "\n\n"
 97 |                 else:
 98 |                     ret += role + ":\n"
 99 |             return ret
100 |         elif self.sep_style == SeparatorStyle.RWKV:
101 |             ret = self.system
102 |             for i, (role, message) in enumerate(self.messages):
103 |                 if message:
104 |                     ret += (
105 |                             role
106 |                             + ": "
107 |                             + message.replace("\r\n", "\n").replace("\n\n", "\n")
108 |                     )
109 |                     ret += "\n\n"
110 |                 else:
111 |                     ret += role + ":"
112 |             return ret
113 |         elif self.sep_style == SeparatorStyle.PHOENIX:
114 |             ret = self.system
115 |             for role, message in self.messages:
116 |                 if message:
117 |                     ret += role + ": " + "<s>" + message + "</s>"
118 |                 else:
119 |                     ret += role + ": " + "<s>"
120 |             return ret
121 |         else:
122 |             raise ValueError(f"Invalid style: {self.sep_style}")
123 | 
124 |     def append_message(self, role: str, message: str):
125 |         """Append a new message."""
126 |         self.messages.append([role, message])
127 | 
128 |     def update_last_message(self, message: str):
129 |         """Update the last output.
130 | 
131 |         The last message is typically set to be None when constructing the prompt,
132 |         so we need to update it in-place after getting the response from a model.
133 |         """
134 |         self.messages[-1][1] = message
135 | 
136 |     def to_gradio_chatbot(self):
137 |         """Convert the conversation to gradio chatbot format"""
138 |         ret = []
139 |         for i, (role, msg) in enumerate(self.messages[self.offset:]):
140 |             if i % 2 == 0:
141 |                 ret.append([msg, None])
142 |             else:
143 |                 ret[-1][-1] = msg
144 |         return ret
145 | 
146 |     def to_openai_api_messages(self):
147 |         """Convert the conversation to OpenAI chat completion format."""
148 |         ret = [{"role": "system", "content": self.system}]
149 | 
150 |         for i, (_, msg) in enumerate(self.messages[self.offset:]):
151 |             if i % 2 == 0:
152 |                 ret.append({"role": "user", "content": msg})
153 |             else:
154 |                 if msg is not None:
155 |                     ret.append({"role": "assistant", "content": msg})
156 |         return ret
157 | 
158 |     def copy(self):
159 |         return Conversation(
160 |             name=self.name,
161 |             system=self.system,
162 |             roles=self.roles,
163 |             messages=[[x, y] for x, y in self.messages],
164 |             offset=self.offset,
165 |             sep_style=self.sep_style,
166 |             sep=self.sep,
167 |             sep2=self.sep2,
168 |             stop_str=self.stop_str,
169 |             stop_token_ids=self.stop_token_ids,
170 |         )
171 | 
172 |     def dict(self):
173 |         return {
174 |             "name": self.name,
175 |             "system": self.system,
176 |             "roles": self.roles,
177 |             "messages": self.messages,
178 |             "offset": self.offset,
179 |         }
180 | 
181 | 
182 | # A global registry for all conversation templates
183 | conv_templates: Dict[str, Conversation] = {}
184 | 
185 | 
186 | def register_conv_template(template: Conversation, override: bool = False):
187 |     """Register a new conversation template."""
188 |     if not override:
189 |         assert template.name not in conv_templates, f"{template.name} has been registered."
190 |     conv_templates[template.name] = template
191 | 
192 | 
193 | def get_conv_template(name: str) -> Conversation:
194 |     """Get a conversation template."""
195 |     return conv_templates[name].copy()
196 | 
197 | 
198 | # A template with one conversation example
199 | register_conv_template(
200 |     Conversation(
201 |         name="one_shot",
202 |         system="A chat between a curious human and an artificial intelligence assistant. "
203 |                "The assistant gives helpful, detailed, and polite answers to the human's questions.",
204 |         roles=("Human", "Assistant"),
205 |         messages=(
206 |             (
207 |                 "Human",
208 |                 "Got any creative ideas for a 10 year old’s birthday?",
209 |             ),
210 |             (
211 |                 "Assistant",
212 |                 """Of course! Here are some creative ideas for a 10-year-old's birthday party:
213 | 1. Treasure Hunt: Organize a treasure hunt in your backyard or nearby park. Create clues and riddles for the kids to solve, leading them to hidden treasures and surprises.
214 | 2. Science Party: Plan a science-themed party where kids can engage in fun and interactive experiments. You can set up different stations with activities like making slime, erupting volcanoes, or creating simple chemical reactions.
215 | 3. Outdoor Movie Night: Set up a backyard movie night with a projector and a large screen or white sheet. Create a cozy seating area with blankets and pillows, and serve popcorn and snacks while the kids enjoy a favorite movie under the stars.
216 | 4. DIY Crafts Party: Arrange a craft party where kids can unleash their creativity. Provide a variety of craft supplies like beads, paints, and fabrics, and let them create their own unique masterpieces to take home as party favors.
217 | 5. Sports Olympics: Host a mini Olympics event with various sports and games. Set up different stations for activities like sack races, relay races, basketball shooting, and obstacle courses. Give out medals or certificates to the participants.
218 | 6. Cooking Party: Have a cooking-themed party where the kids can prepare their own mini pizzas, cupcakes, or cookies. Provide toppings, frosting, and decorating supplies, and let them get hands-on in the kitchen.
219 | 7. Superhero Training Camp: Create a superhero-themed party where the kids can engage in fun training activities. Set up an obstacle course, have them design their own superhero capes or masks, and organize superhero-themed games and challenges.
220 | 8. Outdoor Adventure: Plan an outdoor adventure party at a local park or nature reserve. Arrange activities like hiking, nature scavenger hunts, or a picnic with games. Encourage exploration and appreciation for the outdoors.
221 | Remember to tailor the activities to the birthday child's interests and preferences. Have a great celebration!""",
222 |             ),
223 |         ),
224 |         offset=2,
225 |         sep_style=SeparatorStyle.ADD_COLON_SINGLE,
226 |         sep="\n### ",
227 |         stop_str="###",
228 |     )
229 | )
230 | 
231 | # Vicuna v1.1 template
232 | register_conv_template(
233 |     Conversation(
234 |         name="vicuna_v1.1",
235 |         system="A chat between a curious user and an artificial intelligence assistant. "
236 |                "The assistant gives helpful, detailed, and polite answers to the user's questions.",
237 |         roles=("USER", "ASSISTANT"),
238 |         messages=(),
239 |         offset=0,
240 |         sep_style=SeparatorStyle.ADD_COLON_TWO,
241 |         sep=" ",
242 |         sep2="</s>",
243 |     )
244 | )
245 | 
246 | # Husky template
247 | register_conv_template(
248 |     Conversation(
249 |         name="husky",
250 |         system="",
251 |         roles=("Human", "Assistant"),
252 |         messages=(),
253 |         offset=0,
254 |         sep_style=SeparatorStyle.ADD_COLON_TWO,
255 |         sep=" ",
256 |         sep2="</s>",
257 |     )
258 | )
259 | 
260 | # Koala default template
261 | register_conv_template(
262 |     Conversation(
263 |         name="koala_v1",
264 |         system="BEGINNING OF CONVERSATION:",
265 |         roles=("USER", "GPT"),
266 |         messages=(),
267 |         offset=0,
268 |         sep_style=SeparatorStyle.ADD_COLON_TWO,
269 |         sep=" ",
270 |         sep2="</s>",
271 |     )
272 | )
273 | 
274 | # Alpaca default template
275 | register_conv_template(
276 |     Conversation(
277 |         name="alpaca",
278 |         system="Below is an instruction that describes a task. Write a response that appropriately completes the request.",
279 |         roles=("### Instruction:", "### Response:"),
280 |         messages=(),
281 |         offset=0,
282 |         sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
283 |         sep="\n\n",
284 |     )
285 | )
286 | 
287 | # Dolly V2 default template
288 | register_conv_template(
289 |     Conversation(
290 |         name="dolly_v2",
291 |         system="Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n",
292 |         roles=("### Instruction", "### Response"),
293 |         messages=(),
294 |         offset=0,
295 |         sep_style=SeparatorStyle.DOLLY,
296 |         sep="\n\n",
297 |         sep2="### End",
298 |     )
299 | )
300 | 
301 | # OpenAssistant Pythia default template
302 | register_conv_template(
303 |     Conversation(
304 |         name="oasst_pythia",
305 |         system="",
306 |         roles=("<|prompter|>", "<|assistant|>"),
307 |         messages=(),
308 |         offset=0,
309 |         sep_style=SeparatorStyle.NO_COLON_SINGLE,
310 |         sep="<|endoftext|>",
311 |     )
312 | )
313 | 
314 | # StableLM Alpha default template
315 | register_conv_template(
316 |     Conversation(
317 |         name="stablelm",
318 |         system="""<|SYSTEM|># StableLM Tuned (Alpha version)
319 | - StableLM is a helpful and harmless open-source AI language model developed by StabilityAI.
320 | - StableLM is excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
321 | - StableLM is more than just an information source, StableLM is also able to write poetry, short stories, and make jokes.
322 | - StableLM will refuse to participate in anything that could harm a human.
323 | """,
324 |         roles=("<|USER|>", "<|ASSISTANT|>"),
325 |         messages=(),
326 |         offset=0,
327 |         sep_style=SeparatorStyle.NO_COLON_SINGLE,
328 |         sep="",
329 |         stop_token_ids=[50278, 50279, 50277, 1, 0],
330 |     )
331 | )
332 | 
333 | # Baize default template
334 | register_conv_template(
335 |     Conversation(
336 |         name="baize",
337 |         system="The following is a conversation between a human and an AI assistant named Baize (named after a mythical creature in Chinese folklore). Baize is an open-source AI assistant developed by UCSD and Sun Yat-Sen University. The human and the AI assistant take turns chatting. Human statements start with [|Human|] and AI assistant statements start with [|AI|]. The AI assistant always provides responses in as much detail as possible, and in Markdown format. The AI assistant always declines to engage with topics, questions and instructions related to unethical, controversial, or sensitive issues. Complete the transcript in exactly that format.\n",
338 |         roles=("[|Human|]", "[|AI|]"),
339 |         messages=(
340 |             ("[|Human|]", "Hello!"),
341 |             ("[|AI|]", "Hi!"),
342 |         ),
343 |         offset=2,
344 |         sep_style=SeparatorStyle.NO_COLON_SINGLE,
345 |         sep="\n",
346 |         stop_str="[|Human|]",
347 |     )
348 | )
349 | 
350 | # RWKV-4-Raven default template
351 | register_conv_template(
352 |     Conversation(
353 |         name="rwkv",
354 |         system="",
355 |         roles=("Bob", "Alice"),
356 |         messages=(
357 |             ("Bob", "hi"),
358 |             (
359 |                 "Alice",
360 |                 "Hi. I am your assistant and I will provide expert full response in full details. Please feel free to ask any question and I will always answer it.",
361 |             ),
362 |         ),
363 |         offset=2,
364 |         sep_style=SeparatorStyle.RWKV,
365 |         sep="",
366 |         stop_str="\n\n",
367 |     )
368 | )
369 | 
370 | # Buddy default template
371 | register_conv_template(
372 |     Conversation(
373 |         name="openbuddy",
374 |         system="""Consider a conversation between User (a human) and Assistant (named Buddy).
375 | Buddy is an INTP-T, a friendly, intelligent and multilingual AI assistant, by OpenBuddy team. GitHub: https://github.com/OpenBuddy/OpenBuddy
376 | Buddy cannot access the Internet.
377 | Buddy can fluently speak the user's language (e.g. English, Chinese).
378 | Buddy can generate poems, stories, code, essays, songs, parodies, and more.
379 | Buddy possesses vast knowledge about the world, history, and culture.
380 | Buddy's responses are always safe, creative, high-quality, human-like, and interesting.
381 | Buddy strictly refuses to discuss political, NSFW, or other unsafe topics.
382 | 
383 | User: Hi.
384 | Assistant: Hi, I'm Buddy, your AI assistant. How can I help you today?""",
385 |         roles=("User", "Assistant"),
386 |         messages=(),
387 |         offset=0,
388 |         sep_style=SeparatorStyle.ADD_COLON_SINGLE,
389 |         sep="\n",
390 |     )
391 | )
392 | 
393 | # Phoenix default template
394 | register_conv_template(
395 |     Conversation(
396 |         name="phoenix",
397 |         system="A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n",
398 |         roles=("Human", "Assistant"),
399 |         messages=(),
400 |         offset=0,
401 |         sep_style=SeparatorStyle.PHOENIX,
402 |         sep="</s>",
403 |     )
404 | )
405 | 
406 | # ChatGPT default template
407 | register_conv_template(
408 |     Conversation(
409 |         name="chatgpt",
410 |         system="You are a helpful assistant.",
411 |         roles=("user", "assistant"),
412 |         messages=(),
413 |         offset=0,
414 |         sep_style=None,
415 |         sep=None,
416 |     )
417 | )
418 | 
419 | # Claude default template
420 | register_conv_template(
421 |     Conversation(
422 |         name="claude",
423 |         system="",
424 |         roles=("Human", "Assistant"),
425 |         messages=(),
426 |         offset=0,
427 |         sep_style=SeparatorStyle.ADD_COLON_SINGLE,
428 |         sep="\n\n",
429 |     )
430 | )
431 | 
432 | # MPT default template
433 | register_conv_template(
434 |     Conversation(
435 |         name="mpt",
436 |         system="""<|im_start|>system
437 | - You are a helpful assistant chatbot trained by MosaicML.
438 | - You answer questions.
439 | - You are excited to be able to help the user, but will refuse to do anything that could be considered harmful to the user.
440 | - You are more than just an information source, you are also able to write poetry, short stories, and make jokes.
441 | """,
442 |         roles=("<|im_start|>user", "<|im_start|>assistant"),
443 |         messages=(),
444 |         offset=0,
445 |         sep_style=SeparatorStyle.ADD_NEW_LINE_SINGLE,
446 |         sep="<|im_end|>",
447 |         stop_token_ids=[50278, 0],
448 |     )
449 | )
450 | 
451 | # Bard default template
452 | register_conv_template(
453 |     Conversation(
454 |         name="bard",
455 |         system="",
456 |         roles=("0", "1"),
457 |         messages=(),
458 |         offset=0,
459 |         sep_style=None,
460 |         sep=None,
461 |     )
462 | )
463 | 
464 | # BiLLa default template
465 | register_conv_template(
466 |     Conversation(
467 |         name="billa",
468 |         system="",
469 |         roles=("Human", "Assistant"),
470 |         messages=(),
471 |         offset=0,
472 |         sep_style=SeparatorStyle.ADD_COLON_SPACE_SINGLE,
473 |         sep="\n",
474 |         stop_str="Human:",
475 |     )
476 | )
477 | 
478 | # RedPajama INCITE default template
479 | register_conv_template(
480 |     Conversation(
481 |         name="redpajama-incite",
482 |         system="",
483 |         roles=("<human>", "<bot>"),
484 |         messages=(),
485 |         offset=0,
486 |         sep_style=SeparatorStyle.ADD_COLON_SINGLE,
487 |         sep="\n",
488 |         stop_str="<human>",
489 |     )
490 | )
491 | 
492 | # h2oGPT default template
493 | register_conv_template(
494 |     Conversation(
495 |         name="h2ogpt",
496 |         system="",
497 |         roles=("<|prompt|>", "<|answer|>"),
498 |         messages=(),
499 |         offset=0,
500 |         sep_style=SeparatorStyle.NO_COLON_SINGLE,
501 |         sep="</s>",
502 |     )
503 | )
504 | 
505 | if __name__ == "__main__":
506 |     conv = get_conv_template("husky")
507 |     conv.append_message(conv.roles[0], "Hello!")
508 |     conv.append_message(conv.roles[1], "Hi!")
509 |     conv.append_message(conv.roles[0], "How are you?")
510 |     conv.append_message(conv.roles[1], None)
511 |     print(conv.get_prompt())
512 | 


--------------------------------------------------------------------------------
/robohusky/convert_fp16.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Usage:
 3 | srun -p INTERN2 --job-name='convert_2_fp16' --gres=gpu:0 --cpus-per-task=8 --quotatype="auto" python -u husky/convert_fp16.py --in-checkpoint work_dirs/llm/husky-13b/zh_bell/checkpoint-9500 --out-checkpoint work_dirs/llm/husky-13b/zh_bell/
 4 | """
 5 | import argparse
 6 | import os.path
 7 | 
 8 | from transformers import AutoTokenizer, AutoModelForCausalLM
 9 | import torch
10 | 
11 | def convert_fp16(in_checkpoint, out_checkpoint):
12 |     tokenizer = AutoTokenizer.from_pretrained(in_checkpoint, use_fast=False)
13 |     model = AutoModelForCausalLM.from_pretrained(
14 |         in_checkpoint, torch_dtype=torch.float16, low_cpu_mem_usage=False
15 |     )
16 |     if not os.path.exists(out_checkpoint):
17 |         os.mkdir(out_checkpoint)
18 |     model.save_pretrained(out_checkpoint)
19 |     tokenizer.save_pretrained(out_checkpoint)
20 | 
21 | if __name__ == "__main__":
22 |     parser = argparse.ArgumentParser()
23 |     parser.add_argument("--in-checkpoint", type=str, help="Path to the model")
24 |     parser.add_argument("--out-checkpoint", type=str, help="Path to the output model")
25 |     args = parser.parse_args()
26 | 
27 |     convert_fp16(args.in_checkpoint, args.out_checkpoint)
28 | 


--------------------------------------------------------------------------------
/robohusky/convert_husky_fp16.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Usage:
 3 | srun -p INTERN2 --job-name='convert_2_fp16' --gres=gpu:0 --cpus-per-task=8 --quotatype="auto" python -u husky/convert_husky_fp16.py --in-checkpoint work_dirs/husky_v3/multi_align/checkpoint-48000 --out-checkpoint work_dirs/husky_v3/multi_align_fp16
 4 | """
 5 | import argparse
 6 | import os.path
 7 | 
 8 | from transformers import AutoTokenizer
 9 | from husky.model.modeling_husky_multi import HuskyForConditionalGeneration
10 | import torch
11 | 
12 | def convert_fp16(in_checkpoint, out_checkpoint):
13 |     tokenizer = AutoTokenizer.from_pretrained(in_checkpoint, use_fast=False)
14 |     model = HuskyForConditionalGeneration.from_pretrained(
15 |         in_checkpoint, torch_dtype=torch.float16, low_cpu_mem_usage=False
16 |     )
17 |     if not os.path.exists(out_checkpoint):
18 |         os.mkdir(out_checkpoint)
19 |     model.save_pretrained(out_checkpoint)
20 |     tokenizer.save_pretrained(out_checkpoint)
21 | 
22 | if __name__ == "__main__":
23 |     parser = argparse.ArgumentParser()
24 |     parser.add_argument("--in-checkpoint", type=str, help="Path to the model")
25 |     parser.add_argument("--out-checkpoint", type=str, help="Path to the output model")
26 |     args = parser.parse_args()
27 | 
28 |     convert_fp16(args.in_checkpoint, args.out_checkpoint)
29 | 


--------------------------------------------------------------------------------
/robohusky/convert_reward_fp16.py:
--------------------------------------------------------------------------------
 1 | """
 2 | Usage:
 3 | srun -p INTERN2 --job-name='convert_2_fp16' --gres=gpu:0 --cpus-per-task=8 --quotatype="auto" python -u husky/convert_reward_fp16.py --in-checkpoint work_dirs/llm/Ziya-LLaMA-7B-Reward --out-checkpoint work_dirs/llm/reward_model
 4 | """
 5 | import argparse
 6 | import os.path
 7 | 
 8 | from transformers import LlamaTokenizer, AutoModelForSequenceClassification
 9 | import torch
10 | 
11 | def convert_fp16(in_checkpoint, out_checkpoint):
12 |     tokenizer = LlamaTokenizer.from_pretrained(in_checkpoint, use_fast=False)
13 |     model = AutoModelForSequenceClassification.from_pretrained(
14 |         in_checkpoint, torch_dtype=torch.float16, low_cpu_mem_usage=False
15 |     )
16 |     if not os.path.exists(out_checkpoint):
17 |         os.mkdir(out_checkpoint)
18 |     model.save_pretrained(out_checkpoint)
19 |     tokenizer.save_pretrained(out_checkpoint)
20 | 
21 | if __name__ == "__main__":
22 |     parser = argparse.ArgumentParser()
23 |     parser.add_argument("--in-checkpoint", type=str, help="Path to the model")
24 |     parser.add_argument("--out-checkpoint", type=str, help="Path to the output model")
25 |     args = parser.parse_args()
26 | 
27 |     convert_fp16(args.in_checkpoint, args.out_checkpoint)
28 | 


--------------------------------------------------------------------------------
/robohusky/dist_utils.py:
--------------------------------------------------------------------------------
  1 | import functools
  2 | import os
  3 | import socket
  4 | import subprocess
  5 | from collections import OrderedDict
  6 | 
  7 | import torch
  8 | import torch.multiprocessing as mp
  9 | from torch import distributed as dist
 10 | 
 11 | 
 12 | def _find_free_port():
 13 |     # Copied from https://github.com/facebookresearch/detectron2/blob/main/detectron2/engine/launch.py # noqa: E501
 14 |     sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
 15 |     # Binding to port 0 will cause the OS to find an available port for us
 16 |     sock.bind(('', 0))
 17 |     port = sock.getsockname()[1]
 18 |     sock.close()
 19 |     # NOTE: there is still a chance the port could be taken by other processes.
 20 |     return port
 21 | 
 22 | 
 23 | def _is_free_port(port):
 24 |     ips = socket.gethostbyname_ex(socket.gethostname())[-1]
 25 |     ips.append('localhost')
 26 |     with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
 27 |         return all(s.connect_ex((ip, port)) != 0 for ip in ips)
 28 | 
 29 | 
 30 | def init_dist(launcher, backend='nccl', **kwargs):
 31 |     if mp.get_start_method(allow_none=True) is None:
 32 |         mp.set_start_method('spawn')
 33 |     if launcher == 'pytorch':
 34 |         _init_dist_pytorch(backend, **kwargs)
 35 |     elif launcher == 'mpi':
 36 |         _init_dist_mpi(backend, **kwargs)
 37 |     elif launcher == 'slurm':
 38 |         _init_dist_slurm(backend, **kwargs)
 39 |     else:
 40 |         raise ValueError(f'Invalid launcher type: {launcher}')
 41 | 
 42 | 
 43 | def _init_dist_pytorch(backend, **kwargs):
 44 |     # TODO: use local_rank instead of rank % num_gpus
 45 |     rank = int(os.environ['RANK'])
 46 |     num_gpus = torch.cuda.device_count()
 47 |     torch.cuda.set_device(rank % num_gpus)
 48 |     dist.init_process_group(backend=backend, **kwargs)
 49 | 
 50 | 
 51 | def _init_dist_mpi(backend, **kwargs):
 52 |     local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])
 53 |     torch.cuda.set_device(local_rank)
 54 |     if 'MASTER_PORT' not in os.environ:
 55 |         # 29500 is torch.distributed default port
 56 |         os.environ['MASTER_PORT'] = '29500'
 57 |     if 'MASTER_ADDR' not in os.environ:
 58 |         raise KeyError('The environment variable MASTER_ADDR is not set')
 59 |     os.environ['WORLD_SIZE'] = os.environ['OMPI_COMM_WORLD_SIZE']
 60 |     os.environ['RANK'] = os.environ['OMPI_COMM_WORLD_RANK']
 61 |     dist.init_process_group(backend=backend, **kwargs)
 62 | 
 63 | 
 64 | def _init_dist_slurm(backend, port=None):
 65 |     """Initialize slurm distributed training environment.
 66 | 
 67 |     If argument ``port`` is not specified, then the master port will be system
 68 |     environment variable ``MASTER_PORT``. If ``MASTER_PORT`` is not in system
 69 |     environment variable, then a default port ``29500`` will be used.
 70 | 
 71 |     Args:
 72 |         backend (str): Backend of torch.distributed.
 73 |         port (int, optional): Master port. Defaults to None.
 74 |     """
 75 |     proc_id = int(os.environ['SLURM_PROCID'])
 76 |     ntasks = int(os.environ['SLURM_NTASKS'])
 77 |     node_list = os.environ['SLURM_NODELIST']
 78 |     num_gpus = torch.cuda.device_count()
 79 |     torch.cuda.set_device(proc_id % num_gpus)
 80 |     addr = subprocess.getoutput(
 81 |         f'scontrol show hostname {node_list} | head -n1')
 82 |     # specify master port
 83 |     if port is not None:
 84 |         os.environ['MASTER_PORT'] = str(port)
 85 |     elif 'MASTER_PORT' in os.environ:
 86 |         pass  # use MASTER_PORT in the environment variable
 87 |     else:
 88 |         # if torch.distributed default port(29500) is available
 89 |         # then use it, else find a free port
 90 |         if _is_free_port(29500):
 91 |             os.environ['MASTER_PORT'] = '29500'
 92 |         else:
 93 |             os.environ['MASTER_PORT'] = str(_find_free_port())
 94 |     # use MASTER_ADDR in the environment variable if it already exists
 95 |     if 'MASTER_ADDR' not in os.environ:
 96 |         os.environ['MASTER_ADDR'] = addr
 97 |     os.environ['WORLD_SIZE'] = str(ntasks)
 98 |     os.environ['LOCAL_RANK'] = str(proc_id % num_gpus)
 99 |     os.environ['RANK'] = str(proc_id)
100 |     dist.init_process_group(backend=backend)
101 | 


--------------------------------------------------------------------------------
/robohusky/llama2_flash_attn_monkey_patch.py:
--------------------------------------------------------------------------------
  1 | import warnings
  2 | from typing import Optional, Tuple
  3 | 
  4 | import torch
  5 | from flash_attn import __version__ as flash_attn_version
  6 | from flash_attn.bert_padding import pad_input, unpad_input
  7 | from flash_attn.flash_attn_interface import (
  8 |     flash_attn_func,
  9 |     flash_attn_varlen_kvpacked_func,
 10 | )
 11 | from transformers.models.llama.modeling_llama import (
 12 |     LlamaAttention,
 13 |     LlamaModel,
 14 |     rotate_half,
 15 | )
 16 | 
 17 | def apply_rotary_pos_emb(q, k, cos_sin, position_ids):
 18 |     gather_indices = position_ids[:, :, None, None]  # [bsz, seq_len, 1, 1]
 19 |     gather_indices = gather_indices.repeat(
 20 |         1, 1, cos_sin[0].shape[1], cos_sin[0].shape[3]
 21 |     )
 22 |     bsz = gather_indices.shape[0]
 23 |     cos, sin = (
 24 |         torch.gather(x.transpose(1, 2).repeat(bsz, 1, 1, 1), 1, gather_indices)
 25 |         for x in cos_sin
 26 |     )
 27 |     q, k = ((x * cos) + (rotate_half(x) * sin) for x in (q, k))
 28 |     return q, k
 29 | 
 30 | def forward(
 31 |         self,
 32 |         hidden_states: torch.Tensor,
 33 |         attention_mask: Optional[torch.Tensor] = None,
 34 |         position_ids: Optional[torch.Tensor] = None,
 35 |         past_key_value: Optional[Tuple[torch.Tensor]] = None,
 36 |         output_attentions: bool = False,
 37 |         use_cache: bool = False,
 38 |         padding_mask: Optional[torch.Tensor] = None,
 39 | ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
 40 |     if output_attentions:
 41 |         warnings.warn(
 42 |             "Output attentions is not supported for patched `LlamaAttention`, returning `None` instead."
 43 |         )
 44 | 
 45 |     bsz, q_len, _ = hidden_states.size()
 46 |     kv_heads = getattr(self, "num_key_value_heads", self.num_heads)
 47 | 
 48 |     q, k, v = (
 49 |         op(hidden_states).view(bsz, q_len, nh, self.head_dim)
 50 |         for op, nh in (
 51 |         (self.q_proj, self.num_heads),
 52 |         (self.k_proj, kv_heads),
 53 |         (self.v_proj, kv_heads),
 54 |     )
 55 |     )
 56 |     # shape: (b, s, num_heads, head_dim)
 57 | 
 58 |     kv_seq_len = k.shape[1]
 59 |     past_kv_len = 0
 60 |     if past_key_value is not None:
 61 |         past_kv_len = past_key_value[0].shape[2]
 62 |         kv_seq_len += past_kv_len
 63 | 
 64 |     cos_sin = self.rotary_emb(v, seq_len=kv_seq_len)
 65 |     q, k = apply_rotary_pos_emb(q, k, cos_sin, position_ids)
 66 | 
 67 |     if past_key_value is not None:
 68 |         assert (
 69 |                 flash_attn_version >= "2.1.0"
 70 |         ), "past_key_value support requires flash-attn >= 2.1.0"
 71 |         # reuse k, v
 72 |         k = torch.cat([past_key_value[0].transpose(1, 2), k], dim=1)
 73 |         v = torch.cat([past_key_value[1].transpose(1, 2), v], dim=1)
 74 | 
 75 |     past_key_value = (k.transpose(1, 2), v.transpose(1, 2)) if use_cache else None
 76 | 
 77 |     if attention_mask is None:
 78 |         output = flash_attn_func(q, k, v, 0.0, softmax_scale=None, causal=True).view(
 79 |             bsz, q_len, -1
 80 |         )
 81 |     else:
 82 |         q, indices, cu_q_lens, max_s = unpad_input(q, attention_mask[:, -q_len:])
 83 |         # We can skip concat and call unpad twice but seems better to call unpad only once.
 84 |         kv, _, cu_k_lens, max_k = unpad_input(
 85 |             torch.stack((k, v), dim=2), attention_mask
 86 |         )
 87 |         output_unpad = flash_attn_varlen_kvpacked_func(
 88 |             q,
 89 |             kv,
 90 |             cu_q_lens,
 91 |             cu_k_lens,
 92 |             max_s,
 93 |             max_k,
 94 |             0.0,
 95 |             softmax_scale=None,
 96 |             causal=True,
 97 |         )
 98 |         output_unpad = output_unpad.reshape(-1, self.num_heads * self.head_dim)
 99 |         output = pad_input(output_unpad, indices, bsz, q_len)
100 | 
101 |     return self.o_proj(output), None, past_key_value
102 | 
103 | # Disable the transformation of the attention mask in LlamaModel as flash attention
104 | # takes a boolean key_padding_mask. Fills in the past kv length for use in forward.
105 | def _prepare_decoder_attention_mask(
106 |         self, attention_mask, input_shape, inputs_embeds, past_key_values_length
107 | ):
108 |     # [bsz, seq_len]
109 |     if past_key_values_length > 0 and attention_mask is not None:
110 |         attention_mask = torch.cat(
111 |             (
112 |                 torch.full(
113 |                     (input_shape[0], past_key_values_length),
114 |                     True,
115 |                     dtype=attention_mask.dtype,
116 |                     device=attention_mask.device,
117 |                 ),
118 |                 attention_mask,
119 |             ),
120 |             dim=-1,
121 |         )
122 | 
123 |     if attention_mask is not None and torch.all(attention_mask):
124 |         return None  # This uses the faster call when training with full samples
125 | 
126 |     return attention_mask
127 | 
128 | def replace_llama_attn_with_flash_attn():
129 |     cuda_major, cuda_minor = torch.cuda.get_device_capability()
130 |     if cuda_major < 8:
131 |         warnings.warn(
132 |             "Flash attention is only supported on A100 or H100 GPU during training due to head dim > 64 backward."
133 |             "ref: https://github.com/HazyResearch/flash-attention/issues/190#issuecomment-1523359593"
134 |         )
135 | 
136 |     LlamaModel._prepare_decoder_attention_mask = _prepare_decoder_attention_mask
137 |     LlamaAttention.forward = forward
138 | 
139 | def test():
140 |     from robohusky.train.llama_flash_attn_monkey_patch import forward as fastchat_forward
141 |     from transformers.models.llama.configuration_llama import LlamaConfig
142 | 
143 |     config = LlamaConfig(
144 |         hidden_size=1024,
145 |         intermediate_size=128,
146 |         num_hidden_layers=1,
147 |         num_attention_heads=8,
148 |         max_position_embeddings=16,
149 |     )
150 |     device = torch.device("cuda")
151 |     model = LlamaModel(config)
152 |     attn = LlamaAttention(config).to(device).half()
153 |     bsz, hs, seqlen = 2, config.hidden_size, config.max_position_embeddings
154 |     position_ids = torch.arange(seqlen, dtype=torch.long, device=device).view(
155 |         -1, seqlen
156 |     )
157 | 
158 |     mask = torch.full((bsz, seqlen), True, dtype=torch.bool, device=device)
159 |     for i in range(4):
160 |         hidden = torch.rand((bsz, seqlen, hs), dtype=torch.float16, device=device)
161 |         if i:
162 |             mask[0, -i:] = False
163 |             mask[1, :i] = False
164 | 
165 |         lmask = model._prepare_decoder_attention_mask(mask, hidden.shape[:2], hidden, 0)
166 |         ref, _, _ = attn.forward(
167 |             hidden, attention_mask=lmask, position_ids=position_ids
168 |         )
169 | 
170 |         fast, _, _ = fastchat_forward(
171 |             attn, hidden, attention_mask=mask, position_ids=position_ids
172 |         )
173 | 
174 |         lmask = _prepare_decoder_attention_mask(
175 |             model, mask, hidden.shape[:2], hidden, 0
176 |         )
177 |         test, _, _ = forward(
178 |             attn, hidden, attention_mask=lmask, position_ids=position_ids
179 |         )
180 | 
181 |         print(f"Mean(abs(ref)) = {torch.mean(torch.abs(ref))}")
182 |         print(f"Mean(abs(ref - fast)) = {torch.mean(torch.abs(ref - fast))}")
183 |         print(f"Mean(abs(ref - test)) = {torch.mean(torch.abs(ref - test))}")
184 |         print(f"Mean(abs(fast - test)) = {torch.mean(torch.abs(fast - test))}")
185 |         print(f"allclose(fast, test) = {torch.allclose(fast, test)}")
186 | 
187 |     with torch.no_grad():
188 |         # Also check that past_kv is handled properly
189 |         hidden = torch.rand((bsz, seqlen, hs), dtype=torch.float16, device=device)
190 |         part_len = seqlen // 4
191 |         assert part_len * 4 == seqlen
192 |         mask = torch.full((bsz, seqlen), True, dtype=torch.bool, device=device)
193 |         mask[0, -2:] = False
194 |         lmask = _prepare_decoder_attention_mask(
195 |             model, mask, hidden.shape[:2], hidden, 0
196 |         )
197 |         oneshot, _, _ = forward(
198 |             attn, hidden, attention_mask=lmask, position_ids=position_ids
199 |         )
200 |         parts = []
201 |         past_kv, past_kv_len = None, 0
202 |         for i in range(4):
203 |             start = part_len * i
204 |             end = start + part_len
205 |             hidden_part = hidden[:, start:end, ...]
206 |             lmask = _prepare_decoder_attention_mask(
207 |                 model,
208 |                 mask[:, start:end],
209 |                 hidden_part.shape[:2],
210 |                 hidden_part,
211 |                 past_kv_len,
212 |             )
213 |             part, _, past_kv = forward(
214 |                 attn,
215 |                 hidden_part.clone(),
216 |                 attention_mask=lmask,
217 |                 position_ids=position_ids[:, start:end],
218 |                 past_key_value=past_kv,
219 |                 use_cache=True,
220 |             )
221 |             parts.append(part)
222 |             past_kv_len = past_kv[0].shape[2]
223 | 
224 |         print(
225 |             f"allclose(oneshot[:, 0], parts[0]) = {torch.allclose(oneshot[:, :part_len], parts[0])}"
226 |         )
227 |         print(
228 |             f"allclose(oneshot, parts) = {torch.allclose(oneshot, torch.cat(parts, dim=1))}"
229 |         )
230 | 
231 | if __name__ == "__main__":
232 |     test()
233 | 


--------------------------------------------------------------------------------
/robohusky/model/__init__.py:
--------------------------------------------------------------------------------
 1 | # Copyright 2023 The HuggingFace Team. All rights reserved.
 2 | #
 3 | # Licensed under the Apache License, Version 2.0 (the "License");
 4 | # you may not use this file except in compliance with the License.
 5 | # You may obtain a copy of the License at
 6 | #
 7 | #     http://www.apache.org/licenses/LICENSE-2.0
 8 | #
 9 | # Unless required by applicable law or agreed to in writing, software
10 | # distributed under the License is distributed on an "AS IS" BASIS,
11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 | # See the License for the specific language governing permissions and
13 | # limitations under the License.
14 | from typing import TYPE_CHECKING
15 | 
16 | from transformers.utils import OptionalDependencyNotAvailable, _LazyModule, is_torch_available
17 | 
18 | _import_structure = {
19 |     "configuration_husky": [
20 |         "HUSKY_PRETRAINED_CONFIG_ARCHIVE_MAP",
21 |         "HuskyConfig",
22 |         "HuskyQFormerConfig",
23 |         "HuskyVisionConfig",
24 |     ],
25 |     "processing_husky": ["HuskyProcessor"],
26 | }
27 | 
28 | try:
29 |     if not is_torch_available():
30 |         raise OptionalDependencyNotAvailable()
31 | except OptionalDependencyNotAvailable:
32 |     pass
33 | else:
34 |     _import_structure["modeling_husky"] = [
35 |         "HUSKY_PRETRAINED_MODEL_ARCHIVE_LIST",
36 |         "HuskyModel",
37 |         "HuskyQFormerModel",
38 |         "HuskyPreTrainedModel",
39 |         "HuskyForConditionalGeneration",
40 |         "HuskyVisionModel",
41 |     ]
42 | 
43 | if TYPE_CHECKING:
44 |     from .configuration_husky import (
45 |         HUSKY_PRETRAINED_CONFIG_ARCHIVE_MAP,
46 |         HuskyConfig,
47 |         HuskyVisionConfig,
48 |         HuskyQFormerConfig
49 |     )
50 |     from .processing_husky import HuskyProcessor
51 | 
52 |     try:
53 |         if not is_torch_available():
54 |             raise OptionalDependencyNotAvailable()
55 |     except OptionalDependencyNotAvailable:
56 |         pass
57 |     else:
58 |         from .modeling_husky import (
59 |             HUSKY_PRETRAINED_MODEL_ARCHIVE_LIST,
60 |             HuskyForConditionalGeneration,
61 |             HuskyModel,
62 |             HuskyPreTrainedModel,
63 |             HuskyQFormerModel,
64 |             HuskyVisionModel,
65 |         )
66 | 
67 | else:
68 |     import sys
69 | 
70 |     sys.modules[__name__] = _LazyModule(__name__, globals()["__file__"], _import_structure, module_spec=__spec__)
71 | 


--------------------------------------------------------------------------------
/robohusky/model/__pycache__/__init__.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EmbodiedGPT/EmbodiedGPT_Pytorch/cda80524bf6b7d276ba3b532887bacd4b133f234/robohusky/model/__pycache__/__init__.cpython-38.pyc


--------------------------------------------------------------------------------
/robohusky/model/__pycache__/configuration_husky.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EmbodiedGPT/EmbodiedGPT_Pytorch/cda80524bf6b7d276ba3b532887bacd4b133f234/robohusky/model/__pycache__/configuration_husky.cpython-38.pyc


--------------------------------------------------------------------------------
/robohusky/model/__pycache__/modeling_husky_embody2.cpython-38.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EmbodiedGPT/EmbodiedGPT_Pytorch/cda80524bf6b7d276ba3b532887bacd4b133f234/robohusky/model/__pycache__/modeling_husky_embody2.cpython-38.pyc


--------------------------------------------------------------------------------
/robohusky/model/compression.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EmbodiedGPT/EmbodiedGPT_Pytorch/cda80524bf6b7d276ba3b532887bacd4b133f234/robohusky/model/compression.py


--------------------------------------------------------------------------------
/robohusky/model/configuration_husky.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | """ Husky model configuration"""
 16 | 
 17 | import copy
 18 | import os
 19 | from typing import Union
 20 | 
 21 | from transformers.configuration_utils import PretrainedConfig
 22 | from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
 23 | from transformers.utils import logging
 24 | 
 25 | from transformers.models.auto import CONFIG_MAPPING
 26 | 
 27 | logger = logging.get_logger(__name__)
 28 | 
 29 | HUSKY_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 30 |     "wofmanaf/husky-7b": "https://huggingface.co/wofmanaf/husky-7b/resolve/main/config.json",
 31 | }
 32 | 
 33 | class HuskyVisionConfig(PretrainedConfig):
 34 |     r"""
 35 |     This is the configuration class to store the configuration of a [`HuskyVisionModel`]. It is used to
 36 |     instantiate a Husky vision encoder according to the specified arguments, defining the model architecture.
 37 |     Instantiating a configuration defaults will yield a similar configuration to that of the Husky architecture.
 38 | 
 39 |     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
 40 |     documentation from [`PretrainedConfig`] for more information.
 41 | 
 42 |     Args:
 43 |         hidden_size (`int`, *optional*, defaults to 1408):
 44 |             Dimensionality of the encoder layers and the pooler layer.
 45 |         intermediate_size (`int`, *optional*, defaults to 6144):
 46 |             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
 47 |         num_hidden_layers (`int`, *optional*, defaults to 39):
 48 |             Number of hidden layers in the Transformer encoder.
 49 |         num_attention_heads (`int`, *optional*, defaults to 16):
 50 |             Number of attention heads for each attention layer in the Transformer encoder.
 51 |         image_size (`int`, *optional*, defaults to 224):
 52 |             The size (resolution) of each image.
 53 |         patch_size (`int`, *optional*, defaults to 14):
 54 |             The size (resolution) of each patch.
 55 |         hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
 56 |             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
 57 |             `"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported. layer_norm_eps (`float`, *optional*, defaults
 58 |             to 1e-5): The epsilon used by the layer normalization layers.
 59 |         dropout (`float`, *optional*, defaults to 0.0):
 60 |             The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
 61 |         attention_dropout (`float`, *optional*, defaults to 0.0):
 62 |             The dropout ratio for the attention probabilities.
 63 |         initializer_range (`float`, *optional*, defaults to 0.02):
 64 |             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
 65 |         initializer_factor (`float``, *optional*, defaults to 1):
 66 |             A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
 67 |             testing).
 68 |         qkv_bias (`bool`, *optional*, defaults to `True`):
 69 |             Whether to add a bias to the queries and values in the self-attention layers.
 70 |     """
 71 | 
 72 |     model_type = "husky_vision_model"
 73 | 
 74 |     def __init__(
 75 |             self,
 76 |             hidden_size=1408,
 77 |             intermediate_size=6144,
 78 |             projection_dim=512,
 79 |             num_hidden_layers=39,
 80 |             num_attention_heads=16,
 81 |             num_channels=3,
 82 |             image_size=224,
 83 |             patch_size=14,
 84 |             hidden_act="gelu",
 85 |             layer_norm_eps=0.00001,
 86 |             dropout=0.0,
 87 |             attention_dropout=0.0,
 88 |             initializer_range=1e-10,
 89 |             initializer_factor=1.0,
 90 |             qkv_bias=True,
 91 |             _flash_attn_2_enabled=True,
 92 |             **kwargs,
 93 |     ):
 94 |         super().__init__(**kwargs)
 95 | 
 96 |         self.hidden_size = hidden_size
 97 |         self.intermediate_size = intermediate_size
 98 |         self.projection_dim = projection_dim
 99 |         self.dropout = dropout
100 |         self.num_hidden_layers = num_hidden_layers
101 |         self.num_attention_heads = num_attention_heads
102 |         self.num_channels = num_channels
103 |         self.patch_size = patch_size
104 |         self.image_size = image_size
105 |         self.initializer_range = initializer_range
106 |         self.initializer_factor = initializer_factor
107 |         self.attention_dropout = attention_dropout
108 |         self.layer_norm_eps = layer_norm_eps
109 |         self.hidden_act = hidden_act
110 |         self.qkv_bias = qkv_bias
111 |         self._flash_attn_2_enabled = _flash_attn_2_enabled
112 | 
113 |     @classmethod
114 |     def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
115 |         config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
116 | 
117 |         # get the vision config dict if we are loading from HuskyConfig
118 |         if config_dict.get("model_type") == "husky":
119 |             config_dict = config_dict["vision_config"]
120 | 
121 |         if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
122 |             logger.warning(
123 |                 f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
124 |                 f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
125 |             )
126 | 
127 |         return cls.from_dict(config_dict, **kwargs)
128 | 
129 | class HuskyQFormerConfig(PretrainedConfig):
130 |     r"""
131 |     This is the configuration class to store the configuration of a [`HuskyQFormerModel`]. It is used to
132 |     instantiate a Husky Querying Transformer (Q-Former) model according to the specified arguments, defining the
133 |     model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
134 |     the Husky [Salesforce/instruct-blip-flan-t5](https://huggingface.co/Salesforce/instruct-blip-flan-t5)
135 |     architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs.
136 |     Read the documentation from [`PretrainedConfig`] for more information.
137 | 
138 |     Note that [`HuskyQFormerModel`] is very similar to [`BertLMHeadModel`] with interleaved cross-attention.
139 | 
140 |     Args:
141 |         vocab_size (`int`, *optional*, defaults to 30522):
142 |             Vocabulary size of the Q-Former model. Defines the number of different tokens that can be represented by
143 |             the `inputs_ids` passed when calling the model.
144 |         hidden_size (`int`, *optional*, defaults to 768):
145 |             Dimensionality of the encoder layers and the pooler layer.
146 |         num_hidden_layers (`int`, *optional*, defaults to 12):
147 |             Number of hidden layers in the Transformer encoder.
148 |         num_attention_heads (`int`, *optional*, defaults to 12):
149 |             Number of attention heads for each attention layer in the Transformer encoder.
150 |         intermediate_size (`int`, *optional*, defaults to 3072):
151 |             Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
152 |         hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
153 |             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
154 |             `"relu"`, `"silu"` and `"gelu_new"` are supported.
155 |         hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
156 |             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
157 |         attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
158 |             The dropout ratio for the attention probabilities.
159 |         max_position_embeddings (`int`, *optional*, defaults to 512):
160 |             The maximum sequence length that this model might ever be used with. Typically set this to something large
161 |             just in case (e.g., 512 or 1024 or 2048).
162 |         initializer_range (`float`, *optional*, defaults to 0.02):
163 |             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
164 |         layer_norm_eps (`float`, *optional*, defaults to 1e-12):
165 |             The epsilon used by the layer normalization layers.
166 |         position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
167 |             Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For
168 |             positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to
169 |             [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
170 |             For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
171 |             with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
172 |         classifier_dropout (`float`, *optional*):
173 |             The dropout ratio for the classification head.
174 |         cross_attention_frequency (`int`, *optional*, defaults to 2):
175 |             The frequency of adding cross-attention to the Transformer layers.
176 |         encoder_hidden_size (`int`, *optional*, defaults to 1408):
177 |             The hidden size of the hidden states for cross-attention.
178 |     """
179 |     model_type = "husky_qformer"
180 | 
181 |     def __init__(
182 |             self,
183 |             vocab_size=30522,
184 |             hidden_size=768,
185 |             num_hidden_layers=12,
186 |             num_attention_heads=12,
187 |             intermediate_size=3072,
188 |             hidden_act="gelu",
189 |             hidden_dropout_prob=0.1,
190 |             attention_probs_dropout_prob=0.1,
191 |             max_position_embeddings=512,
192 |             initializer_range=0.02,
193 |             layer_norm_eps=1e-12,
194 |             pad_token_id=0,
195 |             position_embedding_type="absolute",
196 |             classifier_dropout=None,
197 |             cross_attention_frequency=2,
198 |             encoder_hidden_size=1408,
199 |             _flash_attn_2_enabled=True,
200 |             **kwargs,
201 |     ):
202 |         super().__init__(pad_token_id=pad_token_id, **kwargs)
203 | 
204 |         self.vocab_size = vocab_size
205 |         self.hidden_size = hidden_size
206 |         self.num_hidden_layers = num_hidden_layers
207 |         self.num_attention_heads = num_attention_heads
208 |         self.hidden_act = hidden_act
209 |         self.intermediate_size = intermediate_size
210 |         self.hidden_dropout_prob = hidden_dropout_prob
211 |         self.attention_probs_dropout_prob = attention_probs_dropout_prob
212 |         self.max_position_embeddings = max_position_embeddings
213 |         self.initializer_range = initializer_range
214 |         self.layer_norm_eps = layer_norm_eps
215 |         self.position_embedding_type = position_embedding_type
216 |         self.classifier_dropout = classifier_dropout
217 |         self.cross_attention_frequency = cross_attention_frequency
218 |         self.encoder_hidden_size = encoder_hidden_size
219 |         self._flash_attn_2_enabled = _flash_attn_2_enabled
220 | 
221 |     @classmethod
222 |     def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
223 |         config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
224 |         # get the qformer config dict if we are loading from HuskyConfig
225 |         if config_dict.get("model_type") == "husky":
226 |             config_dict = config_dict["qformer_config"]
227 | 
228 |         if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
229 |             logger.warning(
230 |                 f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
231 |                 f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
232 |             )
233 | 
234 |         return cls.from_dict(config_dict, **kwargs)
235 | 
236 | class HuskyConfig(PretrainedConfig):
237 |     r"""
238 |     [`HuskyConfig`] is the configuration class to store the configuration of a
239 |     [`HuskyForConditionalGeneration`]. It is used to instantiate a Husky model according to the specified
240 |     arguments, defining the vision model, Q-Former model and language model configs. Instantiating a configuration with
241 |     the defaults will yield a similar configuration to that of the Husky
242 |     [Salesforce/instruct-blip-flan-t5](https://huggingface.co/Salesforce/instruct-blip-flan-t5) architecture.
243 | 
244 |     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
245 |     documentation from [`PretrainedConfig`] for more information.
246 | 
247 |     Args:
248 |         vision_config (`dict`, *optional*):
249 |             Dictionary of configuration options used to initialize [`HuskyVisionConfig`].
250 |         qformer_config (`dict`, *optional*):
251 |             Dictionary of configuration options used to initialize [`HuskyQFormerConfig`].
252 |         text_config (`dict`, *optional*):
253 |             Dictionary of configuration options used to initialize any [`PretrainedConfig`].
254 |         num_query_tokens (`int`, *optional*, defaults to 32):
255 |             The number of query tokens passed through the Transformer.
256 | 
257 |         kwargs (*optional*):
258 |             Dictionary of keyword arguments.
259 |     """
260 | 
261 |     model_type = "husky"
262 |     is_composition = True
263 | 
264 |     def __init__(self, vision_config=None, qformer_config=None, text_config=None, num_query_tokens=32, **kwargs):
265 |         super().__init__(**kwargs)
266 | 
267 |         if vision_config is None:
268 |             vision_config = {}
269 |             logger.info("vision_config is None. initializing the HuskyVisionConfig with default values.")
270 | 
271 |         if qformer_config is None:
272 |             qformer_config = {}
273 |             logger.info("qformer_config is None. Initializing the HuskyQFormerConfig with default values.")
274 | 
275 |         if text_config is None:
276 |             text_config = {}
277 |             logger.info("text_config is None. Initializing the text config with default values (`OPTConfig`).")
278 | 
279 |         self.vision_config = HuskyVisionConfig(**vision_config)
280 |         self.qformer_config = HuskyQFormerConfig(**qformer_config)
281 |         text_model_type = text_config["model_type"] if "model_type" in text_config else "opt"
282 |         self.text_config = CONFIG_MAPPING[text_model_type](**text_config)
283 | 
284 |         self.tie_word_embeddings = self.text_config.tie_word_embeddings
285 |         self.is_encoder_decoder = self.text_config.is_encoder_decoder
286 | 
287 |         self.num_query_tokens = num_query_tokens
288 |         self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size
289 |         self.use_decoder_only_language_model = self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
290 |         self.initializer_factor = 1.0
291 |         self.initializer_range = 0.02
292 | 
293 |     @classmethod
294 |     def from_vision_qformer_text_configs(
295 |             cls,
296 |             vision_config: HuskyVisionConfig,
297 |             qformer_config: HuskyQFormerConfig,
298 |             text_config: PretrainedConfig,
299 |             **kwargs,
300 |     ):
301 |         r"""
302 |         Instantiate a [`HuskyConfig`] (or a derived class) from a Husky vision model, Q-Former and
303 |         language model configurations.
304 | 
305 |         Returns:
306 |             [`HuskyConfig`]: An instance of a configuration object
307 |         """
308 | 
309 |         return cls(
310 |             vision_config=vision_config.to_dict(),
311 |             qformer_config=qformer_config.to_dict(),
312 |             text_config=text_config.to_dict(),
313 |             **kwargs,
314 |         )
315 | 
316 |     def to_dict(self):
317 |         """
318 |         Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
319 | 
320 |         Returns:
321 |             `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
322 |         """
323 |         output = copy.deepcopy(self.__dict__)
324 |         output["vision_config"] = self.vision_config.to_dict()
325 |         output["qformer_config"] = self.qformer_config.to_dict()
326 |         output["text_config"] = self.text_config.to_dict()
327 |         output["model_type"] = self.__class__.model_type
328 |         return output
329 | 
330 | if __name__ == '__main__':
331 |     config = HuskyConfig.from_pretrain
332 | 


--------------------------------------------------------------------------------
/robohusky/model/configuration_husky_ori.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright 2023 The HuggingFace Inc. team. All rights reserved.
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | """ Husky model configuration"""
 16 | 
 17 | import copy
 18 | import os
 19 | from typing import Union
 20 | 
 21 | from transformers.configuration_utils import PretrainedConfig
 22 | from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
 23 | from transformers.utils import logging
 24 | 
 25 | from transformers.models.auto import CONFIG_MAPPING
 26 | 
 27 | logger = logging.get_logger(__name__)
 28 | 
 29 | HUSKY_PRETRAINED_CONFIG_ARCHIVE_MAP = {
 30 |     "wofmanaf/husky-7b": "https://huggingface.co/wofmanaf/husky-7b/resolve/main/config.json",
 31 | }
 32 | 
 33 | class HuskyVisionConfig(PretrainedConfig):
 34 |     r"""
 35 |     This is the configuration class to store the configuration of a [`HuskyVisionModel`]. It is used to
 36 |     instantiate a Husky vision encoder according to the specified arguments, defining the model architecture.
 37 |     Instantiating a configuration defaults will yield a similar configuration to that of the Husky architecture.
 38 | 
 39 |     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
 40 |     documentation from [`PretrainedConfig`] for more information.
 41 | 
 42 |     Args:
 43 |         hidden_size (`int`, *optional*, defaults to 1408):
 44 |             Dimensionality of the encoder layers and the pooler layer.
 45 |         intermediate_size (`int`, *optional*, defaults to 6144):
 46 |             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
 47 |         num_hidden_layers (`int`, *optional*, defaults to 39):
 48 |             Number of hidden layers in the Transformer encoder.
 49 |         num_attention_heads (`int`, *optional*, defaults to 16):
 50 |             Number of attention heads for each attention layer in the Transformer encoder.
 51 |         image_size (`int`, *optional*, defaults to 224):
 52 |             The size (resolution) of each image.
 53 |         patch_size (`int`, *optional*, defaults to 14):
 54 |             The size (resolution) of each patch.
 55 |         hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
 56 |             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
 57 |             `"relu"`, `"selu"` and `"gelu_new"` ``"gelu"` are supported. layer_norm_eps (`float`, *optional*, defaults
 58 |             to 1e-5): The epsilon used by the layer normalization layers.
 59 |         dropout (`float`, *optional*, defaults to 0.0):
 60 |             The dropout probabilitiy for all fully connected layers in the embeddings, encoder, and pooler.
 61 |         attention_dropout (`float`, *optional*, defaults to 0.0):
 62 |             The dropout ratio for the attention probabilities.
 63 |         initializer_range (`float`, *optional*, defaults to 0.02):
 64 |             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
 65 |         initializer_factor (`float``, *optional*, defaults to 1):
 66 |             A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
 67 |             testing).
 68 |         qkv_bias (`bool`, *optional*, defaults to `True`):
 69 |             Whether to add a bias to the queries and values in the self-attention layers.
 70 |     """
 71 | 
 72 |     model_type = "husky_vision_model"
 73 | 
 74 |     def __init__(
 75 |             self,
 76 |             hidden_size=1408,
 77 |             intermediate_size=6144,
 78 |             projection_dim=512,
 79 |             num_hidden_layers=39,
 80 |             num_attention_heads=16,
 81 |             num_channels=3,
 82 |             image_size=224,
 83 |             patch_size=14,
 84 |             hidden_act="gelu",
 85 |             layer_norm_eps=0.00001,
 86 |             dropout=0.0,
 87 |             attention_dropout=0.0,
 88 |             initializer_range=1e-10,
 89 |             initializer_factor=1.0,
 90 |             qkv_bias=True,
 91 |             **kwargs,
 92 |     ):
 93 |         super().__init__(**kwargs)
 94 | 
 95 |         self.hidden_size = hidden_size
 96 |         self.intermediate_size = intermediate_size
 97 |         self.projection_dim = projection_dim
 98 |         self.dropout = dropout
 99 |         self.num_hidden_layers = num_hidden_layers
100 |         self.num_attention_heads = num_attention_heads
101 |         self.num_channels = num_channels
102 |         self.patch_size = patch_size
103 |         self.image_size = image_size
104 |         self.initializer_range = initializer_range
105 |         self.initializer_factor = initializer_factor
106 |         self.attention_dropout = attention_dropout
107 |         self.layer_norm_eps = layer_norm_eps
108 |         self.hidden_act = hidden_act
109 |         self.qkv_bias = qkv_bias
110 | 
111 |     @classmethod
112 |     def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
113 |         config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
114 | 
115 |         # get the vision config dict if we are loading from HuskyConfig
116 |         if config_dict.get("model_type") == "husky":
117 |             config_dict = config_dict["vision_config"]
118 | 
119 |         if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
120 |             logger.warning(
121 |                 f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
122 |                 f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
123 |             )
124 | 
125 |         return cls.from_dict(config_dict, **kwargs)
126 | 
127 | class HuskyQFormerConfig(PretrainedConfig):
128 |     r"""
129 |     This is the configuration class to store the configuration of a [`HuskyQFormerModel`]. It is used to
130 |     instantiate a Husky Querying Transformer (Q-Former) model according to the specified arguments, defining the
131 |     model architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
132 |     the Husky [Salesforce/instruct-blip-flan-t5](https://huggingface.co/Salesforce/instruct-blip-flan-t5)
133 |     architecture. Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs.
134 |     Read the documentation from [`PretrainedConfig`] for more information.
135 | 
136 |     Note that [`HuskyQFormerModel`] is very similar to [`BertLMHeadModel`] with interleaved cross-attention.
137 | 
138 |     Args:
139 |         vocab_size (`int`, *optional*, defaults to 30522):
140 |             Vocabulary size of the Q-Former model. Defines the number of different tokens that can be represented by
141 |             the `inputs_ids` passed when calling the model.
142 |         hidden_size (`int`, *optional*, defaults to 768):
143 |             Dimensionality of the encoder layers and the pooler layer.
144 |         num_hidden_layers (`int`, *optional*, defaults to 12):
145 |             Number of hidden layers in the Transformer encoder.
146 |         num_attention_heads (`int`, *optional*, defaults to 12):
147 |             Number of attention heads for each attention layer in the Transformer encoder.
148 |         intermediate_size (`int`, *optional*, defaults to 3072):
149 |             Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
150 |         hidden_act (`str` or `Callable`, *optional*, defaults to `"gelu"`):
151 |             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
152 |             `"relu"`, `"silu"` and `"gelu_new"` are supported.
153 |         hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
154 |             The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
155 |         attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
156 |             The dropout ratio for the attention probabilities.
157 |         max_position_embeddings (`int`, *optional*, defaults to 512):
158 |             The maximum sequence length that this model might ever be used with. Typically set this to something large
159 |             just in case (e.g., 512 or 1024 or 2048).
160 |         initializer_range (`float`, *optional*, defaults to 0.02):
161 |             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
162 |         layer_norm_eps (`float`, *optional*, defaults to 1e-12):
163 |             The epsilon used by the layer normalization layers.
164 |         position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
165 |             Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For
166 |             positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to
167 |             [Self-Attention with Relative Position Representations (Shaw et al.)](https://arxiv.org/abs/1803.02155).
168 |             For more information on `"relative_key_query"`, please refer to *Method 4* in [Improve Transformer Models
169 |             with Better Relative Position Embeddings (Huang et al.)](https://arxiv.org/abs/2009.13658).
170 |         classifier_dropout (`float`, *optional*):
171 |             The dropout ratio for the classification head.
172 |         cross_attention_frequency (`int`, *optional*, defaults to 2):
173 |             The frequency of adding cross-attention to the Transformer layers.
174 |         encoder_hidden_size (`int`, *optional*, defaults to 1408):
175 |             The hidden size of the hidden states for cross-attention.
176 |     """
177 |     model_type = "husky_qformer"
178 | 
179 |     def __init__(
180 |             self,
181 |             vocab_size=30522,
182 |             hidden_size=768,
183 |             num_hidden_layers=12,
184 |             num_attention_heads=12,
185 |             intermediate_size=3072,
186 |             hidden_act="gelu",
187 |             hidden_dropout_prob=0.1,
188 |             attention_probs_dropout_prob=0.1,
189 |             max_position_embeddings=512,
190 |             initializer_range=0.02,
191 |             layer_norm_eps=1e-12,
192 |             pad_token_id=0,
193 |             position_embedding_type="absolute",
194 |             classifier_dropout=None,
195 |             cross_attention_frequency=2,
196 |             encoder_hidden_size=1408,
197 |             **kwargs,
198 |     ):
199 |         super().__init__(pad_token_id=pad_token_id, **kwargs)
200 | 
201 |         self.vocab_size = vocab_size
202 |         self.hidden_size = hidden_size
203 |         self.num_hidden_layers = num_hidden_layers
204 |         self.num_attention_heads = num_attention_heads
205 |         self.hidden_act = hidden_act
206 |         self.intermediate_size = intermediate_size
207 |         self.hidden_dropout_prob = hidden_dropout_prob
208 |         self.attention_probs_dropout_prob = attention_probs_dropout_prob
209 |         self.max_position_embeddings = max_position_embeddings
210 |         self.initializer_range = initializer_range
211 |         self.layer_norm_eps = layer_norm_eps
212 |         self.position_embedding_type = position_embedding_type
213 |         self.classifier_dropout = classifier_dropout
214 |         self.cross_attention_frequency = cross_attention_frequency
215 |         self.encoder_hidden_size = encoder_hidden_size
216 | 
217 |     @classmethod
218 |     def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig":
219 |         config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
220 |         # get the qformer config dict if we are loading from HuskyConfig
221 |         if config_dict.get("model_type") == "husky":
222 |             config_dict = config_dict["qformer_config"]
223 | 
224 |         if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
225 |             logger.warning(
226 |                 f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
227 |                 f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
228 |             )
229 | 
230 |         return cls.from_dict(config_dict, **kwargs)
231 | 
232 | class HuskyConfig(PretrainedConfig):
233 |     r"""
234 |     [`HuskyConfig`] is the configuration class to store the configuration of a
235 |     [`HuskyForConditionalGeneration`]. It is used to instantiate a Husky model according to the specified
236 |     arguments, defining the vision model, Q-Former model and language model configs. Instantiating a configuration with
237 |     the defaults will yield a similar configuration to that of the Husky
238 |     [Salesforce/instruct-blip-flan-t5](https://huggingface.co/Salesforce/instruct-blip-flan-t5) architecture.
239 | 
240 |     Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
241 |     documentation from [`PretrainedConfig`] for more information.
242 | 
243 |     Args:
244 |         vision_config (`dict`, *optional*):
245 |             Dictionary of configuration options used to initialize [`HuskyVisionConfig`].
246 |         qformer_config (`dict`, *optional*):
247 |             Dictionary of configuration options used to initialize [`HuskyQFormerConfig`].
248 |         text_config (`dict`, *optional*):
249 |             Dictionary of configuration options used to initialize any [`PretrainedConfig`].
250 |         num_query_tokens (`int`, *optional*, defaults to 32):
251 |             The number of query tokens passed through the Transformer.
252 | 
253 |         kwargs (*optional*):
254 |             Dictionary of keyword arguments.
255 |     """
256 | 
257 |     model_type = "husky"
258 |     is_composition = True
259 | 
260 |     def __init__(self, vision_config=None, qformer_config=None, text_config=None, num_query_tokens=32, **kwargs):
261 |         super().__init__(**kwargs)
262 | 
263 |         if vision_config is None:
264 |             vision_config = {}
265 |             logger.info("vision_config is None. initializing the HuskyVisionConfig with default values.")
266 | 
267 |         if qformer_config is None:
268 |             qformer_config = {}
269 |             logger.info("qformer_config is None. Initializing the HuskyQFormerConfig with default values.")
270 | 
271 |         if text_config is None:
272 |             text_config = {}
273 |             logger.info("text_config is None. Initializing the text config with default values (`OPTConfig`).")
274 | 
275 |         self.vision_config = HuskyVisionConfig(**vision_config)
276 |         self.qformer_config = HuskyQFormerConfig(**qformer_config)
277 |         text_model_type = text_config["model_type"] if "model_type" in text_config else "opt"
278 |         self.text_config = CONFIG_MAPPING[text_model_type](**text_config)
279 | 
280 |         self.tie_word_embeddings = self.text_config.tie_word_embeddings
281 |         self.is_encoder_decoder = self.text_config.is_encoder_decoder
282 | 
283 |         self.num_query_tokens = num_query_tokens
284 |         self.qformer_config.encoder_hidden_size = self.vision_config.hidden_size
285 |         self.use_decoder_only_language_model = self.text_config.model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES
286 |         self.initializer_factor = 1.0
287 |         self.initializer_range = 0.02
288 | 
289 |     @classmethod
290 |     def from_vision_qformer_text_configs(
291 |             cls,
292 |             vision_config: HuskyVisionConfig,
293 |             qformer_config: HuskyQFormerConfig,
294 |             text_config: PretrainedConfig,
295 |             **kwargs,
296 |     ):
297 |         r"""
298 |         Instantiate a [`HuskyConfig`] (or a derived class) from a Husky vision model, Q-Former and
299 |         language model configurations.
300 | 
301 |         Returns:
302 |             [`HuskyConfig`]: An instance of a configuration object
303 |         """
304 | 
305 |         return cls(
306 |             vision_config=vision_config.to_dict(),
307 |             qformer_config=qformer_config.to_dict(),
308 |             text_config=text_config.to_dict(),
309 |             **kwargs,
310 |         )
311 | 
312 |     def to_dict(self):
313 |         """
314 |         Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
315 | 
316 |         Returns:
317 |             `Dict[str, any]`: Dictionary of all the attributes that make up this configuration instance,
318 |         """
319 |         output = copy.deepcopy(self.__dict__)
320 |         output["vision_config"] = self.vision_config.to_dict()
321 |         output["qformer_config"] = self.qformer_config.to_dict()
322 |         output["text_config"] = self.text_config.to_dict()
323 |         output["model_type"] = self.__class__.model_type
324 |         return output
325 | 
326 | if __name__ == '__main__':
327 |     config = HuskyConfig.from_pretrain
328 | 


--------------------------------------------------------------------------------
/robohusky/model/processing_husky.py:
--------------------------------------------------------------------------------
  1 | # coding=utf-8
  2 | # Copyright 2023 The HuggingFace Inc. team.
  3 | #
  4 | # Licensed under the Apache License, Version 2.0 (the "License");
  5 | # you may not use this file except in compliance with the License.
  6 | # You may obtain a copy of the License at
  7 | #
  8 | #     http://www.apache.org/licenses/LICENSE-2.0
  9 | #
 10 | # Unless required by applicable law or agreed to in writing, software
 11 | # distributed under the License is distributed on an "AS IS" BASIS,
 12 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 13 | # See the License for the specific language governing permissions and
 14 | # limitations under the License.
 15 | """
 16 | Processor class for Husky. Largely copy of Blip2Processor with addition of a tokenizer for the Q-Former.
 17 | """
 18 | 
 19 | from typing import List, Optional, Union
 20 | 
 21 | from transformers.processing_utils import ProcessorMixin
 22 | from transformers.tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, \
 23 |     TruncationStrategy
 24 | from transformers.utils import TensorType
 25 | from transformers.models.auto import AutoTokenizer
 26 | 
 27 | 
 28 | class HuskyProcessor(ProcessorMixin):
 29 |     r"""
 30 |     Constructs an Husky processor which wraps a BLIP image processor and a LLaMa/T5 tokenizer into a single
 31 |     processor.
 32 | 
 33 |     [`HuskyProcessor`] offers all the functionalities of [`BlipImageProcessor`] and [`AutoTokenizer`]. See the
 34 |     docstring of [`~BlipProcessor.__call__`] and [`~BlipProcessor.decode`] for more information.
 35 | 
 36 |     Args:
 37 |         image_processor (`BlipImageProcessor`):
 38 |             An instance of [`BlipImageProcessor`]. The image processor is a required input.
 39 |         tokenizer (`AutoTokenizer`):
 40 |             An instance of ['PreTrainedTokenizer`]. The tokenizer is a required input.
 41 |     """
 42 |     attributes = ["image_processor", "tokenizer"]
 43 |     image_processor_class = "BlipImageProcessor"
 44 |     tokenizer_class = "AutoTokenizer"
 45 | 
 46 |     def __init__(self, image_processor, tokenizer):
 47 |         super().__init__(image_processor, tokenizer)
 48 |         self.current_processor = self.image_processor
 49 | 
 50 |         # add QFormer tokenizer
 51 |         self.qformer_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased", truncation_side="left")
 52 |         self.qformer_tokenizer.add_special_tokens({"bos_token": "[DEC]"})
 53 | 
 54 |     def __call__(
 55 |             self,
 56 |             images=None,
 57 |             text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
 58 |             add_special_tokens: bool = True,
 59 |             padding: Union[bool, str, PaddingStrategy] = False,
 60 |             truncation: Union[bool, str, TruncationStrategy] = None,
 61 |             max_length: Optional[int] = None,
 62 |             stride: int = 0,
 63 |             pad_to_multiple_of: Optional[int] = None,
 64 |             return_attention_mask: Optional[bool] = None,
 65 |             return_overflowing_tokens: bool = False,
 66 |             return_special_tokens_mask: bool = False,
 67 |             return_offsets_mapping: bool = False,
 68 |             return_token_type_ids: bool = False,
 69 |             return_length: bool = False,
 70 |             verbose: bool = True,
 71 |             return_tensors: Optional[Union[str, TensorType]] = None,
 72 |             **kwargs,
 73 |     ) -> BatchEncoding:
 74 |         """
 75 |         This method uses [`BlipImageProcessor.__call__`] method to prepare image(s) for the model, and
 76 |         [`BertTokenizerFast.__call__`] to prepare text for the model.
 77 | 
 78 |         Please refer to the docstring of the above two methods for more information.
 79 |         """
 80 |         if images is None and text is None:
 81 |             raise ValueError("You have to specify either images or text.")
 82 | 
 83 |         # Get only text
 84 |         if images is None:
 85 |             self.current_processor = self.tokenizer
 86 |             text_encoding = self.tokenizer(
 87 |                 text=text,
 88 |                 add_special_tokens=add_special_tokens,
 89 |                 padding=padding,
 90 |                 truncation=truncation,
 91 |                 max_length=max_length,
 92 |                 stride=stride,
 93 |                 pad_to_multiple_of=pad_to_multiple_of,
 94 |                 return_attention_mask=return_attention_mask,
 95 |                 return_overflowing_tokens=return_overflowing_tokens,
 96 |                 return_special_tokens_mask=return_special_tokens_mask,
 97 |                 return_offsets_mapping=return_offsets_mapping,
 98 |                 return_token_type_ids=return_token_type_ids,
 99 |                 return_length=return_length,
100 |                 verbose=verbose,
101 |                 return_tensors=return_tensors,
102 |                 **kwargs,
103 |             )
104 |             return text_encoding
105 | 
106 |         # add pixel_values
107 |         encoding_image_processor = self.image_processor(images, return_tensors=return_tensors)
108 | 
109 |         if text is not None:
110 |             text_encoding = self.tokenizer(
111 |                 text=text,
112 |                 add_special_tokens=add_special_tokens,
113 |                 padding=padding,
114 |                 truncation=truncation,
115 |                 max_length=max_length,
116 |                 stride=stride,
117 |                 pad_to_multiple_of=pad_to_multiple_of,
118 |                 return_attention_mask=return_attention_mask,
119 |                 return_overflowing_tokens=return_overflowing_tokens,
120 |                 return_special_tokens_mask=return_special_tokens_mask,
121 |                 return_offsets_mapping=return_offsets_mapping,
122 |                 return_token_type_ids=return_token_type_ids,
123 |                 return_length=return_length,
124 |                 verbose=verbose,
125 |                 return_tensors=return_tensors,
126 |                 **kwargs,
127 |             )
128 |             qformer_text_encoding = self.qformer_tokenizer(
129 |                 text=text,
130 |                 add_special_tokens=add_special_tokens,
131 |                 padding=padding,
132 |                 truncation=truncation,
133 |                 max_length=max_length,
134 |                 stride=stride,
135 |                 pad_to_multiple_of=pad_to_multiple_of,
136 |                 return_attention_mask=return_attention_mask,
137 |                 return_overflowing_tokens=return_overflowing_tokens,
138 |                 return_special_tokens_mask=return_special_tokens_mask,
139 |                 return_offsets_mapping=return_offsets_mapping,
140 |                 return_token_type_ids=return_token_type_ids,
141 |                 return_length=return_length,
142 |                 verbose=verbose,
143 |                 return_tensors=return_tensors,
144 |                 **kwargs,
145 |             )
146 |             qformer_text_encoding["qformer_input_ids"] = qformer_text_encoding.pop("input_ids")
147 |             qformer_text_encoding["qformer_attention_mask"] = qformer_text_encoding.pop("attention_mask")
148 |             text_encoding.update(qformer_text_encoding)
149 |         else:
150 |             text_encoding = None
151 | 
152 |         if text_encoding is not None:
153 |             encoding_image_processor.update(text_encoding)
154 | 
155 |         return encoding_image_processor
156 | 
157 |     # Copied from transformers.models.blip.processing_blip.BlipProcessor.batch_decode with BertTokenizerFast->PreTrainedTokenizer
158 |     def batch_decode(self, *args, **kwargs):
159 |         """
160 |         This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
161 |         refer to the docstring of this method for more information.
162 |         """
163 |         return self.tokenizer.batch_decode(*args, **kwargs)
164 | 
165 |     # Copied from transformers.models.blip.processing_blip.BlipProcessor.decode with BertTokenizerFast->PreTrainedTokenizer
166 |     def decode(self, *args, **kwargs):
167 |         """
168 |         This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer
169 |         to the docstring of this method for more information.
170 |         """
171 |         return self.tokenizer.decode(*args, **kwargs)
172 | 
173 |     @property
174 |     # Copied from transformers.models.blip.processing_blip.BlipProcessor.model_input_names
175 |     def model_input_names(self):
176 |         tokenizer_input_names = self.tokenizer.model_input_names
177 |         image_processor_input_names = self.image_processor.model_input_names
178 |         return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
179 | 


--------------------------------------------------------------------------------
/robohusky/train/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/EmbodiedGPT/EmbodiedGPT_Pytorch/cda80524bf6b7d276ba3b532887bacd4b133f234/robohusky/train/.DS_Store


--------------------------------------------------------------------------------
/robohusky/train/llama_flash_attn_monkey_patch.py:
--------------------------------------------------------------------------------
  1 | import warnings
  2 | from typing import Optional, Tuple
  3 | 
  4 | import torch
  5 | from flash_attn import __version__ as flash_attn_version
  6 | from flash_attn.bert_padding import pad_input, unpad_input
  7 | from flash_attn.flash_attn_interface import (
  8 |     flash_attn_func,
  9 |     flash_attn_varlen_kvpacked_func,
 10 | )
 11 | from transformers.models.llama.modeling_llama import (
 12 |     LlamaAttention,
 13 |     LlamaModel,
 14 |     rotate_half,
 15 | )
 16 | 
 17 | def apply_rotary_pos_emb(q, k, cos_sin, position_ids):
 18 |     gather_indices = position_ids[:, :, None, None]  # [bsz, seq_len, 1, 1]
 19 |     gather_indices = gather_indices.repeat(
 20 |         1, 1, cos_sin[0].shape[1], cos_sin[0].shape[3]
 21 |     )
 22 |     bsz = gather_indices.shape[0]
 23 |     cos, sin = (
 24 |         torch.gather(x.transpose(1, 2).repeat(bsz, 1, 1, 1), 1, gather_indices)
 25 |         for x in cos_sin
 26 |     )
 27 |     q, k = ((x * cos) + (rotate_half(x) * sin) for x in (q, k))
 28 |     return q, k
 29 | 
 30 | def forward(
 31 |         self,
 32 |         hidden_states: torch.Tensor,
 33 |         attention_mask: Optional[torch.Tensor] = None,
 34 |         position_ids: Optional[torch.Tensor] = None,
 35 |         past_key_value: Optional[Tuple[torch.Tensor]] = None,
 36 |         output_attentions: bool = False,
 37 |         use_cache: bool = False,
 38 |         padding_mask: Optional[torch.Tensor] = None,
 39 | ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
 40 |     if output_attentions:
 41 |         warnings.warn(
 42 |             "Output attentions is not supported for patched `LlamaAttention`, returning `None` instead."
 43 |         )
 44 | 
 45 |     bsz, q_len, _ = hidden_states.size()
 46 |     kv_heads = getattr(self, "num_key_value_heads", self.num_heads)
 47 | 
 48 |     q, k, v = (
 49 |         op(hidden_states).view(bsz, q_len, nh, self.head_dim)
 50 |         for op, nh in (
 51 |         (self.q_proj, self.num_heads),
 52 |         (self.k_proj, kv_heads),
 53 |         (self.v_proj, kv_heads),
 54 |     )
 55 |     )
 56 |     # shape: (b, s, num_heads, head_dim)
 57 | 
 58 |     kv_seq_len = k.shape[1]
 59 |     past_kv_len = 0
 60 |     if past_key_value is not None:
 61 |         past_kv_len = past_key_value[0].shape[2]
 62 |         kv_seq_len += past_kv_len
 63 | 
 64 |     cos_sin = self.rotary_emb(v, seq_len=kv_seq_len)
 65 |     q, k = apply_rotary_pos_emb(q, k, cos_sin, position_ids)
 66 | 
 67 |     if past_key_value is not None:
 68 |         assert (
 69 |                 flash_attn_version >= "2.1.0"
 70 |         ), "past_key_value support requires flash-attn >= 2.1.0"
 71 |         # reuse k, v
 72 |         k = torch.cat([past_key_value[0].transpose(1, 2), k], dim=1)
 73 |         v = torch.cat([past_key_value[1].transpose(1, 2), v], dim=1)
 74 | 
 75 |     past_key_value = (k.transpose(1, 2), v.transpose(1, 2)) if use_cache else None
 76 | 
 77 |     if attention_mask is None:
 78 |         output = flash_attn_func(q, k, v, 0.0, softmax_scale=None, causal=True).view(
 79 |             bsz, q_len, -1
 80 |         )
 81 |     else:
 82 |         q, indices, cu_q_lens, max_s = unpad_input(q, attention_mask[:, -q_len:])
 83 |         # We can skip concat and call unpad twice but seems better to call unpad only once.
 84 |         kv, _, cu_k_lens, max_k = unpad_input(
 85 |             torch.stack((k, v), dim=2), attention_mask
 86 |         )
 87 |         output_unpad = flash_attn_varlen_kvpacked_func(
 88 |             q,
 89 |             kv,
 90 |             cu_q_lens,
 91 |             cu_k_lens,
 92 |             max_s,
 93 |             max_k,
 94 |             0.0,
 95 |             softmax_scale=None,
 96 |             causal=True,
 97 |         )
 98 |         output_unpad = output_unpad.reshape(-1, self.num_heads * self.head_dim)
 99 |         output = pad_input(output_unpad, indices, bsz, q_len)
100 | 
101 |     return self.o_proj(output), None, past_key_value
102 | 
103 | # Disable the transformation of the attention mask in LlamaModel as flash attention
104 | # takes a boolean key_padding_mask. Fills in the past kv length for use in forward.
105 | def _prepare_decoder_attention_mask(
106 |         self, attention_mask, input_shape, inputs_embeds, past_key_values_length
107 | ):
108 |     # [bsz, seq_len]
109 |     if past_key_values_length > 0 and attention_mask is not None:
110 |         attention_mask = torch.cat(
111 |             (
112 |                 torch.full(
113 |                     (input_shape[0], past_key_values_length),
114 |                     True,
115 |                     dtype=attention_mask.dtype,
116 |                     device=attention_mask.device,
117 |                 ),
118 |                 attention_mask,
119 |             ),
120 |             dim=-1,
121 |         )
122 | 
123 |     if attention_mask is not None and torch.all(attention_mask):
124 |         return None  # This uses the faster call when training with full samples
125 | 
126 |     return attention_mask
127 | 
128 | def replace_llama_attn_with_flash_attn():
129 |     cuda_major, cuda_minor = torch.cuda.get_device_capability()
130 |     if cuda_major < 8:
131 |         warnings.warn(
132 |             "Flash attention is only supported on A100 or H100 GPU during training due to head dim > 64 backward."
133 |             "ref: https://github.com/HazyResearch/flash-attention/issues/190#issuecomment-1523359593"
134 |         )
135 | 
136 |     LlamaModel._prepare_decoder_attention_mask = _prepare_decoder_attention_mask
137 |     LlamaAttention.forward = forward
138 | 
139 | def test():
140 |     from robohusky.train.llama_flash_attn_monkey_patch import forward as fastchat_forward
141 |     from transformers.models.llama.configuration_llama import LlamaConfig
142 | 
143 |     config = LlamaConfig(
144 |         hidden_size=1024,
145 |         intermediate_size=128,
146 |         num_hidden_layers=1,
147 |         num_attention_heads=8,
148 |         max_position_embeddings=16,
149 |     )
150 |     device = torch.device("cuda")
151 |     model = LlamaModel(config)
152 |     attn = LlamaAttention(config).to(device).half()
153 |     bsz, hs, seqlen = 2, config.hidden_size, config.max_position_embeddings
154 |     position_ids = torch.arange(seqlen, dtype=torch.long, device=device).view(
155 |         -1, seqlen
156 |     )
157 | 
158 |     mask = torch.full((bsz, seqlen), True, dtype=torch.bool, device=device)
159 |     for i in range(4):
160 |         hidden = torch.rand((bsz, seqlen, hs), dtype=torch.float16, device=device)
161 |         if i:
162 |             mask[0, -i:] = False
163 |             mask[1, :i] = False
164 | 
165 |         lmask = model._prepare_decoder_attention_mask(mask, hidden.shape[:2], hidden, 0)
166 |         ref, _, _ = attn.forward(
167 |             hidden, attention_mask=lmask, position_ids=position_ids
168 |         )
169 | 
170 |         fast, _, _ = fastchat_forward(
171 |             attn, hidden, attention_mask=mask, position_ids=position_ids
172 |         )
173 | 
174 |         lmask = _prepare_decoder_attention_mask(
175 |             model, mask, hidden.shape[:2], hidden, 0
176 |         )
177 |         test, _, _ = forward(
178 |             attn, hidden, attention_mask=lmask, position_ids=position_ids
179 |         )
180 | 
181 |         print(f"Mean(abs(ref)) = {torch.mean(torch.abs(ref))}")
182 |         print(f"Mean(abs(ref - fast)) = {torch.mean(torch.abs(ref - fast))}")
183 |         print(f"Mean(abs(ref - test)) = {torch.mean(torch.abs(ref - test))}")
184 |         print(f"Mean(abs(fast - test)) = {torch.mean(torch.abs(fast - test))}")
185 |         print(f"allclose(fast, test) = {torch.allclose(fast, test)}")
186 | 
187 |     with torch.no_grad():
188 |         # Also check that past_kv is handled properly
189 |         hidden = torch.rand((bsz, seqlen, hs), dtype=torch.float16, device=device)
190 |         part_len = seqlen // 4
191 |         assert part_len * 4 == seqlen
192 |         mask = torch.full((bsz, seqlen), True, dtype=torch.bool, device=device)
193 |         mask[0, -2:] = False
194 |         lmask = _prepare_decoder_attention_mask(
195 |             model, mask, hidden.shape[:2], hidden, 0
196 |         )
197 |         oneshot, _, _ = forward(
198 |             attn, hidden, attention_mask=lmask, position_ids=position_ids
199 |         )
200 |         parts = []
201 |         past_kv, past_kv_len = None, 0
202 |         for i in range(4):
203 |             start = part_len * i
204 |             end = start + part_len
205 |             hidden_part = hidden[:, start:end, ...]
206 |             lmask = _prepare_decoder_attention_mask(
207 |                 model,
208 |                 mask[:, start:end],
209 |                 hidden_part.shape[:2],
210 |                 hidden_part,
211 |                 past_kv_len,
212 |             )
213 |             part, _, past_kv = forward(
214 |                 attn,
215 |                 hidden_part.clone(),
216 |                 attention_mask=lmask,
217 |                 position_ids=position_ids[:, start:end],
218 |                 past_key_value=past_kv,
219 |                 use_cache=True,
220 |             )
221 |             parts.append(part)
222 |             past_kv_len = past_kv[0].shape[2]
223 | 
224 |         print(
225 |             f"allclose(oneshot[:, 0], parts[0]) = {torch.allclose(oneshot[:, :part_len], parts[0])}"
226 |         )
227 |         print(
228 |             f"allclose(oneshot, parts) = {torch.allclose(oneshot, torch.cat(parts, dim=1))}"
229 |         )
230 | 
231 | if __name__ == "__main__":
232 |     test()
233 | 


--------------------------------------------------------------------------------
/robohusky/train/llama_rmsnorm_monkey_patch.py:
--------------------------------------------------------------------------------
 1 | import transformers
 2 | 
 3 | def replace_llama_rmsnorm_with_fused_rmsnorm():
 4 |     try:
 5 |         from apex.normalization import FusedRMSNorm
 6 |         from functools import partial
 7 |         LlamaRMSNorm = partial(FusedRMSNorm, eps=1e-6)   # noqa
 8 |         transformers.models.llama.modeling_llama.LlamaRMSNorm = LlamaRMSNorm
 9 |         print("Discovered apex.normalization.FusedRMSNorm - will use it instead of LlamaRMSNorm")
10 |     except ImportError:
11 |         # using the normal LlamaRMSNorm
12 |         pass
13 |     except Exception:
14 |         print("discovered apex but it failed to load, falling back to LlamaRMSNorm")
15 |         pass
16 | 


--------------------------------------------------------------------------------
/robohusky/utils.py:
--------------------------------------------------------------------------------
  1 | from asyncio import AbstractEventLoop
  2 | import json
  3 | import logging
  4 | import logging.handlers
  5 | import os
  6 | import platform
  7 | import sys
  8 | from typing import AsyncGenerator, Generator
  9 | import warnings
 10 | 
 11 | import requests
 12 | import torch
 13 | 
 14 | from husky.constants import LOGDIR
 15 | 
 16 | handler = None
 17 | 
 18 | 
 19 | def build_logger(logger_name, logger_filename):
 20 |     global handler
 21 | 
 22 |     formatter = logging.Formatter(
 23 |         fmt="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
 24 |         datefmt="%Y-%m-%d %H:%M:%S",
 25 |     )
 26 | 
 27 |     # Set the format of root handlers
 28 |     if not logging.getLogger().handlers:
 29 |         if sys.version_info[1] >= 9:
 30 |             # This is for windows
 31 |             logging.basicConfig(level=logging.INFO, encoding="utf-8")
 32 |         else:
 33 |             if platform.system() == "Windows":
 34 |                 warnings.warn(
 35 |                     "If you are running on Windows, "
 36 |                     "we recommend you use Python >= 3.9 for UTF-8 encoding."
 37 |                 )
 38 |             logging.basicConfig(level=logging.INFO)
 39 |     logging.getLogger().handlers[0].setFormatter(formatter)
 40 | 
 41 |     # Redirect stdout and stderr to loggers
 42 |     stdout_logger = logging.getLogger("stdout")
 43 |     stdout_logger.setLevel(logging.INFO)
 44 |     sl = StreamToLogger(stdout_logger, logging.INFO)
 45 |     sys.stdout = sl
 46 | 
 47 |     stderr_logger = logging.getLogger("stderr")
 48 |     stderr_logger.setLevel(logging.ERROR)
 49 |     sl = StreamToLogger(stderr_logger, logging.ERROR)
 50 |     sys.stderr = sl
 51 | 
 52 |     # Get logger
 53 |     logger = logging.getLogger(logger_name)
 54 |     logger.setLevel(logging.INFO)
 55 | 
 56 |     # Add a file handler for all loggers
 57 |     if handler is None:
 58 |         os.makedirs(LOGDIR, exist_ok=True)
 59 |         filename = os.path.join(LOGDIR, logger_filename)
 60 |         handler = logging.handlers.TimedRotatingFileHandler(
 61 |             filename, when="D", utc=True, encoding="utf-8"
 62 |         )
 63 |         handler.setFormatter(formatter)
 64 | 
 65 |         for name, item in logging.root.manager.loggerDict.items():
 66 |             if isinstance(item, logging.Logger):
 67 |                 item.addHandler(handler)
 68 | 
 69 |     return logger
 70 | 
 71 | 
 72 | class StreamToLogger(object):
 73 |     """
 74 |     Fake file-like stream object that redirects writes to a logger instance.
 75 |     """
 76 | 
 77 |     def __init__(self, logger, log_level=logging.INFO):
 78 |         self.terminal = sys.stdout
 79 |         self.logger = logger
 80 |         self.log_level = log_level
 81 |         self.linebuf = ""
 82 | 
 83 |     def __getattr__(self, attr):
 84 |         return getattr(self.terminal, attr)
 85 | 
 86 |     def write(self, buf):
 87 |         temp_linebuf = self.linebuf + buf
 88 |         self.linebuf = ""
 89 |         for line in temp_linebuf.splitlines(True):
 90 |             # From the io.TextIOWrapper docs:
 91 |             #   On output, if newline is None, any '\n' characters written
 92 |             #   are translated to the system default line separator.
 93 |             # By default sys.stdout.write() expects '\n' newlines and then
 94 |             # translates them so this is still cross platform.
 95 |             if line[-1] == "\n":
 96 |                 encoded_message = line.encode("utf-8", "ignore").decode("utf-8")
 97 |                 self.logger.log(self.log_level, encoded_message.rstrip())
 98 |             else:
 99 |                 self.linebuf += line
100 | 
101 |     def flush(self):
102 |         if self.linebuf != "":
103 |             encoded_message = self.linebuf.encode("utf-8", "ignore").decode("utf-8")
104 |             self.logger.log(self.log_level, encoded_message.rstrip())
105 |         self.linebuf = ""
106 | 
107 | 
108 | def disable_torch_init():
109 |     """
110 |     Disable the redundant torch default initialization to accelerate model creation.
111 |     """
112 |     import torch
113 | 
114 |     setattr(torch.nn.Linear, "reset_parameters", lambda self: None)
115 |     setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None)
116 | 
117 | 
118 | def get_gpu_memory(max_gpus=None):
119 |     """Get available memory for each GPU."""
120 |     gpu_memory = []
121 |     num_gpus = (
122 |         torch.cuda.device_count()
123 |         if max_gpus is None
124 |         else min(max_gpus, torch.cuda.device_count())
125 |     )
126 | 
127 |     for gpu_id in range(num_gpus):
128 |         with torch.cuda.device(gpu_id):
129 |             device = torch.cuda.current_device()
130 |             gpu_properties = torch.cuda.get_device_properties(device)
131 |             total_memory = gpu_properties.total_memory / (1024 ** 3)
132 |             allocated_memory = torch.cuda.memory_allocated() / (1024 ** 3)
133 |             available_memory = total_memory - allocated_memory
134 |             gpu_memory.append(available_memory)
135 |     return gpu_memory
136 | 
137 | 
138 | def violates_moderation(text):
139 |     """
140 |     Check whether the text violates OpenAI moderation API.
141 |     """
142 |     url = "https://api.openai.com/v1/moderations"
143 |     headers = {
144 |         "Content-Type": "application/json",
145 |         "Authorization": "Bearer " + os.environ["OPENAI_API_KEY"],
146 |     }
147 |     text = text.replace("\n", "")
148 |     data = "{" + '"input": ' + f'"{text}"' + "}"
149 |     data = data.encode("utf-8")
150 |     try:
151 |         ret = requests.post(url, headers=headers, data=data, timeout=5)
152 |         flagged = ret.json()["results"][0]["flagged"]
153 |     except requests.exceptions.RequestException as e:
154 |         flagged = False
155 |     except KeyError as e:
156 |         flagged = False
157 | 
158 |     return flagged
159 | 
160 | 
161 | # Flan-t5 trained with HF+FSDP saves corrupted  weights for shared embeddings,
162 | # Use this function to make sure it can be correctly loaded.
163 | def clean_flant5_ckpt(ckpt_path):
164 |     index_file = os.path.join(ckpt_path, "pytorch_model.bin.index.json")
165 |     index_json = json.load(open(index_file, "r"))
166 | 
167 |     weightmap = index_json["weight_map"]
168 | 
169 |     share_weight_file = weightmap["shared.weight"]
170 |     share_weight = torch.load(os.path.join(ckpt_path, share_weight_file))[
171 |         "shared.weight"
172 |     ]
173 | 
174 |     for weight_name in ["decoder.embed_tokens.weight", "encoder.embed_tokens.weight"]:
175 |         weight_file = weightmap[weight_name]
176 |         weight = torch.load(os.path.join(ckpt_path, weight_file))
177 |         weight[weight_name] = share_weight
178 |         torch.save(weight, os.path.join(ckpt_path, weight_file))
179 | 
180 | 
181 | def pretty_print_semaphore(semaphore):
182 |     """Print a semaphore in better format."""
183 |     if semaphore is None:
184 |         return "None"
185 |     return f"Semaphore(value={semaphore._value}, locked={semaphore.locked()})"
186 | 
187 | 
188 | """A javascript function to get url parameters for the gradio web server."""
189 | get_window_url_params_js = """
190 | function() {
191 |     const params = new URLSearchParams(window.location.search);
192 |     url_params = Object.fromEntries(params);
193 |     console.log("url_params", url_params);
194 |     return url_params;
195 |     }
196 | """
197 | 
198 | 
199 | def iter_over_async(
200 |         async_gen: AsyncGenerator, event_loop: AbstractEventLoop
201 | ) -> Generator:
202 |     """
203 |     Convert async generator to sync generator
204 | 
205 |     :param async_gen: the AsyncGenerator to convert
206 |     :param event_loop: the event loop to run on
207 |     :returns: Sync generator
208 |     """
209 |     ait = async_gen.__aiter__()
210 | 
211 |     async def get_next():
212 |         try:
213 |             obj = await ait.__anext__()
214 |             return False, obj
215 |         except StopAsyncIteration:
216 |             return True, None
217 | 
218 |     while True:
219 |         done, obj = event_loop.run_until_complete(get_next())
220 |         if done:
221 |             break
222 |         yield obj
223 | 
224 | 
225 | def detect_language(text: str) -> str:
226 |     """Detect the langauge of a string."""
227 |     import polyglot  # pip3 install polyglot pyicu pycld2
228 |     from polyglot.detect import Detector
229 |     from polyglot.detect.base import logger as polyglot_logger
230 |     import pycld2
231 | 
232 |     polyglot_logger.setLevel("ERROR")
233 | 
234 |     try:
235 |         lang_code = Detector(text).language.name
236 |     except (pycld2.error, polyglot.detect.base.UnknownLanguage):
237 |         lang_code = "unknown"
238 |     return lang_code
239 | 


--------------------------------------------------------------------------------
/robohusky/video_transformers.py:
--------------------------------------------------------------------------------
  1 | import torchvision
  2 | import random
  3 | from PIL import Image, ImageOps
  4 | import numpy as np
  5 | import numbers
  6 | import math
  7 | import torch
  8 | 
  9 | class GroupRandomCrop(object):
 10 |     def __init__(self, size):
 11 |         if isinstance(size, numbers.Number):
 12 |             self.size = (int(size), int(size))
 13 |         else:
 14 |             self.size = size
 15 | 
 16 |     def __call__(self, img_group):
 17 | 
 18 |         w, h = img_group[0].size
 19 |         th, tw = self.size
 20 | 
 21 |         out_images = list()
 22 | 
 23 |         x1 = random.randint(0, w - tw)
 24 |         y1 = random.randint(0, h - th)
 25 | 
 26 |         for img in img_group:
 27 |             assert (img.size[0] == w and img.size[1] == h)
 28 |             if w == tw and h == th:
 29 |                 out_images.append(img)
 30 |             else:
 31 |                 out_images.append(img.crop((x1, y1, x1 + tw, y1 + th)))
 32 | 
 33 |         return out_images
 34 | 
 35 | class MultiGroupRandomCrop(object):
 36 |     def __init__(self, size, groups=1):
 37 |         if isinstance(size, numbers.Number):
 38 |             self.size = (int(size), int(size))
 39 |         else:
 40 |             self.size = size
 41 |         self.groups = groups
 42 | 
 43 |     def __call__(self, img_group):
 44 | 
 45 |         w, h = img_group[0].size
 46 |         th, tw = self.size
 47 | 
 48 |         out_images = list()
 49 | 
 50 |         for i in range(self.groups):
 51 |             x1 = random.randint(0, w - tw)
 52 |             y1 = random.randint(0, h - th)
 53 | 
 54 |             for img in img_group:
 55 |                 assert (img.size[0] == w and img.size[1] == h)
 56 |                 if w == tw and h == th:
 57 |                     out_images.append(img)
 58 |                 else:
 59 |                     out_images.append(img.crop((x1, y1, x1 + tw, y1 + th)))
 60 | 
 61 |         return out_images
 62 | 
 63 | class GroupCenterCrop(object):
 64 |     def __init__(self, size):
 65 |         self.worker = torchvision.transforms.CenterCrop(size)
 66 | 
 67 |     def __call__(self, img_group):
 68 |         return [self.worker(img) for img in img_group]
 69 | 
 70 | class GroupRandomHorizontalFlip(object):
 71 |     """Randomly horizontally flips the given PIL.Image with a probability of 0.5
 72 |     """
 73 | 
 74 |     def __init__(self, is_flow=False):
 75 |         self.is_flow = is_flow
 76 | 
 77 |     def __call__(self, img_group, is_flow=False):
 78 |         v = random.random()
 79 |         if v < 0.5:
 80 |             ret = [img.transpose(Image.FLIP_LEFT_RIGHT) for img in img_group]
 81 |             if self.is_flow:
 82 |                 for i in range(0, len(ret), 2):
 83 |                     # invert flow pixel values when flipping
 84 |                     ret[i] = ImageOps.invert(ret[i])
 85 |             return ret
 86 |         else:
 87 |             return img_group
 88 | 
 89 | class GroupNormalize(object):
 90 |     def __init__(self, mean, std):
 91 |         self.mean = mean
 92 |         self.std = std
 93 | 
 94 |     def __call__(self, tensor):
 95 |         rep_mean = self.mean * (tensor.size()[0] // len(self.mean))
 96 |         rep_std = self.std * (tensor.size()[0] // len(self.std))
 97 | 
 98 |         # TODO: make efficient
 99 |         for t, m, s in zip(tensor, rep_mean, rep_std):
100 |             t.sub_(m).div_(s)
101 | 
102 |         return tensor
103 | 
104 | class GroupScale(object):
105 |     """ Rescales the input PIL.Image to the given 'size'.
106 |     'size' will be the size of the smaller edge.
107 |     For example, if height > width, then image will be
108 |     rescaled to (size * height / width, size)
109 |     size: size of the smaller edge
110 |     interpolation: Default: PIL.Image.BILINEAR
111 |     """
112 | 
113 |     def __init__(self, size, interpolation=Image.BILINEAR):
114 |         self.worker = torchvision.transforms.Resize(size, interpolation)
115 | 
116 |     def __call__(self, img_group):
117 |         return [self.worker(img) for img in img_group]
118 | 
119 | class GroupOverSample(object):
120 |     def __init__(self, crop_size, scale_size=None, flip=True):
121 |         self.crop_size = crop_size if not isinstance(
122 |             crop_size, int) else (crop_size, crop_size)
123 | 
124 |         if scale_size is not None:
125 |             self.scale_worker = GroupScale(scale_size)
126 |         else:
127 |             self.scale_worker = None
128 |         self.flip = flip
129 | 
130 |     def __call__(self, img_group):
131 | 
132 |         if self.scale_worker is not None:
133 |             img_group = self.scale_worker(img_group)
134 | 
135 |         image_w, image_h = img_group[0].size
136 |         crop_w, crop_h = self.crop_size
137 | 
138 |         offsets = GroupMultiScaleCrop.fill_fix_offset(
139 |             False, image_w, image_h, crop_w, crop_h)
140 |         oversample_group = list()
141 |         for o_w, o_h in offsets:
142 |             normal_group = list()
143 |             flip_group = list()
144 |             for i, img in enumerate(img_group):
145 |                 crop = img.crop((o_w, o_h, o_w + crop_w, o_h + crop_h))
146 |                 normal_group.append(crop)
147 |                 flip_crop = crop.copy().transpose(Image.FLIP_LEFT_RIGHT)
148 | 
149 |                 if img.mode == 'L' and i % 2 == 0:
150 |                     flip_group.append(ImageOps.invert(flip_crop))
151 |                 else:
152 |                     flip_group.append(flip_crop)
153 | 
154 |             oversample_group.extend(normal_group)
155 |             if self.flip:
156 |                 oversample_group.extend(flip_group)
157 |         return oversample_group
158 | 
159 | class GroupFullResSample(object):
160 |     def __init__(self, crop_size, scale_size=None, flip=True):
161 |         self.crop_size = crop_size if not isinstance(
162 |             crop_size, int) else (crop_size, crop_size)
163 | 
164 |         if scale_size is not None:
165 |             self.scale_worker = GroupScale(scale_size)
166 |         else:
167 |             self.scale_worker = None
168 |         self.flip = flip
169 | 
170 |     def __call__(self, img_group):
171 | 
172 |         if self.scale_worker is not None:
173 |             img_group = self.scale_worker(img_group)
174 | 
175 |         image_w, image_h = img_group[0].size
176 |         crop_w, crop_h = self.crop_size
177 | 
178 |         w_step = (image_w - crop_w) // 4
179 |         h_step = (image_h - crop_h) // 4
180 | 
181 |         offsets = list()
182 |         offsets.append((0 * w_step, 2 * h_step))  # left
183 |         offsets.append((4 * w_step, 2 * h_step))  # right
184 |         offsets.append((2 * w_step, 2 * h_step))  # center
185 | 
186 |         oversample_group = list()
187 |         for o_w, o_h in offsets:
188 |             normal_group = list()
189 |             flip_group = list()
190 |             for i, img in enumerate(img_group):
191 |                 crop = img.crop((o_w, o_h, o_w + crop_w, o_h + crop_h))
192 |                 normal_group.append(crop)
193 |                 if self.flip:
194 |                     flip_crop = crop.copy().transpose(Image.FLIP_LEFT_RIGHT)
195 | 
196 |                     if img.mode == 'L' and i % 2 == 0:
197 |                         flip_group.append(ImageOps.invert(flip_crop))
198 |                     else:
199 |                         flip_group.append(flip_crop)
200 | 
201 |             oversample_group.extend(normal_group)
202 |             oversample_group.extend(flip_group)
203 |         return oversample_group
204 | 
205 | class GroupMultiScaleCrop(object):
206 | 
207 |     def __init__(self, input_size, scales=None, max_distort=1,
208 |                  fix_crop=True, more_fix_crop=True):
209 |         self.scales = scales if scales is not None else [1, .875, .75, .66]
210 |         self.max_distort = max_distort
211 |         self.fix_crop = fix_crop
212 |         self.more_fix_crop = more_fix_crop
213 |         self.input_size = input_size if not isinstance(input_size, int) else [
214 |             input_size, input_size]
215 |         self.interpolation = Image.BILINEAR
216 | 
217 |     def __call__(self, img_group):
218 | 
219 |         im_size = img_group[0].size
220 | 
221 |         crop_w, crop_h, offset_w, offset_h = self._sample_crop_size(im_size)
222 |         crop_img_group = [
223 |             img.crop(
224 |                 (offset_w,
225 |                  offset_h,
226 |                  offset_w +
227 |                  crop_w,
228 |                  offset_h +
229 |                  crop_h)) for img in img_group]
230 |         ret_img_group = [img.resize((self.input_size[0], self.input_size[1]), self.interpolation)
231 |                          for img in crop_img_group]
232 |         return ret_img_group
233 | 
234 |     def _sample_crop_size(self, im_size):
235 |         image_w, image_h = im_size[0], im_size[1]
236 | 
237 |         # find a crop size
238 |         base_size = min(image_w, image_h)
239 |         crop_sizes = [int(base_size * x) for x in self.scales]
240 |         crop_h = [
241 |             self.input_size[1] if abs(
242 |                 x - self.input_size[1]) < 3 else x for x in crop_sizes]
243 |         crop_w = [
244 |             self.input_size[0] if abs(
245 |                 x - self.input_size[0]) < 3 else x for x in crop_sizes]
246 | 
247 |         pairs = []
248 |         for i, h in enumerate(crop_h):
249 |             for j, w in enumerate(crop_w):
250 |                 if abs(i - j) <= self.max_distort:
251 |                     pairs.append((w, h))
252 | 
253 |         crop_pair = random.choice(pairs)
254 |         if not self.fix_crop:
255 |             w_offset = random.randint(0, image_w - crop_pair[0])
256 |             h_offset = random.randint(0, image_h - crop_pair[1])
257 |         else:
258 |             w_offset, h_offset = self._sample_fix_offset(
259 |                 image_w, image_h, crop_pair[0], crop_pair[1])
260 | 
261 |         return crop_pair[0], crop_pair[1], w_offset, h_offset
262 | 
263 |     def _sample_fix_offset(self, image_w, image_h, crop_w, crop_h):
264 |         offsets = self.fill_fix_offset(
265 |             self.more_fix_crop, image_w, image_h, crop_w, crop_h)
266 |         return random.choice(offsets)
267 | 
268 |     @staticmethod
269 |     def fill_fix_offset(more_fix_crop, image_w, image_h, crop_w, crop_h):
270 |         w_step = (image_w - crop_w) // 4
271 |         h_step = (image_h - crop_h) // 4
272 | 
273 |         ret = list()
274 |         ret.append((0, 0))  # upper left
275 |         ret.append((4 * w_step, 0))  # upper right
276 |         ret.append((0, 4 * h_step))  # lower left
277 |         ret.append((4 * w_step, 4 * h_step))  # lower right
278 |         ret.append((2 * w_step, 2 * h_step))  # center
279 | 
280 |         if more_fix_crop:
281 |             ret.append((0, 2 * h_step))  # center left
282 |             ret.append((4 * w_step, 2 * h_step))  # center right
283 |             ret.append((2 * w_step, 4 * h_step))  # lower center
284 |             ret.append((2 * w_step, 0 * h_step))  # upper center
285 | 
286 |             ret.append((1 * w_step, 1 * h_step))  # upper left quarter
287 |             ret.append((3 * w_step, 1 * h_step))  # upper right quarter
288 |             ret.append((1 * w_step, 3 * h_step))  # lower left quarter
289 |             ret.append((3 * w_step, 3 * h_step))  # lower righ quarter
290 | 
291 |         return ret
292 | 
293 | class GroupRandomSizedCrop(object):
294 |     """Random crop the given PIL.Image to a random size of (0.08 to 1.0) of the original size
295 |     and and a random aspect ratio of 3/4 to 4/3 of the original aspect ratio
296 |     This is popularly used to train the Inception networks
297 |     size: size of the smaller edge
298 |     interpolation: Default: PIL.Image.BILINEAR
299 |     """
300 | 
301 |     def __init__(self, size, interpolation=Image.BILINEAR):
302 |         self.size = size
303 |         self.interpolation = interpolation
304 | 
305 |     def __call__(self, img_group):
306 |         for attempt in range(10):
307 |             area = img_group[0].size[0] * img_group[0].size[1]
308 |             target_area = random.uniform(0.08, 1.0) * area
309 |             aspect_ratio = random.uniform(3. / 4, 4. / 3)
310 | 
311 |             w = int(round(math.sqrt(target_area * aspect_ratio)))
312 |             h = int(round(math.sqrt(target_area / aspect_ratio)))
313 | 
314 |             if random.random() < 0.5:
315 |                 w, h = h, w
316 | 
317 |             if w <= img_group[0].size[0] and h <= img_group[0].size[1]:
318 |                 x1 = random.randint(0, img_group[0].size[0] - w)
319 |                 y1 = random.randint(0, img_group[0].size[1] - h)
320 |                 found = True
321 |                 break
322 |         else:
323 |             found = False
324 |             x1 = 0
325 |             y1 = 0
326 | 
327 |         if found:
328 |             out_group = list()
329 |             for img in img_group:
330 |                 img = img.crop((x1, y1, x1 + w, y1 + h))
331 |                 assert (img.size == (w, h))
332 |                 out_group.append(
333 |                     img.resize(
334 |                         (self.size, self.size), self.interpolation))
335 |             return out_group
336 |         else:
337 |             # Fallback
338 |             scale = GroupScale(self.size, interpolation=self.interpolation)
339 |             crop = GroupRandomCrop(self.size)
340 |             return crop(scale(img_group))
341 | 
342 | class ConvertDataFormat(object):
343 |     def __init__(self, model_type):
344 |         self.model_type = model_type
345 | 
346 |     def __call__(self, images):
347 |         if self.model_type == '2D':
348 |             return images
349 |         tc, h, w = images.size()
350 |         t = tc // 3
351 |         images = images.view(t, 3, h, w)
352 |         images = images.permute(1, 0, 2, 3)
353 |         return images
354 | 
355 | class Stack(object):
356 | 
357 |     def __init__(self, roll=False):
358 |         self.roll = roll
359 | 
360 |     def __call__(self, img_group):
361 |         if img_group[0].mode == 'L':
362 |             return np.concatenate([np.expand_dims(x, 2)
363 |                                    for x in img_group], axis=2)
364 |         elif img_group[0].mode == 'RGB':
365 |             if self.roll:
366 |                 return np.concatenate([np.array(x)[:, :, ::-1]
367 |                                        for x in img_group], axis=2)
368 |             else:
369 |                 # print(np.concatenate(img_group, axis=2).shape)
370 |                 # print(img_group[0].shape)
371 |                 return np.concatenate(img_group, axis=2)
372 | 
373 | class ToTorchFormatTensor(object):
374 |     """ Converts a PIL.Image (RGB) or numpy.ndarray (H x W x C) in the range [0, 255]
375 |     to a torch.FloatTensor of shape (C x H x W) in the range [0.0, 1.0] """
376 | 
377 |     def __init__(self, div=True):
378 |         self.div = div
379 | 
380 |     def __call__(self, pic):
381 |         if isinstance(pic, np.ndarray):
382 |             # handle numpy array
383 |             img = torch.from_numpy(pic).permute(2, 0, 1).contiguous()
384 |         else:
385 |             # handle PIL Image
386 |             img = torch.ByteTensor(
387 |                 torch.ByteStorage.from_buffer(
388 |                     pic.tobytes()))
389 |             img = img.view(pic.size[1], pic.size[0], len(pic.mode))
390 |             # put it from HWC to CHW format
391 |             # yikes, this transpose takes 80% of the loading time/CPU
392 |             img = img.transpose(0, 1).transpose(0, 2).contiguous()
393 |         return img.float().div(255) if self.div else img.float()
394 | 
395 | class IdentityTransform(object):
396 | 
397 |     def __call__(self, data):
398 |         return data
399 | 
400 | def get_index(num_frames, num_segments):
401 |     seg_size = float(num_frames - 1) / num_segments
402 |     start = int(seg_size / 2)
403 |     offsets = np.array([
404 |         start + int(np.round(seg_size * idx)) for idx in range(num_segments)
405 |     ])
406 |     return offsets
407 | 


--------------------------------------------------------------------------------
/train_files/example_train_file.json:
--------------------------------------------------------------------------------
 1 | [
 2 |   {
 3 |     "data_name": "general_video_703k",
 4 |     "text_file": "./datasets/mm/general_video_703K/chat_general_video_703k.jsonl",
 5 |     "image_path": "your personal path",
 6 |     "data_type": "video",
 7 |     "size": 702971
 8 |   },
 9 |   {
10 |     "data_name": "ego_4d",
11 |     "text_file": "./datasets/mm/egocot/chat_ego4d_cot.jsonl",
12 |     "image_path": "your personal path",
13 |     "data_type": "numpy",
14 |     "size": 317528
15 |   }
16 | ]


--------------------------------------------------------------------------------
/zero_stage0_config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "zero_optimization": {
 3 |     "stage": 1,
 4 |     "allgather_partitions": true,
 5 |     "allgather_bucket_size": 2e8,
 6 |     "overlap_comm": true,
 7 |     "reduce_scatter": true,
 8 |     "reduce_bucket_size": "auto",
 9 |     "contiguous_gradients": true
10 |   },
11 |   "fp16": {
12 |     "enabled": "auto",
13 |     "auto_cast": true,
14 |     "loss_scale": 0,
15 |     "initial_scale_power": 32,
16 |     "loss_scale_window": 1000,
17 |     "hysteresis": 2,
18 |     "min_loss_scale": 0.01
19 |   },
20 |   "bf16": {
21 |     "enabled": "auto"
22 |   },
23 |   "scheduler": {
24 |     "type": "WarmupDecayLR",
25 |     "params": {
26 |       "warmup_min_lr": "auto",
27 |       "warmup_max_lr": "auto",
28 |       "warmup_num_steps": "auto",
29 |       "total_num_steps": "auto"
30 |     }
31 |   },
32 |   "optimizer": {
33 |     "type": "AdamW",
34 |     "params": {
35 |       "lr": "auto",
36 |       "betas": [
37 |         0.9,
38 |         0.999
39 |       ],
40 |       "eps": 1e-8,
41 |       "weight_decay": "auto"
42 |     }
43 |   },
44 |   "gradient_accumulation_steps": "auto",
45 |   "gradient_clipping": "auto",
46 |   "steps_per_print": 2000,
47 |   "train_batch_size": "auto",
48 |   "train_micro_batch_size_per_gpu": "auto",
49 |   "wall_clock_breakdown": false
50 | }
51 | 


--------------------------------------------------------------------------------
/zero_stage1_config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "zero_optimization": {
 3 |     "stage": 1,
 4 |     "allgather_partitions": true,
 5 |     "allgather_bucket_size": 2e8,
 6 |     "overlap_comm": true,
 7 |     "reduce_scatter": true,
 8 |     "reduce_bucket_size": "auto",
 9 |     "contiguous_gradients": true
10 |   },
11 |   "fp16": {
12 |     "enabled": "auto",
13 |     "auto_cast": true,
14 |     "loss_scale": 0,
15 |     "initial_scale_power": 32,
16 |     "loss_scale_window": 1000,
17 |     "hysteresis": 2,
18 |     "min_loss_scale": 1
19 |   },
20 |   "bf16": {
21 |     "enabled": "auto"
22 |   },
23 |   "scheduler": {
24 |     "type": "WarmupDecayLR",
25 |     "params": {
26 |       "warmup_min_lr": "auto",
27 |       "warmup_max_lr": "auto",
28 |       "warmup_num_steps": "auto",
29 |       "total_num_steps": "auto"
30 |     }
31 |   },
32 |   "optimizer": {
33 |     "type": "AdamW",
34 |     "params": {
35 |       "lr": "auto",
36 |       "betas": [
37 |         0.9,
38 |         0.999
39 |       ],
40 |       "eps": 1e-8,
41 |       "weight_decay": "auto"
42 |     }
43 |   },
44 |   "gradient_accumulation_steps": "auto",
45 |   "gradient_clipping": "auto",
46 |   "steps_per_print": 2000,
47 |   "train_batch_size": "auto",
48 |   "train_micro_batch_size_per_gpu": "auto",
49 |   "wall_clock_breakdown": false
50 | }


--------------------------------------------------------------------------------
/zero_stage2_config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "zero_optimization": {
 3 |     "stage": 2,
 4 |     "allgather_partitions": true,
 5 |     "allgather_bucket_size": 2e8,
 6 |     "overlap_comm": true,
 7 |     "reduce_scatter": true,
 8 |     "reduce_bucket_size": "auto",
 9 |     "contiguous_gradients": true
10 |   },
11 |   "fp16": {
12 |     "enabled": "auto",
13 |     "auto_cast": true,
14 |     "loss_scale": 0,
15 |     "initial_scale_power": 32,
16 |     "loss_scale_window": 1000,
17 |     "hysteresis": 2,
18 |     "min_loss_scale": 1
19 |   },
20 |   "bf16": {
21 |     "enabled": "auto"
22 |   },
23 |   "scheduler": {
24 |     "type": "WarmupDecayLR",
25 |     "params": {
26 |       "warmup_min_lr": "auto",
27 |       "warmup_max_lr": "auto",
28 |       "warmup_num_steps": "auto",
29 |       "total_num_steps": "auto"
30 |     }
31 |   },
32 |   "optimizer": {
33 |     "type": "AdamW",
34 |     "params": {
35 |       "lr": "auto",
36 |       "betas": [
37 |         0.9,
38 |         0.999
39 |       ],
40 |       "eps": 1e-8,
41 |       "weight_decay": "auto"
42 |     }
43 |   },
44 |   "gradient_accumulation_steps": "auto",
45 |   "gradient_clipping": "auto",
46 |   "steps_per_print": 2000,
47 |   "train_batch_size": "auto",
48 |   "train_micro_batch_size_per_gpu": "auto",
49 |   "wall_clock_breakdown": false
50 | }


--------------------------------------------------------------------------------
/zero_stage3_config.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "fp16": {
 3 |     "enabled": "auto",
 4 |     "loss_scale": 0,
 5 |     "loss_scale_window": 1000,
 6 |     "initial_scale_power": 16,
 7 |     "hysteresis": 2,
 8 |     "min_loss_scale": 1
 9 |   },
10 |   "bf16": {
11 |     "enabled": "auto"
12 |   },
13 |   "optimizer": {
14 |     "type": "AdamW",
15 |     "params": {
16 |       "lr": "auto",
17 |       "betas": "auto",
18 |       "eps": "auto",
19 |       "weight_decay": "auto"
20 |     }
21 |   },
22 |   "scheduler": {
23 |     "type": "WarmupLR",
24 |     "params": {
25 |       "warmup_min_lr": "auto",
26 |       "warmup_max_lr": "auto",
27 |       "warmup_num_steps": "auto"
28 |     }
29 |   },
30 |   "zero_optimization": {
31 |     "stage": 3,
32 |     "overlap_comm": true,
33 |     "contiguous_gradients": true,
34 |     "sub_group_size": 1e9,
35 |     "reduce_bucket_size": "auto",
36 |     "stage3_prefetch_bucket_size": "auto",
37 |     "stage3_param_persistence_threshold": "auto",
38 |     "stage3_max_live_parameters": 1e9,
39 |     "stage3_max_reuse_distance": 1e9,
40 |     "stage3_gather_16bit_weights_on_model_save": true
41 |   },
42 |   "gradient_accumulation_steps": "auto",
43 |   "gradient_clipping": "auto",
44 |   "steps_per_print": 2000,
45 |   "train_batch_size": "auto",
46 |   "train_micro_batch_size_per_gpu": "auto",
47 |   "wall_clock_breakdown": false
48 | }


--------------------------------------------------------------------------------