├── models
│   ├── __init__.py
│   ├── tarsier
│   │   ├── utils.py
│   │   ├── processor.py
│   │   └── modeling_tarsier.py
│   ├── modeling_captioners.py
│   ├── modeling_encoders.py
│   └── modeling_basemodels.py
├── utils
│   ├── __init__.py
│   ├── model.py
│   ├── video.py
│   ├── gpt_api.py
│   └── dream_gpt.py
├── dataset
│   ├── __init__.py
│   ├── utils.py
│   └── dataset.py
├── assets
│   ├── demo.mp4
│   ├── logo.png
│   ├── carebench.png
│   ├── care_model.png
│   ├── comparison.png
│   └── performance.png
├── .gitignore
├── requirements.txt
├── scripts
│   ├── retrieval.sh
│   ├── captioning.sh
│   └── train.sh
├── data.config
├── ds.config
├── README.md
└── tasks
    ├── retrieval.py
    ├── captioning.py
    └── finetuning.py

/models/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/utils/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/dataset/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/assets/demo.mp4:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MCG-NJU/CaReBench/HEAD/assets/demo.mp4
--------------------------------------------------------------------------------
/assets/logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MCG-NJU/CaReBench/HEAD/assets/logo.png
--------------------------------------------------------------------------------
/assets/carebench.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MCG-NJU/CaReBench/HEAD/assets/carebench.png
--------------------------------------------------------------------------------
/assets/care_model.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MCG-NJU/CaReBench/HEAD/assets/care_model.png
--------------------------------------------------------------------------------
/assets/comparison.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MCG-NJU/CaReBench/HEAD/assets/comparison.png
--------------------------------------------------------------------------------
/assets/performance.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/MCG-NJU/CaReBench/HEAD/assets/performance.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
checkpoints*/
data/
wandb/
thirdparty/
experiments/
notebooks/
__pycache__/

checkpoints
*.pth
--------------------------------------------------------------------------------
/dataset/utils.py:
--------------------------------------------------------------------------------
import json


def load_dataset_config(config_path, dataset_name):
    # Read the JSON config (e.g. data.config) and return the entry
    # for the requested dataset.
    with open(config_path) as f:
        data_config = json.load(f)[dataset_name]
    return data_config
--------------------------------------------------------------------------------
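A minimal usage sketch for load_dataset_config, assuming the data.config file
shown later in this dump sits in the working directory (the dataset name and
fields mirror that file; this snippet is illustrative, not code from the repo):

    from dataset.utils import load_dataset_config

    # Fetch the 'didemo' entry from data.config.
    cfg = load_dataset_config('data.config', 'didemo')
    print(cfg['anno_path'], cfg['data_root'])
    # didemo additionally carries retrieval-specific flags:
    print(cfg.get('apply_paragraph_retrieval'), cfg.get('trim30'))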
/requirements.txt:
--------------------------------------------------------------------------------
transformers==4.45.0
accelerate==0.34.2
datasets==3.2.0
decord==0.6.0
deepspeed==0.15.2
Pillow==10.4.0
fire==0.6.0
wandb==0.17.6
easydict==1.13
pathos==0.3.4
func_timeout==4.3.5
openai==1.96.1
--------------------------------------------------------------------------------
/scripts/retrieval.sh:
--------------------------------------------------------------------------------
#!/bin/bash

MODEL_PATH="checkpoints-release/InternVL2-8B-RA"
DATA=didemo

accelerate launch \
    --num_machines=1 \
    --num_processes 8 \
    --machine_rank 0 \
    tasks/retrieval.py \
    --model_path $MODEL_PATH \
    --num_frames 32 \
    --data $DATA
--------------------------------------------------------------------------------
/data.config:
--------------------------------------------------------------------------------
{
    "msrvtt": {
        "anno_path": "/path/to/anno/json",
        "data_root": "/path/to/data/root"
    },
    "msvd": {
        "anno_path": "/path/to/anno/json",
        "data_root": "/path/to/data/root"
    },
    "didemo": {
        "anno_path": "/path/to/anno/json",
        "data_root": "/path/to/data/root",
        "apply_paragraph_retrieval": true,
        "trim30": true
    },
    "carebench": {
        "anno_path": "/path/to/anno/json",
        "data_root": "/path/to/data/root"
    }
}
--------------------------------------------------------------------------------
/scripts/captioning.sh:
--------------------------------------------------------------------------------
#!/bin/bash

MODEL_PATH="path/to/model"
SAVE_DIR="path/to/save/eval/results"
DATA=carebench

accelerate launch \
    --num_machines=1 \
    --num_processes 8 \
    --machine_rank 0 \
    tasks/captioning.py \
    --config_path data.config \
    --dataset_name $DATA \
    --model_path $MODEL_PATH \
    --save_dir $SAVE_DIR \
    --num_frames 32 \
    --api_endpoint "https://api.deepseek.com/v1" \
    --api_key "your-api-key" \
    --api_model "deepseek-chat" \
    --api_num_worker 64 \
    --evaluate
--------------------------------------------------------------------------------
/ds.config:
--------------------------------------------------------------------------------
{
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_accumulation_steps": "auto",
    "gradient_clipping": "auto",
    "zero_allow_untested_optimizer": true,
    "fp16": {
        "enabled": "auto",
        "loss_scale": 0,
        "initial_scale_power": 16,
        "loss_scale_window": 1000,
        "hysteresis": 2,
        "min_loss_scale": 1
    },
    "zero_optimization": {
        "stage": 2,
        "allgather_partitions": true,
        "allgather_bucket_size": 2e8,
        "overlap_comm": true,
        "reduce_scatter": true,
        "reduce_bucket_size": 2e8,
        "contiguous_gradients": true
    }
}
--------------------------------------------------------------------------------
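The "auto" fields in ds.config are placeholders that HF Trainer-style
DeepSpeed integrations typically resolve at launch time from the training
arguments; assuming the standard DeepSpeed invariant train_batch_size =
micro_batch_per_gpu * gradient_accumulation_steps * world_size, the values
in scripts/train.sh below imply 3 accumulation steps. An illustrative check
(plain arithmetic, not code from this repo):

    # Values taken from scripts/train.sh below.
    batch_size = 768        # --batch_size (global)
    micro_batch_size = 32   # --micro_batch_size (per GPU)
    world_size = 8 * 1      # GPUS * NUM_NODES

    grad_accum_steps = batch_size // (micro_batch_size * world_size)
    assert micro_batch_size * grad_accum_steps * world_size == batch_size
    print(grad_accum_steps)  # -> 3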
/scripts/train.sh:
--------------------------------------------------------------------------------
#!/bin/bash

OUTPUT_DIR="checkpoints/e5v-qwen2vl-7b-mix-recap-2ksteps-nli-lr-2e-5-mbs-32-bs-768-llm"
RUN_NAME=`basename $OUTPUT_DIR`

args=()

BASE_MODEL="checkpoints/Qwen2-VL-7B-Mix-Recap-2ksteps"
BATCH_SIZE=768
MICRO_BATCH_SIZE=32
EPOCH=2
LR=2e-5
WARMUP_RATIO=0.1
CUTOFF_LEN=32
GPUS=8
NUM_NODES=1

echo $BASE_MODEL
echo $MICRO_BATCH_SIZE $BATCH_SIZE
wandb online

deepspeed --num_gpus=$GPUS --num_nodes=$NUM_NODES tasks/finetuning.py \
    --model_name_or_path $BASE_MODEL \
    --data_path 'data/nli_for_simcse.csv' \
    --batch_size $BATCH_SIZE \
    --micro_batch_size $MICRO_BATCH_SIZE \
    --num_epochs $EPOCH \
    --warmup_ratio $WARMUP_RATIO \
    --learning_rate $LR \
    --cutoff_len $CUTOFF_LEN \
    --output_dir $OUTPUT_DIR \
    --run_name $RUN_NAME \
    --use_neg_sentence --save_steps 1000 \
    --deepspeed ds.config \
    --bf16 \
    --logging_steps 1 --grad_checkpoint
--------------------------------------------------------------------------------
/utils/model.py:
--------------------------------------------------------------------------------
import re
import json
import torch
import torchvision.transforms as T

from typing import Dict, List
import os


def load_architectures_from_config(config_path: str) -> str:
    if not os.path.exists(config_path):
        raise ValueError(f"{config_path} doesn't exist.")
    # Load the architectures list from config.json; exactly one
    # architecture name is expected, and that single name is returned.
    with open(config_path, 'r') as f:
        config = json.load(f)
    architectures = config.get('architectures', None)
    if architectures is None:
        raise ValueError(f"Architectures not found in {config_path}.")
    if len(architectures) != 1:
        raise ValueError(f"Architectures should have only one element, got {len(architectures)}.")
    model_arch = architectures[0]
    return model_arch

def transform_pixel_values(pixel_values: torch.Tensor | List[torch.Tensor]) -> torch.Tensor:
    # NOTE: this function doesn't accept unbatched inputs;
    # pixel_values should be uint8 of shape (B, T, C, H, W).
    if isinstance(pixel_values, list):
        pixel_values = torch.stack(pixel_values)

    if pixel_values.ndim == 4:
        # pixel_values is (B, C, H, W): treat each image as a
        # single-frame video, (B, C, H, W) -> (B, 1, C, H, W).
        pixel_values = pixel_values.unsqueeze(1)
    elif pixel_values.ndim == 5:
        # pixel_values is already (B, T, C, H, W); nothing to do.
        pass
    else:
        raise ValueError(f"pixel_values should be 4D or 5D, got {pixel_values.ndim}D")
    return pixel_values

EOL_PROMPTS = {
    'text': '\nSummary above sentence in one word:',
    'image': '\nSummary above image in one word:',
    'video': '