├── src
    ├── __init__.py
    ├── serve
    │   ├── __init__.py
    │   └── app.py
    ├── train
    │   ├── __init__.py
    │   ├── reward_funcs.py
    │   ├── train_utils.py
    │   ├── monkey_patch_vision.py
    │   ├── train_grpo.py
    │   ├── train_cls.py
    │   ├── train_sft.py
    │   └── train_dpo.py
    ├── loss
    │   ├── __init__.py
    │   ├── loss_factory.py
    │   ├── focal_loss.py
    │   └── class_balance_loss.py
    ├── model
    │   └── __init__.py
    ├── trainer
    │   ├── __init__.py
    │   ├── cls_trainer.py
    │   └── dpo_trainer.py
    ├── dataset
    │   ├── __init__.py
    │   ├── data_utils.py
    │   ├── grpo_dataset.py
    │   ├── cls_dataset.py
    │   ├── dpo_dataset.py
    │   └── sft_dataset.py
    ├── constants.py
    ├── merge_lora_weights.py
    ├── utils.py
    └── params.py
├── scripts
    ├── merge_lora.sh
    ├── zero2.json
    ├── zero2_offload.json
    ├── zero3.json
    ├── zero3_offload.json
    ├── finetune_grpo.sh
    ├── finetune_cls.sh
    ├── finetune_dpo.sh
    ├── finetune.sh
    ├── finetune_video.sh
    ├── finetune_lora.sh
    └── finetune_lora_vision.sh
├── requirements.txt
├── .gitignore
├── environment.yaml
└── LICENSE


/src/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/src/serve/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/src/train/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/src/loss/__init__.py:
--------------------------------------------------------------------------------
1 | from .loss_factory import get_loss_function
2 | 
3 | __all__= [
4 |     "get_loss_function",
5 | ]


--------------------------------------------------------------------------------
/src/model/__init__.py:
--------------------------------------------------------------------------------
1 | from .modeling_cls import Qwen2VLForSequenceClassification, Qwen2_5_VLForSequenceClassification
2 | 
3 | __all__ = [
4 |     "Qwen2VLForSequenceClassification",
5 |     "Qwen2_5_VLForSequenceClassification",
6 | ]


--------------------------------------------------------------------------------
/src/trainer/__init__.py:
--------------------------------------------------------------------------------
1 | from .dpo_trainer import QwenDPOTrainer
2 | from .sft_trainer import QwenSFTTrainer, GenerativeEvalPrediction
3 | from .grpo_trainer import QwenGRPOTrainer
4 | from .cls_trainer import QwenCLSTrainer
5 | 
6 | __all__ = ["QwenSFTTrainer", "QwenDPOTrainer", "QwenGRPOTrainer", "QwenCLSTrainer", "GenerativeEvalPrediction"]


--------------------------------------------------------------------------------
/src/dataset/__init__.py:
--------------------------------------------------------------------------------
 1 | from .dpo_dataset import make_dpo_data_module
 2 | from .sft_dataset import make_supervised_data_module
 3 | from .grpo_dataset import make_grpo_data_module
 4 | from .cls_dataset import make_classification_data_module
 5 | 
 6 | __all__ =[
 7 |     "make_dpo_data_module",
 8 |     "make_supervised_data_module",
 9 |     "make_grpo_data_module",
10 |     "make_classification_data_module",
11 | ]


--------------------------------------------------------------------------------
/scripts/merge_lora.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
 4 | # MODEL_NAME="Qwen/Qwen2-VL-2B-Instruct"
 5 | 
 6 | export PYTHONPATH=src:$PYTHONPATH
 7 | 
 8 | python src/merge_lora_weights.py \
 9 |     --model-path /home/workspace/Qwen2-VL-Finetune/output/testing_lora \
10 |     --model-base $MODEL_NAME  \
11 |     --save-model-path /home/workspace/Qwen2-VL-Finetune/output/merge_test \
12 |     --safe-serialization


--------------------------------------------------------------------------------
/src/constants.py:
--------------------------------------------------------------------------------
 1 | IGNORE_INDEX = -100
 2 | 
 3 | DEFAULT_IM_START_TOKEN = "<|im_start|>"
 4 | DEFAULT_IM_END_TOKEN = "<|im_end|>"
 5 | DEFAULT_IMAGE_TOKEN = "<|image_pad|>"
 6 | DEFAULT_VIDEO_TOKEN = "<|video_pad|>"
 7 | LLAVA_IMAGE_TOKEN = "<image>"
 8 | LLAVA_VIDEO_TOKEN = "<video>"
 9 | VISION_START_TOKEN = "<|vision_start|>"
10 | VISION_END_TOKEN = "<|vision_end|>"
11 | 
12 | SYSTEM_MESSAGE = "You are a helpful assistant."
13 | 
14 | MULTIMODAL_KEYWORDS = ["pixel_values", "image_grid_thw", "video_grid_thw", "pixel_values_videos", "second_per_grid_ts"]


--------------------------------------------------------------------------------
/scripts/zero2.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "fp16": {
 3 |     "enabled": "auto",
 4 |     "loss_scale": 0,
 5 |     "loss_scale_window": 1000,
 6 |     "initial_scale_power": 16,
 7 |     "hysteresis": 2,
 8 |     "min_loss_scale": 1
 9 |   },
10 |   "bf16": {
11 |     "enabled": "auto"
12 |   },
13 |   "train_micro_batch_size_per_gpu": "auto",
14 |   "train_batch_size": "auto",
15 |   "gradient_accumulation_steps": "auto",
16 |   "zero_optimization": {
17 |     "stage": 2,
18 |     "overlap_comm": true,
19 |     "contiguous_gradients": true,
20 |     "sub_group_size": 1e9,
21 |     "reduce_bucket_size": "auto"
22 |   }
23 | }
24 | 


--------------------------------------------------------------------------------
/scripts/zero2_offload.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "fp16": {
 3 |     "enabled": "auto",
 4 |     "loss_scale": 0,
 5 |     "loss_scale_window": 1000,
 6 |     "initial_scale_power": 16,
 7 |     "hysteresis": 2,
 8 |     "min_loss_scale": 1
 9 |   },
10 |   "bf16": {
11 |     "enabled": "auto"
12 |   },
13 |   "train_micro_batch_size_per_gpu": "auto",
14 |   "train_batch_size": "auto",
15 |   "gradient_accumulation_steps": "auto",
16 |   "zero_optimization": {
17 |     "stage": 2,
18 |     "offload_optimizer": {
19 |       "device": "cpu",
20 |       "pin_memory": true
21 |     },
22 |     "offload_param": {
23 |       "device": "cpu",
24 |       "pin_memory": true
25 |     },
26 |     "overlap_comm": true,
27 |     "contiguous_gradients": true,
28 |     "sub_group_size": 1e9,
29 |     "reduce_bucket_size": "auto"
30 |   }
31 | }
32 | 


--------------------------------------------------------------------------------
/scripts/zero3.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "fp16": {
 3 |     "enabled": "auto",
 4 |     "loss_scale": 0,
 5 |     "loss_scale_window": 1000,
 6 |     "initial_scale_power": 16,
 7 |     "hysteresis": 2,
 8 |     "min_loss_scale": 1
 9 |   },
10 |   "bf16": {
11 |     "enabled": "auto"
12 |   },
13 |   "train_micro_batch_size_per_gpu": "auto",
14 |   "train_batch_size": "auto",
15 |   "gradient_accumulation_steps": "auto",
16 |   "zero_optimization": {
17 |     "stage": 3,
18 |     "overlap_comm": true,
19 |     "contiguous_gradients": true,
20 |     "sub_group_size": 1e9,
21 |     "reduce_bucket_size": "auto",
22 |     "stage3_prefetch_bucket_size": "auto",
23 |     "stage3_param_persistence_threshold": "auto",
24 |     "stage3_max_live_parameters": 1e9,
25 |     "stage3_max_reuse_distance": 1e9,
26 |     "stage3_gather_16bit_weights_on_model_save": true
27 |   }
28 | }
29 | 


--------------------------------------------------------------------------------
/src/merge_lora_weights.py:
--------------------------------------------------------------------------------
 1 | import argparse
 2 | from utils import get_model_name_from_path, load_pretrained_model
 3 | 
 4 | def merge_lora(args):
 5 |     model_name = get_model_name_from_path(args.model_path)
 6 |     processor, model = load_pretrained_model(model_path=args.model_path, model_base=args.model_base,
 7 |                                              model_name=model_name, device_map='cpu')
 8 | 
 9 |     model.save_pretrained(args.save_model_path, safe_serialization=args.safe_serialization)
10 |     processor.save_pretrained(args.save_model_path)
11 | 
12 | 
def _parse_cli_args():
    """Parse the command-line options of the merge script."""
    cli = argparse.ArgumentParser()
    # The three path-like options are all mandatory strings.
    for flag in ("--model-path", "--model-base", "--save-model-path"):
        cli.add_argument(flag, type=str, required=True)
    cli.add_argument("--safe-serialization", action='store_true')
    return cli.parse_args()


if __name__ == "__main__":
    merge_lora(_parse_cli_args())


--------------------------------------------------------------------------------
/src/loss/loss_factory.py:
--------------------------------------------------------------------------------
 1 | from .class_balance_loss import ClassBalancedCrossEntropyLoss, ClassBalancedFocalLoss
 2 | from .focal_loss import FocalLossCE
 3 | import torch.nn as nn
 4 | 
 5 | def get_loss_function(training_args, samples_per_class=None):
 6 |     
 7 |     if training_args.loss_type == "cross_entropy":
 8 |         return nn.CrossEntropyLoss()
 9 |     
10 |     elif training_args.loss_type == "focal_loss":
11 |         alpha = None if training_args.focal_alpha is None else [float(a) for a in training_args.focal_alpha.split(",")]
12 |         return FocalLossCE(alpha=alpha, gamma=training_args.focal_gamma, reduction="mean")
13 |     
14 |     elif training_args.loss_type == "class_balanced_cross_entropy":
15 |         return ClassBalancedCrossEntropyLoss(samples_per_cls=samples_per_class, beta=training_args.class_balanced_beta, reduction="mean")
16 |     
17 |     elif training_args.loss_type == "class_balanced_focal_loss":
18 |         return ClassBalancedFocalLoss(samples_per_cls=samples_per_class, beta=training_args.class_balanced_beta, gamma=training_args.focal_gamma, reduction="mean")
19 | 
20 |     else:
21 |         raise ValueError(f"Unknown loss type: {training_args.loss_type}")


--------------------------------------------------------------------------------
/scripts/zero3_offload.json:
--------------------------------------------------------------------------------
 1 | {
 2 |   "fp16": {
 3 |     "enabled": "auto",
 4 |     "loss_scale": 0,
 5 |     "loss_scale_window": 1000,
 6 |     "initial_scale_power": 16,
 7 |     "hysteresis": 2,
 8 |     "min_loss_scale": 1
 9 |   },
10 |   "bf16": {
11 |     "enabled": "auto"
12 |   },
13 |   "optimizer": {
14 |     "type": "AdamW",
15 |     "params": {
16 |       "lr": "auto",
17 |       "betas": "auto",
18 |       "eps": "auto",
19 |       "weight_decay": "auto"
20 |     }
21 |   },
22 |   "zero_optimization": {
23 |     "stage": 3,
24 |     "offload_optimizer": {
25 |       "device": "cpu",
26 |       "pin_memory": true
27 |     },
28 |     "offload_param": {
29 |       "device": "cpu",
30 |       "pin_memory": true
31 |     },
32 |     "overlap_comm": true,
33 |     "contiguous_gradients": true,
34 |     "sub_group_size": 1e9,
35 |     "reduce_bucket_size": "auto",
36 |     "stage3_prefetch_bucket_size": "auto",
37 |     "stage3_param_persistence_threshold": "auto",
38 |     "stage3_max_live_parameters": 1e9,
39 |     "stage3_max_reuse_distance": 1e9,
40 |     "gather_16bit_weights_on_model_save": true
41 |   },
42 |   "gradient_accumulation_steps": "auto",
43 |   "gradient_clipping": "auto",
44 |   "train_batch_size": "auto",
45 |   "train_micro_batch_size_per_gpu": "auto",
46 |   "steps_per_print": 1e5,
47 |   "wall_clock_breakdown": false
48 | }
49 | 


--------------------------------------------------------------------------------
/scripts/finetune_grpo.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
 4 | # MODEL_NAME="Qwen/Qwen2-VL-2B-Instruct"
 5 | MODEL_NAME="Qwen/Qwen2.5-VL-3B-Instruct"
 6 | 
 7 | export PYTHONPATH=src:$PYTHONPATH
 8 | 
 9 | deepspeed src/train/train_grpo.py \
10 |     --deepspeed scripts/zero3.json \
11 |     --use_liger_loss True \
12 |     --model_id $MODEL_NAME \
13 |     --data_path /path/to/your/training/data.json \
14 |     --image_folder /path/to/your/image/folder \
15 |     --freeze_vision_tower False \
16 |     --freeze_llm True \
17 |     --freeze_merger False \
18 |     --bf16 True \
19 |     --fp16 False \
20 |     --disable_flash_attn2 False \
21 |     --output_dir output/test_grpo \
22 |     --num_train_epochs 1 \
23 |     --num_generations 2 \
24 |     --per_device_train_batch_size 1 \
25 |     --gradient_accumulation_steps 1 \
26 |     --max_completion_length 256 \
27 |     --max_prompt_length 512 \
28 |     --image_min_pixels $((128 * 28 * 28)) \
29 |     --image_max_pixels $((256 * 28 * 28)) \
30 |     --learning_rate 5e-6 \
31 |     --remove_unused_columns False \
32 |     --weight_decay 0.1 \
33 |     --warmup_ratio 0.03 \
34 |     --lr_scheduler_type "cosine" \
35 |     --logging_steps 1 \
36 |     --tf32 True \
37 |     --gradient_checkpointing True \
38 |     --report_to tensorboard \
39 |     --lazy_preprocess True \
40 |     --save_strategy "epoch" \
41 |     --save_total_limit 10 \
42 |     --dataloader_num_workers 4


--------------------------------------------------------------------------------
/scripts/finetune_cls.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
 4 | # MODEL_NAME="Qwen/Qwen2-VL-2B-Instruct"
 5 | MODEL_NAME="Qwen/Qwen2.5-VL-3B-Instruct"
 6 | 
 7 | #  loss_type should be one of "cross_entropy", "focal_loss", "class_balanced_cross_entropy", or "class_balanced_focal_loss".
 8 | 
 9 | export PYTHONPATH=src:$PYTHONPATH
10 | 
11 | deepspeed src/train/train_cls.py \
12 |     --deepspeed scripts/zero3.json \
13 |     --model_id $MODEL_NAME \
14 |     --data_path /path/to/your/training/data.json \
15 |     --image_folder /path/to/your/image/folder \
16 |     --eval_path /path/to/your/training/data.json \
17 |     --eval_image_folder /path/to/your/image/folder \
18 |     --freeze_llm True \
19 |     --freeze_vision_tower False \
20 |     --freeze_merger False \
21 |     --bf16 True \
22 |     --fp16 False \
23 |     --loss_type "cross_entropy" \
24 |     --num_labels 2 \
25 |     --disable_flash_attn2 False \
26 |     --output_dir output/qwen_cls \
27 |     --num_train_epochs 3 \
28 |     --per_device_train_batch_size 4 \
29 |     --gradient_accumulation_steps 4 \
30 |     --learning_rate 3e-5 \
31 |     --head_lr 4e-5 \
32 |     --vision_lr 6e-6 \
33 |     --merger_lr 2e-5 \
34 |     --weight_decay 0.02 \
35 |     --warmup_ratio 0.05 \
36 |     --max_grad_norm 1.0 \
37 |     --lr_scheduler_type "cosine" \
38 |     --logging_steps 1 \
39 |     --tf32 True \
40 |     --eval_strategy "epoch" \
41 |     --load_best_model_at_end True \
42 |     --metric_for_best_model "eval_f1" \
43 |     --greater_is_better True \
44 |     --gradient_checkpointing True \
45 |     --report_to tensorboard \
46 |     --lazy_preprocess True \
47 |     --save_strategy "epoch" \
48 |     --dataloader_num_workers 4


--------------------------------------------------------------------------------
/scripts/finetune_dpo.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
 4 | # MODEL_NAME="Qwen/Qwen2-VL-2B-Instruct"
 5 | MODEL_NAME="Qwen/Qwen2.5-VL-3B-Instruct"
 6 | # MODEL_NAME="Qwen/Qwen2.5-VL-7B-Instruct"
 7 | 
 8 | GLOBAL_BATCH_SIZE=128
 9 | BATCH_PER_DEVICE=4
10 | NUM_DEVICES=8
11 | GRAD_ACCUM_STEPS=$((GLOBAL_BATCH_SIZE / (BATCH_PER_DEVICE * NUM_DEVICES)))
12 | 
13 | export PYTHONPATH=src:$PYTHONPATH
14 | 
15 | deepspeed src/train/train_dpo.py \
16 |     --dpo_loss "sigmoid" \
17 |     --precompute_ref_log_probs False \
18 |     --beta 0.1 \
19 |     --use_liger_loss True \
20 |     --deepspeed scripts/zero3_offload.json \
21 |     --model_id $MODEL_NAME \
22 |     --data_path /path/to/your/training/data.json \
23 |     --image_folder /path/to/your/image/folder \
24 |     --remove_unused_columns False \
25 |     --freeze_vision_tower False \
26 |     --freeze_llm False \
27 |     --freeze_merger False \
28 |     --bf16 True \
29 |     --fp16 False \
30 |     --disable_flash_attn2 False \
31 |     --output_dir output/test_dpo \
32 |     --num_train_epochs 1 \
33 |     --per_device_train_batch_size $BATCH_PER_DEVICE \
34 |     --gradient_accumulation_steps $GRAD_ACCUM_STEPS \
35 |     --image_min_pixels $((512 * 28 * 28)) \
36 |     --image_max_pixels $((1280 * 28 * 28)) \
37 |     --learning_rate 1e-5 \
38 |     --merger_lr 1e-5 \
39 |     --vision_lr 2e-6 \
40 |     --weight_decay 0.1 \
41 |     --warmup_ratio 0.03 \
42 |     --lr_scheduler_type "cosine" \
43 |     --logging_steps 1 \
44 |     --tf32 True \
45 |     --gradient_checkpointing True \
46 |     --report_to tensorboard \
47 |     --lazy_preprocess True \
48 |     --save_strategy "steps" \
49 |     --save_steps 200 \
50 |     --save_total_limit 10 \
51 |     --dataloader_num_workers 4


--------------------------------------------------------------------------------
/scripts/finetune.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
 4 | # MODEL_NAME="Qwen/Qwen2-VL-2B-Instruct"
 5 | # MODEL_NAME="Qwen/Qwen2.5-VL-3B-Instruct"
 6 | # MODEL_NAME="Qwen/Qwen2.5-VL-7B-Instruct"
 7 | 
 8 | MODEL_NAME="Qwen/Qwen3-VL-4B-Instruct"
 9 | 
10 | GLOBAL_BATCH_SIZE=128
11 | BATCH_PER_DEVICE=4
12 | NUM_DEVICES=8
13 | GRAD_ACCUM_STEPS=$((GLOBAL_BATCH_SIZE / (BATCH_PER_DEVICE * NUM_DEVICES)))
14 | 
15 | export PYTHONPATH=src:$PYTHONPATH
16 | 
17 | # If you want to set the min pixels and max pixels for Qwen3-VL, You should set as (N * 32 * 32)
18 | 
19 | deepspeed src/train/train_sft.py \
20 |     --use_liger_kernel True \
21 |     --deepspeed scripts/zero3_offload.json \
22 |     --model_id $MODEL_NAME \
23 |     --data_path /path/to/your/training/data.json \
24 |     --image_folder /path/to/your/image/folder \
25 |     --remove_unused_columns False \
26 |     --freeze_vision_tower False \
27 |     --freeze_llm False \
28 |     --freeze_merger False \
29 |     --bf16 True \
30 |     --fp16 False \
31 |     --disable_flash_attn2 False \
32 |     --output_dir output/test_fft \
33 |     --num_train_epochs 1 \
34 |     --per_device_train_batch_size $BATCH_PER_DEVICE \
35 |     --gradient_accumulation_steps $GRAD_ACCUM_STEPS \
36 |     --image_min_pixels $((512 * 28 * 28)) \
37 |     --image_max_pixels $((1280 * 28 * 28)) \
38 |     --learning_rate 1e-5 \
39 |     --merger_lr 1e-5 \
40 |     --vision_lr 2e-6 \
41 |     --weight_decay 0.1 \
42 |     --warmup_ratio 0.03 \
43 |     --lr_scheduler_type "cosine" \
44 |     --logging_steps 1 \
45 |     --tf32 True \
46 |     --gradient_checkpointing True \
47 |     --report_to tensorboard \
48 |     --lazy_preprocess True \
49 |     --save_strategy "steps" \
50 |     --save_steps 200 \
51 |     --save_total_limit 10 \
52 |     --dataloader_num_workers 4


--------------------------------------------------------------------------------
/scripts/finetune_video.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
 4 | # MODEL_NAME="Qwen/Qwen2-VL-2B-Instruct"
 5 | # MODEL_NAME="Qwen/Qwen2.5-VL-3B-Instruct"
 6 | # MODEL_NAME="Qwen/Qwen2.5-VL-7B-Instruct"
 7 | 
 8 | MODEL_NAME="Qwen/Qwen3-VL-4B-Instruct"
 9 | 
10 | export PYTHONPATH=src:$PYTHONPATH
11 | 
12 | GLOBAL_BATCH_SIZE=128
13 | BATCH_PER_DEVICE=4
14 | NUM_DEVICES=8
15 | GRAD_ACCUM_STEPS=$((GLOBAL_BATCH_SIZE / (BATCH_PER_DEVICE * NUM_DEVICES)))
16 | 
17 | # If your dataset is mixed with images and videos, you need to use zero2.
18 | # If you want to set the min pixels and max pixels for Qwen3-VL, You should set as (N * 32 * 32)
19 | 
20 | deepspeed src/train/train_sft.py \
21 |     --use_liger_kernel True \
22 |     --deepspeed scripts/zero3_offload.json \
23 |     --model_id $MODEL_NAME \
24 |     --data_path /path/to/your/training/data.json \
25 |     --image_folder /path/to/your/image/folder \
26 |     --remove_unused_columns False \
27 |     --freeze_vision_tower False \
28 |     --freeze_llm False \
29 |     --freeze_merger False \
30 |     --bf16 True \
31 |     --fp16 False \
32 |     --disable_flash_attn2 False \
33 |     --output_dir output/test_train \
34 |     --num_train_epochs 1 \
35 |     --per_device_train_batch_size $BATCH_PER_DEVICE \
36 |     --gradient_accumulation_steps $GRAD_ACCUM_STEPS \
37 |     --video_max_pixels $((360 * 420)) \
38 |     --fps 1.0 \
39 |     --learning_rate 1e-5 \
40 |     --merger_lr 1e-5 \
41 |     --vision_lr 2e-6 \
42 |     --weight_decay 0.1 \
43 |     --warmup_ratio 0.03 \
44 |     --lr_scheduler_type "cosine" \
45 |     --logging_steps 1 \
46 |     --tf32 True \
47 |     --gradient_checkpointing True \
48 |     --report_to tensorboard \
49 |     --lazy_preprocess True \
50 |     --save_strategy "steps" \
51 |     --save_steps 1 \
52 |     --save_total_limit 10 \
53 |     --dataloader_num_workers 4


--------------------------------------------------------------------------------
/src/loss/focal_loss.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | import torch.nn.functional as F
 4 | 
 5 | class FocalLossCE(nn.Module):
 6 |     """
 7 |     Plain Focal Loss (multi-class), optional class weights alpha
 8 |     - gamma=0, alpha=None  → nn.CrossEntropyLoss(mean)와 동일 스케일
 9 |     - gamma=0, alpha!=None → nn.CrossEntropyLoss(weight=alpha, mean)와 동일 스케일
10 |     """
11 |     def __init__(self, alpha=None, gamma=1.5, reduction="mean"):
12 |         super().__init__()
13 |         if alpha is not None and not torch.is_tensor(alpha):
14 |             alpha = torch.tensor(alpha, dtype=torch.float32)
15 |         # 빈 텐서는 "가중치 없음" 신호로 사용
16 |         self.register_buffer("alpha", alpha if alpha is not None else torch.tensor([]))
17 |         self.gamma = float(gamma)
18 |         self.reduction = reduction
19 | 
20 |     def forward(self, logits, targets):
21 |         """
22 |         logits:  [B, C]
23 |         targets: [B] (long)
24 |         """
25 |         targets = targets.long().to(logits.device)
26 |         log_probs = F.log_softmax(logits, dim=1)                           # [B, C]
27 |         log_pt = log_probs.gather(1, targets.unsqueeze(1)).squeeze(1)      # [B]
28 |         pt = log_pt.exp().clamp_min(1e-8)                                   # [B]
29 | 
30 |         if self.alpha.numel() > 0:
31 |             alpha_t = self.alpha.to(device=logits.device, dtype=logits.dtype)[targets]  # [B]
32 |         else:
33 |             alpha_t = torch.ones_like(pt)
34 | 
35 |         loss = -alpha_t * (1.0 - pt).pow(self.gamma) * log_pt               # [B]
36 | 
37 |         if self.reduction == "mean":
38 |             # CE의 "가중 평균"과 스케일 일치: 분모를 가중치 합으로
39 |             denom = (alpha_t.sum() if self.alpha.numel() > 0 else pt.new_tensor(len(pt)))
40 |             return loss.sum() / (denom + 1e-12)
41 |         elif self.reduction == "sum":
42 |             return loss.sum()
43 |         else:
44 |             return loss
45 | 


--------------------------------------------------------------------------------
/scripts/finetune_lora.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | # MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
 4 | # MODEL_NAME="Qwen/Qwen2-VL-2B-Instruct"
 5 | # MODEL_NAME="Qwen/Qwen2.5-VL-3B-Instruct"
 6 | # MODEL_NAME="Qwen/Qwen2.5-VL-7B-Instruct"
 7 | 
 8 | MODEL_NAME="Qwen/Qwen3-VL-4B-Instruct"
 9 | 
10 | export PYTHONPATH=src:$PYTHONPATH
11 | 
12 | GLOBAL_BATCH_SIZE=128
13 | BATCH_PER_DEVICE=4
14 | NUM_DEVICES=8
15 | GRAD_ACCUM_STEPS=$((GLOBAL_BATCH_SIZE / (BATCH_PER_DEVICE * NUM_DEVICES)))
16 | 
17 | # If you want to tune the `embed_token` with LoRA, You need to tune `lm_head` together
18 | 
19 | # If you want to set the min pixels and max pixels for Qwen3-VL, You should set as (N * 32 * 32)
20 | 
21 | deepspeed src/train/train_sft.py \
22 |     --use_liger_kernel True \
23 |     --lora_enable True \
24 |     --use_dora False \
25 |     --lora_namespan_exclude "['lm_head', 'embed_tokens']" \
26 |     --lora_rank 32 \
27 |     --lora_alpha 64 \
28 |     --lora_dropout 0.05 \
29 |     --num_lora_modules -1 \
30 |     --deepspeed scripts/zero3_offload.json \
31 |     --model_id $MODEL_NAME \
32 |     --data_path /path/to/your/training/data.json \
33 |     --image_folder /path/to/your/image/folder \
34 |     --remove_unused_columns False \
35 |     --freeze_vision_tower False \
36 |     --freeze_llm True \
37 |     --freeze_merger False \
38 |     --bf16 True \
39 |     --fp16 False \
40 |     --disable_flash_attn2 False \
41 |     --output_dir output/testing_lora \
42 |     --num_train_epochs 1 \
43 |     --per_device_train_batch_size $BATCH_PER_DEVICE \
44 |     --gradient_accumulation_steps $GRAD_ACCUM_STEPS \
45 |     --image_min_pixels $((256 * 28 * 28)) \
46 |     --image_max_pixels $((1280 * 28 * 28)) \
47 |     --learning_rate 1e-4 \
48 |     --merger_lr 1e-5 \
49 |     --vision_lr 2e-6 \
50 |     --weight_decay 0.1 \
51 |     --warmup_ratio 0.03 \
52 |     --lr_scheduler_type "cosine" \
53 |     --logging_steps 1 \
54 |     --tf32 True \
55 |     --gradient_checkpointing True \
56 |     --report_to tensorboard \
57 |     --lazy_preprocess True \
58 |     --save_strategy "steps" \
59 |     --save_steps 200 \
60 |     --save_total_limit 10 \
61 |     --dataloader_num_workers 4


--------------------------------------------------------------------------------
/scripts/finetune_lora_vision.sh:
--------------------------------------------------------------------------------
 1 | #!/bin/bash
 2 | 
 3 | 
 4 | # MODEL_NAME="Qwen/Qwen2-VL-7B-Instruct"
 5 | # MODEL_NAME="Qwen/Qwen2-VL-2B-Instruct"
 6 | MODEL_NAME="Qwen/Qwen2.5-VL-3B-Instruct"
 7 | # MODEL_NAME="Qwen/Qwen2.5-VL-7B-Instruct"
 8 | 
 9 | export PYTHONPATH=src:$PYTHONPATH
10 | 
11 | GLOBAL_BATCH_SIZE=128
12 | BATCH_PER_DEVICE=4
13 | NUM_DEVICES=8
14 | GRAD_ACCUM_STEPS=$((GLOBAL_BATCH_SIZE / (BATCH_PER_DEVICE * NUM_DEVICES)))
15 | 
16 | # If you want to tune the `embed_token` with LoRA, You need to tune `lm_head` together
17 | # You should freeze the the merger also, becuase the merger is included in the vision_tower.
18 | 
19 | # Please set the gradient_checkpointing to False when you are using LoRA with vision models.
20 | 
21 | deepspeed src/train/train_sft.py \
22 |     --use_liger_kernel True \
23 |     --lora_enable True \
24 |     --vision_lora True \
25 |     --use_dora False \
26 |     --lora_namespan_exclude "['lm_head', 'embed_tokens']" \
27 |     --lora_rank 32 \
28 |     --lora_alpha 64 \
29 |     --lora_dropout 0.05 \
30 |     --num_lora_modules -1 \
31 |     --deepspeed scripts/zero3.json \
32 |     --model_id $MODEL_NAME \
33 |     --data_path /path/to/your/training/data.json \
34 |     --image_folder /path/to/your/image/folder \
35 |     --remove_unused_columns False \
36 |     --freeze_vision_tower True \
37 |     --freeze_llm True \
38 |     --freeze_merger True \
39 |     --bf16 True \
40 |     --fp16 False \
41 |     --disable_flash_attn2 False \
42 |     --output_dir output/lora_vision_test \
43 |     --num_train_epochs 1 \
44 |     --per_device_train_batch_size $BATCH_PER_DEVICE \
45 |     --gradient_accumulation_steps $GRAD_ACCUM_STEPS \
46 |     --image_min_pixels $((256 * 28 * 28)) \
47 |     --image_max_pixels $((1280 * 28 * 28)) \
48 |     --learning_rate 2e-4 \
49 |     --weight_decay 0.1 \
50 |     --warmup_ratio 0.03 \
51 |     --lr_scheduler_type "cosine" \
52 |     --logging_steps 1 \
53 |     --tf32 True \
54 |     --gradient_checkpointing False \
55 |     --report_to tensorboard \
56 |     --lazy_preprocess True \
57 |     --save_strategy "steps" \
58 |     --save_steps 200 \
59 |     --save_total_limit 10 \
60 |     --dataloader_num_workers 4


--------------------------------------------------------------------------------
/src/train/reward_funcs.py:
--------------------------------------------------------------------------------
 1 | import re
 2 | from math_verify import LatexExtractionConfig, parse, verify
 3 | from latex2sympy2_extended import NormalizationConfig
 4 | 
 5 | def accuracy_reward(completions, assistant, **kwargs):
 6 |     """Reward function that checks if the completion is correct using either symbolic verification or exact string matching."""
 7 |     rewards = []
 8 | 
 9 |     for completion, sol in zip(completions, assistant):
10 |         try:
11 |             gold_parsed = parse(sol, extraction_mode="first_match")
12 |         except Exception as e:
13 |             gold_parsed = []
14 | 
15 |         if len(gold_parsed) != 0:
16 |             # Try parsing predicted answer too
17 |             try:
18 |                 answer_parsed = parse(
19 |                     completion,
20 |                     extraction_config=[
21 |                         LatexExtractionConfig(
22 |                             normalization_config=NormalizationConfig(
23 |                                 nits=False,
24 |                                 malformed_operators=False,
25 |                                 basic_latex=True,
26 |                                 boxed="all",
27 |                                 units=True,
28 |                             ),
29 |                             boxed_match_priority=0,
30 |                             try_extract_without_anchor=False,
31 |                         )
32 |                     ],
33 |                     extraction_mode="first_match",
34 |                 )
35 |                 reward = float(verify(gold_parsed, answer_parsed))
36 |             except Exception as e:
37 |                 print(f"verify failed: {e}, answer: {completion}, gold: {sol}")
38 |                 reward = None
39 |         else:
40 |             # fallback to text match
41 |             reward = float(completion.strip().lower() == sol.strip().lower())
42 | 
43 |         rewards.append(reward)
44 | 
45 |     return rewards
46 | 
# Required layout: a <think> block immediately followed by an <answer> block,
# with every tag on its own line. DOTALL lets both bodies span several lines.
# MULTILINE is deliberately NOT set: with it, `$` matches at the end of any
# line, so text following </answer> would still be accepted.
_FORMAT_PATTERN = re.compile(
    r"^<think>\n.*?\n</think>\n<answer>\n.*?\n</answer>$",
    re.DOTALL,
)


def format_reward(completions, **kwargs):
    """Reward completions that follow the <think>/<answer> layout exactly.

    Args:
        completions: Generated answers (strings).
        **kwargs: Accepted and ignored.

    Returns:
        A list with 1.0 for each completion that consists of the
        ``<think>...</think>`` block followed by the ``<answer>...</answer>``
        block (a single trailing newline is tolerated) and 0.0 otherwise.
    """
    return [1.0 if _FORMAT_PATTERN.match(content) else 0.0 for content in completions]


--------------------------------------------------------------------------------
/src/loss/class_balance_loss.py:
--------------------------------------------------------------------------------
 1 | import torch
 2 | import torch.nn as nn
 3 | import torch.nn.functional as F
 4 | import numpy as np
 5 | 
 6 | class ClassBalancedCrossEntropyLoss(nn.Module):
 7 |     """
 8 |     Class‑Balanced Cross‑Entropy (CB‑CE)
 9 |     """
10 |     def __init__(self, samples_per_cls, beta=0.999, reduction="mean"):
11 |         """
12 |         samples_per_cls : list[int]  # 각 클래스로부터의 원본 샘플 개수
13 |         beta            : float      # 0.9~0.999 사이 권장 (0이면 일반 CE와 동일)
14 |         """
15 |         super().__init__()
16 |         eff_num = 1.0 - np.power(beta, samples_per_cls)
17 |         weights = (1.0 - beta) / np.array(eff_num)
18 |         weights = weights / weights.sum() * len(samples_per_cls)          # 정규화
19 |         self.register_buffer("weights", torch.tensor(weights, dtype=torch.float32))
20 |         self.reduction = reduction
21 | 
22 |     def forward(self, logits, targets):
23 |         """
24 |         logits  : [B, C]
25 |         targets : [B]  (long)
26 |         """
27 |         loss = F.cross_entropy(
28 |             logits, targets,
29 |             weight=self.weights.to(logits.device),
30 |             reduction=self.reduction
31 |         )
32 |         return loss
33 |     
class ClassBalancedFocalLoss(nn.Module):
    """
    Class-Balanced Focal Loss.

    Per-sample loss is -w_c * (1 - p_t) ** gamma * log(p_t), where w_c is the
    class-balanced weight of the target class and p_t the softmax probability
    assigned to it.
    """
    def __init__(self, samples_per_cls, beta=0.9995, gamma=1.5, reduction="mean"):
        """
        samples_per_cls : list[int]  # number of samples of each class
        beta            : float      # base of the effective-number term
        gamma           : float      # focusing exponent applied to (1 - p_t)
        reduction       : str        # "mean", "sum"; anything else returns per-sample losses
        """
        super().__init__()
        w = self._class_balanced_weights(samples_per_cls, beta)
        self.register_buffer("weights", torch.tensor(w, dtype=torch.float32))
        self.gamma = float(gamma)
        self.reduction = reduction

    @staticmethod
    def _class_balanced_weights(samples_per_cls, beta):
        """Return the normalized class-balanced weights as a NumPy array.

        Classes whose effective number is 0 (e.g. classes with no samples)
        receive weight 0 instead of producing inf/NaN weights.

        Raises:
            ValueError: if no class has a non-zero effective number.
        """
        counts = np.asarray(samples_per_cls, dtype=np.float64)
        eff = 1.0 - np.power(beta, counts)
        usable = eff != 0
        if not usable.any():
            raise ValueError(
                "Cannot compute class-balanced weights: every class has a zero effective number "
                f"(samples_per_cls={samples_per_cls}, beta={beta})."
            )
        w = np.zeros_like(eff)
        w[usable] = (1.0 - beta) / eff[usable]
        # Rescale so the weights sum to the number of classes.
        return w / w.sum() * len(samples_per_cls)

    def forward(self, logits, targets):
        """
        logits  : [B, C]
        targets : [B]  (class indices; cast to long here)
        """
        targets = targets.long().to(logits.device)
        weights = self.weights.to(device=logits.device, dtype=logits.dtype)

        log_probs = F.log_softmax(logits, dim=1)
        log_pt = log_probs.gather(1, targets.unsqueeze(1)).squeeze(1)  # [B]
        pt = log_pt.exp().clamp_min(1e-8)                               # [B]

        cb_w = weights[targets]                                         # [B]
        loss = -cb_w * (1.0 - pt).pow(self.gamma) * log_pt              # [B]

        if self.reduction == "mean":
            # Divide by the summed weights, i.e. the same scale as weighted CE.
            return loss.sum() / (cb_w.sum() + 1e-12)
        elif self.reduction == "sum":
            return loss.sum()
        else:
            return loss


--------------------------------------------------------------------------------
/src/train/train_utils.py:
--------------------------------------------------------------------------------
 1 | import transformers
 2 | import torch
 3 | import logging
 4 | 
 5 | 
 6 | def maybe_zero_3(param, ignore_status=False, name=None, device=torch.device('cpu')):
 7 |     from deepspeed import zero
 8 |     from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus
 9 |     if type(device) is str:
10 |         device = torch.device(device)
11 |     if hasattr(param, "ds_id"):
12 |         if param.ds_status == ZeroParamStatus.NOT_AVAILABLE:
13 |             if not ignore_status:
14 |                 logging.warning(f"{name}: param.ds_status != ZeroParamStatus.NOT_AVAILABLE: {param.ds_status}")
15 |         with zero.GatheredParameters([param]):
16 |             param = param.data.detach()
17 |     else:
18 |         param = param.detach()
19 |     if device == param.device:
20 |         return param.clone()
21 |     else:
22 |         return param.to(device)
23 | 
24 | # Borrowed from peft.utils.get_peft_model_state_dict
def get_peft_state_maybe_zero_3(named_params, bias):
    """Collect the LoRA (and optionally bias) parameters as detached copies.

    Args:
        named_params: Iterable of ``(name, parameter)`` pairs.
        bias: Which bias terms to include next to the ``lora_`` parameters:
            ``"none"`` (no bias), ``"all"`` (every parameter whose name
            contains ``"bias"``) or ``"lora_only"`` (only biases whose name
            matches the prefix of a ``lora_`` parameter plus ``"bias"``).

    Returns:
        Dict mapping parameter names to copies produced by ``maybe_zero_3``.

    Raises:
        NotImplementedError: if ``bias`` is not one of the supported modes.
    """
    if bias == "none":
        to_return = {k: t for k, t in named_params if "lora_" in k}
    elif bias == "all":
        to_return = {k: t for k, t in named_params if "lora_" in k or "bias" in k}
    elif bias == "lora_only":
        to_return = {}
        maybe_lora_bias = {}
        lora_bias_names = set()
        for k, t in named_params:
            if "lora_" in k:
                to_return[k] = t
                # Name of the bias belonging to the module this LoRA weight adapts.
                lora_bias_names.add(k.split("lora_")[0] + "bias")
            elif "bias" in k:
                maybe_lora_bias[k] = t
        # Keep only the biases that belong to a LoRA-adapted module.
        for k, t in maybe_lora_bias.items():
            if k in lora_bias_names:
                to_return[k] = t
    else:
        raise NotImplementedError(f"Unsupported bias mode: {bias!r}")
    return {k: maybe_zero_3(v, ignore_status=True) for k, v in to_return.items()}
48 | 
49 | 
def get_peft_state_non_lora_maybe_zero_3(named_params, require_grad_only=True):
    """Collect detached copies of every parameter whose name lacks ``"lora_"``.

    With ``require_grad_only`` set, parameters that do not require gradients
    are left out as well. Copies are produced by ``maybe_zero_3``.
    """
    selected = {}
    for param_name, param in named_params:
        if "lora_" in param_name:
            continue
        if require_grad_only and not param.requires_grad:
            continue
        selected[param_name] = param

    gathered = {}
    for param_name, param in selected.items():
        gathered[param_name] = maybe_zero_3(param, ignore_status=True)
    return gathered
56 | 
def safe_save_model_for_hf_trainer(trainer: transformers.Trainer,
                                   output_dir: str):
    """Write the trainer's model to ``output_dir``.

    With DeepSpeed enabled the work is delegated to ``trainer.save_model``
    after a CUDA synchronize. Otherwise the state dict is moved to CPU and,
    if ``trainer.args.should_save`` is set, saved together with the model config.
    """
    if trainer.deepspeed:
        torch.cuda.synchronize()
        trainer.save_model(output_dir)
        return

    model_weights = trainer.model.state_dict()
    if not trainer.args.should_save:
        return

    weights_on_cpu = {}
    for weight_name, tensor in model_weights.items():
        weights_on_cpu[weight_name] = tensor.cpu()
    # Drop the reference to the original state dict before writing to disk.
    del model_weights
    trainer._save(output_dir, state_dict=weights_on_cpu)  # noqa
    trainer.model.config.save_pretrained(output_dir)


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
  1 | accelerate==1.10.1
  2 | aiohappyeyeballs==2.6.1
  3 | aiohttp==3.11.18
  4 | aiosignal==1.3.2
  5 | annotated-types==0.7.0
  6 | asttokens==3.0.0
  7 | attrs==25.3.0
  8 | av==14.3.0
  9 | bitsandbytes==0.45.5
 10 | certifi==2025.4.26
 11 | charset-normalizer==3.4.1
 12 | click==8.1.8
 13 | comm==0.2.2
 14 | contourpy==1.3.3
 15 | cycler==0.12.1
 16 | datasets==3.5.1
 17 | debugpy==1.8.14
 18 | decorator==5.2.1
 19 | decord==0.6.0
 20 | deepspeed==0.17.5
 21 | dill==0.3.8
 22 | docker-pycreds==0.4.0
 23 | einops==0.8.1
 24 | exceptiongroup==1.2.2
 25 | executing==2.2.0
 26 | filelock==3.13.1
 27 | fonttools==4.59.2
 28 | frozenlist==1.6.0
 29 | fsspec==2024.6.1
 30 | gitdb==4.0.12
 31 | GitPython==3.1.44
 32 | hf-xet==1.1.9
 33 | hjson==3.1.0
 34 | huggingface-hub==0.34.4
 35 | idna==3.10
 36 | importlib_metadata==8.6.1
 37 | ipykernel==6.29.5
 38 | ipython==9.2.0
 39 | ipython_pygments_lexers==1.1.1
 40 | ipywidgets==8.1.6
 41 | jedi==0.19.2
 42 | Jinja2==3.1.4
 43 | joblib==1.5.2
 44 | jupyter_client==8.6.3
 45 | jupyter_core==5.7.2
 46 | jupyterlab_widgets==3.0.14
 47 | kiwisolver==1.4.9
 48 | liger_kernel==0.6.4
 49 | markdown-it-py==3.0.0
 50 | MarkupSafe==2.1.5
 51 | matplotlib==3.10.6
 52 | matplotlib-inline==0.1.7
 53 | mdurl==0.1.2
 54 | mpmath==1.3.0
 55 | msgpack==1.1.0
 56 | multidict==6.4.3
 57 | multiprocess==0.70.16
 58 | nest_asyncio==1.6.0
 59 | networkx==3.3
 60 | ninja==1.11.1.4
 61 | numpy==2.1.2
 62 | nvidia-cublas-cu12==12.8.4.1
 63 | nvidia-cuda-cupti-cu12==12.8.90
 64 | nvidia-cuda-nvrtc-cu12==12.8.93
 65 | nvidia-cuda-runtime-cu12==12.8.90
 66 | nvidia-cudnn-cu12==9.10.2.21
 67 | nvidia-cufft-cu12==11.3.3.83
 68 | nvidia-cufile-cu12==1.13.1.3
 69 | nvidia-curand-cu12==10.3.9.90
 70 | nvidia-cusolver-cu12==11.7.3.90
 71 | nvidia-cusparse-cu12==12.5.8.93
 72 | nvidia-cusparselt-cu12==0.7.1
 73 | nvidia-ml-py==12.570.86
 74 | nvidia-nccl-cu12==2.27.3
 75 | nvidia-nvjitlink-cu12==12.8.93
 76 | nvidia-nvtx-cu12==12.8.90
 77 | opencv-python==4.11.0.86
 78 | packaging==25.0
 79 | pandas==2.2.3
 80 | parso==0.8.4
 81 | peft==0.15.2
 82 | pexpect==4.9.0
 83 | pickleshare==0.7.5
 84 | pillow==11.3.0
 85 | pip==25.1
 86 | platformdirs==4.3.7
 87 | prompt_toolkit==3.0.51
 88 | propcache==0.3.1
 89 | protobuf==6.30.2
 90 | psutil==7.0.0
 91 | ptyprocess==0.7.0
 92 | pure_eval==0.2.3
 93 | py-cpuinfo==9.0.0
 94 | pyarrow==20.0.0
 95 | pydantic==2.11.3
 96 | pydantic_core==2.33.1
 97 | Pygments==2.19.1
 98 | pyparsing==3.2.3
 99 | python-dateutil==2.9.0.post0
100 | pytz==2025.2
101 | PyYAML==6.0.2
102 | pyzmq==26.4.0
103 | regex==2024.11.6
104 | requests==2.32.3
105 | rich==14.0.0
106 | safetensors==0.6.2
107 | scikit-learn==1.7.2
108 | scipy==1.16.2
109 | sentry-sdk==2.27.0
110 | setproctitle==1.3.5
111 | setuptools==79.0.1
112 | six==1.17.0
113 | smmap==5.0.2
114 | stack_data==0.6.3
115 | sympy==1.13.3
116 | tabulate==0.9.0
117 | tensorboardX==2.6.2.2
118 | threadpoolctl==3.6.0
119 | tokenizers==0.22.0
120 | torch==2.8.0
121 | torchaudio==2.8.0
122 | torchvision==0.23.0
123 | tornado==6.4.2
124 | tqdm==4.67.1
125 | traitlets==5.14.3
126 | transformers==4.57.1
127 | triton==3.4.0
128 | trl==0.25.0
129 | typing_extensions==4.13.2
130 | typing-inspection==0.4.0
131 | tzdata==2025.2
132 | ujson==5.10.0
133 | urllib3==2.4.0
134 | wandb==0.19.10
135 | wcwidth==0.2.13
136 | wheel==0.45.1
137 | widgetsnbextension==4.0.14
138 | xxhash==3.5.0
139 | yarl==1.20.0
140 | zipp==3.21.0


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
  1 | # Byte-compiled / optimized / DLL files
  2 | __pycache__/
  3 | *.py[cod]
  4 | *$py.class
  5 | 
  6 | # C extensions
  7 | *.so
  8 | 
  9 | # Distribution / packaging
 10 | .Python
 11 | build/
 12 | develop-eggs/
 13 | dist/
 14 | downloads/
 15 | eggs/
 16 | .eggs/
 17 | lib/
 18 | lib64/
 19 | parts/
 20 | sdist/
 21 | var/
 22 | wheels/
 23 | share/python-wheels/
 24 | *.egg-info/
 25 | .installed.cfg
 26 | *.egg
 27 | MANIFEST
 28 | 
 29 | # PyInstaller
 30 | #  Usually these files are written by a python script from a template
 31 | #  before PyInstaller builds the exe, so as to inject date/other infos into it.
 32 | *.manifest
 33 | *.spec
 34 | 
 35 | # Installer logs
 36 | pip-log.txt
 37 | pip-delete-this-directory.txt
 38 | 
 39 | # Unit test / coverage reports
 40 | htmlcov/
 41 | .tox/
 42 | .nox/
 43 | .coverage
 44 | .coverage.*
 45 | .cache
 46 | nosetests.xml
 47 | coverage.xml
 48 | *.cover
 49 | *.py,cover
 50 | .hypothesis/
 51 | .pytest_cache/
 52 | cover/
 53 | 
 54 | # Translations
 55 | *.mo
 56 | *.pot
 57 | 
 58 | # Django stuff:
 59 | *.log
 60 | local_settings.py
 61 | db.sqlite3
 62 | db.sqlite3-journal
 63 | 
 64 | # Flask stuff:
 65 | instance/
 66 | .webassets-cache
 67 | 
 68 | # Scrapy stuff:
 69 | .scrapy
 70 | 
 71 | # Sphinx documentation
 72 | docs/_build/
 73 | 
 74 | # PyBuilder
 75 | .pybuilder/
 76 | target/
 77 | 
 78 | # Jupyter Notebook
 79 | .ipynb_checkpoints
 80 | 
 81 | # IPython
 82 | profile_default/
 83 | ipython_config.py
 84 | 
 85 | # pyenv
 86 | #   For a library or package, you might want to ignore these files since the code is
 87 | #   intended to run in multiple environments; otherwise, check them in:
 88 | # .python-version
 89 | 
 90 | # pipenv
 91 | #   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
 92 | #   However, in case of collaboration, if having platform-specific dependencies or dependencies
 93 | #   having no cross-platform support, pipenv may install dependencies that don't work, or not
 94 | #   install all needed dependencies.
 95 | #Pipfile.lock
 96 | 
 97 | # poetry
 98 | #   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
 99 | #   This is especially recommended for binary packages to ensure reproducibility, and is more
100 | #   commonly ignored for libraries.
101 | #   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102 | #poetry.lock
103 | 
104 | # pdm
105 | #   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106 | #pdm.lock
107 | #   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108 | #   in version control.
109 | #   https://pdm.fming.dev/latest/usage/project/#working-with-version-control
110 | .pdm.toml
111 | .pdm-python
112 | .pdm-build/
113 | 
114 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
115 | __pypackages__/
116 | 
117 | # Celery stuff
118 | celerybeat-schedule
119 | celerybeat.pid
120 | 
121 | # SageMath parsed files
122 | *.sage.py
123 | 
124 | # Environments
125 | .env
126 | .venv
127 | env/
128 | venv/
129 | ENV/
130 | env.bak/
131 | venv.bak/
132 | 
133 | # Spyder project settings
134 | .spyderproject
135 | .spyproject
136 | 
137 | # Rope project settings
138 | .ropeproject
139 | 
140 | # mkdocs documentation
141 | /site
142 | 
143 | # mypy
144 | .mypy_cache/
145 | .dmypy.json
146 | dmypy.json
147 | 
148 | # Pyre type checker
149 | .pyre/
150 | 
151 | # pytype static type analyzer
152 | .pytype/
153 | 
154 | # Cython debug symbols
155 | cython_debug/
156 | 
157 | # PyCharm
158 | #  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
159 | #  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
160 | #  and can be added to the global gitignore or merged into this file.  For a more nuclear
161 | #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
162 | #.idea/
163 | 
164 | output
165 | tf-logs
166 | tmp/
167 | scripts_tmp/
168 | logs/*.log
169 | 
170 | testing.ipynb


--------------------------------------------------------------------------------
/src/dataset/data_utils.py:
--------------------------------------------------------------------------------
  1 | import re
  2 | import torch
  3 | 
  4 | from qwen_vl_utils import process_vision_info
  5 | 
  6 | from src.constants import (
  7 |     DEFAULT_IMAGE_TOKEN,
  8 |     DEFAULT_VIDEO_TOKEN,
  9 |     LLAVA_IMAGE_TOKEN,
 10 |     LLAVA_VIDEO_TOKEN,
 11 |     VISION_START_TOKEN,
 12 |     VISION_END_TOKEN,
 13 | )
 14 | 
 15 | 
 16 | def replace_image_tokens(input_string, is_video=False):
 17 |     if is_video:
 18 |         pattern = r'\n?' + re.escape(LLAVA_VIDEO_TOKEN) + r'\n?'
 19 |         replacement = VISION_START_TOKEN + DEFAULT_VIDEO_TOKEN + VISION_END_TOKEN
 20 |     else:
 21 |         pattern = r'\n?' + re.escape(LLAVA_IMAGE_TOKEN) + r'\n?'
 22 |         replacement = VISION_START_TOKEN + DEFAULT_IMAGE_TOKEN + VISION_END_TOKEN
 23 | 
 24 |     return re.sub(pattern, replacement, input_string)
 25 | 
 26 | def llava_to_openai(conversations, is_video=False):
 27 |     role_mapping = {"human": "user", "gpt": "assistant"}
 28 | 
 29 |     transformed_data = []
 30 |     for conversation in conversations:
 31 |         transformed_content = replace_image_tokens(conversation["value"], is_video=is_video)
 32 |         transformed_entry = {
 33 |             "role": role_mapping.get(conversation["from"], conversation["from"]),
 34 |             "content": transformed_content,
 35 |         }
 36 |         transformed_data.append(transformed_entry)
 37 | 
 38 |     return transformed_data
 39 | 
 40 | 
 41 | def truncate_sequence(input_ids, labels, max_length, eos_token_id):
 42 |     if input_ids.size(0) > max_length:
 43 |         input_ids = input_ids[:max_length-1]
 44 |         labels = labels[:max_length-1]
 45 | 
 46 |     if eos_token_id is not None:
 47 |         input_ids = torch.cat([input_ids, torch.tensor([eos_token_id])])
 48 |         labels = torch.cat([labels, torch.tensor([eos_token_id])])
 49 | 
 50 |     return input_ids, labels
 51 | 
 52 | def pad_sequence(sequences, padding_side='right', padding_value=0):
 53 |     """
 54 |     Pad a list of sequences to the same length.
 55 |     sequences: list of tensors in [seq_len, *] shape
 56 |     """
 57 |     assert padding_side in ['right', 'left']
 58 |     max_size = sequences[0].size()
 59 |     trailing_dims = max_size[1:]
 60 |     max_len = max(len(seq) for seq in sequences)
 61 |     batch_size = len(sequences)
 62 |     output = sequences[0].new_full((batch_size, max_len) + trailing_dims, padding_value)
 63 |     for i, seq in enumerate(sequences):
 64 |         length = seq.size(0)
 65 |         if padding_side == 'right':
 66 |             output.data[i, :length] = seq
 67 |         else:
 68 |             output.data[i, -length:] = seq
 69 |     return output
 70 | 
 71 | def get_image_info(image_path, min_pixel, max_pixel, width, height, image_patch_size):
 72 |     # Using this because of process_vision_info function
 73 |     # Need to fix this in the future
 74 |     content = {
 75 |         "type": "image", 
 76 |         "image": image_path,
 77 |         "min_pixels": min_pixel,
 78 |         "max_pixels": max_pixel
 79 |     }
 80 | 
 81 |     if width is not None and height is not None:
 82 |         content["resized_width"] = width
 83 |         content["resized_height"] = height
 84 |     
 85 |     messages = [
 86 |         {
 87 |             "role": "user", 
 88 |             "content": [content]
 89 |         }
 90 |     ]
 91 | 
 92 |     image_input, _ = process_vision_info(messages, image_patch_size=image_patch_size)
 93 | 
 94 |     return image_input[0]
 95 | 
 96 | def get_video_info(video_path, min_pixels, max_pixels, width, height, fps, image_patch_size, return_video_metadata=False):
 97 |     # Using this because of process_vision_info function
 98 |     # Need to fix this in the future
 99 |     content = {
100 |         "type": "video", 
101 |         "video": video_path,
102 |         "min_pixels": min_pixels,
103 |         "max_pixels": max_pixels,
104 |         "fps": fps
105 |     }
106 | 
107 |     if width is not None and height is not None:
108 |         content["resized_width"] = width
109 |         content["resized_height"] = height
110 |     
111 |     messages = [
112 |         {
113 |             "role": "user", 
114 |             "content": [content]
115 |         }
116 |     ]
117 | 
118 |     _, video_input, video_kwargs = process_vision_info(
119 |         messages, 
120 |         return_video_kwargs=True, 
121 |         image_patch_size=image_patch_size, 
122 |         return_video_metadata=return_video_metadata
123 |     )
124 | 
125 |     return video_input[0], video_kwargs
126 | 
def samples_per_class_from_ids(label_ids, num_classes):
    """Count how often each class id occurs in ``label_ids``.

    Returns a plain list with at least ``num_classes`` entries, where entry
    ``c`` is the number of labels equal to ``c``.
    """
    ids = torch.as_tensor(label_ids, dtype=torch.long)
    per_class = torch.bincount(ids, minlength=num_classes)
    return per_class.tolist()


--------------------------------------------------------------------------------
/src/dataset/grpo_dataset.py:
--------------------------------------------------------------------------------
  1 | import copy
  2 | import os
  3 | from typing import Dict
  4 | import torch
  5 | import transformers
  6 | import ujson as json
  7 | from torch.utils.data import Dataset
  8 | 
  9 | from src.params import DataArguments
 10 | from src.constants import (
 11 |     DEFAULT_IM_START_TOKEN,
 12 |     DEFAULT_IM_END_TOKEN,
 13 |     SYSTEM_MESSAGE,
 14 | )
 15 | 
 16 | from .data_utils import get_image_info, get_video_info, llava_to_openai
 17 | 
 18 | class GRPODataset(Dataset):
 19 |     """Dataset for DPO training"""
 20 | 
 21 |     def __init__(
 22 |         self,
 23 |         data_path: str | list,
 24 |         processor: transformers.ProcessorMixin,
 25 |         data_args: DataArguments,
 26 |         model_id,
 27 |         padding=True,
 28 |     ):
 29 |         super(GRPODataset, self).__init__()
 30 |         if isinstance(data_path, str):
 31 |             list_data_dict = json.load(open(data_path, "r"))
 32 |         else:
 33 |             list_data_dict = data_path
 34 | 
 35 |         self.model_id = model_id
 36 |         self.processor = processor
 37 |         self.list_data_dict = list_data_dict
 38 |         self.data_args = data_args
 39 |         self.padding = padding
 40 |         self.image_min_pixel = data_args.image_min_pixels
 41 |         self.image_max_pixel = data_args.image_max_pixels
 42 |         self.video_min_pixel = data_args.video_min_pixels
 43 |         self.video_max_pixel = data_args.video_max_pixels
 44 |         self.image_resized_w = data_args.image_resized_width
 45 |         self.image_resized_h = data_args.image_resized_height
 46 |         self.video_resized_w = data_args.video_resized_width
 47 |         self.video_resized_h = data_args.video_resized_height
 48 |         self.fps = data_args.fps
 49 |         self.nframes = data_args.nframes
 50 | 
 51 |         if "Qwen3" in self.model_id:
 52 |             self.image_patch_size = 16
 53 |             self.return_video_metadata = True
 54 |         else:
 55 |             self.image_patch_size = 14
 56 |             self.return_video_metadata = False
 57 | 
 58 |         self.processor.image_processor.do_resize = False
 59 | 
 60 |     def __len__(self):
 61 |         return len(self.list_data_dict)
 62 |     
 63 |     def __getitem__(self, i) -> Dict[str, torch.Tensor]:
 64 |         sources = self.list_data_dict[i]
 65 | 
 66 |         is_video = False
 67 | 
 68 |         if "image" in sources:
 69 |             videos = None
 70 |             
 71 |             image_files = sources["image"]
 72 |             image_folder = self.data_args.image_folder
 73 | 
 74 |             if isinstance(image_files, str):
 75 |                 image_files = [image_files]
 76 | 
 77 |             images = []
 78 |             
 79 |             for image_file in image_files:
 80 |                 if not os.path.exists(image_file):
 81 |                     if not image_file.startswith("http"):
 82 |                         image_file = os.path.join(image_folder, image_file)
 83 |                 image_input = get_image_info(
 84 |                         image_file, 
 85 |                         self.image_min_pixel, 
 86 |                         self.image_max_pixel, 
 87 |                         self.image_resized_w, 
 88 |                         self.image_resized_h, 
 89 |                         self.image_patch_size
 90 |                     )
 91 |                 images.append(image_input)
 92 |         elif "video" in sources:
 93 |             is_video = True
 94 |             images=None
 95 | 
 96 |             video_files = sources["video"]
 97 |             video_folder = self.data_args.image_folder
 98 | 
 99 |             if isinstance(video_files, str):
100 |                 video_files = [video_files]
101 | 
102 |             videos = []
103 |             for video_file in video_files:
104 |                 if not os.path.exists(video_file):
105 |                     if not video_file.startswith("http"):
106 |                         video_file = os.path.join(video_folder, video_file)
107 |                 video_input, video_kwargs = get_video_info(
108 |                     video_file, 
109 |                     self.video_min_pixel, 
110 |                     self.video_max_pixel, 
111 |                     self.video_resized_w, 
112 |                     self.video_resized_h, 
113 |                     self.data_args.fps,
114 |                     self.image_patch_size,
115 |                     return_video_metadata=self.return_video_metadata
116 |                 )
117 |                 videos.append(video_input)
118 |         else:
119 |             images=None
120 |             videos=None
121 | 
122 |         conversations = copy.deepcopy(llava_to_openai(sources['conversations'], is_video=is_video))
123 | 
124 |         user_input = conversations[0]
125 |         gpt_response = conversations[1]
126 | 
127 |         system_message = f"{DEFAULT_IM_START_TOKEN}system\n{SYSTEM_MESSAGE}{DEFAULT_IM_END_TOKEN}\n"
128 |         user_message = f"{DEFAULT_IM_START_TOKEN}{user_input['role']}\n{user_input['content']}{DEFAULT_IM_END_TOKEN}\n{DEFAULT_IM_START_TOKEN}{gpt_response['role']}\n"
129 | 
130 |         user_prompt = system_message + user_message
131 |         assistant_prompt = gpt_response['content']
132 | 
133 |         data_dict = dict(
134 |             prompt=user_prompt,
135 |             assistant=assistant_prompt,
136 |             images=images,
137 |             videos=videos,
138 |             video_kwargs=video_kwargs if is_video else None,
139 |         )
140 | 
141 |         return data_dict
142 |     
def make_grpo_data_module(model_id, processor, data_args):
    """Build the GRPO training dataset and return it in a dict.

    The dataset is read from ``data_args.data_path``. No evaluation dataset
    is created, so ``eval_dataset`` is always ``None``.
    """
    train_dataset = GRPODataset(
        data_path=data_args.data_path,
        processor=processor,
        data_args=data_args,
        model_id=model_id,
    )
    return {"train_dataset": train_dataset, "eval_dataset": None}


--------------------------------------------------------------------------------
/environment.yaml:
--------------------------------------------------------------------------------
  1 | name: train
  2 | channels:
  3 |   - conda-forge
  4 | dependencies:
  5 |   - _libgcc_mutex=0.1=conda_forge
  6 |   - _openmp_mutex=4.5=2_gnu
  7 |   - asttokens=3.0.0=pyhd8ed1ab_1
  8 |   - bzip2=1.0.8=h4bc722e_7
  9 |   - ca-certificates=2025.4.26=hbd8a1cb_0
 10 |   - comm=0.2.2=pyhd8ed1ab_1
 11 |   - debugpy=1.8.14=py311hfdbb021_0
 12 |   - decorator=5.2.1=pyhd8ed1ab_0
 13 |   - exceptiongroup=1.2.2=pyhd8ed1ab_1
 14 |   - executing=2.2.0=pyhd8ed1ab_0
 15 |   - importlib-metadata=8.6.1=pyha770c72_0
 16 |   - ipykernel=6.29.5=pyh3099207_0
 17 |   - ipython=9.2.0=pyhfb0248b_0
 18 |   - ipython_pygments_lexers=1.1.1=pyhd8ed1ab_0
 19 |   - jedi=0.19.2=pyhd8ed1ab_1
 20 |   - jupyter_client=8.6.3=pyhd8ed1ab_1
 21 |   - jupyter_core=5.7.2=pyh31011fe_1
 22 |   - keyutils=1.6.1=h166bdaf_0
 23 |   - krb5=1.21.3=h659f571_0
 24 |   - ld_impl_linux-64=2.43=h712a8e2_4
 25 |   - libedit=3.1.20250104=pl5321h7949ede_0
 26 |   - libexpat=2.7.0=h5888daf_0
 27 |   - libffi=3.4.6=h2dba641_1
 28 |   - libgcc=14.2.0=h767d61c_2
 29 |   - libgcc-ng=14.2.0=h69a702a_2
 30 |   - libgomp=14.2.0=h767d61c_2
 31 |   - liblzma=5.8.1=hb9d3cd8_0
 32 |   - libnsl=2.0.1=hd590300_0
 33 |   - libsodium=1.0.20=h4ab18f5_0
 34 |   - libsqlite=3.49.1=hee588c1_2
 35 |   - libstdcxx=14.2.0=h8f9b012_2
 36 |   - libstdcxx-ng=14.2.0=h4852527_2
 37 |   - libuuid=2.38.1=h0b41bf4_0
 38 |   - libxcrypt=4.4.36=hd590300_1
 39 |   - libzlib=1.3.1=hb9d3cd8_2
 40 |   - matplotlib-inline=0.1.7=pyhd8ed1ab_1
 41 |   - ncurses=6.5=h2d0b736_3
 42 |   - nest-asyncio=1.6.0=pyhd8ed1ab_1
 43 |   - openssl=3.5.0=h7b32b05_0
 44 |   - packaging=25.0=pyh29332c3_1
 45 |   - parso=0.8.4=pyhd8ed1ab_1
 46 |   - pexpect=4.9.0=pyhd8ed1ab_1
 47 |   - pickleshare=0.7.5=pyhd8ed1ab_1004
 48 |   - pip=25.1=pyh8b19718_0
 49 |   - platformdirs=4.3.7=pyh29332c3_0
 50 |   - prompt-toolkit=3.0.51=pyha770c72_0
 51 |   - psutil=7.0.0=py311h9ecbd09_0
 52 |   - ptyprocess=0.7.0=pyhd8ed1ab_1
 53 |   - pure_eval=0.2.3=pyhd8ed1ab_1
 54 |   - pygments=2.19.1=pyhd8ed1ab_0
 55 |   - python=3.11.12=h9e4cc4f_0_cpython
 56 |   - python-dateutil=2.9.0.post0=pyhff2d567_1
 57 |   - python_abi=3.11=7_cp311
 58 |   - pyzmq=26.4.0=py311h7deb3e3_0
 59 |   - readline=8.2=h8c095d6_2
 60 |   - setuptools=79.0.1=pyhff2d567_0
 61 |   - six=1.17.0=pyhd8ed1ab_0
 62 |   - stack_data=0.6.3=pyhd8ed1ab_1
 63 |   - tk=8.6.13=noxft_h4845f30_101
 64 |   - tornado=6.4.2=py311h9ecbd09_0
 65 |   - traitlets=5.14.3=pyhd8ed1ab_1
 66 |   - typing_extensions=4.13.2=pyh29332c3_0
 67 |   - wcwidth=0.2.13=pyhd8ed1ab_1
 68 |   - wheel=0.45.1=pyhd8ed1ab_1
 69 |   - zeromq=4.3.5=h3b0a872_7
 70 |   - zipp=3.21.0=pyhd8ed1ab_1
 71 |   - pip:
 72 |       - accelerate==1.10.1
 73 |       - aiohappyeyeballs==2.6.1
 74 |       - aiohttp==3.11.18
 75 |       - aiosignal==1.3.2
 76 |       - annotated-types==0.7.0
 77 |       - attrs==25.3.0
 78 |       - av==14.3.0
 79 |       - bitsandbytes==0.45.5
 80 |       - certifi==2025.4.26
 81 |       - charset-normalizer==3.4.1
 82 |       - click==8.1.8
 83 |       - contourpy==1.3.3
 84 |       - cycler==0.12.1
 85 |       - datasets==3.5.1
 86 |       - decord==0.6.0
 87 |       - deepspeed==0.17.5
 88 |       - dill==0.3.8
 89 |       - docker-pycreds==0.4.0
 90 |       - einops==0.8.1
 91 |       - filelock==3.13.1
 92 |       - fonttools==4.59.2
 93 |       - frozenlist==1.6.0
 94 |       - fsspec==2024.6.1
 95 |       - gitdb==4.0.12
 96 |       - gitpython==3.1.44
 97 |       - hf-xet==1.1.9
 98 |       - hjson==3.1.0
 99 |       - huggingface-hub==0.34.4
100 |       - idna==3.10
101 |       - ipywidgets==8.1.6
102 |       - jinja2==3.1.4
103 |       - joblib==1.5.2
104 |       - jupyterlab-widgets==3.0.14
105 |       - kiwisolver==1.4.9
106 |       - liger-kernel==0.6.4
107 |       - markdown-it-py==3.0.0
108 |       - markupsafe==2.1.5
109 |       - matplotlib==3.10.6
110 |       - mdurl==0.1.2
111 |       - mpmath==1.3.0
112 |       - msgpack==1.1.0
113 |       - multidict==6.4.3
114 |       - multiprocess==0.70.16
115 |       - networkx==3.3
116 |       - ninja==1.11.1.4
117 |       - numpy==2.1.2
118 |       - nvidia-cublas-cu12==12.8.4.1
119 |       - nvidia-cuda-cupti-cu12==12.8.90
120 |       - nvidia-cuda-nvrtc-cu12==12.8.93
121 |       - nvidia-cuda-runtime-cu12==12.8.90
122 |       - nvidia-cudnn-cu12==9.10.2.21
123 |       - nvidia-cufft-cu12==11.3.3.83
124 |       - nvidia-cufile-cu12==1.13.1.3
125 |       - nvidia-curand-cu12==10.3.9.90
126 |       - nvidia-cusolver-cu12==11.7.3.90
127 |       - nvidia-cusparse-cu12==12.5.8.93
128 |       - nvidia-cusparselt-cu12==0.7.1
129 |       - nvidia-ml-py==12.570.86
130 |       - nvidia-nccl-cu12==2.27.3
131 |       - nvidia-nvjitlink-cu12==12.8.93
132 |       - nvidia-nvtx-cu12==12.8.90
133 |       - opencv-python==4.11.0.86
134 |       - pandas==2.2.3
135 |       - peft==0.15.2
136 |       - pillow==11.3.0
137 |       - pillow-simd==9.5.0.post2
138 |       - propcache==0.3.1
139 |       - protobuf==6.30.2
140 |       - py-cpuinfo==9.0.0
141 |       - pyarrow==20.0.0
142 |       - pydantic==2.11.3
143 |       - pydantic-core==2.33.1
144 |       - pyparsing==3.2.3
145 |       - pytz==2025.2
146 |       - pyyaml==6.0.2
147 |       - regex==2024.11.6
148 |       - requests==2.32.3
149 |       - rich==14.0.0
150 |       - safetensors==0.6.2
151 |       - scikit-learn==1.7.2
152 |       - scipy==1.16.2
153 |       - sentry-sdk==2.27.0
154 |       - setproctitle==1.3.5
155 |       - smmap==5.0.2
156 |       - sympy==1.13.3
157 |       - tabulate==0.9.0
158 |       - tensorboardx==2.6.2.2
159 |       - threadpoolctl==3.6.0
160 |       - tokenizers==0.22.0
161 |       - torch==2.8.0
162 |       - torchaudio==2.8.0
163 |       - torchvision==0.23.0
164 |       - tqdm==4.67.1
165 |       - transformers==4.57.1
166 |       - triton==3.4.0
167 |       - trl==0.25.0
168 |       - typing-inspection==0.4.0
169 |       - tzdata==2025.2
170 |       - ujson==5.10.0
171 |       - urllib3==2.4.0
172 |       - wandb==0.19.10
173 |       - widgetsnbextension==4.0.14
174 |       - xxhash==3.5.0
175 |       - yarl==1.20.0
176 | 


--------------------------------------------------------------------------------
/src/serve/app.py:
--------------------------------------------------------------------------------
  1 | import argparse
  2 | from threading import Thread
  3 | import gradio as gr
  4 | from PIL import Image
  5 | from src.utils import load_pretrained_model, get_model_name_from_path, disable_torch_init
  6 | from transformers import TextIteratorStreamer
  7 | from functools import partial
  8 | import warnings
  9 | from qwen_vl_utils import process_vision_info
 10 | 
 11 | warnings.filterwarnings("ignore")
 12 | 
 13 | def is_video_file(filename):
 14 |     video_extensions = ['.mp4', '.avi', '.mkv', '.mov', '.wmv', '.flv', '.webm', '.mpeg']
 15 |     return any(filename.lower().endswith(ext) for ext in video_extensions)
 16 | 
 17 | def bot_streaming(message, history, generation_args):
 18 |     # Initialize variables
 19 |     images = []
 20 |     videos = []
 21 | 
 22 |     if message["files"]:
 23 |         for file_item in message["files"]:
 24 |             if isinstance(file_item, dict):
 25 |                 file_path = file_item["path"]
 26 |             else:
 27 |                 file_path = file_item
 28 |             if is_video_file(file_path):
 29 |                 videos.append(file_path)
 30 |             else:
 31 |                 images.append(file_path)
 32 | 
 33 |     conversation = []
 34 |     for user_turn, assistant_turn in history:
 35 |         user_content = []
 36 |         if isinstance(user_turn, tuple):
 37 |             file_paths = user_turn[0]
 38 |             user_text = user_turn[1]
 39 |             if not isinstance(file_paths, list):
 40 |                 file_paths = [file_paths]
 41 |             for file_path in file_paths:
 42 |                 if is_video_file(file_path):
 43 |                     user_content.append({"type": "video", "video": file_path, "fps":1.0})
 44 |                 else:
 45 |                     user_content.append({"type": "image", "image": file_path})
 46 |             if user_text:
 47 |                 user_content.append({"type": "text", "text": user_text})
 48 |         else:
 49 |             user_content.append({"type": "text", "text": user_turn})
 50 |         conversation.append({"role": "user", "content": user_content})
 51 | 
 52 |         if assistant_turn is not None:
 53 |             assistant_content = [{"type": "text", "text": assistant_turn}]
 54 |             conversation.append({"role": "assistant", "content": assistant_content})
 55 | 
 56 |     user_content = []
 57 |     for image in images:
 58 |         user_content.append({"type": "image", "image": image})
 59 |     for video in videos:
 60 |         user_content.append({"type": "video", "video": video, "fps":1.0})
 61 |     user_text = message['text']
 62 |     if user_text:
 63 |         user_content.append({"type": "text", "text": user_text})
 64 |     conversation.append({"role": "user", "content": user_content})
 65 | 
 66 |     prompt = processor.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
 67 |     image_inputs, video_inputs = process_vision_info(conversation)
 68 |     
 69 |     inputs = processor(text=[prompt], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt").to(device) 
 70 | 
 71 |     streamer = TextIteratorStreamer(processor.tokenizer, **{"skip_special_tokens": True, "skip_prompt": True, 'clean_up_tokenization_spaces':False,}) 
 72 |     generation_kwargs = dict(inputs, streamer=streamer, eos_token_id=processor.tokenizer.eos_token_id, **generation_args)
 73 | 
 74 |     thread = Thread(target=model.generate, kwargs=generation_kwargs)
 75 |     thread.start()
 76 | 
 77 |     buffer = ""
 78 |     for new_text in streamer:
 79 |         buffer += new_text
 80 |         yield buffer
 81 | 
 82 | def main(args):
 83 | 
 84 |     global processor, model, device
 85 | 
 86 |     device = args.device
 87 |     
 88 |     disable_torch_init()
 89 | 
 90 |     use_flash_attn = True
 91 |     
 92 |     model_name = get_model_name_from_path(args.model_path)
 93 |     
 94 |     if args.disable_flash_attention:
 95 |         use_flash_attn = False
 96 | 
 97 |     processor, model = load_pretrained_model(model_base = args.model_base, model_path = args.model_path, 
 98 |                                                 device_map=args.device, model_name=model_name, 
 99 |                                                 load_4bit=args.load_4bit, load_8bit=args.load_8bit,
100 |                                                 device=args.device, use_flash_attn=use_flash_attn
101 |     )
102 | 
103 |     chatbot = gr.Chatbot(scale=2)
104 |     chat_input = gr.MultimodalTextbox(interactive=True, file_types=["image", "video"], placeholder="Enter message or upload file...",
105 |                                   show_label=False)
106 |     
107 |     generation_args = {
108 |         "max_new_tokens": args.max_new_tokens,
109 |         "temperature": args.temperature,
110 |         "do_sample": True if args.temperature > 0 else False,
111 |         "repetition_penalty": args.repetition_penalty,
112 |     }
113 |     
114 |     bot_streaming_with_args = partial(bot_streaming, generation_args=generation_args)
115 | 
116 |     with gr.Blocks(fill_height=True) as demo:
117 |         gr.ChatInterface(
118 |             fn=bot_streaming_with_args,
119 |             title="Qwen2-VL-7B Instruct",
120 |             stop_btn="Stop Generation",
121 |             multimodal=True,
122 |             textbox=chat_input,
123 |             chatbot=chatbot,
124 |         )
125 | 
126 | 
127 |     demo.queue(api_open=False)
128 |     demo.launch(show_api=False, share=False, server_name='0.0.0.0')
129 | 
if __name__ == "__main__":
    # Command-line entry point: parse the serving options and start the demo.
    parser = argparse.ArgumentParser()
    # main() calls get_model_name_from_path(args.model_path), which fails on
    # None, so the path is mandatory; argparse now reports a missing flag
    # instead of the script crashing later with an AttributeError.
    parser.add_argument("--model-path", type=str, required=True)
    parser.add_argument("--model-base", type=str, default="Qwen/Qwen2-VL-7B-Instruct")
    parser.add_argument("--device", type=str, default="cuda")
    parser.add_argument("--load-8bit", action="store_true")
    parser.add_argument("--load-4bit", action="store_true")
    parser.add_argument("--disable_flash_attention", action="store_true")
    # A temperature of 0 makes main() turn sampling off.
    parser.add_argument("--temperature", type=float, default=0)
    parser.add_argument("--repetition-penalty", type=float, default=1.0)
    parser.add_argument("--max-new-tokens", type=int, default=1024)
    # NOTE(review): --debug is parsed but not read anywhere in this file.
    parser.add_argument("--debug", action="store_true")
    args = parser.parse_args()
    main(args)


--------------------------------------------------------------------------------
/src/utils.py:
--------------------------------------------------------------------------------
  1 | from pathlib import Path
  2 | from peft import PeftModel
  3 | import torch
  4 | from transformers import (
  5 |     BitsAndBytesConfig, 
  6 |     Qwen2VLForConditionalGeneration, 
  7 |     AutoProcessor, 
  8 |     AutoConfig, 
  9 |     Qwen2_5_VLForConditionalGeneration,
 10 |     Qwen3VLForConditionalGeneration,
 11 |     Qwen3VLMoeForConditionalGeneration
 12 | )
 13 | import warnings
 14 | import os
 15 | import json
 16 | import importlib
 17 | import inspect
 18 | from types import ModuleType
 19 | from typing import Callable, List
 20 | 
 21 | def disable_torch_init():
 22 |     """
 23 |     Disable the redundant torch default initialization to accelerate model creation.
 24 |     """
 25 |     setattr(torch.nn.Linear, "reset_parameters", lambda self: None)
 26 |     setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None)
 27 | 
 28 | # This code is borrowed from LLaVA
 29 | def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, load_4bit=False, 
 30 |                           device_map="auto", device="cuda", use_flash_attn=False, **kwargs):
 31 |     kwargs = {"device_map": device_map}
 32 |     
 33 |     if device != "cuda":
 34 |         kwargs['device_map'] = {"":device}
 35 |     
 36 |     if load_8bit:
 37 |         kwargs['load_in_8bit'] = True
 38 |     elif load_4bit:
 39 |         kwargs['quantization_config'] = BitsAndBytesConfig(
 40 |             load_in_4bit=True,
 41 |             bnb_4bit_compute_dtype=torch.float16,
 42 |             bnb_4bit_use_double_quant=True,
 43 |             bnb_4bit_quant_type='nf4'
 44 |         )
 45 |     else:
 46 |         kwargs['torch_dtype'] = torch.float16
 47 | 
 48 |     if use_flash_attn:
 49 |         kwargs['_attn_implementation'] = 'flash_attention_2'
 50 | 
 51 |     if is_lora_model(model_path) and model_base is None:
 52 |         warnings.warn('There is `lora` in model name but no `model_base` is provided. If you are loading a LoRA model, please provide the `model_base` argument.')
 53 |     if is_lora_model(model_path) and model_base is not None:
 54 |         lora_cfg_pretrained = AutoConfig.from_pretrained(model_path)
 55 |         if hasattr(lora_cfg_pretrained, 'quantization_config'):
 56 |             del lora_cfg_pretrained.quantization_config
 57 |         processor = AutoProcessor.from_pretrained(model_base)
 58 |         print('Loading Qwen2-VL from base model...')
 59 |         if lora_cfg_pretrained.model_type == "qwen3_vl_moe":
 60 |             model = Qwen3VLMoeForConditionalGeneration.from_pretrained(model_base, low_cpu_mem_usage=True, config=lora_cfg_pretrained, **kwargs)
 61 |         elif lora_cfg_pretrained.model_type == "qwen3_vl":
 62 |             model = Qwen3VLForConditionalGeneration.from_pretrained(model_base, low_cpu_mem_usage=True, config=lora_cfg_pretrained, **kwargs)
 63 |         elif lora_cfg_pretrained.model_type == "qwen2_5_vl":
 64 |             model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_base, low_cpu_mem_usage=True, config=lora_cfg_pretrained, **kwargs)
 65 |         else:
 66 |             model = Qwen2VLForConditionalGeneration.from_pretrained(model_base, low_cpu_mem_usage=True, config=lora_cfg_pretrained, **kwargs)
 67 |             
 68 |         token_num, tokem_dim = model.lm_head.out_features, model.lm_head.in_features
 69 |         if model.lm_head.weight.shape[0] != token_num:
 70 |             model.lm_head.weight = torch.nn.Parameter(torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype))
 71 |             model.model.embed_tokens.weight = torch.nn.Parameter(torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype))
 72 | 
 73 |         print('Loading additional Qwen2-VL weights...')
 74 |         non_lora_trainables = torch.load(os.path.join(model_path, 'non_lora_state_dict.bin'), map_location='cpu')
 75 |         non_lora_trainables = {(k[11:] if k.startswith('base_model.') else k): v for k, v in non_lora_trainables.items()}
 76 |         if any(k.startswith('model.model.') for k in non_lora_trainables):
 77 |             non_lora_trainables = {(k[6:] if k.startswith('model.') else k): v for k, v in non_lora_trainables.items()}
 78 |         model.load_state_dict(non_lora_trainables, strict=False)
 79 |     
 80 |         print('Loading LoRA weights...')
 81 |         model = PeftModel.from_pretrained(model, model_path)
 82 | 
 83 |         print('Merging LoRA weights...')
 84 |         model = model.merge_and_unload()
 85 | 
 86 |         print('Model Loaded!!!')
 87 | 
 88 |     else:
 89 |         print(f"Loading model from {model_path} as a standard model. Adapter files were not found, so it can't be merged")
 90 |         config_path = Path(model_path) / 'config.json'
 91 |         with open(config_path, 'r') as f:
 92 |             config = json.load(f)
 93 | 
 94 |         processor = AutoProcessor.from_pretrained(model_path)
 95 |         
 96 |         architecture = config.get("model_type", [None])[0]
 97 |         if "qwen3_vl_moe" in architecture:
 98 |             model = Qwen3VLMoeForConditionalGeneration.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
 99 |         elif "qwen3_vl" in architecture:
100 |             model = Qwen3VLForConditionalGeneration.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
101 |         elif "qwen2_5_vl" in architecture:
102 |             model = Qwen2_5_VLForConditionalGeneration.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
103 |         else:
104 |             model = Qwen2VLForConditionalGeneration.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
105 | 
106 |     return processor, model
107 | 
108 | def is_lora_model(model_path: str | Path) -> bool:
109 |     """
110 |     Check if a model directory contains LoRA adapter files.
111 |     
112 |     Args:
113 |         model_path: Path to the model directory
114 |         
115 |     Returns:
116 |         bool: True if the directory contains LoRA adapter files
117 |     """
118 |     model_dir = Path(model_path)
119 |     return (model_dir / 'adapter_config.json').exists() and (model_dir / 'adapter_model.safetensors').exists()
120 | 
def get_model_name_from_path(model_path):
    """Derive a model name from the last component(s) of ``model_path``.

    Leading and trailing slashes are ignored. When the final component starts
    with 'checkpoint-', the parent directory name is prepended, joined by '_'.
    """
    parts = model_path.strip("/").split("/")
    last = parts[-1]
    if not last.startswith('checkpoint-'):
        return last
    return parts[-2] + "_" + last
128 |     
def load_reward_funcs(
    module_path: str = "train.reward_funcs",
    *,
    name_pred = lambda n: n.endswith("_reward"),
    obj_pred  = lambda o: callable(o),
    keep_order: bool = True
) -> List[Callable]:
    """Import a module and collect its reward functions.

    Args:
        module_path: Dotted path of the module to import.
        name_pred: Predicate on the member name; members failing it are skipped.
        obj_pred: Predicate on the member object, given to ``inspect.getmembers``.
        keep_order: When True, order the result by each object's first source
            line; otherwise keep the name-sorted order of ``inspect.getmembers``.

    Returns:
        The selected member objects.
    """
    module: ModuleType = importlib.import_module(module_path)

    selected = [
        obj
        for name, obj in inspect.getmembers(module, predicate=obj_pred)
        if name_pred(name)
    ]

    if keep_order:
        # getsourcelines returns (lines, starting_line_number).
        selected.sort(key=lambda obj: inspect.getsourcelines(obj)[1])

    return selected


--------------------------------------------------------------------------------
/src/train/monkey_patch_vision.py:
--------------------------------------------------------------------------------
  1 | from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
  2 |     Qwen2_5_VisionPatchEmbed,
  3 |     Qwen2_5_VisionRotaryEmbedding,
  4 |     Qwen2_5_VLVisionBlock,
  5 |     Qwen2_5_VLPatchMerger,
  6 |     Qwen2_5_VLPreTrainedModel
  7 | )
  8 | from transformers.models.qwen2_5_vl.configuration_qwen2_5_vl import Qwen2_5_VLVisionConfig
  9 | import torch
 10 | import torch.nn as nn
 11 | import torch.nn.functional as F
 12 | import numpy as np
 13 | import transformers.models.qwen2_5_vl.modeling_qwen2_5_vl
 14 | 
 15 | def replace_qwen2_5_vision():
 16 |     transformers.models.qwen2_5_vl.modeling_qwen2_5_vl.Qwen2_5_VisionTransformerPretrainedModel = Qwen2_5_VisionTransformerPretrainedModelWithPatchedWindow
 17 | 
class Qwen2_5_VisionTransformerPretrainedModelWithPatchedWindow(Qwen2_5_VLPreTrainedModel):
    """Qwen2.5-VL vision transformer with a re-implemented windowing path.

    Installed in place of ``Qwen2_5_VisionTransformerPretrainedModel`` by
    ``replace_qwen2_5_vision``. The forward pass reorders patch groups into
    attention windows, runs the vision blocks, merges patches and finally
    restores the original group order.
    """
    config: Qwen2_5_VLVisionConfig
    _no_split_modules = ["Qwen2_5_VLVisionBlock"]

    def __init__(self, config, *inputs, **kwargs) -> None:
        """Build the patch embedding, rotary embedding, vision blocks and patch merger from ``config``."""
        super().__init__(config, *inputs, **kwargs)
        self.spatial_merge_size = config.spatial_merge_size
        self.patch_size = config.patch_size
        self.fullatt_block_indexes = config.fullatt_block_indexes
        self.window_size = config.window_size
        # Number of patches merged into one output token (merge_size x merge_size).
        self.spatial_merge_unit = self.spatial_merge_size * self.spatial_merge_size

        self.patch_embed = Qwen2_5_VisionPatchEmbed(
            patch_size=config.patch_size,
            temporal_patch_size=config.temporal_patch_size,
            in_channels=config.in_channels,
            embed_dim=config.hidden_size,
        )

        head_dim = config.hidden_size // config.num_heads
        self.rotary_pos_emb = Qwen2_5_VisionRotaryEmbedding(head_dim // 2)

        self.blocks = nn.ModuleList(
            [Qwen2_5_VLVisionBlock(config, config._attn_implementation) for _ in range(config.depth)]
        )
        self.merger = Qwen2_5_VLPatchMerger(
            dim=config.out_hidden_size,
            context_dim=config.hidden_size,
            spatial_merge_size=config.spatial_merge_size,
        )
        self.gradient_checkpointing = False

    def rot_pos_emb(self, grid_thw):
        """Return the rotary embedding rows for every patch described by ``grid_thw``.

        ``grid_thw`` is iterated as rows of ``(t, h, w)``. For each row the
        (row, column) position of every patch is computed, rearranged so the
        patches of one ``spatial_merge_size`` x ``spatial_merge_size`` group are
        adjacent, and repeated ``t`` times.
        """
        pos_ids = []
        for t, h, w in grid_thw:
            # Row index of every patch, regrouped into merge-size blocks.
            hpos_ids = torch.arange(h).unsqueeze(1).expand(-1, w)
            hpos_ids = hpos_ids.reshape(
                h // self.spatial_merge_size,
                self.spatial_merge_size,
                w // self.spatial_merge_size,
                self.spatial_merge_size,
            )
            hpos_ids = hpos_ids.permute(0, 2, 1, 3)
            hpos_ids = hpos_ids.flatten()

            # Column index of every patch, regrouped the same way.
            wpos_ids = torch.arange(w).unsqueeze(0).expand(h, -1)
            wpos_ids = wpos_ids.reshape(
                h // self.spatial_merge_size,
                self.spatial_merge_size,
                w // self.spatial_merge_size,
                self.spatial_merge_size,
            )
            wpos_ids = wpos_ids.permute(0, 2, 1, 3)
            wpos_ids = wpos_ids.flatten()
            pos_ids.append(torch.stack([hpos_ids, wpos_ids], dim=-1).repeat(t, 1))
        pos_ids = torch.cat(pos_ids, dim=0)
        # The rotary table only needs to cover the largest height/width present.
        max_grid_size = grid_thw[:, 1:].max()
        rotary_pos_emb_full = self.rotary_pos_emb(max_grid_size)
        rotary_pos_emb = rotary_pos_emb_full[pos_ids].flatten(1)
        return rotary_pos_emb

    def get_window_index(self, grid_thw):
        """Compute the window ordering of merged-patch groups.

        Returns:
            A tuple ``(window_index, cu_window_seqlens)``: a 1-D tensor listing
            group indices window by window, and a Python list of cumulative
            window lengths counted in patches (it starts with 0).
        """
        window_index: list = []
        cu_window_seqlens: list = [0]
        window_index_id = 0
        # Window edge length measured in merged-patch groups.
        vit_merger_window_size = self.window_size // self.spatial_merge_size // self.patch_size

        for grid_t, grid_h, grid_w in grid_thw:
            llm_grid_h, llm_grid_w = (
                grid_h // self.spatial_merge_size,
                grid_w // self.spatial_merge_size,
            )
            # NOTE(review): arange is called without a device argument, so this
            # index tensor is created on torch's default device — confirm intent.
            index = torch.arange(grid_t * llm_grid_h * llm_grid_w).reshape(grid_t, llm_grid_h, llm_grid_w)

            # Pad height/width up to a multiple of the window size.
            pad_h = (vit_merger_window_size - llm_grid_h % vit_merger_window_size) % vit_merger_window_size
            pad_w = (vit_merger_window_size - llm_grid_w % vit_merger_window_size) % vit_merger_window_size

            num_windows_h = (llm_grid_h + pad_h) // vit_merger_window_size
            num_windows_w = (llm_grid_w + pad_w) // vit_merger_window_size

            # -100 marks padded slots so they can be filtered out below.
            index_padded = F.pad(index, (0, pad_w, 0, pad_h), "constant", -100)
            index_padded = index_padded.reshape(
                grid_t,
                num_windows_h,
                vit_merger_window_size,
                num_windows_w,
                vit_merger_window_size,
            )
            index_padded = index_padded.permute(0, 1, 3, 2, 4).reshape(
                grid_t,
                num_windows_h * num_windows_w,
                vit_merger_window_size,
                vit_merger_window_size,
            )
            # Number of real (non-padded) groups in each window.
            seqlens = (index_padded != -100).sum([2, 3]).reshape(-1)
            index_padded = index_padded.reshape(-1)
            index_new = index_padded[index_padded != -100]
            # Offset by the number of groups contributed by earlier grid rows.
            window_index.append(index_new + window_index_id)
            # Convert group counts to patch counts and continue the running total.
            cu_seqlens_tmp = seqlens.cumsum(0) * self.spatial_merge_unit + cu_window_seqlens[-1]
            cu_window_seqlens.extend(cu_seqlens_tmp.tolist())
            window_index_id += (grid_t * llm_grid_h * llm_grid_w).item()
        window_index = torch.cat(window_index, dim=0)

        return window_index, cu_window_seqlens


    def forward(self, hidden_states: torch.Tensor, grid_thw: torch.Tensor) -> torch.Tensor:
        """Encode patches and return the merged features in their original group order.

        Args:
            hidden_states: Input passed to the patch embedding; its output is
                unpacked as ``(seq_len, dim)``.
            grid_thw: Tensor whose rows are ``(t, h, w)`` grid sizes.

        Returns:
            The output of the patch merger, re-indexed to undo the window ordering.
        """
        hidden_states = self.patch_embed(hidden_states)
        seq_len, dim = hidden_states.size()

        rotary_pos_emb = self.rot_pos_emb(grid_thw)

        window_index, cu_window_seqlens_list = self.get_window_index(grid_thw)

        cu_window_seqlens = torch.tensor(
            cu_window_seqlens_list,
            device=hidden_states.device,
            dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
        )
        # Collapse repeated consecutive boundaries.
        cu_window_seqlens = torch.unique_consecutive(cu_window_seqlens)

        group = self.spatial_merge_unit
        G = seq_len // group

        # View the sequence as G groups of `group` patches so whole groups can be reordered.
        hidden_states = hidden_states.view(G, group, dim)
        rotary_pos_emb = rotary_pos_emb.view(G, group, -1)

        window_index_dev = window_index.to(hidden_states.device, non_blocking=True)

        # Apply the window ordering to both the features and their rotary embeddings.
        hidden_states = hidden_states.index_select(0, window_index_dev).reshape(seq_len, dim)
        rotary_pos_emb = rotary_pos_emb.index_select(0, window_index_dev).reshape(seq_len, -1)

        emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1)
        position_embeddings = (emb.cos(), emb.sin())

        # Cumulative boundaries with one segment of h*w patches per temporal step.
        cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum(
            dim=0,
            dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32,
        )
        cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0)

        if cu_seqlens.device != hidden_states.device:
             cu_seqlens = cu_seqlens.to(hidden_states.device, non_blocking=True)
             
        for layer_num, blk in enumerate(self.blocks):
            # Blocks listed in fullatt_block_indexes use the per-frame boundaries;
            # all other blocks use the window boundaries.
            if layer_num in self.fullatt_block_indexes:
                cu_seqlens_now = cu_seqlens
            else:
                cu_seqlens_now = cu_window_seqlens

            if self.gradient_checkpointing and self.training:
                hidden_states = self._gradient_checkpointing_func(
                    blk.__call__, hidden_states, cu_seqlens_now, None, position_embeddings
                )
            else:
                hidden_states = blk(hidden_states, cu_seqlens=cu_seqlens_now, position_embeddings=position_embeddings)

        hidden_states = self.merger(hidden_states)

        # Build the inverse permutation of window_index and undo the reordering.
        reverse_indices = torch.empty_like(window_index_dev)
        reverse_indices.scatter_(0, window_index_dev, torch.arange(window_index_dev.numel(), dtype=torch.long, device=window_index_dev.device))
        hidden_states = hidden_states.index_select(0, reverse_indices)

        return hidden_states


--------------------------------------------------------------------------------
/src/dataset/cls_dataset.py:
--------------------------------------------------------------------------------
  1 | import copy
  2 | import os
  3 | from typing import Dict
  4 | import torch
  5 | import transformers
  6 | import ujson as json
  7 | from torch.utils.data import Dataset
  8 | from qwen_vl_utils import process_vision_info
  9 | 
 10 | from src.params import DataArguments
 11 | from src.constants import (
 12 |     SYSTEM_MESSAGE,
 13 | )
 14 | 
 15 | from .data_utils import pad_sequence, samples_per_class_from_ids
 16 | 
# Maps the string stored under each sample's "label" key to its integer class id.
# Edit this table to match the label names of your own dataset.
CLASS_2_ID = {
    "A": 0,
    "B": 1
}

# Fallback prompt text used for samples that have no "prompt" key.
USER_MESSAGE = """Enter your prompt here. This will be used when your data does not have a prompt field."""
 23 | 
 24 | def get_image_content(image_path, min_pixel, max_pixel, width, height):
 25 |     content = {
 26 |         "type": "image", 
 27 |         "image": image_path,
 28 |         "min_pixels": min_pixel,
 29 |         "max_pixels": max_pixel
 30 |     }
 31 | 
 32 |     if width is not None and height is not None:
 33 |         content["resized_width"] = width
 34 |         content["resized_height"] = height
 35 | 
 36 |     return content
 37 | 
 38 | def get_video_content(video_path, min_pixels, max_pixels, width, height, fps, nframes):
 39 |     content = {
 40 |         "type": "video", 
 41 |         "video": video_path,
 42 |         "min_pixels": min_pixels,
 43 |         "max_pixels": max_pixels,
 44 |     }
 45 | 
 46 |     if nframes is not None:
 47 |         content["nframes"] = nframes
 48 |     else:
 49 |         content["fps"] = fps
 50 | 
 51 |     if width is not None and height is not None:
 52 |         content["resized_width"] = width
 53 |         content["resized_height"] = height
 54 |     
 55 |     return content
 56 | 
 57 | class ClassificationDataset(Dataset):
 58 |     """Dataset for supervised fine-tuning."""
 59 | 
 60 |     def __init__(
 61 |         self,
 62 |         data_path: str | list,
 63 |         processor: transformers.ProcessorMixin,
 64 |         data_args: DataArguments,
 65 |         model_id,
 66 |         padding=True,
 67 |     ):
 68 |         super(ClassificationDataset, self).__init__()
 69 |         if isinstance(data_path, str):
 70 |             list_data_dict = json.load(open(data_path, "r"))
 71 |         else:
 72 |             list_data_dict = data_path
 73 | 
 74 |         self.compute_dtype = data_args.compute_dtype
 75 | 
 76 |         self.model_id = model_id
 77 |         self.processor = processor
 78 |         self.list_data_dict = list_data_dict
 79 |         self.data_args = data_args
 80 |         self.padding = padding
 81 |         self.image_min_pixel = data_args.image_min_pixels
 82 |         self.image_max_pixel = data_args.image_max_pixels
 83 |         self.video_min_pixel = data_args.video_min_pixels
 84 |         self.video_max_pixel = data_args.video_max_pixels
 85 |         self.image_resized_w = data_args.image_resized_width
 86 |         self.image_resized_h = data_args.image_resized_height
 87 |         self.video_resized_w = data_args.video_resized_width
 88 |         self.video_resized_h = data_args.video_resized_height
 89 |         self.fps = data_args.fps
 90 |         self.nframes = data_args.nframes
 91 | 
 92 |     def __len__(self):
 93 |         return len(self.list_data_dict)
 94 | 
 95 |     def __getitem__(self, i) -> Dict[str, torch.Tensor]:
 96 |         sources = self.list_data_dict[i]
 97 | 
 98 |         contents = []
 99 |         
100 |         if "image" in sources:
101 |             image_files = sources["image"]
102 |             image_folder = self.data_args.image_folder
103 | 
104 |             if isinstance(image_files, str):
105 |                 image_files = [image_files]
106 |             
107 |             for image_file in image_files:
108 |                 if not os.path.exists(image_file):
109 |                     if not image_file.startswith("http"):
110 |                         image_file = os.path.join(image_folder, image_file)
111 |                 contents.append(get_image_content(image_file, self.image_min_pixel, self.image_max_pixel, self.image_resized_w, self.image_resized_h))
112 | 
113 |         elif "video" in sources:
114 |             video_files = sources["video"]
115 |             video_folder = self.data_args.image_folder
116 | 
117 |             if isinstance(video_files, str):
118 |                 video_files = [video_files]
119 | 
120 |             frame_paths = []
121 |             for video_file in video_files:
122 |                 if not os.path.exists(video_file):
123 |                     if not video_file.startswith("http"):
124 |                         video_file = os.path.join(video_folder, video_file)
125 |                     frame_paths.append(video_file)
126 |             
127 |             contents.append(get_video_content(frame_paths, self.video_min_pixel, self.video_max_pixel, self.video_resized_w, self.video_resized_h, self.fps, self.nframes))
128 | 
129 |         if "prompt" in sources:
130 |             text_content = {"type": "text", "text": sources["prompt"]}
131 | 
132 |         else: 
133 |             text_content = {"type": "text", "text": USER_MESSAGE}
134 |         
135 |         contents.append(text_content)
136 | 
137 |         user_prompt = [{"role": "user", "content": contents}]
138 | 
139 |         if len(SYSTEM_MESSAGE) > 0:
140 |             system_message = {"role": "system", "content": SYSTEM_MESSAGE}
141 |             user_prompt.insert(0, system_message)
142 | 
143 |         text = self.processor.apply_chat_template(
144 |             user_prompt, tokenize=False, add_generation_prompt=True
145 |         )
146 | 
147 |         image_inputs, video_inputs, video_kwargs = process_vision_info(user_prompt, return_video_kwargs=True)
148 | 
149 |         data_dict = self.processor(
150 |             text=text,
151 |             images=image_inputs,
152 |             videos=video_inputs,
153 |             return_tensors="pt",
154 |             **video_kwargs
155 |         )
156 | 
157 |         labels = [torch.tensor(CLASS_2_ID[sources["label"]], dtype=torch.long)]
158 | 
159 |         # eos_token_id = processor.tokenizer.convert_tokens_to_ids(DEFAULT_IM_END_TOKEN)
160 |         # input_ids, labels = truncate_sequence(input_ids, labels, self.max_length, eos_token_id)
161 | 
162 |         attention_mask = (data_dict['input_ids'] > -1000000).to(torch.long)
163 | 
164 |         data_dict['labels'] = labels
165 |         data_dict['attention_mask'] = attention_mask
166 | 
167 |         for key, value in data_dict.items():  # cast data dtype for paligemma
168 |             if torch.is_tensor(value) and torch.is_floating_point(value):
169 |                 data_dict[key] = value.to(self.compute_dtype)
170 |         
171 |         return data_dict
172 | 
class DataCollatorForClassificationDataset(object):
    """Batch classification samples: pad the token ids and gather labels and vision inputs."""

    def __init__(self, pad_token_id: int, padding_side: str = "right"):
        self.padding_side = padding_side
        self.pad_token_id = pad_token_id

    def __call__(self, examples):
        """Merge a list of per-sample dicts into one batch dict.

        A sample contributes either its video tensors or its image tensors
        (video wins when both are present). The attention mask marks every
        position whose id differs from the pad token id.
        """
        token_seqs, class_ids = [], []
        image_pixels, image_grids = [], []
        video_pixels, video_grids = [], []
        grid_seconds = []

        for sample in examples:
            present = sample.keys()
            if "pixel_values_videos" in present:
                video_pixels.append(sample["pixel_values_videos"])
                video_grids.append(sample["video_grid_thw"])
            elif "pixel_values" in present:
                image_pixels.append(sample["pixel_values"])
                image_grids.append(sample["image_grid_thw"])

            token_seqs.append(sample["input_ids"].squeeze(0))
            class_ids.extend(sample["labels"])

            if "second_per_grid_ts" in present:
                grid_seconds.extend(sample["second_per_grid_ts"])

        input_ids = pad_sequence(
            token_seqs, padding_side=self.padding_side, padding_value=self.pad_token_id
        )
        labels = torch.tensor(class_ids, dtype=torch.long)

        batch = {
            'input_ids': input_ids,
            'labels': labels,
            'attention_mask': input_ids != self.pad_token_id,
        }

        # Vision tensors are concatenated along the first dimension.
        if image_pixels:
            batch["pixel_values"] = torch.cat(image_pixels, dim=0)
            batch["image_grid_thw"] = torch.cat(image_grids, dim=0)

        if video_pixels:
            batch["pixel_values_videos"] = torch.cat(video_pixels, dim=0)
            batch["video_grid_thw"] = torch.cat(video_grids, dim=0)

        if grid_seconds:
            batch["second_per_grid_ts"] = grid_seconds

        return batch
233 |     
def make_classification_data_module(model_id, processor, data_args):
    """Create the train/eval datasets, their collators and the per-class sample counts.

    The eval dataset and collator are built only when ``data_args.eval_path``
    is set; otherwise both are returned as None.

    Returns:
        A dict with the keys "train_dataset", "eval_dataset",
        "train_data_collator", "eval_data_collator" and "samples_per_class".
    """
    def build_collator():
        return DataCollatorForClassificationDataset(
            pad_token_id=processor.tokenizer.pad_token_id, padding_side="left"
        )

    train_ds = ClassificationDataset(
        data_path=data_args.data_path,
        processor=processor,
        data_args=data_args,
        model_id=model_id,
    )
    train_collator = build_collator()

    # Class ids of the training samples, used to count samples per class.
    train_class_ids = [CLASS_2_ID[sample["label"]] for sample in train_ds.list_data_dict]
    samples_per_class = samples_per_class_from_ids(
        train_class_ids, num_classes=len(CLASS_2_ID)
    )

    eval_ds = None
    eval_collator = None
    if data_args.eval_path is not None:
        # Work on a copy so the training arguments keep their own paths.
        eval_data_args = copy.deepcopy(data_args)
        eval_data_args.image_folder = data_args.eval_image_folder
        eval_data_args.data_path = data_args.eval_path
        eval_ds = ClassificationDataset(
            data_path=eval_data_args.data_path,
            processor=processor,
            data_args=eval_data_args,
            model_id=model_id,
        )
        eval_collator = build_collator()

    return {
        "train_dataset": train_ds,
        "eval_dataset": eval_ds,
        "train_data_collator": train_collator,
        "eval_data_collator": eval_collator,
        "samples_per_class": samples_per_class,
    }


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                              Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 | 1.  Definitions.
  8 | 
  9 |     "License" shall mean the terms and conditions for use, reproduction,
 10 |     and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |     "Licensor" shall mean the copyright owner or entity authorized by
 13 |     the copyright owner that is granting the License.
 14 | 
 15 |     "Legal Entity" shall mean the union of the acting entity and all
 16 |     other entities that control, are controlled by, or are under common
 17 |     control with that entity. For the purposes of this definition,
 18 |     "control" means (i) the power, direct or indirect, to cause the
 19 |     direction or management of such entity, whether by contract or
 20 |     otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |     outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |     "You" (or "Your") shall mean an individual or Legal Entity
 24 |     exercising permissions granted by this License.
 25 | 
 26 |     "Source" form shall mean the preferred form for making modifications,
 27 |     including but not limited to software source code, documentation
 28 |     source, and configuration files.
 29 | 
 30 |     "Object" form shall mean any form resulting from mechanical
 31 |     transformation or translation of a Source form, including but
 32 |     not limited to compiled object code, generated documentation,
 33 |     and conversions to other media types.
 34 | 
 35 |     "Work" shall mean the work of authorship, whether in Source or
 36 |     Object form, made available under the License, as indicated by a
 37 |     copyright notice that is included in or attached to the work
 38 |     (an example is provided in the Appendix below).
 39 | 
 40 |     "Derivative Works" shall mean any work, whether in Source or Object
 41 |     form, that is based on (or derived from) the Work and for which the
 42 |     editorial revisions, annotations, elaborations, or other modifications
 43 |     represent, as a whole, an original work of authorship. For the purposes
 44 |     of this License, Derivative Works shall not include works that remain
 45 |     separable from, or merely link (or bind by name) to the interfaces of,
 46 |     the Work and Derivative Works thereof.
 47 | 
 48 |     "Contribution" shall mean any work of authorship, including
 49 |     the original version of the Work and any modifications or additions
 50 |     to that Work or Derivative Works thereof, that is intentionally
 51 |     submitted to Licensor for inclusion in the Work by the copyright owner
 52 |     or by an individual or Legal Entity authorized to submit on behalf of
 53 |     the copyright owner. For the purposes of this definition, "submitted"
 54 |     means any form of electronic, verbal, or written communication sent
 55 |     to the Licensor or its representatives, including but not limited to
 56 |     communication on electronic mailing lists, source code control systems,
 57 |     and issue tracking systems that are managed by, or on behalf of, the
 58 |     Licensor for the purpose of discussing and improving the Work, but
 59 |     excluding communication that is conspicuously marked or otherwise
 60 |     designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |     "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |     on behalf of whom a Contribution has been received by Licensor and
 64 |     subsequently incorporated within the Work.
 65 | 
 66 | 2.  Grant of Copyright License. Subject to the terms and conditions of
 67 |     this License, each Contributor hereby grants to You a perpetual,
 68 |     worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |     copyright license to reproduce, prepare Derivative Works of,
 70 |     publicly display, publicly perform, sublicense, and distribute the
 71 |     Work and such Derivative Works in Source or Object form.
 72 | 
 73 | 3.  Grant of Patent License. Subject to the terms and conditions of
 74 |     this License, each Contributor hereby grants to You a perpetual,
 75 |     worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |     (except as stated in this section) patent license to make, have made,
 77 |     use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |     where such license applies only to those patent claims licensable
 79 |     by such Contributor that are necessarily infringed by their
 80 |     Contribution(s) alone or by combination of their Contribution(s)
 81 |     with the Work to which such Contribution(s) was submitted. If You
 82 |     institute patent litigation against any entity (including a
 83 |     cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |     or a Contribution incorporated within the Work constitutes direct
 85 |     or contributory patent infringement, then any patent licenses
 86 |     granted to You under this License for that Work shall terminate
 87 |     as of the date such litigation is filed.
 88 | 
 89 | 4.  Redistribution. You may reproduce and distribute copies of the
 90 |     Work or Derivative Works thereof in any medium, with or without
 91 |     modifications, and in Source or Object form, provided that You
 92 |     meet the following conditions:
 93 | 
 94 |     (a) You must give any other recipients of the Work or
 95 |     Derivative Works a copy of this License; and
 96 | 
 97 |     (b) You must cause any modified files to carry prominent notices
 98 |     stating that You changed the files; and
 99 | 
100 |     (c) You must retain, in the Source form of any Derivative Works
101 |     that You distribute, all copyright, patent, trademark, and
102 |     attribution notices from the Source form of the Work,
103 |     excluding those notices that do not pertain to any part of
104 |     the Derivative Works; and
105 | 
106 |     (d) If the Work includes a "NOTICE" text file as part of its
107 |     distribution, then any Derivative Works that You distribute must
108 |     include a readable copy of the attribution notices contained
109 |     within such NOTICE file, excluding those notices that do not
110 |     pertain to any part of the Derivative Works, in at least one
111 |     of the following places: within a NOTICE text file distributed
112 |     as part of the Derivative Works; within the Source form or
113 |     documentation, if provided along with the Derivative Works; or,
114 |     within a display generated by the Derivative Works, if and
115 |     wherever such third-party notices normally appear. The contents
116 |     of the NOTICE file are for informational purposes only and
117 |     do not modify the License. You may add Your own attribution
118 |     notices within Derivative Works that You distribute, alongside
119 |     or as an addendum to the NOTICE text from the Work, provided
120 |     that such additional attribution notices cannot be construed
121 |     as modifying the License.
122 | 
123 |     You may add Your own copyright statement to Your modifications and
124 |     may provide additional or different license terms and conditions
125 |     for use, reproduction, or distribution of Your modifications, or
126 |     for any such Derivative Works as a whole, provided Your use,
127 |     reproduction, and distribution of the Work otherwise complies with
128 |     the conditions stated in this License.
129 | 
130 | 5.  Submission of Contributions. Unless You explicitly state otherwise,
131 |     any Contribution intentionally submitted for inclusion in the Work
132 |     by You to the Licensor shall be under the terms and conditions of
133 |     this License, without any additional terms or conditions.
134 |     Notwithstanding the above, nothing herein shall supersede or modify
135 |     the terms of any separate license agreement you may have executed
136 |     with Licensor regarding such Contributions.
137 | 
138 | 6.  Trademarks. This License does not grant permission to use the trade
139 |     names, trademarks, service marks, or product names of the Licensor,
140 |     except as required for reasonable and customary use in describing the
141 |     origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 | 7.  Disclaimer of Warranty. Unless required by applicable law or
144 |     agreed to in writing, Licensor provides the Work (and each
145 |     Contributor provides its Contributions) on an "AS IS" BASIS,
146 |     WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |     implied, including, without limitation, any warranties or conditions
148 |     of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |     PARTICULAR PURPOSE. You are solely responsible for determining the
150 |     appropriateness of using or redistributing the Work and assume any
151 |     risks associated with Your exercise of permissions under this License.
152 | 
153 | 8.  Limitation of Liability. In no event and under no legal theory,
154 |     whether in tort (including negligence), contract, or otherwise,
155 |     unless required by applicable law (such as deliberate and grossly
156 |     negligent acts) or agreed to in writing, shall any Contributor be
157 |     liable to You for damages, including any direct, indirect, special,
158 |     incidental, or consequential damages of any character arising as a
159 |     result of this License or out of the use or inability to use the
160 |     Work (including but not limited to damages for loss of goodwill,
161 |     work stoppage, computer failure or malfunction, or any and all
162 |     other commercial damages or losses), even if such Contributor
163 |     has been advised of the possibility of such damages.
164 | 
165 | 9.  Accepting Warranty or Additional Liability. While redistributing
166 |     the Work or Derivative Works thereof, You may choose to offer,
167 |     and charge a fee for, acceptance of support, warranty, indemnity,
168 |     or other liability obligations and/or rights consistent with this
169 |     License. However, in accepting such obligations, You may act only
170 |     on Your own behalf and on Your sole responsibility, not on behalf
171 |     of any other Contributor, and only if You agree to indemnify,
172 |     defend, and hold each Contributor harmless for any liability
173 |     incurred by, or claims asserted against, such Contributor by reason
174 |     of your accepting any such warranty or additional liability.
175 | 
176 | END OF TERMS AND CONDITIONS
177 | 
178 | APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 | Copyright [yyyy] [name of copyright owner]
190 | 
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 | 


--------------------------------------------------------------------------------
/src/params.py:
--------------------------------------------------------------------------------
  1 | from dataclasses import dataclass, field
  2 | from typing import Optional
  3 | 
# Compatibility shim: make sure the `transformers.training_args` module exposes
# a `ParallelismConfig` attribute before the imports further down run.
# NOTE(review): presumably required by some installed transformers/trl version
# combination that looks this name up on the module — confirm which versions.
try:
    from accelerate.utils import ParallelismConfig as _PC
except Exception:
    # The import failed for any reason: fall back to an empty placeholder
    # class so the attribute can still be provided.
    class _PC:
        pass

import transformers.training_args as _ta
if not hasattr(_ta, "ParallelismConfig"):
    # Only fill the gap; an existing definition on the module is left alone.
    _ta.ParallelismConfig = _PC
 13 | 
 14 | from transformers import TrainingArguments as HFTrainingArguments
 15 | from trl import DPOConfig as DPOConfigTRL
 16 | from trl import GRPOConfig as GRPOConfigTRL
 17 | 
 18 | 
 19 | @dataclass
 20 | class ModelArguments:
 21 |     model_id: Optional[str] = field(default="Qwen/Qwen2-VL-7B-Instruct")
 22 | 
 23 | 
 24 | @dataclass
 25 | class CLSArguments(HFTrainingArguments):
 26 |     cache_dir: Optional[str] = field(default=None)
 27 |     optim: str = field(default="adamw_torch")
 28 |     adam_beta1: float = field(default=0.9)
 29 |     adam_beta2: float = field(default=0.999)
 30 |     adam_epsilon: float = field(default=1e-8)
 31 | 
 32 |     freeze_vision_tower: bool = field(default=False)
 33 |     freeze_llm: bool = field(default=False)
 34 |     freeze_merger: bool = field(default=False)
 35 |     disable_flash_attn2: bool = field(default=False)
 36 |     unfreeze_topk_llm: int = 0
 37 |     unfreeze_topk_vision: int = 0
 38 |     mlp_head_dim: Optional[int] = field(default=0)
 39 |     mlp_head_dropout: Optional[float] = field(default=0.0)
 40 |     
 41 |     loss_type : str = field(
 42 |         default="cross_entropy",
 43 |         metadata={"help": "Loss type to use. Should be one of `cross_entropy`, `focal_loss`, `class_balanced_cross_entropy`, or `class_balanced_focal_loss`."}
 44 |     )
 45 |     focal_alpha: Optional[str] = field(
 46 |         default=None,
 47 |         metadata={"help": "Focal Loss alpha value. If None use CrossEntropyLoss. ex '1.0,7.5'"}
 48 |     )
 49 |     focal_gamma: float = field(
 50 |         default=0.0,
 51 |         metadata={"help": "Focal Loss gamma value"}
 52 |     )
 53 |     num_labels: int = field(
 54 |         default=2,
 55 |         metadata={"help": "Number of labels for classification."}
 56 |     )
 57 |     class_balanced_beta: float = field(
 58 |         default=0.999,
 59 |         metadata={"help": "Beta value for Class Balanced Loss. If 0.0, use standard CrossEntropyLoss."}
 60 |     )
 61 |     early_stopping_patience: int = field(
 62 |         default=0,
 63 |         metadata={"help": "Number of epochs with no improvement after which training will be stopped."}
 64 |     )
 65 |     early_stopping_threshold: float = field(
 66 |         default=0.0,
 67 |         metadata={"help": "Minimum change in the monitored quantity to qualify as an improvement."}
 68 |     )
 69 | 
 70 |     max_seq_length: int = field(
 71 |         default=32768, # This is the default value of the qwen2-vl model
 72 |         metadata={
 73 |             "help":
 74 |                 "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
 75 |         },
 76 |     )
 77 |     double_quant: bool = field(
 78 |         default=True,
 79 |         metadata={"help": "Compress the quantization statistics through double quantization."}
 80 |     )
 81 |     quant_type: str = field(
 82 |         default="nf4",
 83 |         metadata={"help": "Quantization data type to use. Should be one of `fp4` or `nf4`."}
 84 |     )
 85 |     bits: int = field(
 86 |         default=16,
 87 |         metadata={"help": "How many bits to use."}
 88 |     )
 89 |     lora_enable: bool = False
 90 |     vision_lora: bool = False
 91 |     use_dora: bool = False
 92 |     lora_rank: int = 64
 93 |     lora_alpha: int = 16
 94 |     lora_dropout: float = 0.05
 95 |     lora_weight_path: str = ""
 96 |     lora_bias: str = "none"
 97 |     vision_lr: Optional[float] = None
 98 |     merger_lr: Optional[float] = None
 99 |     head_lr: Optional[float] = None
100 |     lora_namespan_exclude: str = field(default=None, metadata={"help": "List of namespan to exclude for LoRA"})
101 |     num_lora_modules: int = -1
102 |     use_liger_kernel: bool = True
103 | 
104 | 
@dataclass
class TrainingArguments(HFTrainingArguments):
    """Training arguments for supervised fine-tuning.

    Extends the Hugging Face ``TrainingArguments`` with optimizer defaults,
    module-freezing switches, quantization settings, LoRA settings and a
    generation length limit used for generation-based evaluation.
    """

    # --- Optimizer defaults ---
    cache_dir: Optional[str] = field(default=None)
    optim: str = field(default="adamw_torch")
    adam_beta1: float = field(default=0.9)
    adam_beta2: float = field(default=0.999)
    adam_epsilon: float = field(default=1e-8)

    # --- Which parts of the model are frozen / partially unfrozen ---
    freeze_vision_tower: bool = field(default=False)
    freeze_llm: bool = field(default=False)
    freeze_merger: bool = field(default=False)
    disable_flash_attn2: bool = field(default=False)
    unfreeze_topk_llm: int = 0
    unfreeze_topk_vision: int = 0

    max_seq_length: int = field(
        default=32768, # This is the default value of the qwen2-vl model
        metadata={
            "help":
                "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
        },
    )

    # --- Quantization ---
    double_quant: bool = field(
        default=True,
        metadata={"help": "Compress the quantization statistics through double quantization."}
    )
    quant_type: str = field(
        default="nf4",
        metadata={"help": "Quantization data type to use. Should be one of `fp4` or `nf4`."}
    )
    bits: int = field(
        default=16,
        metadata={"help": "How many bits to use."}
    )

    # --- LoRA and per-module learning rates ---
    lora_enable: bool = False
    vision_lora: bool = False
    use_dora: bool = False
    lora_rank: int = 64
    lora_alpha: int = 16
    lora_dropout: float = 0.05
    lora_weight_path: str = ""
    lora_bias: str = "none"
    vision_lr: Optional[float] = None
    merger_lr: Optional[float] = None
    # Annotated Optional because the default is None (unset).
    lora_namespan_exclude: Optional[str] = field(default=None, metadata={"help": "List of namespan to exclude for LoRA"})
    num_lora_modules: int = -1
    use_liger_kernel: bool = True

    # Generation-based evaluation settings
    generation_max_new_tokens: int = field(
        default=512,
        metadata={"help": "Maximum number of new tokens to generate during evaluation."}
    )
159 | 
@dataclass
class DPOArguments(DPOConfigTRL):
    """Training arguments for DPO fine-tuning.

    Extends TRL's ``DPOConfig`` with optimizer defaults, module-freezing
    switches, quantization settings, LoRA settings and DPO-specific options
    (``beta``, ``precompute_ref_log_probs``, ``dpo_loss``).
    """

    # --- Optimizer defaults ---
    cache_dir: Optional[str] = field(default=None)
    optim: str = field(default="adamw_torch")
    adam_beta1: float = field(default=0.9)
    adam_beta2: float = field(default=0.999)
    adam_epsilon: float = field(default=1e-8)

    # --- Which parts of the model are frozen / partially unfrozen ---
    freeze_vision_tower: bool = field(default=False)
    freeze_llm: bool = field(default=False)
    freeze_merger: bool = field(default=False)
    disable_flash_attn2: bool = field(default=False)
    unfreeze_topk_llm: int = 0
    unfreeze_topk_vision: int = 0

    max_seq_length: int = field(
        default=32768, # This is the default value of the qwen2-vl model
        metadata={
            "help":
                "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
        },
    )

    # --- Quantization ---
    double_quant: bool = field(
        default=True,
        metadata={"help": "Compress the quantization statistics through double quantization."}
    )
    quant_type: str = field(
        default="nf4",
        metadata={"help": "Quantization data type to use. Should be one of `fp4` or `nf4`."}
    )
    bits: int = field(
        default=16,
        metadata={"help": "How many bits to use."}
    )

    # --- LoRA and per-module learning rates ---
    lora_enable: bool = False
    vision_lora: bool = False
    use_dora: bool = False
    lora_rank: int = 64
    lora_alpha: int = 16
    lora_dropout: float = 0.05
    lora_weight_path: str = ""
    lora_bias: str = "none"
    vision_lr: Optional[float] = None
    merger_lr: Optional[float] = None
    # Annotated Optional because the default is None (unset).
    lora_namespan_exclude: Optional[str] = field(default=None, metadata={"help": "List of namespan to exclude for LoRA"})
    num_lora_modules: int = -1

    # --- DPO-specific options ---
    use_liger_loss: bool = True
    beta: float = field(
        default=0.1,
        metadata={"help": "The beta value for DPO."}
    )
    precompute_ref_log_probs: bool = field(
        default=False,
        metadata={"help": "Whether to precompute the reference log probabilities."}
    )
    dpo_loss: str = field(
        default="sigmoid",
        metadata={"help": "The type of DPO loss to use."}
    )
219 | 
@dataclass
class GRPOArguments(GRPOConfigTRL):
    """Training arguments for GRPO fine-tuning.

    Extends TRL's ``GRPOConfig`` with optimizer defaults, module-freezing
    switches, quantization settings, LoRA settings, the KL coefficient and
    sampling / length options for generated completions.
    """

    # --- Optimizer defaults ---
    cache_dir: Optional[str] = field(default=None)
    optim: str = field(default="adamw_torch")
    adam_beta1: float = field(default=0.9)
    adam_beta2: float = field(default=0.999)
    adam_epsilon: float = field(default=1e-8)

    # --- Which parts of the model are frozen / partially unfrozen ---
    freeze_vision_tower: bool = field(default=False)
    freeze_llm: bool = field(default=False)
    freeze_merger: bool = field(default=False)
    disable_flash_attn2: bool = field(default=False)
    unfreeze_topk_llm: int = 0
    unfreeze_topk_vision: int = 0

    # --- Quantization ---
    double_quant: bool = field(
        default=True,
        metadata={"help": "Compress the quantization statistics through double quantization."}
    )
    quant_type: str = field(
        default="nf4",
        metadata={"help": "Quantization data type to use. Should be one of `fp4` or `nf4`."}
    )
    bits: int = field(
        default=16,
        metadata={"help": "How many bits to use."}
    )

    # --- LoRA and per-module learning rates ---
    lora_enable: bool = False
    vision_lora: bool = False
    use_dora: bool = False
    lora_rank: int = 64
    lora_alpha: int = 16
    lora_dropout: float = 0.05
    lora_weight_path: str = ""
    lora_bias: str = "none"
    vision_lr: Optional[float] = None
    merger_lr: Optional[float] = None
    # Annotated Optional because the default is None (unset).
    lora_namespan_exclude: Optional[str] = field(default=None, metadata={"help": "List of namespan to exclude for LoRA"})
    num_lora_modules: int = -1

    # --- GRPO-specific options ---
    beta: float = field(
        default=0.04,
        metadata={
            "help": "KL coefficient. If `0.0`, the reference model is not loaded, reducing memory usage and improving "
            "training speed, but may be numerically unstable for long training runs."
        },
    )
    temperature: float = 0.9
    top_p: float = 1.0
    top_k: int = 50
    min_p: Optional[float] = None
    repetition_penalty: float = 1.0
    max_completion_length: int = 256
    max_prompt_length: int = 512
    use_liger_loss: bool = True
274 | 
275 | 
@dataclass
class DataArguments:
    """Arguments describing the training/evaluation data and media sizing.

    Every path and resize field defaults to None (unset), so those fields are
    annotated ``Optional[...]`` to match their defaults.
    """

    # --- Dataset locations ---
    data_path: Optional[str] = field(
        default=None, metadata={"help": "Path to the training data."}
    )
    eval_path: Optional[str] = field(
        default=None, metadata={"help": "Path to the evaluation data."}
    )
    eval_image_folder: Optional[str] = field(
        default=None, metadata={"help": "Path to the evaluation image data."}
    )
    lazy_preprocess: bool = False
    image_folder: Optional[str] = field(default=None)

    # --- Pixel budgets for images and videos ---
    image_min_pixels: Optional[int] = field(default=3136)
    image_max_pixels: Optional[int] = field(default=12845056)
    video_min_pixels: Optional[int] = field(default=100352)
    video_max_pixels: Optional[int] = field(default=602112)

    # --- Optional fixed resize dimensions (None leaves them unset) ---
    image_resized_width: Optional[int] = field(default=None)
    image_resized_height: Optional[int] = field(default=None)
    video_resized_width: Optional[int] = field(default=None)
    video_resized_height: Optional[int] = field(default=None)

    # --- Video frame sampling ---
    fps: Optional[int] = field(default=None, metadata={"help": "Frames per second for video data."})
    nframes: Optional[int] = field(default=None, metadata={"help": "Number of frames for video data."})


--------------------------------------------------------------------------------
/src/train/train_grpo.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import torch
  3 | from peft import LoraConfig
  4 | import ast
  5 | import pathlib
  6 | from transformers import (
  7 |     AutoProcessor, 
  8 |     AutoConfig,
  9 |     BitsAndBytesConfig, 
 10 |     Qwen2VLForConditionalGeneration, 
 11 |     HfArgumentParser, 
 12 |     Qwen2_5_VLForConditionalGeneration,
 13 |     Qwen3VLForConditionalGeneration,
 14 |     Qwen3VLMoeForConditionalGeneration
 15 | )
 16 | 
 17 | from src.trainer import QwenGRPOTrainer
 18 | from src.dataset import make_grpo_data_module
 19 | from src.params import DataArguments, ModelArguments, GRPOArguments
 20 | from train.train_utils import get_peft_state_maybe_zero_3, get_peft_state_non_lora_maybe_zero_3, safe_save_model_for_hf_trainer
 21 | from monkey_patch_forward import (
 22 |     replace_qwen2_5_with_mixed_modality_forward, 
 23 |     replace_qwen_2_with_mixed_modality_forward,
 24 |     replace_qwen3_with_mixed_modality_forward,
 25 |     replace_qwen3_vl_moe_with_mixed_modality_forward
 26 | )
 27 | from monkey_patch_vision import replace_qwen2_5_vision
 28 | from src.utils import  load_reward_funcs
 29 | 
 30 | local_rank = None
 31 | 
 32 | def rank0_print(*args):
 33 |     if local_rank == 0 or local_rank == '0' or local_rank is None:
 34 |         print(*args)
 35 | 
 36 | def find_target_linear_names(model, num_lora_modules=-1, lora_namespan_exclude=[], verbose=True):
 37 |     linear_cls = torch.nn.modules.Linear
 38 |     embedding_cls = torch.nn.modules.Embedding
 39 |     lora_module_names = []
 40 | 
 41 |     for name, module in model.named_modules():
 42 |         if any(ex_keyword in name for ex_keyword in lora_namespan_exclude):
 43 |             continue
 44 |         if isinstance(module, (linear_cls, embedding_cls)):
 45 |             lora_module_names.append(name)
 46 |     
 47 |     if num_lora_modules > 0:
 48 |         lora_module_names = lora_module_names[-num_lora_modules:]
 49 |     if verbose:
 50 |         rank0_print(f"Found {len(lora_module_names)} lora modules: {lora_module_names}")
 51 |     return lora_module_names
 52 | 
 53 | def set_requires_grad(parameters, requires_grad):
 54 |     for p in parameters:
 55 |         p.requires_grad = requires_grad
 56 | 
 57 | def configure_vision_tower(model, training_args, compute_dtype, device):
 58 |     vision_tower = model.visual
 59 |     vision_tower.to(dtype=compute_dtype, device=device)
 60 | 
 61 |     vision_model_params = model.visual.parameters()
 62 |     set_requires_grad(vision_model_params, not training_args.freeze_vision_tower)
 63 |     
 64 |     # Handle merger specifically
 65 |     merger_params = model.visual.merger.parameters()
 66 |     set_requires_grad(merger_params, not training_args.freeze_merger)
 67 | 
 68 |     if hasattr(model.visual, "deepstack_merger_list"):
 69 |         deepstack_merger_list_params = model.visual.deepstack_merger_list.parameters()
 70 |         set_requires_grad(deepstack_merger_list_params, not training_args.freeze_merger)
 71 | 
 72 | def configure_llm(model, training_args):
 73 |     lm_head = model.lm_head.parameters()
 74 |     set_requires_grad(lm_head, not training_args.freeze_llm)
 75 | 
 76 |     llm_params = model.language_model.parameters()
 77 |     set_requires_grad(llm_params, not training_args.freeze_llm)
 78 | 
 79 | def unfreeze_topk_layers(model, k_llm: int = 0, k_vis: int = 0):
 80 |     if k_llm and hasattr(model, "language_model") and hasattr(model.language_model, "layers"):
 81 |         for layer in model.language_model.layers[-k_llm:]:
 82 |             for p in layer.parameters():
 83 |                 p.requires_grad = True
 84 | 
 85 |     if k_vis and hasattr(model, "visual") and hasattr(model.visual, "blocks"):
 86 |         for blk in model.visual.blocks[-k_vis:]:
 87 |             for p in blk.parameters():
 88 |                 p.requires_grad = True
 89 | 
 90 | 
 91 | 
 92 | def train():
 93 |     global local_rank
 94 | 
 95 |     parser = HfArgumentParser(
 96 |         (ModelArguments, DataArguments, GRPOArguments))
 97 |     
 98 |     model_args, data_args, training_args = parser.parse_args_into_dataclasses()
 99 | 
100 |     if data_args.nframes is not None and data_args.fps is not None:
101 |         raise ValueError("You cannot set both `nframes` and `fps` at the same time. Please set only one of them.")
102 | 
103 |     if training_args.lora_enable and not training_args.freeze_llm:
104 |         raise ValueError("If `lora_enable` is True, `freeze_llm` must also be True.")
105 | 
106 |     if not training_args.lora_enable:
107 |         assert not training_args.vision_lora, \
108 |             "Error: training_args.lora_enable is not enabled, but training_args.vision_lora is enabled."
109 |         
110 |     if training_args.vision_lora and not training_args.freeze_vision_tower:
111 |         raise ValueError("If `vision_lora` is True, `freeze_vision_tower` must also be True.")
112 | 
113 |     else:
114 |         if training_args.lora_namespan_exclude is not None:
115 |             training_args.lora_namespan_exclude = ast.literal_eval(training_args.lora_namespan_exclude)
116 |         else:
117 |             training_args.lora_namespan_exclude = []
118 | 
119 |         if not training_args.vision_lora:
120 |             training_args.lora_namespan_exclude += ["visual"]
121 | 
122 |     local_rank = training_args.local_rank
123 |     compute_dtype = (torch.float16 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32))
124 | 
125 |     bnb_model_from_pretrained_args = {}
126 |     if training_args.bits in [4,8]:
127 |         bnb_model_from_pretrained_args.update(dict(
128 |             device_map={"":training_args.device},
129 |             quantization_config = BitsAndBytesConfig(
130 |                 load_in_4bit=training_args.bits==4,
131 |                 load_in_8bit=training_args.bits==8,
132 |                 llm_int8_skip_modules=["visual"],
133 |                 llm_int8_threshold=6.0,
134 |                 llm_int8_has_fp16_weight=False,
135 |                 bnb_4bit_compute_dtype=compute_dtype,
136 |                 bnb_4bit_use_double_quant=training_args.double_quant,
137 |                 bnb_4bit_quant_type=training_args.quant_type,
138 |             )
139 |         ))
140 | 
141 |     config = AutoConfig.from_pretrained(model_args.model_id)
142 | 
143 |     if config.model_type == "qwen3_vl_moe":
144 |         replace_qwen3_vl_moe_with_mixed_modality_forward()
145 |         model = Qwen3VLMoeForConditionalGeneration.from_pretrained(
146 |             model_args.model_id,
147 |             dtype=compute_dtype,
148 |             attn_implementation="flash_attention_2" if not training_args.disable_flash_attn2 else "sdpa",
149 |             **bnb_model_from_pretrained_args
150 |         )
151 | 
152 |     elif config.model_type == "qwen3_vl":
153 |         replace_qwen3_with_mixed_modality_forward()
154 |         model = Qwen3VLForConditionalGeneration.from_pretrained(
155 |             model_args.model_id,
156 |             dtype=compute_dtype,
157 |             attn_implementation="flash_attention_2" if not training_args.disable_flash_attn2 else "sdpa",
158 |             **bnb_model_from_pretrained_args
159 |         )
160 | 
161 |     elif config.model_type == "qwen2_5_vl":
162 |         replace_qwen2_5_with_mixed_modality_forward()
163 |         replace_qwen2_5_vision()
164 |         model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
165 |             model_args.model_id,
166 |             dtype=compute_dtype,
167 |             attn_implementation="flash_attention_2" if not training_args.disable_flash_attn2 else "sdpa", 
168 |             **bnb_model_from_pretrained_args
169 |         )
170 |         
171 |     else:
172 |         replace_qwen_2_with_mixed_modality_forward()
173 |         model = Qwen2VLForConditionalGeneration.from_pretrained(
174 |             model_args.model_id,
175 |             dtype=compute_dtype,
176 |             attn_implementation="flash_attention_2" if not training_args.disable_flash_attn2 else "sdpa", 
177 |             **bnb_model_from_pretrained_args
178 |         )
179 | 
180 | 
181 |     model.config.use_cache = False
182 |     model_to_configure = model
183 |     configure_llm(model_to_configure, training_args)
184 |     configure_vision_tower(model_to_configure, training_args, compute_dtype, training_args.device)
185 | 
186 |     unfreeze_topk_layers(
187 |         model_to_configure,
188 |         k_llm=getattr(training_args, "unfreeze_topk_llm", 0),
189 |         k_vis=getattr(training_args, "unfreeze_topk_vision", 0),
190 |     )
191 | 
192 |     if training_args.gradient_checkpointing:
193 |         if training_args.vision_lora:
194 |             training_args.gradient_checkpointing_kwargs = {"use_reentrant": False}
195 |         else:
196 |             training_args.gradient_checkpointing_kwargs = {"use_reentrant": True}
197 |         
198 |         model.enable_input_require_grads()
199 | 
200 |     if training_args.bits in [4,8]:
201 |         model.config.dtype = (torch.float32 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32))
202 |         from peft import prepare_model_for_kbit_training
203 |         model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=training_args.gradient_checkpointing, gradient_checkpointing_kwargs=training_args.gradient_checkpointing_kwargs)
204 | 
205 |     peft_config = None
206 | 
207 |     if training_args.lora_enable:
208 |         lora_namespan_exclude = training_args.lora_namespan_exclude
209 |         peft_config = LoraConfig(
210 |             r=training_args.lora_rank,
211 |             lora_alpha=training_args.lora_alpha,
212 |             target_modules=find_target_linear_names(model, lora_namespan_exclude=lora_namespan_exclude, num_lora_modules=training_args.num_lora_modules),
213 |             lora_dropout=training_args.lora_dropout,
214 |             bias=training_args.lora_bias
215 |         )
216 |         if training_args.bits == 16:
217 |             if training_args.bf16:
218 |                 model.to(torch.bfloat16)
219 |             if training_args.fp16:
220 |                 model.to(torch.float16)
221 | 
222 |     processor = AutoProcessor.from_pretrained(model_args.model_id)
223 |     processor.image_processor.do_resize = False
224 | 
225 |     if training_args.bits in [4, 8]:
226 |         from peft.tuners.lora import LoraLayer
227 |         for name, module in model.named_modules():
228 |             if isinstance(module, LoraLayer):
229 |                 if training_args.bf16:
230 |                     module = module.to(torch.bfloat16)
231 |             if 'norm' in name:
232 |                 module = module.to(torch.float32)
233 |             
234 |             if 'lm_head' in name or 'embed_token' in name:
235 |                 if hasattr(module, 'weight'):
236 |                     if training_args.bf16 and module.weight.dtype == torch.float32:
237 |                         module = module.to(torch.bfloat16)
238 | 
239 |     dataset_module = make_grpo_data_module(model_id=model_args.model_id,
240 |                                               processor=processor,
241 |                                               data_args=data_args)
242 | 
243 |     reward_funcs = load_reward_funcs("src.train.reward_funcs")
244 | 
245 |     trainer = QwenGRPOTrainer(
246 |         model=model,
247 |         train_dataset=dataset_module["train_dataset"],
248 |         eval_dataset=dataset_module["eval_dataset"],
249 |         processing_class=processor,
250 |         reward_funcs=reward_funcs,
251 |         args=training_args,
252 |         peft_config=peft_config,
253 |     )
254 | 
255 |     if list(pathlib.Path(training_args.output_dir).glob("checkpoint-*")):
256 |         trainer.train(resume_from_checkpoint=True)
257 |     else:
258 |         trainer.train()
259 | 
260 |     trainer.save_state()
261 | 
262 |     model.config.use_cache = True
263 |     
264 |     if training_args.lora_enable:
265 |         state_dict = get_peft_state_maybe_zero_3(
266 |             model.named_parameters(), training_args.lora_bias
267 |         )
268 | 
269 |         non_lora_state_dict = get_peft_state_non_lora_maybe_zero_3(
270 |             model.named_parameters(), require_grad_only=False
271 |         )
272 | 
273 |         if local_rank == 0 or local_rank == -1:
274 |             model.config.save_pretrained(training_args.output_dir)
275 |             model.save_pretrained(training_args.output_dir, state_dict=state_dict)
276 |             processor.save_pretrained(training_args.output_dir)
277 |             torch.save(non_lora_state_dict, os.path.join(training_args.output_dir, "non_lora_state_dict.bin"))
278 |     else:
279 |         safe_save_model_for_hf_trainer(trainer, output_dir=training_args.output_dir)
280 | 
281 | 
282 | 
# Script entry point: start training only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    train()


--------------------------------------------------------------------------------
/src/train/train_cls.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import torch
  3 | from peft import LoraConfig, get_peft_model
  4 | import ast
  5 | from transformers import AutoProcessor, BitsAndBytesConfig, HfArgumentParser, AutoConfig
  6 | from src.trainer import QwenCLSTrainer
  7 | from src.model import Qwen2VLForSequenceClassification, Qwen2_5_VLForSequenceClassification
  8 | from src.dataset import make_classification_data_module
  9 | from src.loss import get_loss_function
 10 | from src.params import DataArguments, ModelArguments, CLSArguments
 11 | from train.train_utils import get_peft_state_maybe_zero_3, get_peft_state_non_lora_maybe_zero_3, safe_save_model_for_hf_trainer
 12 | import pathlib
 13 | from sklearn.metrics import accuracy_score, precision_recall_fscore_support
 14 | from transformers import EarlyStoppingCallback
 15 | 
 16 | 
 17 | def compute_metrics(pred):
 18 |     preds = pred.predictions.argmax(axis=-1)
 19 |     labels = pred.label_ids
 20 |     acc     = accuracy_score(labels, preds)
 21 |     prec, rec, f1, _ = precision_recall_fscore_support(
 22 |         labels, preds, average="weighted")
 23 |     return {
 24 |         "acc": acc,
 25 |         "precision": prec,
 26 |         "recall": rec,
 27 |         "f1": f1,
 28 |     }
 29 | 
 30 | local_rank = None
 31 | 
 32 | def rank0_print(*args):
 33 |     if local_rank == 0 or local_rank == '0' or local_rank is None:
 34 |         print(*args)
 35 | 
 36 | def find_target_linear_names(model, num_lora_modules=-1, lora_namespan_exclude=[], verbose=True):
 37 |     linear_cls = torch.nn.modules.Linear
 38 |     embedding_cls = torch.nn.modules.Embedding
 39 |     lora_module_names = []
 40 | 
 41 |     for name, module in model.named_modules():
 42 |         if any(ex_keyword in name for ex_keyword in lora_namespan_exclude):
 43 |             continue
 44 |         if isinstance(module, (linear_cls, embedding_cls)):
 45 |             lora_module_names.append(name)
 46 |     
 47 |     if num_lora_modules > 0:
 48 |         lora_module_names = lora_module_names[-num_lora_modules:]
 49 |     if verbose:
 50 |         rank0_print(f"Found {len(lora_module_names)} lora modules: {lora_module_names}")
 51 |     return lora_module_names
 52 | 
 53 | def set_requires_grad(parameters, requires_grad):
 54 |     for p in parameters:
 55 |         p.requires_grad = requires_grad
 56 | 
 57 | def configure_vision_tower(model, training_args, compute_dtype, device):
 58 |     vision_model_params = model.visual.parameters()
 59 |     set_requires_grad(vision_model_params, not training_args.freeze_vision_tower)
 60 |     
 61 |     # Handle merger specifically
 62 |     merger_params = model.visual.merger.parameters()
 63 |     set_requires_grad(merger_params, not training_args.freeze_merger)
 64 | 
 65 | def configure_llm(model, training_args):
 66 |     llm_params = model.language_model.parameters()
 67 |     set_requires_grad(llm_params, not training_args.freeze_llm)
 68 | 
 69 | def unfreeze_topk_layers(model, k_llm: int = 0, k_vis: int = 0):
 70 |     if k_llm and hasattr(model, "language_model") and hasattr(model.language_model, "layers"):
 71 |         for layer in model.language_model.layers[-k_llm:]:
 72 |             for p in layer.parameters():
 73 |                 p.requires_grad = True
 74 | 
 75 |     if k_vis and hasattr(model, "visual") and hasattr(model.visual, "blocks"):
 76 |         for blk in model.visual.blocks[-k_vis:]:
 77 |             for p in blk.parameters():
 78 |                 p.requires_grad = True
 79 | 
 80 | def train():
 81 |     global local_rank
 82 | 
 83 |     parser = HfArgumentParser(
 84 |         (ModelArguments, DataArguments, CLSArguments))
 85 |     
 86 |     model_args, data_args, training_args = parser.parse_args_into_dataclasses()
 87 |     
 88 |     if training_args.lora_enable and not training_args.freeze_llm:
 89 |         raise ValueError("If `lora_enable` is True, `freeze_llm` must also be True.")
 90 | 
 91 |     if not training_args.lora_enable:
 92 |         assert not training_args.vision_lora, \
 93 |             "Error: training_args.lora_enable is not enabled, but training_args.vision_lora is enabled."
 94 |         
 95 |     if training_args.vision_lora and not training_args.freeze_vision_tower:
 96 |         raise ValueError("If `vision_lora` is True, `freeze_vision_tower` must also be True.")
 97 | 
 98 |     else:
 99 |         if training_args.lora_namespan_exclude is not None:
100 |             training_args.lora_namespan_exclude = ast.literal_eval(training_args.lora_namespan_exclude)
101 |         else:
102 |             training_args.lora_namespan_exclude = []
103 | 
104 |         if not training_args.vision_lora:
105 |             training_args.lora_namespan_exclude += ["visual"]
106 | 
107 |     local_rank = training_args.local_rank
108 |     compute_dtype = (torch.float16 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32))
109 |     data_args.compute_dtype = compute_dtype
110 | 
111 |     bnb_model_from_pretrained_args = {}
112 |     if training_args.bits in [4,8]:
113 |         bnb_model_from_pretrained_args.update(dict(
114 |             device_map={"":training_args.device},
115 |             quantization_config = BitsAndBytesConfig(
116 |                 load_in_4bit=training_args.bits==4,
117 |                 load_in_8bit=training_args.bits==8,
118 |                 llm_int8_skip_modules=["visual", "score"],
119 |                 llm_int8_threshold=6.0,
120 |                 llm_int8_has_fp16_weight=False,
121 |                 bnb_4bit_compute_dtype=compute_dtype,
122 |                 bnb_4bit_use_double_quant=training_args.double_quant,
123 |                 bnb_4bit_quant_type=training_args.quant_type,
124 |             )
125 |         ))
126 | 
127 |     if "Qwen2.5" in model_args.model_id:
128 |         cfg = AutoConfig.from_pretrained(model_args.model_id)
129 |         cfg.mlp_head_hidden_dim = training_args.mlp_head_dim
130 |         cfg.mlp_head_dropout = training_args.mlp_head_dropout
131 |         cfg.num_labels = training_args.num_labels
132 |         
133 |         model = Qwen2_5_VLForSequenceClassification.from_pretrained(
134 |             model_args.model_id,
135 |             config=cfg,
136 |             torch_dtype=compute_dtype,
137 |             attn_implementation="flash_attention_2" if not training_args.disable_flash_attn2 else "sdpa",
138 |             **bnb_model_from_pretrained_args
139 |         )
140 |     else:
141 |         cfg = AutoConfig.from_pretrained(model_args.model_id)
142 |         cfg.mlp_head_hidden_dim = training_args.mlp_head_dim
143 |         cfg.mlp_head_dropout = training_args.mlp_head_dropout
144 |         cfg.num_labels = training_args.num_labels
145 |         
146 |         model = Qwen2VLForSequenceClassification.from_pretrained(
147 |             model_args.model_id,
148 |             config=cfg,
149 |             torch_dtype=compute_dtype,
150 |             attn_implementation="flash_attention_2" if not training_args.disable_flash_attn2 else "sdpa",
151 |             **bnb_model_from_pretrained_args
152 |         )
153 | 
154 |     model.config.use_cache = False
155 |     model.config.num_labels = training_args.num_labels
156 |     model_to_configure = model
157 |     configure_llm(model_to_configure, training_args)
158 |     configure_vision_tower(model_to_configure, training_args, compute_dtype, training_args.device)
159 | 
160 |     unfreeze_topk_layers(
161 |         model_to_configure,
162 |         k_llm=getattr(training_args, "unfreeze_topk_llm", 0),
163 |         k_vis=getattr(training_args, "unfreeze_topk_vision", 0),
164 |     )
165 | 
166 |     if training_args.bits in [4,8]:
167 |         model.config.torch_dtype = (torch.float32 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32))
168 |         from peft import prepare_model_for_kbit_training
169 |         model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=training_args.gradient_checkpointing, gradient_checkpointing_kwargs={"use_reentrant": True})
170 |     
171 |     if training_args.gradient_checkpointing:
172 |         model.enable_input_require_grads()
173 |         if hasattr(model, "enable_input_require_grads"):
174 |             model.enable_input_require_grads()
175 |         else:
176 |             def make_inputs_require_grad(module, input, output):
177 |                 output.requires_grad_(True)
178 | 
179 |             model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)
180 |             
181 |         training_args.gradient_checkpointing_kwargs = {"use_reentrant": True}
182 | 
183 |     if training_args.lora_enable:
184 |         lora_namespan_exclude = training_args.lora_namespan_exclude
185 |         peft_config = LoraConfig(
186 |             r=training_args.lora_rank,
187 |             lora_alpha=training_args.lora_alpha,
188 |             target_modules=find_target_linear_names(model, lora_namespan_exclude=lora_namespan_exclude, num_lora_modules=training_args.num_lora_modules),
189 |             lora_dropout=training_args.lora_dropout,
190 |             bias=training_args.lora_bias,
191 |             task_type="CAUSAL_LM",
192 |         )
193 |         rank0_print("Adding LoRA to the model...")
194 |         model = get_peft_model(model, peft_config)
195 | 
196 |         # Peft maodel makes vision tower and merger freezed again.
197 |         # Configuring fuction could be called here, but sometimes it does not work properly.
198 |         # So I just made it this way.
199 |         # Need to be fixed in the future.
200 | 
201 |         if not training_args.freeze_vision_tower:
202 |             for name, param in model.named_parameters():
203 |                 if "visual" in name:
204 |                     param.requires_grad = True
205 | 
206 |         if not training_args.freeze_merger:
207 |             for name, param in model.named_parameters():
208 |                 if "merger" in name:
209 |                     param.requires_grad = True
210 | 
211 |     processor = AutoProcessor.from_pretrained(model_args.model_id)
212 | 
213 |     # model.config.tokenizer_model_max_length = processor.tokenizer.model_max_length
214 |     model.config.pad_token_id = processor.tokenizer.pad_token_id
215 | 
216 |     if training_args.bits in [4, 8]:
217 |         from peft.tuners.lora import LoraLayer
218 |         for name, module in model.named_modules():
219 |             if isinstance(module, LoraLayer):
220 |                 if training_args.bf16:
221 |                     module = module.to(torch.bfloat16)
222 |             if 'norm' in name:
223 |                 module = module.to(torch.float32)
224 |             
225 |             if 'score' in name or 'embed_token' in name:
226 |                 if hasattr(module, 'weight'):
227 |                     if training_args.bf16 and module.weight.dtype == torch.float32:
228 |                         module = module.to(torch.bfloat16)
229 | 
230 |     data_module = make_classification_data_module(model_id=model_args.model_id,
231 |                                               processor=processor,
232 |                                               data_args=data_args)
233 |     
234 |     samples_per_class = data_module.pop("samples_per_class")
235 | 
236 |     loss_fn = get_loss_function(training_args, samples_per_class=samples_per_class)
237 |     model.loss_fn = loss_fn.to(model.dtype if hasattr(model, "dtype") else torch.float32)
238 | 
239 |     callback_list = None
240 | 
241 |     if training_args.early_stopping_patience > 0:
242 |         early_stop_cb = EarlyStoppingCallback(
243 |             early_stopping_patience=training_args.early_stopping_patience,
244 |             early_stopping_threshold=training_args.early_stopping_threshold,
245 |         )
246 |         callback_list = [early_stop_cb]
247 | 
248 |     trainer = QwenCLSTrainer(
249 |         model=model,
250 |         processing_class=processor,
251 |         args=training_args,
252 |         compute_metrics=compute_metrics,
253 |         callbacks=callback_list,
254 |         **data_module,
255 |     )
256 | 
257 |     if list(pathlib.Path(training_args.output_dir).glob("checkpoint-*")):
258 |         trainer.train(resume_from_checkpoint=True)
259 |     else:
260 |         trainer.train()
261 | 
262 |     trainer.save_state()
263 | 
264 |     model.config.use_cache = True
265 |     
266 |     if training_args.lora_enable:
267 |         state_dict = get_peft_state_maybe_zero_3(
268 |             model.named_parameters(), training_args.lora_bias
269 |         )
270 | 
271 |         non_lora_state_dict = get_peft_state_non_lora_maybe_zero_3(
272 |             model.named_parameters(), require_grad_only=True
273 |         )
274 | 
275 |         if local_rank == 0 or local_rank == -1:
276 |             model.config.save_pretrained(training_args.output_dir)
277 |             model.save_pretrained(training_args.output_dir, state_dict=state_dict)
278 |             torch.save(non_lora_state_dict, os.path.join(training_args.output_dir, "non_lora_state_dict.bin"))
279 |     else:
280 |         safe_save_model_for_hf_trainer(trainer, output_dir=training_args.output_dir)
281 | 
# Script entry point: start classification training only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    train()
284 | 


--------------------------------------------------------------------------------
/src/train/train_sft.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import torch
  3 | from peft import LoraConfig, get_peft_model
  4 | import ast
  5 | from transformers import (
  6 |     AutoProcessor,
  7 |     AutoConfig,
  8 |     BitsAndBytesConfig, 
  9 |     Qwen2VLForConditionalGeneration, 
 10 |     HfArgumentParser, 
 11 |     Qwen2_5_VLForConditionalGeneration,
 12 |     Qwen3VLForConditionalGeneration,
 13 |     Qwen3VLMoeForConditionalGeneration
 14 | )
 15 | from src.trainer import QwenSFTTrainer
 16 | from src.dataset import make_supervised_data_module
 17 | from src.params import DataArguments, ModelArguments, TrainingArguments
 18 | from train.train_utils import get_peft_state_maybe_zero_3, get_peft_state_non_lora_maybe_zero_3, safe_save_model_for_hf_trainer
 19 | import pathlib
 20 | from monkey_patch_forward import (
 21 |     replace_qwen3_with_mixed_modality_forward,
 22 |     replace_qwen2_5_with_mixed_modality_forward, 
 23 |     replace_qwen_2_with_mixed_modality_forward,
 24 |     replace_qwen3_vl_moe_with_mixed_modality_forward
 25 | )
 26 | from monkey_patch_vision import replace_qwen2_5_vision
 27 | 
 28 | local_rank = None
 29 | 
 30 | def rank0_print(*args):
 31 |     if local_rank == 0 or local_rank == '0' or local_rank is None:
 32 |         print(*args)
 33 | 
 34 | def find_target_linear_names(model, num_lora_modules=-1, lora_namespan_exclude=[], verbose=True):
 35 |     linear_cls = torch.nn.modules.Linear
 36 |     embedding_cls = torch.nn.modules.Embedding
 37 |     lora_module_names = []
 38 | 
 39 |     for name, module in model.named_modules():
 40 |         if any(ex_keyword in name for ex_keyword in lora_namespan_exclude):
 41 |             continue
 42 |         if isinstance(module, (linear_cls, embedding_cls)):
 43 |             lora_module_names.append(name)
 44 |     
 45 |     if num_lora_modules > 0:
 46 |         lora_module_names = lora_module_names[-num_lora_modules:]
 47 |     if verbose:
 48 |         rank0_print(f"Found {len(lora_module_names)} lora modules: {lora_module_names}")
 49 |     return lora_module_names
 50 | 
 51 | def set_requires_grad(parameters, requires_grad):
 52 |     for p in parameters:
 53 |         p.requires_grad = requires_grad
 54 | 
 55 | def configure_vision_tower(model, training_args, compute_dtype, device):
 56 |     vision_tower = model.visual
 57 |     vision_tower.to(dtype=compute_dtype, device=device)
 58 | 
 59 |     vision_model_params = model.visual.parameters()
 60 |     set_requires_grad(vision_model_params, not training_args.freeze_vision_tower)
 61 |     
 62 |     # Handle merger specifically
 63 |     merger_params = model.visual.merger.parameters()
 64 |     set_requires_grad(merger_params, not training_args.freeze_merger)
 65 | 
 66 |     if hasattr(model.visual, "deepstack_merger_list"):
 67 |         deepstack_merger_list_params = model.visual.deepstack_merger_list.parameters()
 68 |         set_requires_grad(deepstack_merger_list_params, not training_args.freeze_merger)
 69 | 
 70 | def configure_llm(model, training_args):
 71 |     lm_head = model.lm_head.parameters()
 72 |     set_requires_grad(lm_head, not training_args.freeze_llm)
 73 | 
 74 |     llm_params = model.language_model.parameters()
 75 |     set_requires_grad(llm_params, not training_args.freeze_llm)
 76 | 
 77 | def unfreeze_topk_layers(model, k_llm: int = 0, k_vis: int = 0):
 78 |     if k_llm and hasattr(model, "language_model") and hasattr(model.language_model, "layers"):
 79 |         for layer in model.language_model.layers[-k_llm:]:
 80 |             for p in layer.parameters():
 81 |                 p.requires_grad = True
 82 | 
 83 |     if k_vis and hasattr(model, "visual") and hasattr(model.visual, "blocks"):
 84 |         for blk in model.visual.blocks[-k_vis:]:
 85 |             for p in blk.parameters():
 86 |                 p.requires_grad = True
 87 | 
 88 | 
 89 | def train():
 90 |     global local_rank
 91 | 
 92 |     parser = HfArgumentParser(
 93 |         (ModelArguments, DataArguments, TrainingArguments))
 94 |     
 95 |     model_args, data_args, training_args = parser.parse_args_into_dataclasses()
 96 |     
 97 |     if data_args.nframes is not None and data_args.fps is not None:
 98 |         raise ValueError("You cannot set both `nframes` and `fps` at the same time. Please set only one of them.")
 99 | 
100 |     if training_args.lora_enable and not training_args.freeze_llm:
101 |         raise ValueError("If `lora_enable` is True, `freeze_llm` must also be True.")
102 | 
103 |     if not training_args.lora_enable:
104 |         assert not training_args.vision_lora, \
105 |             "Error: training_args.lora_enable is not enabled, but training_args.vision_lora is enabled."
106 |         
107 |     if training_args.vision_lora and not training_args.freeze_vision_tower:
108 |         raise ValueError("If `vision_lora` is True, `freeze_vision_tower` must also be True.")
109 | 
110 |     else:
111 |         if training_args.lora_namespan_exclude is not None:
112 |             training_args.lora_namespan_exclude = ast.literal_eval(training_args.lora_namespan_exclude)
113 |         else:
114 |             training_args.lora_namespan_exclude = []
115 | 
116 |         if not training_args.vision_lora:
117 |             training_args.lora_namespan_exclude += ["visual"]
118 | 
119 |     local_rank = training_args.local_rank
120 |     compute_dtype = (torch.float16 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32))
121 | 
122 |     bnb_model_from_pretrained_args = {}
123 |     if training_args.bits in [4,8]:
124 |         bnb_model_from_pretrained_args.update(dict(
125 |             device_map={"":training_args.device},
126 |             quantization_config = BitsAndBytesConfig(
127 |                 load_in_4bit=training_args.bits==4,
128 |                 load_in_8bit=training_args.bits==8,
129 |                 llm_int8_skip_modules=["visual", "lm_head"],
130 |                 llm_int8_threshold=6.0,
131 |                 llm_int8_has_fp16_weight=False,
132 |                 bnb_4bit_compute_dtype=compute_dtype,
133 |                 bnb_4bit_use_double_quant=training_args.double_quant,
134 |                 bnb_4bit_quant_type=training_args.quant_type,
135 |             )
136 |         ))
137 | 
138 |     config = AutoConfig.from_pretrained(model_args.model_id)
139 | 
140 |     if config.model_type == "qwen3_vl_moe":
141 |         replace_qwen3_vl_moe_with_mixed_modality_forward()
142 |         model = Qwen3VLMoeForConditionalGeneration.from_pretrained(
143 |             model_args.model_id,
144 |             dtype=compute_dtype,
145 |             attn_implementation="flash_attention_2" if not training_args.disable_flash_attn2 else "sdpa",
146 |             **bnb_model_from_pretrained_args
147 |         )
148 | 
149 |     elif config.model_type == "qwen3_vl":
150 |         replace_qwen3_with_mixed_modality_forward()
151 |         model = Qwen3VLForConditionalGeneration.from_pretrained(
152 |             model_args.model_id,
153 |             dtype=compute_dtype,
154 |             attn_implementation="flash_attention_2" if not training_args.disable_flash_attn2 else "sdpa",
155 |             **bnb_model_from_pretrained_args
156 |         )
157 | 
158 |     elif config.model_type == "qwen2_5_vl":
159 |         replace_qwen2_5_with_mixed_modality_forward()
160 |         replace_qwen2_5_vision()
161 |         model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
162 |             model_args.model_id,
163 |             dtype=compute_dtype,
164 |             attn_implementation="flash_attention_2" if not training_args.disable_flash_attn2 else "sdpa", 
165 |             **bnb_model_from_pretrained_args
166 |         )
167 |         
168 |     else:
169 |         replace_qwen_2_with_mixed_modality_forward()
170 |         model = Qwen2VLForConditionalGeneration.from_pretrained(
171 |             model_args.model_id,
172 |             dtype=compute_dtype,
173 |             attn_implementation="flash_attention_2" if not training_args.disable_flash_attn2 else "sdpa", 
174 |             **bnb_model_from_pretrained_args
175 |         )
176 | 
177 |     model.config.use_cache = False
178 |     model_to_configure = model
179 |     configure_llm(model_to_configure, training_args)
180 |     configure_vision_tower(model_to_configure, training_args, compute_dtype, training_args.device)
181 | 
182 |     unfreeze_topk_layers(
183 |         model_to_configure,
184 |         k_llm=getattr(training_args, "unfreeze_topk_llm", 0),
185 |         k_vis=getattr(training_args, "unfreeze_topk_vision", 0),
186 |     )
187 | 
188 |     if training_args.gradient_checkpointing:
189 |         if training_args.vision_lora:
190 |             training_args.gradient_checkpointing_kwargs = {"use_reentrant": False}
191 |         else:
192 |             training_args.gradient_checkpointing_kwargs = {"use_reentrant": True}
193 |         
194 |         model.enable_input_require_grads()
195 | 
196 |     if training_args.bits in [4,8]:
197 |         model.config.dtype = (torch.float32 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32))
198 |         from peft import prepare_model_for_kbit_training
199 |         model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=training_args.gradient_checkpointing, gradient_checkpointing_kwargs=training_args.gradient_checkpointing_kwargs)
200 |     
201 |     if training_args.lora_enable:
202 |         lora_namespan_exclude = training_args.lora_namespan_exclude
203 |         peft_config = LoraConfig(
204 |             r=training_args.lora_rank,
205 |             lora_alpha=training_args.lora_alpha,
206 |             target_modules=find_target_linear_names(model, lora_namespan_exclude=lora_namespan_exclude, num_lora_modules=training_args.num_lora_modules),
207 |             lora_dropout=training_args.lora_dropout,
208 |             bias=training_args.lora_bias
209 |         )
210 |         if training_args.bits == 16:
211 |             if training_args.bf16:
212 |                 model.to(torch.bfloat16)
213 |             if training_args.fp16:
214 |                 model.to(torch.float16)
215 |         rank0_print("Adding LoRA to the model...")
216 |         model = get_peft_model(model, peft_config)
217 | 
218 |         # Peft maodel makes vision tower and merger freezed again.
219 |         # Configuring fuction could be called here, but sometimes it does not work properly.
220 |         # So I just made it this way.
221 |         # Need to be fixed in the future.
222 | 
223 |         if not training_args.freeze_vision_tower:
224 |             for name, param in model.named_parameters():
225 |                 if "visual" in name:
226 |                     param.requires_grad = True
227 | 
228 |         if not training_args.freeze_merger:
229 |             for name, param in model.named_parameters():
230 |                 if "merger" in name:
231 |                     param.requires_grad = True
232 | 
233 |     processor = AutoProcessor.from_pretrained(model_args.model_id)
234 | 
235 |     # model.config.tokenizer_model_max_length = processor.tokenizer.model_max_length
236 | 
237 |     if training_args.bits in [4, 8]:
238 |         from peft.tuners.lora import LoraLayer
239 |         for name, module in model.named_modules():
240 |             if isinstance(module, LoraLayer):
241 |                 if training_args.bf16:
242 |                     module = module.to(torch.bfloat16)
243 |             if 'norm' in name:
244 |                 module = module.to(torch.float32)
245 |             
246 |             if 'lm_head' in name or 'embed_token' in name:
247 |                 if hasattr(module, 'weight'):
248 |                     if training_args.bf16 and module.weight.dtype == torch.float32:
249 |                         module = module.to(torch.bfloat16)
250 | 
251 |     data_module = make_supervised_data_module(model_id=model_args.model_id,
252 |                                               processor=processor,
253 |                                               data_args=data_args)
254 | 
255 |     trainer = QwenSFTTrainer(
256 |         model=model,
257 |         processing_class=processor,
258 |         args=training_args,
259 |         **data_module
260 |     )
261 | 
262 |     if list(pathlib.Path(training_args.output_dir).glob("checkpoint-*")):
263 |         trainer.train(resume_from_checkpoint=True)
264 |     else:
265 |         trainer.train()
266 | 
267 |     trainer.save_state()
268 | 
269 |     model.config.use_cache = True
270 |     
271 |     if training_args.lora_enable:
272 |         state_dict = get_peft_state_maybe_zero_3(
273 |             model.named_parameters(), training_args.lora_bias
274 |         )
275 | 
276 |         non_lora_state_dict = get_peft_state_non_lora_maybe_zero_3(
277 |             model.named_parameters(), require_grad_only=True
278 |         )
279 | 
280 |         if local_rank == 0 or local_rank == -1:
281 |             model.config.save_pretrained(training_args.output_dir)
282 |             model.save_pretrained(training_args.output_dir, state_dict=state_dict)
283 |             processor.save_pretrained(training_args.output_dir)
284 |             torch.save(non_lora_state_dict, os.path.join(training_args.output_dir, "non_lora_state_dict.bin"))
285 |     else:
286 |         safe_save_model_for_hf_trainer(trainer, output_dir=training_args.output_dir)
287 | 
288 | 
289 | 
# Run training only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    train()
292 | 


--------------------------------------------------------------------------------
/src/dataset/dpo_dataset.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | from typing import Dict
  3 | import torch
  4 | import transformers
  5 | import ujson as json
  6 | from torch.utils.data import Dataset
  7 | 
  8 | from src.params import DataArguments
  9 | from src.constants import (
 10 |     DEFAULT_IM_START_TOKEN,
 11 |     DEFAULT_IM_END_TOKEN,
 12 |     DEFAULT_IMAGE_TOKEN,
 13 |     DEFAULT_VIDEO_TOKEN,
 14 |     SYSTEM_MESSAGE,
 15 | )
 16 | 
 17 | from .data_utils import get_image_info, get_video_info, pad_sequence, replace_image_tokens
 18 | 
 19 | 
 20 | class DPODataset(Dataset):
 21 |     """Dataset for DPO training"""
 22 | 
 23 |     def __init__(
 24 |         self,
 25 |         data_path: str | list,
 26 |         processor: transformers.ProcessorMixin,
 27 |         data_args: DataArguments,
 28 |         model_id,
 29 |         padding=True,
 30 |     ):
 31 |         super(DPODataset, self).__init__()
 32 |         if isinstance(data_path, str):
 33 |             list_data_dict = json.load(open(data_path, "r"))
 34 |         else:
 35 |             list_data_dict = data_path
 36 | 
 37 |         self.model_id = model_id
 38 |         self.processor = processor
 39 |         self.list_data_dict = list_data_dict
 40 |         self.data_args = data_args
 41 |         self.padding = padding
 42 |         self.image_min_pixel = data_args.image_min_pixels
 43 |         self.image_max_pixel = data_args.image_max_pixels
 44 |         self.video_min_pixel = data_args.video_min_pixels
 45 |         self.video_max_pixel = data_args.video_max_pixels
 46 |         self.image_resized_w = data_args.image_resized_width
 47 |         self.image_resized_h = data_args.image_resized_height
 48 |         self.video_resized_w = data_args.video_resized_width
 49 |         self.video_resized_h = data_args.video_resized_height
 50 |         self.fps = data_args.fps
 51 |         self.nframes = data_args.nframes
 52 | 
 53 |         if "Qwen3" in self.model_id:
 54 |             self.image_patch_size = 16
 55 |             self.return_video_metadata = True
 56 |         else:
 57 |             self.image_patch_size = 14
 58 |             self.return_video_metadata = False
 59 | 
 60 |     def __len__(self):
 61 |         return len(self.list_data_dict)
 62 |     
 63 |     def __getitem__(self, i) -> Dict[str, torch.Tensor]:
 64 |         sources = self.list_data_dict[i]
 65 | 
 66 |         is_video = False
 67 |         processor = self.processor
 68 | 
 69 |         if "image" in sources:
 70 |             videos = None
 71 |             grid_key = "image_grid_thw"
 72 |             pixel_key = "pixel_values"
 73 |             
 74 |             image_files = sources["image"]
 75 |             image_folder = self.data_args.image_folder
 76 | 
 77 |             if isinstance(image_files, str):
 78 |                 image_files = [image_files]
 79 | 
 80 |             images = []
 81 |             
 82 |             for image_file in image_files:
 83 |                 if not os.path.exists(image_file):
 84 |                     if not image_file.startswith("http"):
 85 |                         image_file = os.path.join(image_folder, image_file)
 86 |                 image_input = get_image_info(
 87 |                         image_file, 
 88 |                         self.image_min_pixel, 
 89 |                         self.image_max_pixel, 
 90 |                         self.image_resized_w, 
 91 |                         self.image_resized_h, 
 92 |                         self.image_patch_size
 93 |                     )
 94 |                 images.append(image_input)
 95 | 
 96 |         elif "video" in sources:
 97 |             is_video = True
 98 |             images=None
 99 |             grid_key = "video_grid_thw"
100 |             pixel_key = "pixel_values_videos"
101 | 
102 |             video_files = sources["video"]
103 |             video_folder = self.data_args.image_folder
104 | 
105 |             if isinstance(video_files, str):
106 |                 video_files = [video_files]
107 | 
108 |             videos = []
109 |             for video_file in video_files:
110 |                 if not os.path.exists(video_file):
111 |                     if not video_file.startswith("http"):
112 |                         video_file = os.path.join(video_folder, video_file)
113 |                 video_input, video_kwargs = get_video_info(
114 |                     video_file, 
115 |                     self.video_min_pixel, 
116 |                     self.video_max_pixel, 
117 |                     self.video_resized_w, 
118 |                     self.video_resized_h, 
119 |                     self.data_args.fps,
120 |                     self.image_patch_size,
121 |                     return_video_metadata=self.return_video_metadata
122 |                 )
123 |                 videos.append(video_input)
124 |         else:
125 |             grid_key = None
126 |             pixel_key = None
127 |             images=None
128 |             videos=None
129 | 
130 |         all_input_ids = [] 
131 |         all_rejected = []
132 |         all_chosen =[]
133 |         all_pixel_values = []
134 |         all_image_grid_thw = []
135 |         all_second_gird = []
136 | 
137 |         if len(SYSTEM_MESSAGE) > 0 and "Qwen3" not in self.model_id:
138 |             system_message = f"{DEFAULT_IM_START_TOKEN}system\n{SYSTEM_MESSAGE}{DEFAULT_IM_END_TOKEN}\n"
139 |             system_message_input_ids = processor.tokenizer(system_message, add_special_tokens=False, return_tensors='pt')['input_ids'] 
140 |             
141 |             all_input_ids.append(system_message_input_ids.squeeze(0))
142 | 
143 |         user_prompt = replace_image_tokens(sources["prompt"], is_video=is_video)
144 |         chosen_response = sources["chosen"]
145 |         rejected_response = sources["rejected"]
146 | 
147 |         user_input = f"{DEFAULT_IM_START_TOKEN}user\n{user_prompt}{DEFAULT_IM_END_TOKEN}\n{DEFAULT_IM_START_TOKEN}assistant\n"
148 |         chosen_response = f"{chosen_response}{DEFAULT_IM_END_TOKEN}\n"
149 |         rejected_response = f"{rejected_response}{DEFAULT_IM_END_TOKEN}\n"
150 | 
151 |         if DEFAULT_IMAGE_TOKEN in user_input:
152 |             inputs = processor(text=[user_input], images=images, videos=videos, padding=False, do_resize=False, return_tensors='pt')
153 |             prompt_input_ids = inputs['input_ids']
154 |             all_pixel_values.append(inputs[pixel_key])
155 |             all_image_grid_thw.append(inputs[grid_key])
156 |         elif DEFAULT_VIDEO_TOKEN in user_input:
157 |             if "Qwen2.5" in self.model_id:
158 |                 inputs = processor(
159 |                     text=[user_input], 
160 |                     images=images, 
161 |                     videos=videos, 
162 |                     padding=False, 
163 |                     do_resize=False, 
164 |                     return_tensors='pt', 
165 |                     **video_kwargs
166 |                 )
167 |                 
168 |                 all_second_gird.extend(inputs["second_per_grid_ts"])
169 |             
170 |             elif "Qwen3" in self.model_id:
171 | 
172 |                 video_datas, video_metadatas = zip(*videos)
173 |                 video_datas, video_metadatas = list(video_datas), list(video_metadatas)
174 |                 
175 |                 inputs = processor(
176 |                     text=[user_input], 
177 |                     images=images, 
178 |                     videos=video_datas, 
179 |                     padding=False, 
180 |                     do_resize=False, 
181 |                     return_tensors='pt', 
182 |                     **video_kwargs, 
183 |                     video_metadata=video_metadatas,
184 |                 )
185 |             
186 |             else:
187 |                 inputs = processor(
188 |                     text=[user_input], 
189 |                     images=images, 
190 |                     videos=videos, 
191 |                     padding=False, 
192 |                     do_resize=False, 
193 |                     return_tensors='pt'
194 |                 )
195 |             
196 |             prompt_input_ids = inputs['input_ids']
197 |             all_pixel_values.append(inputs[pixel_key])
198 |             all_image_grid_thw.append(inputs[grid_key])
199 | 
200 |         else:
201 |             prompt_input_ids = processor.tokenizer(user_input, add_special_tokens=False, padding=False, return_tensors='pt')['input_ids']
202 | 
203 |         input_ids = prompt_input_ids.squeeze(0)
204 |         chosen_input_ids = processor.tokenizer(chosen_response, add_special_tokens=False, padding=False, return_tensors='pt')['input_ids'].squeeze(0)
205 |         rejected_input_ids = processor.tokenizer(rejected_response, add_special_tokens=False, padding=False, return_tensors='pt')['input_ids'].squeeze(0)
206 | 
207 |         all_input_ids.append(input_ids)
208 |         all_chosen.append(chosen_input_ids)
209 |         all_rejected.append(rejected_input_ids)
210 | 
211 |         input_ids = torch.cat(all_input_ids, dim=0).to(torch.long)
212 |         chosen = torch.cat(all_chosen, dim=0).to(torch.long)
213 |         rejected = torch.cat(all_rejected, dim=0).to(torch.long)
214 |         
215 |         data_dict = dict(
216 |             prompt_input_ids=input_ids,
217 |             chosen_input_ids=chosen,
218 |             rejected_input_ids=rejected,
219 |         )
220 | 
221 |         if pixel_key and grid_key:
222 |             pixel_values = torch.cat(all_pixel_values, dim=0)
223 |             image_thw = torch.cat(all_image_grid_thw, dim=0)
224 |             data_dict[pixel_key] = pixel_values
225 |             data_dict[grid_key] = image_thw
226 |         
227 |         if len(all_second_gird) > 0:
228 |             second_gird = all_second_gird
229 |             data_dict["second_per_grid_ts"] = second_gird
230 | 
231 |         return data_dict
232 |     
class DataCollatorForDPODataset(object):
    """Batch DPO samples: right-pad the token sequences and stack the visual features."""

    def __init__(self, pad_token_id: int):
        self.pad_token_id = pad_token_id

    def __call__(self, examples):
        """Collate a list of per-sample dicts into one batch dict.

        Prompt, chosen and rejected ids are right-padded with ``pad_token_id``
        and get an integer attention mask each. Image features, video features
        and ``second_per_grid_ts`` are added only if at least one sample
        carries them.
        """
        pad_id = self.pad_token_id

        # A sample contributes either video or image features, video first.
        video_samples = [ex for ex in examples if "pixel_values_videos" in ex.keys()]
        image_samples = [
            ex for ex in examples
            if "pixel_values_videos" not in ex.keys() and "pixel_values" in ex.keys()
        ]

        seconds_per_grid = []
        for ex in examples:
            if "second_per_grid_ts" in ex.keys():
                seconds_per_grid.extend(ex["second_per_grid_ts"])

        batch = {}
        for field in ("prompt", "chosen", "rejected"):
            padded = pad_sequence(
                [ex[f"{field}_input_ids"] for ex in examples],
                padding_side='right',
                padding_value=pad_id,
            )
            batch[f"{field}_input_ids"] = padded
            # torch.argmax used in `trl.trainer.utils.flush_left` does not accept bool tensors
            # on some torch versions (e.g., torch==2.1); keep masks int to stay compatible.
            batch[f"{field}_attention_mask"] = (padded != pad_id).long()

        if len(image_samples) > 0:
            batch["pixel_values"] = torch.cat([ex["pixel_values"] for ex in image_samples], dim=0)
            batch["image_grid_thw"] = torch.cat([ex["image_grid_thw"] for ex in image_samples], dim=0)

        if len(video_samples) > 0:
            batch["pixel_values_videos"] = torch.cat([ex["pixel_values_videos"] for ex in video_samples], dim=0)
            batch["video_grid_thw"] = torch.cat([ex["video_grid_thw"] for ex in video_samples], dim=0)

        if len(seconds_per_grid) > 0:
            batch["second_per_grid_ts"] = seconds_per_grid

        return batch
304 |     
def make_dpo_data_module(model_id, processor, data_args):
    """Create the training dataset and collator used for DPO fine-tuning.

    Returns a dict with ``train_dataset``, ``eval_dataset`` (always ``None``)
    and ``data_collator``.
    """
    train_dataset = DPODataset(
        data_path=data_args.data_path,
        processor=processor,
        data_args=data_args,
        model_id=model_id,
    )
    # Padding uses the tokenizer's own pad token id.
    collator = DataCollatorForDPODataset(pad_token_id=processor.tokenizer.pad_token_id)

    return {
        "train_dataset": train_dataset,
        "eval_dataset": None,
        "data_collator": collator,
    }
315 | 


--------------------------------------------------------------------------------
/src/trainer/cls_trainer.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import torch
  3 | import torch.nn as nn
  4 | 
  5 | from transformers import Trainer
  6 | from transformers.trainer import (
  7 |     is_sagemaker_mp_enabled,
  8 |     get_parameter_names,
  9 |     TRAINER_STATE_NAME,
 10 |     PREFIX_CHECKPOINT_DIR,
 11 |     logger,
 12 |     ExportableState,
 13 |     SaveStrategy
 14 | )
 15 | from transformers.pytorch_utils import (
 16 |     ALL_LAYERNORM_LAYERS
 17 | )
 18 | from transformers.trainer_utils import seed_worker
 19 | from transformers.utils import is_datasets_available
 20 | from torch.utils.data import DataLoader
 21 | from torch.utils.data.distributed import DistributedSampler
 22 | import datasets
 23 | from typing import Optional, Callable
 24 | from functools import partial
 25 | from torch.utils.data import Dataset
 26 | 
 27 | from src.train.train_utils import get_peft_state_non_lora_maybe_zero_3
 28 | 
 29 | def maybe_zero_3(param, ignore_status=False, name=None):
 30 |     from deepspeed import zero
 31 |     from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus
 32 | 
 33 |     if hasattr(param, "ds_id"):
 34 |         if param.ds_status == ZeroParamStatus.NOT_AVAILABLE:
 35 |             if not ignore_status:
 36 |                 print(name, "no ignore status")
 37 |         with zero.GatheredParameters([param]):
 38 |             param = param.data.detach().cpu().clone()
 39 |     else:
 40 |         param = param.detach().cpu().clone()
 41 |     return param
 42 | 
 43 | class QwenCLSTrainer(Trainer):
 44 | 
 45 |     def __init__(self, *args, sampler=None, train_data_collator=None, eval_data_collator=None, **kwargs):
 46 |         self._custom_sampler = sampler
 47 |         self._train_data_collator = train_data_collator
 48 |         self._eval_data_collator = eval_data_collator
 49 |         super().__init__(*args, data_collator=self._train_data_collator,**kwargs)
 50 |     
 51 |     def get_eval_dataloader(self, eval_dataset=None):
 52 |         dl = super().get_eval_dataloader(eval_dataset)
 53 |         dl.collate_fn = self._eval_data_collator
 54 |         return dl
 55 | 
 56 |     def get_train_dataloader(self) -> DataLoader:
 57 |         """
 58 |         Returns the training [`~torch.utils.data.DataLoader`].
 59 | 
 60 |         Will use no sampler if `train_dataset` does not implement `__len__`, a random sampler (adapted to distributed
 61 |         training if necessary) otherwise.
 62 | 
 63 |         Subclass and override this method if you want to inject some custom behavior.
 64 |         """
 65 |         if self.train_dataset is None:
 66 |             raise ValueError("Trainer: training requires a train_dataset.")
 67 | 
 68 |         sampler = self._custom_sampler if self._custom_sampler is not None else self._get_train_sampler
 69 | 
 70 |         return self._get_dataloader(
 71 |             dataset=self.train_dataset,
 72 |             description="Training",
 73 |             batch_size=self._train_batch_size,
 74 |             sampler_fn=sampler,
 75 |             is_training=True,
 76 |         )
 77 |     
 78 |     def _get_dataloader(
 79 |         self,
 80 |         dataset: Dataset,
 81 |         description: str,
 82 |         batch_size: int,
 83 |         sampler_fn: Optional[Callable[[Dataset], torch.utils.data.Sampler]] = None,
 84 |         is_training: bool = False,
 85 |         dataloader_key: Optional[str] = None,
 86 |     ) -> DataLoader:
 87 |         """Create a [`~torch.utils.data.DataLoader`] from the given dataset."""
 88 | 
 89 |         data_collator = self.data_collator
 90 |         if is_datasets_available() and isinstance(dataset, datasets.Dataset):
 91 |             dataset = self._remove_unused_columns(dataset, description=description)
 92 |         else:
 93 |             data_collator = self._get_collator_with_removed_columns(self.data_collator, description=description)
 94 | 
 95 |         dataloader_params = {
 96 |             "batch_size": batch_size,
 97 |             "collate_fn": data_collator,
 98 |             "num_workers": self.args.dataloader_num_workers,
 99 |             "pin_memory": self.args.dataloader_pin_memory,
100 |             "persistent_workers": self.args.dataloader_persistent_workers,
101 |         }
102 | 
103 |         if not isinstance(dataset, torch.utils.data.IterableDataset):
104 |             if sampler_fn is not None:
105 |                 dataloader_params["sampler"] = sampler_fn(dataset)
106 |             dataloader_params["drop_last"] = self.args.dataloader_drop_last
107 |             dataloader_params["prefetch_factor"] = self.args.dataloader_prefetch_factor
108 |             if is_training:
109 |                 dataloader_params["worker_init_fn"] = partial(
110 |                     seed_worker, num_workers=self.args.dataloader_num_workers, rank=self.args.process_index
111 |                 )
112 | 
113 |         dataloader = DataLoader(dataset, **dataloader_params)
114 |         
115 |         if isinstance(sampler_fn, DistributedSampler):
116 |             dataloader = self.accelerator.prepare(dataloader)
117 | 
118 |         # Store the prepared dataloader for subsequent evaluations if using persistent workers.
119 |         if dataloader_key is not None and self.args.dataloader_persistent_workers:
120 |             if hasattr(self, "_eval_dataloaders"):
121 |                 self._eval_dataloaders[dataloader_key] = dataloader
122 |             else:
123 |                 self._eval_dataloaders = {dataloader_key: dataloader}
124 | 
125 |         return dataloader
126 | 
127 | 
128 |     def create_optimizer(self):
129 |         """
130 |         Setup the optimizer.
131 |         We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the
132 |         Trainer's init through `optimizers`, or subclass and override this method in a subclass.
133 |         """
134 |         if is_sagemaker_mp_enabled():
135 |             return super().create_optimizer()
136 |         
137 |         opt_model = self.model
138 | 
139 |         if self.optimizer is None:
140 |             decay_parameters = get_parameter_names(opt_model, ALL_LAYERNORM_LAYERS)
141 |             decay_parameters = [name for name in decay_parameters if "bias" not in name]
142 |             lr_mapper = {}
143 |             visual_parameters = []
144 |             merger_parameters = []
145 |             head_parameters = []
146 | 
147 |             if self.args.vision_lr is not None:
148 |                 lr_mapper["visual"] = self.args.vision_lr
149 |                 visual_parameters = [name for name, _ in opt_model.named_parameters() if "visual" in name and "merger" not in name]
150 |             if self.args.merger_lr is not None:
151 |                 lr_mapper["merger"] = self.args.merger_lr
152 |                 merger_parameters = [name for name, _ in opt_model.named_parameters() if "merger" in name]
153 |             if self.args.head_lr is not None:
154 |                 lr_mapper["score"] = self.args.head_lr
155 |                 head_parameters = [name for name, _ in opt_model.named_parameters() if "score" in name]
156 | 
157 |             if len(lr_mapper) > 0:
158 |                 special_lr_parameters = merger_parameters + visual_parameters + head_parameters
159 |                 
160 |                 optimizer_grouped_parameters = [
161 |                     {
162 |                         "params": [p for n, p in opt_model.named_parameters() if (n in decay_parameters and n not in special_lr_parameters and p.requires_grad)],
163 |                         "weight_decay": self.args.weight_decay,
164 |                     },
165 |                     {
166 |                         "params": [p for n, p in opt_model.named_parameters() if (n not in decay_parameters and n not in special_lr_parameters and p.requires_grad)],
167 |                         "weight_decay": 0.0,
168 |                     },
169 |                 ]
170 |                 
171 |                 if visual_parameters: 
172 |                     optimizer_grouped_parameters.extend(
173 |                         [
174 |                             {
175 |                                 "params": [p for n, p in opt_model.named_parameters() if (n in decay_parameters and n in visual_parameters and p.requires_grad)],
176 |                                 "weight_decay": self.args.weight_decay,
177 |                                 "lr": self.args.vision_lr,
178 |                                 "param_group_name": "visaul_decay"
179 |                             },
180 |                             {
181 |                                 "params": [p for n, p in opt_model.named_parameters() if (n not in decay_parameters and n in visual_parameters and p.requires_grad)],
182 |                                 "weight_decay": 0.0,
183 |                                 "lr": self.args.vision_lr,
184 |                                 "param_group_name": "visaul_non_decay"
185 |                             },
186 |                         ]
187 |                     )
188 |                 
189 |                 if merger_parameters: 
190 |                     optimizer_grouped_parameters.extend(
191 |                         [
192 |                             {
193 |                                 "params": [p for n, p in opt_model.named_parameters() if (n in decay_parameters and n in merger_parameters and p.requires_grad)],
194 |                                 "weight_decay": self.args.weight_decay,
195 |                                 "lr": self.args.merger_lr,
196 |                                 "param_group_name": "merger_decay",
197 |                             },
198 |                             {
199 |                                 "params": [p for n, p in opt_model.named_parameters() if (n not in decay_parameters and n in merger_parameters and p.requires_grad)],
200 |                                 "weight_decay": 0.0,
201 |                                 "lr": self.args.merger_lr,
202 |                                 "param_group_name": "merger_non_decay",
203 |                             },
204 |                         ]
205 |                     )
206 |                 if head_parameters:
207 |                     optimizer_grouped_parameters.extend(
208 |                         [
209 |                             {
210 |                                 "params": [p for n, p in opt_model.named_parameters() if (n in decay_parameters and n in head_parameters and p.requires_grad)],
211 |                                 "weight_decay": self.args.weight_decay,
212 |                                 "lr": self.args.head_lr,
213 |                                 "param_group_name": "head_decay",
214 |                             },
215 |                             {
216 |                                 "params": [p for n, p in opt_model.named_parameters() if (n not in decay_parameters and n in head_parameters and p.requires_grad)],
217 |                                 "weight_decay": 0.0,
218 |                                 "lr": self.args.head_lr,
219 |                                 "param_group_name": "head_non_decay",
220 |                             },
221 |                         ]
222 |                     )
223 |             else:
224 |                 optimizer_grouped_parameters = [
225 |                     {
226 |                         "params": [p for n, p in opt_model.named_parameters() if (n in decay_parameters and p.requires_grad)],
227 |                         "weight_decay": self.args.weight_decay,
228 |                     },
229 |                     {
230 |                         "params": [p for n, p in opt_model.named_parameters() if (n not in decay_parameters and p.requires_grad)],
231 |                         "weight_decay": 0.0,
232 |                     },
233 |                 ]
234 |             optimizer_cls, optimizer_kwargs = Trainer.get_optimizer_cls_and_kwargs(self.args)
235 | 
236 |             self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs)
237 |             if optimizer_cls.__name__ == "Adam8bit":
238 |                 import bitsandbytes
239 | 
240 |                 manager = bitsandbytes.optim.GlobalOptimManager.get_instance()
241 | 
242 |                 skipped = 0
243 |                 for module in opt_model.modules():
244 |                     if isinstance(module, nn.Embedding):
245 |                         skipped += sum({p.data_ptr(): p.numel() for p in module.parameters()}.values())
246 |                         logger.info(f"skipped {module}: {skipped/2**20}M params")
247 |                         manager.register_module_override(module, "weight", {"optim_bits": 32})
248 |                         logger.debug(f"bitsandbytes: will optimize {module} in fp32")
249 |                 logger.info(f"skipped: {skipped/2**20}M params")
250 | 
251 |         return self.optimizer
252 |     
253 |     def _save_checkpoint(self, model, trial):
254 |         # In all cases, including ddp/dp/deepspeed, self.model is always a reference to the model we
255 |         # want to save except FullyShardedDDP.
256 |         # assert unwrap_model(model) is self.model, "internal model should be a reference to self.model"
257 | 
258 |         # Save model checkpoint
259 |         super()._save_checkpoint(model, trial)
260 | 
261 |         if not self.args.lora_enable:
262 |             return
263 | 
264 |         checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}"
265 |         run_dir = self._get_output_dir(trial=trial)
266 |         output_dir = os.path.join(run_dir, checkpoint_folder)
267 | 
268 |         non_lora = get_peft_state_non_lora_maybe_zero_3(
269 |             self.model.named_parameters(),
270 |             require_grad_only=True,
271 |         )
272 | 
273 |         if self.args.should_save:
274 |             torch.save(non_lora, os.path.join(output_dir, "non_lora_state_dict.bin"))
275 |             self.model.base_model.config.to_json_file(os.path.join(output_dir, "config.json"))
276 | 
277 | 
278 |     # def training_step(self, model, inputs):
279 |     #     for name, param in model.named_parameters():
280 |     #         if 'visual' in name and param.requires_grad:
281 |     #             print(f"Training parameter {name}")
282 |     # 
283 |     #     return super().training_step(model, inputs)
284 | 


--------------------------------------------------------------------------------
/src/dataset/sft_dataset.py:
--------------------------------------------------------------------------------
  1 | import copy
  2 | import os
  3 | from typing import Dict
  4 | import torch
  5 | import transformers
  6 | import ujson as json
  7 | from torch.utils.data import Dataset
  8 | 
  9 | from src.params import DataArguments
 10 | from src.constants import (
 11 |     IGNORE_INDEX,
 12 |     DEFAULT_IM_START_TOKEN,
 13 |     DEFAULT_IM_END_TOKEN,
 14 |     DEFAULT_IMAGE_TOKEN,
 15 |     DEFAULT_VIDEO_TOKEN,
 16 |     SYSTEM_MESSAGE,
 17 | )
 18 | 
 19 | from .data_utils import get_image_info, get_video_info, llava_to_openai, pad_sequence
 20 | 
 21 | class SupervisedDataset(Dataset):
 22 |     """Dataset for supervised fine-tuning."""
 23 | 
 24 |     def __init__(
 25 |         self,
 26 |         data_path: str | list,
 27 |         processor: transformers.ProcessorMixin,
 28 |         data_args: DataArguments,
 29 |         model_id,
 30 |         padding=True,
 31 |     ):
 32 |         super(SupervisedDataset, self).__init__()
 33 |         if isinstance(data_path, str):
 34 |             list_data_dict = json.load(open(data_path, "r"))
 35 |         else:
 36 |             list_data_dict = data_path
 37 | 
 38 |         self.model_id = model_id
 39 |         self.processor = processor
 40 |         self.list_data_dict = list_data_dict
 41 |         self.data_args = data_args
 42 |         self.padding = padding
 43 |         self.image_min_pixel = data_args.image_min_pixels
 44 |         self.image_max_pixel = data_args.image_max_pixels
 45 |         self.video_min_pixel = data_args.video_min_pixels
 46 |         self.video_max_pixel = data_args.video_max_pixels
 47 |         self.image_resized_w = data_args.image_resized_width
 48 |         self.image_resized_h = data_args.image_resized_height
 49 |         self.video_resized_w = data_args.video_resized_width
 50 |         self.video_resized_h = data_args.video_resized_height
 51 |         self.fps = data_args.fps
 52 |         self.nframes = data_args.nframes
 53 | 
 54 |         if "Qwen3" in self.model_id:
 55 |             self.image_patch_size = 16
 56 |             self.return_video_metadata = True
 57 |         else:
 58 |             self.image_patch_size = 14
 59 |             self.return_video_metadata = False
 60 | 
 61 |     def __len__(self):
 62 |         return len(self.list_data_dict)
 63 | 
 64 |     def __getitem__(self, i) -> Dict[str, torch.Tensor]:
 65 |         sources = self.list_data_dict[i]
 66 | 
 67 |         is_video = False
 68 | 
 69 |         processor = self.processor
 70 |         if "image" in sources:
 71 |             videos = None
 72 |             grid_key = "image_grid_thw"
 73 |             pixel_key = "pixel_values"
 74 | 
 75 |             image_files = sources["image"]
 76 |             image_folder = self.data_args.image_folder
 77 | 
 78 |             if isinstance(image_files, str):
 79 |                 image_files = [image_files]
 80 | 
 81 |             images = []
 82 | 
 83 |             for image_file in image_files:
 84 |                 if not os.path.exists(image_file):
 85 |                     if not image_file.startswith("http"):
 86 |                         image_file = os.path.join(image_folder, image_file)
 87 |                 image_input = get_image_info(
 88 |                         image_file, 
 89 |                         self.image_min_pixel, 
 90 |                         self.image_max_pixel, 
 91 |                         self.image_resized_w, 
 92 |                         self.image_resized_h, 
 93 |                         self.image_patch_size
 94 |                     )
 95 |                 images.append(image_input)
 96 | 
 97 |         elif "video" in sources:
 98 |             is_video = True
 99 |             images=None
100 |             grid_key = "video_grid_thw"
101 |             pixel_key = "pixel_values_videos"
102 | 
103 |             video_files = sources["video"]
104 |             video_folder = self.data_args.image_folder
105 | 
106 |             if isinstance(video_files, str):
107 |                 video_files = [video_files]
108 | 
109 |             videos = []
110 |             for video_file in video_files:
111 |                 if not os.path.exists(video_file):
112 |                     if not video_file.startswith("http"):
113 |                         video_file = os.path.join(video_folder, video_file)
114 |                 video_input, video_kwargs = get_video_info(
115 |                     video_file, 
116 |                     self.video_min_pixel, 
117 |                     self.video_max_pixel, 
118 |                     self.video_resized_w, 
119 |                     self.video_resized_h, 
120 |                     self.data_args.fps,
121 |                     self.image_patch_size,
122 |                     return_video_metadata=self.return_video_metadata
123 |                 )
124 |                 videos.append(video_input)
125 |         else:
126 |             grid_key = None
127 |             pixel_key = None
128 |             images=None
129 |             videos=None
130 | 
131 |         sources = copy.deepcopy(llava_to_openai(sources['conversations'], is_video=is_video))
132 | 
133 |         all_input_ids = []
134 |         all_labels = []
135 |         all_pixel_values = []
136 |         all_image_grid_thw = []
137 |         all_second_gird = []
138 | 
139 |         image_curr_count = 0
140 |         video_curr_count = 0
141 |         
142 |         # Qwen2-VL uses a default system message so I've added this.
143 |         # Qwen3-Vl does not use a system message by default.
144 |         if len(SYSTEM_MESSAGE) > 0 and "Qwen3" not in self.model_id:
145 |             system_message = f"{DEFAULT_IM_START_TOKEN}system\n{SYSTEM_MESSAGE}{DEFAULT_IM_END_TOKEN}\n"
146 |             system_message_input_ids = processor.tokenizer(system_message, add_special_tokens=False, return_tensors='pt')['input_ids']
147 |             system_labels = torch.full_like(system_message_input_ids, IGNORE_INDEX)
148 | 
149 |             all_input_ids.append(system_message_input_ids.squeeze(0))
150 |             all_labels.append(system_labels.squeeze(0))
151 | 
152 |         for _, j in enumerate(range(0, len(sources), 2)):
153 |             user_input = sources[j]
154 |             gpt_response = sources[j + 1]
155 | 
156 |             user_input = f"{DEFAULT_IM_START_TOKEN}{user_input['role']}\n{user_input['content']}{DEFAULT_IM_END_TOKEN}\n{DEFAULT_IM_START_TOKEN}{gpt_response['role']}\n"
157 |             gpt_response = f"{gpt_response['content']}{DEFAULT_IM_END_TOKEN}\n"
158 | 
159 |             if DEFAULT_IMAGE_TOKEN in user_input:
160 |                 num_images = user_input.count(DEFAULT_IMAGE_TOKEN)
161 |                 # Slice the images list to get the images for the current turn.
162 |                 images_for_this_turn = images[image_curr_count : image_curr_count + num_images]
163 |                 inputs = processor(text=[user_input], images=images_for_this_turn, videos=videos, padding=False, do_resize=False, return_tensors='pt')
164 |                 prompt_input_ids = inputs['input_ids']
165 |                 all_pixel_values.append(inputs[pixel_key])
166 |                 all_image_grid_thw.append(inputs[grid_key])
167 |                 image_curr_count += num_images
168 | 
169 |             elif DEFAULT_VIDEO_TOKEN in user_input:
170 |                 num_videos = user_input.count(DEFAULT_VIDEO_TOKEN)
171 |                 # Slice the videos list to get the videos for the current turn.
172 |                 videos_for_this_turn = videos[video_curr_count : video_curr_count + num_videos]
173 |                 if "Qwen2.5" in self.model_id:
174 |                     inputs = processor(
175 |                         text=[user_input], 
176 |                         images=images, 
177 |                         videos=videos_for_this_turn, 
178 |                         padding=False, 
179 |                         do_resize=False, 
180 |                         return_tensors='pt', 
181 |                         **video_kwargs
182 |                     )
183 |                     all_second_gird.extend(inputs["second_per_grid_ts"])
184 |                 elif "Qwen3" in self.model_id:
185 | 
186 |                     videos_for_this_turn = videos[video_curr_count : video_curr_count + num_videos]
187 |                     video_datas_for_turn, video_metadatas_for_turn = zip(*videos_for_this_turn)
188 |                     video_datas_for_turn = list(video_datas_for_turn)
189 |                     video_metadatas_for_turn = list(video_metadatas_for_turn)
190 | 
191 |                     inputs = processor(
192 |                         text=[user_input],
193 |                         images=images,
194 |                         videos=video_datas_for_turn,
195 |                         padding=False,
196 |                         do_resize=False,
197 |                         return_tensors='pt',
198 |                         **video_kwargs,
199 |                         video_metadata=video_metadatas_for_turn,
200 |                     )
201 |                 else:
202 |                     inputs = processor(
203 |                         text=[user_input], 
204 |                         images=images, 
205 |                         videos=videos_for_this_turn, 
206 |                         padding=False, 
207 |                         do_resize=False, 
208 |                         return_tensors='pt'
209 |                     )
210 |                 prompt_input_ids = inputs['input_ids']
211 |                 all_pixel_values.append(inputs[pixel_key])
212 |                 all_image_grid_thw.append(inputs[grid_key])
213 |                 video_curr_count += num_videos
214 | 
215 |             else:
216 |                 prompt_input_ids = processor.tokenizer(user_input, add_special_tokens=False, padding=False, return_tensors='pt')['input_ids']
217 | 
218 |             response_input_ids = processor.tokenizer(gpt_response, add_special_tokens=False, padding=False, return_tensors='pt')['input_ids']
219 | 
220 |             input_ids = torch.cat([prompt_input_ids, response_input_ids], dim=1).squeeze(0)
221 |             labels = torch.cat(
222 |                 [
223 |                     torch.tensor([IGNORE_INDEX] * len(prompt_input_ids[0])),
224 |                     response_input_ids.squeeze(0),
225 |                 ],
226 |                 dim=0,
227 |             )
228 | 
229 |             all_input_ids.append(input_ids)
230 |             all_labels.append(labels)
231 | 
232 |         # There is no need for eos or bos tokens in the input_ids
233 |         # Qwen2-VL does not use them
234 |         input_ids = torch.cat(all_input_ids, dim=0).to(torch.long)
235 |         labels = torch.cat(all_labels, dim=0).to(torch.long)
236 | 
237 |         # eos_token_id = processor.tokenizer.convert_tokens_to_ids(DEFAULT_IM_END_TOKEN)
238 |         # input_ids, labels = truncate_sequence(input_ids, labels, self.max_length, eos_token_id)
239 | 
240 |         attention_mask = (input_ids > -1000000).to(torch.long)
241 | 
242 |         data_dict = dict(
243 |             input_ids=input_ids,
244 |             attention_mask=attention_mask,
245 |             labels=labels,
246 |         )
247 | 
248 |         if pixel_key and grid_key:
249 |             pixel_values = torch.cat(all_pixel_values, dim=0)
250 |             image_thw = torch.cat(all_image_grid_thw, dim=0)
251 |             data_dict[pixel_key] = pixel_values
252 |             data_dict[grid_key] = image_thw
253 | 
254 |         if len(all_second_gird) > 0:
255 |             second_gird = all_second_gird
256 |             data_dict["second_per_grid_ts"] = second_gird
257 | 
258 |         return data_dict
259 | 
class DataCollatorForSupervisedDataset(object):
    """Collate examples for supervised fine-tuning.

    Token ids and labels are right-padded to a common length; vision tensors
    are concatenated along dim 0 across the examples that carry them.
    """

    def __init__(self, pad_token_id: int):
        self.pad_token_id = pad_token_id

    def __call__(self, examples):
        """Merge a list of per-example dicts into one batch dict."""
        image_pixels, image_grids = [], []
        video_pixels, video_grids = [], []
        token_seqs, label_seqs = [], []
        seconds_per_grid = []

        for sample in examples:
            # A sample contributes either video tensors or image tensors;
            # video takes precedence when both keys are present.
            if "pixel_values_videos" in sample.keys():
                video_pixels.append(sample["pixel_values_videos"])
                video_grids.append(sample["video_grid_thw"])
            elif "pixel_values" in sample.keys():
                image_pixels.append(sample["pixel_values"])
                image_grids.append(sample["image_grid_thw"])

            token_seqs.append(sample["input_ids"])
            label_seqs.append(sample["labels"])

            if "second_per_grid_ts" in sample.keys():
                seconds_per_grid.extend(sample["second_per_grid_ts"])

        padded_ids = pad_sequence(
            token_seqs, padding_side='right', padding_value=self.pad_token_id
        )
        # Every position equal to the pad id is treated as padding.
        mask = padded_ids != self.pad_token_id
        padded_labels = pad_sequence(
            label_seqs, padding_side='right', padding_value=IGNORE_INDEX
        )

        batch = {
            'input_ids': padded_ids,
            'labels': padded_labels,
            'attention_mask': mask,
        }

        if image_pixels:
            batch["pixel_values"] = torch.cat(image_pixels, dim=0)
            batch["image_grid_thw"] = torch.cat(image_grids, dim=0)

        if video_pixels:
            batch["pixel_values_videos"] = torch.cat(video_pixels, dim=0)
            batch["video_grid_thw"] = torch.cat(video_grids, dim=0)

        if seconds_per_grid:
            batch["second_per_grid_ts"] = seconds_per_grid

        return batch
319 | 
def make_supervised_data_module(model_id, processor, data_args):
    """Build the train/eval datasets and the collator for supervised fine-tuning.

    The eval dataset is only created when ``data_args.eval_path`` is set;
    otherwise ``eval_dataset`` is ``None``.
    """
    common = dict(processor=processor, data_args=data_args, model_id=model_id)

    train_set = SupervisedDataset(data_path=data_args.data_path, **common)

    if data_args.eval_path is not None:
        eval_set = SupervisedDataset(data_path=data_args.eval_path, **common)
    else:
        eval_set = None

    collator = DataCollatorForSupervisedDataset(
        pad_token_id=processor.tokenizer.pad_token_id
    )

    return {
        "train_dataset": train_set,
        "eval_dataset": eval_set,
        "data_collator": collator,
    }
339 | 


--------------------------------------------------------------------------------
/src/train/train_dpo.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import torch
  3 | from peft import LoraConfig, get_peft_model
  4 | import ast
  5 | from transformers import (
  6 |     AutoProcessor,
  7 |     AutoConfig,
  8 |     BitsAndBytesConfig,
  9 |     Qwen2VLForConditionalGeneration, 
 10 |     HfArgumentParser, 
 11 |     Qwen2_5_VLForConditionalGeneration,
 12 |     Qwen3VLForConditionalGeneration,
 13 |     Qwen3VLMoeForConditionalGeneration
 14 | )
 15 | from src.trainer import QwenDPOTrainer
 16 | from src.dataset import make_dpo_data_module
 17 | from src.params import DataArguments, ModelArguments, DPOArguments
 18 | from train.train_utils import get_peft_state_maybe_zero_3, get_peft_state_non_lora_maybe_zero_3, safe_save_model_for_hf_trainer
 19 | import pathlib
 20 | from monkey_patch_forward import (
 21 |     replace_qwen2_5_with_mixed_modality_forward, 
 22 |     replace_qwen_2_with_mixed_modality_forward,
 23 |     replace_qwen3_with_mixed_modality_forward,
 24 |     replace_qwen3_vl_moe_with_mixed_modality_forward
 25 | )
 26 | from monkey_patch_vision import replace_qwen2_5_vision
 27 | 
 28 | local_rank = None
 29 | 
 30 | def rank0_print(*args):
 31 |     if local_rank == 0 or local_rank == '0' or local_rank is None:
 32 |         print(*args)
 33 | 
 34 | def find_target_linear_names(model, num_lora_modules=-1, lora_namespan_exclude=[], verbose=True):
 35 |     linear_cls = torch.nn.modules.Linear
 36 |     embedding_cls = torch.nn.modules.Embedding
 37 |     lora_module_names = []
 38 | 
 39 |     for name, module in model.named_modules():
 40 |         if any(ex_keyword in name for ex_keyword in lora_namespan_exclude):
 41 |             continue
 42 |         if isinstance(module, (linear_cls, embedding_cls)):
 43 |             lora_module_names.append(name)
 44 |     
 45 |     if num_lora_modules > 0:
 46 |         lora_module_names = lora_module_names[-num_lora_modules:]
 47 |     if verbose:
 48 |         rank0_print(f"Found {len(lora_module_names)} lora modules: {lora_module_names}")
 49 |     return lora_module_names
 50 | 
 51 | def set_requires_grad(parameters, requires_grad):
 52 |     for p in parameters:
 53 |         p.requires_grad = requires_grad
 54 | 
 55 | def configure_vision_tower(model, training_args, compute_dtype, device):
 56 |     vision_tower = model.visual
 57 |     vision_tower.to(dtype=compute_dtype, device=device)
 58 | 
 59 |     vision_model_params = model.visual.parameters()
 60 |     set_requires_grad(vision_model_params, not training_args.freeze_vision_tower)
 61 |     
 62 |     # Handle merger specifically
 63 |     merger_params = model.visual.merger.parameters()
 64 |     set_requires_grad(merger_params, not training_args.freeze_merger)
 65 | 
 66 |     if hasattr(model.visual, "deepstack_merger_list"):
 67 |         deepstack_merger_list_params = model.visual.deepstack_merger_list.parameters()
 68 |         set_requires_grad(deepstack_merger_list_params, not training_args.freeze_merger)
 69 | 
 70 | def configure_llm(model, training_args):
 71 |     lm_head = model.lm_head.parameters()
 72 |     set_requires_grad(lm_head, not training_args.freeze_llm)
 73 | 
 74 |     llm_params = model.language_model.parameters()
 75 |     set_requires_grad(llm_params, not training_args.freeze_llm)
 76 | 
 77 | def unfreeze_topk_layers(model, k_llm: int = 0, k_vis: int = 0):
 78 |     if k_llm and hasattr(model, "language_model") and hasattr(model.language_model, "layers"):
 79 |         for layer in model.language_model.layers[-k_llm:]:
 80 |             for p in layer.parameters():
 81 |                 p.requires_grad = True
 82 | 
 83 |     if k_vis and hasattr(model, "visual") and hasattr(model.visual, "blocks"):
 84 |         for blk in model.visual.blocks[-k_vis:]:
 85 |             for p in blk.parameters():
 86 |                 p.requires_grad = True
 87 | 
 88 | def train():
 89 |     global local_rank
 90 | 
 91 |     parser = HfArgumentParser(
 92 |         (ModelArguments, DataArguments, DPOArguments))
 93 |     
 94 |     model_args, data_args, training_args = parser.parse_args_into_dataclasses()
 95 |     
 96 |     if data_args.nframes is not None and data_args.fps is not None:
 97 |         raise ValueError("You cannot set both `nframes` and `fps` at the same time. Please set only one of them.")
 98 | 
 99 |     if training_args.lora_enable and not training_args.freeze_llm:
100 |         raise ValueError("If `lora_enable` is True, `freeze_llm` must also be True.")
101 | 
102 |     if not training_args.lora_enable:
103 |         assert not training_args.vision_lora, \
104 |             "Error: training_args.lora_enable is not enabled, but training_args.vision_lora is enabled."
105 |         
106 |     if training_args.vision_lora and not training_args.freeze_vision_tower:
107 |         raise ValueError("If `vision_lora` is True, `freeze_vision_tower` must also be True.")
108 | 
109 |     else:
110 |         if training_args.lora_namespan_exclude is not None:
111 |             training_args.lora_namespan_exclude = ast.literal_eval(training_args.lora_namespan_exclude)
112 |         else:
113 |             training_args.lora_namespan_exclude = []
114 | 
115 |         if not training_args.vision_lora:
116 |             training_args.lora_namespan_exclude += ["visual"]
117 | 
118 |     local_rank = training_args.local_rank
119 |     compute_dtype = (torch.float16 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32))
120 | 
121 |     bnb_model_from_pretrained_args = {}
122 |     if training_args.bits in [4,8]:
123 |         bnb_model_from_pretrained_args.update(dict(
124 |             device_map={"":training_args.device},
125 |             quantization_config = BitsAndBytesConfig(
126 |                 load_in_4bit=training_args.bits==4,
127 |                 load_in_8bit=training_args.bits==8,
128 |                 llm_int8_skip_modules=["visual", "lm_head"],
129 |                 llm_int8_threshold=6.0,
130 |                 llm_int8_has_fp16_weight=False,
131 |                 bnb_4bit_compute_dtype=compute_dtype,
132 |                 bnb_4bit_use_double_quant=training_args.double_quant,
133 |                 bnb_4bit_quant_type=training_args.quant_type,
134 |             )
135 |         ))
136 | 
137 |     ref_model = None
138 | 
139 |     config = AutoConfig.from_pretrained(model_args.model_id)
140 | 
141 |     if config.model_type == "qwen3_vl_moe":
142 |         replace_qwen3_vl_moe_with_mixed_modality_forward()
143 |         model = Qwen3VLMoeForConditionalGeneration.from_pretrained(
144 |             model_args.model_id,
145 |             dtype=compute_dtype,
146 |             attn_implementation="flash_attention_2" if not training_args.disable_flash_attn2 else "sdpa",
147 |             **bnb_model_from_pretrained_args
148 |         )
149 |         if not training_args.lora_enable:
150 |             ref_model = Qwen3VLMoeForConditionalGeneration.from_pretrained(
151 |                 model_args.model_id,
152 |                 dtype=compute_dtype,
153 |                 attn_implementation="flash_attention_2" if not training_args.disable_flash_attn2 else "sdpa",
154 |                 **bnb_model_from_pretrained_args
155 |             )
156 | 
157 |     elif config.model_type == "qwen3_vl":
158 |         replace_qwen3_with_mixed_modality_forward()
159 |         model = Qwen3VLForConditionalGeneration.from_pretrained(
160 |             model_args.model_id,
161 |             dtype=compute_dtype,
162 |             attn_implementation="flash_attention_2" if not training_args.disable_flash_attn2 else "sdpa",
163 |             **bnb_model_from_pretrained_args
164 |         )
165 |         if not training_args.lora_enable:
166 |             ref_model = Qwen3VLForConditionalGeneration.from_pretrained(
167 |                 model_args.model_id,
168 |                 dtype=compute_dtype,
169 |                 attn_implementation="flash_attention_2" if not training_args.disable_flash_attn2 else "sdpa",
170 |                 **bnb_model_from_pretrained_args
171 |             )
172 | 
173 |     elif config.model_type == "qwen2_5_vl":
174 |         replace_qwen2_5_with_mixed_modality_forward()
175 |         replace_qwen2_5_vision()
176 |         model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
177 |             model_args.model_id,
178 |             dtype=compute_dtype,
179 |             attn_implementation="flash_attention_2" if not training_args.disable_flash_attn2 else "sdpa", 
180 |             **bnb_model_from_pretrained_args
181 |         )
182 |         if not training_args.lora_enable:
183 |             ref_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
184 |                 model_args.model_id,
185 |                 dtype=compute_dtype,
186 |                 attn_implementation="flash_attention_2" if not training_args.disable_flash_attn2 else "sdpa", 
187 |                 **bnb_model_from_pretrained_args
188 |             )
189 |         
190 |     else:
191 |         replace_qwen_2_with_mixed_modality_forward()
192 |         model = Qwen2VLForConditionalGeneration.from_pretrained(
193 |             model_args.model_id,
194 |             dtype=compute_dtype,
195 |             attn_implementation="flash_attention_2" if not training_args.disable_flash_attn2 else "sdpa", 
196 |             **bnb_model_from_pretrained_args
197 |         )
198 |         if not training_args.lora_enable:
199 |             ref_model = Qwen2VLForConditionalGeneration.from_pretrained(
200 |                 model_args.model_id,
201 |                 dtype=compute_dtype,
202 |                 attn_implementation="flash_attention_2" if not training_args.disable_flash_attn2 else "sdpa", 
203 |                 **bnb_model_from_pretrained_args
204 |             )
205 | 
206 |     model.config.use_cache = False
207 |     model_to_configure = model
208 |     configure_llm(model_to_configure, training_args)
209 |     configure_vision_tower(model_to_configure, training_args, compute_dtype, training_args.device)
210 | 
211 |     unfreeze_topk_layers(
212 |         model_to_configure,
213 |         k_llm=getattr(training_args, "unfreeze_topk_llm", 0),
214 |         k_vis=getattr(training_args, "unfreeze_topk_vision", 0),
215 |     )
216 | 
217 |     if training_args.gradient_checkpointing:
218 |         if training_args.vision_lora:
219 |             training_args.gradient_checkpointing_kwargs = {"use_reentrant": False}
220 |         else:
221 |             training_args.gradient_checkpointing_kwargs = {"use_reentrant": True}
222 |         
223 |         model.enable_input_require_grads()
224 | 
225 |     if training_args.bits in [4,8]:
226 |         model.config.dtype = (torch.float32 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32))
227 |         from peft import prepare_model_for_kbit_training
228 |         model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=training_args.gradient_checkpointing, gradient_checkpointing_kwargs=training_args.gradient_checkpointing_kwargs)
229 | 
230 |     if training_args.lora_enable:
231 |         lora_namespan_exclude = training_args.lora_namespan_exclude
232 |         peft_config = LoraConfig(
233 |             r=training_args.lora_rank,
234 |             lora_alpha=training_args.lora_alpha,
235 |             target_modules=find_target_linear_names(model, lora_namespan_exclude=lora_namespan_exclude, num_lora_modules=training_args.num_lora_modules),
236 |             lora_dropout=training_args.lora_dropout,
237 |             bias=training_args.lora_bias
238 |         )
239 |         if training_args.bits == 16:
240 |             if training_args.bf16:
241 |                 model.to(torch.bfloat16)
242 |             if training_args.fp16:
243 |                 model.to(torch.float16)
244 |         rank0_print("Adding LoRA to the model...")
245 |         model = get_peft_model(model, peft_config)
246 | 
247 |         # Peft maodel makes vision tower and merger freezed again.
248 |         # Configuring fuction could be called here, but sometimes it does not work properly.
249 |         # So I just made it this way.
250 |         # Need to be fixed in the future.
251 | 
252 |         if not training_args.freeze_vision_tower:
253 |             for name, param in model.named_parameters():
254 |                 if "visual" in name:
255 |                     param.requires_grad = True
256 | 
257 |         if not training_args.freeze_merger:
258 |             for name, param in model.named_parameters():
259 |                 if "merger" in name:
260 |                     param.requires_grad = True
261 | 
262 |     processor = AutoProcessor.from_pretrained(model_args.model_id)
263 | 
264 |     # model.config.tokenizer_model_max_length = processor.tokenizer.model_max_length
265 | 
266 |     if ref_model is not None:
267 |         ref_model.eval()
268 |         ref_model.config.use_cache = False
269 | 
270 |     if training_args.bits in [4, 8]:
271 |         from peft.tuners.lora import LoraLayer
272 |         for name, module in model.named_modules():
273 |             if isinstance(module, LoraLayer):
274 |                 if training_args.bf16:
275 |                     module = module.to(torch.bfloat16)
276 |             if 'norm' in name:
277 |                 module = module.to(torch.float32)
278 |             
279 |             if 'lm_head' in name or 'embed_token' in name:
280 |                 if hasattr(module, 'weight'):
281 |                     if training_args.bf16 and module.weight.dtype == torch.float32:
282 |                         module = module.to(torch.bfloat16)
283 | 
284 |     dataset_module = make_dpo_data_module(model_id=model_args.model_id,
285 |                                               processor=processor,
286 |                                               data_args=data_args)
287 |     
288 |     training_args.padding_value = processor.tokenizer.pad_token_id
289 | 
290 |     trainer = QwenDPOTrainer(
291 |         model=model,
292 |         ref_model = ref_model,
293 |         train_dataset=dataset_module["train_dataset"],
294 |         eval_dataset = dataset_module["eval_dataset"],
295 |         data_collator= dataset_module["data_collator"],
296 |         processing_class=processor,
297 |         args=training_args,
298 |     )
299 | 
300 |     if list(pathlib.Path(training_args.output_dir).glob("checkpoint-*")):
301 |         trainer.train(resume_from_checkpoint=True)
302 |     else:
303 |         trainer.train()
304 | 
305 |     trainer.save_state()
306 | 
307 |     model.config.use_cache = True
308 |     
309 |     if training_args.lora_enable:
310 |         state_dict = get_peft_state_maybe_zero_3(
311 |             model.named_parameters(), training_args.lora_bias
312 |         )
313 | 
314 |         non_lora_state_dict = get_peft_state_non_lora_maybe_zero_3(
315 |             model.named_parameters(), require_grad_only=True
316 |         )
317 | 
318 |         if local_rank == 0 or local_rank == -1:
319 |             model.config.save_pretrained(training_args.output_dir)
320 |             model.save_pretrained(training_args.output_dir, state_dict=state_dict)
321 |             processor.save_pretrained(training_args.output_dir)
322 |             torch.save(non_lora_state_dict, os.path.join(training_args.output_dir, "non_lora_state_dict.bin"))
323 |     else:
324 |         safe_save_model_for_hf_trainer(trainer, output_dir=training_args.output_dir)
325 | 
326 | 
327 | 
# Run DPO training only when this file is executed as a script, not on import.
if __name__ == "__main__":
    train()


--------------------------------------------------------------------------------
/src/trainer/dpo_trainer.py:
--------------------------------------------------------------------------------
  1 | import os
  2 | import torch
  3 | from torch import nn
  4 | from pathlib import Path
  5 | import torch.nn.functional as F
  6 | from typing import Union
  7 | 
  8 | from transformers.trainer import (
  9 |     is_sagemaker_mp_enabled,
 10 |     get_parameter_names,
 11 |     TRAINER_STATE_NAME,
 12 |     PREFIX_CHECKPOINT_DIR,
 13 |     logger,
 14 |     ExportableState,
 15 |     SaveStrategy
 16 | )
 17 | from transformers.pytorch_utils import (
 18 |     ALL_LAYERNORM_LAYERS
 19 | )
 20 | from trl import DPOTrainer
 21 | from trl.trainer.utils import pad_to_length, flush_left, selective_log_softmax
 22 | from train.train_utils import get_peft_state_non_lora_maybe_zero_3
 23 | 
 24 | def maybe_zero_3(param, ignore_status=False, name=None):
 25 |     from deepspeed import zero
 26 |     from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus
 27 | 
 28 |     if hasattr(param, "ds_id"):
 29 |         if param.ds_status == ZeroParamStatus.NOT_AVAILABLE:
 30 |             if not ignore_status:
 31 |                 print(name, "no ignore status")
 32 |         with zero.GatheredParameters([param]):
 33 |             param = param.data.detach().cpu().clone()
 34 |     else:
 35 |         param = param.detach().cpu().clone()
 36 |     return param
 37 | 
 38 | class QwenDPOTrainer(DPOTrainer):
 39 | 
 40 |     def __init__(self, *args, **kwargs):
 41 |         super(QwenDPOTrainer, self).__init__(*args, **kwargs)
 42 | 
 43 |     def _prepare_dataset(
 44 |         self,
 45 |         dataset,
 46 |         processing_class,
 47 |         args,
 48 |         dataset_name
 49 |     ):
 50 |         return dataset
 51 | 
 52 |     @staticmethod
 53 |     def concatenated_inputs(
 54 |         batch: dict[str, Union[list, torch.LongTensor]], padding_value: int
 55 |     ) -> dict[str, torch.LongTensor]:
 56 | 
 57 |         concatenated_batch = {}
 58 | 
 59 |         concatenated_batch['prompt_input_ids'] = torch.cat([batch["prompt_input_ids"], batch["prompt_input_ids"]], dim=0)
 60 |         concatenated_batch['prompt_attention_mask'] = torch.cat([batch["prompt_attention_mask"], batch["prompt_attention_mask"]], dim=0)
 61 | 
 62 |         if 'pixel_values' in batch:
 63 |             concatenated_batch['pixel_values'] = torch.cat([batch["pixel_values"], batch["pixel_values"]], dim=0)
 64 |             concatenated_batch['image_grid_thw'] = torch.cat([batch["image_grid_thw"], batch["image_grid_thw"]], dim=0)
 65 | 
 66 |         if 'pixel_values_videos' in batch:
 67 |             concatenated_batch['pixel_values_videos'] = torch.cat(
 68 |                 [batch["pixel_values_videos"], batch["pixel_values_videos"]], dim=0
 69 |             )
 70 |             concatenated_batch['video_grid_thw'] = torch.cat(
 71 |                 [batch["video_grid_thw"], batch["video_grid_thw"]], dim=0
 72 |             )
 73 | 
 74 |         if 'second_grid_ts' in batch:
 75 |             concatenated_batch['second_grid_ts'] = torch.cat(
 76 |                 [batch["second_grid_ts"], batch["second_grid_ts"]], dim=0
 77 |             )
 78 | 
 79 |         max_completion_length = max(batch["chosen_input_ids"].shape[1], batch["rejected_input_ids"].shape[1])
 80 | 
 81 |         concatenated_batch['completion_input_ids'] = torch.cat(
 82 |             (
 83 |                 pad_to_length(batch["chosen_input_ids"], max_completion_length, pad_value=padding_value),
 84 |                 pad_to_length(batch["rejected_input_ids"], max_completion_length, pad_value=padding_value),
 85 |             ),
 86 |         )
 87 | 
 88 |         concatenated_batch['completion_attention_mask'] = torch.cat(
 89 |             (
 90 |                 pad_to_length(batch["chosen_attention_mask"], max_completion_length, pad_value=0),
 91 |                 pad_to_length(batch["rejected_attention_mask"], max_completion_length, pad_value=0),
 92 |             ),
 93 |         )
 94 | 
 95 |         return concatenated_batch
 96 |     
 97 | 
    def concatenated_forward(self, model, batch, is_ref_model:bool=False):
        """Run ``model`` once on the chosen and rejected sequences and collect log-prob statistics.

        The batch is first expanded by ``concatenated_inputs`` so that the first
        ``num_examples`` rows are the chosen examples and the remaining rows are
        the rejected ones. Prompt and completion are joined along the sequence
        dimension and passed through ``model`` in a single call.

        Args:
            model: Callable invoked as ``model(input_ids, **model_kwargs)``; its
                result must expose ``.logits`` (and ``.aux_loss`` when
                ``self.aux_loss_enabled`` is set).
            batch: Dict consumed by ``concatenated_inputs``.
            is_ref_model: Not read anywhere in this method body.

        Returns:
            Dict with ``chosen_logps``, ``rejected_logps``,
            ``mean_chosen_logits`` and ``mean_rejected_logits``; plus
            ``policy_weights`` when ``self.use_weighting`` is set, ``nll_loss``
            when ``self.args.rpo_alpha`` is not None, and ``aux_loss`` when
            ``self.aux_loss_enabled`` is set.
        """

        # Number of preference pairs; the concatenated batch has twice this many rows.
        num_examples = batch['prompt_input_ids'].shape[0]
        
        concatenated_batch = self.concatenated_inputs(batch, padding_value=self.padding_value)

        model_kwargs = {}

        if self.aux_loss_enabled:
            model_kwargs['output_router_logits'] = True

        # Add image/video values to model kwargs
        if 'pixel_values' in batch:
            model_kwargs['pixel_values'] = concatenated_batch['pixel_values']
            model_kwargs['image_grid_thw'] = concatenated_batch['image_grid_thw']
        if 'pixel_values_videos' in batch:
            model_kwargs['pixel_values_videos'] = concatenated_batch['pixel_values_videos']
            model_kwargs['video_grid_thw'] = concatenated_batch['video_grid_thw']
        # NOTE(review): forwarded to the model under the name 'second_grid_ts' —
        # confirm the model's forward() accepts a keyword with exactly this name.
        if 'second_grid_ts' in batch:
            model_kwargs['second_grid_ts'] = concatenated_batch['second_grid_ts']

        prompt_input_ids = concatenated_batch["prompt_input_ids"]
        prompt_attention_mask = concatenated_batch["prompt_attention_mask"]
        completion_input_ids = concatenated_batch["completion_input_ids"]
        completion_attention_mask = concatenated_batch["completion_attention_mask"]
        
        # Join prompt and completion along the sequence dimension.
        input_ids = torch.cat((prompt_input_ids, completion_input_ids), dim=1)
        attention_mask = torch.cat((prompt_attention_mask, completion_attention_mask), dim=1)
        # Loss mask is zero over the whole prompt and follows the completion
        # attention mask afterwards, so only completion tokens are scored.
        loss_mask = torch.cat(
            (torch.zeros_like(prompt_attention_mask), completion_attention_mask), dim=1
        )

        # Flush left to reduce the memory usage
        # [[0, 0, x, x, x, x],  ->  [[x, x, x, x],
        #  [0, x, x, x, 0, 0]]       [x, x, x, 0]]
        attention_mask, input_ids, loss_mask = flush_left(attention_mask, input_ids, loss_mask)

        model_kwargs["attention_mask"] = attention_mask

        outputs = model(input_ids, **model_kwargs)
        logits = outputs.logits

        # Shift left by one so that position i holds the token at position i + 1
        # (the next-token target); the mask is shifted the same way. The roll
        # wraps around, so the last position receives the first token/mask value.
        labels = torch.roll(input_ids, shifts=-1, dims=1)
        loss_mask = torch.roll(loss_mask, shifts=-1, dims=1).bool()

        if logits.shape[:2] != labels.shape[:2]:
            # for llava, the returned logits include the image tokens (placed before the text tokens)
            seq_len = labels.shape[1]
            logits = logits[:, -seq_len:]

        # Compute the log probabilities of the labels
        labels[~loss_mask] = 0  # dummy token; we'll ignore the losses on these tokens later
        # selective_log_softmax is an imported helper; presumably it returns the
        # log-softmax of `logits` evaluated at `labels` — confirm against its docs.
        per_token_logps = selective_log_softmax(logits, labels)
        per_token_logps[~loss_mask] = 0
        # Shift back right by one so each value sits at the position of the token it scores.
        per_token_logps = torch.roll(per_token_logps, shifts=1, dims=1)

        # NOTE(review): the sum includes index 0, which holds the wrapped-around
        # entry from the roll above — presumably always 0 because the first
        # token belongs to the prompt; confirm for empty prompts.
        all_logps = per_token_logps.sum(-1)

        output = {}

        if self.use_weighting:
            with torch.no_grad():
                # Eq (2) of the WPO paper: https://huggingface.co/papers/2406.11827
                logprobs = F.log_softmax(logits, dim=-1)
                weights_adjustment_factor = torch.logsumexp(2 * logprobs, dim=-1)  # same as sum(probs**2) in log space
                per_token_logps_adjusted = per_token_logps - weights_adjustment_factor
                all_weights = (per_token_logps_adjusted * loss_mask).sum(-1) / loss_mask.sum(-1)
                # First half of the rows are the chosen examples, second half the rejected ones.
                chosen_weights = all_weights[:num_examples]
                rejected_weights = all_weights[num_examples:]
                output["policy_weights"] = torch.clamp(torch.exp(chosen_weights + rejected_weights), max=1)

        if self.args.rpo_alpha is not None:
            # Only use the chosen logits for the RPO loss
            chosen_logits = logits[:num_examples]
            chosen_labels = labels[:num_examples]

            # Compute the log probabilities of the labels
            # ignore_index=0 matches the dummy label written into masked positions above.
            output["nll_loss"] = F.cross_entropy(
                torch.flatten(chosen_logits, end_dim=1), torch.flatten(chosen_labels, end_dim=1), ignore_index=0
            )

        if "ipo" in self.loss_type:
            # For IPO the summed log-probs are divided by the number of scored tokens.
            all_logps = all_logps / loss_mask.sum(-1)

        output["chosen_logps"] = all_logps[:num_examples]
        output["rejected_logps"] = all_logps[num_examples:]
        # Mean over every logit value at the positions selected by the loss mask.
        output["mean_chosen_logits"] = logits[:num_examples][loss_mask[:num_examples]].mean()
        output["mean_rejected_logits"] = logits[num_examples:][loss_mask[num_examples:]].mean()

        if self.aux_loss_enabled:
            output["aux_loss"] = outputs.aux_loss

        return output
191 | 
192 |     def create_optimizer(self):
193 |         """
194 |         Setup the optimizer.
195 |         We provide a reasonable default that works well. If you want to use something else, you can pass a tuple in the
196 |         Trainer's init through `optimizers`, or subclass and override this method in a subclass.
197 |         """
198 |         if is_sagemaker_mp_enabled():
199 |             return super().create_optimizer()
200 |         
201 |         opt_model = self.model
202 | 
203 |         if self.optimizer is None:
204 |             decay_parameters = get_parameter_names(opt_model, ALL_LAYERNORM_LAYERS)
205 |             decay_parameters = [name for name in decay_parameters if "bias" not in name]
206 |             lr_mapper = {}
207 |             visual_parameters = []
208 |             merger_parameters = []
209 | 
210 |             if self.args.vision_lr is not None:
211 |                 lr_mapper["visual"] = self.args.vision_lr
212 |                 visual_parameters = [name for name, _ in opt_model.named_parameters() if "visual" in name and "merger" not in name]
213 |             if self.args.merger_lr is not None:
214 |                 lr_mapper["merger"] = self.args.merger_lr
215 |                 merger_parameters = [name for name, _ in opt_model.named_parameters() if "merger" in name]
216 | 
217 |             if len(lr_mapper) > 0:
218 |                 special_lr_parameters = merger_parameters + visual_parameters
219 |                 
220 |                 optimizer_grouped_parameters = [
221 |                     {
222 |                         "params": [p for n, p in opt_model.named_parameters() if (n in decay_parameters and n not in special_lr_parameters and p.requires_grad)],
223 |                         "weight_decay": self.args.weight_decay,
224 |                     },
225 |                     {
226 |                         "params": [p for n, p in opt_model.named_parameters() if (n not in decay_parameters and n not in special_lr_parameters and p.requires_grad)],
227 |                         "weight_decay": 0.0,
228 |                     },
229 |                 ]
230 |                 
231 |                 if visual_parameters: 
232 |                     optimizer_grouped_parameters.extend(
233 |                         [
234 |                             {
235 |                                 "params": [p for n, p in opt_model.named_parameters() if (n in decay_parameters and n in visual_parameters and p.requires_grad)],
236 |                                 "weight_decay": self.args.weight_decay,
237 |                                 "lr": self.args.vision_lr,
238 |                                 "param_group_name": "visaul_decay"
239 |                             },
240 |                             {
241 |                                 "params": [p for n, p in opt_model.named_parameters() if (n not in decay_parameters and n in visual_parameters and p.requires_grad)],
242 |                                 "weight_decay": 0.0,
243 |                                 "lr": self.args.vision_lr,
244 |                                 "param_group_name": "visaul_non_decay"
245 |                             },
246 |                         ]
247 |                     )
248 |                 
249 |                 if merger_parameters: 
250 |                     optimizer_grouped_parameters.extend(
251 |                         [
252 |                             {
253 |                                 "params": [p for n, p in opt_model.named_parameters() if (n in decay_parameters and n in merger_parameters and p.requires_grad)],
254 |                                 "weight_decay": self.args.weight_decay,
255 |                                 "lr": self.args.merger_lr,
256 |                                 "param_group_name": "merger_decay",
257 |                             },
258 |                             {
259 |                                 "params": [p for n, p in opt_model.named_parameters() if (n not in decay_parameters and n in merger_parameters and p.requires_grad)],
260 |                                 "weight_decay": 0.0,
261 |                                 "lr": self.args.merger_lr,
262 |                                 "param_group_name": "merger_non_decay",
263 |                             },
264 |                         ]
265 |                     )
266 |             else:
267 |                 optimizer_grouped_parameters = [
268 |                     {
269 |                         "params": [p for n, p in opt_model.named_parameters() if (n in decay_parameters and p.requires_grad)],
270 |                         "weight_decay": self.args.weight_decay,
271 |                     },
272 |                     {
273 |                         "params": [p for n, p in opt_model.named_parameters() if (n not in decay_parameters and p.requires_grad)],
274 |                         "weight_decay": 0.0,
275 |                     },
276 |                 ]
277 |             optimizer_cls, optimizer_kwargs = self.get_optimizer_cls_and_kwargs(self.args)
278 | 
279 |             self.optimizer = optimizer_cls(optimizer_grouped_parameters, **optimizer_kwargs)
280 |             if optimizer_cls.__name__ == "Adam8bit":
281 |                 import bitsandbytes
282 | 
283 |                 manager = bitsandbytes.optim.GlobalOptimManager.get_instance()
284 | 
285 |                 skipped = 0
286 |                 for module in opt_model.modules():
287 |                     if isinstance(module, nn.Embedding):
288 |                         skipped += sum({p.data_ptr(): p.numel() for p in module.parameters()}.values())
289 |                         logger.info(f"skipped {module}: {skipped/2**20}M params")
290 |                         manager.register_module_override(module, "weight", {"optim_bits": 32})
291 |                         logger.debug(f"bitsandbytes: will optimize {module} in fp32")
292 |                 logger.info(f"skipped: {skipped/2**20}M params")
293 | 
294 |         return self.optimizer
295 | 
296 | 
297 |     def _save_checkpoint(self, model, trial):
298 |         super()._save_checkpoint(model, trial)
299 | 
300 |         if not self.args.lora_enable:
301 |             return
302 | 
303 |         checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}"
304 |         run_dir = self._get_output_dir(trial=trial)
305 |         output_dir = os.path.join(run_dir, checkpoint_folder)
306 | 
307 |         non_lora = get_peft_state_non_lora_maybe_zero_3(
308 |             self.model.named_parameters(),
309 |             require_grad_only=True,
310 |         )
311 | 
312 |         if self.args.should_save:
313 |             torch.save(non_lora, os.path.join(output_dir, "non_lora_state_dict.bin"))
314 |             self.model.base_model.config.to_json_file(os.path.join(output_dir, "config.json"))
315 | 


--------------------------------------------------------------------------------