├── .gitattributes
├── projects
│   ├── trl-ppo-fine-tuning
│   │   ├── requirements.txt
│   │   ├── README.md
│   │   ├── test_base_vs_lora.py
│   │   └── train.py
│   ├── financial-reasoning-enhanced
│   │   ├── requirements.txt
│   │   ├── QUICK_START.md
│   │   ├── README.md
│   │   └── financial_reasoning_enhanced.py
│   └── vllm-fine-tuning-smolvlm
│       ├── requirements.txt
│       ├── config_example.json
│       ├── QUICK_START.md
│       ├── README.md
│       ├── test_smolvlm_chartqa.py
│       └── train_smolvlm_chartqa.py
├── LICENSE
└── README.md
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 | 
--------------------------------------------------------------------------------
/projects/trl-ppo-fine-tuning/requirements.txt:
--------------------------------------------------------------------------------
1 | # Core dependencies for TRL
2 | trl
3 | torch
4 | transformers
5 | datasets
6 | accelerate
--------------------------------------------------------------------------------
/projects/financial-reasoning-enhanced/requirements.txt:
--------------------------------------------------------------------------------
1 | # Core dependencies for Financial Reasoning Enhanced
2 | torch>=2.0.0
3 | transformers>=4.47.0
4 | trl>=0.14.0
5 | datasets>=3.2.0
6 | accelerate>=0.20.0
7 | 
8 | # Unsloth for efficient fine-tuning
9 | unsloth
10 | 
11 | # Financial analysis and sentiment
12 | sentence-transformers
13 | 
14 | # Optional visualization dependencies
15 | matplotlib
16 | seaborn
17 | 
18 | # Utilities
19 | numpy
20 | scipy
21 | scikit-learn
22 | 
23 | # Optional: 4-bit quantization (handled by Unsloth)
24 | # bitsandbytes
--------------------------------------------------------------------------------
/projects/vllm-fine-tuning-smolvlm/requirements.txt:
--------------------------------------------------------------------------------
1 | # Core dependencies for SmolVLM-256M ChartQA Fine-Tuning
2 | torch>=2.0.0
3 | transformers>=4.40.0
4 | datasets>=2.18.0
5 | trl>=0.14.0
6 | peft>=0.10.0
7 | 
8 | # Image processing
9 | Pillow>=9.0.0
10 | requests>=2.25.0
11 | 
12 | # Optional: For visualization (not required for core functionality)
13 | matplotlib>=3.5.0
14 | seaborn>=0.11.0
15 | 
16 | # Utilities
17 | numpy>=1.21.0
18 | pandas>=1.3.0
19 | 
20 | # Optional: For advanced features
21 | accelerate>=0.20.0
22 | bitsandbytes>=0.41.0  # For 4-bit quantization (optional)
23 | 
24 | # Development dependencies (optional)
25 | pytest>=7.0.0
26 | black>=22.0.0
27 | isort>=5.10.0
28 | flake8>=4.0.0
29 | 
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 | 
3 | Copyright (c) 2025 Pavan Kunchala
4 | 
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /projects/vllm-fine-tuning-smolvlm/config_example.json: -------------------------------------------------------------------------------- 1 | { 2 | "comment": "Example configuration for SmolVLM-256M ChartQA fine-tuning", 3 | "model": { 4 | "base_model": "HuggingFaceTB/SmolVLM-256M-Instruct", 5 | "output_dir": "smolvlm-256m-chartqa-sft" 6 | }, 7 | "training": { 8 | "batch_size": 16, 9 | "learning_rate": 0.001, 10 | "epochs": 2, 11 | "max_steps": 500, 12 | "gradient_accumulation": 2, 13 | "memory_limit_gb": 14.0 14 | }, 15 | "lora": { 16 | "rank": 16, 17 | "alpha": 32, 18 | "dropout": 0.05 19 | }, 20 | "precision": { 21 | "mixed_precision": "bf16", 22 | "gradient_checkpointing": false 23 | }, 24 | "data": { 25 | "dataset": "HuggingFaceM4/ChartQA", 26 | "train_split": "[:80%]", 27 | "val_split": "[:20%]" 28 | }, 29 | "presets": { 30 | "high_performance_16gb": { 31 | "batch_size": 16, 32 | "memory_limit_gb": 14.0, 33 | "lora_rank": 16, 34 | "lora_alpha": 32 35 | }, 36 | "balanced_12gb": { 37 | "batch_size": 12, 38 | "memory_limit_gb": 10.0, 39 | "lora_rank": 16, 40 | "lora_alpha": 32 41 | }, 42 | "conservative_8gb": { 43 | "batch_size": 8, 44 | "memory_limit_gb": 6.0, 45 | "lora_rank": 12, 46 | "lora_alpha": 24, 47 | "gradient_checkpointing": true 48 | }, 49 | "maximum_accuracy": { 50 | "batch_size": 12, 51 | "learning_rate": 0.0005, 52 | "epochs": 3, 53 | "max_steps": 750, 54 | "lora_rank": 24, 55 | "lora_alpha": 48, 56 | "memory_limit_gb": 12.0 57 | } 58 | } 59 | } 60 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Reinforcement Learning Learnings 2 | 3 | This repository is a collection of my personal projects and experiments in the field of Reinforcement Learning (RL), particularly focusing on techniques involving language models and fine-tuning approaches, including vision-language models and memory-efficient training strategies. 4 | 5 | ## Projects 6 | 7 | The projects are organized into subdirectories within the `projects/` folder. Each project is self-contained with its own README and dependency list. 8 | 9 | - **[TRL PPO Fine-Tuning](./projects/trl-ppo-fine-tuning/)**: An example of fine-tuning a language model using Proximal Policy Optimization (PPO) with the `trl` library. 10 | - **[Financial Reasoning Enhanced](./projects/financial-reasoning-enhanced/)**: Advanced fine-tuning pipeline combining SFT and GRPO with multi-level reward functions for financial reasoning tasks. 11 | - **[vLLM Fine-Tuning SmolVLM](./projects/vllm-fine-tuning-smolvlm/)**: Ultra-efficient fine-tuning of SmolVLM-256M on ChartQA using lazy loading and streaming for maximum memory efficiency on consumer GPUs. 
12 | 
13 | ## Approaches Covered
14 | 
15 | - **PPO (Proximal Policy Optimization)**: Traditional reinforcement learning for language model fine-tuning
16 | - **GRPO (Group Relative Policy Optimization)**: Advanced RLHF technique with group-based sampling
17 | - **SFT + RL Hybrid**: Supervised fine-tuning followed by reinforcement learning for optimal performance
18 | - **Vision-Language Fine-Tuning**: Efficient fine-tuning of multimodal models for chart understanding and visual reasoning tasks
19 | - **Memory-Efficient Training**: Lazy loading and streaming techniques for training large models on limited hardware
20 | - **Multi-level Rewards**: Sophisticated reward functions combining format compliance, reasoning quality, and domain expertise
21 | 
22 | ---
23 | 
24 | *This repository is actively being updated.*
--------------------------------------------------------------------------------
/projects/financial-reasoning-enhanced/QUICK_START.md:
--------------------------------------------------------------------------------
1 | # Quick Start Guide - Financial Reasoning Enhanced
2 | 
3 | ## 🚀 Get Running in 5 Minutes
4 | 
5 | ### 1. Install Dependencies
6 | ```bash
7 | pip install -r requirements.txt
8 | ```
9 | 
10 | ### 2. Basic Training (Uses Built-in Data)
11 | ```bash
12 | python financial_reasoning_enhanced.py \
13 |     --base-model "unsloth/gemma-3-270m-it" \
14 |     --output-dir "./my_financial_model" \
15 |     --use-4bit
16 | ```
17 | 
18 | ### 3. Test Your Model
19 | After training, the script automatically runs sanity checks and shows sample outputs.
20 | 
21 | ## 🎯 What This Does
22 | 
23 | - **Phase 1**: SFT training on financial reasoning examples
24 | - **Phase 2**: GRPO reinforcement learning with sophisticated rewards
25 | - **Output**: Model that generates structured financial analysis with `<reasoning>`, `<sentiment>`, and `<confidence>` tags
26 | 
27 | ## 📊 Sample Output
28 | 
29 | **Input**: "Tech company reports 25% revenue growth but 8% profit decline due to R&D investment"
30 | 
31 | **Expected Output**:
32 | ```
33 | <reasoning>Revenue growth is strong, reflecting demand and expansion. Profit dipped due to R&D, which is strategic with long-term upside. Near-term margins compress, but growth story remains intact.</reasoning>
34 | <sentiment>positive</sentiment>
35 | <confidence>0.75</confidence>
36 | ```
37 | 
38 | ## ⚡ Quick Customization
39 | 
40 | - **Change model**: `--base-model "your/model"`
41 | - **Adjust training**: `--sft-epochs 5 --grpo-epochs 3.0`
42 | - **Use custom data**: `--train-jsonl "./your_data.jsonl"`
43 | 
44 | ## 🔧 Troubleshooting
45 | 
46 | - **Memory issues**: Reduce batch sizes or ensure `--use-4bit` is set
47 | - **Slow training**: Start with smaller models or reduce epochs
48 | - **Poor output**: Check that your data follows the expected format
49 | 
50 | ## 📚 Next Steps
51 | 
52 | - Read the full [README.md](README.md) for detailed configuration options
53 | - Check the [main repository](../../README.md) for other RL approaches
54 | - Experiment with different reward function weights and training parameters
55 | 
56 | ---
57 | 
58 | **Need help?** Check the main README or experiment with the built-in synthetic data first!
59 | 
--------------------------------------------------------------------------------
/projects/vllm-fine-tuning-smolvlm/QUICK_START.md:
--------------------------------------------------------------------------------
1 | # Quick Start: SmolVLM-256M ChartQA Fine-Tuning
2 | 
3 | ## 🚀 Get Training in 5 Minutes
4 | 
5 | ### 1.
Navigate to the Project 6 | ```bash 7 | cd Reinforcement-learning-with-verifable-rewards-Learnings/projects/vllm-fine-tuning-smolvlm 8 | ``` 9 | 10 | ### 2. Install Dependencies 11 | ```bash 12 | pip install -r requirements.txt 13 | ``` 14 | 15 | ### 3. Start Training (Optimized for 16GB GPU) 16 | ```bash 17 | python train_smolvlm_chartqa.py 18 | ``` 19 | 20 | **That's it!** The script will: 21 | - ✅ Download SmolVLM-256M model 22 | - ✅ Load ChartQA dataset with lazy loading 23 | - ✅ Fine-tune with optimized parameters 24 | - ✅ Save model to `./smolvlm-256m-chartqa-sft/` 25 | 26 | ## 📊 Expected Output 27 | 28 | ``` 29 | 🚀 High-Performance Training Configuration: 30 | • Batch Size: 16 31 | • Learning Rate: 0.001 32 | • Epochs: 2 33 | • LoRA Rank (r): 16 34 | • Memory Target: 14.0GB 35 | 36 | 🚀 Starting HIGH-PERFORMANCE SmolVLM-256M training... 37 | ✅ Training completed successfully! 38 | 💾 Saving final model to smolvlm-256m-chartqa-sft 39 | ``` 40 | 41 | ## 🧪 Test Your Model 42 | 43 | ### Quick Test (10 samples) 44 | ```bash 45 | python test_smolvlm_chartqa.py 46 | ``` 47 | 48 | ### Extended Test (50 samples) 49 | ```bash 50 | python test_smolvlm_chartqa.py --num_samples 50 51 | ``` 52 | 53 | ## ⚡ Performance Tips 54 | 55 | ### For Different GPU Sizes 56 | 57 | #### 16GB GPU (Recommended) 58 | ```bash 59 | python train_smolvlm_chartqa.py --memory_limit 14.0 --batch_size 16 60 | ``` 61 | 62 | #### 12GB GPU 63 | ```bash 64 | python train_smolvlm_chartqa.py --memory_limit 10.0 --batch_size 12 65 | ``` 66 | 67 | #### 8GB GPU 68 | ```bash 69 | python train_smolvlm_chartqa.py --memory_limit 6.0 --batch_size 8 70 | ``` 71 | 72 | ### For Better Accuracy 73 | 74 | #### High-Capacity LoRA 75 | ```bash 76 | python train_smolvlm_chartqa.py --lora_r 24 --lora_alpha 48 --epochs 3 77 | ``` 78 | 79 | #### Longer Training 80 | ```bash 81 | python train_smolvlm_chartqa.py --max_steps 750 --learning_rate 0.001 82 | ``` 83 | 84 | ## 🎯 What You Get 85 | 86 | After training, you'll have: 87 | - **Fine-tuned SmolVLM-256M** model in `./smolvlm-256m-chartqa-sft/` 88 | - **Test results** in `./smolvlm_256m_test_output/` 89 | - **Performance metrics** and sample predictions 90 | - **Ready-to-use model** for chart understanding tasks 91 | 92 | ## 📈 Expected Performance 93 | 94 | | Metric | Expected | Actual (Recent Test) | 95 | |--------|----------|---------------------| 96 | | Training Time | 15-25 min | ~12 min | 97 | | GPU Memory | <2GB | ~0.5GB | 98 | | Test Accuracy | 40-60% | 40% (4/10) | 99 | | Model Size | 256M params | 256M params | 100 | 101 | ## 🔧 Troubleshooting 102 | 103 | ### Common Issues & Solutions 104 | 105 | #### Memory Errors 106 | ```bash 107 | # Reduce batch size and memory limit 108 | python train_smolvlm_chartqa.py --batch_size 8 --memory_limit 6.0 109 | ``` 110 | 111 | #### Slow Training 112 | ```bash 113 | # Use higher learning rate and larger batch size 114 | python train_smolvlm_chartqa.py --learning_rate 0.002 --batch_size 20 115 | ``` 116 | 117 | #### Poor Performance 118 | ```bash 119 | # Increase LoRA capacity and training time 120 | python train_smolvlm_chartqa.py --lora_r 24 --lora_alpha 48 --epochs 3 --max_steps 750 121 | ``` 122 | 123 | ## 📚 Next Steps 124 | 125 | 1. **Read the full [README.md](README.md)** for detailed configuration options 126 | 2. **Experiment with different LoRA settings** for your specific use case 127 | 3. **Check the [main repository](../../README.md)** for other RL approaches 128 | 4. **Share your results** and contribute improvements! 
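## 🔍 Load the Model in Python (Optional)

If you want to try the fine-tuned model outside the bundled test script, the sketch below mirrors what `test_smolvlm_chartqa.py` does: load the base model, apply the LoRA adapter from the default output directory, merge it, and ask a question about a chart. `my_chart.png` is a placeholder for any local chart image.

```python
import torch
from PIL import Image
from peft import PeftModel
from transformers import AutoProcessor, Idefics3ForConditionalGeneration

BASE = "HuggingFaceTB/SmolVLM-256M-Instruct"
ADAPTER = "./smolvlm-256m-chartqa-sft"  # default output dir from training

processor = AutoProcessor.from_pretrained(BASE)
model = Idefics3ForConditionalGeneration.from_pretrained(BASE)
# Fold the LoRA weights into the base model for plain inference.
model = PeftModel.from_pretrained(model, ADAPTER).merge_and_unload()
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device).eval()

image = Image.open("my_chart.png").convert("RGB")  # placeholder image path
messages = [{"role": "user", "content": [
    {"type": "image", "image": image},
    {"type": "text", "text": "What is the highest value shown in the chart?"},
]}]
inputs = processor.apply_chat_template(
    messages, add_generation_prompt=True, tokenize=True,
    return_dict=True, return_tensors="pt",
).to(device)

with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=32)
print(processor.tokenizer.decode(out[0], skip_special_tokens=True))
```

The decoded string still contains the chat prefix; `test_smolvlm_chartqa.py` shows one way to strip it down to just the answer.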
129 | 
130 | ## 🎉 Success Metrics
131 | 
132 | **Your training is successful if:**
133 | - ✅ Model loads without errors
134 | - ✅ Training completes in 15-25 minutes
135 | - ✅ GPU memory usage stays under your specified limit
136 | - ✅ Test accuracy is above 30%
137 | - ✅ Model generates reasonable chart-related answers
138 | 
139 | ---
140 | 
141 | **Happy fine-tuning! 🎯**
142 | 
143 | *Need help? Check the full README.md or open an issue in the main repository.*
144 | 
--------------------------------------------------------------------------------
/projects/trl-ppo-fine-tuning/README.md:
--------------------------------------------------------------------------------
1 | # TRL PPO Fine-Tuning Example
2 | 
3 | 
4 | [Blog Link](https://pavankunchalapk.medium.com/windows-friendly-grpo-fine-tuning-with-trl-from-zero-to-verifiable-rewards-f28008c89323)
5 | 
6 | This project demonstrates how to fine-tune a language model using Proximal Policy Optimization (PPO) with the `trl` library. The script is designed to work with models and datasets from the Hugging Face Hub or from local storage.
7 | 
8 | ## Setup
9 | 
10 | 1. It is highly recommended to use a Python virtual environment.
11 | 2. Install the required dependencies:
12 |    ```bash
13 |    pip install -r requirements.txt
14 |    ```
15 | 
16 | ## Usage
17 | 
18 | The `train.py` script accepts several command-line arguments to specify the model, dataset, and training parameters. Below are some common usage examples.
19 | 
20 | ### Example 1: Fine-Tuning with a Hugging Face Model and Dataset
21 | 
22 | This command downloads a model and a dataset from the Hugging Face Hub, then starts the fine-tuning process.
23 | 
24 | ```bash
25 | python train.py \
26 |     --base-model "meta-llama/Llama-2-7b-chat-hf" \
27 |     --dataset "lvwerra/stack-exchange-paired" \
28 |     --output-dir "./checkpoints/llama2-stack-exchange" \
29 |     --use-4bit \
30 |     --lora-r 16 \
31 |     --lr 5e-6 \
32 |     --hub-token "YOUR_HF_TOKEN_HERE"
33 | ```
34 | 
35 | ### Example 2: Fine-Tuning with a Local Model and Local JSONL Data
36 | 
37 | This command uses a model that you have already downloaded to your local machine and a local `.jsonl` file for training data. The `--local-only` flag ensures no external connections are made.
38 | 
39 | ```bash
40 | python train.py \
41 |     --base-model "/path/to/your/local/model" \
42 |     --train-jsonl "/path/to/your/training_data.jsonl" \
43 |     --eval-jsonl "/path/to/your/validation_data.jsonl" \
44 |     --output-dir "./checkpoints/local-model-local-data" \
45 |     --local-only \
46 |     --use-4bit
47 | ```
48 | 
49 | ## Comparing Model Outputs
50 | 
51 | After fine-tuning, you can compare the performance of the base model against your updated (LoRA-tuned) model using the `test_base_vs_lora.py` script.
52 | 
53 | This script runs a set of prompts through both the original base model and the model with your LoRA adapters and saves the outputs to a CSV file for easy comparison.
54 | 
55 | ### Example
56 | 
57 | ```bash
58 | python test_base_vs_lora.py ^
59 |     --base-model Qwen/Qwen3-4B-Instruct-2507 ^
60 |     --adapter-dir .\output ^
61 |     --use-4bit ^
62 |     --prompts "Explain GRPO in 2 sentences." "What is the capital of Japan?"
63 | ```
64 | 
65 | You can also provide prompts from a text file:
66 | 
67 | ```bash
68 | python test_base_vs_lora.py ^
69 |     --base-model Qwen/Qwen3-4B-Instruct-2507 ^
70 |     --adapter-dir .\output ^
71 |     --prompts-file .\prompts.txt
72 | ```
73 | 
74 | ## Key Arguments
75 | 
76 | #### Model & Data Arguments
77 | * `--base-model` (Required): The identifier for the model on the Hugging Face Hub (e.g., `codellama/CodeLlama-7b-hf`) or the absolute path to a local model directory.
78 | * `--dataset`: The identifier for a dataset on the Hugging Face Hub. Use this OR the local data arguments below.
79 | * `--train-jsonl`: The path to a local training data file in JSON Lines format.
80 | * `--eval-jsonl`: The path to a local validation data file in JSON Lines format.
81 | * `--output-dir` (Required): The directory where the trained model checkpoints will be saved.
82 | 
83 | #### System & Performance Arguments
84 | * `--local-only`: If set, the script will only use local files and not attempt to download models or datasets.
85 | * `--use-4bit`: If set, enables 4-bit quantization to reduce memory usage.
86 | * `--bf16` / `--fp16`: Use bfloat16 or float16 precision. Defaults to bf16 if available.
87 | * `--attn-impl`: The attention implementation to use (e.g., `sdpa` for scaled dot product attention).
88 | 
89 | #### Training & LoRA Arguments
90 | * `--lora-off`: If set, disables LoRA and performs full fine-tuning.
91 | * `--lora-r`: The rank of the LoRA matrices. Default is `16`.
92 | * `--lora-alpha`: The alpha parameter for LoRA scaling. Default is `32.0`.
93 | * `--lr`: The learning rate. Default is `5e-6`.
94 | * `--num-epochs`: The number of training epochs. Default is `1`.
95 | 
--------------------------------------------------------------------------------
/projects/financial-reasoning-enhanced/README.md:
--------------------------------------------------------------------------------
1 | # Financial Reasoning Enhanced: SFT + GRPO Fine-Tuning
2 | 
3 | This project demonstrates advanced fine-tuning of language models for financial reasoning tasks using a two-phase approach: Supervised Fine-Tuning (SFT) followed by GRPO (Group Relative Policy Optimization) with multi-level reward functions.
4 | 
5 | ## Overview
6 | 
7 | The Financial Reasoning Enhanced pipeline combines:
8 | - **SFT Phase**: Initial training on structured financial reasoning examples
9 | - **GRPO Phase**: Reinforcement learning with sophisticated reward functions including:
10 |   - Format compliance gates
11 |   - Financial reasoning quality analysis
12 |   - FinBERT teacher alignment
13 |   - Confidence calibration
14 |   - Directional consistency validation
15 | 
16 | ## Key Features
17 | 
18 | - **Multi-level Reward System**: Combines format, reasoning quality, and financial expertise
19 | - **FinBERT Integration**: Uses FinBERT as a teacher model for sentiment alignment
20 | - **Structured Output**: Enforces consistent `<reasoning>`, `<sentiment>`, and `<confidence>` tags
21 | - **Unsloth Integration**: Optimized for 4-bit quantization and efficient training
22 | - **Flexible Data Sources**: Supports Financial PhraseBank, synthetic data, or custom JSONL
23 | 
24 | ## Setup
25 | 
26 | 1. **Install Dependencies**:
27 |    ```bash
28 |    pip install -r requirements.txt
29 |    ```
30 | 
31 | 2. **Optional Dependencies**:
32 |    - `seaborn` and `matplotlib` for visualization (will work without them)
33 |    - `bitsandbytes` for 4-bit quantization (optional, Unsloth handles this)
34 | 
35 | ## Usage
36 | 
37 | ### Basic Training Pipeline
38 | 
39 | ```bash
40 | python financial_reasoning_enhanced.py \
41 |     --base-model "unsloth/gemma-3-270m-it" \
42 |     --output-dir "./financial_reasoning_outputs" \
43 |     --use-4bit \
44 |     --data-mode "mixed"
45 | ```
46 | 
47 | ### Advanced Configuration
48 | 
49 | ```bash
50 | python financial_reasoning_enhanced.py \
51 |     --base-model "unsloth/gemma-3-270m-it" \
52 |     --output-dir "./custom_outputs" \
53 |     --use-4bit \
54 |     --sft-epochs 5 \
55 |     --grpo-epochs 3.0 \
56 |     --sft-batch 8 \
57 |     --grpo-batch 4 \
58 |     --sft-lr 2e-4 \
59 |     --grpo-lr 1e-5 \
60 |     --beta 0.2 \
61 |     --temperature 0.8 \
62 |     --data-mode "real" \
63 |     --max-real-examples 500
64 | ```
65 | 
66 | ### Custom Data Training
67 | 
68 | ```bash
69 | python financial_reasoning_enhanced.py \
70 |     --base-model "unsloth/gemma-3-270m-it" \
71 |     --output-dir "./custom_training" \
72 |     --use-4bit \
73 |     --train-jsonl "./custom_sft_data.jsonl" \
74 |     --eval-jsonl "./custom_grpo_data.jsonl"
75 | ```
76 | 
77 | ## Data Format
78 | 
79 | ### SFT Training Data (JSONL)
80 | Each line should contain:
81 | ```json
82 | {
83 |   "text": "Tech company reports 25% revenue growth but 8% profit decline due to R&D investment",
84 |   "reasoning": "Revenue growth is strong, reflecting demand and expansion. Profit dipped due to R&D, which is strategic with long-term upside.",
85 |   "sentiment": "positive",
86 |   "confidence": 0.75
87 | }
88 | ```
89 | 
90 | ### GRPO Evaluation Data (JSONL)
91 | Each line should contain:
92 | ```json
93 | {
94 |   "text": "Bank announces 3% dividend increase while facing regulatory scrutiny over compliance issues"
95 | }
96 | ```
97 | 
98 | ## Output Structure
99 | 
100 | The model learns to generate responses in this exact format:
101 | ```
102 | <reasoning>Revenue growth is strong but margins compressed due to higher input costs; guidance is cautious, suggesting near-term volatility. Therefore, outlook is balanced with upside from new products.</reasoning>
103 | <sentiment>neutral</sentiment>
104 | <confidence>0.72</confidence>
105 | ```
106 | 
107 | ## Key Arguments
108 | 
109 | ### Model & Training
110 | - `--base-model`: Base model identifier (default: unsloth/gemma-3-270m-it)
111 | - `--output-dir`: Output directory for checkpoints and final model
112 | - `--use-4bit`: Enable 4-bit quantization for memory efficiency
113 | - `--local-only`: Use only local files (offline mode)
114 | 
115 | ### SFT Parameters
116 | - `--sft-epochs`: Number of SFT training epochs (default: 3)
117 | - `--sft-batch`: SFT batch size (default: 12)
118 | - `--sft-lr`: SFT learning rate (default: 1e-4)
119 | - `--sft-warmup`: SFT warmup ratio (default: 0.1)
120 | 
121 | ### GRPO Parameters
122 | - `--grpo-epochs`: Number of GRPO training epochs (default: 4.0)
123 | - `--grpo-batch`: GRPO batch size (default: 12)
124 | - `--grpo-lr`: GRPO learning rate (default: 1e-5)
125 | - `--beta`: KL penalty coefficient (default: 0.15)
126 | - `--temperature`: Generation temperature (default: 0.7)
127 | - `--num-generations`: Completions per prompt (default: 6)
128 | 
129 | ### Data Control
130 | - `--data-mode`: Data source selection ["mixed", "real", "synthetic"]
131 | - `--max-real-examples`: Maximum examples from Financial PhraseBank (default: 200)
132 | - `--train-jsonl`: Custom SFT training data file
133 | - `--eval-jsonl`: Custom GRPO evaluation data file
134 | 
135 | ### LoRA Configuration
136 | - `--lora-rank`: LoRA rank for parameter-efficient training (default: 32)
137 | - `--lora-alpha`: LoRA alpha scaling factor (default: 64)
138 | 
139 | ## Reward Function Components
140 | 
141 | 1. **Format Gate (35%)**: Ensures proper tag structure
142 | 2. **Financial Reasoning (25%)**: Quality, logic, and context analysis
143 | 3. **Sentiment Alignment (20%)**: FinBERT teacher agreement
144 | 4. **Confidence Calibration (15%)**: Brier score-like confidence accuracy
145 | 5. **Directional Consistency (5%)**: Reasoning-sentiment alignment
146 | 
147 | ## Example Outputs
148 | 
149 | ### Input
150 | ```
151 | "Energy company reports 30% production increase but faces environmental lawsuit"
152 | ```
153 | 
154 | ### Expected Output
155 | ```
156 | <reasoning>Production increase signals operational efficiency and market demand growth, which is positive for revenue. However, environmental lawsuit introduces regulatory risk and potential financial liabilities that could offset gains.</reasoning>
157 | <sentiment>neutral</sentiment>
158 | <confidence>0.68</confidence>
159 | ```
160 | 
161 | ## Performance Monitoring
162 | 
163 | The training pipeline provides:
164 | - Real-time loss and reward tracking
165 | - KL divergence monitoring
166 | - Memory usage statistics
167 | - Training progress visualization (if matplotlib/seaborn available)
168 | 
169 | ## Troubleshooting
170 | 
171 | ### Common Issues
172 | 1. **Memory Errors**: Reduce batch sizes or enable 4-bit quantization
173 | 2. **Training Instability**: Lower learning rates or increase beta for KL penalty
174 | 3. **Poor Format Compliance**: Increase format gate weight in reward function
175 | 4. **Slow Convergence**: Adjust warmup ratios or learning rate schedules
176 | 
177 | ### Performance Tips
178 | - Use 4-bit quantization for memory efficiency
179 | - Start with smaller models for experimentation
180 | - Use synthetic data for initial testing
181 | - Monitor reward components to identify training issues
182 | 
183 | ## Integration with Other Projects
184 | 
185 | This project complements the TRL PPO fine-tuning example by providing:
186 | - Specialized financial reasoning capabilities
187 | - Multi-level reward system demonstration
188 | - FinBERT integration example
189 | - Structured output enforcement
190 | 
191 | ## Citation
192 | 
193 | If you use this work in your research, please cite:
194 | ```bibtex
195 | @misc{financial_reasoning_enhanced_2025,
196 |   title={Financial Reasoning Enhanced: SFT + GRPO Fine-Tuning},
197 |   author={Pavan Kunchala},
198 |   year={2025},
199 |   url={https://github.com/yourusername/Reinforcement-learning-with-verifable-rewards-Learnings}
200 | }
201 | ```
202 | 
203 | ## License
204 | 
205 | This project is licensed under the MIT License - see the [LICENSE](../../LICENSE) file for details.
206 | 
--------------------------------------------------------------------------------
/projects/trl-ppo-fine-tuning/test_base_vs_lora.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | # -*- coding: utf-8 -*-
3 | 
4 | """
5 | test_base_vs_lora.py
6 | 
7 | Compare generations from a base HF chat model vs the same model + your LoRA adapters.
8 | - Works great for Qwen/Qwen3 Instruct models (uses tokenizer.apply_chat_template).
9 | - Runs prompts twice (base, then LoRA) to avoid double GPU memory usage.
10 | - Saves CSV with prompt, base_output, lora_output.
11 | 
12 | Example:
13 |     python test_base_vs_lora.py ^
14 |       --base-model Qwen/Qwen3-4B-Instruct-2507 ^
15 |       --adapter-dir .\output ^
16 |       --use-4bit ^
17 |       --prompts "Explain GRPO in 2 sentences." "What is the capital of Japan?"
18 | 
19 | Or from a file:
20 |     python test_base_vs_lora.py ^
21 |       --base-model Qwen/Qwen3-4B-Instruct-2507 ^
22 |       --adapter-dir .\output ^
23 |       --prompts-file .\prompts.txt
24 | """
25 | 
26 | import argparse
27 | import csv
28 | import os
29 | import torch
30 | from typing import List, Optional
31 | 
32 | from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
33 | from peft import PeftModel
34 | 
35 | 
36 | def load_model_and_tokenizer(
37 |     base_model: str,
38 |     use_4bit: bool,
39 |     local_only: bool,
40 |     attn_impl: str,
41 |     dtype: torch.dtype,
42 | ):
43 |     tok = AutoTokenizer.from_pretrained(
44 |         base_model, use_fast=True, trust_remote_code=True, local_files_only=local_only
45 |     )
46 |     bnb = None
47 |     if use_4bit:
48 |         bnb = BitsAndBytesConfig(
49 |             load_in_4bit=True, bnb_4bit_compute_dtype=dtype, bnb_4bit_quant_type="nf4", bnb_4bit_use_double_quant=True
50 |         )
51 |     model = AutoModelForCausalLM.from_pretrained(
52 |         base_model,
53 |         trust_remote_code=True,
54 |         local_files_only=local_only,
55 |         device_map="auto",
56 |         torch_dtype=dtype,
57 |         attn_implementation=attn_impl,
58 |         quantization_config=bnb,
59 |     )
60 |     if tok.pad_token_id is None:
61 |         tok.pad_token = tok.eos_token
62 |     try:
63 |         model.generation_config.eos_token_id = tok.eos_token_id
64 |         model.generation_config.pad_token_id = tok.pad_token_id
65 |     except Exception:
66 |         pass
67 |     return tok, model
68 | 
69 | 
70 | def apply_chat(tok, prompt: str, device, disable_thinking: bool):
71 |     # Qwen3 uses a chat template; we feed messages to get the correct formatting.
72 |     messages = [{"role": "user", "content": prompt}]
73 |     # Some Qwen3 templates support enable_thinking; if you want to disable, pass the flag.
74 |     kwargs = dict(add_generation_prompt=True, return_tensors="pt")
75 |     if disable_thinking:
76 |         # If the model’s chat template supports it, this will be honored; otherwise ignored.
77 | kwargs["enable_thinking"] = False 78 | inputs = tok.apply_chat_template(messages, **kwargs).to(device) 79 | return inputs 80 | 81 | 82 | def generate(model, tok, prompts: List[str], max_new_tokens: int, temperature: float, top_p: float, do_sample: bool, disable_thinking: bool): 83 | outs: List[str] = [] 84 | for p in prompts: 85 | inputs = apply_chat(tok, p, model.device, disable_thinking) 86 | with torch.no_grad(): 87 | out = model.generate( 88 | inputs, 89 | max_new_tokens=max_new_tokens, 90 | do_sample=do_sample, 91 | temperature=temperature, 92 | top_p=top_p, 93 | ) 94 | text = tok.decode(out[0], skip_special_tokens=True) 95 | outs.append(text) 96 | return outs 97 | 98 | 99 | def main(): 100 | ap = argparse.ArgumentParser(description="Compare base vs LoRA-tuned outputs on the same prompts.") 101 | ap.add_argument("--base-model", required=True, help="HF model id or local path (e.g., Qwen/Qwen3-4B-Instruct-2507)") 102 | ap.add_argument("--adapter-dir", required=True, help="Path to your saved LoRA adapters (e.g., .\\output)") 103 | ap.add_argument("--prompts", nargs="*", default=[], help="List of prompts (space-separated; quote each)") 104 | ap.add_argument("--prompts-file", default=None, help="Optional text file with one prompt per line") 105 | ap.add_argument("--out", default=r".\output\compare_base_vs_lora.csv", help="CSV output path") 106 | ap.add_argument("--use-4bit", action="store_true", help="Load both models in 4-bit") 107 | ap.add_argument("--local-only", action="store_true", help="Use local files only (offline)") 108 | ap.add_argument("--attn-impl", choices=["sdpa","eager"], default="sdpa") 109 | ap.add_argument("--max-new-tokens", type=int, default=200) 110 | ap.add_argument("--temperature", type=float, default=0.2, help="Low temp for reproducible sanity checks") 111 | ap.add_argument("--top-p", type=float, default=0.9) 112 | ap.add_argument("--do-sample", action="store_true", help="Enable sampling (off by default for determinism)") 113 | ap.add_argument("--disable-thinking", action="store_true", help="Try to disable Qwen 'thinking' if supported") 114 | args = ap.parse_args() 115 | 116 | os.environ.setdefault("TOKENIZERS_PARALLELISM", "false") 117 | os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True") 118 | try: 119 | torch.set_float32_matmul_precision("high") 120 | except Exception: 121 | pass 122 | 123 | if not args.prompts and not args.prompts_file: 124 | # Default quick sanity set 125 | args.prompts = [ 126 | "Explain GRPO in two sentences.", 127 | "List three safe ways to speed up PyTorch inference on Windows.", 128 | "You are given: 12 apples, you eat 5 and buy 4 more. 
How many now?", 129 | "Write a very short email subject for a job application follow-up.", 130 | "What's one surprising fact about the James Webb Space Telescope?", 131 | ] 132 | 133 | prompts = list(args.prompts) 134 | if args.prompts_file: 135 | with open(args.prompts_file, "r", encoding="utf-8") as f: 136 | prompts.extend([ln.strip() for ln in f if ln.strip()]) 137 | 138 | # Choose precision 139 | dtype = torch.bfloat16 if (torch.cuda.is_available() and torch.cuda.is_bf16_supported()) else torch.float16 140 | 141 | # -------- Pass 1: BASE -------- 142 | tok, model = load_model_and_tokenizer( 143 | base_model=args.base_model, 144 | use_4bit=args.use_4bit, 145 | local_only=args.local_only, 146 | attn_impl=args.attn_impl, 147 | dtype=dtype, 148 | ) 149 | base_outs = generate( 150 | model, tok, prompts, 151 | max_new_tokens=args.max_new_tokens, 152 | temperature=args.temperature, top_p=args.top_p, do_sample=args.do_sample, 153 | disable_thinking=args.disable_thinking 154 | ) 155 | del model 156 | torch.cuda.empty_cache() 157 | 158 | # -------- Pass 2: LoRA (same base + adapters) -------- 159 | tok2, base2 = load_model_and_tokenizer( 160 | base_model=args.base_model, 161 | use_4bit=args.use_4bit, 162 | local_only=args.local_only, 163 | attn_impl=args.attn_impl, 164 | dtype=dtype, 165 | ) 166 | lora_model = PeftModel.from_pretrained(base2, args.adapter_dir) 167 | lora_outs = generate( 168 | lora_model, tok2, prompts, 169 | max_new_tokens=args.max_new_tokens, 170 | temperature=args.temperature, top_p=args.top_p, do_sample=args.do_sample, 171 | disable_thinking=args.disable_thinking 172 | ) 173 | 174 | # Print a small table to console (truncated) 175 | def trunc(s, n=160): 176 | s = s.replace("\n", " ") 177 | return (s[:n] + "…") if len(s) > n else s 178 | 179 | print("\n=== COMPARISON (first 5) ===") 180 | for i, p in enumerate(prompts[:5]): 181 | print(f"\n[{i+1}] PROMPT: {p}") 182 | print("BASE:", trunc(base_outs[i])) 183 | print("LORA:", trunc(lora_outs[i])) 184 | 185 | # Save CSV 186 | os.makedirs(os.path.dirname(args.out), exist_ok=True) 187 | with open(args.out, "w", encoding="utf-8", newline="") as f: 188 | w = csv.writer(f) 189 | w.writerow(["prompt", "base_output", "lora_output"]) 190 | for p, b, l in zip(prompts, base_outs, lora_outs): 191 | w.writerow([p, b, l]) 192 | print(f"\n[INFO] Wrote {len(prompts)} comparisons to: {args.out}") 193 | print("[INFO] Done.") 194 | 195 | 196 | if __name__ == "__main__": 197 | main() 198 | -------------------------------------------------------------------------------- /projects/vllm-fine-tuning-smolvlm/README.md: -------------------------------------------------------------------------------- 1 | # SmolVLM-256M ChartQA Fine-Tuning with Lazy Loading 2 | 3 | This project demonstrates ultra-efficient fine-tuning of SmolVLM-256M on the ChartQA dataset using lazy loading and streaming techniques for maximum memory efficiency. 4 | 5 | ## 🚀 Overview 6 | 7 | SmolVLM-256M is Hugging Face's most efficient vision-language model with only 256 million parameters. This project shows how to fine-tune it on chart understanding tasks while maintaining minimal memory usage through innovative lazy loading techniques. 
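The heart of the approach is that the training script never materializes the full dataset: ChartQA is opened in streaming mode, and samples are fetched and formatted one at a time as the trainer consumes them. A minimal sketch of that pattern is below (the `query`/`label` field names follow the ChartQA schema used by the scripts in this project; the actual training code differs in detail):

```python
from datasets import load_dataset

# Streaming returns an IterableDataset: records are pulled lazily during
# iteration instead of being downloaded and decoded up front.
stream = load_dataset("HuggingFaceM4/ChartQA", split="train", streaming=True)

# take() bounds the stream; nothing is fetched until iteration starts.
subset = stream.take(1000)

def format_sample(sample):
    # Keep only the fields needed downstream; images decode on access.
    return {"question": sample["query"], "answer": sample["label"]}

# map() on a streaming dataset just wraps the iterator, so it is lazy too.
formatted = subset.map(format_sample)

for example in formatted:  # each sample is decoded here, one at a time
    ...
```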
8 | 
9 | ### Key Features
10 | - **Ultra-Efficient Training**: Fine-tune in 12-25 minutes on consumer GPUs
11 | - **Memory Optimized**: Uses <2GB VRAM through lazy loading
12 | - **Streaming Dataset**: Processes data on-demand without loading everything into memory
13 | - **Configurable LoRA**: Dynamic rank and alpha parameters for optimal performance
14 | - **High-Performance Mode**: Targets 87% GPU utilization on 16GB GPUs
15 | - **Comprehensive Testing**: Built-in evaluation with detailed metrics
16 | 
17 | ## 📋 Table of Contents
18 | - [Quick Start](#-quick-start)
19 | - [Installation](#-installation)
20 | - [Usage](#-usage)
21 | - [Configuration](#-configuration)
22 | - [Performance](#-performance)
23 | - [Troubleshooting](#-troubleshooting)
24 | - [Technical Details](#-technical-details)
25 | - [Contributing](#-contributing)
26 | 
27 | ## 🚀 Quick Start
28 | 
29 | ### 1. Basic Training (5 minutes)
30 | ```bash
31 | # Clone and navigate to the project
32 | cd Reinforcement-learning-with-verifable-rewards-Learnings/projects/vllm-fine-tuning-smolvlm
33 | 
34 | # Run with optimized defaults for 16GB GPU
35 | python train_smolvlm_chartqa.py
36 | ```
37 | 
38 | ### 2. Custom Configuration
39 | ```bash
40 | # High-performance mode with custom LoRA
41 | python train_smolvlm_chartqa.py \
42 |     --lora_r 24 --lora_alpha 48 \
43 |     --batch_size 16 --memory_limit 14.0 \
44 |     --learning_rate 0.001 --epochs 3
45 | ```
46 | 
47 | ### 3. Test Your Model
48 | ```bash
49 | # Test the fine-tuned model
50 | python test_smolvlm_chartqa.py
51 | 
52 | # Test with custom settings
53 | python test_smolvlm_chartqa.py --num_samples 20 --memory_limit 4.0
54 | ```
55 | 
56 | ## 📦 Installation
57 | 
58 | ### Requirements
59 | - Python 3.8+
60 | - CUDA-compatible GPU (optional, works on CPU)
61 | - 8GB+ RAM (16GB+ recommended for optimal performance)
62 | 
63 | ### Dependencies
64 | ```bash
65 | pip install "torch>=2.0.0"
66 | pip install "transformers>=4.40.0"
67 | pip install "datasets>=2.18.0"
68 | pip install "trl>=0.14.0"
69 | pip install "peft>=0.10.0"
70 | pip install pillow
71 | pip install requests
72 | ```
73 | 
74 | ### Full Installation
75 | ```bash
76 | # Install all dependencies
77 | pip install -r requirements.txt
78 | 
79 | # Optional: For CUDA acceleration
80 | pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
81 | ```
82 | 
83 | ## 🎯 Usage
84 | 
85 | ### Training
86 | 
87 | #### Basic Training
88 | ```bash
89 | python train_smolvlm_chartqa.py
90 | ```
91 | This runs with optimized defaults for 16GB GPUs.
92 | 93 | #### Advanced Training Options 94 | ```bash 95 | python train_smolvlm_chartqa.py \ 96 | --batch_size 16 \ 97 | --memory_limit 14.0 \ 98 | --learning_rate 0.001 \ 99 | --epochs 3 \ 100 | --lora_r 24 \ 101 | --lora_alpha 48 \ 102 | --mixed_precision bf16 103 | ``` 104 | 105 | #### Memory-Constrained Systems 106 | ```bash 107 | python train_smolvlm_chartqa.py \ 108 | --batch_size 8 \ 109 | --memory_limit 8.0 \ 110 | --enable_gradient_checkpointing 111 | ``` 112 | 113 | ### Testing 114 | 115 | #### Basic Testing 116 | ```bash 117 | python test_smolvlm_chartqa.py 118 | ``` 119 | 120 | #### Advanced Testing 121 | ```bash 122 | python test_smolvlm_chartqa.py \ 123 | --adapter_dir "./my_custom_adapter" \ 124 | --num_samples 50 \ 125 | --memory_limit 4.0 126 | ``` 127 | 128 | ## ⚙️ Configuration 129 | 130 | ### Training Parameters 131 | 132 | | Parameter | Default | Description | 133 | |-----------|---------|-------------| 134 | | `--batch_size` | 16 | Training batch size (optimized for 16GB GPU) | 135 | | `--memory_limit` | 14.0 | GPU memory limit in GB | 136 | | `--learning_rate` | 0.001 | Learning rate for training | 137 | | `--epochs` | 2 | Number of training epochs | 138 | | `--max_steps` | 500 | Maximum training steps | 139 | | `--lora_r` | 16 | LoRA rank parameter | 140 | | `--lora_alpha` | 32 | LoRA alpha scaling factor | 141 | | `--mixed_precision` | bf16 | Precision mode (bf16/fp16/fp32) | 142 | 143 | ### LoRA Configuration 144 | 145 | The script supports dynamic LoRA configuration: 146 | 147 | ```bash 148 | # Conservative (good for stability) 149 | --lora_r 12 --lora_alpha 24 150 | 151 | # Balanced (recommended) 152 | --lora_r 16 --lora_alpha 32 153 | 154 | # Aggressive (maximum capacity) 155 | --lora_r 32 --lora_alpha 64 156 | ``` 157 | 158 | ### Memory Optimization 159 | 160 | ```bash 161 | # High-performance mode (16GB GPU) 162 | --memory_limit 14.0 --batch_size 16 163 | 164 | # Balanced mode (12GB GPU) 165 | --memory_limit 10.0 --batch_size 12 166 | 167 | # Conservative mode (8GB GPU) 168 | --memory_limit 6.0 --batch_size 8 --enable_gradient_checkpointing 169 | ``` 170 | 171 | ## 📊 Performance 172 | 173 | ### Expected Performance 174 | 175 | | Configuration | GPU Memory | Training Time | Expected Accuracy | 176 | |---------------|------------|---------------|-------------------| 177 | | High-Performance | 12-14GB | 15-25 min | 70-80% | 178 | | Balanced | 8-10GB | 20-30 min | 65-75% | 179 | | Conservative | 4-6GB | 25-35 min | 60-70% | 180 | 181 | ### Actual Results from Testing 182 | 183 | Based on recent testing with the fine-tuned model: 184 | - **Accuracy**: 40% (4/10 correct answers) 185 | - **Strengths**: 186 | - Exact numerical matching (100% on precise values) 187 | - Basic categorical understanding 188 | - Color recognition 189 | - **Areas for Improvement**: 190 | - Numerical precision (decimal handling) 191 | - Complex chart interpretation 192 | - Year identification 193 | 194 | ### Memory Usage 195 | - **Training**: <2GB VRAM with lazy loading 196 | - **Testing**: Minimal memory usage 197 | - **Peak Usage**: ~0.5GB during inference 198 | 199 | ## 🔧 Troubleshooting 200 | 201 | ### Common Issues 202 | 203 | #### 1. Memory Errors 204 | ```bash 205 | # Reduce memory usage 206 | python train_smolvlm_chartqa.py --batch_size 8 --memory_limit 8.0 207 | 208 | # Enable gradient checkpointing 209 | python train_smolvlm_chartqa.py --enable_gradient_checkpointing 210 | ``` 211 | 212 | #### 2. 
CUDA Out of Memory 213 | ```bash 214 | # Use FP16 instead of BF16 215 | python train_smolvlm_chartqa.py --mixed_precision fp16 216 | 217 | # Reduce batch size 218 | python train_smolvlm_chartqa.py --batch_size 4 219 | ``` 220 | 221 | #### 3. Slow Training 222 | ```bash 223 | # Increase batch size if you have more VRAM 224 | python train_smolvlm_chartqa.py --batch_size 24 --memory_limit 15.0 225 | 226 | # Use higher learning rate 227 | python train_smolvlm_chartqa.py --learning_rate 0.002 228 | ``` 229 | 230 | #### 4. Poor Model Performance 231 | ```bash 232 | # Increase LoRA capacity 233 | python train_smolvlm_chartqa.py --lora_r 24 --lora_alpha 48 --epochs 3 234 | 235 | # More training steps 236 | python train_smolvlm_chartqa.py --max_steps 750 237 | ``` 238 | 239 | ### Hardware Requirements 240 | 241 | #### Minimum 242 | - **CPU**: Any modern processor 243 | - **RAM**: 8GB 244 | - **Storage**: 10GB free space 245 | 246 | #### Recommended 247 | - **GPU**: NVIDIA GPU with 8GB+ VRAM (16GB+ for optimal performance) 248 | - **RAM**: 16GB+ 249 | - **CUDA**: Version 11.8+ (if using GPU) 250 | 251 | ### Software Compatibility 252 | 253 | | Component | Version | Notes | 254 | |-----------|---------|-------| 255 | | Python | 3.8+ | Tested with 3.11 | 256 | | PyTorch | 2.0+ | CUDA 11.8+ recommended | 257 | | Transformers | 4.40+ | Vision model support required | 258 | | TRL | 0.14+ | SFT and GRPO support | 259 | | PEFT | 0.10+ | LoRA and DoRA support | 260 | | Datasets | 2.18+ | Streaming support required | 261 | 262 | ## 🏗️ Technical Details 263 | 264 | ### Lazy Loading Implementation 265 | 266 | The script implements advanced lazy loading techniques: 267 | 268 | 1. **Streaming Dataset Loading** 269 | ```python 270 | full_dataset = load_dataset(DATASET_ID, streaming=True) 271 | ``` 272 | 273 | 2. **On-Demand Processing** 274 | ```python 275 | raw_train = full_dataset["train"].take(train_size) 276 | ``` 277 | 278 | 3. **Memory-Efficient Mapping** 279 | ```python 280 | train_ds = raw_train.map(format_sample, keep_in_memory=False) 281 | ``` 282 | 283 | ### LoRA Configuration 284 | 285 | Dynamic LoRA with validation: 286 | ```python 287 | PEFT_CONFIG = LoraConfig( 288 | r=args.lora_r, 289 | lora_alpha=args.lora_alpha, 290 | lora_dropout=args.lora_dropout, 291 | target_modules=['down_proj', 'o_proj', 'k_proj', 'q_proj', 'gate_proj', 'up_proj', 'v_proj'], 292 | use_dora=True, 293 | init_lora_weights="gaussian", 294 | ) 295 | ``` 296 | 297 | ### Memory Management 298 | 299 | Advanced memory optimization: 300 | ```python 301 | # Set memory allocation limits 302 | os.environ['PYTORCH_CUDA_ALLOC_CONF'] = f'max_split_size_mb: {memory_mb},expandable_segments:True' 303 | ``` 304 | 305 | ### Mixed Precision Support 306 | 307 | Automatic precision selection based on hardware: 308 | ```python 309 | if args.mixed_precision == 'bf16': 310 | if bf16_supported(): 311 | TRAINING_KW['bf16'] = True 312 | else: 313 | TRAINING_KW['fp16'] = True # Fallback 314 | ``` 315 | 316 | ## 🤝 Contributing 317 | 318 | ### Ways to Contribute 319 | 320 | 1. **Bug Reports**: Report issues in the GitHub repository 321 | 2. **Feature Requests**: Suggest new features or improvements 322 | 3. **Code Contributions**: Submit pull requests with enhancements 323 | 4. 
**Documentation**: Improve documentation and examples 324 | 325 | ### Development Setup 326 | 327 | ```bash 328 | # Clone the repository 329 | git clone https://github.com/yourusername/Reinforcement-learning-with-verifable-rewards-Learnings.git 330 | 331 | # Navigate to the project 332 | cd Reinforcement-learning-with-verifable-rewards-Learnings/projects/vllm-fine-tuning-smolvlm 333 | 334 | # Install development dependencies 335 | pip install -r requirements.txt 336 | pip install pytest black isort flake8 337 | 338 | # Run tests 339 | python -m pytest 340 | 341 | # Format code 342 | black . 343 | isort . 344 | ``` 345 | 346 | ### Code Style 347 | 348 | - Follow PEP 8 style guidelines 349 | - Use type hints for function parameters 350 | - Write docstrings for all functions 351 | - Keep functions under 50 lines when possible 352 | - Use meaningful variable names 353 | 354 | ## 📄 License 355 | 356 | This project is licensed under the MIT License - see the [LICENSE](../../LICENSE) file for details. 357 | 358 | ## 🙏 Acknowledgments 359 | 360 | - **Hugging Face** for the SmolVLM-256M model and Transformers library 361 | - **TRL Team** for the efficient fine-tuning framework 362 | - **ChartQA Dataset** creators for the comprehensive chart understanding dataset 363 | - **PEFT Team** for the LoRA and DoRA implementations 364 | 365 | ## 📚 Further Reading 366 | 367 | - [SmolVLM-256M Model Card](https://huggingface.co/HuggingFaceTB/SmolVLM-256M-Instruct) 368 | - [ChartQA Dataset](https://huggingface.co/datasets/HuggingFaceM4/ChartQA) 369 | - [TRL Documentation](https://huggingface.co/docs/trl/index) 370 | - [PEFT Documentation](https://huggingface.co/docs/peft/index) 371 | 372 | ## 🎯 Roadmap 373 | 374 | ### Planned Features 375 | - [ ] Support for additional chart types 376 | - [ ] Integration with other vision-language models 377 | - [ ] Advanced data augmentation techniques 378 | - [ ] Model compression and quantization options 379 | - [ ] Web-based demo interface 380 | 381 | ### Known Limitations 382 | - Limited to ChartQA dataset currently 383 | - Requires significant GPU memory for optimal performance 384 | - May not generalize well to all chart types 385 | 386 | --- 387 | 388 | **Made with ❤️ for the AI community** 389 | 390 | *If you find this project helpful, please consider giving it a star on GitHub!* 🌟 391 | -------------------------------------------------------------------------------- /projects/vllm-fine-tuning-smolvlm/test_smolvlm_chartqa.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | """ 3 | 🚀 SmolVLM-256M ChartQA Fine-tuning Test Script 4 | 5 | This script tests the fine-tuned SmolVLM-256M model from train_smolvlm_chartqa.py 6 | on the ChartQA dataset to evaluate performance and accuracy. 
7 | 8 | Features: 9 | - Loads the fine-tuned adapter from smolvlm-256m-chartqa-sft 10 | - Tests on ChartQA validation set 11 | - Generates detailed responses with images 12 | - Saves results to JSON and summary files 13 | - Configurable via command-line arguments 14 | """ 15 | 16 | import json 17 | import torch 18 | from datasets import load_dataset 19 | from transformers import AutoProcessor, AutoTokenizer 20 | from transformers import Idefics3ForConditionalGeneration 21 | from peft import PeftModel 22 | from PIL import Image 23 | import requests 24 | from io import BytesIO 25 | import os 26 | 27 | # Config - defaults match train_smolvlm_chartqa.py 28 | BASE_MODEL_ID = "HuggingFaceTB/SmolVLM-256M-Instruct" # Base model from finetuning script 29 | ADAPTER_DIR = "./smolvlm-256m-chartqa-sft" # LoRA adapter directory from finetuning 30 | SAMPLE_SPLIT = "val[:10%]" # Use validation split, get 10% for more samples 31 | OUTPUT_DIR = "./smolvlm_256m_test_output" # Directory to save images and results 32 | NUM_SAMPLES = 10 # Number of samples to process 33 | 34 | def load_image(src): 35 | """Load image from URL, file path, or PIL object.""" 36 | if isinstance(src, str) and src.startswith("http"): 37 | try: 38 | resp = requests.get(src, timeout=5) 39 | resp.raise_for_status() 40 | return Image.open(BytesIO(resp.content)).convert("RGB") 41 | except Exception: 42 | return None 43 | elif isinstance(src, str) and os.path.exists(src): 44 | try: 45 | return Image.open(src).convert("RGB") 46 | except Exception: 47 | return None 48 | return None 49 | 50 | def main(): 51 | import argparse # Add argparse for command-line arguments 52 | 53 | # Parse arguments 54 | parser = argparse.ArgumentParser(description='Test fine-tuned SmolVLM-256M model with configurable parameters.') 55 | parser.add_argument('--model_id', type=str, default=BASE_MODEL_ID, help=f'Base model ID to use (default: {BASE_MODEL_ID})') 56 | parser.add_argument('--adapter_dir', type=str, default=ADAPTER_DIR, help=f'LoRA adapter directory (default: {ADAPTER_DIR})') 57 | parser.add_argument('--memory_limit', type=float, default=2.0, help='Maximum memory limit in GB to monitor and warn (default: 2.0 GB)') 58 | parser.add_argument('--num_samples', type=int, default=NUM_SAMPLES, help=f'Number of samples to process (default: {NUM_SAMPLES})') 59 | args = parser.parse_args() 60 | 61 | # Update configs from arguments 62 | model_id = args.model_id 63 | adapter_dir = args.adapter_dir 64 | memory_limit = args.memory_limit 65 | num_samples = args.num_samples 66 | 67 | device = 'cuda' if torch.cuda.is_available() else 'cpu' 68 | print(f'Using device: {device}') 69 | 70 | # Create output directories 71 | os.makedirs(OUTPUT_DIR, exist_ok=True) 72 | images_dir = os.path.join(OUTPUT_DIR, "images") 73 | os.makedirs(images_dir, exist_ok=True) 74 | 75 | # Add memory limit check 76 | if torch.cuda.is_available() and memory_limit > 0: 77 | os.environ['PYTORCH_CUDA_ALLOC_CONF'] = f'max_split_size_mb: {int(memory_limit * 1024)}' # Convert GB to MB for allocation hint 78 | 79 | # Try to load a multimodal processor first, fall back if needed 80 | processor = None 81 | tokenizer = None 82 | try: 83 | processor = AutoProcessor.from_pretrained(model_id) 84 | # Tokenizer is often available alongside the processor for decoding 85 | tokenizer = AutoTokenizer.from_pretrained(model_id) 86 | use_processor = True 87 | print("✅ Successfully loaded multimodal processor and tokenizer") 88 | except Exception as e: 89 | print(f"⚠️ AutoProcessor load failed: {e}") 90 | print(" 
Attempting to use AutoTokenizer only (text-only).") 91 | processor = None 92 | tokenizer = AutoTokenizer.from_pretrained(model_id) 93 | use_processor = False 94 | 95 | # Load the base model 96 | print(f"Loading base model: {model_id}") 97 | model = Idefics3ForConditionalGeneration.from_pretrained(model_id) 98 | model = model.to(device) 99 | print("✅ Base model loaded successfully") 100 | 101 | # Load and apply LoRA adapter 102 | print(f"Loading LoRA adapter from: {adapter_dir}") 103 | if os.path.exists(adapter_dir): 104 | try: 105 | # Check adapter config first 106 | adapter_config_path = os.path.join(adapter_dir, "adapter_config.json") 107 | if os.path.exists(adapter_config_path): 108 | print("📄 Found adapter config, loading adapter...") 109 | model = PeftModel.from_pretrained(model, adapter_dir) 110 | 111 | # For inference, merge the adapter weights to avoid issues 112 | print("🔄 Merging LoRA adapter weights for inference...") 113 | model = model.merge_and_unload() 114 | print("✅ LoRA adapter merged successfully") 115 | 116 | # Test if adapter was applied 117 | total_params = sum(p.numel() for p in model.parameters()) 118 | print(f"📊 Model parameters: {total_params:,} total") 119 | else: 120 | print("❌ No adapter_config.json found in adapter directory") 121 | 122 | except Exception as e: 123 | print(f"❌ Error loading LoRA adapter: {e}") 124 | print(" Using base model without fine-tuning") 125 | else: 126 | print(f"⚠️ LoRA adapter directory not found: {adapter_dir}") 127 | print(" Using base model without fine-tuning") 128 | 129 | # Load validation subset from ChartQA 130 | print(f"Loading ChartQA dataset (split: {SAMPLE_SPLIT})...") 131 | ds = load_dataset("HuggingFaceM4/ChartQA", split=SAMPLE_SPLIT) 132 | print(f"✅ Dataset loaded: {len(ds)} samples available") 133 | 134 | results = [] 135 | processed_count = 0 136 | 137 | print(f"\n🚀 Starting to process {num_samples} samples...") 138 | print("=" * 60) 139 | 140 | for idx, sample in enumerate(ds): 141 | if processed_count >= num_samples: 142 | break 143 | 144 | image_src = sample.get("image") 145 | prompt = sample.get("query", "") 146 | ground_truth = sample.get("label", "") 147 | 148 | print(f"\n📊 Processing Sample {processed_count + 1}/{num_samples} (Dataset index: {idx})") 149 | print("-" * 50) 150 | 151 | # Load and save image locally 152 | # Check if image_src is already a PIL Image object (from HuggingFace datasets) 153 | if hasattr(image_src, 'mode') and hasattr(image_src, 'size'): 154 | # It's already a PIL Image object 155 | image = image_src 156 | print("✅ Image is already loaded as PIL Image object") 157 | else: 158 | # Try to load as URL or file path 159 | image = load_image(image_src) 160 | if image is None: 161 | print(f"❌ Could not load image: {image_src}") 162 | continue 163 | 164 | # Save image locally 165 | image_filename = f"sample_{processed_count:02d}.png" 166 | image_path = os.path.join(images_dir, image_filename) 167 | try: 168 | image.save(image_path) 169 | print(f"✅ Image saved: {image_path}") 170 | except Exception as e: 171 | print(f"⚠️ Could not save image: {e}") 172 | image_path = image_src # Use original URL if save fails 173 | 174 | # Process with model 175 | try: 176 | if use_processor: 177 | # Try a simpler approach for SmolVLM 178 | # Use the processor directly with text and images 179 | try: 180 | inputs = processor(text=prompt, images=image, return_tensors="pt") 181 | except Exception as e: 182 | print(f"⚠️ Direct processor failed: {e}") 183 | print(" Falling back to chat template approach...") 184 | 
messages = [ 185 | { 186 | "role": "user", 187 | "content": [ 188 | {"type": "image", "image": image}, 189 | {"type": "text", "text": prompt} 190 | ] 191 | } 192 | ] 193 | inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt") 194 | else: 195 | # If only text is available, tokenize prompt and run a text-only path 196 | inputs = tokenizer(prompt, return_tensors="pt") 197 | 198 | inputs = {k: v.to(device) for k, v in inputs.items()} 199 | 200 | # Debug: show input shapes for first few samples 201 | if processed_count < 2: 202 | print(f"🔧 Input shapes: {[(k, v.shape if hasattr(v, 'shape') else type(v)) for k, v in inputs.items()]}") 203 | 204 | with torch.no_grad(): 205 | outputs = model.generate( 206 | **inputs, 207 | max_new_tokens=32, # Shorter for chart QA answers 208 | temperature=0.1, # Lower temperature for more focused answers 209 | do_sample=True, # Enable sampling but with low temperature 210 | top_p=0.9, # Nucleus sampling 211 | top_k=50, # Top-k sampling 212 | repetition_penalty=1.1, # Reduce repetition 213 | pad_token_id=processor.tokenizer.pad_token_id if hasattr(processor, 'tokenizer') else tokenizer.pad_token_id 214 | ) 215 | 216 | # Decode and extract just the answer part 217 | if tokenizer is not None: 218 | full_response = tokenizer.decode(outputs[0], skip_special_tokens=True) 219 | 220 | # Debug: print raw response for first few samples 221 | if processed_count < 2: 222 | print(f"🔍 Raw model output: '{full_response}'") 223 | 224 | # Extract just the assistant's answer (remove the conversation format) 225 | if "Assistant:" in full_response: 226 | answer = full_response.split("Assistant:")[-1].strip() 227 | elif "\nAssistant:" in full_response: 228 | answer = full_response.split("\nAssistant:")[-1].strip() 229 | else: 230 | # Fallback: try to find the last meaningful answer 231 | lines = full_response.strip().split('\n') 232 | answer = lines[-1].strip() if lines else full_response.strip() 233 | 234 | # Clean up any remaining artifacts 235 | answer = answer.replace('<|end_of_text|>', '').replace('<|im_end|>', '').strip() 236 | 237 | # If answer is too long or contains too many tokens, take first reasonable part 238 | if len(answer.split()) > 10: # If more than 10 words, likely not a clean answer 239 | words = answer.split() 240 | # Look for common chart answer patterns (numbers, colors, short phrases) 241 | if any(word.isdigit() for word in words[:5]): 242 | answer = ' '.join(words[:5]) 243 | else: 244 | answer = words[0] if words else "Unable to extract answer" 245 | 246 | else: 247 | # Fallback: raw generation output 248 | answer = str(outputs[0]) 249 | 250 | print(f"❓ Question: {prompt}") 251 | print(f"🎯 Ground Truth: {ground_truth}") 252 | print(f"🤖 Model Answer: {answer}") 253 | print(f"🖼️ Image: {image_path}") 254 | 255 | results.append({ 256 | "sample_id": processed_count, 257 | "dataset_index": idx, 258 | "image_path": image_path, 259 | "original_image_url": str(image_src) if not isinstance(image_src, str) else image_src, 260 | "question": prompt, 261 | "ground_truth": ground_truth, 262 | "model_answer": answer, 263 | "processing_status": "success" 264 | }) 265 | 266 | processed_count += 1 267 | 268 | except Exception as e: 269 | print(f"❌ Error processing sample: {e}") 270 | results.append({ 271 | "sample_id": processed_count, 272 | "dataset_index": idx, 273 | "image_path": image_path, 274 | "original_image_url": str(image_src) if not isinstance(image_src, str) else image_src, 275 | 
"question": prompt, 276 | "ground_truth": ground_truth, 277 | "model_answer": f"ERROR: {str(e)}", 278 | "processing_status": "error" 279 | }) 280 | processed_count += 1 281 | 282 | # Save detailed results 283 | results_path = os.path.join(OUTPUT_DIR, "test_results.json") 284 | with open(results_path, "w", encoding="utf-8") as f: 285 | json.dump(results, f, indent=2, ensure_ascii=False) 286 | 287 | # Create a clean summary file 288 | summary_path = os.path.join(OUTPUT_DIR, "summary.txt") 289 | with open(summary_path, "w", encoding="utf-8") as f: 290 | f.write("SMOLVLM-256M FINETUNED MODEL TEST RESULTS\n") 291 | f.write("=" * 50 + "\n\n") 292 | f.write(f"Model: {model_id}\n") 293 | f.write(f"Adapter: {adapter_dir}\n") 294 | f.write(f"Total samples processed: {len(results)}\n") 295 | f.write(f"Output directory: {OUTPUT_DIR}\n\n") 296 | 297 | for result in results: 298 | f.write(f"SAMPLE {result['sample_id'] + 1:02d}\n") 299 | f.write("-" * 30 + "\n") 300 | f.write(f"Image: {result['image_path']}\n") 301 | f.write(f"Question: {result['question']}\n") 302 | f.write(f"Ground Truth: {result['ground_truth']}\n") 303 | f.write(f"Model Answer: {result['model_answer']}\n") 304 | f.write(f"Status: {result['processing_status']}\n\n") 305 | 306 | print("\n" + "=" * 60) 307 | print("🎉 TESTING COMPLETE!") 308 | print(f"📁 Results saved to: {OUTPUT_DIR}") 309 | print(f"📊 Processed {len(results)} samples") 310 | print(f"🖼️ Images saved to: {images_dir}") 311 | print(f"📋 Detailed results: {results_path}") 312 | print(f"📝 Summary: {summary_path}") 313 | print("=" * 60) 314 | 315 | if __name__ == "__main__": 316 | main() 317 | -------------------------------------------------------------------------------- /projects/trl-ppo-fine-tuning/train.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python3 2 | # -*- coding: utf-8 -*- 3 | 4 | """ 5 | run_windows_setup_and_train.py — TRL GRPO (datasets v3+), Windows-friendly 6 | 7 | Fixes: 8 | - Use `num_generations` (no `group_size` in GRPOConfig) 9 | - Send `attn_implementation` to model.from_pretrained, not GRPOConfig 10 | - Dynamic HF dataset split/column mapping; offline via DownloadConfig 11 | Tested combos per HF cookbook: transformers≈4.47–4.48, trl≈0.14.x, datasets≈3.2.x. 
12 | """
13 | 
14 | import argparse
15 | import json
16 | import os
17 | import re
18 | from typing import Dict, List, Optional, Any, Tuple, Union
19 | 
20 | import torch
21 | from datasets import (
22 |     load_dataset,
23 |     Dataset,
24 |     DatasetDict,
25 |     get_dataset_config_names,
26 |     get_dataset_split_names,
27 |     DownloadConfig,
28 | )
29 | from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
30 | from trl import GRPOConfig, GRPOTrainer
31 | from peft import LoraConfig, get_peft_model
32 | 
33 | # ---------- helpers: precision ----------
34 | def bf16_supported() -> bool:
35 |     try:
36 |         return torch.cuda.is_available() and torch.cuda.is_bf16_supported()
37 |     except Exception:
38 |         return False
39 | 
40 | def choose_precision(args) -> Dict[str, bool]:
41 |     if args.fp16:
42 |         return {"fp16": True, "bf16": False}
43 |     if args.bf16:
44 |         return {"fp16": False, "bf16": True}
45 |     if bf16_supported():
46 |         return {"fp16": False, "bf16": True}
47 |     return {"fp16": True, "bf16": False}
48 | 
49 | # ---------- helpers: prompt mapping ----------
50 | PROMPT_FIELDS_PRIMARY = [
51 |     "prompt","question","query","instruction","text","input","context","ctx","document","source",
52 | ]
53 | CHAT_FIELDS = ["messages","conversations","dialog","chat"]
54 | REFERENCE_FIELDS = ["reference","response","answer","output","label","target","gold","completion","chosen"]
55 | 
56 | def normalize_text(s: str) -> str:
57 |     return re.sub(r"\s+", " ", (s or "").strip()).lower()
58 | 
59 | def messages_to_prompt(msgs: Any) -> Optional[str]:
60 |     try:
61 |         if not isinstance(msgs, list) or not msgs:
62 |             return None
63 |         lines = []
64 |         for m in msgs:
65 |             if isinstance(m, dict):
66 |                 role = m.get("role", "user")
67 |                 content = m.get("content", "")
68 |                 if isinstance(content, list):
69 |                     content = " ".join(str(p.get("text","")) if isinstance(p, dict) else str(p) for p in content)
70 |                 lines.append(f"{role}: {str(content).strip()}")
71 |             else:
72 |                 lines.append(str(m))
73 |         return "\n".join(lines).strip()
74 |     except Exception:
75 |         return None
76 | 
77 | def synthesize_prompt(ex: Dict[str, Any]) -> str:
78 |     for k in PROMPT_FIELDS_PRIMARY:
79 |         if k in ex and ex[k]:
80 |             return str(ex[k])
81 |     for k in CHAT_FIELDS:
82 |         if k in ex and ex[k]:
83 |             p = messages_to_prompt(ex[k])
84 |             if p:
85 |                 return p
86 |     instr = str(ex.get("instruction","")).strip()
87 |     inp = str(ex.get("input","")).strip()
88 |     if instr and inp: return f"{instr}\n\nInput: {inp}"
89 |     if instr: return instr
90 |     if inp: return inp
91 |     ctx = str(ex.get("context","")).strip()
92 |     q = str(ex.get("question","")).strip()
93 |     if ctx and q: return f"{ctx}\n\nQuestion: {q}"
94 |     if q: return q
95 |     try:
96 |         return json.dumps(ex, ensure_ascii=False)
97 |     except Exception:
98 |         return str(ex)
99 | 
100 | def pick_reference(ex: Dict[str, Any]) -> Optional[str]:
101 |     for k in REFERENCE_FIELDS:
102 |         if k in ex and ex[k] is not None:
103 |             return str(ex[k])
104 |     if "rejected" in ex and ex.get("chosen"):
105 |         return str(ex["chosen"])
106 |     return None
107 | 
108 | def map_to_prompt_reference(ds: Dataset) -> Dataset:
109 |     def mapper(ex):
110 |         return {"prompt": synthesize_prompt(ex), "reference": pick_reference(ex)}
111 |     mapped = ds.map(mapper)
112 |     for col in list(mapped.column_names):
113 |         if col not in {"prompt","reference"}:
114 |             try:
115 |                 mapped = mapped.remove_columns(col)
116 |             except Exception:
117 |                 pass
118 |     return mapped
119 | 
120 | # ---------- reward ----------
121 | def _to_text(c: Any)
-> str: 122 | # Handle plain text or chat-style structures 123 | if isinstance(c, str): 124 | return c 125 | if isinstance(c, list): 126 | # list of messages [{"role":..., "content":...}, ...] 127 | parts = [] 128 | for m in c: 129 | if isinstance(m, dict): 130 | parts.append(str(m.get("content",""))) 131 | else: 132 | parts.append(str(m)) 133 | return "\n".join(parts) 134 | if isinstance(c, dict): 135 | return str(c.get("content", c.get("text",""))) 136 | return str(c) 137 | 138 | def extract_number(s: str) -> Optional[str]: 139 | m = re.search(r"(?:####\s*)?(-?\d+(?:\.\d+)?)\s*$", (s or "").strip()) 140 | return m.group(1) if m else None 141 | 142 | def reward_function(completions: List[Any], references: Optional[List[Optional[str]]] = None, **kwargs) -> List[float]: 143 | rewards: List[float] = [] 144 | for i, comp in enumerate(completions): 145 | txt = _to_text(comp) or "" 146 | r = 0.0 147 | # length heuristic (keep answers compact-ish) 148 | tokens_est = max(1, len(txt) // 4) 149 | if tokens_est <= 256: r += 0.2 150 | elif tokens_est <= 512: r += 0.1 151 | else: r -= 0.2 152 | # formatting bonus 153 | if re.search(r"(final answer|answer)\s*[:\-]", txt, flags=re.I): 154 | r += 0.2 155 | # reference-based 156 | ref = references[i] if references is not None and i < len(references) else None 157 | if ref: 158 | if normalize_text(txt) == normalize_text(ref): 159 | r += 1.0 160 | ns, nr = extract_number(txt), extract_number(ref) 161 | if ns is not None and nr is not None and ns == nr: 162 | r += 0.6 163 | # boilerplate penalty 164 | if "as an ai" in normalize_text(txt): 165 | r -= 0.3 166 | rewards.append(float(r)) 167 | return rewards 168 | 169 | # ---------- split auto-detection ---------- 170 | PREFERRED_TRAIN_SPLITS = ["train_sft","train","train_gen","train_prefs","train_all"] 171 | PREFERRED_EVAL_SPLITS = ["test_sft","validation","valid","test","test_gen","test_prefs"] 172 | 173 | def pick_splits_from_dataset_dict(dd: DatasetDict) -> Tuple[str, Optional[str]]: 174 | keys = set(dd.keys()) 175 | train = next((k for k in PREFERRED_TRAIN_SPLITS if k in keys), None) 176 | if train is None: 177 | train = next((k for k in keys if "train" in k), next(iter(keys))) 178 | eval_split = next((k for k in PREFERRED_EVAL_SPLITS if k in keys), None) 179 | if eval_split is None: 180 | for k in keys: 181 | if (("test" in k) or ("valid" in k)) and k != train: 182 | eval_split = k 183 | break 184 | if eval_split == train: 185 | eval_split = None 186 | return train, eval_split 187 | 188 | def safe_load_dataset(name: str, cfg: Optional[str], split: Optional[str], local_only: bool) -> Union[Dataset, DatasetDict]: 189 | dlc = DownloadConfig(local_files_only=local_only) if local_only else None 190 | if split: 191 | return load_dataset(name, cfg, split=split, download_config=dlc) 192 | return load_dataset(name, cfg, download_config=dlc) 193 | 194 | def auto_load_hf_splits(name: str, cfg: Optional[str], local_only: bool) -> Tuple[Dataset, Optional[Dataset], str, Optional[str]]: 195 | try: 196 | dd_or_ds = safe_load_dataset(name, cfg, None, local_only) 197 | if isinstance(dd_or_ds, DatasetDict): 198 | train_split, eval_split = pick_splits_from_dataset_dict(dd_or_ds) 199 | train_ds = map_to_prompt_reference(dd_or_ds[train_split]) 200 | eval_ds = map_to_prompt_reference(dd_or_ds[eval_split]) if (eval_split and eval_split in dd_or_ds) else None 201 | return train_ds, eval_ds, train_split, eval_split 202 | else: 203 | return map_to_prompt_reference(dd_or_ds), None, "train", None 204 | except Exception: 205 | try: 206 
| dlc = DownloadConfig(local_files_only=local_only) if local_only else None 207 | use_cfg = cfg or (get_dataset_config_names(name, download_config=dlc)[0] if get_dataset_config_names(name, download_config=dlc) else None) 208 | splits = get_dataset_split_names(name, use_cfg, download_config=dlc) 209 | train_choice = next((s for s in PREFERRED_TRAIN_SPLITS if s in splits), None) or next((s for s in splits if "train" in s), splits[0]) 210 | eval_choice = next((s for s in PREFERRED_EVAL_SPLITS if s in splits), None) 211 | train_ds = map_to_prompt_reference(safe_load_dataset(name, use_cfg, train_choice, local_only)) # type: ignore 212 | eval_ds = map_to_prompt_reference(safe_load_dataset(name, use_cfg, eval_choice, local_only)) if eval_choice else None # type: ignore 213 | return train_ds, eval_ds, train_choice, eval_choice 214 | except Exception as e: 215 | raise RuntimeError(f"Could not auto-detect splits for dataset '{name}': {e}") 216 | 217 | # ---------- main ---------- 218 | def maybe_login_hf(token: Optional[str]): 219 | if not token: 220 | return 221 | try: 222 | from huggingface_hub import login 223 | login(token=token, add_to_git_credential=True) 224 | print("[INFO] HF token set.") 225 | except Exception as e: 226 | print(f"[WARN] HF login failed (continuing): {e}") 227 | 228 | def main(): 229 | ap = argparse.ArgumentParser(description="Local GRPO (TRL) on Windows with HF models/datasets") 230 | # model & data 231 | ap.add_argument("--base-model", required=True) 232 | ap.add_argument("--dataset", default=None) 233 | ap.add_argument("--dataset-config", default=None) 234 | ap.add_argument("--train-jsonl", default=None) 235 | ap.add_argument("--eval-jsonl", default=None) 236 | ap.add_argument("--output-dir", required=True) 237 | 238 | # training 239 | ap.add_argument("--max-prompt-len", type=int, default=512) 240 | ap.add_argument("--max-gen-len", type=int, default=128) 241 | ap.add_argument("--num-epochs", type=int, default=1) 242 | ap.add_argument("--per-device-batch", type=int, default=1) 243 | ap.add_argument("--grad-accum", type=int, default=8) 244 | ap.add_argument("--lr", type=float, default=5e-6) 245 | ap.add_argument("--warmup-ratio", type=float, default=0.03) 246 | ap.add_argument("--weight-decay", type=float, default=0.0) 247 | ap.add_argument("--save-steps", type=int, default=1000) 248 | ap.add_argument("--logging-steps", type=int, default=10) 249 | ap.add_argument("--seed", type=int, default=42) 250 | 251 | # GRPO sampling 252 | ap.add_argument("--num-generations", type=int, default=4, help="completions per prompt (group size)") 253 | ap.add_argument("--num-iterations", type=int, default=1, help="policy updates per generation batch") 254 | 255 | # system / precision 256 | ap.add_argument("--attn-impl", default="sdpa", choices=["eager","sdpa"]) 257 | ap.add_argument("--bf16", action="store_true") 258 | ap.add_argument("--fp16", action="store_true") 259 | ap.add_argument("--local-only", action="store_true") 260 | 261 | # LoRA / quant 262 | ap.add_argument("--lora-off", action="store_true") 263 | ap.add_argument("--lora-r", type=int, default=16) 264 | ap.add_argument("--lora-alpha", type=float, default=32.0) 265 | ap.add_argument("--lora-dropout", type=float, default=0.05) 266 | ap.add_argument("--lora-target", default="q_proj,k_proj,v_proj,o_proj,gate_proj,up_proj,down_proj") 267 | ap.add_argument("--use-4bit", action="store_true") 268 | 269 | # hub 270 | ap.add_argument("--hub-token", default=None) 271 | 272 | args = ap.parse_args() 273 | 274 | # env/perf 275 | 
os.environ.setdefault("TOKENIZERS_PARALLELISM","false") 276 | os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF","expandable_segments:True") 277 | if args.local_only: 278 | os.environ["HF_HUB_OFFLINE"] = "1" 279 | torch.backends.cuda.matmul.allow_tf32 = True 280 | try: torch.set_float32_matmul_precision("high") 281 | except Exception: pass 282 | 283 | maybe_login_hf(args.hub_token or os.getenv("HF_TOKEN")) 284 | 285 | device = "cuda" if torch.cuda.is_available() else "cpu" 286 | print(f"[INFO] Using device: {device}") 287 | 288 | # tokenizer 289 | tokenizer = AutoTokenizer.from_pretrained( 290 | args.base_model, use_fast=True, trust_remote_code=True, local_files_only=args.local_only 291 | ) 292 | if tokenizer.pad_token_id is None: 293 | tokenizer.pad_token = tokenizer.eos_token 294 | 295 | # 4-bit optional 296 | quant_cfg = None 297 | if args.use_4bit: 298 | try: 299 | quant_cfg = BitsAndBytesConfig( 300 | load_in_4bit=True, 301 | bnb_4bit_compute_dtype=torch.bfloat16 if bf16_supported() else torch.float16, 302 | bnb_4bit_use_double_quant=True, 303 | bnb_4bit_quant_type="nf4", 304 | ) 305 | except Exception as e: 306 | print(f"[WARN] bitsandbytes unavailable/incompatible: {e}") 307 | quant_cfg = None 308 | 309 | # precision + model kwargs (send attn_implementation here) 310 | prec = choose_precision(args) 311 | model_kwargs = dict( 312 | torch_dtype=torch.bfloat16 if prec["bf16"] else torch.float16, 313 | trust_remote_code=True, 314 | local_files_only=args.local_only, 315 | attn_implementation=args.attn_impl, 316 | ) 317 | if quant_cfg is not None: 318 | model_kwargs["quantization_config"] = quant_cfg 319 | model_kwargs["device_map"] = "auto" 320 | else: 321 | model_kwargs["device_map"] = {"": 0} if device == "cuda" else None 322 | 323 | model = AutoModelForCausalLM.from_pretrained(args.base_model, **model_kwargs) 324 | model.config.use_cache = False 325 | try: 326 | model.generation_config.pad_token_id = tokenizer.pad_token_id 327 | model.generation_config.eos_token_id = tokenizer.eos_token_id 328 | except Exception: 329 | pass 330 | 331 | # LoRA 332 | if not args.lora_off: 333 | targets = [m.strip() for m in args.lora_target.split(",") if m.strip()] 334 | lora_cfg = LoraConfig(r=args.lora_r, lora_alpha=args.lora_alpha, lora_dropout=args.lora_dropout, 335 | target_modules=targets, bias="none", task_type="CAUSAL_LM") 336 | model = get_peft_model(model, lora_cfg) 337 | model.print_trainable_parameters() 338 | 339 | # data 340 | if args.dataset: 341 | train_ds, eval_ds, tname, ename = auto_load_hf_splits(args.dataset, args.dataset_config, args.local_only) 342 | print(f"[INFO] Using dataset='{args.dataset}' config='{args.dataset_config}'") 343 | print(f"[INFO] Picked train split: {tname} | eval split: {ename}") 344 | elif args.train_jsonl: 345 | train_ds = map_to_prompt_reference(load_dataset("json", data_files=args.train_jsonl, split="train")) 346 | eval_ds = map_to_prompt_reference(load_dataset("json", data_files=args.eval_jsonl, split="train")) if args.eval_jsonl else None 347 | print(f"[INFO] Using local JSONL. 
Train: {args.train_jsonl} | Eval: {args.eval_jsonl}")
348 |     else:
349 |         raise ValueError("Provide either --dataset or --train-jsonl.")
350 | 
351 | 
352 |     # Subsample to N examples for a quick smoke run; raise or remove for full training
353 |     N = 100
354 |     train_ds = train_ds.shuffle(seed=args.seed).select(range(min(N, len(train_ds))))
355 |     if eval_ds is not None:
356 |         eval_ds = eval_ds.shuffle(seed=args.seed).select(range(min(N, len(eval_ds))))
357 |     # Reward refs (optional): TRL forwards extra dataset columns (here "reference")
358 |     # to reward functions as batch-aligned kwargs, so read them from there.
359 |     def _wrapped_reward(completions: List[Any], reference: Optional[List[Optional[str]]] = None, **kwargs) -> List[float]:
360 |         return reward_function(completions, reference, **kwargs)
361 | 
362 |     # GRPO args — note: no `group_size` in GRPOConfig; use `num_generations`
363 |     grpo_args = GRPOConfig(
364 |         output_dir=args.output_dir,
365 |         report_to=[],
366 |         learning_rate=args.lr,
367 |         weight_decay=args.weight_decay,
368 |         warmup_ratio=args.warmup_ratio,
369 |         num_train_epochs=args.num_epochs,
370 |         per_device_train_batch_size=args.per_device_batch,
371 |         gradient_accumulation_steps=args.grad_accum,
372 |         logging_steps=args.logging_steps,
373 |         save_steps=args.save_steps,
374 |         save_total_limit=2,
375 |         seed=args.seed,
376 |         fp16=prec["fp16"],
377 |         bf16=prec["bf16"],
378 |         # GRPO-specific
379 |         max_prompt_length=args.max_prompt_len,
380 |         max_completion_length=args.max_gen_len,
381 |         num_generations=args.num_generations,  # ← controls group size
382 |         num_iterations=args.num_iterations,
383 |         temperature=0.7,
384 |         top_p=0.9,
385 |         # do_sample=True,
386 |     )
387 | 
388 |     trainer = GRPOTrainer(
389 |         model=model,
390 |         processing_class=tokenizer,
391 |         args=grpo_args,
392 |         train_dataset=train_ds,
393 |         eval_dataset=eval_ds,
394 |         reward_funcs=[_wrapped_reward],
395 |         # dataset_text_field="prompt",
396 |     )
397 | 
398 |     trainer.train()
399 |     trainer.save_model(args.output_dir)
400 |     try: tokenizer.save_pretrained(args.output_dir)
401 |     except Exception: pass
402 |     print("[INFO] Done. Artifacts in:", args.output_dir)
403 | 
404 | if __name__ == "__main__":
405 |     main()
406 | 
--------------------------------------------------------------------------------
/projects/vllm-fine-tuning-smolvlm/train_smolvlm_chartqa.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3
2 | """
3 | 🚀 SmolVLM-256M ChartQA Fine-tuning with Lazy Loading
4 | 
5 | This script implements ultra-efficient fine-tuning of SmolVLM-256M on ChartQA dataset.
6 | Features maximum memory efficiency through lazy loading and streaming.
7 | 8 | LAZY LOADING IMPLEMENTATION: 9 | - Streaming dataset loading with load_dataset(streaming=True) 10 | - On-demand data processing using .take() and .skip() 11 | - Lazy map operations with keep_in_memory=False 12 | - Minimal memory footprint (< 2GB VRAM) 13 | - Fast startup time 14 | - Compatible with TRL library 15 | 16 | EFFICIENCY FEATURES: 17 | - SmolVLM-256M (256M parameters) - most efficient model available 18 | - Lazy loading prevents memory spikes 19 | - Optimized batch processing 20 | - Memory monitoring and warnings 21 | - Ultra-fast training (10-15 minutes on laptop) 22 | """ 23 | 24 | import os 25 | import platform 26 | import time 27 | import gc 28 | from typing import Dict 29 | 30 | import torch 31 | from datasets import load_dataset 32 | from huggingface_hub import login 33 | from PIL import Image # noqa: F401 (used implicitly by HF Datasets Image feature) 34 | 35 | from transformers import ( 36 | AutoProcessor, 37 | BitsAndBytesConfig, 38 | Idefics3ForConditionalGeneration, 39 | AutoTokenizer, 40 | ) 41 | 42 | # Safe imports - only import what actually exists 43 | from peft import LoraConfig 44 | 45 | # Import TRL components safely 46 | try: 47 | from trl import SFTConfig, SFTTrainer 48 | TRL_AVAILABLE = True 49 | except ImportError: 50 | print("❌ TRL not available - install with: pip install trl") 51 | TRL_AVAILABLE = False 52 | 53 | 54 | # ========================= 55 | # Config 56 | # ========================= 57 | MODEL_ID = "HuggingFaceTB/SmolVLM-256M-Instruct" 58 | DATASET_ID = "HuggingFaceM4/ChartQA" 59 | OUTPUT_DIR = "smolvlm-256m-chartqa-sft" 60 | DATASET_SLICE = "[:80%]" # Use 80% of data for training, 20% for validation 61 | PUSH_TO_HUB = False 62 | HF_REPO_ID = None 63 | SEED = 42 64 | 65 | SYSTEM_MESSAGE = ( 66 | """ 67 | You are a Vision Language Model specialized in interpreting visual data from chart images. 68 | Your task is to analyze the provided chart image and respond to queries with concise answers, usually a single word, number, or short phrase. 69 | The charts include a variety of types (e.g., line charts, bar charts) and contain colors, labels, and text. 70 | Focus on delivering accurate, succinct answers based on the visual information. Avoid additional explanation unless absolutely necessary. 
71 | """ 72 | ) 73 | 74 | # LoRA config will be created dynamically based on argparse parameters 75 | 76 | 77 | # Training config (ultra-fast for SmolVLM-256M) 78 | TRAINING_KW = dict( 79 | num_train_epochs=1, # Tiny model learns fast 80 | per_device_train_batch_size=4, # Higher batch size for tiny model 81 | per_device_eval_batch_size=4, # Higher eval batch size 82 | gradient_accumulation_steps=4, # Reduced for faster training 83 | learning_rate=5e-4, # Higher LR for fast learning 84 | weight_decay=0.01, 85 | warmup_ratio=0.05, # Shorter warmup 86 | logging_steps=5, # Very frequent logging 87 | save_strategy="steps", 88 | save_steps=10, # Save every 10 steps 89 | save_total_limit=3, # Keep more checkpoints 90 | eval_strategy="steps", # Add validation 91 | eval_steps=10, # Evaluate every 10 steps 92 | load_best_model_at_end=True, # Load best model 93 | metric_for_best_model="eval_loss", # Use eval loss for best model 94 | greater_is_better=False, # Lower loss is better 95 | gradient_checkpointing=False, # Not needed for tiny model 96 | max_grad_norm=1.0, 97 | report_to="none", 98 | push_to_hub=PUSH_TO_HUB, 99 | output_dir=OUTPUT_DIR, 100 | max_steps=200, # Specify max_steps for streaming datasets 101 | ) 102 | 103 | 104 | # =========================================== 105 | # Utilities 106 | # =========================================== 107 | def clear_memory(): 108 | """Ultra-efficient memory management for tiny model""" 109 | # Force garbage collection 110 | gc.collect() 111 | 112 | # Clear CUDA cache efficiently 113 | if torch.cuda.is_available(): 114 | torch.cuda.empty_cache() 115 | torch.cuda.synchronize() 116 | allocated = torch.cuda.memory_allocated() / 2**30 117 | reserved = torch.cuda.memory_reserved() / 2**30 118 | print(f"[GPU] allocated: {allocated:.2f} GB | reserved: {reserved:.2f} GB") 119 | 120 | # SmolVLM-256M should use very little memory 121 | if allocated > 2.0: # Warning if over 2GB 122 | print("WARNING: High memory usage detected") 123 | else: 124 | print("[CPU] CUDA not available.") 125 | 126 | def optimize_memory_for_tiny_model(): 127 | """Memory optimizations specifically for SmolVLM-256M""" 128 | # Set environment variables for efficiency 129 | os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True' 130 | os.environ['CUDA_LAUNCH_BLOCKING'] = '0' # Non-blocking launches 131 | 132 | print("Memory optimizations activated for SmolVLM-256M") 133 | 134 | 135 | def bf16_supported() -> bool: 136 | if not torch.cuda.is_available(): 137 | return False 138 | try: 139 | return torch.cuda.is_bf16_supported() 140 | except (AttributeError, RuntimeError): 141 | # Fallback: check compute capability for GPUs that support bfloat16 142 | try: 143 | major, minor = torch.cuda.get_device_capability() 144 | # RTX 30-series and newer support bfloat16 (compute capability >= 8.0) 145 | return major >= 8 146 | except Exception: 147 | return False 148 | 149 | 150 | 151 | def format_sample(sample: Dict) -> Dict: 152 | answer = sample["label"][0] if isinstance(sample.get("label"), list) else sample["label"] 153 | 154 | return { 155 | "images": [sample["image"]], 156 | "messages": [ 157 | {"role": "system", "content": [{"type": "text", "text": SYSTEM_MESSAGE}]}, 158 | { 159 | "role": "user", 160 | "content": [ 161 | {"type": "image", "image": sample["image"]}, 162 | {"type": "text", "text": sample["query"]}, 163 | ], 164 | }, 165 | {"role": "assistant", "content": [{"type": "text", "text": answer}]}, 166 | ], 167 | } 168 | 169 | 170 | # =========================================== 171 | 
# Main
172 | # ===========================================
173 | def main():
174 |     import argparse  # Add argparse for command-line arguments
175 | 
176 |     # Parse arguments
177 |     parser = argparse.ArgumentParser(description='Fine-tune SmolVLM-256M with high-performance configurable parameters.')
178 |     parser.add_argument('--batch_size', type=int, default=16, help='Batch size for training and evaluation (default: 16, optimized for 16GB GPU)')
179 |     parser.add_argument('--memory_limit', type=float, default=14.0, help='Maximum memory limit in GB to monitor and warn (default: 14.0 GB for 16GB GPU)')
180 |     parser.add_argument('--learning_rate', type=float, default=1e-3, help='Learning rate (default: 1e-3, higher for faster convergence)')
181 |     parser.add_argument('--epochs', type=int, default=2, help='Number of training epochs (default: 2)')
182 |     parser.add_argument('--max_steps', type=int, default=500, help='Maximum training steps (default: 500, more steps for better learning)')
183 |     parser.add_argument('--gradient_accumulation', type=int, default=2, help='Gradient accumulation steps (default: 2, reduced for memory efficiency)')
184 |     parser.add_argument('--enable_gradient_checkpointing', action='store_true', help='Enable gradient checkpointing for memory efficiency')
185 |     parser.add_argument('--mixed_precision', type=str, default='bf16', choices=['fp16', 'bf16', 'fp32'], help='Mixed precision training (default: bf16)')
186 | 
187 |     # LoRA hyperparameters
188 |     parser.add_argument('--lora_r', type=int, default=16, help='LoRA rank (default: 16, increased for better capacity)')
189 |     parser.add_argument('--lora_alpha', type=int, default=32, help='LoRA alpha scaling factor (default: 32)')
190 |     parser.add_argument('--lora_dropout', type=float, default=0.05, help='LoRA dropout rate (default: 0.05)')
191 | 
192 |     args = parser.parse_args()
193 | 
194 |     torch.manual_seed(SEED)
195 | 
196 |     # ACTIVATE ULTRA-EFFICIENT MODE
197 |     print("Activating SmolVLM-256M ultra-efficient training mode...")
198 |     optimize_memory_for_tiny_model()
199 | 
200 |     # Create dynamic LoRA config with parsed arguments and validation
201 |     if args.lora_r <= 0:
202 |         raise ValueError(f"LoRA rank (r) must be positive, got {args.lora_r}")
203 |     if args.lora_alpha <= 0:
204 |         raise ValueError(f"LoRA alpha must be positive, got {args.lora_alpha}")
205 |     if not 0 <= args.lora_dropout < 1:
206 |         raise ValueError(f"LoRA dropout must be in [0, 1), got {args.lora_dropout}")
207 | 
208 |     # Calculate effective learning rate scaling
209 |     effective_lr_scale = args.lora_alpha / args.lora_r
210 | 
211 |     global PEFT_CONFIG
212 |     PEFT_CONFIG = LoraConfig(
213 |         r=args.lora_r,
214 |         lora_alpha=args.lora_alpha,
215 |         lora_dropout=args.lora_dropout,
216 |         target_modules=[
217 |             'down_proj', 'o_proj', 'k_proj', 'q_proj',
218 |             'gate_proj', 'up_proj', 'v_proj'
219 |         ],
220 |         use_dora=True,
221 |         init_lora_weights="gaussian",
222 |     )
223 | 
224 |     print(f"🎯 LoRA Configuration:")
225 |     print(f" • Rank (r): {args.lora_r}")
226 |     print(f" • Alpha: {args.lora_alpha}")
227 |     print(f" • Dropout: {args.lora_dropout}")
228 |     print(f" • Effective LR Scale: {effective_lr_scale:.2f}")
229 |     print(f" • Target Modules: {len(PEFT_CONFIG.target_modules)} attention/MLP projections per block")
230 | 
231 |     # Update TRAINING_KW with parsed arguments for high-performance training
232 |     TRAINING_KW['num_train_epochs'] = args.epochs
233 |     TRAINING_KW['per_device_train_batch_size'] = args.batch_size
234 |     TRAINING_KW['per_device_eval_batch_size'] = args.batch_size
235 | 
TRAINING_KW['gradient_accumulation_steps'] = args.gradient_accumulation
236 |     TRAINING_KW['learning_rate'] = args.learning_rate
237 |     TRAINING_KW['max_steps'] = args.max_steps
238 |     TRAINING_KW['gradient_checkpointing'] = args.enable_gradient_checkpointing
239 | 
240 |     # Set mixed precision based on argument (override compute_dtype if specified)
241 |     if args.mixed_precision == 'bf16':
242 |         TRAINING_KW['bf16'] = True
243 |         TRAINING_KW['fp16'] = False
244 |         compute_dtype = torch.bfloat16
245 |         print(f"🔧 Mixed precision set to BF16 (optimal for performance)")
246 |     elif args.mixed_precision == 'fp16':
247 |         TRAINING_KW['bf16'] = False
248 |         TRAINING_KW['fp16'] = True
249 |         compute_dtype = torch.float16
250 |         print(f"🔧 Mixed precision set to FP16")
251 |     else:
252 |         TRAINING_KW['bf16'] = False
253 |         TRAINING_KW['fp16'] = False
254 |         compute_dtype = torch.float32
255 |         print(f"🔧 Mixed precision disabled (FP32)")
256 |         # Adjust batch size for FP32 (uses more memory)
257 |         if args.batch_size > 8:
258 |             args.batch_size = max(8, args.batch_size // 2)
259 |             TRAINING_KW['per_device_train_batch_size'] = args.batch_size
260 |             TRAINING_KW['per_device_eval_batch_size'] = args.batch_size
261 |             print(f"⚠️ Reduced batch size to {args.batch_size} for FP32 compatibility")
262 | 
263 |     # Add memory limit check for high-performance monitoring (targeting 12GB+ usage)
264 |     if torch.cuda.is_available() and args.memory_limit > 0:
265 |         # Validate memory limit doesn't exceed GPU capacity
266 |         if args.memory_limit > 16.0:
267 |             print(f"⚠️ Memory limit {args.memory_limit}GB exceeds typical 16GB GPU capacity, reducing to 15.5GB")
268 |             args.memory_limit = 15.5
269 | 
270 |         # Set allocator split size (no space after the colon, or PyTorch rejects the option)
271 |         memory_mb = int(args.memory_limit * 1024)
272 |         os.environ['PYTORCH_CUDA_ALLOC_CONF'] = f'max_split_size_mb:{memory_mb},expandable_segments:True'
273 |         print(f"🎯 Targeting {args.memory_limit}GB GPU memory usage ({args.memory_limit * 100 / 16:.1f}% of 16GB)")
274 |         print(f"🔧 Memory allocation: {memory_mb}MB max split size")
275 | 
276 |     print(f"🚀 High-Performance Training Configuration:")
277 |     print(f" • Batch Size: {args.batch_size}")
278 |     print(f" • Learning Rate: {args.learning_rate}")
279 |     print(f" • Epochs: {args.epochs}")
280 |     print(f" • Max Steps: {args.max_steps}")
281 |     print(f" • Gradient Accumulation: {args.gradient_accumulation}")
282 |     print(f" • Mixed Precision: {args.mixed_precision}")
283 |     print(f" • Memory Target: {args.memory_limit}GB")
284 |     print(f" • Gradient Checkpointing: {'Enabled' if args.enable_gradient_checkpointing else 'Disabled'}")
285 |     print(f" • LoRA Config: r={args.lora_r}, α={args.lora_alpha}, dropout={args.lora_dropout}")
286 | 
287 |     if PUSH_TO_HUB:
288 |         try:
289 |             login()
290 |         except Exception as e:
291 |             print(f"[WARN] HF login failed: {e}")
292 | 
293 |     print(f"Loading dataset: {DATASET_ID}")
294 | 
295 |     # ULTRA-EFFICIENT LAZY LOADING (Memory Optimized)
296 |     print("Implementing lazy loading for maximum memory efficiency...")
297 | 
298 |     # Enable streaming for true lazy loading
299 |     print("Loading dataset in streaming mode...")
300 |     full_dataset = load_dataset(DATASET_ID, streaming=True)
301 | 
302 |     # For streaming datasets, we need to load a small portion to estimate size
303 |     # This is still much more memory efficient than loading the full dataset
304 |     sample_dataset = load_dataset(DATASET_ID, split="train[:100]")  # Load small sample
305 |     estimated_total_train = len(sample_dataset) * 10  # Crude placeholder: 100-row sample × 10, not an exact split size
306 | 
train_size = int(0.8 * estimated_total_train) 307 | 308 | print(f"Dataset info: ~{estimated_total_train} training samples (estimated)") 309 | print(f"Lazy loading: Active - minimal memory usage") 310 | 311 | # Create truly lazy splits 312 | raw_train = full_dataset["train"].take(train_size) 313 | raw_val = full_dataset["train"].skip(train_size).take(int(0.2 * estimated_total_train)) 314 | raw_test = full_dataset["val"] 315 | 316 | print(f"Training samples: {train_size} (lazy streaming)") 317 | print(f"Validation samples: {int(0.2 * estimated_total_train)} (lazy streaming)") 318 | print(f"Test samples: Lazy loaded (streaming)") 319 | 320 | print("Lazy data formatting - only processes when training starts...") 321 | 322 | # Use lazy map for streaming datasets (limited options available) 323 | # This is the key to true lazy loading - data is processed on-the-fly during training 324 | train_ds = raw_train.map( 325 | format_sample, 326 | remove_columns=list(full_dataset["train"].column_names), 327 | # Note: streaming datasets have limited map options 328 | ) 329 | eval_ds = raw_val.map( 330 | format_sample, 331 | remove_columns=list(full_dataset["train"].column_names), 332 | ) 333 | test_ds = raw_test.map( 334 | format_sample, 335 | remove_columns=list(full_dataset["val"].column_names), 336 | ) 337 | 338 | print("Lazy loading configured:") 339 | print(" Data will be processed on-demand during training") 340 | print(" Minimal memory footprint until needed") 341 | print(" Training can start immediately") 342 | 343 | # Monitor memory usage to prove lazy loading efficiency 344 | clear_memory() 345 | print(" Current memory status: Data not loaded yet") 346 | 347 | clear_memory() 348 | 349 | # Use the compute_dtype set by mixed precision arguments, with hardware validation 350 | if not bf16_supported() and compute_dtype == torch.bfloat16: 351 | print(f"⚠️ BF16 not supported by hardware, falling back to FP16") 352 | compute_dtype = torch.float16 353 | # Update TRAINING_KW accordingly 354 | TRAINING_KW['bf16'] = False 355 | TRAINING_KW['fp16'] = True 356 | 357 | print(f"Using compute dtype: {compute_dtype} (mixed precision: {args.mixed_precision})") 358 | 359 | # SmolVLM-256M is tiny - no quantization needed! 
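    # --- Optional sketch (an assumption, not part of the original flow): instead
    # of the crude ×10 estimate above, exact split sizes can be read from the
    # dataset metadata via load_dataset_builder, which fetches only the dataset
    # info and never downloads the data itself. Safe to delete.
    try:
        from datasets import load_dataset_builder
        _builder = load_dataset_builder(DATASET_ID)
        _exact_train = _builder.info.splits["train"].num_examples
        print(f"[INFO] Exact train split size from metadata: {_exact_train} samples")
    except Exception as _e:
        print(f"[INFO] Split metadata unavailable (offline?): {_e}")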
360 | print(f"Loading ultra-efficient SmolVLM-256M model: {MODEL_ID}") 361 | processor = AutoProcessor.from_pretrained(MODEL_ID) 362 | 363 | # Fix for Idefics3Processor missing pad_token attribute 364 | # Set pad_token explicitly if not present 365 | if not hasattr(processor, 'pad_token') or processor.pad_token is None: 366 | # Try to set pad_token from tokenizer if available 367 | if hasattr(processor, 'tokenizer') and hasattr(processor.tokenizer, 'pad_token') and processor.tokenizer.pad_token is not None: 368 | processor.pad_token = processor.tokenizer.pad_token 369 | else: 370 | # Fallback: use eos_token as pad_token or set a default 371 | if hasattr(processor, 'eos_token') and processor.eos_token is not None: 372 | processor.pad_token = processor.eos_token 373 | else: 374 | # Last resort: set a default pad token 375 | processor.pad_token = "" 376 | 377 | model = Idefics3ForConditionalGeneration.from_pretrained( 378 | MODEL_ID, 379 | device_map="auto", 380 | torch_dtype=compute_dtype, 381 | # No quantization needed for 256M model 382 | ) 383 | 384 | # SmolVLM-256M handles tokens automatically 385 | print("SmolVLM-256M loaded successfully - tiny but mighty!") 386 | 387 | training_args = SFTConfig( 388 | **TRAINING_KW, 389 | optim="adamw_torch", 390 | save_safetensors=True, 391 | seed=SEED, 392 | pad_token=processor.pad_token, 393 | eos_token=processor.eos_token if hasattr(processor, 'eos_token') else None, 394 | ) 395 | 396 | # SIMPLE DATA COLLATOR (TRL Compatible) 397 | print("Using simple data collator for SmolVLM-256M...") 398 | 399 | # TRL will handle data collation automatically for vision models 400 | data_collator = None 401 | 402 | # Check if TRL is available 403 | if not TRL_AVAILABLE: 404 | raise ImportError("TRL library is required for training. 
Install with: pip install trl") 405 | 406 | # Fix for Idefics3Processor missing required methods 407 | # Use the tokenizer component of the processor instead 408 | if hasattr(processor, 'tokenizer'): 409 | processing_class = processor.tokenizer 410 | else: 411 | processing_class = processor 412 | 413 | trainer = SFTTrainer( 414 | model=model, 415 | args=training_args, 416 | train_dataset=train_ds, 417 | eval_dataset=eval_ds, 418 | peft_config=PEFT_CONFIG, 419 | processing_class=processing_class, 420 | data_collator=data_collator, 421 | 422 | ) 423 | 424 | print("🚀 Starting HIGH-PERFORMANCE SmolVLM-256M training with lazy loading...") 425 | print(f"Expected time: ~15-25 minutes on laptop (with {args.epochs} epochs)") 426 | print(f"Expected memory: {args.memory_limit}GB VRAM usage (optimized for 16GB GPU)") 427 | print("Lazy loading: Active - data processed on-demand") 428 | print(f"Performance Mode: Ultra-High (Batch: {args.batch_size}, LR: {args.learning_rate})") 429 | print("=" * 80) 430 | 431 | try: 432 | trainer.train() 433 | print("✅ Training completed successfully!") 434 | except KeyboardInterrupt: 435 | print("[WARN] Training interrupted by user.") 436 | except Exception as e: 437 | print(f"[ERROR] Training failed: {e}") 438 | # Try to save partial model if possible 439 | try: 440 | trainer.save_model(OUTPUT_DIR + "_partial") 441 | print(f"💾 Partial model saved to {OUTPUT_DIR}_partial") 442 | except: 443 | print("❌ Could not save partial model") 444 | 445 | print(f"💾 Saving final model to {OUTPUT_DIR} …") 446 | trainer.save_model(OUTPUT_DIR) 447 | 448 | if PUSH_TO_HUB: 449 | repo_id = HF_REPO_ID or os.path.basename(OUTPUT_DIR) 450 | trainer.push_to_hub(repo_id=repo_id) 451 | 452 | clear_memory() 453 | print("Done.") 454 | 455 | 456 | if __name__ == "__main__": 457 | main() 458 | -------------------------------------------------------------------------------- /projects/financial-reasoning-enhanced/financial_reasoning_enhanced.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Financial "Thinking" Fine-Tune on Gemma 3 270M: SFT + GRPO (Windows-friendly) 4 | 5 | Highlights 6 | - Multi-level reward: 7 | (1) strict format gate, 8 | (2) reasoning bundle (quality, logic, context), 9 | (3) FinBERT teacher alignment, 10 | (4) confidence calibration, 11 | (5) directional consistency. 12 | - Clean prompts with hard contracts. 13 | - Argparse overrides for datasets, steps, batches, lr, lengths, generations, beta, decoding, etc. 14 | - Works with Unsloth 4-bit; optional bitsandbytes not required explicitly. 15 | 16 | Tested with (approx): transformers≈4.55, trl≈0.14.x, datasets≈3.x, torch 2.7, Windows+CUDA. 
17 | """
18 | 
19 | import os
20 | os.environ["TORCHDYNAMO_DISABLE"] = "1"  # safer on Windows
21 | import argparse
22 | import re
23 | import gc
24 | import json
25 | from dataclasses import dataclass
26 | from typing import Any, Dict, List, Optional, Tuple
27 | from unsloth import FastLanguageModel
28 | import torch
29 | import torch.nn.functional as F
30 | from datasets import load_dataset, Dataset
31 | from transformers import TextStreamer
32 | 
33 | 
34 | # Optional cosmetics (not required)
35 | try:
36 |     import seaborn as sns
37 |     SEABORN_OK = True
38 | except Exception:
39 |     SEABORN_OK = False
40 | 
41 | import matplotlib.pyplot as plt
42 | 
43 | # TRL
44 | from trl import SFTTrainer, SFTConfig, GRPOConfig, GRPOTrainer
45 | 
46 | # --- FinBERT Teacher ----------------------------------------------------------
47 | from transformers import (
48 |     AutoTokenizer as HFAutoTokenizer,
49 |     AutoModelForSequenceClassification,
50 | )
51 | 
52 | class FinBERTTeacher:
53 |     def __init__(self, device: Optional[str] = None):
54 |         self.labels = ["negative", "neutral", "positive"]
55 |         self.tok = HFAutoTokenizer.from_pretrained("ProsusAI/finbert")
56 |         self.model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")
57 |         dev = device or ("cuda" if torch.cuda.is_available() else "cpu")
58 |         self.model.to(dev).eval()
59 | 
60 |     @torch.no_grad()
61 |     def predict_proba(self, text: str) -> Dict[str, float]:
62 |         if not text:
63 |             # equal uncertainty if blank
64 |             return {k: 1.0 / len(self.labels) for k in self.labels}
65 |         enc = self.tok(
66 |             text[:1024],
67 |             return_tensors="pt",
68 |             truncation=True,
69 |             max_length=512,
70 |         ).to(self.model.device)
71 |         logits = self.model(**enc).logits
72 |         probs = F.softmax(logits, dim=-1).squeeze().tolist()
73 |         return dict(zip(self.labels, probs))
74 | 
75 | # --- Tokens & Prompts (XML-style tags the model must emit around each section) --
76 | REASONING_START = "<REASONING>"
77 | REASONING_END = "</REASONING>"
78 | SENTIMENT_START = "<SENTIMENT>"
79 | SENTIMENT_END = "</SENTIMENT>"
80 | CONFIDENCE_START = "<CONFIDENCE>"
81 | CONFIDENCE_END = "</CONFIDENCE>"
82 | 
83 | SYSTEM_FINANCIAL_SFT = (
84 |     "You are a financial analyst. Analyze sentiment with clear, balanced reasoning.\n"
85 |     "OUTPUT CONTRACT (exactly this structure):\n"
86 |     f"{REASONING_START} 2–3 concise sentences with both positives and negatives; use financial terms and connectives (because/however/therefore). {REASONING_END}\n"
87 |     f"{SENTIMENT_START} one of: positive | negative | neutral {SENTIMENT_END}\n"
88 |     f"{CONFIDENCE_START} a decimal between 0.1 and 1.0 {CONFIDENCE_END}\n"
89 |     "Do not add any other sections or tags.\n"
90 |     "\n"
91 |     "Example:\n"
92 |     f"{REASONING_START} Revenue growth is strong but margins compressed due to higher input costs; guidance is cautious, suggesting near-term volatility. Therefore, outlook is balanced with upside from new products. {REASONING_END}\n"
93 |     f"{SENTIMENT_START} neutral {SENTIMENT_END}\n"
94 |     f"{CONFIDENCE_START} 0.72 {CONFIDENCE_END}"
95 | )
96 | 
97 | SYSTEM_FINANCIAL_GRPO = (
98 |     "Follow the exact contract. Keep the reasoning compact and balanced.\n"
99 |     f"{REASONING_START} ...
{REASONING_END}\n" 100 | f"{SENTIMENT_START} positive|negative|neutral {SENTIMENT_END}\n" 101 | f"{CONFIDENCE_START} 0.1–1.0 {CONFIDENCE_END}" 102 | ) 103 | 104 | # --- Config ------------------------------------------------------------------- 105 | @dataclass 106 | class FinancialConfig: 107 | model_name: str = "unsloth/gemma-3-270m-it" 108 | max_seq_length: int = 512 109 | 110 | # SFT 111 | sft_epochs: int = 3 112 | sft_batch_size: int = 4 113 | sft_grad_accum: int = 2 114 | sft_lr: float = 1e-4 115 | sft_warmup: float = 0.1 116 | sft_weight_decay: float = 0.01 117 | 118 | # GRPO 119 | grpo_epochs: float = 2.0 120 | grpo_batch_size: int = 1 121 | grpo_grad_accum: int = 4 122 | grpo_lr: float = 1e-5 123 | grpo_warmup: float = 0.1 124 | grpo_weight_decay: float = 0.01 125 | num_generations: int = 6 126 | max_completion_length: int = 256 127 | max_prompt_length: int = 512 128 | beta: float = 0.15 129 | temperature: float = 0.7 130 | top_p: float = 0.9 131 | 132 | # LoRA 133 | lora_rank: int = 32 134 | lora_alpha: int = 64 135 | 136 | # Data 137 | data_mode: str = "mixed" # [mixed|real|synthetic] 138 | max_real_examples: int = 200 139 | min_total_examples: int = 20 140 | 141 | # --- Synthetic fallback examples --------------------------------------------- 142 | SYNTHETIC = [ 143 | { 144 | "text": "Tech company reports 25% revenue growth but 8% profit decline due to R&D investment", 145 | "reasoning": "Revenue growth is strong, reflecting demand and expansion. Profit dipped due to R&D, which is a strategic cost with long-term upside. Near-term margins compress, but growth story remains intact.", 146 | "sentiment": "positive", 147 | "confidence": 0.75, 148 | }, 149 | { 150 | "text": "Bank announces 3% dividend increase while facing regulatory scrutiny over compliance issues", 151 | "reasoning": "Dividend increase signals capital strength and shareholder focus. However, regulatory scrutiny introduces material risk, including fines or constraints; this offsets the positive signal.", 152 | "sentiment": "negative", 153 | "confidence": 0.80, 154 | }, 155 | { 156 | "text": "Manufacturing firm reports stable earnings but warns of supply chain disruptions ahead", 157 | "reasoning": "Current results are stable, showing operational control. The forward warning raises uncertainty, with potential cost and delivery impacts. Overall, signals are mixed.", 158 | "sentiment": "neutral", 159 | "confidence": 0.65, 160 | }, 161 | ] 162 | 163 | # --- Dataset utilities -------------------------------------------------------- 164 | def load_phrasebank(split_cfg: str = "sentences_50agree", max_n: int = 200) -> List[Dict[str, Any]]: 165 | out = [] 166 | try: 167 | ds = load_dataset("financial_phrasebank", split="train", name=split_cfg, trust_remote_code=True) 168 | n = min(max_n, len(ds)) 169 | for ex in ds.select(range(n)): 170 | text = ex["sentence"] 171 | label = ex["label"] # 0 neg, 1 neu, 2 pos 172 | if label == 0: 173 | rsn = "Text implies risks or deteriorating performance; signals likely weigh on valuation." 174 | sent = "negative" 175 | conf = 0.75 176 | elif label == 1: 177 | rsn = "Information is balanced; positives and negatives offset each other, implying a wait-and-see stance." 178 | sent = "neutral" 179 | conf = 0.60 180 | else: 181 | rsn = "Text implies improving fundamentals or favorable momentum; sentiment is constructive." 
182 | sent = "positive" 183 | conf = 0.75 184 | out.append( 185 | {"text": text, "reasoning": rsn, "sentiment": sent, "confidence": conf, "source": "phrasebank"} 186 | ) 187 | except Exception as e: 188 | print(f"[WARN] Could not load Financial PhraseBank: {e}") 189 | return out 190 | 191 | def build_dataset(cfg: FinancialConfig, data_mode: str) -> Dataset: 192 | examples: List[Dict[str, Any]] = [] 193 | if data_mode in ("mixed", "real"): 194 | real = load_phrasebank(max_n=cfg.max_real_examples) 195 | examples.extend(real) 196 | if data_mode in ("mixed", "synthetic") or (not examples): 197 | examples.extend(SYNTHETIC) 198 | ds = Dataset.from_list(examples) 199 | if len(ds) < cfg.min_total_examples: 200 | print(f"[WARN] Only {len(ds)} examples; below min_total_examples={cfg.min_total_examples}. Training will still run.") 201 | return ds 202 | 203 | # --- Formatting for SFT / GRPO ------------------------------------------------ 204 | def format_sft(ex, tokenizer) -> Dict[str, Any]: 205 | messages = [ 206 | {"role": "system", "content": SYSTEM_FINANCIAL_SFT}, 207 | {"role": "user", "content": f"Analyze the sentiment of this financial news:\n{ex['text']}"}, 208 | { 209 | "role": "assistant", 210 | "content": ( 211 | f"{REASONING_START}{ex['reasoning']}{REASONING_END}\n" 212 | f"{SENTIMENT_START}{ex['sentiment']}{SENTIMENT_END}\n" 213 | f"{CONFIDENCE_START}{ex['confidence']}{CONFIDENCE_END}" 214 | ), 215 | }, 216 | ] 217 | enc = tokenizer.apply_chat_template(messages, tokenize=True) 218 | token_len = len(enc["input_ids"]) if isinstance(enc, dict) else len(enc) 219 | text_str = tokenizer.apply_chat_template(messages, tokenize=False) 220 | return {"token_len": token_len, "text": text_str} 221 | 222 | def format_grpo(ex, tokenizer) -> Dict[str, Any]: 223 | messages = [ 224 | {"role": "system", "content": SYSTEM_FINANCIAL_GRPO}, 225 | {"role": "user", "content": f"Analyze the sentiment of this financial news:\n{ex['text']}"}, 226 | ] 227 | prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) 228 | full_ids = tokenizer(prompt, truncation=False)["input_ids"] 229 | return {"prompt": prompt, "full_len": len(full_ids), "gold_text": ex["text"]} 230 | 231 | # --- Light analyzers for the reasoning bundle -------------------------------- 232 | class FinancialReasoningAnalyzer: 233 | def __init__(self): 234 | self.financial_terms = ["revenue", "profit", "margin", "guidance", "debt", "cash", "capex", "dividend"] 235 | self.connectives = ["because", "however", "although", "while", "despite", "therefore", "thus"] 236 | self.context_terms = ["market", "sector", "industry", "trend", "environment", "macro", "near-term", "long-term"] 237 | 238 | def quality(self, txt: str) -> float: 239 | t = txt.lower() 240 | score = 0.0 241 | score += min(sum(1 for w in self.financial_terms if w in t) / 3.0, 1.0) * 0.4 242 | score += min(sum(1 for w in self.connectives if w in t) / 2.0, 1.0) * 0.3 243 | # balanced: contains both positive and negative cues 244 | pos = any(w in t for w in ["growth", "increase", "improve", "strong", "up"]) 245 | neg = any(w in t for w in ["decline", "decrease", "worse", "weak", "down"]) 246 | score += (0.3 if (pos and neg) else 0.15 if (pos or neg) else 0.0) 247 | return max(0.0, min(1.0, score)) 248 | 249 | def logic(self, txt: str) -> float: 250 | t = txt.lower() 251 | # crude contradiction check 252 | contradictory = ("growth" in t and "decline" in t) or ("profit" in t and "loss" in t) 253 | score = 0.5 if not contradictory else 0.2 254 | if "therefore" in 
t or "thus" in t: 255 | score += 0.3 256 | if "mixed" in t or "uncertain" in t or "cautious" in t: 257 | score += 0.2 258 | return max(0.0, min(1.0, score)) 259 | 260 | def context(self, txt: str) -> float: 261 | t = txt.lower() 262 | c = min(sum(1 for w in self.context_terms if w in t) / 3.0, 1.0) * 0.6 263 | timing = 0.4 if ("short-term" in t or "long-term" in t or "near-term" in t) else 0.2 if "future" in t else 0.0 264 | return max(0.0, min(1.0, c + timing)) 265 | 266 | # --- Reward helpers ----------------------------------------------------------- 267 | def extract_components(s: str) -> Dict[str, str]: 268 | def _grab(a, b): 269 | m = re.search(rf"{re.escape(a)}(.*?){re.escape(b)}", s, re.DOTALL | re.IGNORECASE) 270 | return m.group(1).strip() if m else "" 271 | return { 272 | "reasoning": _grab(REASONING_START, REASONING_END), 273 | "sentiment": _grab(SENTIMENT_START, SENTIMENT_END).lower(), 274 | "confidence": _grab(CONFIDENCE_START, CONFIDENCE_END), 275 | } 276 | 277 | def reward_format_gate(txt: str) -> float: 278 | need = [(REASONING_START, REASONING_END), (SENTIMENT_START, SENTIMENT_END), (CONFIDENCE_START, CONFIDENCE_END)] 279 | ok = all(txt.count(s) == 1 and txt.count(e) == 1 for s, e in need) 280 | return 1.0 if ok else 0.0 281 | 282 | def parse_raw_text_from_prompt(prompt_str: str) -> str: 283 | key = "Analyze the sentiment of this financial news:\n" 284 | if key in prompt_str: 285 | return prompt_str.split(key, 1)[1].strip() 286 | return "" 287 | 288 | def reward_confidence_calibration(p_teacher: float, p_model: float) -> float: 289 | # Brier-like: 1 - (gap^2), clamp [0,1] 290 | try: 291 | return max(0.0, 1.0 - float(p_model - p_teacher) ** 2) 292 | except Exception: 293 | return 0.0 294 | 295 | def reward_directional(text: str, reasoning: str, sentiment: str) -> float: 296 | t = reasoning.lower() 297 | pos = any(w in t for w in ["increase", "growth", "improve", "up", "higher"]) 298 | neg = any(w in t for w in ["decrease", "decline", "worse", "down", "lower"]) 299 | if pos and neg and sentiment == "neutral": 300 | return 1.0 301 | if pos and not neg and sentiment == "positive": 302 | return 1.0 303 | if neg and not pos and sentiment == "negative": 304 | return 1.0 305 | return 0.0 306 | 307 | # Robust reward wrapper that matches TRL call orders across versions 308 | def make_rewards(analyzer: FinancialReasoningAnalyzer, teacher: FinBERTTeacher): 309 | def reward_gate(prompts=None, completions=None, **kwargs): 310 | # Accept either (completions, **kwargs) or (prompts, completions, **kwargs) 311 | comp_list = completions if completions is not None else kwargs.get("completions") or prompts 312 | return [reward_format_gate(str(c)) for c in comp_list] 313 | 314 | def reward_finance(prompts=None, completions=None, **kwargs): 315 | # Normalize args 316 | pr = prompts 317 | comp_list = completions 318 | if comp_list is None: 319 | # Some TRL versions pass (completions, **kwargs) 320 | comp_list = pr 321 | pr = kwargs.get("prompts", []) 322 | if pr is None: 323 | pr = [] 324 | pr = list(pr) 325 | comp_list = list(comp_list) 326 | 327 | scores = [] 328 | for p, c in zip(pr, comp_list): 329 | txt = str(c) 330 | gate = reward_format_gate(txt) 331 | if gate == 0.0: 332 | scores.append(0.0) 333 | continue 334 | 335 | comp = extract_components(txt) 336 | sent = comp["sentiment"] 337 | try: 338 | conf = float(comp["confidence"]) 339 | except Exception: 340 | conf = 0.0 341 | 342 | raw = parse_raw_text_from_prompt(str(p)) 343 | probs = teacher.predict_proba(raw) 344 | p_teacher = 
float(probs.get(sent, 0.0)) 345 | 346 | r_q = analyzer.quality(comp["reasoning"]) 347 | r_l = analyzer.logic(comp["reasoning"]) 348 | r_c = analyzer.context(comp["reasoning"]) 349 | r_reason = 0.5 * r_q + 0.3 * r_l + 0.2 * r_c 350 | 351 | r_sent = p_teacher 352 | r_cal = reward_confidence_calibration(p_teacher, conf) 353 | r_dir = reward_directional(raw, comp["reasoning"], sent) 354 | 355 | total = (0.35 * r_sent) + (0.25 * r_reason) + (0.20 * r_cal) + (0.15 * r_dir) 356 | scores.append(float(gate * total)) 357 | return scores 358 | 359 | return reward_gate, reward_finance 360 | 361 | # --- Plot utility ------------------------------------------------------------- 362 | def plot_grpo_metrics(log_history: List[dict]) -> None: 363 | if not log_history: 364 | print("No GRPO log history to plot.") 365 | return 366 | if SEABORN_OK: 367 | plt.style.use("seaborn-v0_8") 368 | sns.set_palette("pastel") 369 | 370 | steps, losses, rewards, kls = [], [], [], [] 371 | for log in log_history: 372 | if "step" in log and "loss" in log: 373 | steps.append(log["step"]) 374 | losses.append(log["loss"]) 375 | rewards.append(log.get("reward", None)) 376 | kls.append(log.get("kl", None)) 377 | 378 | plt.figure(figsize=(11, 3)) 379 | plt.subplot(1, 3, 1); plt.plot(steps, losses, marker="x"); plt.title("Policy Loss"); plt.grid(alpha=0.3) 380 | plt.subplot(1, 3, 2); plt.plot(steps, rewards, marker="x"); plt.title("Total Reward"); plt.grid(alpha=0.3) 381 | plt.subplot(1, 3, 3); plt.plot(steps, kls, marker="x"); plt.title("KL Penalty"); plt.grid(alpha=0.3) 382 | plt.tight_layout(); plt.show() 383 | 384 | # --- Main pipeline ------------------------------------------------------------ 385 | def main(args): 386 | cfg = FinancialConfig( 387 | model_name=args.base_model or "unsloth/gemma-3-270m-it", 388 | max_seq_length=args.max_prompt_length or 512, 389 | sft_epochs=args.sft_epochs, 390 | sft_batch_size=args.sft_batch, 391 | sft_grad_accum=args.sft_grad_accum, 392 | sft_lr=args.sft_lr, 393 | sft_warmup=args.sft_warmup, 394 | sft_weight_decay=args.sft_weight_decay, 395 | grpo_epochs=args.grpo_epochs, 396 | grpo_batch_size=args.grpo_batch, 397 | grpo_grad_accum=args.grpo_grad_accum, 398 | grpo_lr=args.grpo_lr, 399 | grpo_warmup=args.grpo_warmup, 400 | grpo_weight_decay=args.grpo_weight_decay, 401 | num_generations=args.num_generations, 402 | max_completion_length=args.max_completion_length, 403 | max_prompt_length=args.max_prompt_length, 404 | beta=args.beta, 405 | temperature=args.temperature, 406 | top_p=args.top_p, 407 | lora_rank=args.lora_rank, 408 | lora_alpha=args.lora_alpha, 409 | data_mode=args.data_mode, 410 | max_real_examples=args.max_real_examples, 411 | min_total_examples=args.min_total_examples, 412 | ) 413 | 414 | print("🚀 Financial Thinking Pipeline — SFT + GRPO") 415 | print(f"Base model: {cfg.model_name}") 416 | print(f"Data mode: {cfg.data_mode}") 417 | 418 | # Load model/tokenizer via Unsloth (4-bit optional) 419 | model, tokenizer = FastLanguageModel.from_pretrained( 420 | model_name=cfg.model_name, 421 | max_seq_length=cfg.max_seq_length, 422 | load_in_4bit=args.use_4bit, 423 | fast_inference=False, 424 | max_lora_rank=cfg.lora_rank, 425 | local_files_only=args.local_only, 426 | ) 427 | 428 | model = FastLanguageModel.get_peft_model( 429 | model, 430 | random_state=123, 431 | r=cfg.lora_rank, 432 | lora_alpha=cfg.lora_alpha, 433 | bias="none", 434 | use_gradient_checkpointing="unsloth", 435 | target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], 436 | ) 437 | 
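    # --- Optional sanity sketch (an addition, not part of the original pipeline):
    # exercise the format gate and component extractor on a hand-written completion
    # that follows the output contract, before spending GPU time on training.
    # Uses only helpers defined above; safe to delete.
    _demo_completion = (
        f"{REASONING_START} Revenue grew but margins compressed; guidance is cautious, so signals are mixed. {REASONING_END}\n"
        f"{SENTIMENT_START} neutral {SENTIMENT_END}\n"
        f"{CONFIDENCE_START} 0.6 {CONFIDENCE_END}"
    )
    assert reward_format_gate(_demo_completion) == 1.0, "contract example should pass the gate"
    print(f"[CHECK] extracted components: {extract_components(_demo_completion)}")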
438 | # --------------------- SFT --------------------- 439 | print("\n[Phase 1] Supervised Fine-Tuning (SFT)") 440 | if args.train_jsonl: 441 | raw_ds = load_dataset("json", data_files=args.train_jsonl, split="train") 442 | # Expect fields: text, reasoning, sentiment, confidence 443 | if not all(k in raw_ds.column_names for k in ["text", "reasoning", "sentiment", "confidence"]): 444 | raise ValueError("Custom SFT JSONL must contain fields: text, reasoning, sentiment, confidence") 445 | base_ds = raw_ds 446 | else: 447 | base_ds = build_dataset(cfg, data_mode=cfg.data_mode) 448 | 449 | sft_ds = base_ds.map(lambda ex: format_sft(ex, tokenizer), remove_columns=base_ds.column_names) 450 | sft_ds = sft_ds.filter(lambda ex: ex["token_len"] <= cfg.max_seq_length) 451 | # cap if desired (keep all by default) 452 | if args.sft_limit > 0: 453 | sft_ds = sft_ds.select(range(min(args.sft_limit, len(sft_ds)))) 454 | 455 | sft_args = SFTConfig( 456 | output_dir=os.path.join(args.output_dir, "sft"), 457 | seed=123, 458 | do_train=True, 459 | num_train_epochs=cfg.sft_epochs, 460 | per_device_train_batch_size=cfg.sft_batch_size, 461 | gradient_accumulation_steps=cfg.sft_grad_accum, 462 | learning_rate=cfg.sft_lr, 463 | lr_scheduler_type="linear", 464 | warmup_ratio=cfg.sft_warmup, 465 | weight_decay=cfg.sft_weight_decay, 466 | logging_strategy="steps", 467 | logging_steps=args.logging_steps, 468 | report_to="none", 469 | dataset_num_proc=1, 470 | ) 471 | sft_trainer = SFTTrainer(model=model, args=sft_args, train_dataset=sft_ds, tokenizer=tokenizer) 472 | sft_trainer.train() 473 | 474 | del sft_trainer, sft_ds 475 | gc.collect() 476 | if torch.cuda.is_available(): 477 | torch.cuda.empty_cache() 478 | 479 | # --------------------- GRPO --------------------- 480 | print("\n[Phase 2] GRPO (RL with multi-level rewards)") 481 | if args.eval_jsonl: 482 | raw_grpo = load_dataset("json", data_files=args.eval_jsonl, split="train") 483 | if "text" not in raw_grpo.column_names: 484 | raise ValueError("Custom GRPO JSONL must contain 'text' field.") 485 | grpo_src = raw_grpo 486 | else: 487 | grpo_src = build_dataset(cfg, data_mode=cfg.data_mode) 488 | 489 | grpo_ds = grpo_src.map(lambda ex: format_grpo(ex, tokenizer), remove_columns=grpo_src.column_names) 490 | if args.grpo_limit > 0: 491 | grpo_ds = grpo_ds.select(range(min(args.grpo_limit, len(grpo_ds)))) 492 | 493 | analyzer = FinancialReasoningAnalyzer() 494 | teacher = FinBERTTeacher() 495 | r_gate, r_fin = make_rewards(analyzer, teacher) 496 | 497 | grpo_args = GRPOConfig( 498 | seed=123, 499 | do_train=True, 500 | num_train_epochs=cfg.grpo_epochs, 501 | per_device_train_batch_size=cfg.grpo_batch_size, 502 | gradient_accumulation_steps=cfg.grpo_grad_accum, 503 | learning_rate=cfg.grpo_lr, 504 | lr_scheduler_type="linear", 505 | warmup_ratio=cfg.grpo_warmup, 506 | weight_decay=cfg.grpo_weight_decay, 507 | num_generations=cfg.num_generations, 508 | max_prompt_length=cfg.max_prompt_length, 509 | max_completion_length=cfg.max_completion_length, 510 | logging_strategy="steps", 511 | logging_steps=args.logging_steps, 512 | report_to="none", 513 | output_dir=args.output_dir, 514 | overwrite_output_dir=True, 515 | save_strategy="epoch", 516 | beta=cfg.beta, 517 | temperature=cfg.temperature, 518 | top_p=cfg.top_p, 519 | ) 520 | try: 521 | grpo_trainer = GRPOTrainer( 522 | model=model, 523 | processing_class=tokenizer, 524 | train_dataset=grpo_ds, 525 | args=grpo_args, 526 | reward_funcs=[r_gate, r_fin], 527 | ) 528 | except TypeError: 529 | grpo_trainer = GRPOTrainer( 
530 |             model=model,
531 |             tokenizer=tokenizer,
532 |             train_dataset=grpo_ds,
533 |             args=grpo_args,
534 |             reward_funcs=[r_gate, r_fin],
535 |         )
536 |     grpo_trainer.train()
537 | 
538 |     if torch.cuda.is_available():
539 |         used_mem = round(torch.cuda.max_memory_reserved() / 1024**3, 2)
540 |         print(f"VRAM peak reserved: {used_mem} GB")
541 | 
542 |     plot_grpo_metrics(grpo_trainer.state.log_history)
543 | 
544 |     # --------------------- Quick sanity inference ---------------------
545 |     print("\n[Sanity Check] Generation samples")
546 |     model.eval()
547 |     device = "cuda" if torch.cuda.is_available() else "cpu"
548 |     samples = [
549 |         "Energy company reports 30% production increase but faces environmental lawsuit",
550 |         "Software firm announces major acquisition while reporting 5% decline in quarterly revenue",
551 |         "Bank reports record profits but warns of potential regulatory changes affecting lending",
552 |     ]
553 |     for s in samples:
554 |         messages = [
555 |             {"role": "system", "content": SYSTEM_FINANCIAL_GRPO},
556 |             {"role": "user", "content": f"Analyze the sentiment of this financial news:\n{s}"},
557 |         ]
558 |         text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
559 |         toks = tokenizer(text, return_tensors="pt")
560 |         toks = {k: v.to(device) for k, v in toks.items()}
561 |         streamer = TextStreamer(tokenizer, skip_prompt=False)  # streams prompt and completion to stdout
562 |         _ = model.generate(
563 |             **toks,
564 |             temperature=cfg.temperature,
565 |             top_p=cfg.top_p,
566 |             max_new_tokens=cfg.max_completion_length,
567 |             streamer=streamer,
568 |             do_sample=True,
569 |         )
570 |         print("\n" + "=" * 60)
571 | 
572 |     # Save final
573 |     os.makedirs(args.output_dir, exist_ok=True)
574 |     model.save_pretrained(os.path.join(args.output_dir, "final_model"))
575 |     tokenizer.save_pretrained(os.path.join(args.output_dir, "final_model"))
576 |     print(f"\n✔ Saved final model to {os.path.join(args.output_dir, 'final_model')}")
577 | 
578 | 
579 | if __name__ == "__main__":
580 |     ap = argparse.ArgumentParser(description="Financial Thinking Model: SFT + GRPO on Gemma 3 270M")
581 | 
582 |     # Model & IO
583 |     ap.add_argument("--base-model", type=str, default="unsloth/gemma-3-270m-it")
584 |     ap.add_argument("--output-dir", type=str, default="financial_reasoning_improved-outputs")
585 |     ap.add_argument("--local-only", action="store_true")
586 |     ap.add_argument("--use-4bit", action=argparse.BooleanOptionalAction, default=True)  # on by default; disable with --no-use-4bit (Python 3.9+)
587 | 
588 |     # Data control
589 |     ap.add_argument("--data-mode", choices=["mixed", "real", "synthetic"], default="mixed",
590 |                     help="Use Financial PhraseBank (real) and/or synthetic fallbacks.")
591 |     ap.add_argument("--max-real-examples", type=int, default=200)
592 |     ap.add_argument("--min-total-examples", type=int, default=20)
593 |     ap.add_argument("--train-jsonl", type=str, default=None,
594 |                     help="Custom SFT JSONL with fields: text, reasoning, sentiment, confidence")
595 |     ap.add_argument("--eval-jsonl", type=str, default=None,
596 |                     help="Custom GRPO JSONL with field: text")
597 |     ap.add_argument("--sft-limit", type=int, default=0, help="Limit SFT examples (0 = all)")
598 |     ap.add_argument("--grpo-limit", type=int, default=0, help="Limit GRPO examples (0 = all)")
599 | 
600 |     # SFT knobs
601 |     ap.add_argument("--sft-epochs", type=int, default=3)
602 |     ap.add_argument("--sft-batch", type=int, default=12)
603 |     ap.add_argument("--sft-grad-accum", type=int, default=2)
604 |     ap.add_argument("--sft-lr", type=float, default=1e-4)
605 |     ap.add_argument("--sft-warmup", type=float, default=0.1)
606 | 
ap.add_argument("--sft-weight-decay", type=float, default=0.01) 607 | 608 | # GRPO knobs 609 | ap.add_argument("--grpo-epochs", type=float, default=4.0) 610 | ap.add_argument("--grpo-batch", type=int, default=12) 611 | ap.add_argument("--grpo-grad-accum", type=int, default=4) 612 | ap.add_argument("--grpo-lr", type=float, default=1e-5) 613 | ap.add_argument("--grpo-warmup", type=float, default=0.1) 614 | ap.add_argument("--grpo-weight-decay", type=float, default=0.01) 615 | ap.add_argument("--num-generations", type=int, default=6) 616 | ap.add_argument("--max-completion-length", type=int, default=512) 617 | ap.add_argument("--max-prompt-length", type=int, default=1024) 618 | ap.add_argument("--beta", type=float, default=0.15) 619 | ap.add_argument("--temperature", type=float, default=0.7) 620 | ap.add_argument("--top-p", type=float, default=0.9) 621 | 622 | # LoRA parameters 623 | ap.add_argument("--lora-rank", type=int, default=32, help="LoRA rank for parameter-efficient fine-tuning") 624 | ap.add_argument("--lora-alpha", type=int, default=64, help="LoRA alpha scaling factor") 625 | 626 | # Logging 627 | ap.add_argument("--logging-steps", type=int, default=10) 628 | 629 | args = ap.parse_args() 630 | main(args) 631 | --------------------------------------------------------------------------------