├── .gitignore ├── Dockerfile.txt ├── README.md ├── configs └── rag_config.json ├── create_bucket.sh ├── install_requirements.bash ├── manage_resources.py ├── requirements.txt ├── run_training.sh ├── save_to_hf.bash ├── save_to_hf.py ├── setup_directories.sh ├── setup_gcp.sh ├── setup_vertex_ai.sh ├── test_model.py ├── train_pipeline.py ├── utils ├── ___init__.py ├── a3c_training.py ├── advantage_weighted_regression.py ├── adversarial_irl.py ├── bayesian_rl.py ├── contrastive_rl.py ├── cot_trainer.py ├── curiosity_rl.py ├── curriculum_rl.py ├── data_loader.py ├── ddpg_training.py ├── diffusion_rl.py ├── distributional_rl.py ├── dpo_trainer.py ├── dqn_training.py ├── ensemble_rl.py ├── graph_based_rl.py ├── her_rl.py ├── hierarchical_rl.py ├── hybrid_model_rl.py ├── information_exploration.py ├── intrinsic_motivation.py ├── irl_training.py ├── knowledge_grounded_rl.py ├── meta_rl.py ├── meta_rl_task_decomposition.py ├── model_manager.py ├── multi_modal_rl.py ├── multi_objective_rl.py ├── off_policy_correction.py ├── ppo_trainer.py ├── q_learning.py ├── rag_training.py ├── rainbow_dqn_training.py ├── reverse_curriculum.py ├── reward_model.py ├── sac_training.py ├── self_supervised_rl.py ├── td3_training.py ├── transformer_xl_rl.py ├── trpo.py ├── tsallis_entropy_rl.py └── world_models.py └── ้how_to_trian.bash /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # UV 98 | # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. 
99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | #uv.lock 102 | 103 | # poetry 104 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 105 | # This is especially recommended for binary packages to ensure reproducibility, and is more 106 | # commonly ignored for libraries. 107 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 108 | #poetry.lock 109 | 110 | # pdm 111 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 112 | #pdm.lock 113 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 114 | # in version control. 115 | # https://pdm.fming.dev/latest/usage/project/#working-with-version-control 116 | .pdm.toml 117 | .pdm-python 118 | .pdm-build/ 119 | 120 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 121 | __pypackages__/ 122 | 123 | # Celery stuff 124 | celerybeat-schedule 125 | celerybeat.pid 126 | 127 | # SageMath parsed files 128 | *.sage.py 129 | 130 | # Environments 131 | .env 132 | .venv 133 | env/ 134 | venv/ 135 | ENV/ 136 | env.bak/ 137 | venv.bak/ 138 | 139 | # Spyder project settings 140 | .spyderproject 141 | .spyproject 142 | 143 | # Rope project settings 144 | .ropeproject 145 | 146 | # mkdocs documentation 147 | /site 148 | 149 | # mypy 150 | .mypy_cache/ 151 | .dmypy.json 152 | dmypy.json 153 | 154 | # Pyre type checker 155 | .pyre/ 156 | 157 | # pytype static type analyzer 158 | .pytype/ 159 | 160 | # Cython debug symbols 161 | cython_debug/ 162 | 163 | # PyCharm 164 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 165 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 166 | # and can be added to the global gitignore or merged into this file. For a more nuclear 167 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
168 | #.idea/ 169 | 170 | # Ruff stuff: 171 | .ruff_cache/ 172 | 173 | # PyPI configuration file 174 | .pypirc 175 |
--------------------------------------------------------------------------------
/Dockerfile.txt:
--------------------------------------------------------------------------------
FROM pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime

WORKDIR /app

# Install the required libraries
RUN pip install --no-cache-dir transformers==4.30.2 \
    datasets==2.13.1 \
    trl==0.4.7 \
    accelerate==0.21.0 \
    safetensors==0.3.1 \
    gym==0.26.2 \
    stable-baselines3==2.0.0 \
    google-cloud-storage==2.10.0 \
    wandb==0.15.5 \
    nltk==3.8.1

# Install the punkt tokenizer data for NLTK
RUN python -c "import nltk; nltk.download('punkt')"

# Copy the training scripts
COPY train_pipeline.py /app/
COPY utils/ /app/utils/

# Entrypoint that starts model training
ENTRYPOINT ["python", "-u", "train_pipeline.py"]
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# 🤖 Advanced Reinforcement Learning Training Framework

## 📝 Project Overview
An end-to-end system for training and developing reinforcement learning models. It supports a wide range of algorithms and provides tooling for training models efficiently.

## ✨ Key Features
- Supports a wide variety of RL algorithms:
  - Policy Gradient: PPO, TRPO, A3C
  - Value-Based: DQN, Rainbow DQN
  - Actor-Critic: DDPG, TD3, SAC
  - Meta-RL and Hierarchical RL
  - Self-Supervised RL
  - Curiosity-driven RL
- Automatic model management
- Multi-modal training support
- Built-in Curriculum Learning
- Distributional RL support
- Knowledge-grounded RL
- Integration with Diffusion Models
- World Models for model-based learning

## 📋 System Requirements
- Python 3.8+
- CUDA-compatible GPU (recommended)
- Required tools and libraries (installed via requirements.txt)

## 🚀 Installation

1. Clone the project:
```bash
git clone
cd normaldayinthailand
```

2. Install dependencies:
```bash
bash install_requirements.bash
```

3. Set up the required directories:
```bash
bash setup_directories.sh
```

## 💻 Usage

### Starting model training
```bash
bash how_to_train.bash
```

### Saving the model to Hugging Face
```bash
bash save_to_hf.bash
```
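Note that `save_to_hf.bash` only installs the upload dependencies (`transformers`, `huggingface_hub`, `safetensors`, `torch`); the actual export and upload are done by `save_to_hf.py`, whose model name, output directory, and target repository are hard-coded in its `__main__` block. A minimal sketch of the full flow, assuming a Hugging Face token has already been stored (e.g. via `huggingface-cli login`) so that `HfFolder.get_token()` can find it:

```bash
# Install the upload dependencies, then run the upload script.
bash save_to_hf.bash
# Edit model_name / save_dir / hf_repo_name in save_to_hf.py first if you want
# to push a different model or repository.
python save_to_hf.py
```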
## 📁 Directory Structure

```
.
├── configs/              # Configuration files
├── utils/                # Supporting modules and functions
│   ├── ppo_trainer.py    # PPO training
│   ├── dqn_training.py   # DQN training
│   ├── a3c_training.py   # A3C training
│   └── ...               # Other algorithms
├── train_pipeline.py     # Main training pipeline
└── test_model.py         # Script for testing the model
```

## 🛠️ Utils Module Details

### Policy-Based Algorithms
- **ppo_trainer.py**: Training with Proximal Policy Optimization for stable policy learning
- **trpo.py**: Trust Region Policy Optimization for policy updates constrained to a trust region
- **a3c_training.py**: Asynchronous Advantage Actor-Critic for parallel learning

### Value-Based Methods
- **dqn_training.py**: Deep Q-Network for learning value functions
- **rainbow_dqn_training.py**: Rainbow DQN, combining several DQN improvements
- **distributional_rl.py**: Distributional RL for estimating the return distribution

### Actor-Critic Methods
- **ddpg_training.py**: Deep Deterministic Policy Gradient for continuous action spaces
- **td3_training.py**: Twin Delayed DDPG for more stable learning
- **sac_training.py**: Soft Actor-Critic for maximum-entropy learning

### Advanced RL Techniques
- **meta_rl.py**: Meta reinforcement learning for adapting to new tasks
- **hierarchical_rl.py**: Hierarchical learning for complex tasks
- **curriculum_rl.py**: Curriculum-based, staged learning
- **world_models.py**: Building learned models of the environment

### Exploration Strategies
- **curiosity_rl.py**: Curiosity-driven exploration
- **information_exploration.py**: Information-theoretic exploration
- **intrinsic_motivation.py**: Intrinsic motivation for exploration

### Imitation and Inverse RL
- **adversarial_irl.py**: Adversarial inverse RL
- **her_rl.py**: Hindsight Experience Replay

### Multi-Task and Meta-Learning
- **meta_rl_task_decomposition.py**: Task decomposition in Meta-RL
- **multi_objective_rl.py**: Multi-objective learning

### Advanced Model Architectures
- **transformer_xl_rl.py**: Transformer-XL for long-term dependencies
- **diffusion_rl.py**: Using diffusion models in RL
- **hybrid_model_rl.py**: Combining several model types

### Ensemble and Robust Methods
- **ensemble_rl.py**: Using multiple models to improve accuracy
- **bayesian_rl.py**: Bayesian reinforcement learning
- **off_policy_correction.py**: Off-policy correction

### Knowledge-Based Methods
- **knowledge_grounded_rl.py**: Grounding learning in background knowledge
- **rag_training.py**: Retrieval-Augmented Generation training
- **graph_based_rl.py**: Learning over graph structures

### Other Utilities
- **model_manager.py**: Model management and persistence, including versioning, loading/saving models, experiment tracking, and checkpoint handling
- **data_loader.py**: Data loading and management, supporting several data formats (episodes, transitions, demonstrations) with preprocessing and augmentation
- **reward_model.py**: Building and tuning reward functions, including learning reward functions from experts and inverse-RL-style fitting
- **contrastive_rl.py**: Contrastive learning for building effective state and action representations
- **self_supervised_rl.py**: Self-supervised learning to improve representations and policies
- **q_learning.py**: Basic Q-learning and its deep-learning extensions
- **tsallis_entropy_rl.py**: Using Tsallis entropy to control exploration and policy learning
- **advantage_weighted_regression.py**: Advantage-weighted regression for policy improvement
- **cot_trainer.py**: Chain-of-Thought trainer for step-by-step reasoning
- **dpo_trainer.py**: Direct Preference Optimization for learning from pairwise comparisons
- **irl_training.py**: Inverse reinforcement learning for recovering reward functions from demonstrations
- **multi_modal_rl.py**: Combining multiple modalities (image, text, audio) in learning

## ⚙️ Configuration
The main settings can be adjusted through the files in the `configs/` folder:
- `rag_config.json`: settings for Retrieval-Augmented Generation
- Others: additional config files can be added as needed
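For reference, the bundled `configs/rag_config.json` (shown in full further below) sets the base model, batch size, learning rate, warmup, DeepSpeed/ZeRO, and hyperparameter-search options. Assuming `train_pipeline.py` accepts the `--config` flag used in the training section below, the bundled file can be passed to the pipeline directly:

```bash
# Train with the bundled RAG configuration
# (training_config: batch_size=4, learning_rate=1e-5, epochs=3).
python train_pipeline.py --config configs/rag_config.json
```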
## 🎯 Training a Model

1. **Data preparation**
   - Prepare the data in a suitable format
   - Set the parameters in the config file

2. **Start training**
   ```bash
   python train_pipeline.py --config configs/your_config.json
   ```

3. **Monitoring**
   - Training progress is printed while the pipeline runs
   - Additional results are available in the log files

## 🌟 Special Features

### 🔄 Curriculum Learning
- Supports staged, curriculum-style learning
- Automatically adjusts task difficulty

### 🧠 Meta-RL
- Learns to adapt quickly to new tasks
- Supports automatic task decomposition

### 🎯 Multi-Objective RL
- Trains a model for several objectives at once
- Balances the trade-offs between objectives

## 🛠️ Running on the Cloud

### Google Cloud Platform
```bash
bash setup_gcp.sh
```

### Vertex AI
```bash
bash setup_vertex_ai.sh
```

## 📈 Testing and Evaluation
```bash
python test_model.py --model-path path/to/your/model
```
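`test_model.py` also accepts a Cloud Storage path (`gs://...`) for `--model-path`, in which case it downloads the checkpoint locally before loading it. Its remaining flags, defined in its argument parser, control the prompts and sampling; a possible invocation, with the bucket path as a placeholder:

```bash
# Evaluate a trained model with custom prompts and sampling settings.
python test_model.py \
  --model-path gs://your-bucket/output/your-run \
  --test-prompts "รีวิวร้านอาหาร: " \
  --temperature 0.7 \
  --max-length 100 \
  --num-return-sequences 2
```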
## 🔍 Troubleshooting Common Issues

1. **Memory problems**
   - Reduce the batch size
   - Use gradient accumulation

2. **CUDA problems**
   - Check the CUDA installation
   - Check version compatibility

## 📚 References and Resources
- [PPO algorithm](https://arxiv.org/abs/1707.06347)
- [Rainbow DQN](https://arxiv.org/abs/1710.02298)
- [Meta-RL](https://arxiv.org/abs/1611.05763)

## 🤝 Contributing
Pull requests and issue reports are welcome; please open an issue on the repository.

## 📄 License
This project is released under the MIT License.
--------------------------------------------------------------------------------
/configs/rag_config.json:
--------------------------------------------------------------------------------
{
    "model_config": {
        "base_model": "scb10x/llama3.2-typhoon2-t1-3b-research-preview",
        "model_type": "causal",
        "pretrained_models": {
            "encoder": "facebook/rag-token-base",
            "generator": "facebook/bart-large",
            "retriever": "facebook/dpr-ctx_encoder-multiset-base"
        }
    },
    "training_config": {
        "batch_size": 4,
        "epochs": 3,
        "learning_rate": 1e-5,
        "warmup_steps": 500,
        "weight_decay": 0.01,
        "gradient_accumulation_steps": 4,
        "max_grad_norm": 1.0,
        "fp16": true
    },
    "distributed_config": {
        "use_deepspeed": true,
        "zero_stage": 2,
        "distributed_port": 29500,
        "gradient_checkpointing": true
    },
    "optimizer_config": {
        "optimizer_type": "adamw",
        "scheduler_type": "cosine",
        "num_warmup_steps": 500,
        "num_training_steps": 5000
    },
    "hyperopt_config": {
        "n_trials": 5,
        "parameters": {
            "learning_rate": {
                "type": "float",
                "min": 1e-6,
                "max": 1e-4,
                "log": true
            },
            "batch_size": {
                "type": "categorical",
                "choices": [4, 8, 16, 32]
            },
            "warmup_steps": {
                "type": "int",
                "min": 100,
                "max": 1000
            }
        }
    }
}
--------------------------------------------------------------------------------
/create_bucket.sh:
--------------------------------------------------------------------------------
#!/bin/bash
# Create a bucket for storing data and models
BUCKET_NAME="gs://llm-training-bucket-$(date +%s)"
REGION="us-central1"  # Choose a region that has A100 GPUs

gcloud storage buckets create $BUCKET_NAME --location=$REGION

echo "Bucket $BUCKET_NAME created"
--------------------------------------------------------------------------------
/install_requirements.bash:
--------------------------------------------------------------------------------
pip install -r requirements.txt
--------------------------------------------------------------------------------
/manage_resources.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python3 2 | import argparse 3 | import logging 4 | from utils.model_manager import ModelManager 5 | 6 | logging.basicConfig( 7 | level=logging.INFO, 8 | format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' 9 | ) 10 | logger = logging.getLogger(__name__) 11 | 12 | def parse_args(): 13 | parser = argparse.ArgumentParser(description="Manage models and datasets from Hugging Face Hub") 14 | 15 | # Create subparsers for different commands 16 | subparsers = parser.add_subparsers(dest="command", help="Command to execute") 17 | 18 | # Download model command 19 | download_model_parser = subparsers.add_parser("download-model", help="Download a model from Hugging Face Hub") 20 | download_model_parser.add_argument("--name", required=True, 
help="Model name on Hugging Face Hub") 21 | download_model_parser.add_argument("--type", choices=["pretrained", "finetuned"], default="pretrained", 22 | help="Type of model to download") 23 | 24 | # Download dataset command 25 | download_dataset_parser = subparsers.add_parser("download-dataset", help="Download a dataset from Hugging Face Hub") 26 | download_dataset_parser.add_argument("--name", required=True, help="Dataset name on Hugging Face Hub") 27 | download_dataset_parser.add_argument("--subset", help="Specific subset/configuration of the dataset") 28 | 29 | # List available models command 30 | list_models_parser = subparsers.add_parser("list-models", help="List available models on Hugging Face Hub") 31 | list_models_parser.add_argument("--tags", nargs="*", help="Filter models by tags") 32 | 33 | # List available datasets command 34 | list_datasets_parser = subparsers.add_parser("list-datasets", help="List available datasets on Hugging Face Hub") 35 | list_datasets_parser.add_argument("--tags", nargs="*", help="Filter datasets by tags") 36 | 37 | # Get model info command 38 | model_info_parser = subparsers.add_parser("model-info", help="Get detailed information about a model") 39 | model_info_parser.add_argument("--name", required=True, help="Model name on Hugging Face Hub") 40 | 41 | # Get dataset info command 42 | dataset_info_parser = subparsers.add_parser("dataset-info", help="Get detailed information about a dataset") 43 | dataset_info_parser.add_argument("--name", required=True, help="Dataset name on Hugging Face Hub") 44 | 45 | return parser.parse_args() 46 | 47 | def main(): 48 | try: 49 | args = parse_args() 50 | manager = ModelManager() 51 | 52 | if args.command == "download-model": 53 | model_path = manager.download_model(args.name, args.type) 54 | logger.info(f"Model downloaded successfully to: {model_path}") 55 | 56 | elif args.command == "download-dataset": 57 | dataset_path = manager.download_dataset(args.name, args.subset) 58 | logger.info(f"Dataset downloaded successfully to: {dataset_path}") 59 | 60 | elif args.command == "list-models": 61 | models = manager.list_available_models(args.tags) 62 | logger.info("Available models:") 63 | for model in models: 64 | print(f"- {model}") 65 | 66 | elif args.command == "list-datasets": 67 | datasets = manager.list_available_datasets(args.tags) 68 | logger.info("Available datasets:") 69 | for dataset in datasets: 70 | print(f"- {dataset}") 71 | 72 | elif args.command == "model-info": 73 | info = manager.get_model_info(args.name) 74 | logger.info(f"Model information for {args.name}:") 75 | for key, value in info.items(): 76 | print(f"{key}: {value}") 77 | 78 | elif args.command == "dataset-info": 79 | info = manager.get_dataset_info(args.name) 80 | logger.info(f"Dataset information for {args.name}:") 81 | for key, value in info.items(): 82 | print(f"{key}: {value}") 83 | 84 | else: 85 | logger.error("No command specified. 
Use -h for help.") 86 | return 1 87 | 88 | return 0 89 | 90 | except Exception as e: 91 | logger.error(f"Error executing command: {str(e)}") 92 | return 1 93 | 94 | if __name__ == "__main__": 95 | exit(main()) -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | transformers==4.30.2 2 | datasets==2.13.1 3 | torch==2.0.1 4 | accelerate==0.21.0 5 | trl==0.4.7 6 | safetensors==0.3.1 7 | gym==0.26.2 8 | stable-baselines3==2.0.0 9 | google-cloud-storage==2.10.0 10 | wandb==0.15.5 11 | nltk==3.8.1 12 | torch_geometric 13 | -------------------------------------------------------------------------------- /run_training.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # รับพารามิเตอร์จาก command line 4 | REGION=${1:-"us-central1"} 5 | RUN_STAGE=${2:-"all"} # all, dpo, reward, irl, q_learning, sac, ppo 6 | BATCH_SIZE=${3:-4} 7 | EPOCHS=${4:-1} 8 | 9 | # Load project info 10 | PROJECT_ID=$(gcloud config get-value project) 11 | GCS_BUCKET_NAME="gs://${PROJECT_ID}-llm-training" 12 | IMAGE_URI="gcr.io/${PROJECT_ID}/llm-rl-training:v1" 13 | 14 | # Create timestamp for job name 15 | TIMESTAMP=$(date +%Y%m%d_%H%M%S) 16 | JOB_NAME="llm_training_${RUN_STAGE}_${TIMESTAMP}" 17 | 18 | # สร้างไฟล์ config สำหรับ Vertex AI Custom Job 19 | cat > vertex_job_config_${TIMESTAMP}.json << EOF 20 | { 21 | "displayName": "${JOB_NAME}", 22 | "jobSpec": { 23 | "workerPoolSpecs": [ 24 | { 25 | "machineSpec": { 26 | "machineType": "n1-standard-16", 27 | "acceleratorType": "NVIDIA_TESLA_A100", 28 | "acceleratorCount": 1 29 | }, 30 | "replicaCount": 1, 31 | "diskSpec": { 32 | "bootDiskType": "pd-ssd", 33 | "bootDiskSizeGb": 100 34 | }, 35 | "containerSpec": { 36 | "imageUri": "${IMAGE_URI}", 37 | "args": [ 38 | "--base-model", "scb10x/llama3.2-typhoon2-t1-3b-research-preview", 39 | "--batch-size", "${BATCH_SIZE}", 40 | "--gcs-output-path", "${GCS_BUCKET_NAME}/output/${TIMESTAMP}", 41 | "--epochs", "${EPOCHS}", 42 | "--run-stage", "${RUN_STAGE}", 43 | "--wandb-project", "llm-rl-training" 44 | ] 45 | } 46 | } 47 | ] 48 | } 49 | } 50 | EOF 51 | 52 | echo "Creating Vertex AI custom job..." 53 | gcloud ai custom-jobs create --region=${REGION} --config=vertex_job_config_${TIMESTAMP}.json 54 | 55 | echo "Job submitted! 
Monitor at:" 56 | echo "https://console.cloud.google.com/vertex-ai/training/custom-jobs?project=${PROJECT_ID}" 57 | echo "" 58 | echo "After completion, find outputs at: ${GCS_BUCKET_NAME}/output/${TIMESTAMP}" -------------------------------------------------------------------------------- /save_to_hf.bash: -------------------------------------------------------------------------------- 1 | pip install transformers huggingface_hub safetensors torch -------------------------------------------------------------------------------- /save_to_hf.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from transformers import AutoModelForCausalLM, AutoTokenizer 4 | from huggingface_hub import HfApi, HfFolder 5 | from safetensors.torch import save_file 6 | 7 | def save_model_to_hf(model_name, save_dir, hf_repo_name, hf_token): 8 | # โหลดโมเดลและ tokenizer 9 | tokenizer = AutoTokenizer.from_pretrained(model_name) 10 | model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="auto") 11 | 12 | # สร้างโฟลเดอร์สำหรับบันทึกโมเดล 13 | os.makedirs(save_dir, exist_ok=True) 14 | 15 | # บันทึกโมเดลและ tokenizer ในรูปแบบ .safetensors 16 | state_dict = model.state_dict() 17 | save_file(state_dict, os.path.join(save_dir, "model.safetensors")) 18 | tokenizer.save_pretrained(save_dir) 19 | 20 | # อัพโหลดไปยัง Hugging Face 21 | api = HfApi() 22 | api.upload_folder( 23 | folder_path=save_dir, 24 | path_in_repo="", 25 | repo_id=hf_repo_name, 26 | token=hf_token 27 | ) 28 | 29 | if __name__ == "__main__": 30 | # กำหนดค่าพารามิเตอร์ 31 | model_name = "scb10x/llama3.2-typhoon2-t1-3b-research-preview" 32 | save_dir = "./saved_model" 33 | hf_repo_name = "JonusNattapong/llama3.2-typhoon2-t1-3b" 34 | hf_token = HfFolder.get_token() 35 | 36 | # เรียกใช้งานฟังก์ชันเพื่อบันทึกโมเดลไปยัง Hugging Face 37 | save_model_to_hf(model_name, save_dir, hf_repo_name, hf_token) -------------------------------------------------------------------------------- /setup_directories.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Create main directories 4 | mkdir -p models/pretrained 5 | mkdir -p models/finetuned 6 | mkdir -p datasets/raw 7 | mkdir -p datasets/processed 8 | mkdir -p configs/training 9 | mkdir -p configs/model 10 | mkdir -p logs/training 11 | mkdir -p logs/evaluation 12 | mkdir -p checkpoints 13 | 14 | # Add .gitkeep to keep empty directories in git 15 | touch models/pretrained/.gitkeep 16 | touch models/finetuned/.gitkeep 17 | touch datasets/raw/.gitkeep 18 | touch datasets/processed/.gitkeep 19 | touch configs/training/.gitkeep 20 | touch configs/model/.gitkeep 21 | touch logs/training/.gitkeep 22 | touch logs/evaluation/.gitkeep 23 | touch checkpoints/.gitkeep 24 | 25 | echo "Directory structure created successfully!" -------------------------------------------------------------------------------- /setup_gcp.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | # ติดตั้ง Google Cloud SDK ถ้ายังไม่มี 3 | if ! command -v gcloud &> /dev/null 4 | then 5 | echo "กำลังติดตั้ง Google Cloud SDK..." 
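    # The next three commands fetch the official installer script, re-exec the
    # login shell so the freshly installed gcloud is on PATH, and then run the
    # interactive `gcloud init` setup.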
6 | curl https://sdk.cloud.google.com | bash 7 | exec -l $SHELL 8 | gcloud init 9 | fi 10 | 11 | # สร้าง project หรือใช้ project ที่มีอยู่ 12 | PROJECT_ID="llm-training-project" 13 | gcloud projects create $PROJECT_ID --name="LLM Training Project" 14 | 15 | # ตั้งค่า project เป็น default 16 | gcloud config set project $PROJECT_ID 17 | 18 | # เปิดใช้งาน Vertex AI API, Compute Engine API และ IAM API 19 | gcloud services enable compute.googleapis.com 20 | gcloud services enable aiplatform.googleapis.com 21 | gcloud services enable iam.googleapis.com 22 | 23 | # สร้าง service account สำหรับ Vertex AI 24 | gcloud iam service-accounts create vertex-ai-training \ 25 | --display-name="Vertex AI Training Service Account" 26 | 27 | # ให้สิทธิ์ที่จำเป็น 28 | gcloud projects add-iam-policy-binding $PROJECT_ID \ 29 | --member="serviceAccount:vertex-ai-training@$PROJECT_ID.iam.gserviceaccount.com" \ 30 | --role="roles/aiplatform.user" 31 | 32 | gcloud projects add-iam-policy-binding $PROJECT_ID \ 33 | --member="serviceAccount:vertex-ai-training@$PROJECT_ID.iam.gserviceaccount.com" \ 34 | --role="roles/storage.objectAdmin" -------------------------------------------------------------------------------- /setup_vertex_ai.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # คำสั่งสำหรับการสร้าง custom container และอัพโหลดไปยัง Container Registry 4 | PROJECT_ID=$(gcloud config get-value project) 5 | REGION="us-central1" # เลือก region ที่มี A100 GPU 6 | IMAGE_NAME="llm-rl-training" 7 | IMAGE_TAG="v1" 8 | IMAGE_URI="gcr.io/${PROJECT_ID}/${IMAGE_NAME}:${IMAGE_TAG}" 9 | 10 | # Create a temporary directory for our Dockerfile 11 | TMP_DIR=$(mktemp -d) 12 | cp Dockerfile ${TMP_DIR}/ 13 | mkdir -p ${TMP_DIR}/utils 14 | cp train_pipeline.py ${TMP_DIR}/ 15 | cp utils/*.py ${TMP_DIR}/utils/ 16 | 17 | # Build and push container image 18 | echo "Building container image..." 19 | cd ${TMP_DIR} 20 | gcloud builds submit --tag ${IMAGE_URI} . 
21 | 22 | # Clean up 23 | cd - 24 | rm -rf ${TMP_DIR} 25 | 26 | echo "Container image built and pushed to ${IMAGE_URI}" 27 | 28 | # สร้าง Cloud Storage bucket สำหรับเก็บโมเดลและผลลัพธ์ 29 | GCS_BUCKET_NAME="gs://${PROJECT_ID}-llm-training" 30 | gcloud storage buckets create ${GCS_BUCKET_NAME} --location=${REGION} --uniform-bucket-level-access 31 | 32 | echo "Created Cloud Storage bucket: ${GCS_BUCKET_NAME}" 33 | echo "Image URI: ${IMAGE_URI}" 34 | echo "GCS Output Path: ${GCS_BUCKET_NAME}/output" 35 | 36 | # สร้างไฟล์ config สำหรับ Vertex AI Custom Job 37 | cat > vertex_job_config.json << EOF 38 | { 39 | "displayName": "LLM Training with RL techniques", 40 | "jobSpec": { 41 | "workerPoolSpecs": [ 42 | { 43 | "machineSpec": { 44 | "machineType": "n1-standard-16", 45 | "acceleratorType": "NVIDIA_TESLA_A100", 46 | "acceleratorCount": 1 47 | }, 48 | "replicaCount": 1, 49 | "diskSpec": { 50 | "bootDiskType": "pd-ssd", 51 | "bootDiskSizeGb": 100 52 | }, 53 | "containerSpec": { 54 | "imageUri": "${IMAGE_URI}", 55 | "args": [ 56 | "--base-model", "scb10x/llama3.2-typhoon2-t1-3b-research-preview", 57 | "--batch-size", "4", 58 | "--gcs-output-path", "${GCS_BUCKET_NAME}/output", 59 | "--epochs", "1", 60 | "--wandb-project", "llm-rl-training", 61 | "--max-samples", "5000" 62 | ] 63 | } 64 | } 65 | ] 66 | } 67 | } 68 | EOF 69 | 70 | echo "Created Vertex AI job config: vertex_job_config.json" 71 | echo "To start training job run:" 72 | echo "gcloud ai custom-jobs create --region=${REGION} --display-name=\"LLM Training Job\" --config=vertex_job_config.json" -------------------------------------------------------------------------------- /test_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import argparse 4 | from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline 5 | from google.cloud import storage 6 | 7 | def download_from_gcs(gcs_path, local_dir): 8 | """ 9 | ดาวน์โหลดโมเดลจาก GCS มายังเครื่องเฉพาะกาล 10 | """ 11 | if not gcs_path.startswith("gs://"): 12 | return local_dir 13 | 14 | os.makedirs(local_dir, exist_ok=True) 15 | 16 | storage_client = storage.Client() 17 | bucket_name = gcs_path.replace("gs://", "").split("/")[0] 18 | prefix = "/".join(gcs_path.replace("gs://", "").split("/")[1:]) 19 | bucket = storage_client.bucket(bucket_name) 20 | 21 | blobs = bucket.list_blobs(prefix=prefix) 22 | for blob in blobs: 23 | name = blob.name 24 | rel_path = name[len(prefix):] if name.startswith(prefix) else name 25 | if rel_path.startswith("/"): 26 | rel_path = rel_path[1:] 27 | 28 | if not rel_path: 29 | continue 30 | 31 | target_path = os.path.join(local_dir, rel_path) 32 | os.makedirs(os.path.dirname(target_path), exist_ok=True) 33 | blob.download_to_filename(target_path) 34 | print(f"Downloaded {name} to {target_path}") 35 | 36 | return local_dir 37 | 38 | def main(): 39 | parser = argparse.ArgumentParser(description="Test trained LLM model") 40 | parser.add_argument("--model-path", type=str, required=True, 41 | help="Path to model directory, can be GCS path or local") 42 | parser.add_argument("--test-prompts", type=str, nargs="+", 43 | default=["รีวิวร้านอาหาร: ", "รีวิวร้านอาหาร: อยากกินอะไรอร่อยๆ"]) 44 | parser.add_argument("--temperature", type=float, default=0.7) 45 | parser.add_argument("--max-length", type=int, default=100) 46 | parser.add_argument("--num-return-sequences", type=int, default=2) 47 | args = parser.parse_args() 48 | 49 | # ดาวน์โหลดโมเดลถ้าเป็น GCS path 50 | local_model_dir = 
"./downloaded_model" 51 | if args.model_path.startswith("gs://"): 52 | print(f"Downloading model from {args.model_path}...") 53 | download_from_gcs(args.model_path, local_model_dir) 54 | model_path = local_model_dir 55 | else: 56 | model_path = args.model_path 57 | 58 | # โหลดโมเดลและ tokenizer 59 | print(f"Loading model from {model_path}...") 60 | tokenizer = AutoTokenizer.from_pretrained(model_path) 61 | model = AutoModelForCausalLM.from_pretrained( 62 | model_path, 63 | torch_dtype=torch.bfloat16, 64 | device_map="auto" 65 | ) 66 | 67 | # สร้าง text generation pipeline 68 | generator = pipeline( 69 | "text-generation", 70 | model=model, 71 | tokenizer=tokenizer, 72 | device=0 if torch.cuda.is_available() else -1 73 | ) 74 | 75 | # ทดสอบโมเดล 76 | print("\n===== ผลการทดสอบโมเดล =====\n") 77 | for prompt in args.test_prompts: 78 | print(f"\nPrompt: {prompt}") 79 | outputs = generator( 80 | prompt, 81 | max_length=args.max_length, 82 | num_return_sequences=args.num_return_sequences, 83 | temperature=args.temperature, 84 | do_sample=True, 85 | ) 86 | 87 | for i, output in enumerate(outputs): 88 | generated_text = output['generated_text'] 89 | print(f"Output {i+1}: {generated_text}") 90 | 91 | print("\n===== ทดสอบเสร็จสิ้น =====") 92 | 93 | if __name__ == "__main__": 94 | main() -------------------------------------------------------------------------------- /utils/___init__.py: -------------------------------------------------------------------------------- 1 | # Package initialization file -------------------------------------------------------------------------------- /utils/a3c_training.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import numpy as np 6 | import logging 7 | from transformers import AutoModelForCausalLM, AutoTokenizer 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | class ActorCriticNetwork(nn.Module): 12 | def __init__(self, base_model_path, vocab_size): 13 | super(ActorCriticNetwork, self).__init__() 14 | self.base_model = AutoModelForCausalLM.from_pretrained(base_model_path) 15 | 16 | # Actor head (policy network) 17 | self.actor = nn.Sequential( 18 | nn.Linear(self.base_model.config.hidden_size, self.base_model.config.hidden_size // 2), 19 | nn.ReLU(), 20 | nn.Linear(self.base_model.config.hidden_size // 2, vocab_size) 21 | ) 22 | 23 | # Critic head (value network) 24 | self.critic = nn.Sequential( 25 | nn.Linear(self.base_model.config.hidden_size, self.base_model.config.hidden_size // 2), 26 | nn.ReLU(), 27 | nn.Linear(self.base_model.config.hidden_size // 2, 1) 28 | ) 29 | 30 | def forward(self, input_ids, attention_mask=None): 31 | outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True) 32 | hidden_states = outputs.hidden_states[-1] 33 | last_token_hidden = hidden_states[:, -1, :] 34 | 35 | # Actor outputs probability distribution over vocabulary 36 | actor_output = self.actor(last_token_hidden) 37 | policy_logits = F.log_softmax(actor_output, dim=-1) 38 | 39 | # Critic outputs value estimate 40 | value = self.critic(last_token_hidden) 41 | 42 | return policy_logits, value 43 | 44 | def train_a3c( 45 | model_path, 46 | dataset, 47 | output_dir, 48 | reward_model_path=None, 49 | batch_size=4, 50 | epochs=1, 51 | lr=1e-5, 52 | gamma=0.99, 53 | entropy_weight=0.01, 54 | max_grad_norm=0.5 55 | ): 56 | """ 57 | Train an A3C (Asynchronous Advantage Actor-Critic) model for language 
generation 58 | 59 | Args: 60 | model_path: Path to the pre-trained model 61 | dataset: Dataset for training 62 | output_dir: Directory to save the model 63 | reward_model_path: Path to a pre-trained reward model (optional) 64 | batch_size: Batch size for training 65 | epochs: Number of epochs for training 66 | lr: Learning rate 67 | gamma: Discount factor 68 | entropy_weight: Weight for entropy regularization 69 | max_grad_norm: Maximum gradient norm for clipping 70 | 71 | Returns: 72 | Trained model 73 | """ 74 | os.makedirs(output_dir, exist_ok=True) 75 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 76 | 77 | # Load tokenizer 78 | tokenizer = AutoTokenizer.from_pretrained(model_path) 79 | 80 | # Load reward model if provided 81 | reward_model = None 82 | if reward_model_path: 83 | try: 84 | reward_model = AutoModelForCausalLM.from_pretrained(reward_model_path).to(device) 85 | logger.info(f"Loaded reward model from {reward_model_path}") 86 | except Exception as e: 87 | logger.warning(f"Could not load reward model: {e}") 88 | 89 | # Create Actor-Critic network 90 | actor_critic = ActorCriticNetwork(model_path, tokenizer.vocab_size).to(device) 91 | 92 | # Set up optimizer 93 | optimizer = torch.optim.Adam(actor_critic.parameters(), lr=lr) 94 | 95 | logger.info("Starting A3C training...") 96 | 97 | # Training loop 98 | actor_critic.train() 99 | for epoch in range(epochs): 100 | total_actor_loss = 0 101 | total_critic_loss = 0 102 | total_entropy = 0 103 | 104 | for i, batch in enumerate(dataset): 105 | if i >= len(dataset) // batch_size: 106 | break 107 | 108 | # Process batch data 109 | inputs = tokenizer(batch["text"], return_tensors="pt", padding=True, truncation=True).to(device) 110 | 111 | # Forward pass 112 | policy_logits, values = actor_critic(inputs.input_ids, inputs.attention_mask) 113 | 114 | # Generate next token to get reward 115 | with torch.no_grad(): 116 | next_token_logits = policy_logits.detach() 117 | next_token = torch.multinomial(torch.exp(next_token_logits), 1) 118 | 119 | # Get reward (from reward model or simple heuristic) 120 | if reward_model: 121 | # Add generated token to input and get reward from reward model 122 | extended_inputs = torch.cat([inputs.input_ids, next_token], dim=1) 123 | reward_output = reward_model(extended_inputs) 124 | rewards = reward_output.logits[:, -1].unsqueeze(-1) 125 | else: 126 | # Simple heuristic reward based on token likelihood 127 | rewards = F.softmax(policy_logits, dim=-1).gather(1, next_token) 128 | 129 | # Calculate advantage = R - V 130 | advantages = rewards - values 131 | 132 | # Actor loss: -log_prob * advantage 133 | selected_log_probs = policy_logits.gather(1, next_token) 134 | actor_loss = -selected_log_probs * advantages.detach() 135 | 136 | # Critic loss: MSE between value and reward 137 | critic_loss = F.mse_loss(values, rewards.detach()) 138 | 139 | # Entropy regularization to encourage exploration 140 | entropy = -(torch.exp(policy_logits) * policy_logits).sum(dim=-1).mean() 141 | 142 | # Total loss 143 | loss = actor_loss.mean() + 0.5 * critic_loss - entropy_weight * entropy 144 | 145 | # Backward and optimize 146 | optimizer.zero_grad() 147 | loss.backward() 148 | torch.nn.utils.clip_grad_norm_(actor_critic.parameters(), max_grad_norm) 149 | optimizer.step() 150 | 151 | # Track metrics 152 | total_actor_loss += actor_loss.mean().item() 153 | total_critic_loss += critic_loss.item() 154 | total_entropy += entropy.item() 155 | 156 | if i % 10 == 0: 157 | logger.info(f"Epoch 
{epoch+1}/{epochs}, Batch {i}/{len(dataset)//batch_size}, " 158 | f"Actor Loss: {total_actor_loss/(i+1):.4f}, " 159 | f"Critic Loss: {total_critic_loss/(i+1):.4f}, " 160 | f"Entropy: {total_entropy/(i+1):.4f}") 161 | 162 | # Save the fine-tuned model 163 | actor_critic.base_model.save_pretrained(output_dir) 164 | tokenizer.save_pretrained(output_dir) 165 | 166 | logger.info(f"A3C training complete. Model saved to {output_dir}") 167 | return actor_critic.base_model, tokenizer -------------------------------------------------------------------------------- /utils/advantage_weighted_regression.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from transformers import AutoModelForCausalLM, AutoTokenizer, PreTrainedModel 5 | import logging 6 | import numpy as np 7 | from torch.utils.data import DataLoader, TensorDataset 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | class AWRModel(nn.Module): 12 | def __init__(self, base_model): 13 | super().__init__() 14 | self.base_model = base_model 15 | self.hidden_size = base_model.config.hidden_size 16 | 17 | # Value function head 18 | self.value_head = nn.Linear(self.hidden_size, 1) 19 | 20 | def forward(self, input_ids, attention_mask=None): 21 | outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask) 22 | hidden_states = outputs.last_hidden_state 23 | logits = outputs.logits 24 | values = self.value_head(hidden_states) 25 | 26 | return { 27 | 'logits': logits, 28 | 'values': values, 29 | 'hidden_states': hidden_states 30 | } 31 | 32 | def compute_advantages(values, rewards, gamma=0.99, lam=0.95): 33 | """Compute generalized advantage estimates""" 34 | advantages = torch.zeros_like(rewards) 35 | last_gae = 0 36 | 37 | # Reverse iterate through time 38 | for t in reversed(range(len(rewards))): 39 | if t == len(rewards) - 1: 40 | # For last timestep, next value is 0 41 | next_value = 0 42 | else: 43 | next_value = values[t + 1] 44 | 45 | delta = rewards[t] + gamma * next_value - values[t] 46 | advantages[t] = delta + gamma * lam * last_gae 47 | last_gae = advantages[t] 48 | 49 | return advantages 50 | 51 | def train_awr(base_model_path, train_dataset, output_dir, reward_model_path=None, 52 | batch_size=4, epochs=1, lr=1e-5, beta=1.0, max_weight=20.0): 53 | """Train a policy using Advantage-Weighted Regression.""" 54 | logger.info("Initializing Advantage Weighted Regression (AWR) Training") 55 | 56 | # Load tokenizer and model 57 | tokenizer = AutoTokenizer.from_pretrained(base_model_path) 58 | base_model = AutoModelForCausalLM.from_pretrained(base_model_path) 59 | 60 | # Create AWR model 61 | model = AWRModel(base_model) 62 | 63 | # Load reward model if provided 64 | if reward_model_path: 65 | logger.info(f"Loading reward model from {reward_model_path}") 66 | reward_model = AutoModelForCausalLM.from_pretrained(reward_model_path) 67 | else: 68 | logger.info("No reward model provided, will use simple rewards") 69 | reward_model = None 70 | 71 | # Setup optimizer 72 | optimizer = torch.optim.Adam(model.parameters(), lr=lr) 73 | 74 | # Training loop 75 | logger.info(f"Starting training for {epochs} epochs") 76 | model.train() 77 | 78 | for epoch in range(epochs): 79 | total_policy_loss = 0 80 | total_value_loss = 0 81 | num_batches = 0 82 | 83 | for i in range(0, len(train_dataset), batch_size): 84 | batch = train_dataset[i:i+batch_size] 85 | 86 | # Tokenize inputs 87 | inputs = tokenizer(batch['text'], 
return_tensors="pt", padding=True, truncation=True) 88 | inputs = {k: v.to(base_model.device) for k, v in inputs.items()} 89 | 90 | # Generate samples with behavior policy (using base model) 91 | with torch.no_grad(): 92 | outputs = base_model.generate( 93 | inputs['input_ids'], 94 | max_new_tokens=20, 95 | do_sample=True, 96 | temperature=0.7, 97 | output_scores=True, 98 | return_dict_in_generate=True, 99 | attention_mask=inputs['attention_mask'] 100 | ) 101 | 102 | # Get generated sequences 103 | sequences = outputs.sequences 104 | 105 | # Get scores (log probs) from behavior policy 106 | behavior_log_probs = [] 107 | for step_scores in outputs.scores: 108 | behavior_log_probs.append(F.log_softmax(step_scores, dim=-1)) 109 | behavior_log_probs = torch.stack(behavior_log_probs, dim=1) 110 | 111 | # Get rewards for generated sequences 112 | if reward_model: 113 | with torch.no_grad(): 114 | reward_outputs = reward_model(sequences) 115 | rewards = reward_outputs.logits.mean(dim=-1) 116 | else: 117 | # Simple reward function: prefer diversity and avoid repetition 118 | rewards = torch.zeros(sequences.size(0), device=sequences.device) 119 | 120 | for b in range(sequences.size(0)): 121 | # Count unique tokens as a diversity measure 122 | unique_tokens = torch.unique(sequences[b]).size(0) 123 | rewards[b] = unique_tokens / sequences.size(1) 124 | 125 | # Get values and target policy log probs 126 | model_outputs = model(sequences) 127 | values = model_outputs['values'].squeeze(-1) 128 | target_logits = model_outputs['logits'] 129 | 130 | # Compute advantages 131 | advantages = compute_advantages(values.detach(), rewards.unsqueeze(1).expand_as(values).detach()) 132 | 133 | # Compute exponential advantage weights, clipped to prevent extremely large weights 134 | weights = torch.exp(advantages / beta) 135 | weights = torch.clamp(weights, 0, max_weight) 136 | 137 | # AWR update: weighted supervised learning 138 | # For each position in the sequence with a generated token: 139 | policy_loss = 0 140 | for t in range(sequences.size(1) - 1): 141 | # Target is the next token 142 | target_tokens = sequences[:, t+1] 143 | 144 | # Predicted distribution for current token 145 | logits = target_logits[:, t, :] 146 | 147 | # Compute cross-entropy loss weighted by advantages 148 | token_loss = F.cross_entropy( 149 | logits, 150 | target_tokens, 151 | reduction='none' 152 | ) 153 | 154 | # Apply weights from advantages 155 | weighted_loss = (token_loss * weights[:, t]).mean() 156 | policy_loss += weighted_loss 157 | 158 | # Value function loss 159 | value_loss = F.mse_loss(values, rewards.unsqueeze(1).expand_as(values)) 160 | 161 | # Combined loss 162 | loss = policy_loss + 0.5 * value_loss 163 | 164 | # Update model 165 | optimizer.zero_grad() 166 | loss.backward() 167 | optimizer.step() 168 | 169 | total_policy_loss += policy_loss.item() 170 | total_value_loss += value_loss.item() 171 | num_batches += 1 172 | 173 | if num_batches % 10 == 0: 174 | logger.info(f"Epoch {epoch+1}, Batch {num_batches}, " 175 | f"Policy Loss: {policy_loss.item():.4f}, " 176 | f"Value Loss: {value_loss.item():.4f}") 177 | 178 | avg_policy_loss = total_policy_loss / num_batches 179 | avg_value_loss = total_value_loss / num_batches 180 | 181 | logger.info(f"Epoch {epoch+1} completed. " 182 | f"Average Policy Loss: {avg_policy_loss:.4f}, " 183 | f"Average Value Loss: {avg_value_loss:.4f}") 184 | 185 | # Save the model 186 | logger.info(f"Training completed. 
Saving model to {output_dir}") 187 | model.base_model.save_pretrained(output_dir) 188 | tokenizer.save_pretrained(output_dir) 189 | 190 | return model.base_model, tokenizer 191 | -------------------------------------------------------------------------------- /utils/adversarial_irl.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from transformers import AutoModelForCausalLM, AutoTokenizer 5 | import logging 6 | import os 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | class RewardNetwork(nn.Module): 11 | """Reward network that learns to distinguish expert data from generated data""" 12 | def __init__(self, base_model): 13 | super().__init__() 14 | self.base_model = base_model 15 | self.hidden_size = base_model.config.hidden_size 16 | 17 | # Discriminator head 18 | self.reward_head = nn.Sequential( 19 | nn.Linear(self.hidden_size, 128), 20 | nn.ReLU(), 21 | nn.Linear(128, 1) 22 | ) 23 | 24 | def forward(self, input_ids, attention_mask=None): 25 | outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask) 26 | hidden_states = outputs.last_hidden_state 27 | 28 | # Get reward prediction for each token 29 | rewards = self.reward_head(hidden_states) 30 | 31 | return rewards 32 | 33 | def get_sequence_reward(self, input_ids, attention_mask=None): 34 | """Get the total reward for a sequence""" 35 | token_rewards = self(input_ids, attention_mask) 36 | # Sum rewards across sequence 37 | return token_rewards.sum(dim=1) 38 | 39 | class GeneratorNetwork(nn.Module): 40 | """Generator network that creates text to fool the discriminator""" 41 | def __init__(self, base_model): 42 | super().__init__() 43 | self.base_model = base_model 44 | 45 | def forward(self, input_ids, attention_mask=None): 46 | return self.base_model(input_ids=input_ids, attention_mask=attention_mask) 47 | 48 | def generate(self, input_ids, attention_mask=None, **kwargs): 49 | return self.base_model.generate( 50 | input_ids=input_ids, 51 | attention_mask=attention_mask, 52 | **kwargs 53 | ) 54 | 55 | def train_adversarial_irl(base_model_path, train_dataset, output_dir, expert_dataset=None, 56 | batch_size=4, epochs=1, lr=1e-5, disc_steps=5, gen_steps=1): 57 | """Train a model using Adversarial Inverse Reinforcement Learning.""" 58 | logger.info("Initializing Adversarial Inverse Reinforcement Learning") 59 | 60 | # Load tokenizer and model 61 | tokenizer = AutoTokenizer.from_pretrained(base_model_path) 62 | base_model = AutoModelForCausalLM.from_pretrained(base_model_path) 63 | 64 | # Create reward and generator networks 65 | reward_network = RewardNetwork(AutoModelForCausalLM.from_pretrained(base_model_path)) 66 | generator = GeneratorNetwork(base_model) 67 | 68 | # If no expert dataset provided, use a portion of train dataset 69 | if expert_dataset is None: 70 | expert_size = len(train_dataset) // 4 71 | expert_indices = torch.randperm(len(train_dataset))[:expert_size] 72 | expert_dataset = torch.utils.data.Subset(train_dataset, expert_indices) 73 | logger.info(f"Created expert dataset with {len(expert_dataset)} examples") 74 | 75 | # Setup optimizers 76 | reward_optimizer = torch.optim.Adam(reward_network.parameters(), lr=lr) 77 | generator_optimizer = torch.optim.Adam(generator.parameters(), lr=lr) 78 | 79 | # Training loop 80 | logger.info(f"Starting training for {epochs} epochs") 81 | 82 | for epoch in range(epochs): 83 | total_reward_loss = 0 84 | total_generator_loss = 0 85 | 
num_batches = 0 86 | 87 | for i in range(0, len(train_dataset), batch_size): 88 | # Get batch 89 | train_batch = train_dataset[i:i+batch_size] 90 | 91 | # Get expert batch 92 | expert_idx = torch.randperm(len(expert_dataset))[:batch_size] 93 | expert_batch = torch.utils.data.Subset(expert_dataset, expert_idx) 94 | 95 | # Tokenize inputs 96 | train_inputs = tokenizer(train_batch['text'], return_tensors="pt", padding=True, truncation=True) 97 | train_inputs = {k: v.to(base_model.device) for k, v in train_inputs.items()} 98 | 99 | expert_inputs = tokenizer([item['text'] for item in expert_batch], return_tensors="pt", padding=True, truncation=True) 100 | expert_inputs = {k: v.to(base_model.device) for k, v in expert_inputs.items()} 101 | 102 | # Step 1: Train discriminator/reward network 103 | for _ in range(disc_steps): 104 | # Generate samples from current policy 105 | with torch.no_grad(): 106 | generated_outputs = generator.generate( 107 | train_inputs['input_ids'], 108 | max_new_tokens=20, 109 | do_sample=True, 110 | temperature=0.7, 111 | attention_mask=train_inputs['attention_mask'] 112 | ) 113 | 114 | # Get discriminator predictions 115 | generated_rewards = reward_network.get_sequence_reward(generated_outputs) 116 | expert_rewards = reward_network.get_sequence_reward(expert_inputs['input_ids'], expert_inputs['attention_mask']) 117 | 118 | # Discriminator loss: expert should get high reward, generated should get low reward 119 | # Using binary cross-entropy loss 120 | expert_labels = torch.ones_like(expert_rewards) 121 | generated_labels = torch.zeros_like(generated_rewards) 122 | 123 | expert_loss = F.binary_cross_entropy_with_logits(expert_rewards, expert_labels) 124 | generated_loss = F.binary_cross_entropy_with_logits(generated_rewards, generated_labels) 125 | 126 | reward_loss = expert_loss + generated_loss 127 | 128 | # Update reward network 129 | reward_optimizer.zero_grad() 130 | reward_loss.backward() 131 | reward_optimizer.step() 132 | 133 | total_reward_loss += reward_loss.item() 134 | 135 | # Step 2: Train generator using rewards 136 | for _ in range(gen_steps): 137 | # Generate trajectories with gradient tracking 138 | generator_outputs = generator(train_inputs['input_ids'], train_inputs['attention_mask']) 139 | generator_logits = generator_outputs.logits 140 | 141 | # Sample from generator 142 | probs = F.softmax(generator_logits[:, -1, :], dim=-1) 143 | actions = torch.multinomial(probs, 1) 144 | 145 | # Prepare next tokens for generator 146 | next_tokens = torch.cat([train_inputs['input_ids'], actions], dim=1) 147 | 148 | # Get rewards for generated tokens 149 | with torch.no_grad(): 150 | rewards = reward_network.get_sequence_reward(next_tokens) 151 | 152 | # Compute policy gradient loss 153 | log_probs = F.log_softmax(generator_logits[:, -1, :], dim=-1) 154 | selected_log_probs = torch.gather(log_probs, 1, actions) 155 | 156 | # Policy gradient: maximize rewards 157 | generator_loss = -(selected_log_probs * rewards).mean() 158 | 159 | # Update generator 160 | generator_optimizer.zero_grad() 161 | generator_loss.backward() 162 | generator_optimizer.step() 163 | 164 | total_generator_loss += generator_loss.item() 165 | 166 | num_batches += 1 167 | 168 | if num_batches % 10 == 0: 169 | logger.info(f"Epoch {epoch+1}, Batch {num_batches}, " 170 | f"Reward Loss: {reward_loss.item():.4f}, " 171 | f"Generator Loss: {generator_loss.item():.4f}") 172 | 173 | avg_reward_loss = total_reward_loss / (num_batches * disc_steps) 174 | avg_generator_loss = total_generator_loss / 
(num_batches * gen_steps) 175 | 176 | logger.info(f"Epoch {epoch+1} completed. " 177 | f"Average Reward Loss: {avg_reward_loss:.4f}, " 178 | f"Average Generator Loss: {avg_generator_loss:.4f}") 179 | 180 | # Save the models 181 | logger.info(f"Training completed. Saving models to {output_dir}") 182 | os.makedirs(output_dir, exist_ok=True) 183 | 184 | # Save generator (main model) 185 | generator.base_model.save_pretrained(output_dir) 186 | tokenizer.save_pretrained(output_dir) 187 | 188 | # Save reward network 189 | reward_network.base_model.save_pretrained(os.path.join(output_dir, "reward_network")) 190 | 191 | return generator.base_model, tokenizer 192 | -------------------------------------------------------------------------------- /utils/bayesian_rl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import numpy as np 6 | import logging 7 | from transformers import AutoModelForCausalLM, AutoTokenizer 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | class BayesianRLModel(nn.Module): 12 | """ 13 | Bayesian Reinforcement Learning Model 14 | ใช้ Bayesian inference เพื่อจัดการกับความไม่แน่นอน 15 | """ 16 | def __init__(self, model_path, vocab_size, device='cuda'): 17 | super(BayesianRLModel, self).__init__() 18 | self.device = device 19 | self.model = AutoModelForCausalLM.from_pretrained(model_path).to(device) 20 | self.vocab_size = vocab_size 21 | 22 | def forward(self, input_ids, attention_mask=None): 23 | outputs = self.model(input_ids=input_ids, attention_mask=attention_mask) 24 | return outputs.logits 25 | 26 | def generate(self, input_ids, attention_mask=None, max_length=30, **kwargs): 27 | current_input_ids = input_ids 28 | current_attention_mask = attention_mask 29 | 30 | for _ in range(max_length): 31 | next_token_logits = self.forward(current_input_ids, current_attention_mask) 32 | next_token = torch.multinomial(F.softmax(next_token_logits, dim=-1), num_samples=1) 33 | current_input_ids = torch.cat([current_input_ids, next_token], dim=1) 34 | if current_attention_mask is not None: 35 | current_attention_mask = torch.cat([current_attention_mask, torch.ones_like(next_token)], dim=1) 36 | 37 | return current_input_ids 38 | 39 | def train_bayesian_rl( 40 | model_path, 41 | dataset, 42 | output_dir, 43 | batch_size=4, 44 | epochs=1, 45 | lr=1e-5 46 | ): 47 | """ 48 | Train a Bayesian Reinforcement Learning model 49 | 50 | Args: 51 | model_path: Path to the pre-trained model 52 | dataset: Dataset for training 53 | output_dir: Directory to save the model 54 | batch_size: Batch size for training 55 | epochs: Number of epochs for training 56 | lr: Learning rate 57 | 58 | Returns: 59 | Trained model 60 | """ 61 | os.makedirs(output_dir, exist_ok=True) 62 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 63 | 64 | tokenizer = AutoTokenizer.from_pretrained(model_path) 65 | vocab_size = tokenizer.vocab_size 66 | 67 | model = BayesianRLModel(model_path, vocab_size, device).to(device) 68 | 69 | optimizer = torch.optim.Adam(model.parameters(), lr=lr) 70 | 71 | logger.info("Starting Bayesian RL training...") 72 | 73 | for epoch in range(epochs): 74 | total_reward = 0 75 | total_loss = 0 76 | 77 | for i, batch in enumerate(dataset): 78 | if i >= len(dataset) // batch_size: 79 | break 80 | 81 | inputs = tokenizer(batch["text"], return_tensors="pt", padding=True, truncation=True).to(device) 82 | 83 | generated_ids = 
model.generate(inputs.input_ids, inputs.attention_mask, max_length=10) 84 | generated_part = generated_ids[:, inputs.input_ids.shape[1]:] 85 | 86 | # Calculate rewards 87 | rewards = F.softmax(model(inputs.input_ids, inputs.attention_mask), dim=-1).gather(1, generated_part[:, 0].unsqueeze(1)) 88 | 89 | # Policy gradient loss (maximize reward) 90 | loss = -torch.mean(rewards) 91 | 92 | optimizer.zero_grad() 93 | loss.backward() 94 | optimizer.step() 95 | 96 | total_reward += rewards.mean().item() 97 | total_loss += loss.item() 98 | 99 | if i % 10 == 0: 100 | logger.info(f"Epoch {epoch+1}/{epochs}, Batch {i}/{len(dataset)//batch_size}, " 101 | f"Loss: {total_loss/(i+1):.4f}, " 102 | f"Avg Reward: {total_reward/(i+1):.4f}") 103 | 104 | model.save_pretrained(output_dir) 105 | tokenizer.save_pretrained(output_dir) 106 | 107 | logger.info(f"Bayesian RL training complete. Model saved to {output_dir}") 108 | return model, tokenizer -------------------------------------------------------------------------------- /utils/contrastive_rl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import numpy as np 6 | import logging 7 | from transformers import AutoModelForCausalLM, AutoTokenizer 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | class ContrastiveRLModel(nn.Module): 12 | """ 13 | Contrastive Learning + Reinforcement Learning Model 14 | ผสมผสาน contrastive learning เข้ากับ RL เพื่อสร้างการแสดงผล (representations) ที่มีประสิทธิภาพ 15 | """ 16 | def __init__(self, model_path, vocab_size, device='cuda'): 17 | super(ContrastiveRLModel, self).__init__() 18 | self.device = device 19 | self.model = AutoModelForCausalLM.from_pretrained(model_path).to(device) 20 | self.vocab_size = vocab_size 21 | 22 | def forward(self, input_ids, attention_mask=None): 23 | outputs = self.model(input_ids=input_ids, attention_mask=attention_mask) 24 | return outputs.logits 25 | 26 | def generate(self, input_ids, attention_mask=None, max_length=30, **kwargs): 27 | current_input_ids = input_ids 28 | current_attention_mask = attention_mask 29 | 30 | for _ in range(max_length): 31 | next_token_logits = self.forward(current_input_ids, current_attention_mask) 32 | next_token = torch.multinomial(F.softmax(next_token_logits, dim=-1), num_samples=1) 33 | current_input_ids = torch.cat([current_input_ids, next_token], dim=1) 34 | if current_attention_mask is not None: 35 | current_attention_mask = torch.cat([current_attention_mask, torch.ones_like(next_token)], dim=1) 36 | 37 | return current_input_ids 38 | 39 | def train_contrastive_rl( 40 | model_path, 41 | dataset, 42 | output_dir, 43 | batch_size=4, 44 | epochs=1, 45 | lr=1e-5, 46 | contrastive_loss_weight=0.1 47 | ): 48 | """ 49 | Train a Contrastive Learning + Reinforcement Learning model 50 | 51 | Args: 52 | model_path: Path to the pre-trained model 53 | dataset: Dataset for training 54 | output_dir: Directory to save the model 55 | batch_size: Batch size for training 56 | epochs: Number of epochs for training 57 | lr: Learning rate 58 | contrastive_loss_weight: Weight for the contrastive loss 59 | 60 | Returns: 61 | Trained model 62 | """ 63 | os.makedirs(output_dir, exist_ok=True) 64 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 65 | 66 | tokenizer = AutoTokenizer.from_pretrained(model_path) 67 | vocab_size = tokenizer.vocab_size 68 | 69 | model = ContrastiveRLModel(model_path, vocab_size, device).to(device) 70 | 
71 | optimizer = torch.optim.Adam(model.parameters(), lr=lr) 72 | 73 | logger.info("Starting Contrastive RL training...") 74 | 75 | for epoch in range(epochs): 76 | total_reward = 0 77 | total_loss = 0 78 | 79 | for i, batch in enumerate(dataset): 80 | if i >= len(dataset) // batch_size: 81 | break 82 | 83 | inputs = tokenizer(batch["text"], return_tensors="pt", padding=True, truncation=True).to(device) 84 | 85 | generated_ids = model.generate(inputs.input_ids, inputs.attention_mask, max_length=10) 86 | generated_part = generated_ids[:, inputs.input_ids.shape[1]:] 87 | 88 | # Calculate rewards: probability the model assigns to the first generated token 89 | rewards = F.softmax(model(inputs.input_ids, inputs.attention_mask)[:, -1, :], dim=-1).gather(1, generated_part[:, 0].unsqueeze(1)) 90 | 91 | # Contrastive loss: per-sequence LM loss, kept only for "positive pairs" 92 | positive_pairs = generated_part[:, 0] == inputs.input_ids[:, 0] 93 | token_losses = F.cross_entropy(model(inputs.input_ids, inputs.attention_mask).flatten(0, 1), inputs.input_ids.view(-1), reduction='none').view(inputs.input_ids.shape) 94 | contrastive_loss = token_losses.mean(dim=1)[positive_pairs].mean() if positive_pairs.any() else torch.zeros((), device=device) 95 | 96 | # Policy gradient loss (maximize reward) plus weighted contrastive term 97 | loss = -torch.mean(rewards) + contrastive_loss_weight * contrastive_loss 98 | 99 | optimizer.zero_grad() 100 | loss.backward() 101 | optimizer.step() 102 | 103 | total_reward += rewards.mean().item() 104 | total_loss += loss.item() 105 | 106 | if i % 10 == 0: 107 | logger.info(f"Epoch {epoch+1}/{epochs}, Batch {i}/{len(dataset)//batch_size}, " 108 | f"Loss: {total_loss/(i+1):.4f}, " 109 | f"Avg Reward: {total_reward/(i+1):.4f}") 110 | 111 | model.model.save_pretrained(output_dir)  # save the underlying Hugging Face model 112 | tokenizer.save_pretrained(output_dir) 113 | 114 | logger.info(f"Contrastive RL training complete. Model saved to {output_dir}") 115 | return model, tokenizer -------------------------------------------------------------------------------- /utils/cot_trainer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline 4 | 5 | def train_cot(model_name, train_dataset, eval_dataset, output_dir, batch_size, epochs, learning_rate): 6 | """ 7 | Fine-tune the model on Chain-of-Thought (CoT) formatted targets 8 | """ 9 | os.makedirs(output_dir, exist_ok=True) 10 | 11 | # Load model and tokenizer 12 | tokenizer = AutoTokenizer.from_pretrained(model_name) 13 | model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="auto") 14 | 15 | if tokenizer.pad_token is None: 16 | tokenizer.pad_token = tokenizer.eos_token 17 | model.config.pad_token_id = model.config.eos_token_id 18 | 19 | # Prepare data for CoT 20 | def prepare_cot_data(dataset): 21 | cot_data = [] 22 | for example in dataset: 23 | prompt = example['prompt'] 24 | response = example['chosen'] 25 | cot_example = f"{prompt} Let's think step by step: {response}" 26 | cot_data.append({"prompt": prompt, "response": cot_example}) 27 | return cot_data 28 | 29 | train_cot_data = prepare_cot_data(train_dataset) 30 | eval_cot_data = prepare_cot_data(eval_dataset) 31 | 32 | # Training loop 33 | optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate) 34 | model.train() 35 | 36 | for epoch in range(epochs): 37 | for batch_idx in range(0, len(train_cot_data), batch_size): 38 | batch = train_cot_data[batch_idx:batch_idx + batch_size] 39 | inputs = tokenizer([ex['response'] for ex in batch], return_tensors='pt', padding=True, truncation=True)  # the full CoT text (prompt + reasoning + answer) is the causal-LM input 40 | labels = inputs  # and also the target, so the label tensor always matches the input length 41 | 42
| inputs = inputs.to(model.device) 43 | labels = labels.to(model.device) 44 | 45 | outputs = model(**inputs, labels=labels.input_ids) 46 | loss = outputs.loss 47 | loss.backward() 48 | 49 | optimizer.step() 50 | optimizer.zero_grad() 51 | 52 | if batch_idx % 10 == 0: 53 | print(f"Epoch {epoch + 1}/{epochs}, Batch {batch_idx}, Loss: {loss.item()}") 54 | 55 | # Save the final model 56 | model.save_pretrained(output_dir) 57 | tokenizer.save_pretrained(output_dir) 58 | 59 | return model, tokenizer -------------------------------------------------------------------------------- /utils/curiosity_rl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import numpy as np 6 | import logging 7 | from transformers import AutoModelForCausalLM, AutoTokenizer 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | class CuriosityPolicy(nn.Module): 12 | def __init__(self, model_path, vocab_size, device='cuda'): 13 | super(CuriosityPolicy, self).__init__() 14 | self.device = device 15 | self.model = AutoModelForCausalLM.from_pretrained(model_path).to(device) 16 | self.vocab_size = vocab_size 17 | self.curiosity_module = nn.Linear(768, 1) 18 | 19 | def forward(self, input_ids, attention_mask=None): 20 | outputs = self.model(input_ids=input_ids, attention_mask=attention_mask) 21 | curiosity_reward = self.curiosity_module(outputs.hidden_states[-1][:, -1, :]) 22 | return outputs.logits, curiosity_reward 23 | 24 | def train_curiosity_rl( 25 | model_path, 26 | dataset, 27 | output_dir, 28 | batch_size=4, 29 | epochs=1, 30 | lr=1e-5, 31 | curiosity_weight=0.1 32 | ): 33 | os.makedirs(output_dir, exist_ok=True) 34 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 35 | 36 | tokenizer = AutoTokenizer.from_pretrained(model_path) 37 | vocab_size = tokenizer.vocab_size 38 | 39 | policy = CuriosityPolicy(model_path, vocab_size, device).to(device) 40 | 41 | optimizer = torch.optim.Adam(policy.parameters(), lr=lr) 42 | 43 | logger.info("Starting Curiosity-driven RL training...") 44 | 45 | for epoch in range(epochs): 46 | total_reward = 0 47 | total_loss = 0 48 | 49 | for i, batch in enumerate(dataset): 50 | if i >= len(dataset) // batch_size: 51 | break 52 | 53 | inputs = tokenizer(batch["text"], return_tensors="pt", padding=True, truncation=True).to(device) 54 | 55 | logits, curiosity_rewards = policy(inputs.input_ids, inputs.attention_mask) 56 | action = torch.argmax(logits, dim=-1) 57 | rewards = F.softmax(logits, dim=-1).gather(1, action.unsqueeze(1)) 58 | total_rewards = rewards + curiosity_weight * curiosity_rewards 59 | 60 | loss = -torch.mean(total_rewards) 61 | 62 | optimizer.zero_grad() 63 | loss.backward() 64 | optimizer.step() 65 | 66 | total_reward += total_rewards.mean().item() 67 | total_loss += loss.item() 68 | 69 | if i % 10 == 0: 70 | logger.info(f"Epoch {epoch+1}/{epochs}, Batch {i}/{len(dataset)//batch_size}, " 71 | f"Loss: {total_loss/(i+1):.4f}, " 72 | f"Avg Reward: {total_reward/(i+1):.4f}") 73 | 74 | policy.save_pretrained(output_dir) 75 | tokenizer.save_pretrained(output_dir) 76 | 77 | logger.info(f"Curiosity-driven RL training complete. 
Model saved to {output_dir}") 78 | return policy, tokenizer -------------------------------------------------------------------------------- /utils/curriculum_rl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import numpy as np 6 | import logging 7 | from transformers import AutoModelForCausalLM, AutoTokenizer 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | class CurriculumRLModel(nn.Module): 12 | """ 13 | Curriculum Reinforcement Learning Model 14 | ฝึกโมเดลจากงานง่ายไปยาก 15 | """ 16 | def __init__(self, model_path, vocab_size, device='cuda'): 17 | super(CurriculumRLModel, self).__init__() 18 | self.device = device 19 | self.model = AutoModelForCausalLM.from_pretrained(model_path).to(device) 20 | self.vocab_size = vocab_size 21 | 22 | def forward(self, input_ids, attention_mask=None): 23 | outputs = self.model(input_ids=input_ids, attention_mask=attention_mask) 24 | return outputs.logits 25 | 26 | def generate(self, input_ids, attention_mask=None, max_length=30, **kwargs): 27 | current_input_ids = input_ids 28 | current_attention_mask = attention_mask 29 | 30 | for _ in range(max_length): 31 | next_token_logits = self.forward(current_input_ids, current_attention_mask) 32 | next_token = torch.multinomial(F.softmax(next_token_logits, dim=-1), num_samples=1) 33 | current_input_ids = torch.cat([current_input_ids, next_token], dim=1) 34 | if current_attention_mask is not None: 35 | current_attention_mask = torch.cat([current_attention_mask, torch.ones_like(next_token)], dim=1) 36 | 37 | return current_input_ids 38 | 39 | def train_curriculum_rl( 40 | model_path, 41 | dataset, 42 | output_dir, 43 | batch_size=4, 44 | epochs=1, 45 | lr=1e-5, 46 | curriculum_schedule=[0.5, 0.3, 0.2] # Percentages for different levels of difficulty 47 | ): 48 | """ 49 | Train a Curriculum Reinforcement Learning model 50 | 51 | Args: 52 | model_path: Path to the pre-trained model 53 | dataset: Dataset for training 54 | output_dir: Directory to save the model 55 | batch_size: Batch size for training 56 | epochs: Number of epochs for training 57 | lr: Learning rate 58 | curriculum_schedule: List of percentages for different levels of difficulty 59 | 60 | Returns: 61 | Trained model 62 | """ 63 | os.makedirs(output_dir, exist_ok=True) 64 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 65 | 66 | tokenizer = AutoTokenizer.from_pretrained(model_path) 67 | vocab_size = tokenizer.vocab_size 68 | 69 | model = CurriculumRLModel(model_path, vocab_size, device).to(device) 70 | 71 | optimizer = torch.optim.Adam(model.parameters(), lr=lr) 72 | 73 | logger.info("Starting Curriculum RL training...") 74 | 75 | for epoch in range(epochs): 76 | total_reward = 0 77 | total_loss = 0 78 | 79 | for difficulty, percentage in enumerate(curriculum_schedule): 80 | num_batches = int(len(dataset) * percentage / batch_size) 81 | 82 | for i, batch in enumerate(dataset): 83 | if i >= num_batches: 84 | break 85 | 86 | inputs = tokenizer(batch["text"], return_tensors="pt", padding=True, truncation=True).to(device) 87 | 88 | generated_ids = model.generate(inputs.input_ids, inputs.attention_mask, max_length=10) 89 | generated_part = generated_ids[:, inputs.input_ids.shape[1]:] 90 | 91 | # Calculate rewards 92 | rewards = F.softmax(model(inputs.input_ids, inputs.attention_mask), dim=-1).gather(1, generated_part[:, 0].unsqueeze(1)) 93 | 94 | # Policy gradient loss (maximize reward) 95 | 
loss = -torch.mean(rewards) 96 | 97 | optimizer.zero_grad() 98 | loss.backward() 99 | optimizer.step() 100 | 101 | total_reward += rewards.mean().item() 102 | total_loss += loss.item() 103 | 104 | if i % 10 == 0: 105 | logger.info(f"Epoch {epoch+1}/{epochs}, Difficulty Level {difficulty+1}, Batch {i}/{num_batches}, " 106 | f"Loss: {total_loss/(i+1):.4f}, " 107 | f"Avg Reward: {total_reward/(i+1):.4f}") 108 | 109 | model.save_pretrained(output_dir) 110 | tokenizer.save_pretrained(output_dir) 111 | 112 | logger.info(f"Curriculum RL training complete. Model saved to {output_dir}") 113 | return model, tokenizer -------------------------------------------------------------------------------- /utils/data_loader.py: -------------------------------------------------------------------------------- 1 | from datasets import load_dataset, Dataset 2 | import random 3 | import torch 4 | 5 | def prepare_dataset(dataset_name, max_samples=5000): 6 | """ 7 | โหลดและเตรียม dataset พร้อมสร้าง preference pairs 8 | """ 9 | if dataset_name == "wisesight_sentiment": 10 | dataset = load_dataset("wisesight_sentiment") 11 | train_data = dataset["train"] 12 | 13 | preference_data = {"prompt": [], "chosen": [], "rejected": []} 14 | negative_responses = [ 15 | "ร้านนี้แย่มาก อาหารไม่อร่อย", 16 | "บริการแย่ ราคาแพงเกินไป ไม่คุ้มค่า", 17 | "อาหารรสชาติแย่ ไม่อร่อยเลย ไม่แนะนำ" 18 | ] 19 | positive_responses = [ 20 | "ร้านนี้ดีมาก อาหารอร่อย", 21 | "บริการประทับใจ คุ้มค่ากับราคา", 22 | "อาหารรสชาติดีเยี่ยม แนะนำให้ลอง" 23 | ] 24 | 25 | sample_count = min(max_samples, len(train_data)) 26 | sampled_indices = random.sample(range(len(train_data)), sample_count) 27 | 28 | for i in sampled_indices: 29 | prompt = "รีวิวร้านอาหาร: " 30 | text = train_data[i]["text"] 31 | label = train_data[i]["category"] 32 | 33 | if label == "positive": 34 | preference_data["prompt"].append(prompt) 35 | preference_data["chosen"].append(text) 36 | preference_data["rejected"].append(random.choice(negative_responses)) 37 | elif label == "negative": 38 | preference_data["prompt"].append(prompt) 39 | preference_data["chosen"].append(random.choice(positive_responses)) 40 | preference_data["rejected"].append(text) 41 | elif label == "neutral": 42 | preference_data["prompt"].append(prompt) 43 | preference_data["chosen"].append(text if random.random() > 0.5 else random.choice(positive_responses)) 44 | preference_data["rejected"].append(random.choice(negative_responses)) 45 | 46 | preference_dataset = Dataset.from_dict(preference_data) 47 | train_test_split = preference_dataset.train_test_split(test_size=0.2, seed=42) 48 | return train_test_split["train"], train_test_split["test"] 49 | else: 50 | raise ValueError(f"Dataset {dataset_name} is not supported") 51 | 52 | def create_preference_data(dataset): 53 | """ 54 | สร้าง preference data จาก dataset ที่มีอยู่ 55 | """ 56 | return dataset -------------------------------------------------------------------------------- /utils/distributional_rl.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from transformers import AutoModelForCausalLM, AutoTokenizer 5 | import logging 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | class CategoricalDQN(nn.Module): 10 | def __init__(self, base_model, n_atoms=51, vmin=-10, vmax=10): 11 | super().__init__() 12 | self.base_model = base_model 13 | self.n_atoms = n_atoms 14 | self.vmin = vmin 15 | self.vmax = vmax 16 | self.supports = torch.linspace(vmin, 
vmax, n_atoms).to(self.base_model.device) 17 | self.delta = (vmax - vmin) / (n_atoms - 1) 18 | 19 | # Distribution head 20 | hidden_size = self.base_model.config.hidden_size 21 | self.value_dist = nn.Linear(hidden_size, n_atoms) 22 | 23 | def forward(self, input_ids, attention_mask=None): 24 | outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask) 25 | hidden_states = outputs.last_hidden_state[:, -1, :] 26 | logits = self.value_dist(hidden_states) 27 | return F.softmax(logits, dim=-1) # Return probability distribution 28 | 29 | def get_value(self, input_ids, attention_mask=None): 30 | dist = self(input_ids, attention_mask) 31 | return torch.sum(dist * self.supports.expand_as(dist), dim=1) 32 | 33 | def project_distribution(target_support, target_dist, support, n_atoms, vmin, vmax, delta): 34 | """Projects the categorical distribution onto a new support.""" 35 | batch_size = target_dist.size(0) 36 | 37 | # Clipping projection 38 | proj_support = torch.clamp(target_support, vmin, vmax) 39 | 40 | # Compute projection 41 | tz_j = (proj_support - vmin) / delta 42 | tz_j_floor = tz_j.floor().long() 43 | tz_j_ceil = tz_j.ceil().long() 44 | 45 | # Handle corner cases 46 | tz_j_floor = torch.clamp(tz_j_floor, 0, n_atoms - 1) 47 | tz_j_ceil = torch.clamp(tz_j_ceil, 0, n_atoms - 1) 48 | 49 | # Compute weights 50 | ceil_weight = tz_j - tz_j_floor.float() 51 | floor_weight = 1.0 - ceil_weight 52 | 53 | # Distribute probability 54 | proj_dist = torch.zeros_like(target_dist) 55 | 56 | for b in range(batch_size): 57 | for i in range(n_atoms): 58 | floor_idx, ceil_idx = tz_j_floor[b][i], tz_j_ceil[b][i] 59 | proj_dist[b][floor_idx] += target_dist[b][i] * floor_weight[b][i] 60 | proj_dist[b][ceil_idx] += target_dist[b][i] * ceil_weight[b][i] 61 | 62 | return proj_dist 63 | 64 | def train_distributional_rl(base_model_path, train_dataset, output_dir, reward_model_path=None, 65 | batch_size=4, epochs=1, lr=1e-5, n_atoms=51, vmin=-10, vmax=10): 66 | """Train a distributional RL model for language generation.""" 67 | logger.info("Initializing Distributional RL training with Categorical DQN") 68 | 69 | # Load tokenizer and model 70 | tokenizer = AutoTokenizer.from_pretrained(base_model_path) 71 | base_model = AutoModelForCausalLM.from_pretrained(base_model_path) 72 | 73 | # Create distributional RL model 74 | model = CategoricalDQN(base_model, n_atoms, vmin, vmax) 75 | 76 | # Load reward model if provided 77 | if reward_model_path: 78 | logger.info(f"Loading reward model from {reward_model_path}") 79 | reward_model = AutoModelForCausalLM.from_pretrained(reward_model_path) 80 | else: 81 | logger.info("No reward model provided, will use base model for rewards") 82 | reward_model = base_model 83 | 84 | # Training setup 85 | optimizer = torch.optim.Adam(model.parameters(), lr=lr) 86 | support = torch.linspace(vmin, vmax, n_atoms).to(base_model.device) 87 | delta = (vmax - vmin) / (n_atoms - 1) 88 | 89 | # Training loop 90 | logger.info(f"Starting training for {epochs} epochs") 91 | model.train() 92 | 93 | for epoch in range(epochs): 94 | total_loss = 0 95 | num_batches = 0 96 | 97 | for i in range(0, len(train_dataset), batch_size): 98 | batch = train_dataset[i:i+batch_size] 99 | 100 | # Tokenize inputs 101 | inputs = tokenizer(batch['text'], return_tensors="pt", padding=True, truncation=True) 102 | inputs = {k: v.to(base_model.device) for k, v in inputs.items()} 103 | 104 | # Get current distribution 105 | current_dist = model(inputs['input_ids'], inputs['attention_mask']) 106 | 107 | # 
Generate next tokens 108 | with torch.no_grad(): 109 | outputs = base_model.generate( 110 | inputs['input_ids'], 111 | max_new_tokens=20, 112 | do_sample=True, 113 | temperature=0.7, 114 | attention_mask=inputs['attention_mask'] 115 | ) 116 | 117 | # Get rewards 118 | with torch.no_grad(): 119 | rewards = reward_model(outputs, attention_mask=torch.ones_like(outputs)).logits.mean(dim=1) 120 | 121 | # Calculate target distribution 122 | target_support = support.unsqueeze(0).expand(batch_size, -1) + rewards.unsqueeze(1) 123 | target_dist = project_distribution( 124 | target_support, 125 | current_dist.detach(), 126 | support, 127 | n_atoms, 128 | vmin, 129 | vmax, 130 | delta 131 | ) 132 | 133 | # Compute KL divergence loss 134 | log_probs = F.log_softmax(model.value_dist(base_model(inputs['input_ids']).last_hidden_state[:, -1, :]), dim=1) 135 | loss = -(target_dist * log_probs).sum(dim=1).mean() 136 | 137 | # Update model 138 | optimizer.zero_grad() 139 | loss.backward() 140 | optimizer.step() 141 | 142 | total_loss += loss.item() 143 | num_batches += 1 144 | 145 | if num_batches % 10 == 0: 146 | logger.info(f"Epoch {epoch+1}, Batch {num_batches}, Loss: {loss.item():.4f}") 147 | 148 | avg_loss = total_loss / num_batches 149 | logger.info(f"Epoch {epoch+1} completed. Average Loss: {avg_loss:.4f}") 150 | 151 | # Save the model 152 | logger.info(f"Training completed. Saving model to {output_dir}") 153 | model.base_model.save_pretrained(output_dir) 154 | tokenizer.save_pretrained(output_dir) 155 | 156 | return model, tokenizer 157 | -------------------------------------------------------------------------------- /utils/dpo_trainer.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from transformers import AutoTokenizer, AutoModelForCausalLM 4 | from trl import DPOConfig, DPOTrainer 5 | 6 | def run_dpo_training(model_name, train_dataset, eval_dataset, output_dir, batch_size, epochs, learning_rate): 7 | """ 8 | ฝึกโมเดลด้วย Direct Preference Optimization 9 | """ 10 | os.makedirs(output_dir, exist_ok=True) 11 | 12 | # Load model and tokenizer 13 | tokenizer = AutoTokenizer.from_pretrained(model_name) 14 | model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="auto") 15 | if tokenizer.pad_token is None: 16 | tokenizer.pad_token = tokenizer.eos_token 17 | model.config.pad_token_id = model.config.eos_token_id 18 | 19 | # Load reference model 20 | ref_model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="auto") 21 | 22 | # Configure DPO training 23 | dpo_config = DPOConfig( 24 | output_dir=output_dir, 25 | per_device_train_batch_size=batch_size, 26 | per_device_eval_batch_size=batch_size, 27 | gradient_accumulation_steps=4, 28 | num_train_epochs=epochs, 29 | fp16=torch.cuda.is_available(), 30 | learning_rate=learning_rate, 31 | lr_scheduler_type="cosine", 32 | warmup_ratio=0.1, 33 | logging_steps=10, 34 | evaluation_strategy="steps", 35 | eval_steps=50, 36 | save_strategy="steps", 37 | save_steps=50, 38 | save_total_limit=2, 39 | load_best_model_at_end=True, 40 | metric_for_best_model="eval_loss", 41 | beta=0.1, 42 | max_prompt_length=128, 43 | max_length=256, 44 | ) 45 | 46 | # Initialize DPO trainer 47 | dpo_trainer = DPOTrainer( 48 | model=model, 49 | ref_model=ref_model, 50 | args=dpo_config, 51 | train_dataset=train_dataset, 52 | eval_dataset=eval_dataset, 53 | tokenizer=tokenizer, 54 | ) 55 | 56 | # Start DPO training 57 | dpo_trainer.train() 58 
| 59 | # Save the final model 60 | dpo_trainer.model.save_pretrained(output_dir) 61 | tokenizer.save_pretrained(output_dir) 62 | 63 | return dpo_trainer.model, tokenizer -------------------------------------------------------------------------------- /utils/ensemble_rl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import logging 6 | from transformers import AutoModelForCausalLM, AutoTokenizer 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | class EnsembleRLModel(nn.Module): 11 | """ 12 | Ensemble Reinforcement Learning Model 13 | ผสมผสานหลายโมเดล RL เข้าด้วยกันเพื่อตัดสินใจร่วมกัน 14 | """ 15 | def __init__(self, model_paths, model_types, vocab_size, device='cuda'): 16 | """ 17 | Args: 18 | model_paths (list): รายการ path ของโมเดลแต่ละตัว 19 | model_types (list): ชนิดของแต่ละโมเดล (เช่น 'ppo', 'sac', 'dqn') 20 | vocab_size (int): ขนาดคำศัพท์ 21 | device (str): อุปกรณ์ที่ใช้ในการประมวลผล 22 | """ 23 | super(EnsembleRLModel, self).__init__() 24 | self.device = device 25 | self.models = nn.ModuleList() 26 | self.model_types = model_types 27 | self.vocab_size = vocab_size 28 | 29 | for path, model_type in zip(model_paths, model_types): 30 | try: 31 | model = AutoModelForCausalLM.from_pretrained(path).to(device) 32 | self.models.append(model) 33 | logger.info(f"Loaded {model_type} model from {path}") 34 | except Exception as e: 35 | logger.error(f"Error loading model {model_type} from {path}: {e}") 36 | raise e 37 | 38 | # Dynamic weight parameters for each model 39 | self.model_weights = nn.Parameter(torch.ones(len(model_paths))) 40 | 41 | # Context-dependent weight prediction network 42 | self.context_network = nn.Sequential( 43 | nn.Linear(768, 256), # Assuming 768 for hidden size 44 | nn.ReLU(), 45 | nn.Linear(256, len(model_paths)), 46 | nn.Softmax(dim=-1) 47 | ) 48 | 49 | def forward(self, input_ids, attention_mask=None): 50 | # Get the hidden state from the last token for context 51 | with torch.no_grad(): 52 | # Using the first model to get context representation 53 | outputs = self.models[0](input_ids, attention_mask=attention_mask, output_hidden_states=True) 54 | hidden_states = outputs.hidden_states[-1] 55 | context_features = hidden_states[:, -1, :] 56 | 57 | # Predict context-dependent weights 58 | context_weights = self.context_network(context_features) 59 | 60 | # Combine normalized base weights with context weights 61 | combined_weights = F.softmax(self.model_weights, dim=0) * context_weights 62 | 63 | # Get predictions from each model 64 | all_logits = [] 65 | for model in self.models: 66 | with torch.no_grad(): 67 | outputs = model(input_ids, attention_mask=attention_mask) 68 | logits = outputs.logits[:, -1, :] # Last token logits 69 | all_logits.append(logits) 70 | 71 | # Stack and weight the logits 72 | stacked_logits = torch.stack(all_logits, dim=0) 73 | weighted_logits = torch.sum(stacked_logits * combined_weights.view(-1, 1, 1), dim=0) 74 | 75 | return weighted_logits 76 | 77 | def generate(self, input_ids, attention_mask=None, max_length=30, **kwargs): 78 | """ 79 | Generate text using the ensemble model 80 | """ 81 | # Start with the input context 82 | current_input_ids = input_ids 83 | current_attention_mask = attention_mask 84 | 85 | for _ in range(max_length): 86 | # Get weighted predictions for next token 87 | next_token_logits = self.forward(current_input_ids, current_attention_mask) 88 | 89 | # Sample from the logits 90 | 
next_token = torch.multinomial(F.softmax(next_token_logits, dim=-1), num_samples=1) 91 | 92 | # Append to the sequence 93 | current_input_ids = torch.cat([current_input_ids, next_token], dim=1) 94 | if current_attention_mask is not None: 95 | current_attention_mask = torch.cat( 96 | [current_attention_mask, torch.ones_like(next_token)], dim=1 97 | ) 98 | 99 | return current_input_ids 100 | 101 | def train_ensemble_rl( 102 | model_paths, 103 | model_types, 104 | dataset, 105 | output_dir, 106 | reward_model_path=None, 107 | batch_size=4, 108 | epochs=1, 109 | lr=1e-5 110 | ): 111 | """ 112 | Train an Ensemble Reinforcement Learning model 113 | 114 | Args: 115 | model_paths (list): รายการ path ของโมเดลแต่ละตัว 116 | model_types (list): ชนิดของแต่ละโมเดล (เช่น 'ppo', 'sac', 'dqn') 117 | dataset: Dataset for training 118 | output_dir: Directory to save the model 119 | reward_model_path: Path to a pre-trained reward model (optional) 120 | batch_size: Batch size for training 121 | epochs: Number of epochs for training 122 | lr: Learning rate 123 | 124 | Returns: 125 | Trained ensemble model 126 | """ 127 | os.makedirs(output_dir, exist_ok=True) 128 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 129 | 130 | # Load tokenizer from the first model 131 | tokenizer = AutoTokenizer.from_pretrained(model_paths[0]) 132 | vocab_size = tokenizer.vocab_size 133 | 134 | # Load reward model if provided 135 | reward_model = None 136 | if reward_model_path: 137 | try: 138 | reward_model = AutoModelForCausalLM.from_pretrained(reward_model_path).to(device) 139 | logger.info(f"Loaded reward model from {reward_model_path}") 140 | except Exception as e: 141 | logger.warning(f"Could not load reward model: {e}") 142 | 143 | # Initialize ensemble model 144 | ensemble_model = EnsembleRLModel(model_paths, model_types, vocab_size, device).to(device) 145 | 146 | # Only train the weights and context network, not the base models 147 | optimizer = torch.optim.Adam([ 148 | {'params': ensemble_model.model_weights}, 149 | {'params': ensemble_model.context_network.parameters()} 150 | ], lr=lr) 151 | 152 | logger.info("Starting Ensemble RL training...") 153 | 154 | # Track best performance 155 | best_reward = float('-inf') 156 | 157 | # Training loop 158 | for epoch in range(epochs): 159 | total_reward = 0 160 | total_loss = 0 161 | 162 | for i, batch in enumerate(dataset): 163 | if i >= len(dataset) // batch_size: 164 | break 165 | 166 | # Process batch data 167 | inputs = tokenizer(batch["text"], return_tensors="pt", padding=True, truncation=True).to(device) 168 | 169 | # Generate completions with ensemble model 170 | with torch.no_grad(): 171 | generated_ids = ensemble_model.generate( 172 | inputs.input_ids, 173 | attention_mask=inputs.attention_mask, 174 | max_length=10 175 | ) 176 | 177 | # Extract only the generated part 178 | generated_part = generated_ids[:, inputs.input_ids.shape[1]:] 179 | 180 | # Calculate reward 181 | if reward_model: 182 | with torch.no_grad(): 183 | reward_outputs = reward_model(generated_ids) 184 | rewards = reward_outputs.logits[:, -1] 185 | else: 186 | # Simple heuristic reward based on probability 187 | with torch.no_grad(): 188 | # Calculate probability of generated sequence under ensemble model 189 | outputs = ensemble_model(inputs.input_ids, attention_mask=inputs.attention_mask) 190 | probs = F.softmax(outputs, dim=-1) 191 | selected_probs = probs.gather(1, generated_part[:, 0].unsqueeze(1)) 192 | rewards = torch.log(selected_probs + 1e-10).squeeze() 193 | 194 | # 
Policy gradient loss (maximize reward) 195 | loss = -torch.mean(rewards) 196 | 197 | # Update weights 198 | optimizer.zero_grad() 199 | loss.backward() 200 | optimizer.step() 201 | 202 | total_reward += rewards.mean().item() 203 | total_loss += loss.item() 204 | 205 | if i % 10 == 0: 206 | logger.info(f"Epoch {epoch+1}/{epochs}, Batch {i}/{len(dataset)//batch_size}, " 207 | f"Loss: {total_loss/(i+1):.4f}, " 208 | f"Avg Reward: {total_reward/(i+1):.4f}") 209 | 210 | # Save the fine-tuned model 211 | ensemble_model.save_pretrained(output_dir) 212 | tokenizer.save_pretrained(output_dir) 213 | 214 | logger.info(f"Ensemble RL training complete. Model saved to {output_dir}") 215 | return ensemble_model, tokenizer -------------------------------------------------------------------------------- /utils/graph_based_rl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import numpy as np 6 | import logging 7 | from transformers import AutoModelForCausalLM, AutoTokenizer 8 | from torch_geometric.nn import GCNConv 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | class GraphRLModel(nn.Module): 13 | """ 14 | Graph-based Reinforcement Learning Model 15 | แสดงความสัมพันธ์ระหว่างข้อความด้วยโครงสร้างกราฟ 16 | """ 17 | def __init__(self, model_path, vocab_size, device='cuda'): 18 | super(GraphRLModel, self).__init__() 19 | self.device = device 20 | self.model = AutoModelForCausalLM.from_pretrained(model_path).to(device) 21 | self.vocab_size = vocab_size 22 | 23 | # Graph Convolutional Network layers 24 | self.conv1 = GCNConv(768, 256) 25 | self.conv2 = GCNConv(256, 128) 26 | 27 | # Fully connected layer for final predictions 28 | self.fc = nn.Linear(128, vocab_size) 29 | 30 | def forward(self, input_ids, attention_mask=None, edge_index=None): 31 | outputs = self.model(input_ids=input_ids, attention_mask=attention_mask) 32 | hidden_states = outputs.hidden_states[-1] 33 | 34 | # Apply GCN layers 35 | x = self.conv1(hidden_states, edge_index) 36 | x = F.relu(x) 37 | x = self.conv2(x, edge_index) 38 | 39 | # Apply final fully connected layer 40 | logits = self.fc(x[:, -1, :]) 41 | 42 | return logits 43 | 44 | def generate(self, input_ids, attention_mask=None, edge_index=None, max_length=30, **kwargs): 45 | current_input_ids = input_ids 46 | current_attention_mask = attention_mask 47 | 48 | for _ in range(max_length): 49 | next_token_logits = self.forward(current_input_ids, current_attention_mask, edge_index) 50 | next_token = torch.multinomial(F.softmax(next_token_logits, dim=-1), num_samples=1) 51 | current_input_ids = torch.cat([current_input_ids, next_token], dim=1) 52 | if current_attention_mask is not None: 53 | current_attention_mask = torch.cat([current_attention_mask, torch.ones_like(next_token)], dim=1) 54 | 55 | return current_input_ids 56 | 57 | def train_graph_based_rl( 58 | model_path, 59 | dataset, 60 | output_dir, 61 | edge_index, 62 | batch_size=4, 63 | epochs=1, 64 | lr=1e-5 65 | ): 66 | """ 67 | Train a Graph-based Reinforcement Learning model 68 | 69 | Args: 70 | model_path: Path to the pre-trained model 71 | dataset: Dataset for training 72 | output_dir: Directory to save the model 73 | edge_index: Edge index tensor for the graph structure 74 | batch_size: Batch size for training 75 | epochs: Number of epochs for training 76 | lr: Learning rate 77 | 78 | Returns: 79 | Trained model 80 | """ 81 | os.makedirs(output_dir, exist_ok=True) 82 | device = 
torch.device("cuda" if torch.cuda.is_available() else "cpu") 83 | 84 | tokenizer = AutoTokenizer.from_pretrained(model_path) 85 | vocab_size = tokenizer.vocab_size 86 | 87 | model = GraphRLModel(model_path, vocab_size, device).to(device) 88 | 89 | optimizer = torch.optim.Adam(model.parameters(), lr=lr) 90 | 91 | logger.info("Starting Graph-based RL training...") 92 | 93 | for epoch in range(epochs): 94 | total_reward = 0 95 | total_loss = 0 96 | 97 | for i, batch in enumerate(dataset): 98 | if i >= len(dataset) // batch_size: 99 | break 100 | 101 | inputs = tokenizer(batch["text"], return_tensors="pt", padding=True, truncation=True).to(device) 102 | 103 | generated_ids = model.generate(inputs.input_ids, inputs.attention_mask, edge_index, max_length=10) 104 | generated_part = generated_ids[:, inputs.input_ids.shape[1]:] 105 | 106 | # Calculate rewards 107 | rewards = F.softmax(model(inputs.input_ids, inputs.attention_mask, edge_index), dim=-1).gather(1, generated_part[:, 0].unsqueeze(1)) 108 | 109 | # Policy gradient loss (maximize reward) 110 | loss = -torch.mean(rewards) 111 | 112 | optimizer.zero_grad() 113 | loss.backward() 114 | optimizer.step() 115 | 116 | total_reward += rewards.mean().item() 117 | total_loss += loss.item() 118 | 119 | if i % 10 == 0: 120 | logger.info(f"Epoch {epoch+1}/{epochs}, Batch {i}/{len(dataset)//batch_size}, " 121 | f"Loss: {total_loss/(i+1):.4f}, " 122 | f"Avg Reward: {total_reward/(i+1):.4f}") 123 | 124 | model.save_pretrained(output_dir) 125 | tokenizer.save_pretrained(output_dir) 126 | 127 | logger.info(f"Graph-based RL training complete. Model saved to {output_dir}") 128 | return model, tokenizer -------------------------------------------------------------------------------- /utils/her_rl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import numpy as np 6 | import logging 7 | from transformers import AutoModelForCausalLM, AutoTokenizer 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | class HERPolicy(nn.Module): 12 | def __init__(self, model_path, vocab_size, device='cuda'): 13 | super(HERPolicy, self).__init__() 14 | self.device = device 15 | self.model = AutoModelForCausalLM.from_pretrained(model_path).to(device) 16 | self.vocab_size = vocab_size 17 | 18 | def forward(self, input_ids, attention_mask=None): 19 | outputs = self.model(input_ids=input_ids, attention_mask=attention_mask) 20 | return outputs.logits 21 | 22 | def train_her_rl( 23 | model_path, 24 | dataset, 25 | output_dir, 26 | batch_size=4, 27 | epochs=1, 28 | lr=1e-5, 29 | her_k=4 30 | ): 31 | os.makedirs(output_dir, exist_ok=True) 32 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 33 | 34 | tokenizer = AutoTokenizer.from_pretrained(model_path) 35 | vocab_size = tokenizer.vocab_size 36 | 37 | policy = HERPolicy(model_path, vocab_size, device).to(device) 38 | 39 | optimizer = torch.optim.Adam(policy.parameters(), lr=lr) 40 | 41 | logger.info("Starting Hindsight Experience Replay (HER) RL training...") 42 | 43 | replay_buffer = [] 44 | 45 | for epoch in range(epochs): 46 | total_reward = 0 47 | total_loss = 0 48 | 49 | for i, batch in enumerate(dataset): 50 | if i >= len(dataset) // batch_size: 51 | break 52 | 53 | inputs = tokenizer(batch["text"], return_tensors="pt", padding=True, truncation=True).to(device) 54 | 55 | logits = policy(inputs.input_ids, inputs.attention_mask) 56 | action = torch.argmax(logits, dim=-1) 57 | 
rewards = F.softmax(logits, dim=-1).gather(1, action.unsqueeze(1)) 58 | 59 | replay_buffer.append((inputs.input_ids, inputs.attention_mask, action, rewards)) 60 | 61 | if len(replay_buffer) > her_k: 62 | replay_buffer.pop(0) 63 | 64 | # HER: Replay with different goals 65 | for replay in replay_buffer: 66 | replay_inputs, replay_mask, replay_action, replay_reward = replay 67 | new_goal = torch.randint(0, vocab_size, replay_action.shape).to(device) 68 | new_inputs = torch.where(replay_inputs == replay_action, new_goal, replay_inputs) 69 | new_logits = policy(new_inputs, replay_mask) 70 | new_rewards = F.softmax(new_logits, dim=-1).gather(1, new_goal.unsqueeze(1)) 71 | total_rewards = (replay_reward + new_rewards) / 2 72 | 73 | loss = -torch.mean(total_rewards) 74 | 75 | optimizer.zero_grad() 76 | loss.backward() 77 | optimizer.step() 78 | 79 | total_reward += total_rewards.mean().item() 80 | total_loss += loss.item() 81 | 82 | if i % 10 == 0: 83 | logger.info(f"Epoch {epoch+1}/{epochs}, Batch {i}/{len(dataset)//batch_size}, " 84 | f"Loss: {total_loss/(i+1):.4f}, " 85 | f"Avg Reward: {total_reward/(i+1):.4f}") 86 | 87 | policy.save_pretrained(output_dir) 88 | tokenizer.save_pretrained(output_dir) 89 | 90 | logger.info(f"Hindsight Experience Replay (HER) RL training complete. Model saved to {output_dir}") 91 | return policy, tokenizer -------------------------------------------------------------------------------- /utils/hierarchical_rl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import numpy as np 6 | import logging 7 | from transformers import AutoModelForCausalLM, AutoTokenizer 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | class HighLevelPolicy(nn.Module): 12 | def __init__(self, model_path, vocab_size, device='cuda'): 13 | super(HighLevelPolicy, self).__init__() 14 | self.device = device 15 | self.model = AutoModelForCausalLM.from_pretrained(model_path).to(device) 16 | self.vocab_size = vocab_size 17 | 18 | def forward(self, input_ids, attention_mask=None): 19 | outputs = self.model(input_ids=input_ids, attention_mask=attention_mask) 20 | return outputs.logits 21 | 22 | class LowLevelPolicy(nn.Module): 23 | def __init__(self, model_path, vocab_size, device='cuda'): 24 | super(LowLevelPolicy, self).__init__() 25 | self.device = device 26 | self.model = AutoModelForCausalLM.from_pretrained(model_path).to(device) 27 | self.vocab_size = vocab_size 28 | 29 | def forward(self, input_ids, attention_mask=None): 30 | outputs = self.model(input_ids=input_ids, attention_mask=attention_mask) 31 | return outputs.logits 32 | 33 | def train_hierarchical_rl( 34 | high_level_model_path, 35 | low_level_model_path, 36 | dataset, 37 | output_dir, 38 | batch_size=4, 39 | epochs=1, 40 | lr=1e-5 41 | ): 42 | os.makedirs(output_dir, exist_ok=True) 43 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 44 | 45 | tokenizer = AutoTokenizer.from_pretrained(high_level_model_path) 46 | vocab_size = tokenizer.vocab_size 47 | 48 | high_level_policy = HighLevelPolicy(high_level_model_path, vocab_size, device).to(device) 49 | low_level_policy = LowLevelPolicy(low_level_model_path, vocab_size, device).to(device) 50 | 51 | high_level_optimizer = torch.optim.Adam(high_level_policy.parameters(), lr=lr) 52 | low_level_optimizer = torch.optim.Adam(low_level_policy.parameters(), lr=lr) 53 | 54 | logger.info("Starting Hierarchical RL training...") 55 | 56 | for 
epoch in range(epochs): 57 | total_reward = 0 58 | total_loss = 0 59 | 60 | for i, batch in enumerate(dataset): 61 | if i >= len(dataset) // batch_size: 62 | break 63 | 64 | inputs = tokenizer(batch["text"], return_tensors="pt", padding=True, truncation=True).to(device) 65 | 66 | high_level_logits = high_level_policy(inputs.input_ids, inputs.attention_mask) 67 | high_level_action = torch.argmax(high_level_logits, dim=-1) 68 | 69 | low_level_inputs = torch.cat([inputs.input_ids, high_level_action.unsqueeze(1)], dim=1) 70 | low_level_mask = torch.cat([inputs.attention_mask, torch.ones_like(high_level_action).unsqueeze(1)], dim=1) 71 | low_level_logits = low_level_policy(low_level_inputs, low_level_mask) 72 | low_level_action = torch.argmax(low_level_logits, dim=-1) 73 | 74 | rewards = F.softmax(low_level_logits, dim=-1).gather(1, low_level_action.unsqueeze(1)) 75 | loss = -torch.mean(rewards) 76 | 77 | high_level_optimizer.zero_grad() 78 | low_level_optimizer.zero_grad() 79 | loss.backward() 80 | high_level_optimizer.step() 81 | low_level_optimizer.step() 82 | 83 | total_reward += rewards.mean().item() 84 | total_loss += loss.item() 85 | 86 | if i % 10 == 0: 87 | logger.info(f"Epoch {epoch+1}/{epochs}, Batch {i}/{len(dataset)//batch_size}, " 88 | f"Loss: {total_loss/(i+1):.4f}, " 89 | f"Avg Reward: {total_reward/(i+1):.4f}") 90 | 91 | high_level_policy.save_pretrained(os.path.join(output_dir, "high_level")) 92 | low_level_policy.save_pretrained(os.path.join(output_dir, "low_level")) 93 | tokenizer.save_pretrained(output_dir) 94 | 95 | logger.info(f"Hierarchical RL training complete. Model saved to {output_dir}") 96 | return high_level_policy, low_level_policy, tokenizer -------------------------------------------------------------------------------- /utils/information_exploration.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from transformers import AutoModelForCausalLM, AutoTokenizer 5 | import logging 6 | import numpy as np 7 | from scipy.stats import entropy 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | class InformationExplorationModel(nn.Module): 12 | """Model that uses information-theoretic measures for exploration""" 13 | def __init__(self, base_model): 14 | super().__init__() 15 | self.base_model = base_model 16 | self.hidden_size = base_model.config.hidden_size 17 | 18 | # Value head 19 | self.value_head = nn.Linear(self.hidden_size, 1) 20 | 21 | # Uncertainty estimator 22 | self.uncertainty_head = nn.Sequential( 23 | nn.Linear(self.hidden_size, 128), 24 | nn.ReLU(), 25 | nn.Linear(128, 1) 26 | ) 27 | 28 | def forward(self, input_ids, attention_mask=None): 29 | outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask) 30 | hidden_states = outputs.last_hidden_state 31 | logits = outputs.logits 32 | 33 | # Compute values 34 | values = self.value_head(hidden_states) 35 | 36 | # Compute uncertainty estimates 37 | uncertainty = self.uncertainty_head(hidden_states) 38 | 39 | return { 40 | 'logits': logits, 41 | 'values': values, 42 | 'hidden_states': hidden_states, 43 | 'uncertainty': uncertainty 44 | } 45 | 46 | def compute_entropy_bonus(self, logits): 47 | """Compute entropy of token distribution as exploration bonus""" 48 | probs = F.softmax(logits, dim=-1) 49 | log_probs = F.log_softmax(logits, dim=-1) 50 | entropy = -(probs * log_probs).sum(dim=-1) 51 | return entropy 52 | 53 | def compute_information_gain(model, tokenizer, input_ids, 
num_samples=10): 54 | """Estimate information gain of different actions""" 55 | batch_size, seq_len = input_ids.shape 56 | device = input_ids.device 57 | vocab_size = model.base_model.config.vocab_size 58 | 59 | # Get current hidden states 60 | with torch.no_grad(): 61 | outputs = model.base_model(input_ids) 62 | current_hidden = outputs.last_hidden_state[:, -1, :] # Last token 63 | 64 | # Sample possible next tokens 65 | with torch.no_grad(): 66 | logits = outputs.logits[:, -1, :] 67 | probs = F.softmax(logits, dim=-1) 68 | 69 | # Sample top-k tokens for efficiency 70 | top_k_values, top_k_indices = torch.topk(probs, k=min(num_samples, vocab_size), dim=-1) 71 | 72 | # Compute expected information gain for each token 73 | info_gains = [] 74 | 75 | for batch_idx in range(batch_size): 76 | token_gains = [] 77 | 78 | for i in range(top_k_indices.size(1)): 79 | token = top_k_indices[batch_idx, i].unsqueeze(0).unsqueeze(0) 80 | 81 | # Create next input with this token 82 | next_input = torch.cat([input_ids[batch_idx:batch_idx+1], token], dim=1) 83 | 84 | # Get prediction for next step 85 | with torch.no_grad(): 86 | next_outputs = model.base_model(next_input) 87 | next_hidden = next_outputs.last_hidden_state[:, -1, :] 88 | 89 | # Compute KL divergence as information gain 90 | # Simplified: using L2 distance between hidden states as proxy 91 | info_gain = torch.sum((next_hidden - current_hidden[batch_idx:batch_idx+1]) ** 2) 92 | 93 | # Weight by probability 94 | weighted_gain = top_k_values[batch_idx, i].item() * info_gain.item() 95 | token_gains.append((token.item(), weighted_gain)) 96 | 97 | # Sort by information gain 98 | token_gains.sort(key=lambda x: x[1], reverse=True) 99 | info_gains.append(token_gains) 100 | 101 | return info_gains 102 | 103 | def train_information_exploration(base_model_path, train_dataset, output_dir, reward_model_path=None, 104 | batch_size=4, epochs=1, lr=1e-5, explore_coef=0.1): 105 | """Train a model with information-theoretic exploration.""" 106 | logger.info("Initializing Information-theoretic Exploration") 107 | 108 | # Load tokenizer and model 109 | tokenizer = AutoTokenizer.from_pretrained(base_model_path) 110 | base_model = AutoModelForCausalLM.from_pretrained(base_model_path) 111 | 112 | # Create model 113 | model = InformationExplorationModel(base_model) 114 | 115 | # Load reward model if provided 116 | if reward_model_path: 117 | logger.info(f"Loading reward model from {reward_model_path}") 118 | reward_model = AutoModelForCausalLM.from_pretrained(reward_model_path) 119 | else: 120 | logger.info("No reward model provided, will use entropy-based rewards") 121 | reward_model = None 122 | 123 | # Setup optimizer 124 | optimizer = torch.optim.Adam(model.parameters(), lr=lr) 125 | 126 | # Training loop 127 | logger.info(f"Starting training for {epochs} epochs") 128 | model.train() 129 | 130 | for epoch in range(epochs): 131 | total_policy_loss = 0 132 | total_value_loss = 0 133 | total_entropy_bonus = 0 134 | num_batches = 0 135 | 136 | for i in range(0, len(train_dataset), batch_size): 137 | batch = train_dataset[i:i+batch_size] 138 | 139 | # Tokenize inputs 140 | inputs = tokenizer(batch['text'], return_tensors="pt", padding=True, truncation=True) 141 | inputs = {k: v.to(base_model.device) for k, v in inputs.items()} 142 | 143 | # Forward pass 144 | outputs = model(inputs['input_ids'], inputs['attention_mask']) 145 | logits = outputs['logits'] 146 | values = outputs['values'] 147 | uncertainty = outputs['uncertainty'] 148 | 149 | # Generate next tokens based 
on information gain 150 | info_gains = compute_information_gain(model, tokenizer, inputs['input_ids']) 151 | 152 | # Create targets based on information gain 153 | # We'll use the tokens with highest information gain as targets 154 | targets = [] 155 | for batch_idx in range(len(info_gains)): 156 | if info_gains[batch_idx]: 157 | targets.append(info_gains[batch_idx][0][0]) # First token in sorted list 158 | else: 159 | # Fallback if no info gain computed 160 | targets.append(tokenizer.eos_token_id) 161 | 162 | targets = torch.tensor(targets, device=base_model.device).unsqueeze(1) 163 | 164 | # Compute entropy bonus 165 | entropy_bonus = model.compute_entropy_bonus(logits[:, -1, :]) 166 | 167 | # Compute intrinsic rewards based on information gain and uncertainty 168 | with torch.no_grad(): 169 | intrinsic_rewards = uncertainty[:, -1, 0] + explore_coef * entropy_bonus 170 | 171 | # Get extrinsic rewards if reward model available 172 | if reward_model: 173 | with torch.no_grad(): 174 | reward_outputs = reward_model(inputs['input_ids'], attention_mask=inputs['attention_mask']) 175 | extrinsic_rewards = reward_outputs.logits.mean(dim=-1) 176 | 177 | # Combine rewards 178 | combined_rewards = extrinsic_rewards + explore_coef * intrinsic_rewards 179 | else: 180 | # Use only intrinsic rewards 181 | combined_rewards = intrinsic_rewards 182 | 183 | # Policy loss: maximize reward by picking high information gain tokens 184 | policy_logits = logits[:, -1, :] 185 | policy_loss = F.cross_entropy(policy_logits, targets.squeeze()) 186 | 187 | # Value loss: predict combined rewards 188 | value_loss = F.mse_loss(values[:, -1, 0], combined_rewards) 189 | 190 | # Combined loss 191 | loss = policy_loss + 0.5 * value_loss 192 | 193 | # Update model 194 | optimizer.zero_grad() 195 | loss.backward() 196 | optimizer.step() 197 | 198 | total_policy_loss += policy_loss.item() 199 | total_value_loss += value_loss.item() 200 | total_entropy_bonus += entropy_bonus.mean().item() 201 | num_batches += 1 202 | 203 | if num_batches % 10 == 0: 204 | logger.info(f"Epoch {epoch+1}, Batch {num_batches}, " 205 | f"Policy Loss: {policy_loss.item():.4f}, " 206 | f"Value Loss: {value_loss.item():.4f}, " 207 | f"Entropy Bonus: {entropy_bonus.mean().item():.4f}") 208 | 209 | avg_policy_loss = total_policy_loss / num_batches 210 | avg_value_loss = total_value_loss / num_batches 211 | avg_entropy_bonus = total_entropy_bonus / num_batches 212 | 213 | logger.info(f"Epoch {epoch+1} completed. " 214 | f"Average Policy Loss: {avg_policy_loss:.4f}, " 215 | f"Average Value Loss: {avg_value_loss:.4f}, " 216 | f"Average Entropy Bonus: {avg_entropy_bonus:.4f}") 217 | 218 | # Save the model 219 | logger.info(f"Training completed. 
Saving model to {output_dir}") 220 | model.base_model.save_pretrained(output_dir) 221 | tokenizer.save_pretrained(output_dir) 222 | 223 | # Also save the exploration-specific components 224 | torch.save({ 225 | 'value_head': model.value_head.state_dict(), 226 | 'uncertainty_head': model.uncertainty_head.state_dict() 227 | }, f"{output_dir}/exploration_components.pt") 228 | 229 | return model.base_model, tokenizer 230 | -------------------------------------------------------------------------------- /utils/intrinsic_motivation.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from transformers import AutoModelForCausalLM, AutoTokenizer 5 | import logging 6 | import numpy as np 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | class EmpowermentModel(nn.Module): 11 | """Model that computes empowerment-based intrinsic motivation""" 12 | def __init__(self, base_model, hidden_size=128): 13 | super().__init__() 14 | self.base_model = base_model 15 | self.model_hidden_size = base_model.config.hidden_size 16 | self.hidden_size = hidden_size 17 | 18 | # Forward model: predicts next state given current state and action 19 | self.forward_model = nn.Sequential( 20 | nn.Linear(self.model_hidden_size + self.model_hidden_size, hidden_size), 21 | nn.ReLU(), 22 | nn.Linear(hidden_size, self.model_hidden_size) 23 | ) 24 | 25 | # Inverse model: predicts action given current and next state 26 | self.inverse_model = nn.Sequential( 27 | nn.Linear(self.model_hidden_size + self.model_hidden_size, hidden_size), 28 | nn.ReLU(), 29 | nn.Linear(hidden_size, base_model.config.vocab_size) 30 | ) 31 | 32 | def forward(self, input_ids, attention_mask=None): 33 | outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask) 34 | hidden_states = outputs.last_hidden_state 35 | logits = outputs.logits 36 | 37 | return { 38 | 'logits': logits, 39 | 'hidden_states': hidden_states 40 | } 41 | 42 | def compute_empowerment(self, current_state, actions, next_states): 43 | """Compute empowerment as mutual information between actions and next states""" 44 | batch_size = current_state.size(0) 45 | num_actions = actions.size(1) 46 | 47 | # Expand current state for all actions 48 | expanded_current = current_state.unsqueeze(1).expand(-1, num_actions, -1) 49 | expanded_current = expanded_current.reshape(batch_size * num_actions, -1) 50 | 51 | # Reshape actions and next states 52 | flattened_actions = actions.reshape(batch_size * num_actions, -1) 53 | flattened_next_states = next_states.reshape(batch_size * num_actions, -1) 54 | 55 | # Forward prediction: p(next_state | current_state, action) 56 | predicted_next_states = self.forward_model( 57 | torch.cat([expanded_current, flattened_actions], dim=1) 58 | ) 59 | forward_loss = F.mse_loss(predicted_next_states, flattened_next_states) 60 | 61 | # Inverse prediction: p(action | current_state, next_state) 62 | predicted_actions = self.inverse_model( 63 | torch.cat([expanded_current, flattened_next_states], dim=1) 64 | ) 65 | inverse_loss = F.cross_entropy( 66 | predicted_actions, 67 | flattened_actions.argmax(dim=1) if flattened_actions.dim() > 1 else flattened_actions 68 | ) 69 | 70 | # Empowerment is approximated by the negative of the inverse model loss 71 | # The better the inverse model can predict the action, the higher the empowerment 72 | empowerment = -inverse_loss 73 | 74 | return empowerment, forward_loss, inverse_loss 75 | 76 | def 
train_empowerment(base_model_path, train_dataset, output_dir, reward_model_path=None, 77 | batch_size=4, epochs=1, lr=1e-5, empowerment_coef=0.01): 78 | """Train a model with intrinsic motivation through empowerment.""" 79 | logger.info("Initializing Intrinsic Motivation through Empowerment") 80 | 81 | # Load tokenizer and model 82 | tokenizer = AutoTokenizer.from_pretrained(base_model_path) 83 | base_model = AutoModelForCausalLM.from_pretrained(base_model_path) 84 | 85 | # Create empowerment model 86 | model = EmpowermentModel(base_model) 87 | 88 | # Load reward model if provided 89 | if reward_model_path: 90 | logger.info(f"Loading reward model from {reward_model_path}") 91 | reward_model = AutoModelForCausalLM.from_pretrained(reward_model_path) 92 | else: 93 | logger.info("No reward model provided, will use only intrinsic rewards") 94 | reward_model = None 95 | 96 | # Setup optimizer 97 | optimizer = torch.optim.Adam(model.parameters(), lr=lr) 98 | 99 | # Training loop 100 | logger.info(f"Starting training for {epochs} epochs") 101 | model.train() 102 | 103 | for epoch in range(epochs): 104 | total_policy_loss = 0 105 | total_empowerment = 0 106 | total_forward_loss = 0 107 | total_inverse_loss = 0 108 | num_batches = 0 109 | 110 | for i in range(0, len(train_dataset), batch_size): 111 | batch = train_dataset[i:i+batch_size] 112 | 113 | # Tokenize inputs 114 | inputs = tokenizer(batch['text'], return_tensors="pt", padding=True, truncation=True) 115 | inputs = {k: v.to(base_model.device) for k, v in inputs.items()} 116 | 117 | # Get model outputs 118 | outputs = model(inputs['input_ids'], inputs['attention_mask']) 119 | logits = outputs['logits'] 120 | hidden_states = outputs['hidden_states'] 121 | 122 | # Generate next tokens 123 | with torch.no_grad(): 124 | next_token_probs = F.softmax(logits[:, -1, :], dim=-1) 125 | next_tokens = torch.multinomial(next_token_probs, 5) # Sample 5 possible next tokens 126 | 127 | # Get embeddings for actions 128 | action_embeddings = base_model.get_input_embeddings()(next_tokens) 129 | 130 | # Generate next states for each action 131 | next_states = [] 132 | for j in range(next_tokens.size(1)): 133 | next_input = torch.cat([inputs['input_ids'], next_tokens[:, j:j+1]], dim=1) 134 | next_output = model(next_input) 135 | next_state = next_output['hidden_states'][:, -1, :].unsqueeze(1) 136 | next_states.append(next_state) 137 | 138 | next_states = torch.cat(next_states, dim=1) 139 | 140 | # Compute empowerment and model losses 141 | empowerment, forward_loss, inverse_loss = model.compute_empowerment( 142 | hidden_states[:, -1, :], 143 | action_embeddings, 144 | next_states 145 | ) 146 | 147 | # Get extrinsic rewards if reward model available 148 | if reward_model: 149 | with torch.no_grad(): 150 | next_input = torch.cat([inputs['input_ids'], next_tokens[:, 0:1]], dim=1) # Use first sampled token 151 | reward_outputs = reward_model(next_input) 152 | extrinsic_rewards = reward_outputs.logits.mean(dim=-1) 153 | else: 154 | extrinsic_rewards = torch.zeros(batch_size, device=base_model.device) 155 | 156 | # Combined rewards: extrinsic + empowerment-based intrinsic 157 | combined_rewards = extrinsic_rewards + empowerment_coef * empowerment.detach() 158 | 159 | # Policy loss: encourage actions that maximize combined reward 160 | policy_logits = logits[:, -1, :] 161 | policy_log_probs = F.log_softmax(policy_logits, dim=-1) 162 | 163 | # Use the rewards to guide policy improvement 164 | policy_loss = -(policy_log_probs * combined_rewards.unsqueeze(1)).mean() 165 | 
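        # Editorial note: the total objective formed below is
        #   loss = policy_loss + forward_loss + inverse_loss
        # where policy_loss pushes the policy toward actions with a high combined
        # (extrinsic + empowerment-weighted intrinsic) reward, forward_loss trains the
        # dynamics model p(next_state | state, action), and inverse_loss trains
        # p(action | state, next_state), whose negative value is used as the empowerment
        # signal. Down-weighting the auxiliary terms (e.g. 0.5 * forward_loss) is a common
        # variant, but this file adds them with equal weight.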
166 | # Total loss 167 | loss = policy_loss + forward_loss + inverse_loss 168 | 169 | # Update model 170 | optimizer.zero_grad() 171 | loss.backward() 172 | optimizer.step() 173 | 174 | total_policy_loss += policy_loss.item() 175 | total_empowerment += empowerment.mean().item() 176 | total_forward_loss += forward_loss.item() 177 | total_inverse_loss += inverse_loss.item() 178 | num_batches += 1 179 | 180 | if num_batches % 10 == 0: 181 | logger.info(f"Epoch {epoch+1}, Batch {num_batches}, " 182 | f"Policy Loss: {policy_loss.item():.4f}, " 183 | f"Empowerment: {empowerment.mean().item():.4f}, " 184 | f"Forward Loss: {forward_loss.item():.4f}, " 185 | f"Inverse Loss: {inverse_loss.item():.4f}") 186 | 187 | avg_policy_loss = total_policy_loss / num_batches 188 | avg_empowerment = total_empowerment / num_batches 189 | avg_forward_loss = total_forward_loss / num_batches 190 | avg_inverse_loss = total_inverse_loss / num_batches 191 | 192 | logger.info(f"Epoch {epoch+1} completed. " 193 | f"Average Policy Loss: {avg_policy_loss:.4f}, " 194 | f"Average Empowerment: {avg_empowerment:.4f}, " 195 | f"Average Forward Loss: {avg_forward_loss:.4f}, " 196 | f"Average Inverse Loss: {avg_inverse_loss:.4f}") 197 | 198 | # Save the model 199 | logger.info(f"Training completed. Saving model to {output_dir}") 200 | model.base_model.save_pretrained(output_dir) 201 | tokenizer.save_pretrained(output_dir) 202 | 203 | return model.base_model, tokenizer 204 | -------------------------------------------------------------------------------- /utils/irl_training.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import gym 4 | import numpy as np 5 | from gym import spaces 6 | from stable_baselines3 import PPO 7 | from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline 8 | 9 | class TextGenerationEnv(gym.Env): 10 | def __init__(self, model, tokenizer, reward_model, prompts): 11 | super().__init__() 12 | self.model = model 13 | self.tokenizer = tokenizer 14 | self.reward_model = reward_model 15 | self.prompts = prompts 16 | self.current_prompt_idx = 0 17 | 18 | # Define action and observation space 19 | self.action_space = spaces.Box(low=-1, high=1, shape=(10,), dtype=np.float32) 20 | self.observation_space = spaces.Box(low=0, high=1, shape=(128,), dtype=np.float32) 21 | 22 | def reset(self): 23 | self.current_prompt = self.prompts[self.current_prompt_idx] 24 | self.current_prompt_idx = (self.current_prompt_idx + 1) % len(self.prompts) 25 | 26 | encoded = self.tokenizer(self.current_prompt, return_tensors="pt").to(self.model.device) 27 | self.current_input_ids = encoded["input_ids"] 28 | 29 | return np.random.rand(128).astype(np.float32) 30 | 31 | def step(self, action): 32 | with torch.no_grad(): 33 | output = self.model.generate( 34 | self.current_input_ids, 35 | max_length=self.current_input_ids.shape[1] + 20, 36 | temperature=0.7, 37 | do_sample=True 38 | ) 39 | 40 | generated_text = self.tokenizer.decode(output[0], skip_special_tokens=True) 41 | generated_text = generated_text[len(self.current_prompt):] 42 | 43 | # Get reward from reward model 44 | with torch.no_grad(): 45 | inputs = self.tokenizer(generated_text, return_tensors="pt").to(self.reward_model.device) 46 | reward_score = self.reward_model(**inputs).logits.item() 47 | 48 | observation = np.random.rand(128).astype(np.float32) 49 | done = True 50 | 51 | return observation, reward_score, done, {} 52 | 53 | def train_irl(dpo_model_path, train_dataset, output_dir, 
reward_model_path, batch_size):
54 |     """
55 |     Train the model with Inverse Reinforcement Learning (IRL).
56 |     """
57 |     os.makedirs(output_dir, exist_ok=True)
58 | 
59 |     # Load models
60 |     tokenizer = AutoTokenizer.from_pretrained(dpo_model_path)
61 |     model = AutoModelForCausalLM.from_pretrained(dpo_model_path, torch_dtype=torch.bfloat16, device_map="auto")
62 |     reward_pipeline = pipeline("text-classification", model=reward_model_path, tokenizer=tokenizer, device=0)
63 | 
64 |     # Create prompts for IRL
65 |     prompts = [example["prompt"] for example in train_dataset[:min(100, len(train_dataset))]]
66 | 
67 |     # Create IRL environment
68 |     env = TextGenerationEnv(model, tokenizer, reward_pipeline.model, prompts)
69 | 
70 |     # Train with PPO as the IRL algorithm
71 |     irl_model = PPO("MlpPolicy", env, verbose=1)
72 |     irl_model.learn(total_timesteps=100)
73 |     irl_model.save(os.path.join(output_dir, "irl_model"))
74 | 
75 |     return irl_model
--------------------------------------------------------------------------------
/utils/knowledge_grounded_rl.py:
--------------------------------------------------------------------------------
1 | import os
2 | import torch
3 | import torch.nn as nn
4 | import torch.nn.functional as F
5 | import numpy as np
6 | import logging
7 | from transformers import AutoModelForCausalLM, AutoTokenizer
8 | 
9 | logger = logging.getLogger(__name__)
10 | 
11 | class KnowledgeGroundedRLModel(nn.Module):
12 |     """
13 |     Knowledge-Grounded Reinforcement Learning Model
14 |     Integrates an external knowledge base into the learning process.
15 |     """
16 |     def __init__(self, model_path, vocab_size, knowledge_base, device='cuda'):
17 |         super(KnowledgeGroundedRLModel, self).__init__()
18 |         self.device = device
19 |         self.model = AutoModelForCausalLM.from_pretrained(model_path).to(device)
20 |         self.vocab_size = vocab_size
21 |         self.knowledge_base = knowledge_base
22 | 
23 |     def forward(self, input_ids, attention_mask=None):
24 |         outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
25 |         return outputs.logits
26 | 
27 |     def generate(self, input_ids, attention_mask=None, max_length=30, **kwargs):
28 |         current_input_ids = input_ids
29 |         current_attention_mask = attention_mask
30 | 
31 |         for _ in range(max_length):
32 |             next_token_logits = self.forward(current_input_ids, current_attention_mask)[:, -1, :]  # last position only; torch.multinomial needs at most 2-D probabilities
33 |             next_token = torch.multinomial(F.softmax(next_token_logits, dim=-1), num_samples=1)
34 |             current_input_ids = torch.cat([current_input_ids, next_token], dim=1)
35 |             if current_attention_mask is not None:
36 |                 current_attention_mask = torch.cat([current_attention_mask, torch.ones_like(next_token)], dim=1)
37 | 
38 |         return current_input_ids
39 | 
40 | def train_knowledge_grounded_rl(
41 |     model_path,
42 |     dataset,
43 |     output_dir,
44 |     knowledge_base,
45 |     batch_size=4,
46 |     epochs=1,
47 |     lr=1e-5
48 | ):
49 |     """
50 |     Train a Knowledge-Grounded Reinforcement Learning model
51 | 
52 |     Args:
53 |         model_path: Path to the pre-trained model
54 |         dataset: Dataset for training
55 |         output_dir: Directory to save the model
56 |         knowledge_base: External knowledge base
57 |         batch_size: Batch size for training
58 |         epochs: Number of epochs for training
59 |         lr: Learning rate
60 | 
61 |     Returns:
62 |         Trained model
63 |     """
64 |     os.makedirs(output_dir, exist_ok=True)
65 |     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
66 | 
67 |     tokenizer = AutoTokenizer.from_pretrained(model_path)
68 |     vocab_size = tokenizer.vocab_size
69 | 
70 |     model = KnowledgeGroundedRLModel(model_path, vocab_size, knowledge_base,
device).to(device) 71 | 72 | optimizer = torch.optim.Adam(model.parameters(), lr=lr) 73 | 74 | logger.info("Starting Knowledge-Grounded RL training...") 75 | 76 | for epoch in range(epochs): 77 | total_reward = 0 78 | total_loss = 0 79 | 80 | for i, batch in enumerate(dataset): 81 | if i >= len(dataset) // batch_size: 82 | break 83 | 84 | inputs = tokenizer(batch["text"], return_tensors="pt", padding=True, truncation=True).to(device) 85 | 86 | generated_ids = model.generate(inputs.input_ids, inputs.attention_mask, max_length=10) 87 | generated_part = generated_ids[:, inputs.input_ids.shape[1]:] 88 | 89 | # Calculate rewards with knowledge grounding 90 | rewards = F.softmax(model(inputs.input_ids, inputs.attention_mask), dim=-1).gather(1, generated_part[:, 0].unsqueeze(1)) 91 | 92 | # Policy gradient loss (maximize reward) 93 | loss = -torch.mean(rewards) 94 | 95 | optimizer.zero_grad() 96 | loss.backward() 97 | optimizer.step() 98 | 99 | total_reward += rewards.mean().item() 100 | total_loss += loss.item() 101 | 102 | if i % 10 == 0: 103 | logger.info(f"Epoch {epoch+1}/{epochs}, Batch {i}/{len(dataset)//batch_size}, " 104 | f"Loss: {total_loss/(i+1):.4f}, " 105 | f"Avg Reward: {total_reward/(i+1):.4f}") 106 | 107 | model.save_pretrained(output_dir) 108 | tokenizer.save_pretrained(output_dir) 109 | 110 | logger.info(f"Knowledge-Grounded RL training complete. Model saved to {output_dir}") 111 | return model, tokenizer -------------------------------------------------------------------------------- /utils/meta_rl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import numpy as np 6 | import logging 7 | from transformers import AutoModelForCausalLM, AutoTokenizer 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | class MetaRLPolicy(nn.Module): 12 | def __init__(self, model_path, vocab_size, device='cuda'): 13 | super(MetaRLPolicy, self).__init__() 14 | self.device = device 15 | self.model = AutoModelForCausalLM.from_pretrained(model_path).to(device) 16 | self.vocab_size = vocab_size 17 | 18 | def forward(self, input_ids, attention_mask=None): 19 | outputs = self.model(input_ids=input_ids, attention_mask=attention_mask) 20 | return outputs.logits 21 | 22 | def train_meta_rl( 23 | model_path, 24 | dataset, 25 | output_dir, 26 | batch_size=4, 27 | epochs=1, 28 | lr=1e-5 29 | ): 30 | os.makedirs(output_dir, exist_ok=True) 31 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 32 | 33 | tokenizer = AutoTokenizer.from_pretrained(model_path) 34 | vocab_size = tokenizer.vocab_size 35 | 36 | policy = MetaRLPolicy(model_path, vocab_size, device).to(device) 37 | 38 | optimizer = torch.optim.Adam(policy.parameters(), lr=lr) 39 | 40 | logger.info("Starting Meta-Reinforcement Learning (Meta-RL) training...") 41 | 42 | for epoch in range(epochs): 43 | total_reward = 0 44 | total_loss = 0 45 | 46 | for i, batch in enumerate(dataset): 47 | if i >= len(dataset) // batch_size: 48 | break 49 | 50 | inputs = tokenizer(batch["text"], return_tensors="pt", padding=True, truncation=True).to(device) 51 | 52 | logits = policy(inputs.input_ids, inputs.attention_mask) 53 | action = torch.argmax(logits, dim=-1) 54 | rewards = F.softmax(logits, dim=-1).gather(1, action.unsqueeze(1)) 55 | 56 | loss = -torch.mean(rewards) 57 | 58 | optimizer.zero_grad() 59 | loss.backward() 60 | optimizer.step() 61 | 62 | # Meta-learning: Adapt to new tasks 63 | for task in range(5): # 
Assume 5 different tasks 64 | task_inputs = tokenizer(batch["text"], return_tensors="pt", padding=True, truncation=True).to(device) 65 | task_logits = policy(task_inputs.input_ids, task_inputs.attention_mask) 66 | task_action = torch.argmax(task_logits, dim=-1) 67 | task_rewards = F.softmax(task_logits, dim=-1).gather(1, task_action.unsqueeze(1)) 68 | 69 | task_loss = -torch.mean(task_rewards) 70 | 71 | optimizer.zero_grad() 72 | task_loss.backward() 73 | optimizer.step() 74 | 75 | total_reward += task_rewards.mean().item() 76 | total_loss += task_loss.item() 77 | 78 | if i % 10 == 0: 79 | logger.info(f"Epoch {epoch+1}/{epochs}, Batch {i}/{len(dataset)//batch_size}, " 80 | f"Loss: {total_loss/(i+1):.4f}, " 81 | f"Avg Reward: {total_reward/(i+1):.4f}") 82 | 83 | policy.save_pretrained(output_dir) 84 | tokenizer.save_pretrained(output_dir) 85 | 86 | logger.info(f"Meta-Reinforcement Learning (Meta-RL) training complete. Model saved to {output_dir}") 87 | return policy, tokenizer -------------------------------------------------------------------------------- /utils/meta_rl_task_decomposition.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import numpy as np 6 | import logging 7 | from transformers import AutoModelForCausalLM, AutoTokenizer 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | class TaskDecompositionRLModel(nn.Module): 12 | """ 13 | Meta-RL with Task Decomposition 14 | แบ่งงานใหญ่เป็นงานย่อยหลายๆ ชิ้น 15 | """ 16 | def __init__(self, model_path, vocab_size, device='cuda'): 17 | super(TaskDecompositionRLModel, self).__init__() 18 | self.device = device 19 | self.model = AutoModelForCausalLM.from_pretrained(model_path).to(device) 20 | self.vocab_size = vocab_size 21 | 22 | # Task decomposition network 23 | self.task_network = nn.Sequential( 24 | nn.Linear(768, 256), 25 | nn.ReLU(), 26 | nn.Linear(256, 128) 27 | ) 28 | 29 | def forward(self, input_ids, attention_mask=None): 30 | outputs = self.model(input_ids=input_ids, attention_mask=attention_mask) 31 | hidden_states = outputs.hidden_states[-1] 32 | 33 | # Apply task decomposition network 34 | task_features = self.task_network(hidden_states[:, -1, :]) 35 | 36 | # Final logits 37 | logits = F.linear(task_features, self.model.lm_head.weight) 38 | 39 | return logits 40 | 41 | def generate(self, input_ids, attention_mask=None, max_length=30, **kwargs): 42 | current_input_ids = input_ids 43 | current_attention_mask = attention_mask 44 | 45 | for _ in range(max_length): 46 | next_token_logits = self.forward(current_input_ids, current_attention_mask) 47 | next_token = torch.multinomial(F.softmax(next_token_logits, dim=-1), num_samples=1) 48 | current_input_ids = torch.cat([current_input_ids, next_token], dim=1) 49 | if current_attention_mask is not None: 50 | current_attention_mask = torch.cat([current_attention_mask, torch.ones_like(next_token)], dim=1) 51 | 52 | return current_input_ids 53 | 54 | def train_meta_rl_task_decomposition( 55 | model_path, 56 | dataset, 57 | output_dir, 58 | batch_size=4, 59 | epochs=1, 60 | lr=1e-5 61 | ): 62 | """ 63 | Train a Meta-RL with Task Decomposition model 64 | 65 | Args: 66 | model_path: Path to the pre-trained model 67 | dataset: Dataset for training 68 | output_dir: Directory to save the model 69 | batch_size: Batch size for training 70 | epochs: Number of epochs for training 71 | lr: Learning rate 72 | 73 | Returns: 74 | Trained model 75 | """ 76 | 
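# --- Added note (hedged): hidden-state access assumed by TaskDecompositionRLModel.forward ---
# The forward() above reads outputs.hidden_states[-1], but Hugging Face causal-LM models only
# return hidden states when output_hidden_states=True is passed (or enabled in the config),
# and F.linear(task_features, lm_head.weight) only type-checks when the task network's output
# width matches the LM head's input width. A minimal sketch of the assumed call, for
# reference only (not executed here):
#
#     outputs = self.model(input_ids=input_ids,
#                          attention_mask=attention_mask,
#                          output_hidden_states=True)
#     last_hidden = outputs.hidden_states[-1]            # (batch, seq_len, hidden_size)
#     task_features = self.task_network(last_hidden[:, -1, :])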
os.makedirs(output_dir, exist_ok=True) 77 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 78 | 79 | tokenizer = AutoTokenizer.from_pretrained(model_path) 80 | vocab_size = tokenizer.vocab_size 81 | 82 | model = TaskDecompositionRLModel(model_path, vocab_size, device).to(device) 83 | 84 | optimizer = torch.optim.Adam(model.parameters(), lr=lr) 85 | 86 | logger.info("Starting Meta-RL with Task Decomposition training...") 87 | 88 | for epoch in range(epochs): 89 | total_reward = 0 90 | total_loss = 0 91 | 92 | for i, batch in enumerate(dataset): 93 | if i >= len(dataset) // batch_size: 94 | break 95 | 96 | inputs = tokenizer(batch["text"], return_tensors="pt", padding=True, truncation=True).to(device) 97 | 98 | generated_ids = model.generate(inputs.input_ids, inputs.attention_mask, max_length=10) 99 | generated_part = generated_ids[:, inputs.input_ids.shape[1]:] 100 | 101 | # Calculate rewards 102 | rewards = F.softmax(model(inputs.input_ids, inputs.attention_mask), dim=-1).gather(1, generated_part[:, 0].unsqueeze(1)) 103 | 104 | # Policy gradient loss (maximize reward) 105 | loss = -torch.mean(rewards) 106 | 107 | optimizer.zero_grad() 108 | loss.backward() 109 | optimizer.step() 110 | 111 | total_reward += rewards.mean().item() 112 | total_loss += loss.item() 113 | 114 | if i % 10 == 0: 115 | logger.info(f"Epoch {epoch+1}/{epochs}, Batch {i}/{len(dataset)//batch_size}, " 116 | f"Loss: {total_loss/(i+1):.4f}, " 117 | f"Avg Reward: {total_reward/(i+1):.4f}") 118 | 119 | model.save_pretrained(output_dir) 120 | tokenizer.save_pretrained(output_dir) 121 | 122 | logger.info(f"Meta-RL with Task Decomposition training complete. Model saved to {output_dir}") 123 | return model, tokenizer -------------------------------------------------------------------------------- /utils/model_manager.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | from typing import Optional 4 | from transformers import AutoModel, AutoTokenizer 5 | from datasets import load_dataset 6 | from huggingface_hub import HfApi 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | class ModelManager: 11 | def __init__(self): 12 | self.pretrained_dir = "models/pretrained" 13 | self.finetuned_dir = "models/finetuned" 14 | self.datasets_dir = "datasets" 15 | self.api = HfApi() 16 | 17 | # Ensure directories exist 18 | os.makedirs(self.pretrained_dir, exist_ok=True) 19 | os.makedirs(self.finetuned_dir, exist_ok=True) 20 | os.makedirs(self.datasets_dir, exist_ok=True) 21 | 22 | def download_model(self, model_name: str, model_type: str = "pretrained") -> str: 23 | """ 24 | Download a model from Hugging Face Hub. 
25 | 26 | Args: 27 | model_name (str): Name of the model on Hugging Face Hub 28 | model_type (str): Either "pretrained" or "finetuned" 29 | 30 | Returns: 31 | str: Path to the downloaded model 32 | """ 33 | try: 34 | logger.info(f"Downloading model: {model_name}") 35 | 36 | # Choose directory based on model type 37 | target_dir = self.pretrained_dir if model_type == "pretrained" else self.finetuned_dir 38 | model_dir = os.path.join(target_dir, model_name.replace("/", "_")) 39 | 40 | # Download model and tokenizer 41 | model = AutoModel.from_pretrained(model_name) 42 | tokenizer = AutoTokenizer.from_pretrained(model_name) 43 | 44 | # Save locally 45 | model.save_pretrained(model_dir) 46 | tokenizer.save_pretrained(model_dir) 47 | 48 | logger.info(f"Model downloaded to: {model_dir}") 49 | return model_dir 50 | 51 | except Exception as e: 52 | logger.error(f"Error downloading model {model_name}: {str(e)}") 53 | raise 54 | 55 | def download_dataset(self, dataset_name: str, subset: Optional[str] = None) -> str: 56 | """ 57 | Download a dataset from Hugging Face Hub. 58 | 59 | Args: 60 | dataset_name (str): Name of the dataset on Hugging Face Hub 61 | subset (str, optional): Specific subset/configuration of the dataset 62 | 63 | Returns: 64 | str: Path to the downloaded dataset 65 | """ 66 | try: 67 | logger.info(f"Downloading dataset: {dataset_name}" + (f" ({subset})" if subset else "")) 68 | 69 | # Create dataset directory 70 | dataset_dir = os.path.join(self.datasets_dir, "raw", dataset_name.replace("/", "_")) 71 | os.makedirs(dataset_dir, exist_ok=True) 72 | 73 | # Download dataset 74 | if subset: 75 | dataset = load_dataset(dataset_name, subset) 76 | else: 77 | dataset = load_dataset(dataset_name) 78 | 79 | # Save dataset info 80 | dataset_info = { 81 | "name": dataset_name, 82 | "subset": subset, 83 | "splits": list(dataset.keys()), 84 | "features": str(dataset["train"].features if "train" in dataset else list(dataset.keys())[0]) 85 | } 86 | 87 | # Save each split 88 | for split_name, split_data in dataset.items(): 89 | split_path = os.path.join(dataset_dir, f"{split_name}.arrow") 90 | split_data.save_to_disk(split_path) 91 | 92 | logger.info(f"Dataset downloaded to: {dataset_dir}") 93 | return dataset_dir 94 | 95 | except Exception as e: 96 | logger.error(f"Error downloading dataset {dataset_name}: {str(e)}") 97 | raise 98 | 99 | def list_available_models(self, filter_tags: Optional[list] = None) -> list: 100 | """ 101 | List available models from Hugging Face Hub. 102 | 103 | Args: 104 | filter_tags (list, optional): List of tags to filter models 105 | 106 | Returns: 107 | list: List of model names 108 | """ 109 | try: 110 | models = self.api.list_models(filter=filter_tags) 111 | return [model.modelId for model in models] 112 | except Exception as e: 113 | logger.error(f"Error listing models: {str(e)}") 114 | raise 115 | 116 | def list_available_datasets(self, filter_tags: Optional[list] = None) -> list: 117 | """ 118 | List available datasets from Hugging Face Hub. 119 | 120 | Args: 121 | filter_tags (list, optional): List of tags to filter datasets 122 | 123 | Returns: 124 | list: List of dataset names 125 | """ 126 | try: 127 | datasets = self.api.list_datasets(filter=filter_tags) 128 | return [dataset.id for dataset in datasets] 129 | except Exception as e: 130 | logger.error(f"Error listing datasets: {str(e)}") 131 | raise 132 | 133 | def get_model_info(self, model_name: str) -> dict: 134 | """ 135 | Get detailed information about a model. 
136 | 137 | Args: 138 | model_name (str): Name of the model on Hugging Face Hub 139 | 140 | Returns: 141 | dict: Model information 142 | """ 143 | try: 144 | model_info = self.api.model_info(model_name) 145 | return { 146 | "name": model_info.modelId, 147 | "tags": model_info.tags, 148 | "downloads": model_info.downloads, 149 | "likes": model_info.likes, 150 | "library": model_info.library_name 151 | } 152 | except Exception as e: 153 | logger.error(f"Error getting model info for {model_name}: {str(e)}") 154 | raise 155 | 156 | def get_dataset_info(self, dataset_name: str) -> dict: 157 | """ 158 | Get detailed information about a dataset. 159 | 160 | Args: 161 | dataset_name (str): Name of the dataset on Hugging Face Hub 162 | 163 | Returns: 164 | dict: Dataset information 165 | """ 166 | try: 167 | dataset_info = self.api.dataset_info(dataset_name) 168 | return { 169 | "name": dataset_info.id, 170 | "tags": dataset_info.tags, 171 | "downloads": dataset_info.downloads, 172 | "likes": dataset_info.likes, 173 | "size": dataset_info.size_categories 174 | } 175 | except Exception as e: 176 | logger.error(f"Error getting dataset info for {dataset_name}: {str(e)}") 177 | raise -------------------------------------------------------------------------------- /utils/multi_modal_rl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import numpy as np 6 | import logging 7 | from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | class MultiModalRLModel(nn.Module): 12 | """ 13 | Multi-modal Reinforcement Learning Model 14 | ฝึกโมเดลกับข้อมูลหลายรูปแบบ (ข้อความ, รูปภาพ, เสียง) 15 | """ 16 | def __init__(self, text_model_path, image_model_path, vocab_size, device='cuda'): 17 | super(MultiModalRLModel, self).__init__() 18 | self.device = device 19 | self.text_model = AutoModelForCausalLM.from_pretrained(text_model_path).to(device) 20 | self.image_model = AutoModel.from_pretrained(image_model_path).to(device) 21 | self.vocab_size = vocab_size 22 | 23 | # Combine text and image features 24 | self.fc = nn.Linear(self.text_model.config.hidden_size + self.image_model.config.hidden_size, vocab_size) 25 | 26 | def forward(self, input_ids, attention_mask=None, pixel_values=None): 27 | text_outputs = self.text_model(input_ids=input_ids, attention_mask=attention_mask) 28 | image_outputs = self.image_model(pixel_values=pixel_values) 29 | 30 | combined_features = torch.cat((text_outputs.hidden_states[-1][:, -1, :], image_outputs.last_hidden_state[:, -1, :]), dim=1) 31 | logits = self.fc(combined_features) 32 | 33 | return logits 34 | 35 | def generate(self, input_ids, attention_mask=None, pixel_values=None, max_length=30, **kwargs): 36 | current_input_ids = input_ids 37 | current_attention_mask = attention_mask 38 | 39 | for _ in range(max_length): 40 | next_token_logits = self.forward(current_input_ids, current_attention_mask, pixel_values) 41 | next_token = torch.multinomial(F.softmax(next_token_logits, dim=-1), num_samples=1) 42 | current_input_ids = torch.cat([current_input_ids, next_token], dim=1) 43 | if current_attention_mask is not None: 44 | current_attention_mask = torch.cat([current_attention_mask, torch.ones_like(next_token)], dim=1) 45 | 46 | return current_input_ids 47 | 48 | def train_multi_modal_rl( 49 | text_model_path, 50 | image_model_path, 51 | dataset, 52 | output_dir, 53 | batch_size=4, 54 | 
epochs=1, 55 | lr=1e-5 56 | ): 57 | """ 58 | Train a Multi-modal Reinforcement Learning model 59 | 60 | Args: 61 | text_model_path: Path to the pre-trained text model 62 | image_model_path: Path to the pre-trained image model 63 | dataset: Dataset for training 64 | output_dir: Directory to save the model 65 | batch_size: Batch size for training 66 | epochs: Number of epochs for training 67 | lr: Learning rate 68 | 69 | Returns: 70 | Trained model 71 | """ 72 | os.makedirs(output_dir, exist_ok=True) 73 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 74 | 75 | tokenizer = AutoTokenizer.from_pretrained(text_model_path) 76 | vocab_size = tokenizer.vocab_size 77 | 78 | model = MultiModalRLModel(text_model_path, image_model_path, vocab_size, device).to(device) 79 | 80 | optimizer = torch.optim.Adam(model.parameters(), lr=lr) 81 | 82 | logger.info("Starting Multi-modal RL training...") 83 | 84 | for epoch in range(epochs): 85 | total_reward = 0 86 | total_loss = 0 87 | 88 | for i, batch in enumerate(dataset): 89 | if i >= len(dataset) // batch_size: 90 | break 91 | 92 | inputs = tokenizer(batch["text"], return_tensors="pt", padding=True, truncation=True).to(device) 93 | pixel_values = batch["image"].to(device) if "image" in batch else None 94 | 95 | generated_ids = model.generate(inputs.input_ids, inputs.attention_mask, pixel_values, max_length=10) 96 | generated_part = generated_ids[:, inputs.input_ids.shape[1]:] 97 | 98 | # Calculate rewards 99 | rewards = F.softmax(model(inputs.input_ids, inputs.attention_mask, pixel_values), dim=-1).gather(1, generated_part[:, 0].unsqueeze(1)) 100 | 101 | # Policy gradient loss (maximize reward) 102 | loss = -torch.mean(rewards) 103 | 104 | optimizer.zero_grad() 105 | loss.backward() 106 | optimizer.step() 107 | 108 | total_reward += rewards.mean().item() 109 | total_loss += loss.item() 110 | 111 | if i % 10 == 0: 112 | logger.info(f"Epoch {epoch+1}/{epochs}, Batch {i}/{len(dataset)//batch_size}, " 113 | f"Loss: {total_loss/(i+1):.4f}, " 114 | f"Avg Reward: {total_reward/(i+1):.4f}") 115 | 116 | model.save_pretrained(output_dir) 117 | tokenizer.save_pretrained(output_dir) 118 | 119 | logger.info(f"Multi-modal RL training complete. 
Model saved to {output_dir}") 120 | return model, tokenizer -------------------------------------------------------------------------------- /utils/multi_objective_rl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import numpy as np 6 | import logging 7 | from transformers import AutoModelForCausalLM, AutoTokenizer 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | class MultiObjectiveRLModel(nn.Module): 12 | """ 13 | Multi-objective Reinforcement Learning Model 14 | ฝึกโมเดลให้ทำงานกับ reward functions หลายอันพร้อมกัน 15 | """ 16 | def __init__(self, model_path, vocab_size, reward_functions, device='cuda'): 17 | super(MultiObjectiveRLModel, self).__init__() 18 | self.device = device 19 | self.model = AutoModelForCausalLM.from_pretrained(model_path).to(device) 20 | self.reward_functions = reward_functions 21 | self.vocab_size = vocab_size 22 | 23 | def forward(self, input_ids, attention_mask=None): 24 | outputs = self.model(input_ids=input_ids, attention_mask=attention_mask) 25 | return outputs.logits 26 | 27 | def generate(self, input_ids, attention_mask=None, max_length=30, **kwargs): 28 | current_input_ids = input_ids 29 | current_attention_mask = attention_mask 30 | 31 | for _ in range(max_length): 32 | next_token_logits = self.forward(current_input_ids, current_attention_mask) 33 | next_token = torch.multinomial(F.softmax(next_token_logits, dim=-1), num_samples=1) 34 | current_input_ids = torch.cat([current_input_ids, next_token], dim=1) 35 | if current_attention_mask is not None: 36 | current_attention_mask = torch.cat([current_attention_mask, torch.ones_like(next_token)], dim=1) 37 | 38 | return current_input_ids 39 | 40 | def train_multi_objective_rl( 41 | model_path, 42 | dataset, 43 | output_dir, 44 | reward_functions, 45 | batch_size=4, 46 | epochs=1, 47 | lr=1e-5 48 | ): 49 | """ 50 | Train a Multi-objective Reinforcement Learning model 51 | 52 | Args: 53 | model_path: Path to the pre-trained model 54 | dataset: Dataset for training 55 | output_dir: Directory to save the model 56 | reward_functions: List of reward functions 57 | batch_size: Batch size for training 58 | epochs: Number of epochs for training 59 | lr: Learning rate 60 | 61 | Returns: 62 | Trained model 63 | """ 64 | os.makedirs(output_dir, exist_ok=True) 65 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 66 | 67 | tokenizer = AutoTokenizer.from_pretrained(model_path) 68 | vocab_size = tokenizer.vocab_size 69 | 70 | model = MultiObjectiveRLModel(model_path, vocab_size, reward_functions, device).to(device) 71 | 72 | optimizer = torch.optim.Adam(model.parameters(), lr=lr) 73 | 74 | logger.info("Starting Multi-objective RL training...") 75 | 76 | for epoch in range(epochs): 77 | total_reward = 0 78 | total_loss = 0 79 | 80 | for i, batch in enumerate(dataset): 81 | if i >= len(dataset) // batch_size: 82 | break 83 | 84 | inputs = tokenizer(batch["text"], return_tensors="pt", padding=True, truncation=True).to(device) 85 | 86 | generated_ids = model.generate(inputs.input_ids, inputs.attention_mask, max_length=10) 87 | generated_part = generated_ids[:, inputs.input_ids.shape[1]:] 88 | 89 | rewards = torch.zeros(batch_size).to(device) 90 | for reward_function in reward_functions: 91 | rewards += reward_function(generated_part) 92 | 93 | loss = -torch.mean(rewards) 94 | 95 | optimizer.zero_grad() 96 | loss.backward() 97 | optimizer.step() 98 | 99 | total_reward 
+= rewards.mean().item() 100 | total_loss += loss.item() 101 | 102 | if i % 10 == 0: 103 | logger.info(f"Epoch {epoch+1}/{epochs}, Batch {i}/{len(dataset)//batch_size}, " 104 | f"Loss: {total_loss/(i+1):.4f}, " 105 | f"Avg Reward: {total_reward/(i+1):.4f}") 106 | 107 | model.save_pretrained(output_dir) 108 | tokenizer.save_pretrained(output_dir) 109 | 110 | logger.info(f"Multi-objective RL training complete. Model saved to {output_dir}") 111 | return model, tokenizer -------------------------------------------------------------------------------- /utils/off_policy_correction.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from transformers import AutoModelForCausalLM, AutoTokenizer 5 | import logging 6 | import numpy as np 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | class VTraceModel(nn.Module): 11 | def __init__(self, base_model): 12 | super().__init__() 13 | self.base_model = base_model 14 | self.hidden_size = base_model.config.hidden_size 15 | 16 | # Value function head 17 | self.value_head = nn.Linear(self.hidden_size, 1) 18 | 19 | def forward(self, input_ids, attention_mask=None): 20 | outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask) 21 | hidden_states = outputs.last_hidden_state 22 | 23 | # Get logits for policy and values 24 | logits = outputs.logits 25 | values = self.value_head(hidden_states) 26 | 27 | return { 28 | 'logits': logits, 29 | 'values': values, 30 | 'hidden_states': hidden_states 31 | } 32 | 33 | def compute_vtrace_advantages(values, next_values, rewards, behavior_logits, target_logits, 34 | gamma=0.99, lambda_=0.95, rho_clipping=1.0, c_clipping=1.0): 35 | """Compute V-trace advantages to correct for off-policy data.""" 36 | 37 | # Compute importance weights 38 | behavior_probs = F.softmax(behavior_logits, dim=-1) 39 | target_probs = F.softmax(target_logits, dim=-1) 40 | # Use clipped importance sampling ratio 41 | rho = torch.clamp(target_probs / (behavior_probs + 1e-10), 0, rho_clipping) 42 | c = torch.clamp(target_probs / (behavior_probs + 1e-10), 0, c_clipping) 43 | 44 | # Compute TD errors 45 | td_errors = rewards + gamma * next_values - values 46 | 47 | # Compute v-trace targets 48 | vtrace_targets = values + rho * td_errors 49 | 50 | # Compute GAE-style advantages 51 | advantages = torch.zeros_like(vtrace_targets) 52 | last_gae = 0 53 | 54 | # Reverse accumulate advantages 55 | for t in reversed(range(len(advantages))): 56 | if t == len(advantages) - 1: 57 | next_value = next_values[-1] 58 | else: 59 | next_value = vtrace_targets[t+1] 60 | 61 | delta = rewards[t] + gamma * next_value - values[t] 62 | last_gae = delta + gamma * lambda_ * c[t] * last_gae 63 | advantages[t] = last_gae 64 | 65 | return advantages, vtrace_targets 66 | 67 | def train_off_policy_correction(base_model_path, train_dataset, output_dir, behavior_model_path=None, 68 | reward_model_path=None, batch_size=4, epochs=1, lr=1e-5): 69 | """Train a model using off-policy correction with V-trace.""" 70 | logger.info("Initializing Off-Policy Correction with V-trace") 71 | 72 | # Load tokenizer and model 73 | tokenizer = AutoTokenizer.from_pretrained(base_model_path) 74 | base_model = AutoModelForCausalLM.from_pretrained(base_model_path) 75 | 76 | # Create V-trace model 77 | model = VTraceModel(base_model) 78 | 79 | # Load behavior model if provided, otherwise use a copy of base model 80 | if behavior_model_path: 81 | logger.info(f"Loading 
behavior model from {behavior_model_path}") 82 | behavior_model = AutoModelForCausalLM.from_pretrained(behavior_model_path) 83 | else: 84 | logger.info("No behavior model provided, will use a copy of base model") 85 | behavior_model = AutoModelForCausalLM.from_pretrained(base_model_path) 86 | 87 | # Load reward model if provided 88 | if reward_model_path: 89 | logger.info(f"Loading reward model from {reward_model_path}") 90 | reward_model = AutoModelForCausalLM.from_pretrained(reward_model_path) 91 | else: 92 | logger.info("No reward model provided, will use simple rewards") 93 | reward_model = None 94 | 95 | # Setup optimizer 96 | optimizer = torch.optim.Adam(model.parameters(), lr=lr) 97 | 98 | # Training loop 99 | logger.info(f"Starting training for {epochs} epochs") 100 | model.train() 101 | behavior_model.eval() # Behavior model is fixed 102 | 103 | for epoch in range(epochs): 104 | total_actor_loss = 0 105 | total_value_loss = 0 106 | total_entropy = 0 107 | num_batches = 0 108 | 109 | for i in range(0, len(train_dataset), batch_size): 110 | batch = train_dataset[i:i+batch_size] 111 | 112 | # Tokenize inputs 113 | inputs = tokenizer(batch['text'], return_tensors="pt", padding=True, truncation=True) 114 | inputs = {k: v.to(base_model.device) for k, v in inputs.items()} 115 | 116 | # Get behavior policy logits (frozen) 117 | with torch.no_grad(): 118 | behavior_outputs = behavior_model(inputs['input_ids'], attention_mask=inputs['attention_mask']) 119 | behavior_logits = behavior_outputs.logits 120 | 121 | # Get target policy outputs 122 | target_outputs = model(inputs['input_ids'], inputs['attention_mask']) 123 | target_logits = target_outputs['logits'] 124 | values = target_outputs['values'] 125 | 126 | # Compute rewards 127 | if reward_model: 128 | with torch.no_grad(): 129 | reward_outputs = reward_model(inputs['input_ids'], attention_mask=inputs['attention_mask']) 130 | rewards = reward_outputs.logits.mean(dim=-1).unsqueeze(-1) 131 | else: 132 | # Simple reward function based on next token prediction 133 | seq_len = inputs['input_ids'].size(1) 134 | if seq_len > 1: 135 | next_tokens = inputs['input_ids'][:, 1:] 136 | pred_logits = target_logits[:, :-1, :] 137 | 138 | # Get probability of correct next token 139 | next_token_probs = F.softmax(pred_logits, dim=-1) 140 | next_token_indices = next_tokens.unsqueeze(-1) 141 | correct_probs = torch.gather(next_token_probs, 2, next_token_indices).squeeze(-1) 142 | 143 | # Reward is log probability of correct next token 144 | rewards = torch.log(correct_probs + 1e-10) 145 | rewards = rewards.unsqueeze(-1) 146 | else: 147 | rewards = torch.zeros((inputs['input_ids'].size(0), 1, 1), device=inputs['input_ids'].device) 148 | 149 | # Get next state values 150 | with torch.no_grad(): 151 | next_inputs = torch.cat([inputs['input_ids'][:, 1:], 152 | torch.ones((inputs['input_ids'].size(0), 1), 153 | dtype=torch.long, 154 | device=inputs['input_ids'].device) * tokenizer.eos_token_id], dim=1) 155 | next_mask = torch.cat([inputs['attention_mask'][:, 1:], 156 | torch.ones((inputs['attention_mask'].size(0), 1), 157 | device=inputs['attention_mask'].device)], dim=1) 158 | next_outputs = model(next_inputs, next_mask) 159 | next_values = next_outputs['values'] 160 | 161 | # Compute V-trace advantages and targets 162 | advantages, vtrace_targets = compute_vtrace_advantages( 163 | values[:, :-1].detach(), 164 | next_values[:, :-1].detach(), 165 | rewards[:, :-1].detach(), 166 | behavior_logits[:, :-1].detach(), 167 | target_logits[:, :-1].detach() 168 | ) 169 
|             )
169 | 
170 |             # Compute policy loss (actor loss)
171 |             log_probs = F.log_softmax(target_logits[:, :-1], dim=-1)
172 |             action_log_probs = torch.gather(
173 |                 log_probs,
174 |                 2,
175 |                 inputs['input_ids'][:, 1:].unsqueeze(-1)
176 |             ).squeeze(-1)
177 | 
178 |             # Actor loss uses advantages from V-trace
179 |             actor_loss = -(action_log_probs * advantages.detach()).mean()
180 | 
181 |             # Value loss
182 |             value_loss = F.mse_loss(values[:, :-1], vtrace_targets.detach())
183 | 
184 |             # Entropy bonus
185 |             entropy = -(F.softmax(target_logits[:, :-1], dim=-1) * F.log_softmax(target_logits[:, :-1], dim=-1)).sum(-1).mean()
186 | 
187 |             # Combined loss
188 |             entropy_coef = 0.01
189 |             value_coef = 0.5
190 |             loss = actor_loss + value_coef * value_loss - entropy_coef * entropy
191 | 
192 |             # Update model
193 |             optimizer.zero_grad()
194 |             loss.backward()
195 |             optimizer.step()
196 | 
197 |             total_actor_loss += actor_loss.item()
198 |             total_value_loss += value_loss.item()
199 |             total_entropy += entropy.item()
200 |             num_batches += 1
201 | 
202 |             if num_batches % 10 == 0:
203 |                 logger.info(f"Epoch {epoch+1}, Batch {num_batches}, "
204 |                             f"Actor Loss: {actor_loss.item():.4f}, Value Loss: {value_loss.item():.4f}, "
205 |                             f"Entropy: {entropy.item():.4f}")
206 | 
207 |         avg_actor_loss = total_actor_loss / num_batches
208 |         avg_value_loss = total_value_loss / num_batches
209 |         avg_entropy = total_entropy / num_batches
210 | 
211 |         logger.info(f"Epoch {epoch+1} completed. "
212 |                     f"Average Actor Loss: {avg_actor_loss:.4f}, "
213 |                     f"Average Value Loss: {avg_value_loss:.4f}, "
214 |                     f"Average Entropy: {avg_entropy:.4f}")
215 | 
216 |     # Save the model
217 |     logger.info(f"Training completed. Saving model to {output_dir}")
218 |     model.base_model.save_pretrained(output_dir)
219 |     tokenizer.save_pretrained(output_dir)
220 | 
221 |     return model.base_model, tokenizer
222 | 
--------------------------------------------------------------------------------
/utils/ppo_trainer.py:
--------------------------------------------------------------------------------
1 | import os
2 | import torch
3 | from trl import PPOConfig, PPOTrainer
4 | from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
5 | 
6 | def create_enhanced_reward_fn(reward_model_path, q_model_path=None, irl_model_path=None, sac_model_path=None):
7 |     """
8 |     Create a reward function that combines the outputs from every model.
9 |     """
10 |     tokenizer = AutoTokenizer.from_pretrained(reward_model_path)
11 |     reward_pipeline = pipeline("text-classification", model=reward_model_path, tokenizer=tokenizer, device=0)
12 | 
13 |     # Load Q-network if available
14 |     q_net = None
15 |     if q_model_path:
16 |         from utils.q_learning import QNetworkForText  # q_learning.py lives in the utils package
17 |         state_dim = 128
18 |         action_dim = 10
19 |         q_net = QNetworkForText(state_dim, action_dim).to(reward_pipeline.model.device)
20 |         q_net.load_state_dict(torch.load(os.path.join(q_model_path, "q_network.pt")))
21 |         q_net.eval()
22 | 
23 |     def reward_fn(samples):
24 |         scores = []
25 | 
26 |         for sample in samples:
27 |             # Base reward from reward model
28 |             pred = reward_pipeline(sample, truncation=True, max_length=256)
29 |             rm_score = pred[0]["score"] if pred[0]["label"] == "POSITIVE" else 1 - pred[0]["score"]
30 | 
31 |             # Enhanced score with Q-network if available
32 |             q_score = 0.0
33 |             if q_net:
34 |                 # Create a simple state representation from text
35 |                 sample_encoding = tokenizer(sample, truncation=True, max_length=128,
36 |                                             padding="max_length", return_tensors="pt").to(reward_pipeline.model.device)
37 |                 state_repr = torch.mean(sample_encoding.input_ids.float(),
dim=1).cpu().numpy()[0][:state_dim] 38 | with torch.no_grad(): 39 | q_values = q_net(torch.FloatTensor(state_repr).to(reward_pipeline.model.device)) 40 | q_score = q_values.max().item() / 10.0 # Normalize 41 | 42 | # Weight the components (hyperparameters to tune) 43 | weights = { 44 | 'reward_model': 0.7, 45 | 'q_learning': 0.3, 46 | } 47 | 48 | combined_score = weights['reward_model'] * rm_score 49 | if q_net: 50 | combined_score += weights['q_learning'] * q_score 51 | 52 | scores.append(torch.tensor(combined_score)) 53 | 54 | return scores 55 | 56 | return reward_fn 57 | 58 | def run_ppo_training( 59 | dpo_model_path, 60 | train_dataset, 61 | eval_dataset, 62 | output_dir, 63 | reward_model_path, 64 | q_model_path=None, 65 | irl_model_path=None, 66 | sac_model_path=None, 67 | batch_size=4, 68 | epochs=1, 69 | learning_rate=1e-5 70 | ): 71 | """ 72 | ฝึกโมเดลด้วย PPO โดยใช้ reward function ที่ผสมผสานจากทุกโมเดล 73 | """ 74 | os.makedirs(output_dir, exist_ok=True) 75 | 76 | # Load model from DPO 77 | tokenizer = AutoTokenizer.from_pretrained(dpo_model_path) 78 | model = AutoModelForCausalLM.from_pretrained(dpo_model_path, torch_dtype=torch.bfloat16, device_map="auto") 79 | 80 | # Create enhanced reward function 81 | enhanced_reward_fn = create_enhanced_reward_fn( 82 | reward_model_path, 83 | q_model_path, 84 | irl_model_path, 85 | sac_model_path 86 | ) 87 | 88 | # PPO configuration 89 | ppo_config = PPOConfig( 90 | model_name=dpo_model_path, 91 | learning_rate=learning_rate, 92 | batch_size=batch_size * 4, # Effective batch size 93 | ppo_epochs=epochs, 94 | mini_batch_size=batch_size, 95 | gradient_accumulation_steps=4, 96 | optimize_cuda_cache=True, 97 | ) 98 | 99 | # Initialize PPO Trainer 100 | ppo_trainer = PPOTrainer( 101 | model=model, 102 | config=ppo_config, 103 | dataset=train_dataset, 104 | tokenizer=tokenizer, 105 | ) 106 | 107 | # Training loop 108 | for epoch in range(epochs): 109 | for batch_idx, batch in enumerate(ppo_trainer.dataloader): 110 | # Generate responses 111 | query_tensors = batch["input_ids"] 112 | response_tensors = ppo_trainer.generate(query_tensors, max_length=50, temperature=0.7) 113 | responses = [tokenizer.decode(r, skip_special_tokens=True) for r in response_tensors] 114 | 115 | # Get rewards from enhanced reward function 116 | rewards = enhanced_reward_fn(responses) 117 | 118 | # PPO step 119 | ppo_trainer.step(query_tensors, response_tensors, rewards) 120 | 121 | # Save the final model 122 | model.save_pretrained(output_dir) 123 | tokenizer.save_pretrained(output_dir) 124 | 125 | # Save as safetensors 126 | from safetensors.torch import save_file 127 | state_dict = model.state_dict() 128 | save_file(state_dict, os.path.join(output_dir, "model.safetensors")) 129 | 130 | return model -------------------------------------------------------------------------------- /utils/q_learning.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import numpy as np 4 | from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline 5 | 6 | class QNetworkForText(torch.nn.Module): 7 | def __init__(self, input_dim, output_dim): 8 | super().__init__() 9 | self.network = torch.nn.Sequential( 10 | torch.nn.Linear(input_dim, 128), 11 | torch.nn.ReLU(), 12 | torch.nn.Linear(128, 128), 13 | torch.nn.ReLU(), 14 | torch.nn.Linear(128, output_dim) 15 | ) 16 | 17 | def forward(self, x): 18 | return self.network(x) 19 | 20 | def train_q_learning(dpo_model_path, train_dataset, output_dir, reward_model_path, 
batch_size): 21 | """ 22 | ฝึกโมเดลด้วย Q-Learning 23 | """ 24 | os.makedirs(output_dir, exist_ok=True) 25 | 26 | # Load models 27 | tokenizer = AutoTokenizer.from_pretrained(dpo_model_path) 28 | model = AutoModelForCausalLM.from_pretrained(dpo_model_path, torch_dtype=torch.bfloat16, device_map="auto") 29 | reward_pipeline = pipeline("text-classification", model=reward_model_path, tokenizer=tokenizer, device=0) 30 | 31 | # Create Q-Network 32 | state_dim = 128 33 | action_dim = 10 34 | q_network = QNetworkForText(state_dim, action_dim).to(model.device) 35 | optimizer = torch.optim.Adam(q_network.parameters(), lr=0.001) 36 | 37 | # Q-learning parameters 38 | epsilon = 0.1 39 | gamma = 0.99 40 | episodes = 10 41 | 42 | # Create replay buffer (simplified) 43 | replay_buffer = [] 44 | 45 | # Training loop 46 | for episode in range(episodes): 47 | # Select sample from training data 48 | sample_idx = np.random.randint(len(train_dataset)) 49 | prompt = train_dataset[sample_idx]["prompt"] 50 | 51 | # Current state (simplified) 52 | state = np.random.rand(state_dim).astype(np.float32) 53 | 54 | # Epsilon-greedy action selection 55 | if np.random.random() < epsilon: 56 | action = np.random.randint(action_dim) 57 | else: 58 | with torch.no_grad(): 59 | q_values = q_network(torch.FloatTensor(state).to(model.device)) 60 | action = q_values.argmax().item() 61 | 62 | # Generate text based on action (simplified) 63 | with torch.no_grad(): 64 | input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device) 65 | output = model.generate( 66 | input_ids, 67 | max_length=input_ids.shape[1] + 20, 68 | temperature=0.7, 69 | do_sample=True 70 | ) 71 | 72 | generated_text = tokenizer.decode(output[0], skip_special_tokens=True) 73 | generated_text = generated_text[len(prompt):] 74 | 75 | # Get reward from reward model 76 | with torch.no_grad(): 77 | inputs = tokenizer(generated_text, return_tensors="pt").to(reward_pipeline.model.device) 78 | reward = reward_pipeline.model(**inputs).logits.item() 79 | 80 | # Next state (simplified) 81 | next_state = np.random.rand(state_dim).astype(np.float32) 82 | 83 | # Store transition in replay buffer 84 | replay_buffer.append((state, action, reward, next_state)) 85 | 86 | # Learn from replay buffer 87 | if len(replay_buffer) > batch_size: 88 | # Sample batch 89 | batch_indices = np.random.choice(len(replay_buffer), batch_size, replace=False) 90 | batch = [replay_buffer[i] for i in batch_indices] 91 | 92 | states = torch.FloatTensor([b[0] for b in batch]).to(model.device) 93 | actions = torch.LongTensor([b[1] for b in batch]).to(model.device) 94 | rewards = torch.FloatTensor([b[2] for b in batch]).to(model.device) 95 | next_states = torch.FloatTensor([b[3] for b in batch]).to(model.device) 96 | 97 | # Q-learning update 98 | current_q_values = q_network(states) 99 | current_q_values = current_q_values.gather(1, actions.unsqueeze(1)).squeeze(1) 100 | 101 | with torch.no_grad(): 102 | next_q_values = q_network(next_states) 103 | max_next_q_values = next_q_values.max(1)[0] 104 | target_q_values = rewards + gamma * max_next_q_values 105 | 106 | # Compute loss and update 107 | loss = torch.nn.functional.mse_loss(current_q_values, target_q_values) 108 | optimizer.zero_grad() 109 | loss.backward() 110 | optimizer.step() 111 | 112 | # Save Q-network 113 | torch.save(q_network.state_dict(), os.path.join(output_dir, "q_network.pt")) 114 | 115 | return q_network -------------------------------------------------------------------------------- /utils/reverse_curriculum.py: 
-------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from transformers import AutoModelForCausalLM, AutoTokenizer 5 | import logging 6 | import numpy as np 7 | import random 8 | from tqdm import tqdm 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | class CurriculumGenerator: 13 | """Generates a sequence of increasingly difficult tasks""" 14 | def __init__(self, base_model, tokenizer, reward_model=None, num_samples=100): 15 | self.base_model = base_model 16 | self.tokenizer = tokenizer 17 | self.reward_model = reward_model 18 | self.num_samples = num_samples 19 | self.device = base_model.device 20 | 21 | def generate_curriculum(self, goal_texts, num_tasks=10, max_length=50): 22 | """ 23 | Generate curriculum starting from the goal and working backwards 24 | 25 | Args: 26 | goal_texts: List of target texts (final state) 27 | num_tasks: Number of tasks in curriculum 28 | max_length: Maximum length of generated sequences 29 | 30 | Returns: 31 | List of curriculum tasks, from easiest to hardest 32 | """ 33 | logger.info(f"Generating curriculum with {num_tasks} tasks") 34 | 35 | all_tasks = [] 36 | current_tasks = goal_texts 37 | 38 | # Store goal as hardest task 39 | all_tasks.append(current_tasks) 40 | 41 | # Work backwards to create easier tasks 42 | for step in range(num_tasks - 1): 43 | logger.info(f"Generating task {num_tasks - step - 1}/{num_tasks}") 44 | 45 | easier_tasks = [] 46 | for task in tqdm(current_tasks): 47 | # Tokenize the task 48 | inputs = self.tokenizer(task, return_tensors="pt").to(self.device) 49 | 50 | # Find key points where we can simplify 51 | # For simplicity, we'll just trim the text progressively 52 | # In a more sophisticated implementation, we might use the model itself 53 | 54 | # Get the length of the current task 55 | task_length = len(task.split()) 56 | 57 | # Simplify by keeping a smaller fraction of the text 58 | keep_ratio = 0.7 # Keep 70% of the text 59 | simplified_length = max(1, int(task_length * keep_ratio)) 60 | 61 | # Get the simplified text (first part of the task) 62 | simplified_text = " ".join(task.split()[:simplified_length]) 63 | 64 | # Add some randomness to avoid all tasks being simple prefixes 65 | if random.random() < 0.3 and simplified_length > 2: 66 | # Sometimes replace specific details with generic placeholders 67 | words = simplified_text.split() 68 | for i in range(len(words)): 69 | if random.random() < 0.2: 70 | words[i] = "[...]" 71 | simplified_text = " ".join(words) 72 | 73 | easier_tasks.append(simplified_text) 74 | 75 | # Add this set of easier tasks to our curriculum 76 | all_tasks.append(easier_tasks) 77 | current_tasks = easier_tasks 78 | 79 | # Reverse so we start with the easiest tasks 80 | all_tasks.reverse() 81 | 82 | return all_tasks 83 | 84 | def evaluate_curriculum(model, tokenizer, curriculum_tasks, reward_model=None): 85 | """Evaluate how well the model performs on each level of the curriculum""" 86 | results = [] 87 | 88 | for level, tasks in enumerate(curriculum_tasks): 89 | # Sample a few tasks from this level 90 | sample_size = min(5, len(tasks)) 91 | sampled_tasks = random.sample(tasks, sample_size) 92 | 93 | level_rewards = [] 94 | for task in sampled_tasks: 95 | # Generate response 96 | inputs = tokenizer(task, return_tensors="pt").to(model.device) 97 | outputs = model.generate( 98 | inputs.input_ids, 99 | max_new_tokens=50, 100 | do_sample=True, 101 | temperature=0.7 102 | ) 103 | 104 | 
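# --- Added note (hedged): applying the reward model to the decoded text ---
# Below, the reward is taken as reward_model(outputs).logits.mean() over the raw generated
# token ids of a causal LM. If the reward model is instead a sequence-classification head
# (like the one trained in utils/reward_model.py), the usual pattern is to re-tokenize the
# decoded text and read a scalar score. Sketch only; `rm_inputs` and `reward` are
# illustrative names:
#
#     rm_inputs = tokenizer(generated_text, return_tensors="pt",
#                           truncation=True, max_length=256).to(reward_model.device)
#     reward = reward_model(**rm_inputs).logits.squeeze(-1).item()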
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True) 105 | 106 | # Compute reward 107 | if reward_model: 108 | with torch.no_grad(): 109 | reward_outputs = reward_model(outputs) 110 | reward = reward_outputs.logits.mean().item() 111 | else: 112 | # Simple proxy for reward: length of generation 113 | reward = len(generated_text.split()) / 50.0 # Normalize 114 | 115 | level_rewards.append(reward) 116 | 117 | avg_reward = sum(level_rewards) / len(level_rewards) 118 | results.append({ 119 | 'level': level, 120 | 'average_reward': avg_reward, 121 | 'num_tasks': len(tasks) 122 | }) 123 | 124 | return results 125 | 126 | def train_reverse_curriculum(base_model_path, train_dataset, output_dir, reward_model_path=None, 127 | batch_size=4, epochs=1, lr=1e-5, curriculum_steps=5): 128 | """Train a model using reverse curriculum learning.""" 129 | logger.info("Initializing Reverse Curriculum Generation") 130 | 131 | # Load tokenizer and model 132 | tokenizer = AutoTokenizer.from_pretrained(base_model_path) 133 | base_model = AutoModelForCausalLM.from_pretrained(base_model_path) 134 | 135 | # Load reward model if provided 136 | if reward_model_path: 137 | logger.info(f"Loading reward model from {reward_model_path}") 138 | reward_model = AutoModelForCausalLM.from_pretrained(reward_model_path) 139 | else: 140 | logger.info("No reward model provided, using simple rewards") 141 | reward_model = None 142 | 143 | # Create curriculum generator 144 | curriculum_gen = CurriculumGenerator(base_model, tokenizer, reward_model) 145 | 146 | # Generate curriculum from the training data 147 | # For curriculum learning, we'll use the full texts as goals 148 | goal_texts = [item['text'] for item in train_dataset[:100]] # Use a subset for efficiency 149 | 150 | logger.info("Generating curriculum...") 151 | curriculum = curriculum_gen.generate_curriculum(goal_texts, num_tasks=curriculum_steps) 152 | 153 | logger.info(f"Generated curriculum with {len(curriculum)} levels") 154 | for i, level in enumerate(curriculum): 155 | logger.info(f"Level {i}: {len(level)} tasks, Example: {level[0][:50]}...") 156 | 157 | # Setup optimizer 158 | optimizer = torch.optim.Adam(base_model.parameters(), lr=lr) 159 | 160 | # Training loop - train progressively on each curriculum level 161 | logger.info(f"Starting curriculum training for {epochs} epochs") 162 | base_model.train() 163 | 164 | for level, tasks in enumerate(curriculum): 165 | logger.info(f"Training on curriculum level {level}/{len(curriculum)-1}") 166 | 167 | # Create a dataset from this level's tasks 168 | level_texts = tasks 169 | 170 | for epoch in range(epochs): 171 | total_loss = 0 172 | num_batches = 0 173 | 174 | # Process tasks in batches 175 | for i in range(0, len(level_texts), batch_size): 176 | batch_texts = level_texts[i:i+batch_size] 177 | 178 | # Tokenize 179 | inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True) 180 | inputs = {k: v.to(base_model.device) for k, v in inputs.items()} 181 | 182 | # Forward pass 183 | outputs = base_model(**inputs, labels=inputs['input_ids']) 184 | loss = outputs.loss 185 | 186 | # Update model 187 | optimizer.zero_grad() 188 | loss.backward() 189 | optimizer.step() 190 | 191 | total_loss += loss.item() 192 | num_batches += 1 193 | 194 | if num_batches % 10 == 0: 195 | logger.info(f"Level {level}, Epoch {epoch+1}, Batch {num_batches}, " 196 | f"Loss: {loss.item():.4f}") 197 | 198 | avg_loss = total_loss / num_batches 199 | logger.info(f"Level {level}, Epoch {epoch+1} completed. 
" 200 | f"Average Loss: {avg_loss:.4f}") 201 | 202 | # Evaluate on current level 203 | logger.info(f"Evaluating on curriculum level {level}") 204 | level_results = evaluate_curriculum(base_model, tokenizer, [curriculum[level]], reward_model) 205 | logger.info(f"Level {level} average reward: {level_results[0]['average_reward']:.4f}") 206 | 207 | # Final evaluation on all curriculum levels 208 | logger.info("Final evaluation on all curriculum levels") 209 | final_results = evaluate_curriculum(base_model, tokenizer, curriculum, reward_model) 210 | 211 | for result in final_results: 212 | logger.info(f"Level {result['level']} final average reward: {result['average_reward']:.4f}") 213 | 214 | # Save the model 215 | logger.info(f"Training completed. Saving model to {output_dir}") 216 | base_model.save_pretrained(output_dir) 217 | tokenizer.save_pretrained(output_dir) 218 | 219 | return base_model, tokenizer 220 | -------------------------------------------------------------------------------- /utils/reward_model.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | from transformers import ( 4 | AutoTokenizer, 5 | AutoModelForSequenceClassification, 6 | TrainingArguments, 7 | Trainer 8 | ) 9 | 10 | class RewardDataset(torch.utils.data.Dataset): 11 | def __init__(self, dataset, tokenizer): 12 | self.dataset = dataset 13 | self.tokenizer = tokenizer 14 | 15 | def __len__(self): 16 | return len(self.dataset) 17 | 18 | def __getitem__(self, idx): 19 | item = self.dataset[idx] 20 | 21 | chosen_input = self.tokenizer( 22 | item["chosen"], 23 | padding="max_length", 24 | truncation=True, 25 | max_length=128, 26 | return_tensors="pt" 27 | ) 28 | rejected_input = self.tokenizer( 29 | item["rejected"], 30 | padding="max_length", 31 | truncation=True, 32 | max_length=128, 33 | return_tensors="pt" 34 | ) 35 | 36 | return { 37 | "input_ids": torch.cat([chosen_input["input_ids"][0], rejected_input["input_ids"][0]]), 38 | "attention_mask": torch.cat([chosen_input["attention_mask"][0], rejected_input["attention_mask"][0]]), 39 | "labels": torch.tensor([1.0, 0.0]) # chosen = 1, rejected = 0 40 | } 41 | 42 | def train_reward_model(dpo_model_path, train_dataset, eval_dataset, output_dir, batch_size, epochs, learning_rate): 43 | """ 44 | ฝึกโมเดลสำหรับการประเมินรางวัล (Reward Model) 45 | """ 46 | os.makedirs(output_dir, exist_ok=True) 47 | 48 | # Load model and tokenizer 49 | tokenizer = AutoTokenizer.from_pretrained(dpo_model_path) 50 | reward_model = AutoModelForSequenceClassification.from_pretrained( 51 | dpo_model_path, 52 | num_labels=1, 53 | torch_dtype=torch.bfloat16, 54 | device_map="auto" 55 | ) 56 | 57 | # Create datasets 58 | reward_train_dataset = RewardDataset(train_dataset, tokenizer) 59 | reward_eval_dataset = RewardDataset(eval_dataset, tokenizer) 60 | 61 | # Configure training 62 | reward_args = TrainingArguments( 63 | output_dir=output_dir, 64 | per_device_train_batch_size=batch_size, 65 | per_device_eval_batch_size=batch_size, 66 | gradient_accumulation_steps=4, 67 | num_train_epochs=epochs, 68 | fp16=torch.cuda.is_available(), 69 | learning_rate=learning_rate, 70 | logging_steps=10, 71 | evaluation_strategy="steps", 72 | eval_steps=50, 73 | save_strategy="steps", 74 | save_steps=50, 75 | load_best_model_at_end=True, 76 | ) 77 | 78 | # Initialize trainer 79 | trainer = Trainer( 80 | model=reward_model, 81 | args=reward_args, 82 | train_dataset=reward_train_dataset, 83 | eval_dataset=reward_eval_dataset, 84 | ) 85 | 86 | # Start 
training 87 | trainer.train() 88 | 89 | # Save the final model 90 | reward_model.save_pretrained(output_dir) 91 | tokenizer.save_pretrained(output_dir) 92 | 93 | return reward_model -------------------------------------------------------------------------------- /utils/sac_training.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import numpy as np 4 | import gym 5 | from gym import spaces 6 | from stable_baselines3 import SAC 7 | from stable_baselines3.common.buffers import ReplayBuffer 8 | from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline 9 | 10 | class SACEnvironment(gym.Env): 11 | def __init__(self, model, tokenizer, reward_model, prompts): 12 | super().__init__() 13 | self.model = model 14 | self.tokenizer = tokenizer 15 | self.reward_model = reward_model 16 | self.prompts = prompts 17 | self.current_prompt_idx = 0 18 | 19 | # Define action and observation space 20 | # For language models, we'll use a simplified continuous action space 21 | self.action_space = spaces.Box(low=-1, high=1, shape=(10,), dtype=np.float32) 22 | # Observation space is a simplified representation of the language state 23 | self.observation_space = spaces.Box(low=0, high=1, shape=(128,), dtype=np.float32) 24 | 25 | def reset(self): 26 | # Select a prompt 27 | self.current_prompt = self.prompts[self.current_prompt_idx] 28 | self.current_prompt_idx = (self.current_prompt_idx + 1) % len(self.prompts) 29 | 30 | # Encode prompt 31 | encoded = self.tokenizer(self.current_prompt, return_tensors="pt").to(self.model.device) 32 | self.current_input_ids = encoded["input_ids"] 33 | 34 | # Return a simplified state representation 35 | return np.random.rand(128).astype(np.float32) 36 | 37 | def step(self, action): 38 | # Use action to influence generation parameters (simplified) 39 | temperature = 0.5 + (action[0] + 1) / 4 # Map [-1,1] to [0.5, 1] 40 | 41 | # Generate text based on action-influenced parameters 42 | with torch.no_grad(): 43 | output = self.model.generate( 44 | self.current_input_ids, 45 | max_length=self.current_input_ids.shape[1] + 20, 46 | temperature=temperature, 47 | do_sample=True 48 | ) 49 | 50 | generated_text = self.tokenizer.decode(output[0], skip_special_tokens=True) 51 | generated_text = generated_text[len(self.current_prompt):] 52 | 53 | # Get reward from reward model 54 | with torch.no_grad(): 55 | inputs = self.tokenizer(generated_text, return_tensors="pt").to(self.reward_model.device) 56 | reward_score = self.reward_model(**inputs).logits.item() 57 | 58 | # Simplified next state 59 | next_state = np.random.rand(128).astype(np.float32) 60 | 61 | # Always terminate after one step (episodic) 62 | done = True 63 | info = {'generated_text': generated_text} 64 | 65 | return next_state, reward_score, done, info 66 | 67 | def train_sac(dpo_model_path, train_dataset, output_dir, reward_model_path, batch_size): 68 | """ 69 | ฝึกโมเดลด้วย Soft Actor-Critic 70 | """ 71 | os.makedirs(output_dir, exist_ok=True) 72 | 73 | # Load models 74 | tokenizer = AutoTokenizer.from_pretrained(dpo_model_path) 75 | model = AutoModelForCausalLM.from_pretrained(dpo_model_path, torch_dtype=torch.bfloat16, device_map="auto") 76 | reward_pipeline = pipeline("text-classification", model=reward_model_path, tokenizer=tokenizer, device=0) 77 | 78 | # Select prompts for training 79 | prompts = [example["prompt"] for example in train_dataset[:min(100, len(train_dataset))]] 80 | 81 | # Create SAC environment 82 | env = SACEnvironment(model, 
tokenizer, reward_pipeline.model, prompts) 83 | 84 | # Initialize SAC agent 85 | sac_agent = SAC( 86 | "MlpPolicy", 87 | env, 88 | verbose=1, 89 | learning_rate=3e-4, 90 | buffer_size=10000, 91 | learning_starts=100, 92 | batch_size=batch_size, 93 | tau=0.005, 94 | gamma=0.99, 95 | train_freq=1, 96 | gradient_steps=1, 97 | action_noise=None, 98 | ) 99 | 100 | # Train SAC agent (limited steps for demonstration) 101 | total_timesteps = 200 # Adjust based on available computation time 102 | sac_agent.learn(total_timesteps=total_timesteps) 103 | 104 | # Save the trained agent 105 | sac_agent.save(os.path.join(output_dir, "sac_model")) 106 | 107 | return sac_agent -------------------------------------------------------------------------------- /utils/self_supervised_rl.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.nn as nn 4 | import torch.nn.functional as F 5 | import numpy as np 6 | import logging 7 | from transformers import AutoModelForCausalLM, AutoTokenizer 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | class SelfSupervisedRLModel(nn.Module): 12 | """ 13 | Self-supervised Reinforcement Learning Model 14 | โมเดลสร้างเป้าหมายของตัวเองและให้รางวัลตัวเอง 15 | """ 16 | def __init__(self, model_path, vocab_size, device='cuda'): 17 | super(SelfSupervisedRLModel, self).__init__() 18 | self.device = device 19 | self.model = AutoModelForCausalLM.from_pretrained(model_path).to(device) 20 | self.vocab_size = vocab_size 21 | 22 | def forward(self, input_ids, attention_mask=None): 23 | outputs = self.model(input_ids=input_ids, attention_mask=attention_mask) 24 | return outputs.logits 25 | 26 | def generate(self, input_ids, attention_mask=None, max_length=30, **kwargs): 27 | current_input_ids = input_ids 28 | current_attention_mask = attention_mask 29 | 30 | for _ in range(max_length): 31 | next_token_logits = self.forward(current_input_ids, current_attention_mask) 32 | next_token = torch.multinomial(F.softmax(next_token_logits, dim=-1), num_samples=1) 33 | current_input_ids = torch.cat([current_input_ids, next_token], dim=1) 34 | if current_attention_mask is not None: 35 | current_attention_mask = torch.cat([current_attention_mask, torch.ones_like(next_token)], dim=1) 36 | 37 | return current_input_ids 38 | 39 | def train_self_supervised_rl( 40 | model_path, 41 | dataset, 42 | output_dir, 43 | batch_size=4, 44 | epochs=1, 45 | lr=1e-5, 46 | intrinsic_reward_weight=0.1 47 | ): 48 | """ 49 | Train a Self-supervised Reinforcement Learning model 50 | 51 | Args: 52 | model_path: Path to the pre-trained model 53 | dataset: Dataset for training 54 | output_dir: Directory to save the model 55 | batch_size: Batch size for training 56 | epochs: Number of epochs for training 57 | lr: Learning rate 58 | intrinsic_reward_weight: Weight for the intrinsic reward 59 | 60 | Returns: 61 | Trained model 62 | """ 63 | os.makedirs(output_dir, exist_ok=True) 64 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 65 | 66 | tokenizer = AutoTokenizer.from_pretrained(model_path) 67 | vocab_size = tokenizer.vocab_size 68 | 69 | model = SelfSupervisedRLModel(model_path, vocab_size, device).to(device) 70 | 71 | optimizer = torch.optim.Adam(model.parameters(), lr=lr) 72 | 73 | logger.info("Starting Self-supervised RL training...") 74 | 75 | for epoch in range(epochs): 76 | total_reward = 0 77 | total_loss = 0 78 | 79 | for i, batch in enumerate(dataset): 80 | if i >= len(dataset) // batch_size: 81 | break 82 | 83 | 
inputs = tokenizer(batch["text"], return_tensors="pt", padding=True, truncation=True).to(device) 84 | 85 | generated_ids = model.generate(inputs.input_ids, inputs.attention_mask, max_length=10) 86 | generated_part = generated_ids[:, inputs.input_ids.shape[1]:] 87 | 88 | # Calculate intrinsic rewards 89 | intrinsic_rewards = torch.sum(F.log_softmax(model(inputs.input_ids, inputs.attention_mask), dim=-1).gather(1, generated_part[:, 0].unsqueeze(1)), dim=1) 90 | 91 | # Policy gradient loss (maximize reward) 92 | loss = -torch.mean(intrinsic_rewards) * intrinsic_reward_weight 93 | 94 | optimizer.zero_grad() 95 | loss.backward() 96 | optimizer.step() 97 | 98 | total_reward += intrinsic_rewards.mean().item() 99 | total_loss += loss.item() 100 | 101 | if i % 10 == 0: 102 | logger.info(f"Epoch {epoch+1}/{epochs}, Batch {i}/{len(dataset)//batch_size}, " 103 | f"Loss: {total_loss/(i+1):.4f}, " 104 | f"Avg Reward: {total_reward/(i+1):.4f}") 105 | 106 | model.save_pretrained(output_dir) 107 | tokenizer.save_pretrained(output_dir) 108 | 109 | logger.info(f"Self-supervised RL training complete. Model saved to {output_dir}") 110 | return model, tokenizer -------------------------------------------------------------------------------- /utils/trpo.py: -------------------------------------------------------------------------------- 1 | import os 2 | import torch 3 | import torch.optim as optim 4 | import torch.nn as nn 5 | import torch.nn.functional as F 6 | import numpy as np 7 | import logging 8 | from transformers import AutoModelForCausalLM, AutoTokenizer 9 | from torch.distributions import Categorical 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | class TRPOPolicy(nn.Module): 14 | def __init__(self, model_path, vocab_size, device='cuda'): 15 | super(TRPOPolicy, self).__init__() 16 | self.device = device 17 | self.model = AutoModelForCausalLM.from_pretrained(model_path).to(device) 18 | self.vocab_size = vocab_size 19 | 20 | def forward(self, input_ids, attention_mask=None): 21 | outputs = self.model(input_ids=input_ids, attention_mask=attention_mask) 22 | return outputs.logits 23 | 24 | def get_action(self, input_ids, attention_mask=None): 25 | logits = self.forward(input_ids, attention_mask) 26 | probs = F.softmax(logits, dim=-1) 27 | dist = Categorical(probs) 28 | action = dist.sample() 29 | return action, dist.log_prob(action), dist.entropy() 30 | 31 | def train_trpo( 32 | model_path, 33 | dataset, 34 | output_dir, 35 | batch_size=4, 36 | epochs=1, 37 | lr=1e-5, 38 | max_kl=1e-2, 39 | cg_iters=10, 40 | cg_damping=1e-2 41 | ): 42 | os.makedirs(output_dir, exist_ok=True) 43 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 44 | 45 | tokenizer = AutoTokenizer.from_pretrained(model_path) 46 | vocab_size = tokenizer.vocab_size 47 | 48 | policy = TRPOPolicy(model_path, vocab_size, device).to(device) 49 | 50 | optimizer = optim.Adam(policy.parameters(), lr=lr) 51 | 52 | logger.info("Starting TRPO training...") 53 | 54 | for epoch in range(epochs): 55 | total_reward = 0 56 | total_loss = 0 57 | 58 | for i, batch in enumerate(dataset): 59 | if i >= len(dataset) // batch_size: 60 | break 61 | 62 | inputs = tokenizer(batch["text"], return_tensors="pt", padding=True, truncation=True).to(device) 63 | 64 | actions, log_probs, entropies = policy.get_action(inputs.input_ids, inputs.attention_mask) 65 | rewards = F.softmax(policy(inputs.input_ids, inputs.attention_mask), dim=-1).gather(1, actions.unsqueeze(1)) 66 | 67 | loss = -torch.mean(rewards * log_probs) 68 | 69 | 
optimizer.zero_grad() 70 | loss.backward() 71 | optimizer.step() 72 | 73 | total_reward += rewards.mean().item() 74 | total_loss += loss.item() 75 | 76 | if i % 10 == 0: 77 | logger.info(f"Epoch {epoch+1}/{epochs}, Batch {i}/{len(dataset)//batch_size}, " 78 | f"Loss: {total_loss/(i+1):.4f}, " 79 | f"Avg Reward: {total_reward/(i+1):.4f}") 80 | 81 | policy.save_pretrained(output_dir) 82 | tokenizer.save_pretrained(output_dir) 83 | 84 | logger.info(f"TRPO training complete. Model saved to {output_dir}") 85 | return policy, tokenizer -------------------------------------------------------------------------------- /utils/tsallis_entropy_rl.py: -------------------------------------------------------------------------------- 1 | import torch 2 | import torch.nn as nn 3 | import torch.nn.functional as F 4 | from transformers import AutoModelForCausalLM, AutoTokenizer 5 | import logging 6 | import math 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | class TsallisEntropyPolicy(nn.Module): 11 | def __init__(self, base_model): 12 | super().__init__() 13 | self.base_model = base_model 14 | self.hidden_size = base_model.config.hidden_size 15 | 16 | # Value function head 17 | self.value_head = nn.Linear(self.hidden_size, 1) 18 | 19 | def forward(self, input_ids, attention_mask=None): 20 | outputs = self.base_model(input_ids=input_ids, attention_mask=attention_mask) 21 | hidden_states = outputs.last_hidden_state 22 | 23 | # Get logits for policy 24 | logits = outputs.logits 25 | values = self.value_head(hidden_states) 26 | 27 | return { 28 | 'logits': logits, 29 | 'values': values, 30 | 'hidden_states': hidden_states 31 | } 32 | 33 | def compute_tsallis_entropy(probs, q=2.0): 34 | """Compute Tsallis entropy with entropic index q""" 35 | if q == 1.0: 36 | # Special case: Shannon entropy 37 | entropy = -torch.sum(probs * torch.log(probs + 1e-10), dim=-1) 38 | else: 39 | # General case: Tsallis entropy 40 | entropy = (1 / (q - 1)) * (1 - torch.sum(probs**q, dim=-1)) 41 | 42 | return entropy 43 | 44 | def compute_tsallis_policy_gradient_loss(logits, values, rewards, q=2.0, gamma=0.99, entropy_coef=0.01): 45 | """Compute policy gradient loss with Tsallis entropy regularization""" 46 | # Get probabilities from logits 47 | probs = F.softmax(logits, dim=-1) 48 | 49 | # Compute log probabilities 50 | log_probs = F.log_softmax(logits, dim=-1) 51 | 52 | # Compute Tsallis entropy 53 | tsallis_entropy = compute_tsallis_entropy(probs, q) 54 | 55 | # Compute policy gradient loss 56 | policy_loss = -torch.mean(log_probs * rewards) 57 | 58 | # Apply Tsallis entropy regularization 59 | entropy_regularized_loss = policy_loss - entropy_coef * tsallis_entropy.mean() 60 | 61 | return entropy_regularized_loss, tsallis_entropy.mean() 62 | 63 | def train_tsallis_entropy_rl(base_model_path, train_dataset, output_dir, reward_model_path=None, 64 | batch_size=4, epochs=1, lr=1e-5, q=2.0, entropy_coef=0.01): 65 | """Train a policy with Tsallis entropy regularization.""" 66 | logger.info(f"Initializing Tsallis Entropy RL training with q={q}") 67 | 68 | # Load tokenizer and model 69 | tokenizer = AutoTokenizer.from_pretrained(base_model_path) 70 | base_model = AutoModelForCausalLM.from_pretrained(base_model_path) 71 | 72 | # Create policy model 73 | policy = TsallisEntropyPolicy(base_model) 74 | 75 | # Load reward model if provided 76 | if reward_model_path: 77 | logger.info(f"Loading reward model from {reward_model_path}") 78 | reward_model = AutoModelForCausalLM.from_pretrained(reward_model_path) 79 | else: 80 | logger.info("No 
reward model provided, will use simple rewards")
81 |         reward_model = None
82 | 
83 |     # Setup optimizer
84 |     optimizer = torch.optim.Adam(policy.parameters(), lr=lr)
85 | 
86 |     # Training loop
87 |     logger.info(f"Starting training for {epochs} epochs")
88 |     policy.train()
89 | 
90 |     for epoch in range(epochs):
91 |         total_loss = 0
92 |         total_entropy = 0
93 |         num_batches = 0
94 | 
95 |         for i in range(0, len(train_dataset), batch_size):
96 |             batch = train_dataset[i:i+batch_size]
97 | 
98 |             # Tokenize inputs
99 |             inputs = tokenizer(batch['text'], return_tensors="pt", padding=True, truncation=True)
100 |             inputs = {k: v.to(base_model.device) for k, v in inputs.items()}
101 | 
102 |             # Get policy outputs
103 |             policy_outputs = policy(inputs['input_ids'], inputs['attention_mask'])
104 |             logits = policy_outputs['logits']
105 |             values = policy_outputs['values']
106 | 
107 |             # Generate next tokens using current policy
108 |             with torch.no_grad():
109 |                 probs = F.softmax(logits[:, -1, :], dim=-1)
110 |                 next_tokens = torch.multinomial(probs, 1)
111 |                 next_inputs = torch.cat([inputs['input_ids'], next_tokens], dim=1)
112 | 
113 |             # Get rewards
114 |             if reward_model:
115 |                 reward_outputs = reward_model(next_inputs)
116 |                 rewards = reward_outputs.logits.mean(dim=-1)
117 |             else:
118 |                 # Simple language modeling reward - higher probability = better
119 |                 next_token_probs = F.softmax(base_model(next_inputs).logits[:, -2, :], dim=-1)
120 |                 next_token_indices = next_tokens
121 |                 rewards = torch.gather(next_token_probs, 1, next_token_indices).squeeze(1)
122 |             rewards = rewards.detach()
123 | 
124 |             # Compute advantages (simplified)
125 |             advantages = rewards.unsqueeze(1).expand_as(values) - values.detach()
126 | 
127 |             # Compute Tsallis policy gradient loss
128 |             loss, entropy = compute_tsallis_policy_gradient_loss(
129 |                 logits,
130 |                 values,
131 |                 advantages,
132 |                 q=q,
133 |                 entropy_coef=entropy_coef
134 |             )
135 | 
136 |             # Update model
137 |             optimizer.zero_grad()
138 |             loss.backward()
139 |             optimizer.step()
140 | 
141 |             total_loss += loss.item()
142 |             total_entropy += entropy.item()
143 |             num_batches += 1
144 | 
145 |             if num_batches % 10 == 0:
146 |                 logger.info(f"Epoch {epoch+1}, Batch {num_batches}, "
147 |                             f"Loss: {loss.item():.4f}, Tsallis Entropy: {entropy.item():.4f}")
148 | 
149 |         avg_loss = total_loss / num_batches
150 |         avg_entropy = total_entropy / num_batches
151 | 
152 |         logger.info(f"Epoch {epoch+1} completed. "
153 |                     f"Average Loss: {avg_loss:.4f}, "
154 |                     f"Average Tsallis Entropy: {avg_entropy:.4f}")
155 | 
156 |     # Save the model
157 |     logger.info(f"Training completed. Saving model to {output_dir}")
158 |     policy.base_model.save_pretrained(output_dir)
159 |     tokenizer.save_pretrained(output_dir)
160 | 
161 |     return policy.base_model, tokenizer
162 | 
--------------------------------------------------------------------------------
/้how_to_trian.bash:
--------------------------------------------------------------------------------
1 | # normaldayinthailand
2 | Google Cloud Vertex AI (DPO + RLHF + PPO + IRL + Q-Learning + SAC + COT) / Model: scb10x/llama3.2-typhoon2-t1-3b-research-preview
3 | ## Description
4 | To train a Language Model (LLM) with Direct Preference Optimization (DPO), Inverse Reinforcement Learning (IRL), Q-Learning, Soft Actor-Critic (SAC), Proximal Policy Optimization (PPO), and Chain-of-Thought (COT) on Google Cloud Vertex AI, follow the steps below:
5 | 
6 | ## 🔍 Preparation
7 | 
8 | ### 1. 
Set up a Google Cloud Project
9 | 
10 | ```bash
11 | # Install the Google Cloud SDK on your machine (if you don't have it yet)
12 | curl https://sdk.cloud.google.com | bash
13 | exec -l $SHELL
14 | gcloud init
15 | 
16 | # Create a new project or select an existing one
17 | gcloud projects create llm-training-project --name="LLM Training Project" # or skip this step if you already have a project
18 | gcloud config set project llm-training-project # replace with your project ID
19 | 
20 | # Enable the required APIs
21 | gcloud services enable compute.googleapis.com
22 | gcloud services enable aiplatform.googleapis.com
23 | gcloud services enable iam.googleapis.com
24 | gcloud services enable artifactregistry.googleapis.com
25 | ```
26 | 
27 | ### 2. Check and request GPU quota
28 | 
29 | ```bash
30 | # Check the current quota
31 | gcloud compute regions describe us-central1 | grep -A 10 "quotas:"
32 | 
33 | # If the quota is not sufficient, submit a quota increase request in the Google Cloud Console
34 | # Go to: IAM & Admin > Quotas & system limits > search for "NVIDIA_A100" and request an increase
35 | ```
36 | 
37 | ### 3. Create a Storage Bucket for data and models
38 | 
39 | ```bash
40 | # Create the bucket
41 | export BUCKET_NAME="llm-training-$(gcloud config get-value project)"
42 | gcloud storage buckets create gs://$BUCKET_NAME --location=us-central1
43 | ```
44 | 
45 | ### 4. Create all the code files
46 | 
47 | ```bash
48 | # Create the project folder
49 | mkdir -p llm-training-project/utils
50 | cd llm-training-project
51 | 
52 | # Create the source files provided earlier
53 | # - Dockerfile
54 | # - train_pipeline.py
55 | # - utils/*.py
56 | # - setup_vertex_ai.sh
57 | # - run_training.sh
58 | # - test_model.py
59 | ```
60 | 
61 | ## 🏗️ Building the Docker image and pushing it to the Container Registry
62 | 
63 | ### 5. Build and push the Docker image
64 | 
65 | ```bash
66 | # Make the script executable
67 | chmod +x setup_vertex_ai.sh
68 | 
69 | # Run the script to build and push the Docker image
70 | ./setup_vertex_ai.sh
71 | ```
72 | 
73 | ## 🚄 Training the model
74 | 
75 | ### 6. Start training, stage by stage
76 | 
77 | #### Option 1: Train everything in one run (time-consuming, requires expensive GPUs)
78 | 
79 | ```bash
80 | # Train all techniques in a single run
81 | chmod +x run_training.sh
82 | ./run_training.sh us-central1 all 4 1 # region, stage, batch_size, epochs
83 | ```
84 | 
85 | #### Option 2: Train one stage at a time (recommended; saves resources and is easier to debug)
86 | 
87 | ```bash
88 | # Stage 1: train with DPO first
89 | ./run_training.sh us-central1 dpo 4 1
90 | 
91 | # Stage 2: train the Reward Model (wait for the DPO stage to finish first)
92 | ./run_training.sh us-central1 reward 4 1
93 | 
94 | # Stage 3: train IRL
95 | ./run_training.sh us-central1 irl 4 1
96 | 
97 | # Stage 4: train Q-Learning
98 | ./run_training.sh us-central1 q_learning 4 1
99 | 
100 | # Stage 5: train SAC
101 | ./run_training.sh us-central1 sac 4 1
102 | 
103 | # Stage 6: train PPO using the models from all previous stages
104 | ./run_training.sh us-central1 ppo 4 1
105 | 
106 | # Stage 7: train CoT
107 | ./run_training.sh us-central1 cot 4 1
108 | ```
109 | 
110 | ### 7. Check training job status
111 | 
112 | ```bash
113 | # List jobs and their status
114 | gcloud ai custom-jobs list --region=us-central1
115 | 
116 | # Inspect a specific training job
117 | gcloud ai custom-jobs describe JOB_ID --region=us-central1
118 | ```
119 | 
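If you prefer to check job status from Python instead of the gcloud CLI, a minimal sketch along the following lines should also work. This is an assumption-based example, not part of the original scripts: it presumes the `google-cloud-aiplatform` package is installed and you are authenticated (e.g. `gcloud auth application-default login`); the project ID is a placeholder matching the one created above.

```python
# Minimal sketch: list Vertex AI custom jobs and their states from Python.
# Assumes `pip install google-cloud-aiplatform`; project/location are placeholders.
from google.cloud import aiplatform

aiplatform.init(project="llm-training-project", location="us-central1")

for job in aiplatform.CustomJob.list():
    # job.state is a JobState enum, e.g. JOB_STATE_RUNNING or JOB_STATE_SUCCEEDED
    print(job.display_name, job.state)
```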
120 | ### 8. Check the logs during training
121 | 
122 | ```bash
123 | # Stream the logs of a training job
124 | gcloud ai custom-jobs stream-logs JOB_ID --region=us-central1
125 | ```
126 | 
127 | ## 🔍 Testing and using the model
128 | 
129 | ### 9. Test the trained model
130 | 
131 | ```bash
132 | # Locate the path of the final model (example)
133 | export MODEL_PATH="gs://$BUCKET_NAME/output/TIMESTAMP/final_model"
134 | 
135 | # Test the model
136 | python test_model.py --model-path $MODEL_PATH --test-prompts "รีวิวร้านอาหาร: " "รีวิวร้านอาหาร: อยากกินอาหารไทยรสชาติดั้งเดิม"
137 | ```
138 | 
139 | ### 10. Prepare the model for serving
140 | 
141 | ```bash
142 | # Download the model from GCS to your local machine (optional)
143 | gsutil -m cp -r $MODEL_PATH ./local_model
144 | 
145 | # Or upload it to the Vertex AI Model Registry (recommended)
146 | MODEL_NAME="typhoon2-rl-combined"
147 | MODEL_DISPLAY_NAME="Typhoon2 with RL techniques"
148 | 
149 | gcloud ai models upload \
150 |     --region=us-central1 \
151 |     --display-name=$MODEL_DISPLAY_NAME \
152 |     --container-image-uri=us-docker.pkg.dev/vertex-ai/prediction/pytorch-cpu.1-13:latest \
153 |     --artifact-uri=$MODEL_PATH
154 | ```
155 | ## ⚠️ Important notes
156 | 
157 | 1. **Watch the cost**: an A100 GPU costs roughly $3-4 per hour, so check on running jobs frequently.
158 | 2. **Track progress**: use Weights & Biases (if configured) or Cloud Logging to follow training progress.
159 | 3. **Go step by step**: training one stage at a time is recommended over running everything at once.
160 | 4. **Check for errors**: review the logs regularly so problems can be fixed right away.
161 | 5. **Back up your models**: always keep backups of important model files.
162 | 
163 | 
164 | Once all the steps are complete, you will have an LLM trained with DPO, RLHF, IRL, Q-Learning, SAC, PPO, and COT that better understands user intent and responds appropriately to context.
--------------------------------------------------------------------------------
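As a complement to step 10, after downloading the checkpoint with `gsutil`, a quick local smoke test could look like the sketch below. This is an illustrative assumption, not part of the repository: the `./local_model` path matches the `gsutil cp` destination above, the prompt is just an example in the same style as `test_model.py`, and it presumes `transformers`, `torch`, and `accelerate` are installed.

```python
# Minimal sketch: load the downloaded checkpoint and generate one completion locally.
# Assumes the model was copied to ./local_model by the gsutil step in section 10;
# generation settings are illustrative only.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_dir = "./local_model"
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForCausalLM.from_pretrained(
    model_dir, torch_dtype=torch.bfloat16, device_map="auto"
)

prompt = "รีวิวร้านอาหาร: "  # same prompt style passed to test_model.py above
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    output = model.generate(**inputs, max_new_tokens=128, do_sample=True, temperature=0.7)
print(tokenizer.decode(output[0], skip_special_tokens=True))
```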